At my employer Archelon, the boost::tokenizer
class is a core part of our message processing software, and is
used to tokenizer most of our incoming data streams. We recently
discovered that the
Both offset_separator and char_separator create tokens one character at a time, with operator +=. Using only operator += to construct the token has several advantages:
In our environment, neither advantage is compelling. Our token class
is almost always std::string, which can be much more efficiently
constructed with it's assign method. A small amount of
cleverness allows the tokenizer functions to revert to the slower single
pass algorithm when an input iterator is
provided.
I have conducted some timing tests with the code available here. The program constructs a string of "x"'s with periodic "|"s. The number of "x"'s is the block size, and the number of "|"'s is the number of blocks. After constructing the string the test tokenizes the string back into the blocks of "x"'s. I have assumed that the compiler isn't smart enough to optimize away the entire tokenization effort (I get similar but not as impressive results when each token is sent to a stringstream).
I have the following results for the char_separator classes with gcc 3.3 (with -O3) under Red Hat 9:
blocks | block size |
boost time |
fast time |
ratio |
10 |
100 |
1500 |
240 |
6.2 |
10 |
1000 |
7860 |
2050 |
3.8 |
1 |
1000 |
780 |
230 |
3.4 |
1 |
10 |
20 |
10 |
2 |
For the offset_separator class the results are a bit more dramatic:
blocks |
block size |
boost time |
fast time |
ratio |
10 |
100 |
880 |
40 |
22 |
2 |
1000 |
1220 |
50 |
24 |
The fast tokenizer functions are based on work done at Archelon, and a version that was briefly checked into the boost CVS. It has some restrictions that the current boost functions do not:
Happily, the std::string class meets all of these requirements.
Here is the source code for the fast tokenizer functions. It is also available here as a file.
#ifndef FAST_TOKEN_FUNCTIONS_12
#define FAST_TOKEN_FUNCTIONS_12
// Copyright John R. Bandela 2001.
// Permission to copy, use, modify, sell and distribute this software
// is granted provided this copyright notice appears in all
// copies. This software is provided "as is" without express or
// implied warranty, and with no claim as to its suitability for any
// purpose.
// See http://www.boost.org/libs/tokenizer for documentation.
// Revision History:
// 20 Feb 2002 John Maddock
// Removed using namespace std declarations and added
// workaround for BOOST_NO_STDC_NAMESPACE (the library
// can be safely mixed with regex).
// 06 Feb 2002 Jeremy Siek
// Added char_separator.
// 02 Feb 2002 Jeremy Siek
// Removed tabs and a little cleanup.
// 28 Nov 2003 Robert Zeh
// Converted into "fast" functions that avoid using += when
// the supplied iterator isn't an input_iterator; based on
// some work done at Archelon and a version that was checked into
// the boost CVS for a short period of time.
#include "boost/token_functions.hpp"
/* This is a collection of tokenizer functions for the boost tokenizer
* that are faster then their counterparts in the boost library. They
* are faster because they use assign and operator = rather than
* operator += to build up the tokens. For larger strings this can
* make a difference. For example, with gcc 3.3 under Linux a
* char_separator example using 10 100 character tokens runs 6 times
* faster with fast_char_separator then with boost::char_separator.
* The same example with a 10 character string and 1 token runs twice
* as fast as boost::char_separator.
*
* These performance enhancements convert the tokenizer into a
* multi-pass algorithm instead of a single pass algorithm. A
* fallback method is provided so that input iterators are built a
* character at a time, using +=.
*/
namespace fast_detail {
/* The assign_or_plus_equal struct contains functions that implement
* assign, +=, and clearing based on the iterator type. The
* generic case does nothing for plus_equal and clearing, while
* passing through the call for assign.
*
* When an input iterator is being used, the situation is reversed.
* The assign method does nothing, plus_equal invokes operator +=,
* and the clearing method sets the supplied token to the default
* token constructor's result.
*/
template<class IteratorTag>
struct assign_or_plus_equal {
template<class Iterator, class Token>
static void assign(Iterator b, Iterator e, Token &t) {
t.assign(b, e);
}
template<class Token, class Value>
static void plus_equal(Token &t, const Value &v) {
}
/* If we are doing an assign, there is no need for the
* the clear.
*/
template<class Token>
static void clear(Token &t) {
}
};
template <>
struct assign_or_plus_equal<std::input_iterator_tag> {
template<class Iterator, class Token>
static void assign(Iterator b, Iterator e, Token &t) {
}
template<class Token, class Value>
static void plus_equal(Token &t, const Value &v) {
t += v;
}
template<class Token>
static void clear(Token &t) {
t = Token();
}
};
};
class fast_offset_separator {
private:
std::vector<int> offsets_;
unsigned int current_offset_;
bool wrap_offsets_;
bool return_partial_last_;
public:
template <typename Iter>
fast_offset_separator(Iter begin, Iter end, bool wrap_offsets = true,
bool return_partial_last = true)
: offsets_(begin,end), current_offset_(0),
wrap_offsets_(wrap_offsets),
return_partial_last_(return_partial_last) { }
fast_offset_separator()
: offsets_(1,1), current_offset_(),
wrap_offsets_(true), return_partial_last_(true) { }
void reset() {
current_offset_ = 0;
}
template <typename InputIterator, typename Token>
bool operator()(InputIterator& next, InputIterator end, Token& tok)
{
typedef fast_detail::assign_or_plus_equal<typename InputIterator::iterator_category> assigner;
assert(!offsets_.empty());
if (next == end) {
tok = Token();
return false;
}
if (current_offset_ == offsets_.size())
if (wrap_offsets_)
current_offset_=0;
else {
tok = Token();
return false;
}
int c = offsets_[current_offset_];
int i = 0;
assigner::clear(tok);
InputIterator start(next);
for (; i < c; ++i) {
if (next == end)break;
assigner::plus_equal(tok, *next++);
}
assigner::assign(start, next, tok);
if (!return_partial_last_)
if (i < (c-1) )
return false;
++current_offset_;
return true;
}
};
// The out of the box GCC 2.95 on cygwin does not have a char_traits class.
#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
template <typename Char,
typename Traits = typename std::basic_string<Char>::traits_type >
#else
template <typename Char,
typename Traits = std::basic_string<Char>::traits_type >
#endif
class fast_char_separator
{
typedef std::basic_string<Char,Traits> string_type;
public:
explicit
fast_char_separator(const Char* dropped_delims,
const Char* kept_delims = 0,
boost::empty_token_policy empty_tokens =
boost::drop_empty_tokens)
: m_dropped_delims(dropped_delims),
m_use_ispunct(false),
m_use_isspace(false),
m_empty_tokens(empty_tokens),
m_output_done(false)
{
// Borland workaround
if (kept_delims)
m_kept_delims = kept_delims;
}
// use ispunct() for kept delimiters and isspace for dropped.
explicit
fast_char_separator()
: m_use_ispunct(true),
m_use_isspace(true),
m_empty_tokens(drop_empty_tokens) { }
void reset() { }
private:
template <typename InputIterator, typename Token>
bool tokenize_and_drop(InputIterator& next,
InputIterator end,
Token& tok)
{
typedef fast_detail::assign_or_plus_equal<typename InputIterator::iterator_category> assigner;
// skip past all dropped_delims
for (; next != end && is_dropped(*next); ++next)
{ }
if (next == end) {
tok = Token();
return false;
}
// if we are on a kept_delims move past it and stop
if (is_kept(*next)) {
tok = *next;
++next;
} else {
// append all the non delim characters
assigner::clear(tok);
InputIterator start(next);
for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
assigner::plus_equal(tok, *next);
assigner::assign(start, next, tok);
}
return true;
}
template <typename InputIterator, typename Token>
bool tokenize_and_keep(InputIterator& next,
InputIterator end,
Token& tok)
{
typedef fast_detail::assign_or_plus_equal<typename InputIterator::iterator_category> assigner;
// Handle empty token at the end
if (next == end) {
tok = Token();
if (m_output_done == false) {
m_output_done = true;
return true;
} else
return false;
}
if (is_kept(*next)) {
if (m_output_done == false) {
tok = Token();
m_output_done = true;
}
else {
tok = *next;
++next;
m_output_done = false;
}
}
else if (m_output_done == false && is_dropped(*next)) {
tok = Token();
m_output_done = true;
}
else {
if (is_dropped(*next))
++next;
assigner::clear(tok);
InputIterator start(next);
for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
assigner::plus_equal(tok, *next);
assigner::assign(start, next, tok);
m_output_done = true;
}
return true;
};
public:
template <typename InputIterator, typename Token>
bool operator()(InputIterator& next, InputIterator end, Token& tok)
{
if (m_empty_tokens == boost::drop_empty_tokens) {
return tokenize_and_drop(next, end, tok);
}
else { // m_empty_tokens == keep_empty_tokens
return tokenize_and_keep(next, end, tok);
}
return true;
}
private:
string_type m_kept_delims;
string_type m_dropped_delims;
bool m_use_ispunct;
bool m_use_isspace;
boost::empty_token_policy m_empty_tokens;
bool m_output_done;
bool is_kept(Char E) const
{
if (m_kept_delims.length())
return m_kept_delims.find(E) != string_type::npos;
else if (m_use_ispunct) {
return std::ispunct(E) != 0;
} else
return false;
}
bool is_dropped(Char E) const
{
if (m_dropped_delims.length())
return m_dropped_delims.find(E) != string_type::npos;
else if (m_use_isspace) {
return std::isspace(E) != 0;
} else
return false;
}
};
#endif