goldendict-ng/src/btreeidx.cc

/* This file is (c) 2008-2009 Konstantin Isakov <ikm@users.berlios.de>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */

#include "btreeidx.hh"
#include "folding.hh"
#include "utf8.hh"
#include <math.h>
#include<string.h>
#include <stdlib.h>

//#define __BTREE_USE_LZO
// LZO mode is experimental and unsupported. Tests didn't show any substantial
// speed improvements.

#ifdef __BTREE_USE_LZO
#include <lzo/lzo1x.h>

namespace {
/// Helper type whose constructor does nothing but call lzo_init().
struct __LzoInit
{
  __LzoInit()
  {
    lzo_init();
  }
} __lzoInit; // File-local instance; constructing it performs the lzo_init() call.
}

#else
#include <zlib.h>
#endif

namespace BtreeIndexing {

/// Bounds that buildIndex() applies to the per-node element count it derives
/// from the number of indexed words.
enum
{
  BtreeMinElements = 64, // Lower clamp
  BtreeMaxElements = 2048 // Upper clamp
};

/// Constructs the dictionary by forwarding the id and the list of dictionary
/// files to the base class. No index file is attached at this point: idxFile
/// starts out as a null pointer and is only set by openIndex().
BtreeDictionary::BtreeDictionary( string const & id,
                                  vector< string > const & dictionaryFiles ):
  Dictionary::Class( id, dictionaryFiles ),
  idxFile( 0 )
{
}

/// Attaches an index file to this dictionary.
/// Two consecutive uint32_t values are read from the file's current position:
/// the node size first, then the offset of the root node. The file is then
/// remembered for later lookups.
void BtreeDictionary::openIndex( File::Class & file )
{
  indexNodeSize = file.read< uint32_t >();
  rootOffset = file.read< uint32_t >();

  // Only the address is stored, so the file object has to outlive every
  // lookup made through this dictionary.
  idxFile = &file;
}

/// Returns the article links stored for str.
/// The lookup is done on the folded form of str and only an exact match
/// counts; the chain found that way is then filtered through antialias()
/// against the original, unfolded str. An empty vector is returned when
/// nothing matches exactly.
vector< WordArticleLink > BtreeDictionary::findArticles( wstring const & str )
{
  // Filled in by the lookup. The returned chain pointer refers into nodeData,
  // so the buffer has to stay alive while the chain is being read.
  vector< char > nodeData;
  uint32_t followingLeaf;
  bool isExact;

  wstring const foldedQuery = Folding::apply( str );

  char const * chainPtr = findChainOffsetExactOrPrefix( foldedQuery, isExact,
                                                        nodeData,
                                                        followingLeaf );

  if ( !chainPtr || !isExact )
    return vector< WordArticleLink >();

  vector< WordArticleLink > links = readChain( chainPtr );

  antialias( str, links );

  return links;
}


/// Looks up str in the index and collects the headwords of every chain whose
/// folded form equals the folded str (into exactMatches) or merely starts
/// with it (into prefixMatches). Both output vectors are cleared first.
///
/// Scanning starts at the chain located by findChainOffsetExactOrPrefix() and
/// moves forward through the chains and the linked leaves until a chain
/// matches neither way, the last leaf is exhausted, or prefixMatches holds at
/// least maxPrefixResults entries.
///
/// Throws exCorruptedChainData if a linked leaf carries the inner-node marker
/// instead of an entry count. Exceptions raised by the helpers it calls
/// propagate unchanged.
void BtreeDictionary::findExact( wstring const & str,
                                 vector< wstring > & exactMatches,
                                 vector< wstring > & prefixMatches,
                                 unsigned long maxPrefixResults )
  throw( std::exception )
{
  exactMatches.clear();
  prefixMatches.clear();

  wstring folded = Folding::apply( str );

  bool exactMatch;

  vector< char > leaf;
  uint32_t nextLeaf;

  char const * chainOffset = findChainOffsetExactOrPrefix( folded, exactMatch,
                                                           leaf, nextLeaf );

  if ( !chainOffset )
    return;

  for( ; ; )
  {
    // readChain() advances chainOffset past the chain it has just read
    vector< WordArticleLink > chain = readChain( chainOffset );
    vector< wstring > wstrings = convertChainToWstrings( chain );

    // A chain may be empty, and convertChainToWstrings() drops words it fails
    // to decode, so there may be nothing to compare against. Such a chain is
    // skipped rather than indexing into an empty vector.
    if ( !wstrings.empty() )
    {
      wstring resultFolded = Folding::apply( wstrings[ 0 ] );

      if ( resultFolded == folded )
      {
        // Exact match
        exactMatches.insert( exactMatches.end(), wstrings.begin(), wstrings.end() );
      }
      else
      if ( resultFolded.size() > folded.size() &&
           !resultFolded.compare( 0, folded.size(), folded ) )
      {
        // Prefix match
        prefixMatches.insert( prefixMatches.end(), wstrings.begin(), wstrings.end() );

        if ( prefixMatches.size() >= maxPrefixResults )
        {
          // For now we actually allow more than maxPrefixResults if the last
          // chain yields more than one result. That's ok and maybe even more
          // desirable.
          break;
        }
      }
      else
      {
        // No match at all, end this
        break;
      }
    }

    // Fetch new leaves for as long as we are past the end of the current one.
    // This is a loop so that a leaf holding no chains at all is stepped over
    // instead of being read past its end.
    while ( chainOffset > &leaf.back() )
    {
      if ( !nextLeaf )
        return; // That was the last leaf

      readNode( nextLeaf, leaf );
      // The link to the following leaf is stored right after the node data
      nextLeaf = idxFile->read< uint32_t >();
      chainOffset = &leaf.front() + sizeof( uint32_t );

      uint32_t leafEntries;
      memcpy( &leafEntries, &leaf.front(), sizeof( uint32_t ) );

      // 0xffffFFFF marks an inner node; a leaf link must never lead to one.
      // Report the corruption to the caller rather than killing the process.
      if ( leafEntries == 0xffffFFFF )
        throw exCorruptedChainData();
    }
  }
}

/// Reads the node stored at the given offset of the index file and
/// decompresses it into out.
///
/// The record read here consists of two uint32_t values -- the uncompressed
/// size and the compressed size -- followed by the compressed bytes. On
/// return the file position is right behind the compressed bytes, which the
/// callers rely on to read the next-leaf link of a leaf.
///
/// Throws exFailedToDecompressNode if the stored sizes cannot describe a node,
/// if decompression fails, or if it yields a different number of bytes than
/// the record announced.
void BtreeDictionary::readNode( uint32_t offset, vector< char > & out )
{
  idxFile->seek( offset );

  uint32_t uncompressedSize = idxFile->read< uint32_t >();
  uint32_t compressedSize = idxFile->read< uint32_t >();

  //printf( "%x,%x\n", uncompressedSize, compressedSize );

  // Every node starts with a uint32_t marker which the callers read straight
  // away, and taking front() of an empty vector is undefined. Sizes that
  // cannot hold that marker, or no compressed data at all, mean the record is
  // damaged.
  if ( uncompressedSize < sizeof( uint32_t ) || !compressedSize )
    throw exFailedToDecompressNode();

  out.resize( uncompressedSize );

  vector< unsigned char > compressedData( compressedSize );

  idxFile->read( &compressedData.front(), compressedData.size() );

  #ifdef __BTREE_USE_LZO

  lzo_uint decompressedLength = out.size();

  if ( lzo1x_decompress( &compressedData.front(), compressedData.size(),
                         (unsigned char *)&out.front(), &decompressedLength, 0 )
       != LZO_E_OK || decompressedLength != out.size() )
    throw exFailedToDecompressNode();

  #else

  // In: capacity of the output buffer. Out: number of bytes actually produced.
  unsigned long decompressedLength = out.size();

  if ( uncompress( (unsigned char *)&out.front(),
                   &decompressedLength,
                   &compressedData.front(),
                   compressedData.size() ) != Z_OK ||
       decompressedLength != out.size() )
    throw exFailedToDecompressNode();
  #endif
}

/// Descends the btree from the root looking for target and returns a pointer
/// to the start of a chain (its uint32_t size field) inside leaf. Both callers
/// in this file pass an already folded target.
///
/// - exactMatch is set to true only when the folded first word of the
///   returned chain equals target; otherwise it is false.
/// - leaf receives the uncompressed leaf the returned pointer refers into, so
///   it must stay alive for as long as the pointer is used.
/// - nextLeaf receives the file offset of the leaf following the one left in
///   leaf, or 0 if there is none.
///
/// The returned chain is the first non-empty one in the reached leaf whose
/// folded first word is not smaller than target. If all of them are smaller,
/// the first chain of the following leaf is returned instead, or 0 when there
/// is no following leaf.
///
/// Throws exIndexWasNotOpened if no index file is attached, and
/// Utf8::exCantDecode if a stored word fails to decode.
char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & target,
                                                            bool & exactMatch,
                                                            vector< char > & leaf,
                                                            uint32_t & nextLeaf )
{
  if ( !idxFile )
    throw exIndexWasNotOpened();

  // Lookup the index by traversing the index btree

  // Scratch buffer for decoded words, reused across all comparisons
  vector< wchar_t > wcharBuffer;

  exactMatch = false;

  // Read a node

  uint32_t currentNodeOffset = rootOffset;

  for( ; ; )
  {
    //printf( "reading node at %x\n", currentNodeOffset );
    readNode( currentNodeOffset, leaf );

    // Is it a leaf or a node? The first uint32_t is 0xffffFFFF for a node and
    // the number of chains for a leaf.

    uint32_t leafEntries;
    memcpy( &leafEntries, &leaf.front(), sizeof( uint32_t ) );

    if ( leafEntries == 0xffffFFFF )
    {
      // A node

      //printf( "=>a node\n" );

      // indexNodeSize + 1 child offsets follow the marker...
      uint32_t const * offsets = (uint32_t *)&leaf.front() + 1;

      // ...and after them come indexNodeSize zero-terminated separator words
      char const * ptr = &leaf.front() + sizeof( uint32_t ) +
                         ( indexNodeSize + 1 ) * sizeof( uint32_t );

      unsigned entry;

      for( entry = 0; entry < indexNodeSize; ++entry )
      {
        //printf( "checking node against word %s\n", ptr );
        size_t wordSize = strlen( ptr );

        // Room for every decoded character plus the terminating zero
        if ( wcharBuffer.size() <= wordSize )
          wcharBuffer.resize( wordSize + 1 );

        long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );

        if ( result < 0 )
          throw Utf8::exCantDecode( ptr );

        wcharBuffer[ result ] = 0;

        int compareResult = target.compare( &wcharBuffer.front() );

        if ( !compareResult )
        {
          // The target string matches the current one.
          // Go to the right, since it's there where we store such results.
          currentNodeOffset = offsets[ entry + 1 ];
          break;
        }
        if ( compareResult < 0 )
        {
          // The target string is smaller than the current one.
          // Go to the left.
          currentNodeOffset = offsets[ entry ];
          break;
        }

        ptr += wordSize + 1;
      }

      if ( entry == indexNodeSize )
      {
        // We iterated through all entries, but our string is larger than
        // all of them. Go to the rightmost node.
        currentNodeOffset = offsets[ entry ];
      }
    }
    else
    {
      //printf( "=>a leaf\n" );
      // A leaf. readNode() left the file positioned right behind the node
      // data, which is where the link to the next leaf is stored.
      nextLeaf = idxFile->read< uint32_t >();

      // Iterate through chains until we find one that matches

      char const * ptr = &leaf.front() + sizeof( uint32_t );

      uint32_t chainSize;

      while( leafEntries-- )
      {
        memcpy( &chainSize, ptr, sizeof( uint32_t ) );
        ptr += sizeof( uint32_t );

        // An empty chain consists of its size field only and has no word to
        // compare against
        if( chainSize )
        {
          size_t wordSize = strlen( ptr );

          if ( wcharBuffer.size() <= wordSize )
            wcharBuffer.resize( wordSize + 1 );

          //printf( "checking against word %s, left = %u\n", ptr, leafEntries );

          long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );

          if ( result < 0 )
            throw Utf8::exCantDecode( ptr );

          wcharBuffer[ result ] = 0;

          wstring foldedWord = Folding::apply( &wcharBuffer.front() );

          int compareResult = target.compare( foldedWord );

          if ( !compareResult )
          {
            // Exact match -- return and be done. The result points at the
            // chain's size field, hence the step back.
            exactMatch = true;

            return ptr - sizeof( uint32_t );
          }
          else
          if ( compareResult < 0 )
          {
            // The target string is smaller than the current one.
            // No point in traversing further, return this result.

            return ptr - sizeof( uint32_t );
          }
          ptr += chainSize;
        }
      }

      // Well, our target is larger than all the chains here. This would mean
      // that the next leaf is the right one.

      if ( nextLeaf )
      {
        readNode( nextLeaf, leaf );

        nextLeaf = idxFile->read< uint32_t >();

        return &leaf.front() + sizeof( uint32_t );
      }
      else
        return 0; // This was the last leaf
    }
  }
}

/// Decodes the chain starting at ptr and returns its entries.
///
/// A chain is a uint32_t byte count followed by that many bytes of records,
/// each record being a zero-terminated word and a uint32_t article offset.
/// ptr is advanced as data is consumed and, on success, ends up right behind
/// the chain.
///
/// Throws exCorruptedChainData if a record does not fit into what is left of
/// the chain. Every record is validated before it is consumed, so a damaged
/// size never makes the word scan run past the end of the chain.
vector< WordArticleLink > BtreeDictionary::readChain( char const * & ptr )
{
  uint32_t chainSize;

  memcpy( &chainSize, ptr, sizeof( uint32_t ) );

  ptr += sizeof( uint32_t );

  vector< WordArticleLink > result;

  while( chainSize )
  {
    // The word has to be terminated within the remaining bytes of the chain
    void const * terminator = memchr( ptr, 0, chainSize );

    if ( !terminator )
      throw exCorruptedChainData();

    size_t wordSize = static_cast< char const * >( terminator ) - ptr;
    size_t recordSize = wordSize + 1 + sizeof( uint32_t );

    // The article offset following the word has to fit as well
    if ( chainSize < recordSize )
      throw exCorruptedChainData();

    string str( ptr, wordSize );
    ptr += wordSize + 1;

    uint32_t articleOffset;

    memcpy( &articleOffset, ptr, sizeof( uint32_t ) );

    ptr += sizeof( uint32_t );

    result.push_back( WordArticleLink( str, articleOffset ) );

    chainSize -= recordSize;
  }

  return result;
}

/// Decodes the UTF-8 word of every link in chain into a wide string and
/// returns the results in chain order. A word that fails to decode is
/// reported on stderr and left out, so the result may be shorter than chain.
vector< wstring > BtreeDictionary::convertChainToWstrings(
                                      vector< WordArticleLink > const & chain )
{
  vector< wstring > decodedWords;

  // Scratch space shared by all iterations; it only ever grows
  vector< wchar_t > scratch;

  for( vector< WordArticleLink >::const_iterator link = chain.begin();
       link != chain.end(); ++link )
  {
    unsigned byteCount = link->word.size();

    // Make room for every decoded character plus the terminating zero
    if ( scratch.size() <= byteCount )
      scratch.resize( byteCount + 1 );

    long decodedLength = Utf8::decode( link->word.data(), byteCount,
                                       &scratch.front() );

    if ( decodedLength >= 0 )
    {
      scratch[ decodedLength ] = 0;

      decodedWords.push_back( &scratch.front() );
    }
    else
      fprintf( stderr, "Failed to decode utf8 of a word %s, skipping it.\n",
               link->word.c_str() );
  }

  return decodedWords;
}

/// Removes from chain every entry whose word differs from str once simple
/// case folding has been applied to both. The relative order of the entries
/// that remain is preserved.
void BtreeDictionary::antialias( wstring const & str,
                                 vector< WordArticleLink > & chain )
{
  wstring const wanted = Folding::applySimpleCaseOnly( str );

  // Walk from the back so that erasing an entry never shifts one that has not
  // been looked at yet
  size_t idx = chain.size();

  while( idx > 0 )
  {
    --idx;

    wstring const candidate =
      Folding::applySimpleCaseOnly( Utf8::decode( chain[ idx ].word ) );

    if ( candidate == wanted )
      continue;

    chain.erase( chain.begin() + idx );
  }
}


/// A function which recursively creates btree node.
/// The nextIndex iterator is being iterated over and increased when building
/// leaf nodes.
///
/// indexSize is the number of words, starting at nextIndex, that the subtree
/// built by this call has to cover. If that fits into maxElements, a leaf is
/// written; otherwise an inner node with maxElements + 1 children is written
/// after all of its children.
///
/// lastLeafLinkOffset is the file position of the next-leaf link of the most
/// recently written leaf, or 0 if no leaf has been written yet. It is used to
/// chain the leaves together and is updated whenever a leaf is written.
///
/// Returns the file offset at which the node was written.
static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,
                                size_t indexSize,
                                File::Class & file, size_t maxElements,
                                uint32_t & lastLeafLinkOffset )
{
  // We compress all the node data. This buffer would hold it.
  vector< unsigned char > uncompressedData;

  bool isLeaf = indexSize <= maxElements;

  if ( isLeaf )
  {
    // A leaf.

    // First pass: work out how many bytes all the chains will take, using a
    // copy of the iterator so that nextIndex stays put for the second pass.
    uint32_t totalChainsLength = 0;

    IndexedWords::const_iterator nextWord = nextIndex;

    for( unsigned x = indexSize; x--; ++nextWord )
    {
      // The size field of the chain
      totalChainsLength += sizeof( uint32_t );

      vector< WordArticleLink > const & chain = nextWord->second;

      // Each record is a zero-terminated word followed by an article offset
      for( unsigned y = 0; y < chain.size(); ++y )
        totalChainsLength += chain[ y ].word.size() + 1 + sizeof( uint32_t );
    }

    uncompressedData.resize( sizeof( uint32_t ) + totalChainsLength );

    // First uint32_t indicates that this is a leaf.
    *(uint32_t *)&uncompressedData.front() = indexSize;

    unsigned char * ptr = &uncompressedData.front() + sizeof( uint32_t );

    // Second pass: serialize the chains, consuming nextIndex as we go
    for( unsigned x = indexSize; x--; ++nextIndex )
    {
      vector< WordArticleLink > const & chain = nextIndex->second;

      // The chain size is only known once its records are written, so
      // remember where it goes and fill it in afterwards
      unsigned char * saveSizeHere = ptr;

      ptr += sizeof( uint32_t );

      uint32_t size = 0;

      for( unsigned y = 0; y < chain.size(); ++y )
      {
        // The word, including its terminating zero
        memcpy( ptr, chain[ y ].word.c_str(), chain[ y ].word.size() + 1 );
        ptr += chain[ y ].word.size() + 1;

        memcpy( ptr, &(chain[ y ].articleOffset), sizeof( uint32_t ) );
        ptr += sizeof( uint32_t );

        size += chain[ y ].word.size() + 1 + sizeof( uint32_t );
      }

      memcpy( saveSizeHere, &size, sizeof( uint32_t ) );
    }
  }
  else
  {
    // A node which will have children.

    // Marker plus maxElements + 1 child offsets; the separator words are
    // appended behind them as the children get built
    uncompressedData.resize( sizeof( uint32_t ) + ( maxElements + 1 ) * sizeof( uint32_t ) );

    // First uint32_t indicates that this is a node.
    *(uint32_t *)&uncompressedData.front() = 0xffffFFFF;

    unsigned prevEntry = 0;

    vector< char > charBuffer;

    for( unsigned x = 0; x < maxElements; ++x )
    {
      // Split indexSize words as evenly as possible among the
      // maxElements + 1 children; curEntry is where child x ends
      unsigned curEntry = (uint64_t) indexSize * ( x + 1 ) / ( maxElements + 1 );

      uint32_t offset = buildBtreeNode( nextIndex,
                                        curEntry - prevEntry,
                                        file, maxElements,
                                        lastLeafLinkOffset );

      memcpy( &uncompressedData.front() + sizeof( uint32_t ) + x * sizeof( uint32_t ), &offset, sizeof( uint32_t ) );

      // The recursive call has moved nextIndex past child x, so it now refers
      // to the first word of the following child. That word is stored as the
      // separator between the two.
      if ( charBuffer.size() < nextIndex->first.size() * 4 )
        charBuffer.resize( nextIndex->first.size() * 4 );

      size_t sz = Utf8::encode( nextIndex->first.data(), nextIndex->first.size(),
                                &charBuffer.front() );

      // Append the encoded separator together with a terminating zero
      size_t prevSize = uncompressedData.size();
      uncompressedData.resize( prevSize + sz + 1 );

      memcpy( &uncompressedData.front() + prevSize, &charBuffer.front(), sz );

      uncompressedData.back() = 0;

      prevEntry = curEntry;
    }

    // Rightmost child
    uint32_t offset = buildBtreeNode( nextIndex,
                                      indexSize - prevEntry,
                                      file, maxElements,
                                      lastLeafLinkOffset );
    memcpy( &uncompressedData.front() + sizeof( uint32_t ) +
            maxElements * sizeof( uint32_t ), &offset, sizeof( offset ) );
  }

  // Save the result.

  #ifdef __BTREE_USE_LZO

  vector< unsigned char > compressedData( uncompressedData.size() + uncompressedData.size() / 16 + 64 + 3 );

  char workMem[ LZO1X_1_MEM_COMPRESS ];

  lzo_uint compressedSize;

  if ( lzo1x_1_compress( &uncompressedData.front(), uncompressedData.size(),
                         &compressedData.front(), &compressedSize, workMem )
       != LZO_E_OK )
  {
    fprintf( stderr, "Failed to compress btree node.\n" );
    abort();
  }

  #else

  vector< unsigned char > compressedData( compressBound( uncompressedData.size() ) );

  // In: capacity of the output buffer. Out: number of bytes actually produced.
  unsigned long compressedSize = compressedData.size();

  if ( compress( &compressedData.front(), &compressedSize,
                 &uncompressedData.front(), uncompressedData.size() ) != Z_OK )
  {
    fprintf( stderr, "Failed to compress btree node.\n" );
    abort();
  }

  #endif

  // The node record: uncompressed size, compressed size, compressed bytes
  uint32_t offset = file.tell();

  file.write< uint32_t >( uncompressedData.size() );
  file.write< uint32_t >( compressedSize );
  file.write( &compressedData.front(), compressedSize );

  if ( isLeaf )
  {
    // A link to the next leaf, which is zero and which will be updated
    // should we happen to have another leaf.
    
    file.write( ( uint32_t ) 0 );

    uint32_t here = file.tell();

    if ( lastLeafLinkOffset )
    {
      // Update the previous leaf to have the offset of this one.
      file.seek( lastLeafLinkOffset );
      file.write( offset );
      file.seek( here );
    }

    // Make sure next leaf knows where to write its offset for us.
    lastLeafLinkOffset = here - sizeof( uint32_t );
  }

  return offset;
}

/// Writes a btree index of indexedWords to file, starting at the file's
/// current position, and returns the offset of the index header.
///
/// The header is written after all the nodes and consists of two uint32_t
/// values: the per-node element count, then the offset of the root node. This
/// is the layout BtreeDictionary::openIndex() reads back.
uint32_t buildIndex( IndexedWords const & indexedWords, File::Class & file )
{
  // We try to stick to two-level tree for most dictionaries. Try finding
  // the right size for it.

  // The explicit conversion to double selects one well-defined sqrt overload
  size_t btreeMaxElements =
    ( (size_t) sqrt( (double) indexedWords.size() ) ) + 1;

  if ( btreeMaxElements < BtreeMinElements )
    btreeMaxElements = BtreeMinElements;
  else
  if ( btreeMaxElements > BtreeMaxElements )
    btreeMaxElements = BtreeMaxElements;

  // %u expects an unsigned int, which size_t is not on every platform. The
  // value has just been clamped to BtreeMaxElements, so the cast is lossless.
  printf( "Building a tree of %u elements\n", (unsigned) btreeMaxElements );

  IndexedWords::const_iterator nextIndex = indexedWords.begin();

  // No leaf has been written yet
  uint32_t lastLeafOffset = 0;

  uint32_t rootOffset = buildBtreeNode( nextIndex, indexedWords.size(),
                                        file, btreeMaxElements,
                                        lastLeafOffset );

  // We need to save btreeMaxElements. For simplicity, we just save it here
  // along with root offset, and then return that record's offset as the
  // offset of the index itself.

  uint32_t indexOffset = file.tell();

  // openIndex() reads this field back as a uint32_t. Write it as one
  // explicitly instead of passing a size_t, whose width differs between
  // platforms and would shift the root offset that follows.
  file.write( (uint32_t) btreeMaxElements );
  file.write( rootOffset );

  return indexOffset;
}


}
Mass-update email to point to a @berlios one. 2009-02-05 14:21:47 +00:00			`/* This file is (c) 2008-2009 Konstantin Isakov <ikm@users.berlios.de>`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */`

			`#include "btreeidx.hh"`
			`#include "folding.hh"`
			`#include "utf8.hh"`
			`#include <math.h>`
Applied a patch from Heinz van Saanen to fix several missing #includes. 2009-01-30 01:20:37 +00:00			`#include<string.h>`
			`#include <stdlib.h>`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00
			`//#define __BTREE_USE_LZO`
			`// LZO mode is experimental and unsupported. Tests didn't show any substantial`
			`// speed improvements.`

			`#ifdef __BTREE_USE_LZO`
			`#include <lzo/lzo1x.h>`

			`namespace {`
			`struct __LzoInit`
			`{`
			`__LzoInit()`
			`{`
			`lzo_init();`
			`}`
			`} __lzoInit;`
			`}`

			`#else`
			`#include <zlib.h>`
			`#endif`

			`namespace BtreeIndexing {`

			`enum`
			`{`
			`BtreeMinElements = 64,`
			`BtreeMaxElements = 2048`
			`};`

			`BtreeDictionary::BtreeDictionary( string const & id,`
			`vector< string > const & dictionaryFiles ):`
			`Dictionary::Class( id, dictionaryFiles ), idxFile( 0 )`
			`{`
			`}`

			`void BtreeDictionary::openIndex( File::Class & file )`
			`{`
			`indexNodeSize = file.read< uint32_t >();`
			`rootOffset = file.read< uint32_t >();`

			`idxFile = &file;`
			`}`

			`vector< WordArticleLink > BtreeDictionary::findArticles( wstring const & str )`
			`{`
			`vector< WordArticleLink > result;`

			`wstring folded = Folding::apply( str );`

			`bool exactMatch;`

			`vector< char > leaf;`
			`uint32_t nextLeaf;`

			`char const * chainOffset = findChainOffsetExactOrPrefix( folded, exactMatch,`
			`leaf, nextLeaf );`

			`if ( chainOffset && exactMatch )`
			`{`
			`result = readChain( chainOffset );`

			`antialias( str, result );`
			`}`

			`return result;`
			`}`


			`void BtreeDictionary::findExact( wstring const & str,`
			`vector< wstring > & exactMatches,`
			`vector< wstring > & prefixMatches,`
			`unsigned long maxPrefixResults )`
			`throw( std::exception )`
			`{`
			`exactMatches.clear();`
			`prefixMatches.clear();`

			`wstring folded = Folding::apply( str );`

			`bool exactMatch;`

			`vector< char > leaf;`
			`uint32_t nextLeaf;`

			`char const * chainOffset = findChainOffsetExactOrPrefix( folded, exactMatch,`
			`leaf, nextLeaf );`

			`if ( !chainOffset )`
			`return;`

			`for( ; ; )`
			`{`
			`//printf( "offset = %u, size = %u\n", chainOffset - &leaf.front(), leaf.size() );`

			`vector< WordArticleLink > chain = readChain( chainOffset );`
			`vector< wstring > wstrings = convertChainToWstrings( chain );`

			`wstring resultFolded = Folding::apply( wstrings[ 0 ] );`

			`if ( resultFolded == folded )`
			`// Exact match`
			`exactMatches.insert( exactMatches.end(), wstrings.begin(), wstrings.end() );`
			`else`
			`if ( resultFolded.size() > folded.size() && !resultFolded.compare( 0, folded.size(), folded ) )`
			`{`
			`// Prefix match`
			`prefixMatches.insert( prefixMatches.end(), wstrings.begin(), wstrings.end() );`

			`if ( prefixMatches.size() >= maxPrefixResults )`
			`{`
			`// For now we actually allow more than maxPrefixResults if the last`
			`// chain yield more than one result. That's ok and maybe even more`
			`// desirable.`
			`break;`
			`}`
			`}`
			`else`
			`// No match at all, end this`
			`break;`

			`// Fetch new leaf if we're out of chains here`

			`if ( chainOffset > &leaf.back() )`
			`{`
			`// We're past the current leaf, fetch the next one`

			`//printf( "advancing\n" );`

			`if ( nextLeaf )`
			`{`
			`readNode( nextLeaf, leaf );`
			`nextLeaf = idxFile->read< uint32_t >();`
			`chainOffset = &leaf.front() + sizeof( uint32_t );`

			`uint32_t leafEntries = (uint32_t )&leaf.front();`

			`if ( leafEntries == 0xffffFFFF )`
			`{`
			`//printf( "bah!\n" );`
			`exit( 1 );`
			`}`
			`}`
			`else`
			`break; // That was the last leaf`
			`}`
			`}`
			`}`

			`void BtreeDictionary::readNode( uint32_t offset, vector< char > & out )`
			`{`
			`idxFile->seek( offset );`

			`uint32_t uncompressedSize = idxFile->read< uint32_t >();`
			`uint32_t compressedSize = idxFile->read< uint32_t >();`

			`//printf( "%x,%x\n", uncompressedSize, compressedSize );`

			`out.resize( uncompressedSize );`

			`vector< unsigned char > compressedData( compressedSize );`

			`idxFile->read( &compressedData.front(), compressedData.size() );`

			`#ifdef __BTREE_USE_LZO`

			`lzo_uint decompressedLength = out.size();`

			`if ( lzo1x_decompress( &compressedData.front(), compressedData.size(),`
			`(unsigned char *)&out.front(), &decompressedLength, 0 )`
			`!= LZO_E_OK \|\| decompressedLength != out.size() )`
			`throw exFailedToDecompressNode();`

			`#else`

			`unsigned long decompressedLength = out.size();`

			`if ( uncompress( (unsigned char *)&out.front(),`
			`&decompressedLength,`
			`&compressedData.front(),`
			`compressedData.size() ) != Z_OK \|\|`
			`decompressedLength != out.size() )`
			`throw exFailedToDecompressNode();`
			`#endif`
			`}`

			`char const * BtreeDictionary::findChainOffsetExactOrPrefix( wstring const & target,`
			`bool & exactMatch,`
			`vector< char > & leaf,`
			`uint32_t & nextLeaf )`
			`{`
			`if ( !idxFile )`
			`throw exIndexWasNotOpened();`

			`// Lookup the index by traversing the index btree`

			`vector< char > charBuffer;`
			`vector< wchar_t > wcharBuffer;`
			`vector< char > wordsBuffer;`

			`exactMatch = false;`

			`// Read a node`

			`uint32_t currentNodeOffset = rootOffset;`

			`for( ; ; )`
			`{`
			`//printf( "reading node at %x\n", currentNodeOffset );`
			`readNode( currentNodeOffset, leaf );`

			`// Is it a leaf or a node?`

			`uint32_t leafEntries = (uint32_t )&leaf.front();`

			`if ( leafEntries == 0xffffFFFF )`
			`{`
			`// A node`

			`//printf( "=>a node\n" );`

			`uint32_t const * offsets = (uint32_t *)&leaf.front() + 1;`

			`char const * ptr = &leaf.front() + sizeof( uint32_t ) +`
			`( indexNodeSize + 1 ) * sizeof( uint32_t );`

			`unsigned entry;`

			`for( entry = 0; entry < indexNodeSize; ++entry )`
			`{`
			`//printf( "checking node agaist word %s\n", ptr );`
			`size_t wordSize = strlen( ptr );`

			`if ( wcharBuffer.size() <= wordSize )`
			`wcharBuffer.resize( wordSize + 1 );`

			`long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );`

			`if ( result < 0 )`
			`throw Utf8::exCantDecode( ptr );`

			`wcharBuffer[ result ] = 0;`

			`int compareResult = target.compare( &wcharBuffer.front() );`

			`if ( !compareResult )`
			`{`
			`// The target string matches the current one.`
			`// Go to the right, since it's there where we store such results.`
			`currentNodeOffset = offsets[ entry + 1 ];`
			`break;`
			`}`
			`if ( compareResult < 0 )`
			`{`
			`// The target string is smaller than the current one.`
			`// Go to the left.`
			`currentNodeOffset = offsets[ entry ];`
			`break;`
			`}`

			`ptr += wordSize + 1;`
			`}`

			`if ( entry == indexNodeSize )`
			`{`
			`// We iterated through all entries, but our string is larger than`
			`// all of them. Go the the rightmost node.`
			`currentNodeOffset = offsets[ entry ];`
			`}`
			`}`
			`else`
			`{`
			`//printf( "=>a leaf\n" );`
			`// A leaf`
			`nextLeaf = idxFile->read< uint32_t >();`

			`// Iterate through chains until we find one that matches`

			`char const * ptr = &leaf.front() + sizeof( uint32_t );`

			`uint32_t chainSize;`

			`while( leafEntries-- )`
			`{`
			`memcpy( &chainSize, ptr, sizeof( uint32_t ) );`
			`ptr += sizeof( uint32_t );`

			`if( chainSize )`
			`{`
			`size_t wordSize = strlen( ptr );`

			`if ( wcharBuffer.size() <= wordSize )`
			`wcharBuffer.resize( wordSize + 1 );`

			`//printf( "checking agaist word %s, left = %u\n", ptr, leafEntries );`

			`long result = Utf8::decode( ptr, wordSize, &wcharBuffer.front() );`

			`if ( result < 0 )`
			`throw Utf8::exCantDecode( ptr );`

			`wcharBuffer[ result ] = 0;`

			`wstring foldedWord = Folding::apply( &wcharBuffer.front() );`

			`int compareResult = target.compare( foldedWord );`

			`if ( !compareResult )`
			`{`
			`// Exact match -- return and be done`
			`exactMatch = true;`

			`return ptr - sizeof( uint32_t );`
			`}`
			`else`
			`if ( compareResult < 0 )`
			`{`
			`// The target string is smaller than the current one.`
			`// No point in travering further, return this result.`

			`return ptr - sizeof( uint32_t );`
			`}`
			`ptr += chainSize;`
			`}`
			`}`

			`// Well, our target is larger than all the chains here. This would mean`
			`// that the next leaf is the right one.`

			`if ( nextLeaf )`
			`{`
			`readNode( nextLeaf, leaf );`

			`nextLeaf = idxFile->read< uint32_t >();`

			`return &leaf.front() + sizeof( uint32_t );`
			`}`
			`else`
			`return 0; // This was the last leaf`
			`}`
			`}`
			`}`

			`vector< WordArticleLink > BtreeDictionary::readChain( char const * & ptr )`
			`{`
			`uint32_t chainSize;`

			`memcpy( &chainSize, ptr, sizeof( uint32_t ) );`

			`ptr += sizeof( uint32_t );`

			`vector< WordArticleLink > result;`

			`vector< char > charBuffer;`

			`while( chainSize )`
			`{`
			`string str = ptr;`
			`ptr += str.size() + 1;`

			`uint32_t articleOffset;`

			`memcpy( &articleOffset, ptr, sizeof( uint32_t ) );`

			`ptr += sizeof( uint32_t );`

			`result.push_back( WordArticleLink( str, articleOffset ) );`

			`if ( chainSize < str.size() + 1 + sizeof( uint32_t ) )`
			`throw exCorruptedChainData();`
			`else`
			`chainSize -= str.size() + 1 + sizeof( uint32_t );`
			`}`

			`return result;`
			`}`

			`vector< wstring > BtreeDictionary::convertChainToWstrings(`
			`vector< WordArticleLink > const & chain )`
			`{`
			`vector< wchar_t > wcharBuffer;`

			`vector< wstring > result;`

			`for( unsigned x = 0; x < chain.size(); ++x )`
			`{`
			`unsigned wordSize = chain[ x ].word.size();`

			`if ( wcharBuffer.size() <= wordSize )`
			`wcharBuffer.resize( wordSize + 1 );`

			`long len = Utf8::decode( chain[ x ].word.data(), wordSize,`
			`&wcharBuffer.front() );`

			`if ( len < 0 )`
			`{`
			`fprintf( stderr, "Failed to decode utf8 of a word %s, skipping it.\n",`
			`chain[ x ].word.c_str() );`
			`continue;`
			`}`

			`wcharBuffer[ len ] = 0;`

			`result.push_back( &wcharBuffer.front() );`
			`}`

			`return result;`
			`}`

			`void BtreeDictionary::antialias( wstring const & str,`
			`vector< WordArticleLink > & chain )`
			`{`
			`wstring caseFolded = Folding::applySimpleCaseOnly( str );`

			`for( unsigned x = chain.size(); x--; )`
			`{`
			`// If after applying case folding to each word they wouldn't match, we`
			`// drop the entry.`
			`if ( Folding::applySimpleCaseOnly( Utf8::decode( chain[ x ].word ) ) !=`
			`caseFolded )`
			`chain.erase( chain.begin() + x );`
			`}`
			`}`


			`/// A function which recursively creates btree node.`
			`/// The nextIndex iterator is being iterated over and increased when building`
			`/// leaf nodes.`
			`static uint32_t buildBtreeNode( IndexedWords::const_iterator & nextIndex,`
			`size_t indexSize,`
			`File::Class & file, size_t maxElements,`
			`uint32_t & lastLeafLinkOffset )`
			`{`
			`// We compress all the node data. This buffer would hold it.`
			`vector< unsigned char > uncompressedData;`

			`bool isLeaf = indexSize <= maxElements;`

			`if ( isLeaf )`
			`{`
			`// A leaf.`

			`uint32_t totalChainsLength = 0;`

			`IndexedWords::const_iterator nextWord = nextIndex;`

			`for( unsigned x = indexSize; x--; ++nextWord )`
			`{`
			`totalChainsLength += sizeof( uint32_t );`

			`vector< WordArticleLink > const & chain = nextWord->second;`

			`for( unsigned y = 0; y < chain.size(); ++y )`
			`totalChainsLength += chain[ y ].word.size() + 1 + sizeof( uint32_t );`
			`}`

			`uncompressedData.resize( sizeof( uint32_t ) + totalChainsLength );`

			`// First uint32_t indicates that this is a leaf.`
			`(uint32_t )&uncompressedData.front() = indexSize;`

			`unsigned char * ptr = &uncompressedData.front() + sizeof( uint32_t );`

			`for( unsigned x = indexSize; x--; ++nextIndex )`
			`{`
			`vector< WordArticleLink > const & chain = nextIndex->second;`

			`unsigned char * saveSizeHere = ptr;`

			`ptr += sizeof( uint32_t );`

			`uint32_t size = 0;`

			`for( unsigned y = 0; y < chain.size(); ++y )`
			`{`
			`memcpy( ptr, chain[ y ].word.c_str(), chain[ y ].word.size() + 1 );`
			`ptr += chain[ y ].word.size() + 1;`

			`memcpy( ptr, &(chain[ y ].articleOffset), sizeof( uint32_t ) );`
			`ptr += sizeof( uint32_t );`

			`size += chain[ y ].word.size() + 1 + sizeof( uint32_t );`
			`}`

			`memcpy( saveSizeHere, &size, sizeof( uint32_t ) );`
			`}`
			`}`
			`else`
			`{`
			`// A node which will have children.`

			`uncompressedData.resize( sizeof( uint32_t ) + ( maxElements + 1 ) * sizeof( uint32_t ) );`

			`// First uint32_t indicates that this is a node.`
			`(uint32_t )&uncompressedData.front() = 0xffffFFFF;`

			`unsigned prevEntry = 0;`

			`vector< char > charBuffer;`

			`for( unsigned x = 0; x < maxElements; ++x )`
			`{`
			`unsigned curEntry = (uint64_t) indexSize * ( x + 1 ) / ( maxElements + 1 );`

			`uint32_t offset = buildBtreeNode( nextIndex,`
			`curEntry - prevEntry,`
			`file, maxElements,`
			`lastLeafLinkOffset );`

			`memcpy( &uncompressedData.front() + sizeof( uint32_t ) + x * sizeof( uint32_t ), &offset, sizeof( uint32_t ) );`

			`if ( charBuffer.size() < nextIndex->first.size() * 4 )`
			`charBuffer.resize( nextIndex->first.size() * 4 );`

			`size_t sz = Utf8::encode( nextIndex->first.data(), nextIndex->first.size(),`
			`&charBuffer.front() );`

			`size_t prevSize = uncompressedData.size();`
			`uncompressedData.resize( prevSize + sz + 1 );`

			`memcpy( &uncompressedData.front() + prevSize, &charBuffer.front(), sz );`

			`uncompressedData.back() = 0;`

			`prevEntry = curEntry;`
			`}`

			`// Rightmost child`
			`uint32_t offset = buildBtreeNode( nextIndex,`
			`indexSize - prevEntry,`
			`file, maxElements,`
			`lastLeafLinkOffset );`
			`memcpy( &uncompressedData.front() + sizeof( uint32_t ) +`
			`maxElements * sizeof( uint32_t ), &offset, sizeof( offset ) );`
			`}`

			`// Save the result.`

			`#ifdef __BTREE_USE_LZO`

			`vector< unsigned char > compressedData( uncompressedData.size() + uncompressedData.size() / 16 + 64 + 3 );`

			`char workMem[ LZO1X_1_MEM_COMPRESS ];`

			`lzo_uint compressedSize;`

			`if ( lzo1x_1_compress( &uncompressedData.front(), uncompressedData.size(),`
			`&compressedData.front(), &compressedSize, workMem )`
			`!= LZO_E_OK )`
			`{`
			`fprintf( stderr, "Failed to compress btree node.\n" );`
			`abort();`
			`}`

			`#else`

			`vector< unsigned char > compressedData( compressBound( uncompressedData.size() ) );`

			`unsigned long compressedSize = compressedData.size();`

			`if ( compress( &compressedData.front(), &compressedSize,`
			`&uncompressedData.front(), uncompressedData.size() ) != Z_OK )`
			`{`
			`fprintf( stderr, "Failed to compress btree node.\n" );`
			`abort();`
			`}`

			`#endif`

			`uint32_t offset = file.tell();`

			`file.write< uint32_t >( uncompressedData.size() );`
			`file.write< uint32_t >( compressedSize );`
			`file.write( &compressedData.front(), compressedSize );`

			`if ( isLeaf )`
			`{`
			`// A link to the next leef, which is zero and which will be updated`
			`// should we happen to have another leaf.`

			`file.write( ( uint32_t ) 0 );`

			`uint32_t here = file.tell();`

			`if ( lastLeafLinkOffset )`
			`{`
			`// Update the previous leaf to have the offset of this one.`
			`file.seek( lastLeafLinkOffset );`
			`file.write( offset );`
			`file.seek( here );`
			`}`

			`// Make sure next leaf knows where to write its offset for us.`
			`lastLeafLinkOffset = here - sizeof( uint32_t );`
			`}`

			`return offset;`
			`}`

			`uint32_t buildIndex( IndexedWords const & indexedWords, File::Class & file )`
			`{`
			`// We try to stick to two-level tree for most dictionaries. Try finding`
			`// the right size for it.`

			`size_t btreeMaxElements = ( (size_t) sqrt( indexedWords.size() ) ) + 1;`

			`if ( btreeMaxElements < BtreeMinElements )`
			`btreeMaxElements = BtreeMinElements;`
			`else`
			`if ( btreeMaxElements > BtreeMaxElements )`
			`btreeMaxElements = BtreeMaxElements;`

			`printf( "Building a tree of %u elements\n", btreeMaxElements );`

			`IndexedWords::const_iterator nextIndex = indexedWords.begin();`

			`uint32_t lastLeafOffset = 0;`

			`uint32_t rootOffset = buildBtreeNode( nextIndex, indexedWords.size(),`
			`file, btreeMaxElements,`
			`lastLeafOffset );`

			`// We need to save btreeMaxElements. For simplicity, we just save it here`
			`// along with root offset, and then return that record's offset as the`
			`// offset of the index itself.`

			`uint32_t indexOffset = file.tell();`

			`file.write( btreeMaxElements );`
			`file.write( rootOffset );`

			`return indexOffset;`
			`}`


			`}`