Support 64-bit index in AARD dictionaries

2024-11-24 00:14:06 +00:00 · 2012-09-17 19:47:45 +04:00 · 2012-09-17 19:47:45 +04:00 · 3abb74ad51
parent bc7a8b39c8
commit 3abb74ad51
1 changed files with 44 additions and 8 deletions
--- a/aard.cc
+++ b/aard.cc
@ -97,6 +97,7 @@ __attribute__((packed))

 typedef BigEndian< uint16_t > uint16_be;
 typedef BigEndian< uint32_t > uint32_be;
+typedef BigEndian< uint64_t > uint64_be;

 /// AAR file header
 struct AAR_header
@ -129,6 +130,16 @@ __attribute__((packed))
 #endif
 ;

+struct IndexElement64
+{
+    uint32_be wordOffset;
+    uint64_be articleOffset;
+}
+#ifndef _MSC_VER
+__attribute__((packed))
+#endif
+;
+
 enum
 {
  Signature = 0x58524141, // AARX on little-endian, XRAA on big-endian
@ -795,13 +806,23 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
      {
        try
        {
+          {
+            QFileInfo info( FsEncoding::decode( i->c_str() ) );
+            if( info.size() > ULONG_MAX )
+            {
+              DPRINTF( "File %s is too large", i->c_str() );
+              continue;
+            }
+          }
+
          File::Class df( *i, "rb" );

          AAR_header dictHeader;

          df.read( &dictHeader, sizeof(dictHeader) );
+          bool has64bitIndex = !strncmp( dictHeader.indexItemFormat, ">LQ", 4 );
          if( strncmp( dictHeader.signature, "aard", 4 )
-              || strncmp( dictHeader.indexItemFormat, ">LL", 4 )
+              || ( !has64bitIndex && strncmp( dictHeader.indexItemFormat, ">LL", 4 ) )
              || strncmp( dictHeader.keyLengthFormat, ">H", 2 )
              || strncmp( dictHeader.articleLengthFormat, ">L", 2) )
          {
@ -877,17 +898,32 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
          uint32_t wordCount = dictHeader.wordsCount;
          set< uint32_t > articleOffsets;
          uint32_t pos = df.tell();
-          uint32_t wordsBase = pos + wordCount * sizeof( IndexElement );
+          uint32_t wordsBase = pos + wordCount * ( has64bitIndex ? sizeof( IndexElement64 ) : sizeof( IndexElement ) );
          uint32_t articlesBase = dictHeader.articleOffset;

          for( uint32_t j = 0; j < wordCount; j++ )
          {
-            IndexElement el;
+            uint32_t articleOffset;
+            uint32_t wordOffset;

-            df.seek( pos );
-            df.read( &el, sizeof(el) );
-            uint32_t articleOffset = articlesBase + el.articleOffset;
-            uint32_t wordOffset = wordsBase + el.wordOffset;
+            if( has64bitIndex )
+            {
+              IndexElement64 el64;
+
+              df.seek( pos );
+              df.read( &el64, sizeof(el64) );
+              articleOffset = articlesBase + el64.articleOffset;
+              wordOffset = wordsBase + el64.wordOffset;
+            }
+            else
+            {
+              IndexElement el;
+
+              df.seek( pos );
+              df.read( &el, sizeof(el) );
+              articleOffset = articlesBase + el.articleOffset;
+              wordOffset = wordsBase + el.wordOffset;
+            }

            df.seek( wordOffset );

@ -903,7 +939,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
            // Insert new entry
            indexedWords.addWord( Utf8::decode( string( data.data(), wordSize ) ), articleOffset);

-            pos += sizeof(el);
+            pos += has64bitIndex ? sizeof( IndexElement64 ) : sizeof( IndexElement );
          }
          // Finish with the chunks