From 69650fff43aeb045fa60fb03f6f98ac06d935fea Mon Sep 17 00:00:00 2001
From: Abs62 <ottomann@yandex.ru>
Date: Wed, 7 Mar 2018 16:45:54 +0300
Subject: [PATCH] Zim: Add some caching for multi-elements clusters

---
 zim.cc | 262 ++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 203 insertions(+), 59 deletions(-)

diff --git a/zim.cc b/zim.cc
index cdbbb184..dc9e5634 100644
--- a/zim.cc
+++ b/zim.cc
@@ -45,6 +45,8 @@
 
 namespace Zim {
 
+#define CACHE_SIZE 3 // Number of slots in the decompressed-cluster cache of ZimFile
+
 using std::string;
 using std::map;
 using std::vector;
@@ -151,19 +153,52 @@ __attribute__((packed))
 
 // Class for support of split zim files
 
+struct Cache // One slot of the decompressed-cluster cache kept by ZimFile
+{
+  char * data;           // malloc'ed buffer with the decompressed cluster, 0 if none
+  quint32 clusterNumber; // Number of the cluster whose data is stored in this slot
+  int stamp;             // Recency stamp, the smallest one is replaced first; -1 after reset
+  int count, size;       // Bytes of valid data in the buffer / bytes allocated for it
+
+  Cache() :
+    data( 0 ),
+    clusterNumber( 0 ),
+    stamp( -1 ),
+    count( 0 ),
+    size( 0 )
+  {}
+};
+
 class ZimFile : public SplitFile::SplitFile
 {
 public:
-
   ZimFile();
   ZimFile( const QString & name );
   ~ZimFile();
 
   virtual void setFileName( const QString & name );
+  bool open(); // Opens the file read-only and reads the ZIM header; false on failure
+  void close() // Closes the underlying file and drops all cached clusters
+  {
+    SplitFile::close();
+    clearCache();
+  }
+  const ZIM_header & header() const // Header as read by open()
+  { return zimHeader; }
+  string getClusterData( quint32 cluster_nom ); // Decompressed cluster data, empty on failure
+
+private:
+  ZIM_header zimHeader;
+  Cache cache[ CACHE_SIZE ]; // Recently read multi-blob clusters
+  int stamp; // Counter incremented by getClusterData(); source of the Cache::stamp values
+
+  void clearCache(); // Frees all cache buffers and resets the slots
 };
 
-ZimFile::ZimFile()
+ZimFile::ZimFile() :
+  stamp( 0 )
 {
+  memset( &zimHeader, 0, sizeof( zimHeader ) ); // Header stays all-zero until open() reads one
 }
 
 ZimFile::ZimFile( const QString & name )
@@ -173,11 +208,14 @@ ZimFile::ZimFile( const QString & name )
 
 ZimFile::~ZimFile()
 {
+  clearCache(); // Free the malloc'ed cluster buffers held by the cache
 }
 
 void ZimFile::setFileName( const QString & name )
 {
   close();
+  memset( &zimHeader, 0, sizeof( zimHeader ) );
+  clearCache();
 
   appendFile( name );
 
@@ -205,6 +243,151 @@ void ZimFile::setFileName( const QString & name )
   }
 }
 
+// Release every cached cluster buffer and put all slots back into the unused state
+void ZimFile::clearCache()
+{
+  for( Cache * slot = cache; slot != cache + CACHE_SIZE; ++slot )
+  {
+    free( slot->data ); // free( NULL ) is a no-op, so no guard is needed
+    slot->data = 0;
+    slot->clusterNumber = 0;
+    slot->stamp = -1;
+    slot->count = 0;
+    slot->size = 0;
+  }
+
+  // Restart the recency counter together with the slots
+  stamp = 0;
+}
+
+// Open the file read-only and load the ZIM header; returns false if either step fails
+bool ZimFile::open()
+{
+  if( !SplitFile::open( QIODevice::ReadOnly ) )
+    return false;
+  memset( &zimHeader, 0, sizeof( zimHeader ) );
+  if( read( reinterpret_cast< char * >( &zimHeader ), sizeof( zimHeader ) ) == sizeof( zimHeader ) )
+    return true;
+  // Short read: never expose a partially filled header through header()
+  memset( &zimHeader, 0, sizeof( zimHeader ) );
+  return false;
+}
+
+// Return the decompressed data of cluster cluster_nom, or an empty string when
+// the cluster cannot be read or decoded. Multi-blob clusters are kept in a small
+// least-recently-used cache so that reading neighbouring blobs skips decompression.
+string ZimFile::getClusterData( quint32 cluster_nom )
+{
+  // Check cache; while scanning, remember the least recently used slot
+  int target = 0;
+  bool found = false;
+  int lastStamp = INT_MAX;
+
+  for( int i = 0; i < CACHE_SIZE; i++ )
+  {
+    if( cache[ i ].clusterNumber == cluster_nom && cache[ i ].count )
+    {
+      found = true;
+      target = i;
+      break;
+    }
+
+    if( cache[ i ].stamp < lastStamp )
+    {
+      lastStamp = cache[ i ].stamp;
+      target = i;
+    }
+  }
+
+  // Advance the recency counter; restart all stamps before the int would
+  // overflow (signed overflow is undefined behaviour)
+  if( stamp == INT_MAX )
+  {
+    stamp = 0;
+    for( int i = 0; i < CACHE_SIZE; i++ )
+      cache[ i ].stamp = -1;
+  }
+  ++stamp;
+
+  if( found )
+  {
+    // Cache hit
+    cache[ target ].stamp = stamp;
+    return string( cache[ target ].data, cache[ target ].count );
+  }
+
+  // Cache miss, read data from file
+  // Read cluster pointers. The offset is computed in 64 bits: a 32-bit
+  // cluster_nom * 8 wraps around for cluster numbers >= 2^29
+  quint64 clusters[ 2 ];
+  seek( zimHeader.clusterPtrPos + static_cast< quint64 >( cluster_nom ) * 8 );
+  if( read( reinterpret_cast< char * >( clusters ), sizeof(clusters) ) != sizeof(clusters) )
+    return string();
+
+  // Calculate cluster size
+  quint64 clusterSize;
+  if( cluster_nom < zimHeader.clusterCount - 1 )
+    clusterSize = clusters[ 1 ] - clusters[ 0 ];
+  else
+    clusterSize = size() - clusters[ 0 ];
+
+  // Read cluster data
+  seek( clusters[ 0 ] );
+
+  char compressionType;
+  if( !getChar( &compressionType ) )
+    return string();
+
+  string decompressedData;
+  QByteArray data = read( clusterSize );
+  if( compressionType == Default || compressionType == None )
+    decompressedData = string( data.data(), data.size() );
+  else
+  if( compressionType == Zlib )
+    decompressedData = decompressZlib( data.constData(), data.size() );
+  else
+  if( compressionType == Bzip2 )
+    decompressedData = decompressBzip2( data.constData(), data.size() );
+  else
+  if( compressionType == Lzma2 )
+    decompressedData = decompressLzma2( data.constData(), data.size() );
+  else
+    return string();
+
+  // A cluster too short to hold its first blob offset is malformed; report
+  // failure rather than letting the offset be read past the end of the data
+  if( decompressedData.size() < sizeof( quint32 ) )
+    return string();
+
+  // Check BLOBs number in the cluster
+  // We cache multi-element clusters only
+  quint32 firstOffset;
+  memcpy( &firstOffset, decompressedData.data(), sizeof(firstOffset) );
+  quint32 blobCount = firstOffset < 4 ? 0 : ( firstOffset - 4 ) / 4;
+
+  if( blobCount > 1 )
+  {
+    // Fill cache, reusing the slot's buffer whenever it is already big enough
+    int dataSize = decompressedData.size();
+    if( cache[ target ].size < dataSize )
+    {
+      free( cache[ target ].data );
+      cache[ target ].data = ( char * )malloc( dataSize );
+      cache[ target ].size = cache[ target ].data ? dataSize : 0;
+      cache[ target ].count = 0;
+    }
+    if( cache[ target ].data )
+    {
+      memcpy( cache[ target ].data, decompressedData.data(), dataSize );
+      cache[ target ].count = dataSize;
+      cache[ target ].clusterNumber = cluster_nom;
+      cache[ target ].stamp = stamp; // Only a slot that was really (re)filled becomes recent
+    }
+  }
+
+  return decompressedData;
+}
+
 // Some supporting functions
 
 bool indexIsOldOrBad( string const & indexFile )
@@ -218,13 +401,14 @@ bool indexIsOldOrBad( string const & indexFile )
          header.formatVersion != CurrentFormatVersion;
 }
 
-quint32 readArticle( ZimFile & file, ZIM_header & header, quint32 articleNumber, string & result,
+quint32 readArticle( ZimFile & file, quint32 articleNumber, string & result,
                      set< quint32 > * loadedArticles = NULL )
 {
   result.clear();
 
   while( 1 )
   {
+    ZIM_header const & header = file.header();
     if( articleNumber >= header.articleCount )
       break;
 
@@ -260,50 +444,17 @@ quint32 readArticle( ZimFile & file, ZIM_header & header, quint32 articleNumber,
     if( file.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(artEntry) - 2 ) != sizeof(artEntry) - 2 )
       break;
 
-    // Read cluster pointers
-
-    quint64 clusters[ 2 ];
-    file.seek( header.clusterPtrPos + (quint64)artEntry.clusterNumber * 8 );
-    if( file.read( reinterpret_cast< char * >( clusters ), sizeof(clusters) ) != sizeof(clusters) )
-      break;
-
-    // Calculate cluster size
-
-    quint64 clusterSize;
-    if( artEntry.clusterNumber < header.clusterCount - 1 )
-      clusterSize = clusters[ 1 ] - clusters[ 0 ];
-    else
-      clusterSize = file.size() - clusters[ 0 ];
-
     // Read cluster data
 
-    file.seek( clusters[ 0 ] );
-
-    char compressionType;
-    if( !file.getChar( &compressionType ) )
-      break;
-
-    string decompressedData;
-
-    QByteArray data = file.read( clusterSize );
-    if( compressionType == Default || compressionType == None )
-      decompressedData = string( data.data(), data.size() );
-    else
-    if( compressionType == Zlib )
-      decompressedData = decompressZlib( data.constData(), data.size() );
-    else
-    if( compressionType == Bzip2 )
-      decompressedData = decompressBzip2( data.constData(), data.size() );
-    else
-    if( compressionType == Lzma2 )
-      decompressedData = decompressLzma2( data.constData(), data.size() );
-    else
+    string decompressedData = file.getClusterData( artEntry.clusterNumber );
+    if( decompressedData.empty() )
       break;
 
     // Take article data from cluster
 
-    quint32 blobCount;
-    memcpy( &blobCount, decompressedData.data(), sizeof(blobCount) );
+    quint32 firstOffset;
+    memcpy( &firstOffset, decompressedData.data(), sizeof(firstOffset) );
+    quint32 blobCount = ( firstOffset - 4 ) / 4;
     if( artEntry.blobNumber > blobCount )
       break;
 
@@ -331,7 +482,6 @@ class ZimDictionary: public BtreeIndexing::BtreeDictionary
     IdxHeader idxHeader;
     string dictionaryName;
     ZimFile df;
-    ZIM_header zimHeader;
     set< quint32 > articlesIndexedForFTS;
     LINKS_TYPE linksType;
 
@@ -420,9 +570,7 @@ ZimDictionary::ZimDictionary( string const & id,
 {
     // Open data file
 
-    df.open( QFile::ReadOnly );
-    memset( &zimHeader, 0, sizeof(zimHeader) );
-    df.read( reinterpret_cast< char * >( &zimHeader ), sizeof( zimHeader ) );
+    df.open();
 
     // Initialize the indexes
 
@@ -444,7 +592,7 @@ ZimDictionary::ZimDictionary( string const & id,
     }
     else
     {
-      readArticle( df, zimHeader, idxHeader.namePtr, dictionaryName );
+      readArticle( df, idxHeader.namePtr, dictionaryName );
     }
 
     // Full-text search parameters
@@ -491,7 +639,7 @@ quint32 ZimDictionary::loadArticle( quint32 address,
 quint32 ret;
   {
     Mutex::Lock _( zimMutex );
-    ret = readArticle( df, zimHeader, address, articleText, loadedArticles );
+    ret = readArticle( df, address, articleText, loadedArticles );
   }
   if( !rawText )
     articleText = convert( articleText );
@@ -717,7 +865,7 @@ void ZimDictionary::loadResource( std::string & resourceName, string & data )
 
   {
     Mutex::Lock _( zimMutex );
-    readArticle( df, zimHeader, link[ 0 ].articleOffset, data );
+    readArticle( df, link[ 0 ].articleOffset, data );
   }
 }
 
@@ -729,7 +877,7 @@ QString const& ZimDictionary::getDescription()
     string str;
     {
       Mutex::Lock _( zimMutex );
-      readArticle( df, zimHeader, idxHeader.descriptionPtr, str );
+      readArticle( df, idxHeader.descriptionPtr, str );
     }
 
     if( !str.empty() )
@@ -1309,16 +1457,12 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
         {
           gdDebug( "Zim: Building the index for dictionary: %s\n", i->c_str() );
 
-          ZIM_header zh;
 
           unsigned articleCount = 0;
           unsigned wordCount = 0;
 
-          df.open( QFile::ReadOnly );
-
-          qint64 ret = df.read( reinterpret_cast< char * >( &zh ), sizeof( zh ) );
-          if( ret != sizeof( zh ) )
-            throw exCantReadFile( i->c_str() );
+          df.open();
+          ZIM_header const & zh = df.header();
 
           if( zh.magicNumber != 0x44D495A )
             throw exNotZimFile( i->c_str() );
@@ -1368,7 +1512,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
             if( mimetype == 0xFFFF )
             {
               redEntry.mimetype = mimetype;
-              ret = df.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(RedirectEntry) - 2 );
+              qint64 ret = df.read( reinterpret_cast< char * >( &redEntry ) + 2, sizeof(RedirectEntry) - 2 );
               if( ret != sizeof(RedirectEntry) - 2 )
                 throw exCantReadFile( i->c_str() );
 
@@ -1377,7 +1521,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
             else
             {
               artEntry.mimetype = mimetype;
-              ret = df.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(ArticleEntry) - 2 );
+              qint64 ret = df.read( reinterpret_cast< char * >( &artEntry ) + 2, sizeof(ArticleEntry) - 2 );
               if( ret != sizeof(ArticleEntry) - 2 )
                 throw exCantReadFile( i->c_str() );
 
@@ -1427,7 +1571,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
               {
                 idxHeader.namePtr = n;
                 string name;
-                readArticle( df, zh, n, name );
+                readArticle( df, n, name );
                 initializing.indexingDictionary( name );
               }
               else
@@ -1437,7 +1581,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
               if( url.compare( "Language") == 0 )
               {
                 string lang;
-                readArticle( df, zh, n, lang );
+                readArticle( df, n, lang );
                 if( lang.size() == 2 )
                   idxHeader.langFrom = LangCoder::code2toInt( lang.c_str() );
                 else