From 7dae5186abb1182ba5128de238561b5ea1bb05a4 Mon Sep 17 00:00:00 2001 From: xiaoyifang Date: Mon, 4 Apr 2022 10:27:47 +0800 Subject: [PATCH] fix: some zim dictionaries cannot be parsed correctly --- zim.cc | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/zim.cc b/zim.cc index 7084a5c4..9babb0ab 100644 --- a/zim.cc +++ b/zim.cc @@ -1612,14 +1612,31 @@ vector< sptr< Dictionary::Class > > makeDictionaries( { wstring word; if( !title.empty() ) - word = Utf8::decode( title ); - else - word = Utf8::decode( url ); + { + + word = Utf8::decode( title ); + if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand ) + indexedWords.addSingleWord( word, n ); + else + indexedWords.addWord( word, n ); + } + if( !url.empty() ) + { + word = Utf8::decode( url ); + + // begin, the same process order as ZimDictionary::convert before findArticle's invocation + QString qword = QString::fromStdU32String( word ); + QRegularExpression htmlRx( "\\.(s|)htm(l|)$", QRegularExpression::CaseInsensitiveOption ); + qword.remove( htmlRx ).replace( "_", " " ).remove( QRegularExpression( ".*/" ) ); + //end + + word = qword.toStdU32String(); + if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand ) + indexedWords.addSingleWord( word, n ); + else + indexedWords.addWord( word, n ); + } - if( maxHeadwordsToExpand && zh.articleCount >= maxHeadwordsToExpand ) - indexedWords.addSingleWord( word, n ); - else - indexedWords.addWord( word, n ); wordCount++; } else