diff --git a/config.cc b/config.cc index 2660095a..d4ce380c 100644 --- a/config.cc +++ b/config.cc @@ -199,7 +199,7 @@ InputPhrase Preferences::sanitizeInputPhrase( QString const & inputPhrase ) cons return result; } - const QString withPunct = _phase.simplified(); + const QString withPunct = _phase.simplified().remove( QChar( 0xAD ) ); // Simplify whitespaces and remove soft hyphens; result.phrase = gd::toQString( Folding::trimWhitespaceOrPunct( gd::toWString( withPunct ) ) ); if ( !result.isValid() ) return result; // The suffix of an invalid input phrase must be empty. diff --git a/xdxf.cc b/xdxf.cc index bea69777..6fabadde 100644 --- a/xdxf.cc +++ b/xdxf.cc @@ -63,6 +63,22 @@ using BtreeIndexing::WordArticleLink; using BtreeIndexing::IndexedWords; using BtreeIndexing::IndexInfo; +quint32 getLanguageId( const QString & lang ) +{ + QString lstr = lang.left( 3 ); + + if( lstr.endsWith( QChar( '-' ) ) ) + lstr.chop( 1 ); + + switch( lstr.size() ) + { + case 2: return LangCoder::code2toInt( lstr.toLatin1().data() ); + case 3: return LangCoder::findIdForLanguageCode3( lstr.toLatin1().data() ); + } + + return 0; +} + namespace { DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex ) @@ -73,7 +89,7 @@ DEF_EX_STR( exDictzipError, "DICTZIP error", Dictionary::Ex ) enum { Signature = 0x46584458, // XDXF on little-endian, FXDX on big-endian - CurrentFormatVersion = 5 + BtreeIndexing::FormatVersion + Folding::Version + CurrentFormatVersion = 6 + BtreeIndexing::FormatVersion + Folding::Version }; enum ArticleFormat @@ -1208,25 +1224,19 @@ vector< sptr< Dictionary::Class > > makeDictionaries( // Read the xdxf string str = stream.attributes().value( "lang_from" ).toString().toLatin1().data(); - - if ( str.size() > 3 ) - str.resize( 3 ); - - idxHeader.langFrom = LangCoder::findIdForLanguageCode3( str.c_str() ); + if( !str.empty() ) + idxHeader.langFrom = getLanguageId( str.c_str() ); str = stream.attributes().value( "lang_to" ).toString().toLatin1().data(); - - if ( str.size() > 3 ) - str.resize( 3 ); - - idxHeader.langTo = LangCoder::findIdForLanguageCode3( str.c_str() ); - - bool isLogical = ( stream.attributes().value( "format" ) == u"logical" ); + if( !str.empty() ) + idxHeader.langTo = getLanguageId( str.c_str() ); QRegExp regNum( "\\d+" ); regNum.indexIn( stream.attributes().value( "revision" ).toString() ); idxHeader.revisionNumber = regNum.cap().toUInt(); + bool isLogical = ( stream.attributes().value( "format" ) == u"logical" || idxHeader.revisionNumber >= 34 ); + idxHeader.articleFormat = isLogical ? Logical : Visual; unsigned articleCount = 0, wordCount = 0; @@ -1269,6 +1279,13 @@ vector< sptr< Dictionary::Class > > makeDictionaries( // todo implement adding other information to the description like , , , , , , , QString desc = readXhtmlData( stream ); + if( isLogical ) + { + desc = desc.simplified(); + QRegularExpression br( "\\s*
" ); + desc.replace( br, QString("\n") ); + } + if ( dictionaryDescription.isEmpty() ) { dictionaryDescription = desc; @@ -1286,6 +1303,36 @@ vector< sptr< Dictionary::Class > > makeDictionaries( } } else + if( stream.name() == u"languages" ) + { + while( !( stream.isEndElement() && stream.name() == u"languages" ) && !stream.atEnd() ) + { + if( !stream.readNext() ) + break; + if ( stream.isStartElement() ) + { + if( stream.name() == u"from" ) + { + if( idxHeader.langFrom == 0 ) + { + QString lang = stream.attributes().value( "xml:lang" ).toString(); + idxHeader.langFrom = getLanguageId( lang ); + } + } + else if( stream.name() == u"to" ) + { + if( idxHeader.langTo == 0 ) + { + QString lang = stream.attributes().value( "xml:lang" ).toString(); + idxHeader.langTo = getLanguageId( lang ); + } + } + } + else if ( stream.isEndElement() && stream.name() == u"languages" ) + break; + } + } + else if ( stream.name() == u"abbreviations" ) { QString s; diff --git a/xdxf.hh b/xdxf.hh index c562a56a..c6a9e502 100644 --- a/xdxf.hh +++ b/xdxf.hh @@ -12,6 +12,8 @@ namespace Xdxf { using std::vector; using std::string; +quint32 getLanguageId( const QString & lang ); + vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & fileNames, string const & indicesDir, diff --git a/xdxf2html.cc b/xdxf2html.cc index b5da8b66..c2570518 100644 --- a/xdxf2html.cc +++ b/xdxf2html.cc @@ -14,6 +14,7 @@ #include "htmlescape.hh" #include "utils.hh" #include +#include "xdxf.hh" #include @@ -231,8 +232,20 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const el.setTagName( "div" ); el.setAttribute( "class", "xdxf_headwords" ); - if( dictPtr->isFromLanguageRTL() != dictPtr->isToLanguageRTL() ) - el.setAttribute( "dir", dictPtr->isFromLanguageRTL() ? "rtl" : "ltr" ); + bool isLanguageRtl = dictPtr->isFromLanguageRTL(); + if( el.hasAttribute( "xml:lang" ) ) + { + // Change xml-attribute "xml:lang" to html-attribute "lang" + QString lang = el.attribute( "xml:lang" ); + el.removeAttribute( "xml:lang" ); + el.setAttribute( "lang", lang ); + + quint32 langID = Xdxf::getLanguageId( lang ); + if( langID ) + isLanguageRtl = LangCoder::isLanguageRTL( langID ); + } + if( isLanguageRtl != dictPtr->isToLanguageRTL() ) + el.setAttribute( "dir", isLanguageRtl ? "rtl" : "ltr" ); } } @@ -327,6 +340,20 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const QDomElement el = nodes.at( 0 ).toElement(); el.setTagName( "span" ); el.setAttribute( "class", "xdxf_def" ); + bool isLanguageRtl = dictPtr->isToLanguageRTL(); + if( el.hasAttribute( "xml:lang" ) ) + { + // Change xml-attribute "xml:lang" to html-attribute "lang" + QString lang = el.attribute( "xml:lang" ); + el.removeAttribute( "xml:lang" ); + el.setAttribute( "lang", lang ); + + quint32 langID = Xdxf::getLanguageId( lang ); + if( langID ) + isLanguageRtl = LangCoder::isLanguageRTL( langID ); + } + if( isLanguageRtl != dictPtr->isToLanguageRTL() ) + el.setAttribute( "dir", isLanguageRtl ? "rtl" : "ltr" ); } } diff --git a/zim.cc b/zim.cc index 8fab4705..9057e29e 100644 --- a/zim.cc +++ b/zim.cc @@ -62,6 +62,7 @@ using BtreeIndexing::IndexInfo; DEF_EX_STR( exNotZimFile, "Not an Zim file", Dictionary::Ex ) DEF_EX_STR( exCantReadFile, "Can't read file", Dictionary::Ex ) +DEF_EX_STR( exInvalidZimHeader, "Invalid Zim header", Dictionary::Ex ) DEF_EX( exUserAbort, "User abort", Dictionary::Ex ) @@ -287,6 +288,9 @@ bool ZimFile::open() if( read( reinterpret_cast< char * >( &zimHeader ), sizeof( zimHeader ) ) != sizeof( zimHeader ) ) return false; + if( zimHeader.magicNumber != 0x44D495A || zimHeader.mimeListPos != sizeof( zimHeader ) ) + return false; + // Clusters in zim file may be placed in random order. // We create sorted offsets list to calculate clusters size. @@ -1567,11 +1571,15 @@ vector< sptr< Dictionary::Class > > makeDictionaries( df.open(); ZIM_header const & zh = df.header(); - bool new_namespaces = ( zh.majorVersion >= 6 && zh.minorVersion >= 1 ); if( zh.magicNumber != 0x44D495A ) throw exNotZimFile( i->c_str() ); + if( zh.mimeListPos != sizeof( ZIM_header ) ) + throw exInvalidZimHeader( i->c_str() ); + + bool new_namespaces = ( zh.majorVersion >= 6 && zh.minorVersion >= 1 ); + { int n = firstName.lastIndexOf( '/' ); initializing.indexingDictionary( firstName.mid( n + 1 ).toUtf8().constData() );