Fix article count and dictionary language presentation for XDXF dictionaries

This commit is contained in:
Abs62 2012-01-27 15:40:42 +04:00
parent fb179acb59
commit 25bca9076b
3 changed files with 218 additions and 192 deletions

View file

@ -93,6 +93,20 @@ quint32 LangCoder::findIdForLanguage( gd::wstring const & lang )
return 0; return 0;
} }
/// Looks up a language by its three-letter code (the code3 field of the
/// LangCodes table) and returns the numeric id built from the matching
/// entry's two-letter code via code2toInt().
///
/// The comparison is case-insensitive. Returns 0 when code3 is null or when
/// no table entry matches.
quint32 LangCoder::findIdForLanguageCode3( const char * code3 )
{
  // strcasecmp() must not be handed a null pointer; treat it as "no match".
  if ( !code3 )
    return 0;

  // The table is terminated by an entry whose two-letter code is empty.
  for( LangCode const * lc = LangCodes; lc->code[ 0 ]; ++lc )
  {
    if ( strcasecmp( code3, lc->code3 ) == 0 )
      return code2toInt( lc->code ); // Matched: convert the 2-letter code to an id
  }

  return 0;
}
quint32 LangCoder::guessId( const QString & lang ) quint32 LangCoder::guessId( const QString & lang )
{ {
QString lstr = lang.simplified().toLower(); QString lstr = lang.simplified().toLower();

View file

@ -7,6 +7,7 @@
struct LangCode struct LangCode
{ {
char code[ 3 ]; // ISO 639-1 char code[ 3 ]; // ISO 639-1
char code3[ 4 ]; // ISO 639-2B ( http://www.loc.gov/standards/iso639-2/ )
char const * lang; // Language name in English char const * lang; // Language name in English
}; };
@ -14,193 +15,193 @@ struct LangCode
const LangCode LangCodes[] = { const LangCode LangCodes[] = {
{ "aa", "Afar" }, { "aa", "aar", "Afar" },
{ "ab", "Abkhazian" }, { "ab", "abk", "Abkhazian" },
{ "ae", "Avestan" }, { "ae", "ave", "Avestan" },
{ "af", "Afrikaans" }, { "af", "afr", "Afrikaans" },
{ "ak", "Akan" }, { "ak", "aka", "Akan" },
{ "am", "Amharic" }, { "am", "amh", "Amharic" },
{ "an", "Aragonese" }, { "an", "arg", "Aragonese" },
{ "ar", "Arabic" }, { "ar", "ara", "Arabic" },
{ "as", "Assamese" }, { "as", "asm", "Assamese" },
{ "av", "Avaric" }, { "av", "ava", "Avaric" },
{ "ay", "Aymara" }, { "ay", "aym", "Aymara" },
{ "az", "Azerbaijani" }, { "az", "aze", "Azerbaijani" },
{ "ba", "Bashkir" }, { "ba", "bak", "Bashkir" },
{ "be", "Belarusian" }, { "be", "bel", "Belarusian" },
{ "bg", "Bulgarian" }, { "bg", "bul", "Bulgarian" },
{ "bh", "Bihari" }, { "bh", "bih", "Bihari" },
{ "bi", "Bislama" }, { "bi", "bis", "Bislama" },
{ "bm", "Bambara" }, { "bm", "bam", "Bambara" },
{ "bn", "Bengali" }, { "bn", "ben", "Bengali" },
{ "bo", "Tibetan" }, { "bo", "tib", "Tibetan" },
{ "br", "Breton" }, { "br", "bre", "Breton" },
{ "bs", "Bosnian" }, { "bs", "bos", "Bosnian" },
{ "ca", "Catalan" }, { "ca", "cat", "Catalan" },
{ "ce", "Chechen" }, { "ce", "che", "Chechen" },
{ "ch", "Chamorro" }, { "ch", "cha", "Chamorro" },
{ "co", "Corsican" }, { "co", "cos", "Corsican" },
{ "cr", "Cree" }, { "cr", "cre", "Cree" },
{ "cs", "Czech" }, { "cs", "cze", "Czech" },
{ "cu", "Church Slavic" }, { "cu", "chu", "Church Slavic" },
{ "cv", "Chuvash" }, { "cv", "chv", "Chuvash" },
{ "cy", "Welsh" }, { "cy", "wel", "Welsh" },
{ "da", "Danish" }, { "da", "dan", "Danish" },
{ "de", "German" }, { "de", "ger", "German" },
{ "dv", "Divehi" }, { "dv", "div", "Divehi" },
{ "dz", "Dzongkha" }, { "dz", "dzo", "Dzongkha" },
{ "ee", "Ewe" }, { "ee", "ewe", "Ewe" },
{ "el", "Greek" }, { "el", "gre", "Greek" },
{ "en", "English" }, { "en", "eng", "English" },
{ "eo", "Esperanto" }, { "eo", "epo", "Esperanto" },
{ "es", "Spanish" }, { "es", "spa", "Spanish" },
{ "et", "Estonian" }, { "et", "est", "Estonian" },
{ "eu", "Basque" }, { "eu", "baq", "Basque" },
{ "fa", "Persian" }, { "fa", "per", "Persian" },
{ "ff", "Fulah" }, { "ff", "ful", "Fulah" },
{ "fi", "Finnish" }, { "fi", "fin", "Finnish" },
{ "fj", "Fijian" }, { "fj", "fij", "Fijian" },
{ "fo", "Faroese" }, { "fo", "fao", "Faroese" },
{ "fr", "French" }, { "fr", "fre", "French" },
{ "fy", "Western Frisian" }, { "fy", "fry", "Western Frisian" },
{ "ga", "Irish" }, { "ga", "gle", "Irish" },
{ "gd", "Scottish Gaelic" }, { "gd", "gla", "Scottish Gaelic" },
{ "gl", "Galician" }, { "gl", "glg", "Galician" },
{ "gn", "Guarani" }, { "gn", "grn", "Guarani" },
{ "gu", "Gujarati" }, { "gu", "guj", "Gujarati" },
{ "gv", "Manx" }, { "gv", "glv", "Manx" },
{ "ha", "Hausa" }, { "ha", "hau", "Hausa" },
{ "he", "Hebrew" }, { "he", "heb", "Hebrew" },
{ "hi", "Hindi" }, { "hi", "hin", "Hindi" },
{ "ho", "Hiri Motu" }, { "ho", "hmo", "Hiri Motu" },
{ "hr", "Croatian" }, { "hr", "hrv", "Croatian" },
{ "ht", "Haitian" }, { "ht", "hat", "Haitian" },
{ "hu", "Hungarian" }, { "hu", "hun", "Hungarian" },
{ "hy", "Armenian" }, { "hy", "arm", "Armenian" },
{ "hz", "Herero" }, { "hz", "her", "Herero" },
{ "ia", "Interlingua" }, { "ia", "ina", "Interlingua" },
{ "id", "Indonesian" }, { "id", "ind", "Indonesian" },
{ "ie", "Interlingue" }, { "ie", "ile", "Interlingue" },
{ "ig", "Igbo" }, { "ig", "ibo", "Igbo" },
{ "ii", "Sichuan Yi" }, { "ii", "iii", "Sichuan Yi" },
{ "ik", "Inupiaq" }, { "ik", "ipk", "Inupiaq" },
{ "io", "Ido" }, { "io", "ido", "Ido" },
{ "is", "Icelandic" }, { "is", "ice", "Icelandic" },
{ "it", "Italian" }, { "it", "ita", "Italian" },
{ "iu", "Inuktitut" }, { "iu", "iku", "Inuktitut" },
{ "ja", "Japanese" }, { "ja", "jpn", "Japanese" },
{ "jv", "Javanese" }, { "jv", "jav", "Javanese" },
{ "ka", "Georgian" }, { "ka", "geo", "Georgian" },
{ "kg", "Kongo" }, { "kg", "kon", "Kongo" },
{ "ki", "Kikuyu" }, { "ki", "kik", "Kikuyu" },
{ "kj", "Kwanyama" }, { "kj", "kua", "Kwanyama" },
{ "kk", "Kazakh" }, { "kk", "kaz", "Kazakh" },
{ "kl", "Kalaallisut" }, { "kl", "kal", "Kalaallisut" },
{ "km", "Khmer" }, { "km", "khm", "Khmer" },
{ "kn", "Kannada" }, { "kn", "kan", "Kannada" },
{ "ko", "Korean" }, { "ko", "kor", "Korean" },
{ "kr", "Kanuri" }, { "kr", "kau", "Kanuri" },
{ "ks", "Kashmiri" }, { "ks", "kas", "Kashmiri" },
{ "ku", "Kurdish" }, { "ku", "kur", "Kurdish" },
{ "kv", "Komi" }, { "kv", "kom", "Komi" },
{ "kw", "Cornish" }, { "kw", "cor", "Cornish" },
{ "ky", "Kirghiz" }, { "ky", "kir", "Kirghiz" },
{ "la", "Latin" }, { "la", "lat", "Latin" },
{ "lb", "Luxembourgish" }, { "lb", "ltz", "Luxembourgish" },
{ "lg", "Ganda" }, { "lg", "lug", "Ganda" },
{ "li", "Limburgish" }, { "li", "lim", "Limburgish" },
{ "ln", "Lingala" }, { "ln", "lin", "Lingala" },
{ "lo", "Lao" }, { "lo", "lao", "Lao" },
{ "lt", "Lithuanian" }, { "lt", "lit", "Lithuanian" },
{ "lu", "Luba-Katanga" }, { "lu", "lub", "Luba-Katanga" },
{ "lv", "Latvian" }, { "lv", "lav", "Latvian" },
{ "mg", "Malagasy" }, { "mg", "mlg", "Malagasy" },
{ "mh", "Marshallese" }, { "mh", "mah", "Marshallese" },
{ "mi", "Maori" }, { "mi", "mao", "Maori" },
{ "mk", "Macedonian" }, { "mk", "mac", "Macedonian" },
{ "ml", "Malayalam" }, { "ml", "mal", "Malayalam" },
{ "mn", "Mongolian" }, { "mn", "mon", "Mongolian" },
{ "mr", "Marathi" }, { "mr", "mar", "Marathi" },
{ "ms", "Malay" }, { "ms", "may", "Malay" },
{ "mt", "Maltese" }, { "mt", "mlt", "Maltese" },
{ "my", "Burmese" }, { "my", "bur", "Burmese" },
{ "na", "Nauru" }, { "na", "nau", "Nauru" },
{ "nb", "Norwegian Bokmal" }, { "nb", "nob", "Norwegian Bokmal" },
{ "nd", "North Ndebele" }, { "nd", "nde", "North Ndebele" },
{ "ne", "Nepali" }, { "ne", "nep", "Nepali" },
{ "ng", "Ndonga" }, { "ng", "ndo", "Ndonga" },
{ "nl", "Dutch" }, { "nl", "dut", "Dutch" },
{ "nn", "Norwegian Nynorsk" }, { "nn", "nno", "Norwegian Nynorsk" },
{ "no", "Norwegian" }, { "no", "nor", "Norwegian" },
{ "nr", "South Ndebele" }, { "nr", "nbl", "South Ndebele" },
{ "nv", "Navajo" }, { "nv", "nav", "Navajo" },
{ "ny", "Chichewa" }, { "ny", "nya", "Chichewa" },
{ "oc", "Occitan" }, { "oc", "oci", "Occitan" },
{ "oj", "Ojibwa" }, { "oj", "oji", "Ojibwa" },
{ "om", "Oromo" }, { "om", "orm", "Oromo" },
{ "or", "Oriya" }, { "or", "ori", "Oriya" },
{ "os", "Ossetian" }, { "os", "oss", "Ossetian" },
{ "pa", "Panjabi" }, { "pa", "pan", "Panjabi" },
{ "pi", "Pali" }, { "pi", "pli", "Pali" },
{ "pl", "Polish" }, { "pl", "pol", "Polish" },
{ "ps", "Pashto" }, { "ps", "pus", "Pashto" },
{ "pt", "Portuguese" }, { "pt", "por", "Portuguese" },
{ "qu", "Quechua" }, { "qu", "que", "Quechua" },
{ "rm", "Raeto-Romance" }, { "rm", "roh", "Raeto-Romance" },
{ "rn", "Kirundi" }, { "rn", "run", "Kirundi" },
{ "ro", "Romanian" }, { "ro", "rum", "Romanian" },
{ "ru", "Russian" }, { "ru", "rus", "Russian" },
{ "rw", "Kinyarwanda" }, { "rw", "kin", "Kinyarwanda" },
{ "sa", "Sanskrit" }, { "sa", "san", "Sanskrit" },
{ "sc", "Sardinian" }, { "sc", "srd", "Sardinian" },
{ "sd", "Sindhi" }, { "sd", "snd", "Sindhi" },
{ "se", "Northern Sami" }, { "se", "sme", "Northern Sami" },
{ "sg", "Sango" }, { "sg", "sag", "Sango" },
{ "sh", "Serbo-Croatian" }, { "sh", "shr", "Serbo-Croatian" },
{ "si", "Sinhala" }, { "si", "sin", "Sinhala" },
{ "sk", "Slovak" }, { "sk", "slo", "Slovak" },
{ "sl", "Slovenian" }, { "sl", "slv", "Slovenian" },
{ "sm", "Samoan" }, { "sm", "smo", "Samoan" },
{ "sn", "Shona" }, { "sn", "sna", "Shona" },
{ "so", "Somali" }, { "so", "som", "Somali" },
{ "sq", "Albanian" }, { "sq", "alb", "Albanian" },
{ "sr", "Serbian" }, { "sr", "srp", "Serbian" },
{ "ss", "Swati" }, { "ss", "ssw", "Swati" },
{ "st", "Southern Sotho" }, { "st", "sot", "Southern Sotho" },
{ "su", "Sundanese" }, { "su", "sun", "Sundanese" },
{ "sv", "Swedish" }, { "sv", "swe", "Swedish" },
{ "sw", "Swahili" }, { "sw", "swa", "Swahili" },
{ "ta", "Tamil" }, { "ta", "tam", "Tamil" },
{ "te", "Telugu" }, { "te", "tel", "Telugu" },
{ "tg", "Tajik" }, { "tg", "tgk", "Tajik" },
{ "th", "Thai" }, { "th", "tha", "Thai" },
{ "ti", "Tigrinya" }, { "ti", "tir", "Tigrinya" },
{ "tk", "Turkmen" }, { "tk", "tuk", "Turkmen" },
{ "tl", "Tagalog" }, { "tl", "tgl", "Tagalog" },
{ "tn", "Tswana" }, { "tn", "tsn", "Tswana" },
{ "to", "Tonga" }, { "to", "ton", "Tonga" },
{ "tr", "Turkish" }, { "tr", "tur", "Turkish" },
{ "ts", "Tsonga" }, { "ts", "tso", "Tsonga" },
{ "tt", "Tatar" }, { "tt", "tat", "Tatar" },
{ "tw", "Twi" }, { "tw", "twi", "Twi" },
{ "ty", "Tahitian" }, { "ty", "tah", "Tahitian" },
{ "ug", "Uighur" }, { "ug", "uig", "Uighur" },
{ "uk", "Ukrainian" }, { "uk", "ukr", "Ukrainian" },
{ "ur", "Urdu" }, { "ur", "urd", "Urdu" },
{ "uz", "Uzbek" }, { "uz", "uzb", "Uzbek" },
{ "ve", "Venda" }, { "ve", "ven", "Venda" },
{ "vi", "Vietnamese" }, { "vi", "vie", "Vietnamese" },
{ "vo", "Volapuk" }, { "vo", "vol", "Volapuk" },
{ "wa", "Walloon" }, { "wa", "wln", "Walloon" },
{ "wo", "Wolof" }, { "wo", "wol", "Wolof" },
{ "xh", "Xhosa" }, { "xh", "xho", "Xhosa" },
{ "yi", "Yiddish" }, { "yi", "yid", "Yiddish" },
{ "yo", "Yoruba" }, { "yo", "yor", "Yoruba" },
{ "za", "Zhuang" }, { "za", "zha", "Zhuang" },
{ "zh", "Chinese" }, { "zh", "chi", "Chinese" },
{ "zu", "Zulu" }, { "zu", "zul", "Zulu" },
{ "", "" } { "", "", "" }
}; };
@ -232,6 +233,7 @@ public:
/// is case- and punctuation insensitive. /// is case- and punctuation insensitive.
static quint32 findIdForLanguage( gd::wstring const & ); static quint32 findIdForLanguage( gd::wstring const & );
static quint32 findIdForLanguageCode3( const char * );
static QPair<quint32,quint32> findIdsForFilename( QString const & ); static QPair<quint32,quint32> findIdsForFilename( QString const & );

22
xdxf.cc
View file

@ -21,6 +21,7 @@
#include "xdxf2html.hh" #include "xdxf2html.hh"
#include "ufile.hh" #include "ufile.hh"
#include "dictzip.h" #include "dictzip.h"
#include "langcoder.hh"
#include <QIODevice> #include <QIODevice>
#include <QXmlStreamReader> #include <QXmlStreamReader>
@ -57,7 +58,7 @@ DEF_EX( exCorruptedIndex, "The index file is corrupted", Dictionary::Ex )
enum enum
{ {
Signature = 0x46584458, // XDXF on little-endian, FXDX on big-endian Signature = 0x46584458, // XDXF on little-endian, FXDX on big-endian
CurrentFormatVersion = 1 + BtreeIndexing::FormatVersion + Folding::Version CurrentFormatVersion = 2 + BtreeIndexing::FormatVersion + Folding::Version
}; };
enum ArticleFormat enum ArticleFormat
@ -72,8 +73,8 @@ struct IdxHeader
uint32_t signature; // First comes the signature, XDXF uint32_t signature; // First comes the signature, XDXF
uint32_t formatVersion; // File format version (CurrentFormatVersion) uint32_t formatVersion; // File format version (CurrentFormatVersion)
uint32_t articleFormat; // ArticleFormat value, except that 0 = bad file uint32_t articleFormat; // ArticleFormat value, except that 0 = bad file
char fromLang[ 4 ]; // 3-letter ISO-639.2 language code uint32_t langFrom; // Source language
char toLang[ 4 ]; // 3-letter ISO-639.2 language code uint32_t langTo; // Target language
uint32_t articleCount; // Total number of articles uint32_t articleCount; // Total number of articles
uint32_t wordCount; // Total number of words uint32_t wordCount; // Total number of words
uint32_t nameAddress; // Address of an utf8 name string, in chunks uint32_t nameAddress; // Address of an utf8 name string, in chunks
@ -128,7 +129,7 @@ public:
{ return map< Dictionary::Property, string >(); } { return map< Dictionary::Property, string >(); }
virtual unsigned long getArticleCount() throw() virtual unsigned long getArticleCount() throw()
{ return idxHeader.wordCount; } { return idxHeader.articleCount; }
virtual unsigned long getWordCount() throw() virtual unsigned long getWordCount() throw()
{ return idxHeader.wordCount; } { return idxHeader.wordCount; }
@ -137,6 +138,12 @@ public:
virtual QIcon getNativeIcon() throw(); virtual QIcon getNativeIcon() throw();
/// Returns the source language id kept in the index header (langFrom).
inline virtual quint32 getLangFrom() const
{
  return idxHeader.langFrom;
}
/// Returns the target language id kept in the index header (langTo).
inline virtual quint32 getLangTo() const
{
  return idxHeader.langTo;
}
virtual sptr< Dictionary::DataRequest > getArticle( wstring const &, virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
vector< wstring > const & alts, vector< wstring > const & alts,
wstring const & ) wstring const & )
@ -855,14 +862,14 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
if ( str.size() > 3 ) if ( str.size() > 3 )
str.resize( 3 ); str.resize( 3 );
strcpy( idxHeader.fromLang, str.c_str() ); idxHeader.langFrom = LangCoder::findIdForLanguageCode3( str.c_str() );
str = stream.attributes().value( "lang_to" ).toString().toAscii().data(); str = stream.attributes().value( "lang_to" ).toString().toAscii().data();
if ( str.size() > 3 ) if ( str.size() > 3 )
str.resize( 3 ); str.resize( 3 );
strcpy( idxHeader.toLang, str.c_str() ); idxHeader.langTo = LangCoder::findIdForLanguageCode3( str.c_str() );
bool isLogical = ( stream.attributes().value( "format" ) == "logical" ); bool isLogical = ( stream.attributes().value( "format" ) == "logical" );
@ -997,6 +1004,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
idxHeader.signature = Signature; idxHeader.signature = Signature;
idxHeader.formatVersion = CurrentFormatVersion; idxHeader.formatVersion = CurrentFormatVersion;
idxHeader.articleCount = articleCount;
idxHeader.wordCount = wordCount;
idx.rewind(); idx.rewind();
idx.write( &idxHeader, sizeof( idxHeader ) ); idx.write( &idxHeader, sizeof( idxHeader ) );