Fix article count and dictionary languages presentation for xdxf dictionaries

This commit is contained in:
Abs62 2012-01-27 15:40:42 +04:00
parent fb179acb59
commit 25bca9076b
3 changed files with 218 additions and 192 deletions

View file

@ -93,6 +93,20 @@ quint32 LangCoder::findIdForLanguage( gd::wstring const & lang )
return 0;
}
/// Looks up a language by its three-letter ISO 639-2B code (the code3 column
/// of LangCodes) and returns the id built from the matching two-letter code.
/// The comparison is case-insensitive. Returns 0 when code3 is null or when
/// no table entry matches.
quint32 LangCoder::findIdForLanguageCode3( const char * code3 )
{
  // strcasecmp() has undefined behaviour on a null pointer, so reject it here.
  if ( !code3 )
    return 0;

  // The table is terminated by an entry whose two-letter code is empty.
  for( LangCode const * lc = LangCodes; lc->code[ 0 ]; ++lc )
  {
    if ( strcasecmp( code3, lc->code3 ) == 0 )
      return code2toInt( lc->code ); // We've got a match
  }

  return 0;
}
quint32 LangCoder::guessId( const QString & lang )
{
QString lstr = lang.simplified().toLower();

View file

@ -7,6 +7,7 @@
// One row of the language table: a language's two-letter code, its
// three-letter code and its English name.
struct LangCode
{
char code[ 3 ]; // ISO 639-1 code: two letters plus the terminating NUL
char code3[ 4 ]; // ISO 639-2B code, three letters plus the terminating NUL ( http://www.loc.gov/standards/iso639-2/ )
char const * lang; // Language name in English
};
@ -14,193 +15,193 @@ struct LangCode
const LangCode LangCodes[] = {
{ "aa", "Afar" },
{ "ab", "Abkhazian" },
{ "ae", "Avestan" },
{ "af", "Afrikaans" },
{ "ak", "Akan" },
{ "am", "Amharic" },
{ "an", "Aragonese" },
{ "ar", "Arabic" },
{ "as", "Assamese" },
{ "av", "Avaric" },
{ "ay", "Aymara" },
{ "az", "Azerbaijani" },
{ "ba", "Bashkir" },
{ "be", "Belarusian" },
{ "bg", "Bulgarian" },
{ "bh", "Bihari" },
{ "bi", "Bislama" },
{ "bm", "Bambara" },
{ "bn", "Bengali" },
{ "bo", "Tibetan" },
{ "br", "Breton" },
{ "bs", "Bosnian" },
{ "ca", "Catalan" },
{ "ce", "Chechen" },
{ "ch", "Chamorro" },
{ "co", "Corsican" },
{ "cr", "Cree" },
{ "cs", "Czech" },
{ "cu", "Church Slavic" },
{ "cv", "Chuvash" },
{ "cy", "Welsh" },
{ "da", "Danish" },
{ "de", "German" },
{ "dv", "Divehi" },
{ "dz", "Dzongkha" },
{ "ee", "Ewe" },
{ "el", "Greek" },
{ "en", "English" },
{ "eo", "Esperanto" },
{ "es", "Spanish" },
{ "et", "Estonian" },
{ "eu", "Basque" },
{ "fa", "Persian" },
{ "ff", "Fulah" },
{ "fi", "Finnish" },
{ "fj", "Fijian" },
{ "fo", "Faroese" },
{ "fr", "French" },
{ "fy", "Western Frisian" },
{ "ga", "Irish" },
{ "gd", "Scottish Gaelic" },
{ "gl", "Galician" },
{ "gn", "Guarani" },
{ "gu", "Gujarati" },
{ "gv", "Manx" },
{ "ha", "Hausa" },
{ "he", "Hebrew" },
{ "hi", "Hindi" },
{ "ho", "Hiri Motu" },
{ "hr", "Croatian" },
{ "ht", "Haitian" },
{ "hu", "Hungarian" },
{ "hy", "Armenian" },
{ "hz", "Herero" },
{ "ia", "Interlingua" },
{ "id", "Indonesian" },
{ "ie", "Interlingue" },
{ "ig", "Igbo" },
{ "ii", "Sichuan Yi" },
{ "ik", "Inupiaq" },
{ "io", "Ido" },
{ "is", "Icelandic" },
{ "it", "Italian" },
{ "iu", "Inuktitut" },
{ "ja", "Japanese" },
{ "jv", "Javanese" },
{ "ka", "Georgian" },
{ "kg", "Kongo" },
{ "ki", "Kikuyu" },
{ "kj", "Kwanyama" },
{ "kk", "Kazakh" },
{ "kl", "Kalaallisut" },
{ "km", "Khmer" },
{ "kn", "Kannada" },
{ "ko", "Korean" },
{ "kr", "Kanuri" },
{ "ks", "Kashmiri" },
{ "ku", "Kurdish" },
{ "kv", "Komi" },
{ "kw", "Cornish" },
{ "ky", "Kirghiz" },
{ "la", "Latin" },
{ "lb", "Luxembourgish" },
{ "lg", "Ganda" },
{ "li", "Limburgish" },
{ "ln", "Lingala" },
{ "lo", "Lao" },
{ "lt", "Lithuanian" },
{ "lu", "Luba-Katanga" },
{ "lv", "Latvian" },
{ "mg", "Malagasy" },
{ "mh", "Marshallese" },
{ "mi", "Maori" },
{ "mk", "Macedonian" },
{ "ml", "Malayalam" },
{ "mn", "Mongolian" },
{ "mr", "Marathi" },
{ "ms", "Malay" },
{ "mt", "Maltese" },
{ "my", "Burmese" },
{ "na", "Nauru" },
{ "nb", "Norwegian Bokmal" },
{ "nd", "North Ndebele" },
{ "ne", "Nepali" },
{ "ng", "Ndonga" },
{ "nl", "Dutch" },
{ "nn", "Norwegian Nynorsk" },
{ "no", "Norwegian" },
{ "nr", "South Ndebele" },
{ "nv", "Navajo" },
{ "ny", "Chichewa" },
{ "oc", "Occitan" },
{ "oj", "Ojibwa" },
{ "om", "Oromo" },
{ "or", "Oriya" },
{ "os", "Ossetian" },
{ "pa", "Panjabi" },
{ "pi", "Pali" },
{ "pl", "Polish" },
{ "ps", "Pashto" },
{ "pt", "Portuguese" },
{ "qu", "Quechua" },
{ "rm", "Raeto-Romance" },
{ "rn", "Kirundi" },
{ "ro", "Romanian" },
{ "ru", "Russian" },
{ "rw", "Kinyarwanda" },
{ "sa", "Sanskrit" },
{ "sc", "Sardinian" },
{ "sd", "Sindhi" },
{ "se", "Northern Sami" },
{ "sg", "Sango" },
{ "sh", "Serbo-Croatian" },
{ "si", "Sinhala" },
{ "sk", "Slovak" },
{ "sl", "Slovenian" },
{ "sm", "Samoan" },
{ "sn", "Shona" },
{ "so", "Somali" },
{ "sq", "Albanian" },
{ "sr", "Serbian" },
{ "ss", "Swati" },
{ "st", "Southern Sotho" },
{ "su", "Sundanese" },
{ "sv", "Swedish" },
{ "sw", "Swahili" },
{ "ta", "Tamil" },
{ "te", "Telugu" },
{ "tg", "Tajik" },
{ "th", "Thai" },
{ "ti", "Tigrinya" },
{ "tk", "Turkmen" },
{ "tl", "Tagalog" },
{ "tn", "Tswana" },
{ "to", "Tonga" },
{ "tr", "Turkish" },
{ "ts", "Tsonga" },
{ "tt", "Tatar" },
{ "tw", "Twi" },
{ "ty", "Tahitian" },
{ "ug", "Uighur" },
{ "uk", "Ukrainian" },
{ "ur", "Urdu" },
{ "uz", "Uzbek" },
{ "ve", "Venda" },
{ "vi", "Vietnamese" },
{ "vo", "Volapuk" },
{ "wa", "Walloon" },
{ "wo", "Wolof" },
{ "xh", "Xhosa" },
{ "yi", "Yiddish" },
{ "yo", "Yoruba" },
{ "za", "Zhuang" },
{ "zh", "Chinese" },
{ "zu", "Zulu" },
{ "aa", "aar", "Afar" },
{ "ab", "abk", "Abkhazian" },
{ "ae", "ave", "Avestan" },
{ "af", "afr", "Afrikaans" },
{ "ak", "aka", "Akan" },
{ "am", "amh", "Amharic" },
{ "an", "arg", "Aragonese" },
{ "ar", "ara", "Arabic" },
{ "as", "asm", "Assamese" },
{ "av", "ava", "Avaric" },
{ "ay", "aym", "Aymara" },
{ "az", "aze", "Azerbaijani" },
{ "ba", "bak", "Bashkir" },
{ "be", "bel", "Belarusian" },
{ "bg", "bul", "Bulgarian" },
{ "bh", "bih", "Bihari" },
{ "bi", "bis", "Bislama" },
{ "bm", "bam", "Bambara" },
{ "bn", "ben", "Bengali" },
{ "bo", "tib", "Tibetan" },
{ "br", "bre", "Breton" },
{ "bs", "bos", "Bosnian" },
{ "ca", "cat", "Catalan" },
{ "ce", "che", "Chechen" },
{ "ch", "cha", "Chamorro" },
{ "co", "cos", "Corsican" },
{ "cr", "cre", "Cree" },
{ "cs", "cze", "Czech" },
{ "cu", "chu", "Church Slavic" },
{ "cv", "chv", "Chuvash" },
{ "cy", "wel", "Welsh" },
{ "da", "dan", "Danish" },
{ "de", "ger", "German" },
{ "dv", "div", "Divehi" },
{ "dz", "dzo", "Dzongkha" },
{ "ee", "ewe", "Ewe" },
{ "el", "gre", "Greek" },
{ "en", "eng", "English" },
{ "eo", "epo", "Esperanto" },
{ "es", "spa", "Spanish" },
{ "et", "est", "Estonian" },
{ "eu", "baq", "Basque" },
{ "fa", "per", "Persian" },
{ "ff", "ful", "Fulah" },
{ "fi", "fin", "Finnish" },
{ "fj", "fij", "Fijian" },
{ "fo", "fao", "Faroese" },
{ "fr", "fre", "French" },
{ "fy", "fry", "Western Frisian" },
{ "ga", "gle", "Irish" },
{ "gd", "gla", "Scottish Gaelic" },
{ "gl", "glg", "Galician" },
{ "gn", "grn", "Guarani" },
{ "gu", "guj", "Gujarati" },
{ "gv", "glv", "Manx" },
{ "ha", "hau", "Hausa" },
{ "he", "heb", "Hebrew" },
{ "hi", "hin", "Hindi" },
{ "ho", "hmo", "Hiri Motu" },
{ "hr", "hrv", "Croatian" },
{ "ht", "hat", "Haitian" },
{ "hu", "hun", "Hungarian" },
{ "hy", "arm", "Armenian" },
{ "hz", "her", "Herero" },
{ "ia", "ina", "Interlingua" },
{ "id", "ind", "Indonesian" },
{ "ie", "ile", "Interlingue" },
{ "ig", "ibo", "Igbo" },
{ "ii", "iii", "Sichuan Yi" },
{ "ik", "ipk", "Inupiaq" },
{ "io", "ido", "Ido" },
{ "is", "ice", "Icelandic" },
{ "it", "ita", "Italian" },
{ "iu", "iku", "Inuktitut" },
{ "ja", "jpn", "Japanese" },
{ "jv", "jav", "Javanese" },
{ "ka", "geo", "Georgian" },
{ "kg", "kon", "Kongo" },
{ "ki", "kik", "Kikuyu" },
{ "kj", "kua", "Kwanyama" },
{ "kk", "kaz", "Kazakh" },
{ "kl", "kal", "Kalaallisut" },
{ "km", "khm", "Khmer" },
{ "kn", "kan", "Kannada" },
{ "ko", "kor", "Korean" },
{ "kr", "kau", "Kanuri" },
{ "ks", "kas", "Kashmiri" },
{ "ku", "kur", "Kurdish" },
{ "kv", "kom", "Komi" },
{ "kw", "cor", "Cornish" },
{ "ky", "kir", "Kirghiz" },
{ "la", "lat", "Latin" },
{ "lb", "ltz", "Luxembourgish" },
{ "lg", "lug", "Ganda" },
{ "li", "lim", "Limburgish" },
{ "ln", "lin", "Lingala" },
{ "lo", "lao", "Lao" },
{ "lt", "lit", "Lithuanian" },
{ "lu", "lub", "Luba-Katanga" },
{ "lv", "lav", "Latvian" },
{ "mg", "mlg", "Malagasy" },
{ "mh", "mah", "Marshallese" },
{ "mi", "mao", "Maori" },
{ "mk", "mac", "Macedonian" },
{ "ml", "mal", "Malayalam" },
{ "mn", "mon", "Mongolian" },
{ "mr", "mar", "Marathi" },
{ "ms", "may", "Malay" },
{ "mt", "mlt", "Maltese" },
{ "my", "bur", "Burmese" },
{ "na", "nau", "Nauru" },
{ "nb", "nob", "Norwegian Bokmal" },
{ "nd", "nde", "North Ndebele" },
{ "ne", "nep", "Nepali" },
{ "ng", "ndo", "Ndonga" },
{ "nl", "dut", "Dutch" },
{ "nn", "nno", "Norwegian Nynorsk" },
{ "no", "nor", "Norwegian" },
{ "nr", "nbl", "South Ndebele" },
{ "nv", "nav", "Navajo" },
{ "ny", "nya", "Chichewa" },
{ "oc", "oci", "Occitan" },
{ "oj", "oji", "Ojibwa" },
{ "om", "orm", "Oromo" },
{ "or", "ori", "Oriya" },
{ "os", "oss", "Ossetian" },
{ "pa", "pan", "Panjabi" },
{ "pi", "pli", "Pali" },
{ "pl", "pol", "Polish" },
{ "ps", "pus", "Pashto" },
{ "pt", "por", "Portuguese" },
{ "qu", "que", "Quechua" },
{ "rm", "roh", "Raeto-Romance" },
{ "rn", "run", "Kirundi" },
{ "ro", "rum", "Romanian" },
{ "ru", "rus", "Russian" },
{ "rw", "kin", "Kinyarwanda" },
{ "sa", "san", "Sanskrit" },
{ "sc", "srd", "Sardinian" },
{ "sd", "snd", "Sindhi" },
{ "se", "sme", "Northern Sami" },
{ "sg", "sag", "Sango" },
{ "sh", "shr", "Serbo-Croatian" },
{ "si", "sin", "Sinhala" },
{ "sk", "slo", "Slovak" },
{ "sl", "slv", "Slovenian" },
{ "sm", "smo", "Samoan" },
{ "sn", "sna", "Shona" },
{ "so", "som", "Somali" },
{ "sq", "alb", "Albanian" },
{ "sr", "srp", "Serbian" },
{ "ss", "ssw", "Swati" },
{ "st", "sot", "Southern Sotho" },
{ "su", "sun", "Sundanese" },
{ "sv", "swe", "Swedish" },
{ "sw", "swa", "Swahili" },
{ "ta", "tam", "Tamil" },
{ "te", "tel", "Telugu" },
{ "tg", "tgk", "Tajik" },
{ "th", "tha", "Thai" },
{ "ti", "tir", "Tigrinya" },
{ "tk", "tuk", "Turkmen" },
{ "tl", "tgl", "Tagalog" },
{ "tn", "tsn", "Tswana" },
{ "to", "ton", "Tonga" },
{ "tr", "tur", "Turkish" },
{ "ts", "tso", "Tsonga" },
{ "tt", "tat", "Tatar" },
{ "tw", "twi", "Twi" },
{ "ty", "tah", "Tahitian" },
{ "ug", "uig", "Uighur" },
{ "uk", "ukr", "Ukrainian" },
{ "ur", "urd", "Urdu" },
{ "uz", "uzb", "Uzbek" },
{ "ve", "ven", "Venda" },
{ "vi", "vie", "Vietnamese" },
{ "vo", "vol", "Volapuk" },
{ "wa", "wln", "Walloon" },
{ "wo", "wol", "Wolof" },
{ "xh", "xho", "Xhosa" },
{ "yi", "yid", "Yiddish" },
{ "yo", "yor", "Yoruba" },
{ "za", "zha", "Zhuang" },
{ "zh", "chi", "Chinese" },
{ "zu", "zul", "Zulu" },
{ "", "" }
{ "", "", "" }
};
@ -232,6 +233,7 @@ public:
/// is case- and punctuation insensitive.
static quint32 findIdForLanguage( gd::wstring const & );
/// Finds the id for the given three-letter ISO 639-2B language code, or
/// returns 0 if it is not known. The comparison is case-insensitive.
static quint32 findIdForLanguageCode3( const char * );
static QPair<quint32,quint32> findIdsForFilename( QString const & );

22
xdxf.cc
View file

@ -21,6 +21,7 @@
#include "xdxf2html.hh"
#include "ufile.hh"
#include "dictzip.h"
#include "langcoder.hh"
#include <QIODevice>
#include <QXmlStreamReader>
@ -57,7 +58,7 @@ DEF_EX( exCorruptedIndex, "The index file is corrupted", Dictionary::Ex )
enum
{
Signature = 0x46584458, // XDXF on little-endian, FXDX on big-endian
CurrentFormatVersion = 1 + BtreeIndexing::FormatVersion + Folding::Version
CurrentFormatVersion = 2 + BtreeIndexing::FormatVersion + Folding::Version
};
enum ArticleFormat
@ -72,8 +73,8 @@ struct IdxHeader
uint32_t signature; // First comes the signature, XDXF
uint32_t formatVersion; // File format version (CurrentFormatVersion)
uint32_t articleFormat; // ArticleFormat value, except that 0 = bad file
char fromLang[ 4 ]; // 3-letter ISO-639.2 language code
char toLang[ 4 ]; // 3-letter ISO-639.2 language code
uint32_t langFrom; // Source language
uint32_t langTo; // Target language
uint32_t articleCount; // Total number of articles
uint32_t wordCount; // Total number of words
uint32_t nameAddress; // Address of an utf8 name string, in chunks
@ -128,7 +129,7 @@ public:
{ return map< Dictionary::Property, string >(); }
virtual unsigned long getArticleCount() throw()
{ return idxHeader.wordCount; }
{ return idxHeader.articleCount; }
/// Reports the total number of words, as stored in the index header.
virtual unsigned long getWordCount() throw()
{
  return idxHeader.wordCount;
}
@ -137,6 +138,12 @@ public:
virtual QIcon getNativeIcon() throw();
/// Reports the source language id, as stored in the index header.
inline virtual quint32 getLangFrom() const
{
  return idxHeader.langFrom;
}
/// Reports the target language id, as stored in the index header.
inline virtual quint32 getLangTo() const
{
  return idxHeader.langTo;
}
virtual sptr< Dictionary::DataRequest > getArticle( wstring const &,
vector< wstring > const & alts,
wstring const & )
@ -855,14 +862,14 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
if ( str.size() > 3 )
str.resize( 3 );
strcpy( idxHeader.fromLang, str.c_str() );
idxHeader.langFrom = LangCoder::findIdForLanguageCode3( str.c_str() );
str = stream.attributes().value( "lang_to" ).toString().toAscii().data();
if ( str.size() > 3 )
str.resize( 3 );
strcpy( idxHeader.toLang, str.c_str() );
idxHeader.langTo = LangCoder::findIdForLanguageCode3( str.c_str() );
bool isLogical = ( stream.attributes().value( "format" ) == "logical" );
@ -997,6 +1004,9 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
idxHeader.signature = Signature;
idxHeader.formatVersion = CurrentFormatVersion;
idxHeader.articleCount = articleCount;
idxHeader.wordCount = wordCount;
idx.rewind();
idx.write( &idxHeader, sizeof( idxHeader ) );