#include "lingualibre.hh"
#include "utf8.hh"
#include "audiolink.hh"

// NOTE(review): the four angle-bracket header names were missing from this
// copy of the file. They are restored here from the Qt types this translation
// unit actually uses -- confirm against the project's history.
#include <QCryptographicHash>
#include <QJsonArray>
#include <QJsonDocument>
#include <QJsonObject>
#include <QNetworkReply>

namespace Lingua {

using namespace Dictionary;

namespace {

/// Returns the table that maps an ISO 639-3 language code to the Wikidata item
/// id ("Q…") that is placed after "LL-" in the Commons file-title search.
///
/// The table is built once, on first use, and shared afterwards.
///
/// Data was obtained by this query on https://commons-query.wikimedia.org/
/// (NOTE(review): parts of the query, e.g. the SERVICE endpoints, appear to be
/// missing from this copy):
///
///   SELECT ?language ?languageLabel ?iso ?audios
///   WHERE {
///     {
///       SELECT ?language (COUNT(?audio) AS ?audios) WHERE {
///         ?audio                 # Filter: P2 'instance of' is Q2 'record'
///                wdt:P407 ?language .
///       }
///       GROUP BY ?language
///     }
///     SERVICE {
///       ?language wdt:P220 ?iso . # Assign value: P220 'ISO-639-3' into ?iso.
///     }
///     SERVICE {
///       SERVICE wikibase:label {
///         bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".
///         ?language rdfs:label ?languageLabel .
///       }
///     }
///   }
const map< string, string > & isoToWikipediaId()
{
  // "ell" used to be listed twice (Q36510 here and Q9129 near the end of the
  // list). A std::map keeps only one element per key and the initializer-list
  // constructor does not promise which duplicate survives, so the later
  // { "ell", "Q9129" } entry was dropped to make the lookup deterministic
  // (this keeps the first-listed value).
  static const map< string, string > table = {
    { "grc", "Q35497" },   { "non", "Q35505" },   { "ken", "Q35650" },   { "got", "Q35722" },
    { "gsc", "Q35735" },   { "ldn", "Q35757" },   { "csc", "Q35768" },   { "lns", "Q35788" },
    { "kab", "Q35853" },   { "ina", "Q35934" },   { "jam", "Q35939" },   { "mua", "Q36032" },
    { "mhk", "Q36068" },   { "mos", "Q36096" },   { "kmr", "Q36163" },   { "num", "Q36173" },
    { "mad", "Q36213" },   { "lin", "Q36217" },   { "mal", "Q36236" },   { "srr", "Q36284" },
    { "jbo", "Q36350" },   { "yas", "Q36358" },   { "kur", "Q36368" },   { "ell", "Q36510" },
    { "swl", "Q36558" },   { "gya", "Q36594" },   { "tvu", "Q36632" },   { "mlu", "Q36645" },
    { "tui", "Q36646" },   { "ota", "Q36730" },   { "rar", "Q36745" },   { "ckb", "Q36811" },
    { "tok", "Q36846" },   { "twi", "Q36850" },   { "vut", "Q36897" },   { "ybb", "Q36917" },
    { "bse", "Q36973" },   { "wls", "Q36979" },   { "lzh", "Q37041" },   { "ang", "Q42365" },
    { "ker", "Q56251" },   { "ary", "Q56426" },   { "sjn", "Q56437" },   { "hau", "Q56475" },
    { "arq", "Q56499" },   { "atj", "Q56590" },   { "mcn", "Q56668" },   { "tpw", "Q56944" },
    { "vls", "Q100103" },  { "gsw", "Q131339" },  { "hrx", "Q304049" },  { "lms", "Q427614" },
    { "sxu", "Q699284" },  { "pwn", "Q715755" },  { "tay", "Q715766" },  { "guc", "Q891085" },
    { "lnc", "Q942602" },  { "blc", "Q977808" },  { "avk", "Q1377116" }, { "sba", "Q2372207" },
    { "gcf", "Q3006280" }, { "far", "Q3067168" }, { "kld", "Q3111818" }, { "swh", "Q3197533" },
    { "rhg", "Q3241177" }, { "vsl", "Q3322064" }, { "xzh", "Q3437292" }, { "ane", "Q3571097" },
    { "kcg", "Q3912765" }, { "hav", "Q5684097" }, { "isu", "Q6089423" }, { "mdl", "Q6744816" },
    { "duf", "Q6983819" }, { "sru", "Q7646993" }, { "yat", "Q8048020" }, { "lem", "Q13479983" },
    { "mul", "Q20923490" }, { "cmn", "Q9192" },   { "vie", "Q9199" },    { "tha", "Q9217" },
    { "msa", "Q9237" },    { "ind", "Q9240" },    { "mon", "Q9246" },    { "tgk", "Q9260" },
    { "uzb", "Q9264" },    { "heb", "Q9288" },    { "aze", "Q9292" },    { "mkd", "Q9296" },
    { "bos", "Q9303" },    { "glg", "Q9307" },    { "cym", "Q9309" },    { "gla", "Q9314" },
    { "ben", "Q9610" },    { "tlh", "Q10134" },   { "bre", "Q12107" },   { "rcf", "Q13198" },
    { "xho", "Q13218" },   { "hsb", "Q13248" },   { "sms", "Q13271" },   { "nav", "Q13310" },
    { "min", "Q13324" },   { "mnw", "Q13349" },   { "ara", "Q13955" },   { "oci", "Q14185" },
    { "afr", "Q14196" },   { "sco", "Q14549" },   { "ase", "Q14759" },   { "pms", "Q15085" },
    { "fao", "Q25258" },   { "tat", "Q25285" },   { "cor", "Q25289" },   { "kal", "Q25355" },
    { "nds", "Q25433" },   { "fry", "Q27175" },   { "ace", "Q27683" },   { "ain", "Q27969" },
    { "aka", "Q28026" },   { "amh", "Q28244" },   { "anp", "Q28378" },   { "rup", "Q29316" },
    { "arz", "Q29919" },   { "myv", "Q29952" },   { "dag", "Q32238" },   { "dyu", "Q32706" },
    { "bfi", "Q33000" },   { "dua", "Q33013" },   { "ban", "Q33070" },   { "bas", "Q33093" },
    { "cos", "Q33111" },   { "bam", "Q33243" },   { "chy", "Q33265" },   { "shy", "Q33274" },
    { "bcl", "Q33284" },   { "gaa", "Q33287" },   { "fon", "Q33291" },   { "fil", "Q33298" },
    { "fsl", "Q33302" },   { "che", "Q33350" },   { "chr", "Q33388" },   { "fur", "Q33441" },
    { "smn", "Q33462" },   { "hat", "Q33491" },   { "syc", "Q33538" },   { "jav", "Q33549" },
    { "kas", "Q33552" },   { "haw", "Q33569" },   { "ibo", "Q33578" },   { "kik", "Q33587" },
    { "mnc", "Q33638" },   { "kan", "Q33673" },   { "krc", "Q33714" },   { "ory", "Q33810" },
    { "orm", "Q33864" },   { "mni", "Q33868" },   { "nso", "Q33890" },   { "sat", "Q33965" },
    { "scn", "Q33973" },   { "srd", "Q33976" },   { "srn", "Q33989" },   { "snd", "Q33997" },
    { "sun", "Q34002" },   { "pcd", "Q34024" },   { "ddo", "Q34033" },   { "tvl", "Q34055" },
    { "tgl", "Q34057" },   { "nmg", "Q34098" },   { "tsn", "Q34137" },   { "shi", "Q34152" },
    { "lua", "Q34173" },   { "rif", "Q34174" },   { "wln", "Q34219" },   { "wol", "Q34257" },
    { "bci", "Q35107" },   { "cak", "Q35115" },   { "ido", "Q35224" },   { "bbj", "Q35271" },
    { "bik", "Q35455" },   { "epo", "Q143" },     { "fra", "Q150" },     { "deu", "Q188" },
    { "tur", "Q256" },     { "isl", "Q294" },     { "lat", "Q397" },     { "ita", "Q652" },
    { "pol", "Q809" },     { "spa", "Q1321" },    { "fin", "Q1412" },    { "hin", "Q1568" },
    { "mar", "Q1571" },    { "eng", "Q1860" },    { "aym", "Q4627" },    { "guj", "Q5137" },
    { "por", "Q5146" },    { "que", "Q5218" },    { "jpn", "Q5287" },    { "tam", "Q5885" },
    { "hrv", "Q6654" },    { "cat", "Q7026" },    { "nld", "Q7411" },    { "rus", "Q7737" },
    { "swa", "Q7838" },    { "zho", "Q7850" },    { "ron", "Q7913" },    { "bul", "Q7918" },
    { "mlg", "Q7930" },    { "tel", "Q8097" },    { "yid", "Q8641" },    { "sqi", "Q8748" },
    { "eus", "Q8752" },    { "hye", "Q8785" },    { "ukr", "Q8798" },    { "swe", "Q9027" },
    { "dan", "Q9035" },    { "nor", "Q9043" },    { "ltz", "Q9051" },    { "ces", "Q9056" },
    { "slv", "Q9063" },    { "hun", "Q9067" },    { "est", "Q9072" },    { "bel", "Q9091" },
    { "gle", "Q9142" },    { "mlt", "Q9166" },    { "fas", "Q9168" },    { "kor", "Q9176" },
    { "yue", "Q9186" },
  };

  return table;
}

/// Dictionary entry that answers article requests with LinguaArticleRequest
/// objects (pronunciation recordings searched on Wikimedia Commons).
///
/// It holds no local articles or headwords: both counts are reported as 0 and
/// prefixMatch() returns an empty, "uncertain" result.
class LinguaDictionary: public Dictionary::Class
{
  string name;
  QString languageCode;    ///< Language code as configured; used as the key into isoToWikipediaId().
  QString langWikipediaID; ///< Matching "Q…" id; left empty when the code is not in the table.
  QNetworkAccessManager & netMgr;

public:

  /// @param id            dictionary id forwarded to Dictionary::Class
  /// @param name_         display name returned by getName()
  /// @param languageCode_ language code looked up in isoToWikipediaId()
  /// @param netMgr_       network manager handed to every article request
  LinguaDictionary( string const & id, string name_, QString languageCode_, QNetworkAccessManager & netMgr_ ):
    Dictionary::Class( id, vector< string >() ),
    name( std::move( name_ ) ),
    languageCode( std::move( languageCode_ ) ),
    netMgr( netMgr_ )
  {
    const auto & table = isoToWikipediaId();
    const auto found   = table.find( languageCode.toStdString() );
    if ( found != table.end() ) {
      langWikipediaID = QString::fromStdString( found->second );
    }
  }

  string getName() noexcept override
  {
    return name;
  }

  map< Property, string > getProperties() noexcept override
  {
    return {};
  }

  unsigned long getArticleCount() noexcept override
  {
    return 0;
  }

  unsigned long getWordCount() noexcept override
  {
    return 0;
  }

  /// Always returns an instantly finished request with no matches, flagged as
  /// uncertain.
  sptr< WordSearchRequest > prefixMatch( wstring const & /*word*/, unsigned long /*maxResults*/ ) override
  {
    sptr< WordSearchRequestInstant > sr = std::make_shared< WordSearchRequestInstant >();

    sr->setUncertain( true );

    return sr;
  }

  /// Starts a Commons lookup for @p word. Words of 50 or more characters are
  /// not queried; an instantly finished empty request is returned instead.
  sptr< DataRequest > getArticle( wstring const & word, vector< wstring > const & alts, wstring const &, bool ) override
  {
    if ( word.size() >= 50 ) {
      return std::make_shared< DataRequestInstant >( false );
    }

    return std::make_shared< LinguaArticleRequest >( word, alts, languageCode, langWikipediaID, getId(), netMgr );
  }

protected:

  /// Loads the bundled Lingua Libre icon once.
  void loadIcon() noexcept override
  {
    if ( dictionaryIconLoaded ) {
      return;
    }

    dictionaryIcon = dictionaryNativeIcon = QIcon( ":/icons/lingualibre.svg" );
    dictionaryIconLoaded                  = true;
  }
};

} // namespace
vector< sptr< Dictionary::Class > > makeDictionaries( Dictionary::Initializing &, Config::Lingua const & lingua, QNetworkAccessManager & mgr ) { vector< sptr< Dictionary::Class > > result; if( lingua.enable and !lingua.languageCodes.isEmpty() ) { QCryptographicHash hash( QCryptographicHash::Md5 ); hash.addData( "Lingua libre via Wiki Commons" ); result.push_back( std::make_shared< LinguaDictionary >( hash.result().toHex().data(), QString( "LinguaLibre" ).toUtf8().data(), lingua.languageCodes, mgr ) ); } return result; }; void LinguaArticleRequest::cancel() { finish(); } LinguaArticleRequest::LinguaArticleRequest( const wstring & str, const vector< wstring > & alts, const QString & languageCode_, const QString & langWikipediaID, const string & dictionaryId_, QNetworkAccessManager & mgr ): languageCode(languageCode_), langWikipediaID(langWikipediaID) { connect( &mgr, &QNetworkAccessManager::finished, this, &LinguaArticleRequest::requestFinished, Qt::QueuedConnection ); addQuery( mgr, str ); } void LinguaArticleRequest::addQuery( QNetworkAccessManager & mgr, const wstring & word ) { // Doc of the QString reqUrl = R"(https://commons.wikimedia.org/w/api.php?)" R"(action=query)" R"(&format=json)" R"(&prop=imageinfo)" R"(&generator=search)" R"(&iiprop=url)" R"(&iimetadataversion=1)" R"(&iiextmetadatafilter=Categories)" R"(&gsrsearch=intitle:LL-%1 \(%2\)-.*-%3\.wav/)" // https://en.wikipedia.org/wiki/Help:Searching/Regex R"(&gsrnamespace=6)" R"(&gsrlimit=10)" R"(&gsrwhat=text)"; reqUrl = reqUrl.arg(langWikipediaID,languageCode,QString::fromStdU32String( word ) ); qDebug()<< "lingualibre query " << reqUrl; auto netRequest = QNetworkRequest( reqUrl ); netRequest.setTransferTimeout(3000); auto netReply = std::shared_ptr< QNetworkReply >( mgr.get(netRequest)); netReplies.emplace_back( netReply, Utf8::encode( word ) ); } void LinguaArticleRequest::requestFinished( QNetworkReply * r ) { qDebug() << "Lingua query finished "; sptr< QNetworkReply > netReply = 
netReplies.front().reply; if ( isFinished() ) { return; } if ( !netReply->isFinished() || netReply->error() != QNetworkReply::NoError ) { qWarning()<< "Lingua query failed: " << netReply->error(); cancel(); return; } QJsonObject resultJson = QJsonDocument::fromJson( netReply->readAll() ).object(); /* Code below is to process returned json: { "batchcomplete": "", "query": { "pages": { "88511149": { "pageid": 88511149, "ns": 6, "title": "File:LL-Q1860 (eng)-Back ache-nice.wav", "index": 2, "imagerepository": "local", "imageinfo": [ { "url": "https://upload.wikimedia.org/wikipedia/commons/6/6a/LL-Q1860_%28eng%29-Back_ache-nice.wav", "descriptionurl": "https://commons.wikimedia.org/wiki/File:LL-Q1860_(eng)-Back_ache-nice.wav", "descriptionshorturl": "https://commons.wikimedia.org/w/index.php?curid=88511149" } ] }, "73937351": { "pageid": 73937351, "ns": 6, "title": "File:LL-Q1860 (eng)-Nattes à chat-nice.wav", "index": 1, "imagerepository": "local", "imageinfo": [ { "url": "https://upload.wikimedia.org/wikipedia/commons/b/b0/LL-Q1860_%28eng%29-Nattes_%C3%A0_chat-nice.wav", "descriptionurl": "https://commons.wikimedia.org/wiki/File:LL-Q1860_(eng)-Nattes_%C3%A0_chat-nice.wav", "descriptionshorturl": "https://commons.wikimedia.org/w/index.php?curid=73937351" } ] } } } } */ if( resultJson.contains( "query" ) ) { string articleBody = "

"; for( auto pageJsonVal : resultJson[ "query" ].toObject()[ "pages" ].toObject() ) { auto pageJsonObj = pageJsonVal.toObject(); string title = pageJsonObj[ "title" ].toString().toHtmlEscaped().toStdString(); string audiolink = pageJsonObj[ "imageinfo" ].toArray().at( 0 ).toObject()[ "url" ].toString().toHtmlEscaped().toStdString(); articleBody += addAudioLink( "\""+audiolink+"\"", dictionaryId ); articleBody += R"()"; articleBody += R"(Play)"; articleBody += title; articleBody += "
"; } articleBody += "

"; QMutexLocker _( &dataMutex ); size_t prevSize = data.size(); data.resize( prevSize + articleBody.size() ); memcpy( &data.front() + prevSize, articleBody.data(), articleBody.size() ); hasAnyData = true; finish(); } else { hasAnyData = false; finish(); } } } // end namespace Lingua