/* This file is (c) 2008-2012 Konstantin Isakov
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "dsl.hh"
#include "dsl_details.hh"
#include "btreeidx.hh"
#include "folding.hh"
#include "utf8.hh"
#include "chunkedstorage.hh"
#include "dictzip.hh"
#include "htmlescape.hh"
#include "iconv.hh"
#include "filetype.hh"
#include "audiolink.hh"
#include "langcoder.hh"
#include "wstring_qt.hh"
#include "indexedzip.hh"
#include "gddebug.hh"
#include "tiff.hh"
#include "ftshelpers.hh"
#include
" ) == 0 )
articleText.insert( articleText.size() - 4, " " + button );
else
articleText += button;
}
articleText += articleAfter;
}
catch( std::exception &ex )
{
gdWarning( "DSL: Failed loading article from \"%s\", reason: %s\n", dict.getName().c_str(), ex.what() );
articleText = string( "" )
+ QObject::tr( "Article loading error" ).toStdString()
+ "";
}
QMutexLocker _( &dataMutex );
data.resize( data.size() + articleText.size() );
memcpy( &data.front() + data.size() - articleText.size(),
articleText.data(), articleText.size() );
hasAnyData = true;
}
finish();
}
/// Creates the request object that looks up the article(s) for `word` and
/// its alternative spellings `alts` in this dictionary.
///
/// The third parameter is accepted only to satisfy the interface and is
/// ignored. `ignoreDiacritics` is forwarded unchanged to the request.
// NOTE(review): the template argument was missing here; it is restored as
// DslArticleRequest by analogy with DslResourceRequest -- confirm it matches
// the article request class declared earlier in this file.
sptr< Dictionary::DataRequest > DslDictionary::getArticle( wstring const & word,
                                                           vector< wstring > const & alts,
                                                           wstring const &,
                                                           bool ignoreDiacritics )
{
  return std::make_shared< DslArticleRequest >( word, alts, *this, ignoreDiacritics );
}
//// DslDictionary::getResource()
/// Request that loads one resource file belonging to a DSL dictionary.
/// The work is handed to QtConcurrent as soon as the object is constructed;
/// the destructor flags cancellation and then blocks until that task is done.
class DslResourceRequest: public Dictionary::DataRequest
{
  DslDictionary & dict;
  string resourceName;

  QAtomicInt isCancelled;   // becomes non-zero once cancel() or the destructor ran
  QSemaphore hasExited;     // not used by any code in this class
  QFuture< void > pendingRun; // handle of the task started by the constructor

public:

  DslResourceRequest( DslDictionary & dict_, string const & resourceName_ ):
    dict( dict_ ),
    resourceName( resourceName_ )
  {
    pendingRun = QtConcurrent::run( [ this ] { run(); } );
  }

  /// Body of the background task; defined out of line.
  void run();

  /// Only raises the cancellation flag; run() checks it before starting work.
  void cancel() override
  {
    isCancelled.ref();
  }

  ~DslResourceRequest()
  {
    // Flag cancellation first so a task that has not started yet bails out
    // early, then wait for it so it never touches a destroyed object.
    isCancelled.ref();
    pendingRun.waitForFinished();
  }
};
void DslResourceRequest::run()
{
// Some runnables linger enough that they are cancelled before they start
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
if ( dict.ensureInitDone().size() )
{
setErrorString( QString::fromUtf8( dict.ensureInitDone().c_str() ) );
finish();
return;
}
string n = dict.getContainingFolder().toStdString() + Utils::Fs::separator() + resourceName;
GD_DPRINTF( "n is %s\n", n.c_str() );
try
{
try
{
QMutexLocker _( &dataMutex );
File::loadFromFile( n, data );
}
catch( File::exCantOpen & )
{
n = dict.getResourceDir1() + resourceName;
try {
QMutexLocker _( &dataMutex );
File::loadFromFile( n, data );
}
catch( File::exCantOpen & )
{
n = dict.getResourceDir2() + resourceName;
try
{
QMutexLocker _( &dataMutex );
File::loadFromFile( n, data );
}
catch( File::exCantOpen & )
{
// Try reading from zip file
if ( dict.resourceZip.isOpen() )
{
QMutexLocker _( &dataMutex );
if ( !dict.resourceZip.loadFile( Utf8::decode( resourceName ), data ) )
throw; // Make it fail since we couldn't read the archive
}
else
throw;
}
}
}
if ( Filetype::isNameOfTiff( resourceName ) )
{
// Convert it
QMutexLocker _( &dataMutex );
GdTiff::tiff2img( data );
}
QMutexLocker _( &dataMutex );
hasAnyData = true;
}
catch( std::exception &ex )
{
gdWarning( "DSL: Failed loading resource \"%s\" for \"%s\", reason: %s\n",
resourceName.c_str(), dict.getName().c_str(), ex.what() );
// Resource not loaded -- we don't set the hasAnyData flag then
}
finish();
}
/// Creates a DslResourceRequest for the resource called `name`.
/// The request's constructor starts the loading task, so the returned
/// object is already running.
sptr< Dictionary::DataRequest > DslDictionary::getResource( string const & name )
{
  // The template argument must be explicit: make_shared cannot deduce the
  // type to construct from the constructor arguments.
  return std::make_shared< DslResourceRequest >( *this, name );
}
/// Builds a full-text-search request over this dictionary.
/// All arguments are passed through unchanged to FtsHelpers::FTSResultsRequest.
sptr< Dictionary::DataRequest > DslDictionary::getSearchResults( QString const & searchString,
                                                                 int searchMode,
                                                                 bool matchCase,
                                                                 bool ignoreDiacritics )
{
  auto ftsRequest = std::make_shared< FtsHelpers::FTSResultsRequest >(
    *this, searchString, searchMode, matchCase, ignoreDiacritics );

  return ftsRequest;
}
} // anonymous namespace
/// makeDictionaries
/// Creates a DslDictionary for every `.dsl` / `.dsl.dz` file in `fileNames`,
/// building (or rebuilding) its index file under `indicesDir` when
/// Dictionary::needToRebuildIndex() or indexIsOldOrBad() says so.
///
/// Files named `*_abrv.dsl[.dz]` are never loaded as dictionaries on their
/// own; a companion abbreviation file and a companion `*.files.zip` resource
/// archive are picked up automatically when present.
///
/// @param fileNames        candidate files; anything without a DSL suffix is ignored
/// @param indicesDir       prefix prepended to the dictionary id to form the index path
/// @param initializing     receives indexingDictionary() with the dictionary name
///                         before an index is built
/// @param maxPictureWidth  forwarded to the DslDictionary constructor
/// @param maxHeadwordSize  forwarded to IndexedWords::addWord()
/// @return the dictionaries that were created. A file whose processing throws
///         std::exception is logged (with the line reached) and skipped.
vector< sptr< Dictionary::Class > > makeDictionaries(
                                      vector< string > const & fileNames,
                                      string const & indicesDir,
                                      Dictionary::Initializing & initializing,
                                      int maxPictureWidth, unsigned int maxHeadwordSize )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  for ( const auto & fileName : fileNames ) {
    // Try .dsl and .dsl.dz suffixes
    bool uncompressedDsl = ( fileName.size() >= 4 &&
                             strcasecmp( fileName.c_str() + ( fileName.size() - 4 ), ".dsl" ) == 0 );
    if ( !uncompressedDsl &&
         ( fileName.size() < 7 ||
           strcasecmp( fileName.c_str() + ( fileName.size() - 7 ), ".dsl.dz" ) != 0 ) )
      continue;

    // Make sure it's not an abbreviation file
    int extSize = ( uncompressedDsl ? 4 : 7 );
    if ( fileName.size() - extSize >= 5 &&
         strncasecmp( fileName.c_str() + fileName.size() - extSize - 5, "_abrv", 5 ) == 0 )
    {
      // It is, skip it
      continue;
    }

    unsigned atLine = 0; // Indicates current line in .dsl, for debug purposes

    try
    {
      vector< string > dictFiles( 1, fileName );

      // Check if there is an 'abrv' file present.
      // The suffix length is already known, so strip exactly that much.
      string baseName( fileName, 0, fileName.size() - extSize );

      string abrvFileName;

      if ( File::tryPossibleName( baseName + "_abrv.dsl", abrvFileName ) ||
           File::tryPossibleName( baseName + "_abrv.dsl.dz", abrvFileName ) ||
           File::tryPossibleName( baseName + "_ABRV.DSL", abrvFileName ) ||
           File::tryPossibleName( baseName + "_ABRV.DSL.DZ", abrvFileName ) ||
           File::tryPossibleName( baseName + "_ABRV.DSL.dz", abrvFileName ) )
        dictFiles.push_back( abrvFileName );

      string dictId = Dictionary::makeDictionaryId( dictFiles );

      // See if there's a zip file with resources present. If so, include it.
      // Note that it is added after the id has been computed.
      string zipFileName;

      if ( File::tryPossibleZipName( baseName + ".dsl.files.zip", zipFileName ) ||
           File::tryPossibleZipName( baseName + ".dsl.dz.files.zip", zipFileName ) ||
           File::tryPossibleZipName( baseName + ".DSL.FILES.ZIP", zipFileName ) ||
           File::tryPossibleZipName( baseName + ".DSL.DZ.FILES.ZIP", zipFileName ) )
        dictFiles.push_back( zipFileName );

      string indexFile = indicesDir + dictId;

      if ( Dictionary::needToRebuildIndex( dictFiles, indexFile ) ||
           indexIsOldOrBad( indexFile, zipFileName.size() ) )
      {
        DslScanner scanner( fileName );

        try { // Here we intercept any errors during the read to save line at
              // which the incident happened. We need alive scanner for that.

          if( scanner.getDictionaryName() == U"Abbrev" )
            continue; // For now just skip abbreviations

          // Building the index
          initializing.indexingDictionary( Utf8::encode( scanner.getDictionaryName() ) );

          gdDebug( "Dsl: Building the index for dictionary: %s\n",
                   QString::fromStdU32String( scanner.getDictionaryName() ).toUtf8().data() );

          File::Class idx( indexFile, "wb" );

          IdxHeader idxHeader;

          memset( &idxHeader, 0, sizeof( idxHeader ) );

          // We write a dummy header first. At the end of the process the header
          // will be rewritten with the right values.

          idx.write( idxHeader );

          string dictionaryName = Utf8::encode( scanner.getDictionaryName() );

          idx.write( (uint32_t)dictionaryName.size() );
          idx.write( dictionaryName.data(), dictionaryName.size() );

          string soundDictName = Utf8::encode( scanner.getSoundDictionaryName() );
          if( !soundDictName.empty() )
          {
            idxHeader.hasSoundDictionaryName = 1;
            idx.write( (uint32_t)soundDictName.size() );
            idx.write( soundDictName.data(), soundDictName.size() );
          }

          idxHeader.dslEncoding = scanner.getEncoding();

          IndexedWords indexedWords;

          ChunkedStorage::Writer chunks( idx );

          // Read the abbreviations

          if ( abrvFileName.size() )
          {
            try
            {
              DslScanner abrvScanner( abrvFileName );

              map< string, string > abrv;

              wstring curString;
              size_t curOffset;

              for( ; ; )
              {
                // Skip any whitespace
                if ( !abrvScanner.readNextLineWithoutComments( curString, curOffset, true ) )
                  break;
                if ( curString.empty() || isDslWs( curString[ 0 ] ) )
                  continue;

                list< wstring > keys;

                bool eof = false;

                // Insert the key and read more, or get to the definition
                for( ; ; )
                {
                  processUnsortedParts( curString, true );

                  if ( keys.size() )
                    expandTildes( curString, keys.front() );

                  expandOptionalParts( curString, &keys );

                  if ( !abrvScanner.readNextLineWithoutComments( curString, curOffset ) || curString.empty() )
                  {
                    gdWarning( "Premature end of file %s\n", abrvFileName.c_str() );
                    eof = true;
                    break;
                  }

                  // A line starting with whitespace is the definition
                  if ( isDslWs( curString[ 0 ] ) )
                    break;
                }

                if ( eof )
                  break;

                curString.erase( 0, curString.find_first_not_of( U" \t" ) );

                if ( keys.size() )
                  expandTildes( curString, keys.front() );

                // If the string has any dsl markup, we strip it
                string value = Utf8::encode( ArticleDom( curString ).root.renderAsText() );

                for ( auto & key : keys ) {
                  unescapeDsl( key );
                  normalizeHeadword( key );

                  abrv[ Utf8::encode( Folding::trimWhitespace( key ) ) ] = value;
                }
              }

              idxHeader.hasAbrv = 1;
              idxHeader.abrvAddress = chunks.startNewBlock();

              // Block layout: entry count, then for each entry
              // key length, key bytes, value length, value bytes.
              uint32_t sz = abrv.size();

              chunks.addToBlock( &sz, sizeof( uint32_t ) );

              for ( const auto & i : abrv ) {
                // GD_DPRINTF( "%s:%s\n", i->first.c_str(), i->second.c_str() );

                sz = i.first.size();
                chunks.addToBlock( &sz, sizeof( uint32_t ) );
                chunks.addToBlock( i.first.data(), sz );
                sz = i.second.size();
                chunks.addToBlock( &sz, sizeof( uint32_t ) );
                chunks.addToBlock( i.second.data(), sz );
              }
            }
            catch( std::exception & e )
            {
              // A broken abbreviation file does not abort indexing of the dictionary
              gdWarning( "Error reading abrv file \"%s\", error: %s. Skipping it.\n",
                         abrvFileName.c_str(), e.what() );
            }
          }

          // True when curString already holds a line that was read ahead
          // and still has to be interpreted as a headword.
          bool hasString = false;
          wstring curString;
          size_t curOffset;

          uint32_t articleCount = 0, wordCount = 0;

          for( ; ; )
          {
            // Find the main headword

            if ( !hasString && !scanner.readNextLineWithoutComments( curString, curOffset, true) )
              break; // Clean end of file

            hasString = false;

            // The line read should either consist of pure whitespace, or be a
            // headword. Lines longer than 100 characters are skipped as well:
            // they are never treated as headwords.

            if( curString.empty() || curString.size() > 100 )
              continue;

            if ( isDslWs( curString[ 0 ] ) )
            {
              // The first character is blank. Let's make sure that all other
              // characters are blank, too.
              for( size_t x = 1; x < curString.size(); ++x )
              {
                if ( !isDslWs( curString[ x ] ) )
                {
                  // %lX expects unsigned long; size_t is a different type on
                  // some targets, so convert explicitly.
                  gdWarning( "Garbage string in %s at offset 0x%lX\n", fileName.c_str(),
                             static_cast< unsigned long >( curOffset ) );
                  break;
                }
              }
              continue;
            }

            // Ok, got the headword

            list< wstring > allEntryWords;

            processUnsortedParts( curString, true );
            expandOptionalParts( curString, &allEntryWords );

            uint32_t articleOffset = curOffset;

            //GD_DPRINTF( "Headword: %ls\n", curString.c_str() );

            // More headwords may follow

            for( ; ; )
            {
              if ( ! ( hasString = scanner.readNextLineWithoutComments( curString, curOffset ) ) )
              {
                gdWarning( "Premature end of file %s\n", fileName.c_str() );
                break;
              }

              // Lingvo skips empty strings between the headwords
              if ( curString.empty() )
                continue;

              if ( isDslWs( curString[ 0 ] ) )
                break; // No more headwords

#ifdef QT_DEBUG
              qDebug() << "Alt headword" << QString::fromStdU32String( curString );
#endif

              processUnsortedParts( curString, true );
              expandTildes( curString, allEntryWords.front() );
              expandOptionalParts( curString, &allEntryWords );
            }

            if ( !hasString )
              break;

            // Insert new entry

            uint32_t descOffset = chunks.startNewBlock();

            chunks.addToBlock( &articleOffset, sizeof( articleOffset ) );

            for ( auto & allEntryWord : allEntryWords ) {
              unescapeDsl( allEntryWord );
              normalizeHeadword( allEntryWord );
              indexedWords.addWord( allEntryWord, descOffset, maxHeadwordSize );
            }

            ++articleCount;
            wordCount += allEntryWords.size();

            // State for embedded ('@') cards found inside the article body
            int insideInsided = 0;
            wstring headword;
            QVector< InsidedCard > insidedCards;
            uint32_t offset = curOffset;
            QVector< wstring > insidedHeadwords;
            unsigned linesInsideCard = 0;
            int dogLine = 0;
            bool wasEmptyLine = false;
            int headwordLine = scanner.getLinesRead() - 2;
            bool noSignificantLines = Folding::applyWhitespaceOnly( curString ).empty();
            // curString already holds the first body line; process it before
            // reading another one unless it carries nothing but whitespace.
            bool haveLine = !noSignificantLines;

            // Skip the article's body
            for( ; ; )
            {
              hasString = haveLine ? true : scanner.readNextLineWithoutComments( curString, curOffset);
              haveLine = false;

              // End of file or a line starting in column 0 ends the article
              if ( !hasString || ( curString.size() && !isDslWs( curString[ 0 ] ) ) )
              {
                if( insideInsided )
                {
                  gdWarning( "Unclosed tag '@' at line %i", dogLine );
                  insidedCards.append( InsidedCard( offset, curOffset - offset, insidedHeadwords ) );
                }
                if( noSignificantLines )
                  gdWarning( "Orphan headword at line %i", headwordLine );

                break;
              }

              // Check for orphan strings

              if( curString.empty() )
              {
                wasEmptyLine = true;
                continue;
              }
              else
              {
                if( wasEmptyLine && !Folding::applyWhitespaceOnly( curString ).empty() )
                  gdWarning( "Orphan string at line %i", scanner.getLinesRead() - 1 );
              }

              if( noSignificantLines )
                noSignificantLines = Folding::applyWhitespaceOnly( curString ).empty();

              // Find embedded cards

              // The line is non-empty and starts with whitespace here, so a
              // found '@' is never at index 0 and n - 1 is a valid index.
              wstring::size_type n = curString.find( L'@' );
              if( n == wstring::npos || curString[ n - 1 ] == L'\\' )
              {
                if( insideInsided )
                  linesInsideCard++;

                continue;
              }
              else
              {
                // Embedded card tag must be placed at first position in line after spaces
                if( !isAtSignFirst( curString ) )
                {
                  gdWarning( "Unescaped '@' symbol at line %i", scanner.getLinesRead() - 1 );

                  if( insideInsided )
                    linesInsideCard++;

                  continue;
                }
              }

              dogLine = scanner.getLinesRead() - 1;

              // Handle embedded card

              if( insideInsided )
              {
                if( linesInsideCard )
                {
                  // The previous card had a body: close it and start a new one.
                  // Otherwise this '@' line just adds another headword to it.
                  insidedCards.append( InsidedCard( offset, curOffset - offset, insidedHeadwords ) );

                  insidedHeadwords.clear();
                  linesInsideCard = 0;
                  offset = curOffset;
                }
              }
              else
              {
                offset = curOffset;
                linesInsideCard = 0;
              }

              headword = Folding::trimWhitespace( curString.substr( n + 1 ) );

              if( !headword.empty() )
              {
                processUnsortedParts( headword, true );
                expandTildes( headword, allEntryWords.front() );
                insidedHeadwords.append( headword );
                insideInsided = true;
              }
              else
                insideInsided = false; // A bare '@' line leaves the embedded card
            }

            // Now that we're having read the first string after the article
            // itself, we can use its offset to calculate the article's size.
            // An end of file works here, too.

            uint32_t articleSize = ( curOffset - articleOffset );

            chunks.addToBlock( &articleSize, sizeof( articleSize ) );

            // Every embedded card gets its own block and counts as an article
            for ( auto & insidedCard : insidedCards ) {
              uint32_t desc_offset = chunks.startNewBlock();
              chunks.addToBlock( &insidedCard.offset, sizeof( insidedCard.offset ) );
              chunks.addToBlock( &insidedCard.size, sizeof( insidedCard.size ) );

              for ( auto & hw : insidedCard.headwords ) {
                allEntryWords.clear();
                expandOptionalParts( hw, &allEntryWords );

                for ( auto & allEntryWord : allEntryWords ) {
                  unescapeDsl( allEntryWord );
                  normalizeHeadword( allEntryWord );
                  indexedWords.addWord( allEntryWord, desc_offset, maxHeadwordSize );
                }

                wordCount += allEntryWords.size();
              }
              ++articleCount;
            }

            if ( !hasString )
              break;
          }

          // Finish with the chunks

          idxHeader.chunksOffset = chunks.finish();

          // Build index

          IndexInfo idxInfo = BtreeIndexing::buildIndex( indexedWords, idx );

          idxHeader.indexBtreeMaxElements = idxInfo.btreeMaxElements;
          idxHeader.indexRootOffset = idxInfo.rootOffset;

          indexedWords.clear(); // Release memory -- no need for this data

          // If there was a zip file, index it too

          if ( zipFileName.size() )
          {
            GD_DPRINTF( "Indexing zip file\n" );

            idxHeader.hasZipFile = 1;

            IndexedWords zipFileNames;
            IndexedZip zipFile;
            if ( zipFile.openZipFile( QDir::fromNativeSeparators( zipFileName.c_str() ) ) )
              zipFile.indexFile( zipFileNames );

            if( !zipFileNames.empty() )
            {
              // Build the resulting zip file index

              IndexInfo zipIdxInfo = BtreeIndexing::buildIndex( zipFileNames, idx );

              idxHeader.zipIndexBtreeMaxElements = zipIdxInfo.btreeMaxElements;
              idxHeader.zipIndexRootOffset = zipIdxInfo.rootOffset;
            }
            else
            {
              // Bad zip file -- no index (though the mark that we have one
              // remains)
              idxHeader.zipIndexBtreeMaxElements = 0;
              idxHeader.zipIndexRootOffset = 0;
            }
          }
          else
            idxHeader.hasZipFile = 0;

          // That concludes it. Update the header.

          idxHeader.signature = Signature;
          idxHeader.formatVersion = CurrentFormatVersion;
          idxHeader.zipSupportVersion = CurrentZipSupportVersion;

          idxHeader.articleCount = articleCount;
          idxHeader.wordCount = wordCount;

          idxHeader.langFrom = dslLanguageToId( scanner.getLangFrom() );
          idxHeader.langTo = dslLanguageToId( scanner.getLangTo() );

          idx.rewind();

          idx.write( &idxHeader, sizeof( idxHeader ) );

        } // In-place try for saving line count
        catch( ... )
        {
          // Remember how far the scanner got, then let the outer handler report it
          atLine = scanner.getLinesRead();
          throw;
        }

      } // if need to rebuild

      // The template argument must be explicit: make_shared cannot deduce
      // the type to construct from the constructor arguments.
      dictionaries.push_back( std::make_shared< DslDictionary >( dictId,
                                                                 indexFile,
                                                                 dictFiles,
                                                                 maxPictureWidth ) );
    }
    catch( std::exception & e )
    {
      gdWarning( "DSL dictionary reading failed: %s:%u, error: %s\n",
                 fileName.c_str(), atLine, e.what() );
    }
  }

  return dictionaries;
}
}