goldendict-ng/hunspell.cc
shenleban tongying e66f97f7db refactor: remove FsEncoding::encode/decode
They never serve any purpose other than converting between std::string <> QString <> char*.
2023-04-14 07:27:59 +08:00

871 lines
21 KiB
C++

/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#include "hunspell.hh"
#include "utf8.hh"
#include "htmlescape.hh"
#include "iconv.hh"
#include "folding.hh"
#include "wstring_qt.hh"
#include "language.hh"
#include "langcoder.hh"
#include <QRunnable>
#include <QThreadPool>
#include <QSemaphore>
#if (QT_VERSION >= QT_VERSION_CHECK(6,0,0))
#include <QtCore5Compat/QRegExp>
#else
#include <QRegExp>
#endif
#include <QDir>
#include <QCoreApplication>
#include <QFileInfo>
#include <set>
#ifndef INCLUDE_LIBRARY_PATH
#include <hunspell.hxx>
#else
#include <hunspell/hunspell.hxx>
#endif
#include "gddebug.hh"
#include "fsencoding.hh"
#include "utils.hh"
namespace HunspellMorpho {
using namespace Dictionary;
using gd::wchar;
namespace {
/// Morphology "dictionary" backed by one Hunspell instance.
///
/// It reports no articles or words of its own (both counts are 0); each
/// lookup method instead creates an asynchronous request object that queries
/// the Hunspell instance.
class HunspellDictionary: public Dictionary::Class
{
  // Declaration order matters: the constructor initialises name before hunspell.
  string name;
  Hunspell hunspell;

#ifdef Q_OS_WIN32
  /// Converts a UTF-8 encoded file name to the local 8-bit encoding; used for
  /// the paths handed to the Hunspell constructor on Windows builds.
  static string Utf8ToLocal8Bit( string const & name )
  {
    QByteArray const local8Bit = QString::fromUtf8( name.c_str() ).toLocal8Bit();
    return string( local8Bit.data() );
  }
#endif

public:

  /// @param id     dictionary id forwarded to Dictionary::Class
  /// @param name_  display name returned by getName()
  /// @param files  files[ 0 ] should be .aff file, files[ 1 ] should be .dic file.
  HunspellDictionary( string const & id,
                      string const & name_,
                      vector< string > const & files ):
    Dictionary::Class( id, files ),
    name( name_ ),
#ifdef Q_OS_WIN32
    hunspell( Utf8ToLocal8Bit( files[ 0 ] ).c_str(), Utf8ToLocal8Bit( files[ 1 ] ).c_str() )
#else
    hunspell( files[ 0 ].c_str(), files[ 1 ].c_str() )
#endif
  {
  }

  /// Returns the display name given at construction.
  string getName() noexcept override
  {
    return name;
  }

  /// This dictionary exposes no properties; the map is always empty.
  map< Property, string > getProperties() noexcept override
  {
    return {};
  }

  /// Always 0.
  unsigned long getArticleCount() noexcept override
  {
    return 0;
  }

  /// Always 0.
  unsigned long getWordCount() noexcept override
  {
    return 0;
  }

  /// Starts a HunspellPrefixMatchRequest for the word; maxResults is ignored.
  sptr< WordSearchRequest > prefixMatch( wstring const &, unsigned long maxResults ) override;

  /// Starts a HunspellHeadwordsRequest for the word.
  sptr< WordSearchRequest > findHeadwordsForSynonym( wstring const & ) override;

  /// Starts a HunspellArticleRequest for the word; the remaining arguments
  /// are ignored.
  sptr< DataRequest > getArticle( wstring const &,
                                  vector< wstring > const & alts,
                                  wstring const &,
                                  bool ) override;

  bool isLocalDictionary() override
  {
    return true;
  }

  /// Returns alternative writings for whitespace-containing expressions and
  /// an empty list for single words.
  vector< wstring > getAlternateWritings( const wstring & word ) noexcept override;

protected:

  void loadIcon() noexcept override;

private:

  /// Returns the one mutex shared by all Hunspell calls of all dictionaries.
  ///
  /// There used to be a separate mutex per Hunspell instance, on the
  /// assumption that the library was reentrant (though probably not
  /// thread-safe). Crashes were later discovered when several Hunspell
  /// dictionaries were used simultaneously, so a single process-wide mutex is
  /// used instead - evidently the library is not really reentrant.
  static Mutex & getHunspellMutex()
  {
    static Mutex mutex;
    return mutex;
  }
};
/// Converts a wide string into the byte string that is handed to the given
/// hunspell object, using the dictionary encoding reported by
/// hunspell.get_dic_encoding(). May throw Iconv::Ex.
string encodeToHunspell( Hunspell &, wstring const & );
/// Converts a NUL-terminated string returned by the hunspell object back into
/// a wide string, using the dictionary encoding reported by
/// hunspell.get_dic_encoding(). May throw Iconv::Ex.
wstring decodeFromHunspell( Hunspell &, char const * );
/// Runs hunspell's morphological analysis on the word and returns the stems
/// (the "st:" fields of the analysis) that differ from the word when compared
/// case-insensitively. hunspellMutex is locked for the duration of the
/// hunspell calls; Iconv::Ex is caught and logged inside the function.
QVector< wstring > suggest( wstring & word, Mutex & hunspellMutex,
Hunspell & hunspell );
/// Generates suggestions for a compound (multi-word) expression by combining
/// each word with its stems. The previous contents of suggestions are
/// discarded; the trimmed input expression itself is never part of the output.
void getSuggestionsForExpression( wstring const & expression,
vector< wstring > & suggestions,
Mutex & hunspellMutex,
Hunspell & hunspell );
/// Tells whether the string has at least one character that
/// Folding::isWhitespace() classifies as whitespace. The scan walks the
/// c_str() representation and therefore ends at the first NUL character.
bool containsWhitespace( wstring const & str )
{
  wchar const * cursor = str.c_str();

  while ( *cursor )
  {
    if ( Folding::isWhitespace( *cursor ) )
      return true;

    ++cursor;
  }

  return false;
}
void HunspellDictionary::loadIcon() noexcept
{
if ( dictionaryIconLoaded )
return;
QString fileName = QDir::fromNativeSeparators( getDictionaryFilenames()[ 0 ].c_str() );
// Remove the extension
fileName.chop( 3 );
if( !loadIconFromFile( fileName ) )
{
// Load failed -- use default icons
dictionaryNativeIcon = dictionaryIcon = QIcon(":/icons/icon32_hunspell.png");
}
dictionaryIconLoaded = true;
}
/// Returns alternative writings of a multi-word expression.
///
/// Single words (no whitespace) yield an empty list; for anything else the
/// work is delegated to getSuggestionsForExpression() under the shared
/// Hunspell mutex.
vector< wstring > HunspellDictionary::getAlternateWritings( wstring const & word ) noexcept
{
  vector< wstring > alternates;

  if ( !containsWhitespace( word ) )
    return alternates;

  getSuggestionsForExpression( word, alternates, getHunspellMutex(), hunspell );
  return alternates;
}
/// HunspellDictionary::getArticle()
class HunspellArticleRequest;
class HunspellArticleRequestRunnable: public QRunnable
{
HunspellArticleRequest & r;
QSemaphore & hasExited;
public:
HunspellArticleRequestRunnable( HunspellArticleRequest & r_,
QSemaphore & hasExited_ ): r( r_ ),
hasExited( hasExited_ )
{}
~HunspellArticleRequestRunnable()
{
hasExited.release();
}
void run() override;
};
/// Asynchronous request that produces the "spelling suggestions" article for
/// one word. The work itself happens in run(), which is executed through a
/// HunspellArticleRequestRunnable queued on the global thread pool.
class HunspellArticleRequest: public Dictionary::DataRequest
{
  friend class HunspellArticleRequestRunnable;

  Mutex & hunspellMutex;  // guards every call into hunspell
  Hunspell & hunspell;
  wstring word;           // the word as requested, untrimmed
  QAtomicInt isCancelled; // non-zero once cancel() or the destructor ran
  QSemaphore hasExited;   // released by the runnable's destructor

public:

  /// Stores the arguments and immediately queues the worker on the global
  /// thread pool.
  HunspellArticleRequest( wstring const & word_,
                          Mutex & hunspellMutex_,
                          Hunspell & hunspell_ ):
    hunspellMutex( hunspellMutex_ ),
    hunspell( hunspell_ ),
    word( word_ )
  {
    auto * worker = new HunspellArticleRequestRunnable( *this, hasExited );
    QThreadPool::globalInstance()->start( worker );
  }

  void run(); // Run from another thread by HunspellArticleRequestRunnable

  /// Raises the cancellation flag; run() checks it once, on entry.
  void cancel() override
  {
    isCancelled.ref();
  }

  /// Flags cancellation and then blocks until the runnable has been
  /// destroyed, so the worker never outlives this object.
  ~HunspellArticleRequest()
  {
    isCancelled.ref();
    hasExited.acquire();
  }
};
void HunspellArticleRequestRunnable::run()
{
r.run();
}
void HunspellArticleRequest::run()
{
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
vector< string > suggestions;
try
{
wstring trimmedWord = Folding::trimWhitespaceOrPunct( word );
if ( containsWhitespace( trimmedWord ) )
{
// For now we don't analyze whitespace-containing phrases
finish();
return;
}
Mutex::Lock _( hunspellMutex );
string encodedWord = encodeToHunspell( hunspell, trimmedWord );
if ( hunspell.spell( encodedWord ) )
{
// Good word -- no spelling suggestions then.
finish();
return;
}
suggestions = hunspell.suggest( encodedWord );
if ( !suggestions.empty() )
{
// There were some suggestions made for us. Make an appropriate output.
string result = "<div class=\"gdspellsuggestion\">" +
Html::escape( QCoreApplication::translate( "Hunspell", "Spelling suggestions: " ).toUtf8().data() );
wstring lowercasedWord = Folding::applySimpleCaseOnly( word );
for( vector< string >::size_type x = 0; x < suggestions.size(); ++x )
{
wstring suggestion = decodeFromHunspell( hunspell, suggestions[ x ].c_str() );
if ( Folding::applySimpleCaseOnly( suggestion ) == lowercasedWord )
{
// If among suggestions we see the same word just with the different
// case, we botch the search -- our searches are case-insensitive, and
// there's no need for suggestions on a good word.
finish();
return;
}
string suggestionUtf8 = Utf8::encode( suggestion );
result += "<a href=\"bword:";
result += Html::escape( suggestionUtf8 ) + "\">";
result += Html::escape( suggestionUtf8 ) + "</a>";
if ( x != suggestions.size() - 1 )
result += ", ";
}
result += "</div>";
Mutex::Lock _( dataMutex );
data.resize( result.size() );
memcpy( &data.front(), result.data(), result.size() );
hasAnyData = true;
}
}
catch( Iconv::Ex & e )
{
gdWarning( "Hunspell: charset conversion error, no processing's done: %s\n", e.what() );
}
catch( std::exception & e )
{
gdWarning( "Hunspell: error: %s\n", e.what() );
}
finish();
}
/// Creates the asynchronous spelling-suggestion request for the word. The
/// alternate forms, context and diacritics arguments are not used.
sptr< DataRequest > HunspellDictionary::getArticle( wstring const & word,
                                                    vector< wstring > const &,
                                                    wstring const &,
                                                    bool )
{
  auto request = std::make_shared< HunspellArticleRequest >( word, getHunspellMutex(), hunspell );

  return request;
}
/// HunspellDictionary::findHeadwordsForSynonym()
class HunspellHeadwordsRequest;
class HunspellHeadwordsRequestRunnable: public QRunnable
{
HunspellHeadwordsRequest & r;
QSemaphore & hasExited;
public:
HunspellHeadwordsRequestRunnable( HunspellHeadwordsRequest & r_,
QSemaphore & hasExited_ ): r( r_ ),
hasExited( hasExited_ )
{}
~HunspellHeadwordsRequestRunnable()
{
hasExited.release();
}
void run() override;
};
/// Asynchronous request that collects headwords (stems, or stem-based
/// variants of a multi-word expression) for one word. The work itself happens
/// in run(), which is executed through a HunspellHeadwordsRequestRunnable
/// queued on the global thread pool.
class HunspellHeadwordsRequest: public Dictionary::WordSearchRequest
{
  friend class HunspellHeadwordsRequestRunnable;

  Mutex & hunspellMutex;  // guards every call into hunspell
  Hunspell & hunspell;
  wstring word;           // the word as requested, untrimmed
  QAtomicInt isCancelled; // non-zero once cancel() or the destructor ran
  QSemaphore hasExited;   // released by the runnable's destructor

public:

  /// Stores the arguments and immediately queues the worker on the global
  /// thread pool.
  HunspellHeadwordsRequest( wstring const & word_,
                            Mutex & hunspellMutex_,
                            Hunspell & hunspell_ ):
    hunspellMutex( hunspellMutex_ ),
    hunspell( hunspell_ ),
    word( word_ )
  {
    auto * worker = new HunspellHeadwordsRequestRunnable( *this, hasExited );
    QThreadPool::globalInstance()->start( worker );
  }

  void run(); // Run from another thread by HunspellHeadwordsRequestRunnable

  /// Raises the cancellation flag; run() checks it once, on entry.
  void cancel() override
  {
    isCancelled.ref();
  }

  /// Flags cancellation and then blocks until the runnable has been
  /// destroyed, so the worker never outlives this object.
  ~HunspellHeadwordsRequest()
  {
    isCancelled.ref();
    hasExited.acquire();
  }
};
void HunspellHeadwordsRequestRunnable::run()
{
r.run();
}
void HunspellHeadwordsRequest::run()
{
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
wstring trimmedWord = Folding::trimWhitespaceOrPunct( word );
if ( trimmedWord.size() > 80 )
{
// We won't do anything for overly long sentences since that would probably
// only waste time.
finish();
return;
}
if ( containsWhitespace( trimmedWord ) )
{
vector< wstring > results;
getSuggestionsForExpression( trimmedWord, results, hunspellMutex, hunspell );
Mutex::Lock _( dataMutex );
for( unsigned i = 0; i < results.size(); i++ )
matches.push_back( results[ i ] );
}
else
{
QVector< wstring > suggestions = suggest( trimmedWord, hunspellMutex, hunspell );
if ( !suggestions.empty() )
{
Mutex::Lock _( dataMutex );
for( int x = 0; x < suggestions.size(); ++x )
matches.push_back( suggestions[ x ] );
}
}
finish();
}
/// Returns the stems Hunspell's morphological analysis reports for a word.
///
/// Each analysis line has its '#' comment stripped, and the value of its
/// leading "st:" field is extracted. Stems equal to the word itself
/// (compared case-insensitively) are dropped.
///
/// @param word           word to analyse; it is not modified
/// @param hunspellMutex  locked for the whole analysis
/// @param hunspell       Hunspell instance to query
/// @return the collected stems; possibly incomplete or empty if an exception
///         interrupted the analysis
///
/// No exception derived from Iconv::Ex or std::exception leaves this
/// function: it is reached from a noexcept method and from a thread-pool
/// worker without a try block of its own, so failures are logged and
/// whatever has been collected so far is returned.
QVector< wstring > suggest( wstring & word, Mutex & hunspellMutex, Hunspell & hunspell )
{
  QVector< wstring > result;

  vector< string > suggestions;

  try
  {
    Mutex::Lock _( hunspellMutex );

    string encodedWord = encodeToHunspell( hunspell, word );

    suggestions = hunspell.analyze( encodedWord );

    if ( !suggestions.empty() )
    {
      // There were some suggestions made for us. Make an appropriate output.

      wstring lowercasedWord = Folding::applySimpleCaseOnly( word );

      // Shared between calls; it is only touched while hunspellMutex is held.
      static QRegExp cutStem( R"(^\s*st:(((\s+(?!\w{2}:)(?!-)(?!\+))|\S+)+))" );

      for ( string const & rawSuggestion : suggestions )
      {
        QString suggestion = gd::toQString( decodeFromHunspell( hunspell, rawSuggestion.c_str() ) );

        // Strip comments
        int n = suggestion.indexOf( '#' );
        if ( n >= 0 )
          suggestion.chop( suggestion.length() - n );

        GD_DPRINTF( ">>>Sugg: %s\n", suggestion.toLocal8Bit().data() );

        if ( cutStem.indexIn( suggestion.trimmed() ) != -1 )
        {
          wstring alt = gd::toWString( cutStem.cap( 1 ) );

          if ( Folding::applySimpleCaseOnly( alt ) != lowercasedWord ) // No point in providing same word
          {
#ifdef QT_DEBUG
            qDebug() << ">>>>>Alt:" << gd::toQString( alt );
#endif
            result.append( alt );
          }
        }
      }
    }
  }
  catch( Iconv::Ex & e )
  {
    gdWarning( "Hunspell: charset conversion error, no processing's done: %s\n", e.what() );
  }
  catch( std::exception & e )
  {
    // Same handling as in HunspellArticleRequest::run(): log and carry on.
    gdWarning( "Hunspell: error: %s\n", e.what() );
  }

  return result;
}
/// Creates the asynchronous request that looks up stems (headwords) for the
/// given word, sharing the global Hunspell mutex with all other requests.
sptr< WordSearchRequest > HunspellDictionary::findHeadwordsForSynonym( wstring const & word )
{
  auto request = std::make_shared< HunspellHeadwordsRequest >( word, getHunspellMutex(), hunspell );

  return request;
}
/// HunspellDictionary::prefixMatch()
class HunspellPrefixMatchRequest;
class HunspellPrefixMatchRequestRunnable: public QRunnable
{
HunspellPrefixMatchRequest & r;
QSemaphore & hasExited;
public:
HunspellPrefixMatchRequestRunnable( HunspellPrefixMatchRequest & r_,
QSemaphore & hasExited_ ): r( r_ ),
hasExited( hasExited_ )
{}
~HunspellPrefixMatchRequestRunnable()
{
hasExited.release();
}
void run() override;
};
/// Asynchronous request that reports the word itself as a match when Hunspell
/// accepts its spelling. The work itself happens in run(), which is executed
/// through a HunspellPrefixMatchRequestRunnable queued on the global thread
/// pool.
class HunspellPrefixMatchRequest: public Dictionary::WordSearchRequest
{
  friend class HunspellPrefixMatchRequestRunnable;

  Mutex & hunspellMutex;  // guards every call into hunspell
  Hunspell & hunspell;
  wstring word;           // the word as requested, untrimmed
  QAtomicInt isCancelled; // non-zero once cancel() or the destructor ran
  QSemaphore hasExited;   // released by the runnable's destructor

public:

  /// Stores the arguments and immediately queues the worker on the global
  /// thread pool.
  HunspellPrefixMatchRequest( wstring const & word_,
                              Mutex & hunspellMutex_,
                              Hunspell & hunspell_ ):
    hunspellMutex( hunspellMutex_ ),
    hunspell( hunspell_ ),
    word( word_ )
  {
    auto * worker = new HunspellPrefixMatchRequestRunnable( *this, hasExited );
    QThreadPool::globalInstance()->start( worker );
  }

  void run(); // Run from another thread by HunspellPrefixMatchRequestRunnable

  /// Raises the cancellation flag; run() checks it once, on entry.
  void cancel() override
  {
    isCancelled.ref();
  }

  /// Flags cancellation and then blocks until the runnable has been
  /// destroyed, so the worker never outlives this object.
  ~HunspellPrefixMatchRequest()
  {
    isCancelled.ref();
    hasExited.acquire();
  }
};
void HunspellPrefixMatchRequestRunnable::run()
{
r.run();
}
void HunspellPrefixMatchRequest::run()
{
if ( Utils::AtomicInt::loadAcquire( isCancelled ) )
{
finish();
return;
}
try
{
wstring trimmedWord = Folding::trimWhitespaceOrPunct( word );
if ( trimmedWord.empty() || containsWhitespace( trimmedWord ) )
{
// For now we don't analyze whitespace-containing phrases
finish();
return;
}
Mutex::Lock _( hunspellMutex );
string encodedWord = encodeToHunspell( hunspell, trimmedWord );
if ( hunspell.spell( encodedWord ) )
{
// Known word -- add it to the result
Mutex::Lock _( dataMutex );
matches.push_back( WordMatch( trimmedWord, 1 ) );
}
}
catch( Iconv::Ex & e )
{
gdWarning( "Hunspell: charset conversion error, no processing's done: %s\n", e.what() );
}
finish();
}
/// Creates the asynchronous request that checks whether Hunspell knows the
/// word. The result limit is not used: at most one match is ever produced.
sptr< WordSearchRequest > HunspellDictionary::prefixMatch( wstring const & word,
                                                           unsigned long /*maxResults*/ )
{
  auto request = std::make_shared< HunspellPrefixMatchRequest >( word, getHunspellMutex(), hunspell );

  return request;
}
/// Builds alternative writings of a multi-word expression.
///
/// Every word is analysed separately and combined with up to two of its
/// stems, while the punctuation and whitespace between words is carried over
/// unchanged. This helps with compound expressions in which some words are
/// inflected, e.g. "dozing off" -> "doze off".
///
/// @param expression     expression to analyse; it is trimmed first
/// @param suggestions    receives the variants; previous contents are dropped
///                       and the trimmed expression itself is never included
/// @param hunspellMutex  passed on to suggest()
/// @param hunspell       Hunspell instance to query
///
/// Expressions that split into more than 21 tokens (words plus separator
/// runs) produce no suggestions at all.
void getSuggestionsForExpression( wstring const & expression,
                                  vector<wstring> & suggestions,
                                  Mutex & hunspellMutex,
                                  Hunspell & hunspell )
{
  wstring trimmedWord = Folding::trimWhitespaceOrPunct( expression );

  suggestions.clear();

  // Step 1: split the expression into alternating word and separator tokens.
  QVector< wstring > tokens;
  wstring pendingWord, pendingSeparators;

  wchar const * cursor = trimmedWord.c_str();

  while ( true )
  {
    wchar const ch = *cursor;

    if ( !ch || Folding::isPunct( ch ) || Folding::isWhitespace( ch ) )
    {
      // A separator (or the end of the string) terminates the current word
      if ( !pendingWord.empty() )
      {
        tokens.push_back( pendingWord );
        pendingWord.clear();
      }

      if ( !ch )
        break;

      pendingSeparators.push_back( ch );
    }
    else
    {
      // A word character terminates the current run of separators
      if ( !pendingSeparators.empty() )
      {
        tokens.push_back( pendingSeparators );
        pendingSeparators.clear();
      }

      pendingWord.push_back( ch );
    }

    ++cursor;
  }

  if ( tokens.size() > 21 )
  {
    // Too many words - no suggestions
    return;
  }

  // Step 2: grow the list of variants token by token.
  QVector< wstring > results;
  QVector< wstring > const & tokenList = tokens;

  for ( wstring token : tokenList )
  {
    if ( Folding::isPunct( token[ 0 ] ) || Folding::isWhitespace( token[ 0 ] ) )
    {
      // Separators are appended verbatim to every variant built so far
      for ( int r = 0; r < results.size(); ++r )
        results[ r ].append( token );

      continue;
    }

    QVector< wstring > const stems = suggest( token, hunspellMutex, hunspell );

    // The word itself plus at most two stems
    int const variantCount = stems.size() >= 2 ? 3 : static_cast< int >( stems.size() ) + 1;
    int const existing = results.size();

    if ( existing == 0 )
    {
      results.push_back( token );

      for ( int k = 1; k < variantCount; ++k )
        results.push_back( stems.at( k - 1 ) );

      continue;
    }

    for ( int r = 0; r < existing; ++r )
    {
      // Remember the variant as it was before this word gets appended; the
      // stem-based variants branch off from that prefix.
      wstring const prefix = results.at( r );

      results[ r ].append( token );

      for ( int k = 1; k < variantCount; ++k )
        results.push_back( prefix + stems.at( k - 1 ) );
    }
  }

  // Step 3: hand out everything except the unchanged expression.
  QVector< wstring > const & finished = results;

  for ( wstring const & candidate : finished )
  {
    if ( candidate != trimmedWord )
      suggestions.push_back( candidate );
  }
}
/// Converts a wide string into the byte string passed to Hunspell.
///
/// @param hunspell  supplies the dictionary encoding via get_dic_encoding()
/// @param str       text to convert
/// @return the converted text, taken from the QString returned by
///         Iconv::convert() via QString::toStdString()
///
/// Conversion failures are expected to surface as Iconv::Ex, which callers in
/// this file catch.
///
/// NOTE(review): QString::toStdString() always produces UTF-8 - confirm this
/// is what Hunspell expects for dictionaries whose encoding is not UTF-8.
string encodeToHunspell( Hunspell & hunspell, wstring const & str )
{
  Iconv conv( hunspell.get_dic_encoding(), Iconv::GdWchar );

  // Kept as named, modifiable variables because they are handed to convert()
  // exactly as before; the manually managed output buffer that used to be
  // set up here was never used and has been dropped.
  void const * in = str.data();
  size_t inLeft = str.size() * sizeof( wchar );

  QString convStr = conv.convert( in, inLeft );

  return convStr.toStdString();
}
/// Converts a string returned by Hunspell into a wide string.
///
/// @param hunspell  supplies the dictionary encoding via get_dic_encoding()
/// @param str       NUL-terminated text as returned by Hunspell
/// @return the text converted through Iconv::convert() and gd::toWString()
///
/// Conversion failures are expected to surface as Iconv::Ex, which callers in
/// this file catch.
wstring decodeFromHunspell( Hunspell & hunspell, char const * str )
{
  Iconv conv( Iconv::GdWchar, hunspell.get_dic_encoding() );

  // Kept as named, modifiable variables because they are handed to convert()
  // exactly as before; the manually managed output buffer that used to be
  // set up here was never used and has been dropped.
  void const * in = str;
  size_t inLeft = strlen( str );

  QString convStr = conv.convert( in, inLeft );

  return gd::toWString( convStr );
}
}
/// Creates one HunspellDictionary per enabled dictionary id that has data
/// files in the configured directory.
///
/// The result follows the order of cfg.enabledDictionaries; ids without
/// matching data files are skipped silently.
vector< sptr< Dictionary::Class > > makeDictionaries( Config::Hunspell const & cfg )
{
  vector< sptr< Dictionary::Class > > dictionaries;

  vector< DataFiles > const available = findDataFiles( cfg.dictionariesPath );

  for ( int e = 0; e < cfg.enabledDictionaries.size(); ++e )
  {
    // The candidates are scanned back to front and the first hit wins
    for ( auto candidate = available.rbegin(); candidate != available.rend(); ++candidate )
    {
      if ( !( candidate->dictId == cfg.enabledDictionaries[ e ] ) )
        continue;

      // Found it: .aff file first, .dic file second
      vector< string > dictFiles;
      dictFiles.push_back( QDir::toNativeSeparators( candidate->affFileName ).toStdString() );
      dictFiles.push_back( QDir::toNativeSeparators( candidate->dicFileName ).toStdString() );

      dictionaries.push_back(
        std::make_shared< HunspellDictionary >( Dictionary::makeDictionaryId( dictFiles ),
                                                candidate->dictName.toUtf8().data(),
                                                dictFiles ) );
      break;
    }
  }

  return dictionaries;
}
/// Scans a directory for Hunspell dictionaries, i.e. .aff files that have a
/// .dic (or .DIC) file of the same base name next to them.
///
/// The dictionary id is the affix file name without its extension. For ids
/// of one or two characters, or ids of the form "xx-..." / "xx_...", the
/// leading part is looked up as a two-letter language code to obtain a
/// readable name; otherwise the id itself serves as the name. Dictionaries
/// whose final display name has been seen already are skipped.
///
/// @param path  directory to scan; an empty path yields an empty result
vector< DataFiles > findDataFiles( QString const & path )
{
  vector< DataFiles > found;

  // Empty path means unconfigured directory
  if ( path.isEmpty() )
    return found;

  // Find all affix files
  QStringList affixPatterns;
  affixPatterns << "*.aff" << "*.AFF";

  QDir dir( path );
  QFileInfoList const affixFiles = dir.entryInfoList( affixPatterns, QDir::Files );

  std::set< QString > presentNames;

  for ( QFileInfo const & affixInfo : affixFiles )
  {
    QString const affFileName = affixInfo.absoluteFilePath();

    // See if there's a corresponding .dic file; the base keeps the final dot
    QString const dicFileNameBase = affFileName.mid( 0, affFileName.size() - 3 );

    QString dicFileName = dicFileNameBase + "dic";

    if ( !QFile::exists( dicFileName ) )
    {
      dicFileName = dicFileNameBase + "DIC";

      if ( !QFile::exists( dicFileName ) )
        continue; // No dic file
    }

    // File name without the four characters of ".aff"
    QString dictId = affixInfo.fileName();
    dictId.chop( 4 );

    // True for ids shaped like "en-US" or "en_US"
    bool const hasRegionSeparator =
      dictId.size() > 2 && ( dictId[ 2 ] == '-' || dictId[ 2 ] == '_' );

    QString dictBaseId;

    if ( dictId.size() < 3 )
      dictBaseId = dictId;
    else if ( hasRegionSeparator )
      dictBaseId = dictId.mid( 0, 2 );

    dictBaseId = dictBaseId.toLower();

    // Try making up good readable name from dictBaseId
    QString localizedName;

    if ( dictBaseId.size() == 2 )
      localizedName = Language::localizedNameForId( LangCoder::code2toInt( dictBaseId.toLatin1().data() ) );

    QString dictName = dictId;

    if ( !localizedName.isEmpty() )
    {
      dictName = localizedName;

      // Append the region part unless it merely repeats the language code
      if ( hasRegionSeparator && dictId.mid( 3 ).toLower() != dictBaseId )
        dictName += " (" + dictId.mid( 3 ) + ")";
    }

    dictName = QCoreApplication::translate( "Hunspell", "%1 Morphology" ).arg( dictName );

    // Only include dictionaries with unique names. This combats stuff
    // like symlinks en-US->en_US and such
    if ( presentNames.insert( dictName ).second )
      found.push_back( DataFiles( affFileName, dicFileName, dictId, dictName ) );
  }

  return found;
}
}