From 0f3336977f232db60114cf43c87cc3f637646baf Mon Sep 17 00:00:00 2001
From: xiaoyifang
Date: Fri, 22 Nov 2024 11:24:56 +0800
Subject: [PATCH] fix: dsl ann file codec detection

---
 src/common/utils.cc     | 16 ++++++++++++++++
 src/common/utils.hh     | 24 ++++++++++++++++++++++++
 src/dict/dsl.cc         |  4 ++++
 src/dict/dsl_details.cc | 24 +-----------------------
 4 files changed, 45 insertions(+), 23 deletions(-)

diff --git a/src/common/utils.cc b/src/common/utils.cc
index b431e2fd..f27ced15 100644
--- a/src/common/utils.cc
+++ b/src/common/utils.cc
@@ -47,6 +47,22 @@ QString unescapeAmps( QString const & str )
   result.replace( "&&", "&" );
   return result;
 }
+
+// Guesses the encoding of the raw bytes in `ba` with
+// QStringConverter::encodingForData() and translates the result through
+// encodingMap. The default for expectedFirstCharacter is given once, on the
+// declaration in utils.hh.
+QTextCodec::Encoding detectEncoding( QByteArrayView ba, char16_t expectedFirstCharacter )
+{
+  // encodingForData() yields an empty optional when it cannot tell.
+  const auto detectedEncoding = QStringConverter::encodingForData( ba, expectedFirstCharacter );
+  // mapping the encoding
+  if ( detectedEncoding.has_value() && encodingMap.contains( detectedEncoding.value() ) ) {
+    return encodingMap[ detectedEncoding.value() ];
+  }
+  // default utf8
+  return QTextCodec::Utf8;
+}
 } // namespace Utils
 
 QString Utils::Path::combine( const QString & path1, const QString & path2 )
diff --git a/src/common/utils.hh b/src/common/utils.hh
index dfa7301e..8f014ce3 100644
--- a/src/common/utils.hh
+++ b/src/common/utils.hh
@@ -11,6 +11,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include "filetype.hh"
 #include
 using std::string;
@@ -353,4 +356,25 @@ QString escapeAmps( QString const & str );
 
 QString unescapeAmps( QString const & str );
 
+// Encoding mappings between QStringConverter::Encoding and QTextCodec::Encoding.
+// Declared inline so that every translation unit including this header shares
+// one map; an unnamed namespace here would give each of them a private copy.
+// NOTE(review): assumes the QTextCodec in use declares an Encoding enum with
+// these enumerators - confirm.
+inline const QMap< QStringConverter::Encoding, QTextCodec::Encoding > encodingMap = {
+  { QStringConverter::Latin1, QTextCodec::Latin1 },
+  { QStringConverter::Utf8, QTextCodec::Utf8 },
+  { QStringConverter::Utf16, QTextCodec::Utf16 },
+  { QStringConverter::Utf16LE, QTextCodec::Utf16LE },
+  { QStringConverter::Utf16BE, QTextCodec::Utf16BE },
+  { QStringConverter::Utf32LE, QTextCodec::Utf32LE },
+  { QStringConverter::Utf32BE, QTextCodec::Utf32BE },
+  // others
+};
+
+// Detects the encoding of raw bytes. `ba` is a QByteArrayView, the type taken by
+// QStringConverter::encodingForData(), so a QByteArray or a { pointer, size } pair both work.
+// Returns QTextCodec::Utf8 when nothing is detected or the result has no mapping.
+QTextCodec::Encoding detectEncoding( QByteArrayView ba, char16_t expectedFirstCharacter = 0 );
+
 } // namespace Utils
diff --git a/src/dict/dsl.cc b/src/dict/dsl.cc
index 0325582f..caebf7b6 100644
--- a/src/dict/dsl.cc
+++ b/src/dict/dsl.cc
@@ -1042,6 +1042,10 @@ QString const & DslDictionary::getDescription()
     QString data, str;
 
     str = annStream.readLine();
+    // NOTE(review): `str` is a QString already decoded by annStream, while
+    // detectEncoding() takes the raw bytes - confirm what should be passed here.
+    auto codec = Utils::detectEncoding( str );
+    annStream.setCodec( codec );
 
     if ( str.left( 10 ).compare( "#LANGUAGE " ) != 0 ) {
       annStream.seek( 0 );
diff --git a/src/dict/dsl_details.cc b/src/dict/dsl_details.cc
index 55595563..3ab8a718 100644
--- a/src/dict/dsl_details.cc
+++ b/src/dict/dsl_details.cc
@@ -872,30 +872,8 @@ DslScanner::DslScanner( string const & fileName ):
   bool needExactEncoding = false;
   // Note that .dsl format always starts with "#NAME"
-  if ( auto guessedEncoding = QStringConverter::encodingForData( { firstBytes, firstBytesSize }, '#' );
-       guessedEncoding.has_value() ) {
-    switch ( guessedEncoding.value() ) {
-      case QStringConverter::Utf8:
-        encoding = Utf8::Utf8;
-        break;
-      case QStringConverter::Utf16LE:
-        encoding = Utf8::Utf16LE;
-        break;
-      case QStringConverter::Utf16BE:
-        encoding = Utf8::Utf16BE;
-        break;
-      case QStringConverter::Utf32LE:
-        encoding = Utf8::Utf16LE;
-        break;
-      case QStringConverter::Utf32BE:
-        encoding = Utf8::Utf32BE;
-        break;
-      default:
-        break;
-    }
-  }
-  codec = QTextCodec::codecForName( getEncodingNameFor( encoding ) );
+  codec = Utils::detectEncoding( { firstBytes, firstBytesSize }, '#' );
   qDebug() << "DSL encoding ->" << codec->name();