mirror of
https://github.com/xiaoyifang/goldendict-ng.git
synced 2024-11-27 15:24:05 +00:00
refactor: merge some encoding and encoding names related code
Some checks are pending
SonarCloud / Build and analyze (push) Waiting to run
Some checks are pending
SonarCloud / Build and analyze (push) Waiting to run
This commit is contained in:
parent
52a9427b8b
commit
dda91a30dd
|
@ -26,6 +26,7 @@ Checks: >
|
|||
-google-readability-casting,
|
||||
-hicpp-deprecated-headers,
|
||||
-hicpp-no-array-decay,
|
||||
-misc-confusable-identifiers,
|
||||
-misc-const-correctness,
|
||||
-misc-include-cleaner,
|
||||
-misc-non-private-member-variables-in-classes,
|
||||
|
@ -33,6 +34,7 @@ Checks: >
|
|||
-modernize-deprecated-headers,
|
||||
-modernize-use-nodiscard,
|
||||
-modernize-use-trailing-return-type,
|
||||
-performance-enum-size,
|
||||
-readability-function-cognitive-complexity,
|
||||
-readability-identifier-length,
|
||||
-readability-magic-numbers,
|
||||
|
|
|
@ -6,12 +6,8 @@
|
|||
#include <errno.h>
|
||||
#include <string.h>
|
||||
|
||||
char const * const Iconv::GdWchar = "UTF-32LE";
|
||||
char const * const Iconv::Utf16Le = "UTF-16LE";
|
||||
char const * const Iconv::Utf8 = "UTF-8";
|
||||
|
||||
Iconv::Iconv( char const * from ):
|
||||
state( iconv_open( Utf8, from ) )
|
||||
state( iconv_open( Text::utf8, from ) )
|
||||
{
|
||||
if ( state == (iconv_t)-1 ) {
|
||||
throw exCantInit( strerror( errno ) );
|
||||
|
|
|
@ -3,14 +3,11 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <QString>
|
||||
|
||||
#include "text.hh"
|
||||
#include "ex.hh"
|
||||
|
||||
#include "text.hh"
|
||||
#include <QString>
|
||||
#include <iconv.h>
|
||||
|
||||
|
||||
/// "Internationalization conversion" for char encoding conversion, currently implemented with iconv()
|
||||
/// Only supports converting from a known "from" to UTF8
|
||||
class Iconv
|
||||
|
@ -22,12 +19,6 @@ public:
|
|||
DEF_EX( Ex, "Iconv exception", std::exception )
|
||||
DEF_EX_STR( exCantInit, "Can't initialize iconv conversion:", Ex )
|
||||
|
||||
// Some predefined character sets' names
|
||||
|
||||
static char const * const GdWchar;
|
||||
static char const * const Utf16Le;
|
||||
static char const * const Utf8;
|
||||
|
||||
explicit Iconv( char const * from );
|
||||
|
||||
~Iconv();
|
||||
|
|
|
@ -10,6 +10,60 @@
|
|||
|
||||
namespace Text {
|
||||
|
||||
const char * getEncodingNameFor( Encoding e )
|
||||
{
|
||||
switch ( e ) {
|
||||
case Encoding::Utf32LE:
|
||||
return utf32_le;
|
||||
case Encoding::Utf32BE:
|
||||
return utf32_be;
|
||||
case Encoding::Utf32:
|
||||
return utf32;
|
||||
case Encoding::Utf16LE:
|
||||
return utf16_le;
|
||||
case Encoding::Utf16BE:
|
||||
return utf16_be;
|
||||
case Encoding::Windows1252:
|
||||
return windows_1252;
|
||||
case Encoding::Windows1251:
|
||||
return windows_1251;
|
||||
case Encoding::Windows1250:
|
||||
return windows_1250;
|
||||
case Encoding::Utf8:
|
||||
default:
|
||||
return utf8;
|
||||
}
|
||||
}
|
||||
|
||||
Encoding getEncodingForName( const QByteArray & name )
|
||||
{
|
||||
auto const n = name.toUpper();
|
||||
if ( n == utf32_le ) {
|
||||
return Encoding::Utf32LE;
|
||||
}
|
||||
if ( n == utf32_be ) {
|
||||
return Encoding::Utf32BE;
|
||||
}
|
||||
if ( n == utf32 ) {
|
||||
return Encoding::Utf32;
|
||||
}
|
||||
if ( n == utf16_le ) {
|
||||
return Encoding::Utf16LE;
|
||||
}
|
||||
if ( n == utf16_be ) {
|
||||
return Encoding::Utf16BE;
|
||||
}
|
||||
if ( n == windows_1252 ) {
|
||||
return Encoding::Windows1252;
|
||||
}
|
||||
if ( n == windows_1251 ) {
|
||||
return Encoding::Windows1251;
|
||||
}
|
||||
if ( n == windows_1250 ) {
|
||||
return Encoding::Windows1250;
|
||||
}
|
||||
return Encoding::Utf8;
|
||||
}
|
||||
|
||||
/// Encodes the given UTF-32 into UTF-8. The inSize specifies the number
|
||||
/// of wide characters the 'in' pointer points to. The 'out' buffer must be
|
||||
|
@ -200,87 +254,31 @@ int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2lengt
|
|||
return pos - s1 + s2length;
|
||||
}
|
||||
|
||||
char const * getEncodingNameFor( Encoding e )
|
||||
{
|
||||
switch ( e ) {
|
||||
case Utf32LE:
|
||||
return "UTF-32LE";
|
||||
case Utf32BE:
|
||||
return "UTF-32BE";
|
||||
case Utf16LE:
|
||||
return "UTF-16LE";
|
||||
case Utf16BE:
|
||||
return "UTF-16BE";
|
||||
case Windows1252:
|
||||
return "WINDOWS-1252";
|
||||
case Windows1251:
|
||||
return "WINDOWS-1251";
|
||||
case Utf8:
|
||||
return "UTF-8";
|
||||
case Windows1250:
|
||||
return "WINDOWS-1250";
|
||||
default:
|
||||
return "UTF-8";
|
||||
}
|
||||
}
|
||||
|
||||
Encoding getEncodingForName( const QByteArray & _name )
|
||||
{
|
||||
const auto name = _name.toUpper();
|
||||
if ( name == "UTF-32LE" ) {
|
||||
return Utf32LE;
|
||||
}
|
||||
if ( name == "UTF-32BE" ) {
|
||||
return Utf32BE;
|
||||
}
|
||||
if ( name == "UTF-16LE" ) {
|
||||
return Utf16LE;
|
||||
}
|
||||
if ( name == "UTF-16BE" ) {
|
||||
return Utf16BE;
|
||||
}
|
||||
if ( name == "WINDOWS-1252" ) {
|
||||
return Windows1252;
|
||||
}
|
||||
if ( name == "WINDOWS-1251" ) {
|
||||
return Windows1251;
|
||||
}
|
||||
if ( name == "UTF-8" ) {
|
||||
return Utf8;
|
||||
}
|
||||
if ( name == "WINDOWS-1250" ) {
|
||||
return Windows1250;
|
||||
}
|
||||
return Utf8;
|
||||
}
|
||||
|
||||
LineFeed initLineFeed( const Encoding e )
|
||||
{
|
||||
LineFeed lf{};
|
||||
switch ( e ) {
|
||||
case Utf32LE:
|
||||
case Encoding::Utf32LE:
|
||||
lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 };
|
||||
lf.length = 4;
|
||||
break;
|
||||
case Utf32BE:
|
||||
case Encoding::Utf32BE:
|
||||
lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A };
|
||||
lf.length = 4;
|
||||
break;
|
||||
case Utf16LE:
|
||||
case Encoding::Utf16LE:
|
||||
lf.lineFeed = new char[ 2 ]{ 0x0A, 0 };
|
||||
lf.length = 2;
|
||||
break;
|
||||
case Utf16BE:
|
||||
case Encoding::Utf16BE:
|
||||
lf.lineFeed = new char[ 2 ]{ 0, 0x0A };
|
||||
lf.length = 2;
|
||||
break;
|
||||
case Windows1252:
|
||||
|
||||
case Windows1251:
|
||||
|
||||
case Utf8:
|
||||
|
||||
case Windows1250:
|
||||
case Encoding::Windows1252:
|
||||
case Encoding::Windows1251:
|
||||
case Encoding::Windows1250:
|
||||
case Encoding::Utf8:
|
||||
default:
|
||||
lf.length = 1;
|
||||
lf.lineFeed = new char[ 1 ]{ 0x0A };
|
||||
|
|
|
@ -2,18 +2,18 @@
|
|||
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
||||
#pragma once
|
||||
|
||||
#include <cstdio>
|
||||
#include "ex.hh"
|
||||
#include <QByteArray>
|
||||
#include <string>
|
||||
#include "ex.hh"
|
||||
|
||||
/// Facilities to process Text, focusing on Unicode
|
||||
namespace Text {
|
||||
DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception )
|
||||
|
||||
// Those are possible encodings for .dsl files
|
||||
enum Encoding {
|
||||
Utf16LE,
|
||||
/// Encoding names. Ref -> IANA's encoding names https://www.iana.org/assignments/character-sets/character-sets.xhtml
|
||||
/// Note: the ordering of the enumerators before Utf32LE must not be changed; the current .dsl format index file depends on it.
|
||||
enum class Encoding {
|
||||
Utf16LE = 0,
|
||||
Utf16BE,
|
||||
Windows1252,
|
||||
Windows1251,
|
||||
|
@ -21,9 +21,25 @@ enum Encoding {
|
|||
Utf8,
|
||||
Utf32BE,
|
||||
Utf32LE,
|
||||
Utf32,
|
||||
};
|
||||
|
||||
inline constexpr auto utf16_be = "UTF-16BE";
|
||||
inline constexpr auto utf16_le = "UTF-16LE";
|
||||
inline constexpr auto utf32 = "UTF-32";
|
||||
inline constexpr auto utf32_be = "UTF-32BE";
|
||||
inline constexpr auto utf32_le = "UTF-32LE";
|
||||
inline constexpr auto utf8 = "UTF-8";
|
||||
inline constexpr auto windows_1250 = "WINDOWS-1250";
|
||||
inline constexpr auto windows_1251 = "WINDOWS-1251";
|
||||
inline constexpr auto windows_1252 = "WINDOWS-1252";
|
||||
|
||||
const char * getEncodingNameFor( Encoding e );
|
||||
Encoding getEncodingForName( const QByteArray & name );
|
||||
|
||||
/// utf32 -> utf8
|
||||
std::string toUtf8( std::u32string const & ) noexcept;
|
||||
/// utf8 -> utf32
|
||||
std::u32string toUtf32( std::string const & );
|
||||
|
||||
/// Since the standard isspace() is locale-specific, we need something
|
||||
|
@ -33,8 +49,6 @@ bool isspace( int c );
|
|||
|
||||
// Find the first line in string s1. Returns -1 if not found.
|
||||
int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length );
|
||||
char const * getEncodingNameFor( Encoding e );
|
||||
Encoding getEncodingForName( const QByteArray & name );
|
||||
|
||||
struct LineFeed
|
||||
{
|
||||
|
|
|
@ -1144,8 +1144,9 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
|
|||
}
|
||||
else {
|
||||
try {
|
||||
articleData =
|
||||
Iconv::toWstring( getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize );
|
||||
articleData = Iconv::toWstring( getEncodingNameFor( static_cast< Encoding >( idxHeader.dslEncoding ) ),
|
||||
articleBody,
|
||||
articleSize );
|
||||
free( articleBody );
|
||||
|
||||
// Strip DSL comments
|
||||
|
@ -1789,7 +1790,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
|||
idx.write( soundDictName.data(), soundDictName.size() );
|
||||
}
|
||||
|
||||
idxHeader.dslEncoding = scanner.getEncoding();
|
||||
idxHeader.dslEncoding = static_cast< uint32_t >( scanner.getEncoding() );
|
||||
|
||||
IndexedWords indexedWords;
|
||||
|
||||
|
|
|
@ -844,7 +844,7 @@ bool ArticleDom::atSignFirstInLine()
|
|||
/////////////// DslScanner
|
||||
|
||||
DslScanner::DslScanner( string const & fileName ):
|
||||
encoding( Text::Utf8 ),
|
||||
encoding( Text::Encoding::Utf8 ),
|
||||
readBufferPtr( readBuffer ),
|
||||
readBufferLeft( 0 ),
|
||||
linesRead( 0 )
|
||||
|
@ -875,19 +875,19 @@ DslScanner::DslScanner( string const & fileName ):
|
|||
guessedEncoding.has_value() ) {
|
||||
switch ( guessedEncoding.value() ) {
|
||||
case QStringConverter::Utf8:
|
||||
encoding = Text::Utf8;
|
||||
encoding = Text::Encoding::Utf8;
|
||||
break;
|
||||
case QStringConverter::Utf16LE:
|
||||
encoding = Text::Utf16LE;
|
||||
encoding = Text::Encoding::Utf16LE;
|
||||
break;
|
||||
case QStringConverter::Utf16BE:
|
||||
encoding = Text::Utf16BE;
|
||||
encoding = Text::Encoding::Utf16BE;
|
||||
break;
|
||||
case QStringConverter::Utf32LE:
|
||||
encoding = Text::Utf16LE;
|
||||
encoding = Text::Encoding::Utf16LE;
|
||||
break;
|
||||
case QStringConverter::Utf32BE:
|
||||
encoding = Text::Utf32BE;
|
||||
encoding = Text::Encoding::Utf32BE;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
@ -976,13 +976,13 @@ DslScanner::DslScanner( string const & fileName ):
|
|||
qWarning( "Warning: encoding was specified in a Unicode file, ignoring." );
|
||||
}
|
||||
else if ( !arg.compare( U"Latin" ) ) {
|
||||
encoding = Text::Windows1252;
|
||||
encoding = Text::Encoding::Windows1252;
|
||||
}
|
||||
else if ( !arg.compare( U"Cyrillic" ) ) {
|
||||
encoding = Text::Windows1251;
|
||||
encoding = Text::Encoding::Windows1251;
|
||||
}
|
||||
else if ( !arg.compare( U"EasternEuropean" ) ) {
|
||||
encoding = Text::Windows1250;
|
||||
encoding = Text::Encoding::Windows1250;
|
||||
}
|
||||
else {
|
||||
gzclose( f );
|
||||
|
|
|
@ -207,8 +207,8 @@ void stripComments( std::u32string &, bool & );
|
|||
inline size_t DslScanner::distanceToBytes( size_t x ) const
|
||||
{
|
||||
switch ( encoding ) {
|
||||
case Text::Utf16LE:
|
||||
case Text::Utf16BE:
|
||||
case Encoding::Utf16LE:
|
||||
case Encoding::Utf16BE:
|
||||
return x * 2;
|
||||
default:
|
||||
return x;
|
||||
|
|
|
@ -123,7 +123,7 @@ public:
|
|||
};
|
||||
|
||||
GlsScanner::GlsScanner( string const & fileName ):
|
||||
encoding( Text::Utf8 ),
|
||||
encoding( Encoding::Utf8 ),
|
||||
readBufferPtr( readBuffer ),
|
||||
readBufferLeft( 0 ),
|
||||
linesRead( 0 )
|
||||
|
@ -149,10 +149,10 @@ GlsScanner::GlsScanner( string const & fileName ):
|
|||
// If the file begins with the dedicated Unicode marker, we just consume
|
||||
// it. If, on the other hand, it does not, we return the bytes back
|
||||
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) {
|
||||
encoding = Text::Utf16LE;
|
||||
encoding = Encoding::Utf16LE;
|
||||
}
|
||||
else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) {
|
||||
encoding = Text::Utf16BE;
|
||||
encoding = Encoding::Utf16BE;
|
||||
}
|
||||
else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) {
|
||||
// Looks like Utf8, read one more byte
|
||||
|
@ -161,14 +161,14 @@ GlsScanner::GlsScanner( string const & fileName ):
|
|||
gzclose( f );
|
||||
throw exMalformedGlsFile( fileName );
|
||||
}
|
||||
encoding = Text::Utf8;
|
||||
encoding = Encoding::Utf8;
|
||||
}
|
||||
else {
|
||||
if ( gzrewind( f ) ) {
|
||||
gzclose( f );
|
||||
throw exCantOpen( fileName );
|
||||
}
|
||||
encoding = Text::Utf8;
|
||||
encoding = Encoding::Utf8;
|
||||
}
|
||||
|
||||
codec = QTextCodec::codecForName( Text::getEncodingNameFor( encoding ) );
|
||||
|
@ -1259,7 +1259,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
|
|||
idx.write( (uint32_t)dictionaryName.size() );
|
||||
idx.write( dictionaryName.data(), dictionaryName.size() );
|
||||
|
||||
idxHeader.glsEncoding = scanner.getEncoding();
|
||||
idxHeader.glsEncoding = static_cast< uint32_t >( scanner.getEncoding() );
|
||||
|
||||
IndexedWords indexedWords;
|
||||
|
||||
|
|
|
@ -207,7 +207,7 @@ void HunspellArticleRequest::run()
|
|||
|
||||
QMutexLocker _( &hunspellMutex );
|
||||
|
||||
string trimmedWord_utf8 = Iconv::toUtf8( Iconv::GdWchar, trimmedWord.data(), trimmedWord.size() );
|
||||
string trimmedWord_utf8 = Iconv::toUtf8( Text::utf32, trimmedWord.data(), trimmedWord.size() );
|
||||
|
||||
if ( hunspell.spell( trimmedWord_utf8 ) ) {
|
||||
// Good word -- no spelling suggestions then.
|
||||
|
@ -361,7 +361,7 @@ QList< std::u32string > suggest( std::u32string & word, QMutex & hunspellMutex,
|
|||
try {
|
||||
QMutexLocker _( &hunspellMutex );
|
||||
|
||||
auto suggestions = hunspell.analyze( Iconv::toUtf8( Iconv::GdWchar, word.data(), word.size() ) );
|
||||
auto suggestions = hunspell.analyze( Iconv::toUtf8( Text::utf32, word.data(), word.size() ) );
|
||||
if ( !suggestions.empty() ) {
|
||||
// There were some suggestions made for us. Make an appropriate output.
|
||||
|
||||
|
@ -464,7 +464,7 @@ void HunspellPrefixMatchRequest::run()
|
|||
|
||||
QMutexLocker _( &hunspellMutex );
|
||||
|
||||
if ( hunspell.spell( Iconv::toUtf8( Iconv::GdWchar, trimmedWord.data(), trimmedWord.size() ) ) ) {
|
||||
if ( hunspell.spell( Iconv::toUtf8( Text::utf32, trimmedWord.data(), trimmedWord.size() ) ) ) {
|
||||
// Known word -- add it to the result
|
||||
|
||||
QMutexLocker _( &dataMutex );
|
||||
|
|
|
@ -143,7 +143,7 @@ Entry::Entry( File::Index & f )
|
|||
// Read the size of the recording, in samples
|
||||
samplesLength = f.read< uint32_t >();
|
||||
|
||||
name = Iconv::toUtf8( Iconv::Utf16Le, &filenameBuffer.front(), read * sizeof( uint16_t ) );
|
||||
name = Iconv::toUtf8( Text::utf16_le, &filenameBuffer.front(), read * sizeof( uint16_t ) );
|
||||
}
|
||||
|
||||
class LsaDictionary: public BtreeIndexing::BtreeDictionary
|
||||
|
|
Loading…
Reference in a new issue