refactor: merge some encoding and encoding names related code
Some checks are pending
SonarCloud / Build and analyze (push) Waiting to run

This commit is contained in:
shenleban tongying 2024-11-24 01:29:09 -05:00 committed by GitHub
parent 52a9427b8b
commit dda91a30dd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 113 additions and 111 deletions

View file

@ -26,6 +26,7 @@ Checks: >
-google-readability-casting,
-hicpp-deprecated-headers,
-hicpp-no-array-decay,
-misc-confusable-identifiers,
-misc-const-correctness,
-misc-include-cleaner,
-misc-non-private-member-variables-in-classes,
@ -33,6 +34,7 @@ Checks: >
-modernize-deprecated-headers,
-modernize-use-nodiscard,
-modernize-use-trailing-return-type,
-performance-enum-size,
-readability-function-cognitive-complexity,
-readability-identifier-length,
-readability-magic-numbers,

View file

@ -6,12 +6,8 @@
#include <errno.h>
#include <string.h>
char const * const Iconv::GdWchar = "UTF-32LE";
char const * const Iconv::Utf16Le = "UTF-16LE";
char const * const Iconv::Utf8 = "UTF-8";
Iconv::Iconv( char const * from ):
state( iconv_open( Utf8, from ) )
state( iconv_open( Text::utf8, from ) )
{
if ( state == (iconv_t)-1 ) {
throw exCantInit( strerror( errno ) );

View file

@ -3,14 +3,11 @@
#pragma once
#include <QString>
#include "text.hh"
#include "ex.hh"
#include "text.hh"
#include <QString>
#include <iconv.h>
/// Character encoding conversion ("internationalization conversion"), currently implemented with iconv().
/// Only supports converting from a known source encoding ("from") to UTF-8.
class Iconv
@ -22,12 +19,6 @@ public:
DEF_EX( Ex, "Iconv exception", std::exception )
DEF_EX_STR( exCantInit, "Can't initialize iconv conversion:", Ex )
// Some predefined character sets' names
static char const * const GdWchar;
static char const * const Utf16Le;
static char const * const Utf8;
explicit Iconv( char const * from );
~Iconv();

View file

@ -10,6 +10,60 @@
namespace Text {
/// Returns the encoding name string that corresponds to the given Encoding value.
/// Encoding::Utf8, and any value without a dedicated case, yields the UTF-8 name.
const char * getEncodingNameFor( Encoding e )
{
  // Start from the fallback and overwrite it only for the non-UTF-8 encodings.
  const char * label = utf8;

  switch ( e ) {
    case Encoding::Utf16LE:
      label = utf16_le;
      break;
    case Encoding::Utf16BE:
      label = utf16_be;
      break;
    case Encoding::Utf32LE:
      label = utf32_le;
      break;
    case Encoding::Utf32BE:
      label = utf32_be;
      break;
    case Encoding::Utf32:
      label = utf32;
      break;
    case Encoding::Windows1250:
      label = windows_1250;
      break;
    case Encoding::Windows1251:
      label = windows_1251;
      break;
    case Encoding::Windows1252:
      label = windows_1252;
      break;
    case Encoding::Utf8:
    default:
      break;
  }

  return label;
}
/// Returns the Encoding value whose name matches `name`.
/// The comparison is made against an upper-cased copy of `name`.
/// Names that match none of the table entries yield Encoding::Utf8.
Encoding getEncodingForName( const QByteArray & name )
{
  struct NamedEncoding
  {
    const char * label;
    Encoding value;
  };

  // Entries are tried from top to bottom; the first match wins.
  static constexpr NamedEncoding known[] = {
    { utf32_le, Encoding::Utf32LE },
    { utf32_be, Encoding::Utf32BE },
    { utf32, Encoding::Utf32 },
    { utf16_le, Encoding::Utf16LE },
    { utf16_be, Encoding::Utf16BE },
    { windows_1252, Encoding::Windows1252 },
    { windows_1251, Encoding::Windows1251 },
    { windows_1250, Encoding::Windows1250 },
  };

  const QByteArray upper = name.toUpper();
  for ( const NamedEncoding & candidate : known ) {
    if ( upper == candidate.label ) {
      return candidate.value;
    }
  }

  return Encoding::Utf8;
}
/// Encodes the given UTF-32 into UTF-8. The inSize specifies the number
/// of wide characters the 'in' pointer points to. The 'out' buffer must be
@ -200,87 +254,31 @@ int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2lengt
return pos - s1 + s2length;
}
/// Returns the encoding name string that corresponds to the given Encoding value.
/// Utf8, and any value without a dedicated case, yields "UTF-8".
char const * getEncodingNameFor( Encoding e )
{
  // Start from the fallback and overwrite it only for the non-UTF-8 encodings.
  char const * label = "UTF-8";

  switch ( e ) {
    case Utf16LE:
      label = "UTF-16LE";
      break;
    case Utf16BE:
      label = "UTF-16BE";
      break;
    case Utf32LE:
      label = "UTF-32LE";
      break;
    case Utf32BE:
      label = "UTF-32BE";
      break;
    case Windows1250:
      label = "WINDOWS-1250";
      break;
    case Windows1251:
      label = "WINDOWS-1251";
      break;
    case Windows1252:
      label = "WINDOWS-1252";
      break;
    case Utf8:
    default:
      break;
  }

  return label;
}
/// Returns the Encoding value whose name matches `_name`.
/// The comparison is made against an upper-cased copy of `_name`.
/// Names that match none of the table entries yield Utf8.
Encoding getEncodingForName( const QByteArray & _name )
{
  struct NamedEncoding
  {
    char const * label;
    Encoding value;
  };

  // Entries are tried from top to bottom; the first match wins.
  static const NamedEncoding known[] = {
    { "UTF-32LE", Utf32LE },
    { "UTF-32BE", Utf32BE },
    { "UTF-16LE", Utf16LE },
    { "UTF-16BE", Utf16BE },
    { "WINDOWS-1252", Windows1252 },
    { "WINDOWS-1251", Windows1251 },
    { "UTF-8", Utf8 },
    { "WINDOWS-1250", Windows1250 },
  };

  const QByteArray upper = _name.toUpper();
  for ( const NamedEncoding & candidate : known ) {
    if ( upper == candidate.label ) {
      return candidate.value;
    }
  }

  return Utf8;
}
LineFeed initLineFeed( const Encoding e )
{
LineFeed lf{};
switch ( e ) {
case Utf32LE:
case Encoding::Utf32LE:
lf.lineFeed = new char[ 4 ]{ 0x0A, 0, 0, 0 };
lf.length = 4;
break;
case Utf32BE:
case Encoding::Utf32BE:
lf.lineFeed = new char[ 4 ]{ 0, 0, 0, 0x0A };
lf.length = 4;
break;
case Utf16LE:
case Encoding::Utf16LE:
lf.lineFeed = new char[ 2 ]{ 0x0A, 0 };
lf.length = 2;
break;
case Utf16BE:
case Encoding::Utf16BE:
lf.lineFeed = new char[ 2 ]{ 0, 0x0A };
lf.length = 2;
break;
case Windows1252:
case Windows1251:
case Utf8:
case Windows1250:
case Encoding::Windows1252:
case Encoding::Windows1251:
case Encoding::Windows1250:
case Encoding::Utf8:
default:
lf.length = 1;
lf.lineFeed = new char[ 1 ]{ 0x0A };

View file

@ -2,18 +2,18 @@
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#pragma once
#include <cstdio>
#include "ex.hh"
#include <QByteArray>
#include <string>
#include "ex.hh"
/// Facilities to process Text, focusing on Unicode
namespace Text {
DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception )
// Those are possible encodings for .dsl files
enum Encoding {
Utf16LE,
/// Supported text encodings. Their names follow IANA's character set names: https://www.iana.org/assignments/character-sets/character-sets.xhtml
/// Note: the order of the enumerators before Utf32LE must not be changed, because the current .dsl index file format depends on it.
enum class Encoding {
Utf16LE = 0,
Utf16BE,
Windows1252,
Windows1251,
@ -21,9 +21,25 @@ enum Encoding {
Utf8,
Utf32BE,
Utf32LE,
Utf32,
};
inline constexpr auto utf16_be = "UTF-16BE";
inline constexpr auto utf16_le = "UTF-16LE";
inline constexpr auto utf32 = "UTF-32";
inline constexpr auto utf32_be = "UTF-32BE";
inline constexpr auto utf32_le = "UTF-32LE";
inline constexpr auto utf8 = "UTF-8";
inline constexpr auto windows_1250 = "WINDOWS-1250";
inline constexpr auto windows_1251 = "WINDOWS-1251";
inline constexpr auto windows_1252 = "WINDOWS-1252";
const char * getEncodingNameFor( Encoding e );
Encoding getEncodingForName( const QByteArray & name );
/// utf32 -> utf8
std::string toUtf8( std::u32string const & ) noexcept;
/// utf8 -> utf32
std::u32string toUtf32( std::string const & );
/// Since the standard isspace() is locale-specific, we need something
@ -33,8 +49,6 @@ bool isspace( int c );
//get the first line in string s1. -1 if not found
int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length );
char const * getEncodingNameFor( Encoding e );
Encoding getEncodingForName( const QByteArray & name );
struct LineFeed
{

View file

@ -1144,8 +1144,9 @@ void DslDictionary::getArticleText( uint32_t articleAddress, QString & headword,
}
else {
try {
articleData =
Iconv::toWstring( getEncodingNameFor( Encoding( idxHeader.dslEncoding ) ), articleBody, articleSize );
articleData = Iconv::toWstring( getEncodingNameFor( static_cast< Encoding >( idxHeader.dslEncoding ) ),
articleBody,
articleSize );
free( articleBody );
// Strip DSL comments
@ -1789,7 +1790,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idx.write( soundDictName.data(), soundDictName.size() );
}
idxHeader.dslEncoding = scanner.getEncoding();
idxHeader.dslEncoding = static_cast< uint32_t >( scanner.getEncoding() );
IndexedWords indexedWords;

View file

@ -844,7 +844,7 @@ bool ArticleDom::atSignFirstInLine()
/////////////// DslScanner
DslScanner::DslScanner( string const & fileName ):
encoding( Text::Utf8 ),
encoding( Text::Encoding::Utf8 ),
readBufferPtr( readBuffer ),
readBufferLeft( 0 ),
linesRead( 0 )
@ -875,19 +875,19 @@ DslScanner::DslScanner( string const & fileName ):
guessedEncoding.has_value() ) {
switch ( guessedEncoding.value() ) {
case QStringConverter::Utf8:
encoding = Text::Utf8;
encoding = Text::Encoding::Utf8;
break;
case QStringConverter::Utf16LE:
encoding = Text::Utf16LE;
encoding = Text::Encoding::Utf16LE;
break;
case QStringConverter::Utf16BE:
encoding = Text::Utf16BE;
encoding = Text::Encoding::Utf16BE;
break;
case QStringConverter::Utf32LE:
encoding = Text::Utf16LE;
encoding = Text::Encoding::Utf16LE;
break;
case QStringConverter::Utf32BE:
encoding = Text::Utf32BE;
encoding = Text::Encoding::Utf32BE;
break;
default:
break;
@ -976,13 +976,13 @@ DslScanner::DslScanner( string const & fileName ):
qWarning( "Warning: encoding was specified in a Unicode file, ignoring." );
}
else if ( !arg.compare( U"Latin" ) ) {
encoding = Text::Windows1252;
encoding = Text::Encoding::Windows1252;
}
else if ( !arg.compare( U"Cyrillic" ) ) {
encoding = Text::Windows1251;
encoding = Text::Encoding::Windows1251;
}
else if ( !arg.compare( U"EasternEuropean" ) ) {
encoding = Text::Windows1250;
encoding = Text::Encoding::Windows1250;
}
else {
gzclose( f );

View file

@ -207,8 +207,8 @@ void stripComments( std::u32string &, bool & );
inline size_t DslScanner::distanceToBytes( size_t x ) const
{
switch ( encoding ) {
case Text::Utf16LE:
case Text::Utf16BE:
case Encoding::Utf16LE:
case Encoding::Utf16BE:
return x * 2;
default:
return x;

View file

@ -123,7 +123,7 @@ public:
};
GlsScanner::GlsScanner( string const & fileName ):
encoding( Text::Utf8 ),
encoding( Encoding::Utf8 ),
readBufferPtr( readBuffer ),
readBufferLeft( 0 ),
linesRead( 0 )
@ -149,10 +149,10 @@ GlsScanner::GlsScanner( string const & fileName ):
// If the file begins with the dedicated Unicode marker, we just consume
// it. If, on the other hand, it's not, we return the bytes back
if ( firstBytes[ 0 ] == 0xFF && firstBytes[ 1 ] == 0xFE ) {
encoding = Text::Utf16LE;
encoding = Encoding::Utf16LE;
}
else if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) {
encoding = Text::Utf16BE;
encoding = Encoding::Utf16BE;
}
else if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) {
// Looks like Utf8, read one more byte
@ -161,14 +161,14 @@ GlsScanner::GlsScanner( string const & fileName ):
gzclose( f );
throw exMalformedGlsFile( fileName );
}
encoding = Text::Utf8;
encoding = Encoding::Utf8;
}
else {
if ( gzrewind( f ) ) {
gzclose( f );
throw exCantOpen( fileName );
}
encoding = Text::Utf8;
encoding = Encoding::Utf8;
}
codec = QTextCodec::codecForName( Text::getEncodingNameFor( encoding ) );
@ -1259,7 +1259,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries( vector< string > const & f
idx.write( (uint32_t)dictionaryName.size() );
idx.write( dictionaryName.data(), dictionaryName.size() );
idxHeader.glsEncoding = scanner.getEncoding();
idxHeader.glsEncoding = static_cast< uint32_t >( scanner.getEncoding() );
IndexedWords indexedWords;

View file

@ -207,7 +207,7 @@ void HunspellArticleRequest::run()
QMutexLocker _( &hunspellMutex );
string trimmedWord_utf8 = Iconv::toUtf8( Iconv::GdWchar, trimmedWord.data(), trimmedWord.size() );
string trimmedWord_utf8 = Iconv::toUtf8( Text::utf32, trimmedWord.data(), trimmedWord.size() );
if ( hunspell.spell( trimmedWord_utf8 ) ) {
// Good word -- no spelling suggestions then.
@ -361,7 +361,7 @@ QList< std::u32string > suggest( std::u32string & word, QMutex & hunspellMutex,
try {
QMutexLocker _( &hunspellMutex );
auto suggestions = hunspell.analyze( Iconv::toUtf8( Iconv::GdWchar, word.data(), word.size() ) );
auto suggestions = hunspell.analyze( Iconv::toUtf8( Text::utf32, word.data(), word.size() ) );
if ( !suggestions.empty() ) {
// There were some suggestions made for us. Make an appropriate output.
@ -464,7 +464,7 @@ void HunspellPrefixMatchRequest::run()
QMutexLocker _( &hunspellMutex );
if ( hunspell.spell( Iconv::toUtf8( Iconv::GdWchar, trimmedWord.data(), trimmedWord.size() ) ) ) {
if ( hunspell.spell( Iconv::toUtf8( Text::utf32, trimmedWord.data(), trimmedWord.size() ) ) ) {
// Known word -- add it to the result
QMutexLocker _( &dataMutex );

View file

@ -143,7 +143,7 @@ Entry::Entry( File::Index & f )
// Read the size of the recording, in samples
samplesLength = f.read< uint32_t >();
name = Iconv::toUtf8( Iconv::Utf16Le, &filenameBuffer.front(), read * sizeof( uint16_t ) );
name = Iconv::toUtf8( Text::utf16_le, &filenameBuffer.front(), read * sizeof( uint16_t ) );
}
class LsaDictionary: public BtreeIndexing::BtreeDictionary