clean: port DSL encoding guessing away from QTextCodec (#1799)

This commit is contained in:
shenleban tongying 2024-10-06 15:59:07 -04:00 committed by GitHub
parent 96ada0737c
commit 1c2f93e393
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 30 additions and 32 deletions

View file

@ -27,6 +27,7 @@ Checks: >
-google-readability-casting, -google-readability-casting,
-hicpp-deprecated-headers, -hicpp-deprecated-headers,
-misc-const-correctness, -misc-const-correctness,
-misc-include-cleaner,
-misc-non-private-member-variables-in-classes, -misc-non-private-member-variables-in-classes,
-modernize-avoid-c-arrays, -modernize-avoid-c-arrays,
-modernize-deprecated-headers, -modernize-deprecated-headers,

View file

@ -807,7 +807,7 @@ bool ArticleDom::atSignFirstInLine()
/////////////// DslScanner /////////////// DslScanner
DslScanner::DslScanner( string const & fileName ): DslScanner::DslScanner( string const & fileName ):
encoding( Utf8::Windows1252 ), encoding( Utf8::Utf8 ),
readBufferPtr( readBuffer ), readBufferPtr( readBuffer ),
readBufferLeft( 0 ), readBufferLeft( 0 ),
linesRead( 0 ) linesRead( 0 )
@ -819,11 +819,12 @@ DslScanner::DslScanner( string const & fileName ):
if ( !f ) if ( !f )
throw exCantOpen( fileName ); throw exCantOpen( fileName );
// Now try guessing the encoding by reading the first two bytes // Now try guessing the encoding
unsigned char firstBytes[ 50 ]; constexpr size_t firstBytesSize = 50;
unsigned char firstBytes[ firstBytesSize ];
if ( gzread( f, firstBytes, sizeof( firstBytes ) ) != sizeof( firstBytes ) ) { if ( gzread( f, firstBytes, firstBytesSize ) != firstBytesSize ) {
// Apparently the file's too short // Apparently the file's too short
gzclose( f ); gzclose( f );
throw exMalformedDslFile( fileName ); throw exMalformedDslFile( fileName );
@ -831,37 +832,33 @@ DslScanner::DslScanner( string const & fileName ):
bool needExactEncoding = false; bool needExactEncoding = false;
QByteArray ba = QByteArray::fromRawData( (const char *)firstBytes, 50 ); // Note that .dsl format always starts with "#NAME"
codec = QTextCodec::codecForUtfText( ba, nullptr ); if ( auto guessedEncoding = QStringConverter::encodingForData( { firstBytes, firstBytesSize }, '#' );
if ( !codec ) { guessedEncoding.has_value() ) {
// the encoding has no bom. switch ( guessedEncoding.value() ) {
// check the first char # (0x23). case QStringConverter::Utf8:
auto hashTag = 0x0023; encoding = Utf8::Utf8;
break;
auto uci = qFromUnaligned< uint32_t >( firstBytes ); case QStringConverter::Utf16LE:
if ( uci == qToBigEndian( hashTag ) ) { encoding = Utf8::Utf16LE;
codec = QTextCodec::codecForMib( 1018 ); // utf-32 be break;
} case QStringConverter::Utf16BE:
else if ( uci == qToLittleEndian( hashTag ) ) { encoding = Utf8::Utf16BE;
codec = QTextCodec::codecForMib( 1019 ); // utf-32 le break;
} case QStringConverter::Utf32LE:
else { encoding = Utf8::Utf16LE;
auto uc = qFromUnaligned< uint16_t >( firstBytes ); break;
if ( uc == qToBigEndian( uint16_t( hashTag ) ) ) { case QStringConverter::Utf32BE:
codec = QTextCodec::codecForMib( 1013 ); // utf16 be encoding = Utf8::Utf32BE;
} break;
else if ( uc == qToLittleEndian( uint16_t( hashTag ) ) ) { default:
codec = QTextCodec::codecForMib( 1014 ); // utf16 le break;
}
else {
//default encoding
codec = QTextCodec::codecForName( "UTF-8" );
}
} }
} }
encoding = Utf8::getEncodingForName( codec->name() ); codec = QTextCodec::codecForName( getEncodingNameFor( encoding ) );
qDebug() << codec->name();
qDebug() << "DSL encoding ->" << codec->name();
if ( gzrewind( f ) ) { if ( gzrewind( f ) ) {
gzclose( f ); gzclose( f );