From d443ec5620269be16df27f97c0119745c5ccf28e Mon Sep 17 00:00:00 2001 From: Konstantin Isakov Date: Fri, 8 May 2009 10:21:03 +0000 Subject: [PATCH] + Support UTF8 for the .dsl files which begin with UTF8 BOM. --- src/dsl_details.cc | 15 +++++++++++++++ src/dsl_details.hh | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/dsl_details.cc b/src/dsl_details.cc index 5c63f107..66a8cc67 100644 --- a/src/dsl_details.cc +++ b/src/dsl_details.cc @@ -471,6 +471,19 @@ DslScanner::DslScanner( string const & fileName ) throw( Ex, Iconv::Ex ): if ( firstBytes[ 0 ] == 0xFE && firstBytes[ 1 ] == 0xFF ) encoding = Utf16BE; else + if ( firstBytes[ 0 ] == 0xEF && firstBytes[ 1 ] == 0xBB ) + { + // Looks like Utf8, read one more byte + if ( gzread( f, firstBytes, 1 ) != 1 || firstBytes[ 0 ] != 0xBF ) + { + // Either the file's too short, or the BOM is weird + gzclose( f ); + throw exMalformedDslFile( fileName ); + } + + encoding = Utf8; + } + else { if ( firstBytes[ 0 ] && !firstBytes[ 1 ] ) encoding = Utf16LE; @@ -720,6 +733,8 @@ char const * DslIconv::getEncodingNameFor( DslEncoding e ) return "WINDOWS-1252"; case Windows1251: return "WINDOWS-1251"; + case Details::Utf8: + return "UTF-8"; case Windows1250: default: return "WINDOWS-1250"; diff --git a/src/dsl_details.hh b/src/dsl_details.hh index daf90f88..ad7d7113 100644 --- a/src/dsl_details.hh +++ b/src/dsl_details.hh @@ -28,7 +28,8 @@ enum DslEncoding Utf16BE, Windows1252, Windows1251, - Windows1250 + Windows1250, + Utf8 // This is an extension. Detected solely by the UTF8 BOM. };