// Mirror of https://github.com/xiaoyifang/goldendict-ng.git
// (synced 2024-11-25 01:14:07 +00:00; 239 lines, 4.6 KiB, C++)
/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
|
|
|
|
#include "utf8.hh"
|
|
#include <vector>
|
|
#include <algorithm>
|
|
|
|
namespace Utf8 {
|
|
|
|
// Converts inSize elements read from `in` into UTF-8 bytes stored at `out_`
// and returns how many bytes were written. No terminator is appended.
//
// An element produces one byte when it is below 0x80, two when below 0x800,
// three when below 0x10000 and four otherwise, so `out_` must have room for
// up to inSize * 4 bytes. The input values are not validated in any way.
size_t encode( wchar const * in, size_t inSize, char * out_ )
{
  unsigned char * const first = reinterpret_cast< unsigned char * >( out_ );
  unsigned char * dst         = first;

  for ( wchar const * const stop = in + inSize; in != stop; ++in ) {
    wchar const cp = *in;

    if ( cp < 0x80 ) {
      // Plain ASCII: copied through unchanged.
      *dst++ = cp;
      continue;
    }

    if ( cp < 0x800 ) {
      // 110xxxxx 10xxxxxx
      *dst++ = 0xC0 | ( cp >> 6 );
      *dst++ = 0x80 | ( cp & 0x3F );
      continue;
    }

    if ( cp < 0x10000 ) {
      // 1110xxxx 10xxxxxx 10xxxxxx
      *dst++ = 0xE0 | ( cp >> 12 );
      *dst++ = 0x80 | ( ( cp >> 6 ) & 0x3F );
      *dst++ = 0x80 | ( cp & 0x3F );
      continue;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    *dst++ = 0xF0 | ( cp >> 18 );
    *dst++ = 0x80 | ( ( cp >> 12 ) & 0x3F );
    *dst++ = 0x80 | ( ( cp >> 6 ) & 0x3F );
    *dst++ = 0x80 | ( cp & 0x3F );
  }

  return dst - first;
}
|
|
|
|
long decode( char const * in_, size_t inSize, wchar * out_ )
|
|
{
|
|
unsigned char const * in = (unsigned char const *) in_;
|
|
wchar * out = out_;
|
|
|
|
while( inSize-- )
|
|
{
|
|
wchar result;
|
|
|
|
if ( *in & 0x80 )
|
|
{
|
|
if ( *in & 0x40 )
|
|
{
|
|
if ( *in & 0x20 )
|
|
{
|
|
if ( *in & 0x10 )
|
|
{
|
|
// Four-byte sequence
|
|
if ( *in & 8 )
|
|
// This can't be
|
|
return -1;
|
|
|
|
if ( inSize < 3 )
|
|
return -1;
|
|
|
|
inSize -= 3;
|
|
|
|
result = ( (wchar )*in++ & 7 ) << 18;
|
|
|
|
if ( ( *in & 0xC0 ) != 0x80 )
|
|
return -1;
|
|
result |= ( (wchar)*in++ & 0x3F ) << 12;
|
|
|
|
if ( ( *in & 0xC0 ) != 0x80 )
|
|
return -1;
|
|
result |= ( (wchar)*in++ & 0x3F ) << 6;
|
|
|
|
if ( ( *in & 0xC0 ) != 0x80 )
|
|
return -1;
|
|
result |= (wchar)*in++ & 0x3F;
|
|
}
|
|
else
|
|
{
|
|
// Three-byte sequence
|
|
|
|
if ( inSize < 2 )
|
|
return -1;
|
|
|
|
inSize -= 2;
|
|
|
|
result = ( (wchar )*in++ & 0xF ) << 12;
|
|
|
|
if ( ( *in & 0xC0 ) != 0x80 )
|
|
return -1;
|
|
result |= ( (wchar)*in++ & 0x3F ) << 6;
|
|
|
|
if ( ( *in & 0xC0 ) != 0x80 )
|
|
return -1;
|
|
result |= (wchar)*in++ & 0x3F;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Two-byte sequence
|
|
if ( !inSize )
|
|
return -1;
|
|
|
|
--inSize;
|
|
|
|
result = ( (wchar )*in++ & 0x1F ) << 6;
|
|
|
|
if ( ( *in & 0xC0 ) != 0x80 )
|
|
return -1;
|
|
result |= (wchar)*in++ & 0x3F;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// This char is from the middle of encoding, it can't be leading
|
|
return -1;
|
|
}
|
|
}
|
|
else
|
|
// One-byte encoding
|
|
result = *in++;
|
|
|
|
*out++ = result;
|
|
}
|
|
|
|
return out - out_;
|
|
}
|
|
|
|
// Returns the UTF-8 encoding of `in` as a string; an empty input gives an
// empty string.
string encode( wstring const & in ) noexcept
{
  if ( in.empty() )
    return string();

  // The buffer-level encoder emits at most four bytes per input element.
  std::vector< char > bytes( in.size() * 4 );

  size_t const written = encode( in.data(), in.size(), bytes.data() );

  return string( bytes.data(), written );
}
|
|
|
|
// Returns the contents of the UTF-8 string `in` decoded into a wstring; an
// empty input gives an empty wstring.
// Throws exCantDecode( in ) when the buffer-level decoder reports an error.
wstring decode( string const & in )
{
  if ( in.empty() )
    return wstring();

  // Every decoded element consumes at least one input byte, so one slot per
  // byte is always enough.
  std::vector< wchar > decoded( in.size() );

  long const count = decode( in.data(), in.size(), decoded.data() );

  if ( count >= 0 )
    return wstring( decoded.data(), count );

  throw exCantDecode( in );
}
|
|
|
|
// Returns true when c is one of the six whitespace characters: space, form
// feed, line feed, carriage return, horizontal tab or vertical tab.
// Any other value, including negative ones, gives false.
bool isspace( int c )
{
  return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' || c == '\v';
}
|
|
|
|
// Returns the length in bytes of the first line held in s1[0 .. s1length),
// where s2[0 .. s2length) is the line terminator to look for.
// When the terminator is found, the returned length includes it; when it is
// not found, the whole buffer counts as the line and s1length is returned.
int findFirstLinePosition( char * s1, int s1length, const char * s2, int s2length )
{
  char * const bufferEnd = s1 + s1length;
  char * const hit       = std::search( s1, bufferEnd, s2, s2 + s2length );

  auto lineSize = hit - s1;

  // Only a real match contributes the terminator to the line size.
  if ( hit != bufferEnd )
    lineSize += s2length;

  return static_cast< int >( lineSize );
}
|
|
|
|
// Maps an Encoding value to its name as a string literal.
// Windows1250 and any value not listed below both give "WINDOWS-1250".
char const * getEncodingNameFor( Encoding e )
{
  if ( e == Utf16LE )
    return "UTF-16LE";

  if ( e == Utf16BE )
    return "UTF-16BE";

  if ( e == Windows1252 )
    return "WINDOWS-1252";

  if ( e == Windows1251 )
    return "WINDOWS-1251";

  if ( e == Utf8 )
    return "UTF-8";

  // Windows1250 and every unrecognised value.
  return "WINDOWS-1250";
}
|
|
|
|
// Builds the byte pattern of a line feed (0x0A) for the given encoding.
// UTF-16LE gives the two bytes { 0x0A, 0 }, UTF-16BE gives { 0, 0x0A }, and
// every other encoding gives the single byte { 0x0A }.
// lf.lineFeed points to an array allocated with new[] that this function does
// not release.
// NOTE(review): presumably the caller or LineFeed itself is expected to
// delete[] it - confirm against the LineFeed declaration.
LineFeed initLineFeed( Encoding e )
{
  LineFeed lf;

  if ( e == Utf8::Utf16LE ) {
    lf.lineFeed = new char[ 2 ]{ 0x0A, 0 };
    lf.length   = 2;
  }
  else if ( e == Utf8::Utf16BE ) {
    lf.lineFeed = new char[ 2 ]{ 0, 0x0A };
    lf.length   = 2;
  }
  else {
    // Windows1252, Windows1251, Utf8, Windows1250 and anything else.
    lf.length   = 1;
    lf.lineFeed = new char[ 1 ]{ 0x0A };
  }

  return lf;
}
|
|
|
|
}
|