goldendict-ng/utf8.hh

/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>
 * Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifndef __UTF8_HH_INCLUDED__
#define __UTF8_HH_INCLUDED__

#include <cstdio>
#include <string>
#include "cpp_features.hh"
#include "ex.hh"
#include "wstring.hh"

/// A simple UTF-8 encoder/decoder. Some dictionary backends only require
/// UTF-8, so we keep this separate, removing the iconv dependency for them.
/// Besides, UTF-8 is quite ubiquitous now, and its use is spread across many
/// places.
namespace Utf8 {

/// Text encodings a .dsl file may be stored in.
/// The numeric values are written out explicitly; they are the same ones the
/// compiler would assign implicitly, starting from zero.
enum Encoding
{
  Utf16LE     = 0,
  Utf16BE     = 1,
  Windows1252 = 2,
  Windows1251 = 3,
  Windows1250 = 4,
  Utf8        = 5 // This is an extension. Detected solely by the UTF8 BOM.
};

using std::string;
using gd::wstring;
using gd::wchar;

DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception )

/// Encodes a UCS-4 sequence into UTF-8 using caller-supplied buffers.
///
/// 'in' points to the wide characters to encode and 'inSize' is the number
/// of wide characters (not bytes) available there. 'out' receives the encoded
/// bytes and must be at least inSize * 4 bytes long; the function does not
/// take a size for 'out', so the caller is responsible for that guarantee.
///
/// Returns the number of chars stored in the 'out' buffer. The result is not
/// 0-terminated.
size_t encode( wchar const * in, size_t inSize, char * out );
/// Decodes a UTF-8 byte sequence into UCS-4 using caller-supplied buffers.
///
/// 'in' points to the UTF-8 bytes and 'inSize' is the number of bytes
/// available there. 'out' receives the decoded wide characters and must be at
/// least inSize wide characters long; the function does not take a size for
/// 'out', so the caller is responsible for that guarantee.
///
/// Returns -1 if the given UTF-8 is invalid, otherwise the number of wide
/// characters stored in the 'out' buffer. The result is not 0-terminated.
long decode( char const * in, size_t inSize, wchar * out );

/// Versions for non time-critical code. They take and return whole string
/// objects instead of working on caller-supplied raw buffers.
///
/// encode() is declared non-throwing. 'noexcept' replaces the former 'throw()'
/// dynamic exception specification, which is deprecated and was removed in
/// C++20; both forms are non-throwing and therefore compatible with each other.
string encode( wstring const & ) noexcept;
/// NOTE(review): decode() carries no exception specification; presumably it
/// reports invalid UTF-8 by throwing exCantDecode -- confirm against the
/// implementation.
wstring decode( string const & );

/// Own replacement for the standard isspace().
///
/// The standard isspace() is locale-specific, so we need something that would
/// never mess up our UTF-8 input: the stock one worked fine under Linux but
/// was messing up strings under Windows.
///
/// 'c' is the character value to test.
/// NOTE(review): the exact set of values treated as whitespace is defined by
/// the implementation, which is not visible from this header.
bool isspace( int c );

/// Gets the position of the first line in the buffer s1; returns -1 if it is
/// not found.
///
/// s1length and s2length are the lengths that accompany s1 and s2.
/// NOTE(review): presumably s2 is the line-feed byte sequence searched for
/// inside s1, and the lengths are byte counts -- confirm against the
/// implementation, including whether the returned position points at or past
/// the line feed.
int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);
/// Returns a C string for the given Encoding value.
/// NOTE(review): presumably this is the encoding's name as expected by the
/// character-set converter in use; the lifetime/ownership of the returned
/// pointer is not visible from this header -- confirm against the
/// implementation.
char const* getEncodingNameFor(Encoding e);

/// A line-feed sequence described by a pointer and a length.
///
/// Both members have default initializers, so a default-constructed LineFeed
/// is in a well-defined "empty" state (length 0, null pointer) instead of
/// holding indeterminate values.
///
/// NOTE(review): presumably 'length' is the number of bytes 'lineFeed' points
/// to. Ownership of that buffer is not expressed by this type -- confirm who
/// allocates and frees it.
struct LineFeed
{
  int length      = 0;
  char * lineFeed = nullptr;
};

/// Produces the LineFeed value for the given Encoding.
/// NOTE(review): presumably the result holds the line-feed byte sequence as
/// represented in that encoding; whether the returned lineFeed buffer is
/// heap-allocated and must be released by the caller is not visible from this
/// header -- confirm against the implementation.
LineFeed initLineFeed(Encoding e);
}

#endif
Update year in copyright notices. 2012-02-20 21:47:14 +00:00			`/* This file is (c) 2008-2012 Konstantin Isakov <ikm@goldendict.org>`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */`
refract encoding method 2021-11-06 08:26:30 +00:00			`#ifndef __UTF8_HH_INCLUDED__`
			`#define __UTF8_HH_INCLUDED__`
refract encoding method 2021-11-06 08:55:51 +00:00
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`#include <cstdio>`
			`#include <string>`
Fix gcc 7.3 compiler warnings (#issue 978) 2018-05-21 15:32:04 +00:00			`#include "cpp_features.hh"`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`#include "ex.hh"`
*! Introduce gd::wstring and gd:wchar and switch to them from std::wstring and wchar_t. This changes nothing on Linux and most other systems, but on Win32 it causes to use normal UCS-4 strings instead of Win32's usual UTF-16. 2009-04-18 17:20:12 +00:00			`#include "wstring.hh"`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00
			`/// A simple UTF-8 encoder/decoder. Some dictionary backends only require`
			`/// utf8, so we have this separately, removing the iconv dependency for them.`
			`/// Besides, utf8 is quite ubiquitous now, and its use is spreaded over many`
			`/// places.`
			`namespace Utf8 {`

refract encoding method 2021-11-06 08:26:30 +00:00			`// Those are possible encodings for .dsl files`
			`enum Encoding`
			`{`
			`Utf16LE,`
			`Utf16BE,`
			`Windows1252,`
			`Windows1251,`
			`Windows1250,`
			`Utf8 // This is an extension. Detected solely by the UTF8 BOM.`
			`};`

Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`using std::string;`
*! Introduce gd::wstring and gd:wchar and switch to them from std::wstring and wchar_t. This changes nothing on Linux and most other systems, but on Win32 it causes to use normal UCS-4 strings instead of Win32's usual UTF-16. 2009-04-18 17:20:12 +00:00			`using gd::wstring;`
			`using gd::wchar;`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00
			`DEF_EX_STR( exCantDecode, "Can't decode the given string from Utf8:", std::exception )`

			`/// Encodes the given UCS-4 into UTF-8. The inSize specifies the number`
			`/// of wide characters the 'in' pointer points to. The 'out' buffer must be`
			`/// at least inSize * 4 bytes long. The function returns the number of chars`
			`/// stored in the 'out' buffer. The result is not 0-terminated.`
*! Introduce gd::wstring and gd:wchar and switch to them from std::wstring and wchar_t. This changes nothing on Linux and most other systems, but on Win32 it causes to use normal UCS-4 strings instead of Win32's usual UTF-16. 2009-04-18 17:20:12 +00:00			`size_t encode( wchar const * in, size_t inSize, char * out );`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`/// Decodes the given UTF-8 into UCS-32. The inSize specifies the number`
			`/// of bytes the 'in' pointer points to. The 'out' buffer must be at least`
			`/// inSize wide characters long. If the given UTF-8 is invalid, the decode`
			`/// function returns -1, otherwise it returns the number of wide characters`
			`/// stored in the 'out' buffer. The result is not 0-terminated.`
*! Introduce gd::wstring and gd:wchar and switch to them from std::wstring and wchar_t. This changes nothing on Linux and most other systems, but on Win32 it causes to use normal UCS-4 strings instead of Win32's usual UTF-16. 2009-04-18 17:20:12 +00:00			`long decode( char const * in, size_t inSize, wchar * out );`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00
			`/// Versions for non time-critical code.`
			`string encode( wstring const & ) throw();`
clean code: remove old cpp feature 2022-01-09 08:35:07 +00:00			`wstring decode( string const & ) ;`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00
Use own isspace() implementation in all places 2016-04-15 14:44:53 +00:00			`/// Since the standard isspace() is locale-specific, we need something`
			`/// that would never mess up our utf8 input. The stock one worked fine under`
			`/// Linux but was messing up strings under Windows.`
			`bool isspace( int c );`

refract encoding method 2021-11-06 08:26:30 +00:00			`//get the first line in string s1. -1 if not found`
			`int findFirstLinePosition( char* s1,int s1length, const char* s2,int s2length);`
			`char const* getEncodingNameFor(Encoding e);`
refract encoding method 2021-11-06 08:55:51 +00:00
			`struct LineFeed`
			`{`
			`int length;`
			`char* lineFeed;`

			`};`

			`LineFeed initLineFeed(Encoding e);`
Created /trunk/src and moved everything there. 2009-01-28 20:55:45 +00:00			`}`
refract encoding method 2021-11-06 08:55:51 +00:00
refract encoding method 2021-11-06 08:26:30 +00:00			`#endif`