Enhanced XDXF support (by soshial)

This commit is contained in:
Abs62 2013-06-15 12:55:15 +04:00
parent 195e28409d
commit f79129c9de
4 changed files with 376 additions and 86 deletions

View file

@ -175,37 +175,85 @@ div.xdxf
display: none;
}
/* Abbreviation */
.xdxf_abr
/* Article structure tag */
.xdxf_num
{
font-style: italic;
color: green;
cursor: default;
color: red;
font-weight: bold;
padding-left:-15px;
}
.xdxf_def
{
display: block;
border-color: #e3e3e3;
border-width: 1px;
border-style: dashed;
margin: 1px;
margin-left: 15px;
}
/* Color-highlighted */
.xdxf_c
.xdxf_def:target
{
color: blue;
/* this pseudo-class is used to mark out the referenced <def> that was just clicked */
border-color: red;
border-width: 1px;
border-style: double;
}
/* Abbreviation */
.xdxf_abbr
{
font-style: italic;
color: seagreen;
cursor: default;
border-bottom: 1px seagreen dotted;
}
/* Editorial comment */
.xdxf_co, .xdxf_co_old
{
color: darkslateblue;
font-style: italic;
}
.xdxf_co:before
{
content:"("
}
.xdxf_co:after
{
content:")"
}
/* Grammar information */
.xdxf_gr
{
color: orangered;
display: block;
}
.xdxf_gr_old
{
color: blue; /*orangered;*/
}
/* Example */
.xdxf_ex
{
color: grey;
/* display: block;
margin: 8px;*/
color: #808080;
display: block;
margin-left: 14px;
}
.xdxf_ex_old
{
color: #808080;
}
/* Direct translation */
.xdxf_dtrn
{
}
/* Appearance-only tag */
.xdxf_nu
{
color: red;
font-weight: bold;
color: #dd7800;
}
/* Transcription */
@ -219,7 +267,7 @@ div.xdxf
content:"]";
}
.xdxf_tr
.xdxf_tr, .xdxf_tr_old
{
font-weight: bold;
}
@ -239,9 +287,16 @@ div.xdxf
margin-bottom: 10px;
}
/* The words in examples that are marked out; they are marked out only when placed into <ex> tag */
.xdxf_ex .xdxf_ex_markd, .xdxf_ex_old .xdxf_ex_markd
{
color:black;
background-color:lightgray;
}
.xdxf_opt
{
color: grey;
color: #808080;
}
/******** SDictionary markup classes *********/
@ -391,7 +446,7 @@ div.xdxf
{
color: blue;
cursor: pointer;
vertical-align: center;
vertical-align: text-bottom;
}
/************* MDict dictionaries **************/

132
xdxf.cc
View file

@ -60,7 +60,7 @@ DEF_EX( exCorruptedIndex, "The index file is corrupted", Dictionary::Ex )
enum
{
Signature = 0x46584458, // XDXF on little-endian, FXDX on big-endian
CurrentFormatVersion = 3 + BtreeIndexing::FormatVersion + Folding::Version
CurrentFormatVersion = 4 + BtreeIndexing::FormatVersion + Folding::Version
};
enum ArticleFormat
@ -93,6 +93,7 @@ struct IdxHeader
uint32_t zipIndexBtreeMaxElements; // Two fields from IndexInfo of the zip
// resource index.
uint32_t zipIndexRootOffset;
uint32_t revisionNumber; // Format revision
} __attribute__((packed));
bool indexIsOldOrBad( string const & indexFile )
@ -165,8 +166,7 @@ protected:
private:
/// Loads the article, storing its headword and formatting the data it has
/// into an html.
// Loads the article, storing its headword and formatting article's data into an html.
void loadArticle( uint32_t address,
string & articleText );
@ -526,7 +526,7 @@ void XdxfDictionary::loadArticle( uint32_t address,
if ( &chunk.front() + chunk.size() - propertiesData < 9 )
throw exCorruptedIndex();
// unsigned char fType = (unsigned char) *propertiesData;
unsigned char fType = (unsigned char) *propertiesData;
uint32_t articleOffset, articleSize;
@ -551,7 +551,8 @@ void XdxfDictionary::loadArticle( uint32_t address,
return;
}
articleText = Xdxf2Html::convert( string( articleBody ), Xdxf2Html::XDXF, idxHeader.hasAbrv ? &abrv : NULL, this );
articleText = Xdxf2Html::convert( string( articleBody ), Xdxf2Html::XDXF, idxHeader.hasAbrv ? &abrv : NULL, this,
fType == Logical, idxHeader.revisionNumber );
free( articleBody );
}
@ -679,6 +680,7 @@ QString readXhtmlData( QXmlStreamReader & stream )
void addAllKeyTags( QXmlStreamReader & stream, list< QString > & words )
{
// todo implement support for tag <srt>, that overrides the article sorting order
if ( stream.name() == "k" )
{
words.push_back( stream.readElementText( QXmlStreamReader::SkipChildElements ) );
@ -721,7 +723,8 @@ void indexArticle( GzippedFile & gzFile,
IndexedWords & indexedWords,
ChunkedStorage::Writer & chunks,
unsigned & articleCount,
unsigned & wordCount )
unsigned & wordCount,
ArticleFormat defaultFormat )
{
ArticleFormat format( Default );
@ -732,7 +735,8 @@ void indexArticle( GzippedFile & gzFile,
else
if ( formatValue == "l" )
format = Logical;
if( format == Default )
format = defaultFormat;
size_t articleOffset = gzFile.pos() - 1; // stream.characterOffset() is loony
// uint32_t lineNumber = stream.lineNumber();
@ -970,7 +974,8 @@ sptr< Dictionary::DataRequest > XdxfDictionary::getResource( string const & name
return new XdxfResourceRequest( *this, name );
}
} // anonymous namespace
}
// anonymous namespace - this section of file is devoted to rebuilding of dictionary articles index
vector< sptr< Dictionary::Class > > makeDictionaries(
vector< string > const & fileNames,
@ -1074,6 +1079,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
idxHeader.langTo = LangCoder::findIdForLanguageCode3( str.c_str() );
bool isLogical = ( stream.attributes().value( "format" ) == "logical" );
idxHeader.revisionNumber = stream.attributes().value( "revision" ).toString().toUInt();
idxHeader.articleFormat = isLogical ? Logical : Visual;
@ -1085,7 +1091,8 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
if ( stream.isStartElement() )
{
if ( stream.name() == "full_name" )
// todo implement using short <title> for denoting the dictionary in settings or dict list toolbar
if ( stream.name() == "full_name" || stream.name() == "full_title" )
{
// That's our name
@ -1113,6 +1120,7 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
else
if ( stream.name() == "description" )
{
// todo implement adding other information to the description like <publisher>, <authors>, <file_ver>, <creation_date>, <last_edited_date>, <dict_edition>, <publishing_date>, <dict_src_url>
QString desc = readXhtmlData( stream );
if ( dictionaryDescription.isEmpty() )
@ -1134,64 +1142,92 @@ vector< sptr< Dictionary::Class > > makeDictionaries(
else
if ( stream.name() == "abbreviations" )
{
QString s;
string value;
list < wstring > keys;
while( !( stream.isEndElement() && stream.name() == "abbreviations" ) && !stream.atEnd() )
QString s;
string value;
list < wstring > keys;
while( !( stream.isEndElement() && stream.name() == "abbreviations" ) && !stream.atEnd() )
{
stream.readNext();
// abbreviations tag set switch at format revision = 30
if( idxHeader.revisionNumber >= 30 )
{
while ( !( stream.isEndElement() && stream.name() == "abbr_def" ) || !stream.atEnd() )
{
stream.readNext();
while ( !( stream.isEndElement() && stream.name() == "abr_def" ) && !stream.atEnd() )
if ( stream.isStartElement() && stream.name() == "abbr_k" )
{
stream.readNext();
if ( stream.isStartElement() && stream.name() == "k" )
{
s = stream.readElementText( QXmlStreamReader::SkipChildElements );
keys.push_back( gd::toWString( s ) );
}
else if ( stream.isStartElement() && stream.name() == "v" )
{
s = stream.readElementText( QXmlStreamReader::SkipChildElements );
value = Utf8::encode( Folding::trimWhitespace( gd::toWString( s ) ) );
for( list< wstring >::iterator i = keys.begin(); i != keys.end(); ++i )
{
abrv[ Utf8::encode( Folding::trimWhitespace( *i ) ) ] = value;
}
keys.clear();
}
else if ( stream.isEndElement() && stream.name() == "abbreviations" )
break;
s = stream.readElementText( QXmlStreamReader::SkipChildElements );
keys.push_back( gd::toWString( s ) );
}
else if ( stream.isStartElement() && stream.name() == "abbr_v" )
{
s = stream.readElementText( QXmlStreamReader::SkipChildElements );
value = Utf8::encode( Folding::trimWhitespace( gd::toWString( s ) ) );
for( list< wstring >::iterator i = keys.begin(); i != keys.end(); ++i )
{
abrv[ Utf8::encode( Folding::trimWhitespace( *i ) ) ] = value;
}
keys.clear();
}
else if ( stream.isEndElement() && stream.name() == "abbreviations" )
break;
}
}
else
{
// Legacy abbreviation entry (<abr_def>): gather every <k> key, then bind all
// of them to the <v> value that follows.
// The loop has to stop when EITHER the closing </abr_def> is reached OR the
// reader has run out of input. Joining the two tests with '||' made the
// condition stay true once the reader hit the end of a truncated/broken
// document, so the loop would spin forever; '&&' restores the intended exit.
while ( !( stream.isEndElement() && stream.name() == "abr_def" ) && !stream.atEnd() )
{
  stream.readNext();
  if ( stream.isStartElement() && stream.name() == "k" )
  {
    // A key; several keys may share the value that comes after them
    s = stream.readElementText( QXmlStreamReader::SkipChildElements );
    keys.push_back( gd::toWString( s ) );
  }
  else if ( stream.isStartElement() && stream.name() == "v" )
  {
    // The value: assign it to every key collected so far, then start over
    s = stream.readElementText( QXmlStreamReader::SkipChildElements );
    value = Utf8::encode( Folding::trimWhitespace( gd::toWString( s ) ) );
    for( list< wstring >::iterator i = keys.begin(); i != keys.end(); ++i )
    {
      abrv[ Utf8::encode( Folding::trimWhitespace( *i ) ) ] = value;
    }
    keys.clear();
  }
  else if ( stream.isEndElement() && stream.name() == "abbreviations" )
    break; // end of the whole abbreviations section
}
}
}
}
else
if ( stream.name() == "ar" )
{
indexArticle( gzFile, stream, indexedWords, chunks,
articleCount, wordCount );
articleCount, wordCount, isLogical ? Logical : Visual );
}
}
}
// Write abbreviations if presented
if( !abrv.empty() ) {
idxHeader.hasAbrv = 1;
idxHeader.abrvAddress = chunks.startNewBlock();
if( !abrv.empty() )
{
idxHeader.hasAbrv = 1;
idxHeader.abrvAddress = chunks.startNewBlock();
uint32_t sz = abrv.size();
uint32_t sz = abrv.size();
chunks.addToBlock( &sz, sizeof( uint32_t ) );
for( map< string, string >::const_iterator i = abrv.begin(); i != abrv.end(); ++i )
{
sz = i->first.size();
chunks.addToBlock( &sz, sizeof( uint32_t ) );
for( map< string, string >::const_iterator i = abrv.begin();
i != abrv.end(); ++i )
{
sz = i->first.size();
chunks.addToBlock( &sz, sizeof( uint32_t ) );
chunks.addToBlock( i->first.data(), sz );
sz = i->second.size();
chunks.addToBlock( &sz, sizeof( uint32_t ) );
chunks.addToBlock( i->second.data(), sz );
}
chunks.addToBlock( i->first.data(), sz );
sz = i->second.size();
chunks.addToBlock( &sz, sizeof( uint32_t ) );
chunks.addToBlock( i->second.data(), sz );
}
}
// Finish with the chunks

View file

@ -26,7 +26,37 @@ static void fixLink( QDomElement & el, string const & dictId, const char *attrNa
el.setAttribute( attrName, url.toEncoded().data() );
}
string convert( string const & in, DICT_TYPE type, map < string, string > const * pAbrv, Dictionary::Class *dictPtr )
// Renders an integer as a Roman numeral.
//
// input      - the number to convert; values below 1 yield an empty string.
// lower_case - pass 1 to get lower-case letters; any other value gives
//              upper-case letters.
//
// Numbers of 4000 and above do not fit the plain notation: the part that is a
// multiple of 4000 is expressed as a count of thousands, converted recursively
// and wrapped in parentheses, and the remainder follows it in ordinary
// notation (e.g. 5000 becomes "(IV)M").
string convertToRoman( int input, int lower_case )
{
  static const int values[ 13 ] =
    { 1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1 };
  static const char * const upperSymbols[ 13 ] =
    { "M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I" };
  static const char * const lowerSymbols[ 13 ] =
    { "m", "cm", "d", "cd", "c", "xc", "l", "xl", "x", "ix", "v", "iv", "i" };

  string result;

  if( input >= 4000 )
  {
    // Parenthesised prefix: how many thousands the 4000-multiple part holds
    const int thousands = ( input - input % 4000 ) / 1000;
    result = "(" + convertToRoman( thousands, lower_case ) + ")";
    input %= 4000;
  }

  // Pick the letter case once instead of on every appended symbol
  const char * const * symbols = ( lower_case == 1 ) ? lowerSymbols : upperSymbols;

  // Greedy conversion: keep taking the largest value that still fits
  for( unsigned idx = 0; idx < 13; ++idx )
  {
    for( ; input >= values[ idx ]; input -= values[ idx ] )
      result += symbols[ idx ];
  }

  return result;
}
string convert( string const & in, DICT_TYPE type, map < string, string > const * pAbrv,
Dictionary::Class *dictPtr, bool isLogicalFormat, unsigned revisionNumber )
{
// DPRINTF( "Source>>>>>>>>>>: %s\n\n\n", in.c_str() );
@ -45,13 +75,18 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
{
case '\n':
afterEol = true;
inConverted.append( "<br/>" );
break;
if( !isLogicalFormat )
inConverted.append( "<br/>" );
break;
case '\r':
break;
case ' ':
if ( afterEol )
{
inConverted.append( "&nbsp;" );
if( !isLogicalFormat )
inConverted.append( "&nbsp;" );
break;
}
// Fall-through
@ -80,7 +115,7 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
in_data = "<div class=\"sdct_x\">";
in_data += inConverted + "</div>";
if ( !dd.setContent( QByteArray( in_data.c_str() ), false, &errorStr, &errorLine, &errorColumn ) )
if( !dd.setContent( QByteArray( in_data.c_str() ), false, &errorStr, &errorLine, &errorColumn ) )
{
FDPRINTF( stderr, "Xdxf2html error, xml parse failed: %s at %d,%d\n", errorStr.toLocal8Bit().constData(), errorLine, errorColumn );
FDPRINTF( stderr, "The input was: %s\n", in.c_str() );
@ -95,7 +130,20 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "span" );
el.setAttribute( "class", "xdxf_ex" );
if( isLogicalFormat )
el.setAttribute( "class", "xdxf_ex" );
else
el.setAttribute( "class", "xdxf_ex_old" );
}
nodes = dd.elementsByTagName( "mrkd" ); // marked out words in tranlations/examples of usage
while( nodes.size() )
{
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "span" );
el.setAttribute( "class", "xdxf_ex_markd" );
}
nodes = dd.elementsByTagName( "k" ); // Key
@ -116,6 +164,100 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
}
}
// processing of nested <def>s
if( isLogicalFormat ) // in articles with visual format <def> tags do not effect the formatting.
{
nodes = dd.elementsByTagName( "def" );
// this is a logical type of XDXF, so we need to render proper numbering
// we will do it this way:
// 1. we compute the maximum nesting depth of the article
int maxNestingDepth = 1; // maximum nesting depth of the article
for( int i = 0; i < nodes.size(); i++ )
{
QDomElement el = nodes.at( i ).toElement();
QDomElement nestingNode = el;
int nestingCount = 0;
while ( nestingNode.parentNode().toElement().tagName() == "def" )
{
nestingCount++;
nestingNode = nestingNode.parentNode().toElement();
}
if ( nestingCount > maxNestingDepth )
maxNestingDepth = nestingCount;
}
// 2. in this loop we go layer-by-layer through all <def> and insert proper numbers according to its structure
for( int j = maxNestingDepth; j > 0; j-- ) // j symbolizes special depth to be processed at this iteration
{
int siblingCount = 0; // this that counts the number of among all siblings of this depth
QString numberText = ""; // the number to be inserted into the beginning of <def> (I,II,IV,1,2,3,a),b),c)...)
for( int i = 0; i < nodes.size(); i++ )
{
QDomElement el = nodes.at( i ).toElement();
QDomElement nestingNode = el;
// computing the depth @nestingDepth of a current node @el
int nestingDepth = 0;
while( nestingNode.parentNode().toElement().tagName() == "def" )
{
nestingDepth++;
nestingNode=nestingNode.parentNode().toElement();
}
// we process only the nodes at the current depth @j
// we do this in order not to break the numbering at this depth level
if (nestingDepth == j)
{
siblingCount++;
if( maxNestingDepth == 1 )
{
numberText = numberText.setNum( siblingCount ) + ". ";
}
else if( maxNestingDepth == 2 )
{
if( nestingDepth == 1 )
numberText = numberText.setNum( siblingCount ) + ". ";
if( nestingDepth == 2 )
numberText = numberText.setNum( siblingCount ) + ") ";
}
else
{
if( nestingDepth == 1 )
numberText = QString::fromStdString( convertToRoman(siblingCount,0) + ". " );
if( nestingDepth == 2 )
numberText = numberText.setNum( siblingCount ) + ". ";
if( nestingDepth == 3 )
numberText = numberText.setNum( siblingCount ) + ") ";
if( nestingDepth == 4 )
numberText = QString::fromStdString( convertToRoman(siblingCount,1) + ") " );
}
QDomElement numberNode = dd.createElement( "span" );
numberNode.setAttribute( "class", "xdxf_num" );
QDomText text_num = dd.createTextNode( numberText );
numberNode.appendChild( text_num );
el.insertBefore( numberNode, el.firstChild() );
if ( el.hasAttribute( "cmt" ) )
{
QDomElement cmtNode = dd.createElement( "span" );
cmtNode.setAttribute( "class", "xdxf_co" );
QDomText text_num = dd.createTextNode( el.attribute( "cmt" ) );
cmtNode.appendChild( text_num );
el.insertAfter( cmtNode, el.firstChild() );
}
}
else if( nestingDepth < j ) // if it goes one level up @siblingCount needs to be reset
siblingCount = 0;
}
}
// we finally change all <def> tags into 'xdxf_def' <span>s
while( nodes.size() )
{
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "span" );
el.setAttribute( "class", "xdxf_def" );
}
}
nodes = dd.elementsByTagName( "opt" ); // Optional headword part
while( nodes.size() )
@ -135,6 +277,16 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
el.setTagName( "a" );
el.setAttribute( "href", QString( "bword:" ) + el.text() );
el.setAttribute( "class", "xdxf_kref" );
if ( el.hasAttribute( "idref" ) )
{
// todo implement support for referencing only specific parts of the article
el.setAttribute( "href", QString( "bword:" ) + el.text() + "#" + el.attribute( "idref" ));
}
if ( el.hasAttribute( "kcmt" ) )
{
QDomText kcmtText = dd.createTextNode( " " + el.attribute( "kcmt" ) );
el.parentNode().insertAfter( kcmtText, el );
}
}
nodes = dd.elementsByTagName( "iref" ); // Reference to internet site
@ -147,14 +299,18 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
el.setAttribute( "href", el.text() );
}
nodes = dd.elementsByTagName( "abr" ); // Abbreviation
// Abbreviations
if( revisionNumber < 29 )
nodes = dd.elementsByTagName( "abr" );
else
nodes = dd.elementsByTagName( "abbr" );
while( nodes.size() )
{
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "span" );
el.setAttribute( "class", "xdxf_abr" );
el.setAttribute( "class", "xdxf_abbr" );
if( type == XDXF && pAbrv != NULL )
{
string val = Utf8::encode( Folding::trimWhitespace( gd::toWString( el.text() ) ) );
@ -169,8 +325,7 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
if ( Utf8::decode( i->second ).size() < 70 )
{
// Replace all spaces with non-breakable ones, since that's how
// Lingvo shows tooltips
// Replace all spaces with non-breakable ones, since that's how Lingvo shows tooltips
title.reserve( i->second.size() );
for( char const * c = i->second.c_str(); *c; ++c )
@ -206,14 +361,15 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
{
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "font" );
el.setAttribute( "class", "xdxf_c" );
el.setTagName( "span" );
if ( el.hasAttribute( "c" ) )
{
el.setAttribute( "color", el.attribute( "c" ) );
el.setAttribute( "style", "color:" + el.attribute( "c" ) );
el.removeAttribute( "c" );
}
else
el.setAttribute( "style", "color:blue" );
}
nodes = dd.elementsByTagName( "co" ); // Editorial comment
@ -223,9 +379,48 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "span" );
el.setAttribute( "class", "xdxf_co" );
if( isLogicalFormat )
el.setAttribute( "class", "xdxf_co" );
else
el.setAttribute( "class", "xdxf_co_old" );
}
/* grammar information */
nodes = dd.elementsByTagName( "gr" ); // proper grammar tag
while( nodes.size() )
{
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "span" );
if( isLogicalFormat )
el.setAttribute( "class", "xdxf_gr" );
else
el.setAttribute( "class", "xdxf_gr_old" );
}
nodes = dd.elementsByTagName( "pos" ); // deprecated grammar tag
while( nodes.size() )
{
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "span" );
if( isLogicalFormat )
el.setAttribute( "class", "xdxf_gr" );
else
el.setAttribute( "class", "xdxf_gr_old" );
}
nodes = dd.elementsByTagName( "tense" ); // deprecated grammar tag
while( nodes.size() )
{
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "span" );
if( isLogicalFormat )
el.setAttribute( "class", "xdxf_gr" );
else
el.setAttribute( "class", "xdxf_gr_old" );
}
/* end of grammar generation */
nodes = dd.elementsByTagName( "tr" ); // Transcription
while( nodes.size() )
@ -233,7 +428,10 @@ string convert( string const & in, DICT_TYPE type, map < string, string > const
QDomElement el = nodes.at( 0 ).toElement();
el.setTagName( "span" );
el.setAttribute( "class", "xdxf_tr" );
if( isLogicalFormat )
el.setAttribute( "class", "xdxf_tr" );
else
el.setAttribute( "class", "xdxf_tr_old" );
}
// Ensure that ArticleNetworkAccessManager can deal with XDXF images.

View file

@ -19,7 +19,8 @@ using std::map;
/// Converts the given xdxf markup to an html one. This is currently used
/// for Stardict's 'x' records.
string convert( string const &, DICT_TYPE type = STARDICT, map < string, string > const * pAbrv = NULL, Dictionary::Class *dictPtr = NULL );
string convert( string const &, DICT_TYPE type = STARDICT, map < string, string > const * pAbrv = NULL,
Dictionary::Class *dictPtr = NULL, bool isLogicalFormat = false, unsigned revisionNumber = 0 );
}