Index: kdelibs/khtml/misc/decoder.cpp
===================================================================
RCS file: /home/kde/kdelibs/khtml/misc/decoder.cpp,v
retrieving revision 1.50
retrieving revision 1.57
diff -u -r1.50 -r1.57
--- kdelibs/khtml/misc/decoder.cpp 29 Jul 2001 16:26:38 -0000 1.50
+++ kdelibs/khtml/misc/decoder.cpp 14 May 2002 00:37:15 -0000 1.57
@@ -21,15 +21,18 @@
//----------------------------------------------------------------------------
//
// KDE HTML Widget -- decoder for input stream
-// $Id: decoder.cpp,v 1.50 2001/07/29 16:26:38 mueller Exp $
+// $Id: decoder.cpp,v 1.57 2002/05/14 00:37:15 mueller Exp $
#undef DECODE_DEBUG
//#define DECODE_DEBUG
+#include
+
#include "decoder.h"
using namespace khtml;
#include "htmlhashes.h"
+
#include
#include
@@ -38,6 +41,230 @@
#include
#include
+#include
+
+// Heuristic detector for Japanese text encodings; all members are static.
+class KanjiCode
+{
+public:
+    // Verdicts of judge(); UNICODE and UTF8 are never assigned by judge() in this file.
+    enum Type {ASCII, JIS, EUC, SJIS, UNICODE, UTF8 };
+    static enum Type judge(const char *str);
+    static const int ESC;
+    static const int _SS2_;
+    // Per-byte flag table: value 1 marks a Shift_JIS kanji byte, value 2 a half-width kana byte.
+    static const unsigned char kanji_map_sjis[];
+
+    // Returns 1 when code (0..0xff) is flagged as kanji in kanji_map_sjis, otherwise 0.
+    static int ISkanji(int code)
+    {
+        return (code < 0x100) ? (kanji_map_sjis[code & 0xff] & 1) : 0;
+    }
+    // Returns 2 when code (0..0xff) is flagged as kana in kanji_map_sjis, otherwise 0.
+    static int ISkana(int code)
+    {
+        return (code < 0x100) ? (kanji_map_sjis[code & 0xff] & 2) : 0;
+    }
+};
+
+const int KanjiCode::ESC = 0x1b; // escape byte that opens the JIS shift sequences tested in judge()
+const int KanjiCode::_SS2_ = 0x8e; // EUC-JP single-shift-2 byte (SS2); judge() compares against the literal 0x8e
+
+const unsigned char KanjiCode::kanji_map_sjis[] = // indexed by byte value; 1 = Shift_JIS kanji byte, 2 = half-width kana
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 - 0x0f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 - 0x1f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x4f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 - 0x5f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6f */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 - 0x7f */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 - 0x8f: kanji from 0x81 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 - 0x9f: kanji */
+ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xa0 - 0xaf: kana from 0xa1 */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xb0 - 0xbf: kana */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xc0 - 0xcf: kana */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xd0 - 0xdf: kana */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xe0 - 0xef: kanji */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 /* 0xf0 - 0xff: kanji up to 0xfc */
+};
+
+/*
+ * EUC-JP byte sequences:
+ * [0xa1 - 0xfe][0xa1 - 0xfe]
+ * 0x8e[0xa1 - 0xfe] (SS2)
+ * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe] (SS3)
+ *
+ * Shift_JIS double-byte characters:
+ * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
+ *
+ * Shift_JIS half-width (hankaku) kana, single byte:
+ * [0xa1 - 0xdf]
+ */
+
+/*
+ * KanjiCode::judge() is based on judge_jcode() from jvim
+ * http://hp.vector.co.jp/authors/VA003457/vim/
+ *
+ * Special Thanks to Kenichi Tsuchida
+ */
+
+/*
+ * QTextCodec::heuristicContentMatch() would be the obvious alternative,
+ * but it fails to detect the encoding, so it is not used here.
+ */
+
+/*
+ * Guess the Japanese encoding of the NUL-terminated byte string str (NULL is treated as empty).
+ * Returns at once on a decisive byte pattern; otherwise returns the last verdict recorded
+ * during the scan or, if there was none, SJIS/EUC by accumulated score (ASCII on a tie).
+ */
+enum KanjiCode::Type KanjiCode::judge(const char *str)
+{
+    enum Type code = ASCII;
+    int i = 0;
+    int bfr = FALSE; /* Kana Moji: previous byte was an unclassified high byte */
+    int bfk = 0; /* EUC Kana: count of 0x8e + kana byte pairs seen in a row */
+    int sjis = 0;
+    int euc = 0;
+    const unsigned char *ptr = (const unsigned char *) str;
+    int size = str ? (int) strlen(str) : 0; /* guard: strlen(NULL) would crash */
+    while (i < size) {
+        if (ptr[i] == ESC && (size - i >= 3)) {
+            if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
+                || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
+                code = JIS;
+                goto breakBreak;
+            } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
+                       || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
+                code = JIS;
+                goto breakBreak;
+            } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
+                code = JIS;
+                i += 3;
+            } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
+                code = JIS;
+                i += 3;
+            } else {
+                i++;
+            }
+            bfr = FALSE;
+            bfk = 0;
+        } else {
+            if (ptr[i] < 0x20) {
+                bfr = FALSE;
+                bfk = 0;
+                /* control byte: score the two bytes before it (kudokuten or hiragana) */
+                if ((i >= 2) && (ptr[i - 2] == 0x81)
+                    && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
+                    code = SJIS;
+                    sjis += 100; /* kudokuten */
+                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
+                           && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
+                    code = EUC;
+                    euc += 100; /* kudokuten */
+                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
+                    sjis += 40; /* hiragana */
+                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
+                    euc += 40; /* hiragana */
+                }
+            } else {
+                /* score hiragana / katakana byte pairs starting at the current byte */
+                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
+                    sjis++; /* hiragana */
+                } else if ((size - i > 1) && (ptr[i] == 0x83)
+                           && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
+                    sjis++; /* katakana */
+                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
+                    euc++; /* hiragana */
+                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
+                    euc++; /* katakana */
+                }
+                if (bfr) {
+                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
+                        code = SJIS;
+                        goto breakBreak;
+                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
+                        code = SJIS;
+                        goto breakBreak;
+                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
+                        code = EUC;
+                        goto breakBreak;
+                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
+                        code = EUC;
+                        goto breakBreak;
+                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
+                        code = SJIS;
+                        goto breakBreak;
+                    } else if (ptr[i] <= 0x7f) {
+                        code = SJIS;
+                        goto breakBreak;
+                    } else {
+                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
+                            euc++; /* sjis hankaku kana kigo */
+                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
+                            ; /* sjis hankaku kana */
+                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
+                            euc++;
+                        } else if (0x8e == ptr[i]) {
+                            euc++;
+                        } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) { /* never true here: ptr[i] <= 0x7f was handled above */
+                            sjis++;
+                        }
+                        bfr = FALSE;
+                        bfk = 0;
+                    }
+                } else if (0x8e == ptr[i]) {
+                    if (size - i <= 1) {
+                        ; /* 0x8e is the last byte: nothing to look ahead at */
+                    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
+                        /* EUC KANA or SJIS KANJI */
+                        if (bfk == 1) {
+                            euc += 100;
+                        }
+                        bfk++;
+                        i++; /* skip the kana byte as well (plus the i++ at the loop tail) */
+                    } else {
+                        /* SJIS only */
+                        code = SJIS;
+                        goto breakBreak;
+                    }
+                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
+                    /* SJIS only */
+                    code = SJIS;
+                    if ((size - i > 1) /* was ">= 1", which is always true inside this loop */
+                        && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
+                            || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
+                        goto breakBreak;
+                    }
+                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
+                    /* EUC only */
+                    code = EUC;
+                    if ((size - i > 1) /* was ">= 1", which is always true inside this loop */
+                        && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
+                        goto breakBreak;
+                    }
+                } else if (ptr[i] <= 0x7f) {
+                    ;
+                } else {
+                    bfr = TRUE;
+                    bfk = 0;
+                }
+            }
+            i++;
+        }
+    }
+    if (code == ASCII) {
+        if (sjis > euc) {
+            code = SJIS;
+        } else if (sjis < euc) {
+            code = EUC;
+        }
+    }
+breakBreak:
+    return (code);
+}
Decoder::Decoder()
{
@@ -117,14 +344,12 @@
m_decoder = m_codec->makeDecoder();
} else {
- if(m_codec->mibEnum() != 1000) // utf16
- {
- // ### hack for a bug in QTextCodec. It cut's the input stream
- // in case there are \0 in it. ZDNET has them inside... :-(
+ if(m_codec->mibEnum() != 1000) { // utf16
+ // replace '\0' by spaces, for buggy pages
char *d = const_cast(data);
int i = len - 1;
while(i >= 0) {
- if(*(d+i) == 0) *(d+i) = ' ';
+ if(d[i] == 0) d[i] = ' ';
i--;
}
}
@@ -220,7 +445,7 @@
default:
body = true;
#ifdef DECODE_DEBUG
- kdDebug( 6005 ) << "Decoder: no charset found, using latin1. Id=" << id << endl;
+ kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;
#endif
goto found;
}
@@ -233,6 +458,32 @@
}
found:
+ if (!haveEncoding && KGlobal::locale()->languageList()[0] == "ja") {
+#ifdef DECODE_DEBUG
+ kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;
+#endif
+ switch ( KanjiCode::judge( data ) ) {
+ case KanjiCode::JIS:
+ enc = "jis7";
+ break;
+ case KanjiCode::EUC:
+ enc = "eucjp";
+ break;
+ case KanjiCode::SJIS:
+ enc = "sjis";
+ break;
+ default:
+ enc = NULL;
+ break;
+ }
+#ifdef DECODE_DEBUG
+ kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc << endl;
+#endif
+ if (!enc.isEmpty()) {
+ setEncoding(enc, true);
+ }
+ }
+
// if we still haven't found an encoding latin1 will be used...
// this is according to HTML4.0 specs
if (!m_codec)
@@ -270,7 +521,7 @@
// the hell knows, why the output does sometimes have a QChar::null at
// the end...
if(out[out.length()-1] == QChar::null)
- out.truncate(out.length() - 1);
+ assert(0);
return out;
}