7#include "UnicodeGroupProber.h"
12namespace kencodingprober
// Constructor. Allocates one nsCodingStateMachine per Unicode model
// (UTF8SMModel, UCS2LESMModel, UCS2BESMModel), sets mActiveSM to
// NUM_OF_UNICODE_CHARSETS and sets the default detected charset to "UTF-8".
// NOTE(review): the digits fused to the start of lines look like residue of a
// rendered listing; gaps in them suggest braces and some statements are
// absent from this copy - confirm against the original file.
14UnicodeGroupProber::UnicodeGroupProber(
void)
// Slot 0 is driven by the UTF-8 model.
16 mCodingSM[0] =
new nsCodingStateMachine(&UTF8SMModel);
// Slot 1 is driven by the UCS-2 little-endian model.
17 mCodingSM[1] =
new nsCodingStateMachine(&UCS2LESMModel);
// Slot 2 is driven by the UCS-2 big-endian model.
18 mCodingSM[2] =
new nsCodingStateMachine(&UCS2BESMModel);
// HandleData() only consults slots below mActiveSM, so this starts with the
// full NUM_OF_UNICODE_CHARSETS count.
19 mActiveSM = NUM_OF_UNICODE_CHARSETS;
// Charset name reported until a state machine says otherwise.
21 mDetectedCharset =
"UTF-8";
// Destructor. Walks every slot index from 0 up to NUM_OF_UNICODE_CHARSETS.
// NOTE(review): the loop body is not present in this listing; presumably it
// releases the state machines the constructor allocated with `new` -
// confirm against the original file.
24UnicodeGroupProber::~UnicodeGroupProber(
void)
26 for (
unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) {
// Restores the prober to its starting configuration: resets every coding
// state machine, sets mActiveSM back to NUM_OF_UNICODE_CHARSETS and restores
// the default "UTF-8" charset name.
// NOTE(review): the function-local static flags declared in HandleData() are
// not touched by any line visible here.
31void UnicodeGroupProber::Reset(
void)
// Reset each state machine in turn, over all NUM_OF_UNICODE_CHARSETS slots.
34 for (
unsigned int i = 0; i < NUM_OF_UNICODE_CHARSETS; i++) {
35 mCodingSM[i]->Reset();
// Same values the constructor assigns to these two members.
37 mActiveSM = NUM_OF_UNICODE_CHARSETS;
38 mDetectedCharset =
"UTF-8";
// Feeds a buffer through the active coding state machines.
//   aBuf - bytes to examine.
//   aLen - number of bytes in aBuf.
// Returns an nsProbingState (the return statements themselves are not
// present in this listing).
// The visible code has two phases: a byte-statistics heuristic that may
// disable the UTF-16 LE/BE candidates, then a per-byte loop that advances
// each remaining state machine.
// NOTE(review): several lines (braces, some conditions and assignments) are
// absent from this copy; comments below describe only what is visible.
41nsProbingState UnicodeGroupProber::HandleData(
const char *aBuf,
unsigned int aLen)
43 nsSMState codingState;
// Function-local statics: a single copy is shared by every call, and no
// visible line ever sets them back to false.
// NOTE(review): confirm that carrying these flags across calls (and across
// prober instances) is intended; as written it is also not thread-safe.
44 static bool disableUTF16LE =
false;
45 static bool disableUTF16BE =
false;
// Guard for "no state machine left" or "fewer than two bytes supplied";
// the guarded statements are not present in this listing.
47 if (mActiveSM == 0 || aLen < 2) {
// The heuristic below runs only while neither UTF-16 flag has been set.
52 if (!(disableUTF16LE || disableUTF16BE)) {
// Both variants are disabled here; the condition that guards these two
// assignments is not visible in this copy.
54 disableUTF16LE =
true;
55 disableUTF16BE =
true;
// sqrt(aLen) + aLen / 10, stored in a uint so the fractional part is dropped.
57 const uint weight_BOM = sqrt((
double)aLen) + aLen / 10.0;
// counts[i] = number of bytes in the buffer whose value is i, for i = 0..4.
58 uint counts[5] = {0, 0, 0, 0, 0};
59 for (uint i = 0; i < 5; i++) {
60 counts[i] = std::count(aBuf, aBuf + aLen,
char(i));
// Twice the number of bytes valued 0..4, plus weight_BOM, divided by aLen.
62 const double weight_zero = (2.0 * (counts[0] + counts[1] + counts[2] + counts[3] + counts[4]) + weight_BOM) / aLen;
// Too few low-valued bytes: rule out both UTF-16 byte orders.
// log(1.4142) is roughly ln(sqrt(2)), about 0.3466.
63 if (weight_zero < log(1.4142)) {
64 disableUTF16LE =
true;
65 disableUTF16BE =
true;
// Second byte in the range 0..4 and first byte printable according to
// QChar::isPrint: big-endian is ruled out.
67 if (4 >= aBuf[1] && aBuf[1] >= 0 &&
QChar::isPrint(
static_cast<uint
>(aBuf[0]))) {
68 disableUTF16BE =
true;
// Presumably the else branch of the test above (the `else` line is not
// visible): little-endian is ruled out instead.
70 disableUTF16LE =
true;
// Temporary pointer followed by a move of slot 2 into slot 1.
// NOTE(review): looks like half of a swap of slots 1 and 2; the enclosing
// condition and the other assignments are not visible here.
76 nsCodingStateMachine *t;
78 mCodingSM[1] = mCodingSM[2];
// Main loop: for every input byte, advance each active state machine,
// iterating from slot mActiveSM - 1 down to slot 0.
84 for (uint i = 0; i < aLen; ++i) {
85 for (
int j = mActiveSM - 1; j >= 0; --j) {
87 codingState = mCodingSM[j]->NextState(aBuf[i]);
// Error branch for state machine j; its first statements are not visible.
88 if (codingState == eError) {
94 }
// If j is not the slot at index mActiveSM, move machine j into that slot.
// t holds the previous occupant; the write-back into slot j is not visible.
else if (j != (
int)mActiveSM) {
95 nsCodingStateMachine *t;
96 t = mCodingSM[mActiveSM];
97 mCodingSM[mActiveSM] = mCodingSM[j];
100 }
// State machine j reported eItsMe: record the name it returns.
else if (codingState == eItsMe) {
102 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
104 }
// Neither eError nor eItsMe while mState is eDetecting: record the name of
// the machine just advanced.
else if (mState == eDetecting) {
105 mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
// Returns a confidence value as a float. The visible code only shows that the
// result depends on whether mState equals eFoundIt.
// NOTE(review): the returned values are not present in this listing - confirm
// against the original file.
112float UnicodeGroupProber::GetConfidence()
114 if (mState == eFoundIt) {
// Debug helper: for each of the first mActiveSM state machines, writes one
// qDebug() line containing the "Unicode group" label, the value returned by
// DumpCurrentState() and the value returned by GetCodingStateMachine().
122void UnicodeGroupProber::DumpStatus()
125 for (uint i = 0; i < mActiveSM; i++) {
126 qDebug() <<
"Unicode group" << mCodingSM[i]->DumpCurrentState() << mCodingSM[i]->GetCodingStateMachine();
bool isPrint() const const