KCodecs

nsUniversalDetector.cpp
1/* -*- C++ -*-
2 SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
3 SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com>
4
5 SPDX-License-Identifier: MIT
6*/
7
8#include "nsUniversalDetector.h"
9
10#include "nsEscCharsetProber.h"
11#include "nsLatin1Prober.h"
12#include "nsMBCSGroupProber.h"
13#include "nsSBCSGroupProber.h"
14
15namespace kencodingprober
16{
17nsUniversalDetector::nsUniversalDetector()
18{
19 mDone = false;
20 mBestGuess = -1; // illegal value as signal
21 mInTag = false;
22 mEscCharSetProber = nullptr;
23
24 mStart = true;
25 mDetectedCharset = nullptr;
26 mGotData = false;
27 mInputState = ePureAscii;
28 mLastChar = '\0';
29
30 unsigned int i;
31 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
32 mCharSetProbers[i] = nullptr;
33 }
34}
35
36nsUniversalDetector::~nsUniversalDetector()
37{
38 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
39 delete mCharSetProbers[i];
40 }
41 delete mEscCharSetProber;
42}
43
44void nsUniversalDetector::Reset()
45{
46 mDone = false;
47 mBestGuess = -1; // illegal value as signal
48 mInTag = false;
49
50 mStart = true;
51 mDetectedCharset = nullptr;
52 mGotData = false;
53 mInputState = ePureAscii;
54 mLastChar = '\0';
55
56 if (mEscCharSetProber) {
57 mEscCharSetProber->Reset();
58 }
59
60 unsigned int i;
61 for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
62 if (mCharSetProbers[i]) {
63 mCharSetProbers[i]->Reset();
64 }
65 }
66}
67
68//---------------------------------------------------------------------
// Confidence thresholds, expressed as floats.
// SHORTCUT_THRESHOLD is not referenced in the visible part of this file.
// MINIMUM_THRESHOLD is the confidence a prober must exceed before
// GetCharSetName()/GetConfidence() report its answer; GetConfidence() also
// returns it as the fallback value when nothing better is available.
69#define SHORTCUT_THRESHOLD (float)0.95
70#define MINIMUM_THRESHOLD (float)0.20
71
72nsProbingState nsUniversalDetector::HandleData(const char *aBuf, unsigned int aLen)
73{
74 if (mDone) {
75 return eFoundIt;
76 }
77
78 if (aLen > 0) {
79 mGotData = true;
80 }
81
82 unsigned int i;
83 for (i = 0; i < aLen; i++) {
84 // other than 0xa0, if every other character is ascii, the page is ascii
85 if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { // Since many Ascii only page contains NBSP
86 // we got a non-ascii byte (high-byte)
87 if (mInputState != eHighbyte) {
88 // adjust state
89 mInputState = eHighbyte;
90
91 // kill mEscCharSetProber if it is active
92 delete mEscCharSetProber;
93 mEscCharSetProber = nullptr;
94
95 // start multibyte and singlebyte charset prober
96 if (nullptr == mCharSetProbers[0]) {
97 mCharSetProbers[0] = new nsMBCSGroupProber;
98 }
99 if (nullptr == mCharSetProbers[1]) {
100 mCharSetProbers[1] = new nsSBCSGroupProber;
101 }
102 if (nullptr == mCharSetProbers[2]) {
103 mCharSetProbers[2] = new nsLatin1Prober;
104 }
105 }
106 } else {
107 // ok, just pure ascii so far
108 if (ePureAscii == mInputState && (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) {
109 // found escape character or HZ "~{"
110 mInputState = eEscAscii;
111 }
112
113 mLastChar = aBuf[i];
114 }
115 }
116
117 nsProbingState st = eDetecting;
118 switch (mInputState) {
119 case eEscAscii:
120 if (nullptr == mEscCharSetProber) {
121 mEscCharSetProber = new nsEscCharSetProber;
122 }
123 st = mEscCharSetProber->HandleData(aBuf, aLen);
124 if (st == eFoundIt) {
125 mDone = true;
126 mDetectedCharset = mEscCharSetProber->GetCharSetName();
127 }
128 break;
129 case eHighbyte:
130 for (i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) {
131 st = mCharSetProbers[i]->HandleData(aBuf, aLen);
132 if (st == eFoundIt) {
133 mDone = true;
134 mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
135 }
136 }
137 break;
138
139 default: // pure ascii
140 mDetectedCharset = "UTF-8";
141 }
142 return st;
143}
144
145//---------------------------------------------------------------------
146const char *nsUniversalDetector::GetCharSetName()
147{
148 if (mDetectedCharset) {
149 return mDetectedCharset;
150 }
151 switch (mInputState) {
152 case eHighbyte: {
153 float proberConfidence;
154 float maxProberConfidence = (float)0.0;
155 int maxProber = 0;
156
157 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
158 proberConfidence = mCharSetProbers[i]->GetConfidence();
159 if (proberConfidence > maxProberConfidence) {
160 maxProberConfidence = proberConfidence;
161 maxProber = i;
162 }
163 }
164 // do not report anything because we are not confident of it, that's in fact a negative answer
165 if (maxProberConfidence > MINIMUM_THRESHOLD) {
166 return mCharSetProbers[maxProber]->GetCharSetName();
167 }
168 }
169 case eEscAscii:
170 break;
171 default: // pure ascii
172 ;
173 }
174 return "UTF-8";
175}
176
177//---------------------------------------------------------------------
178float nsUniversalDetector::GetConfidence()
179{
180 if (!mGotData) {
181 // we haven't got any data yet, return immediately
182 // caller program sometimes call DataEnd before anything has been sent to detector
183 return MINIMUM_THRESHOLD;
184 }
185 if (mDetectedCharset) {
186 return 0.99f;
187 }
188 switch (mInputState) {
189 case eHighbyte: {
190 float proberConfidence;
191 float maxProberConfidence = (float)0.0;
192 int maxProber = 0;
193
194 for (int i = 0; i < NUM_OF_CHARSET_PROBERS; i++) {
195 proberConfidence = mCharSetProbers[i]->GetConfidence();
196 if (proberConfidence > maxProberConfidence) {
197 maxProberConfidence = proberConfidence;
198 maxProber = i;
199 }
200 }
201 // do not report anything because we are not confident of it, that's in fact a negative answer
202 if (maxProberConfidence > MINIMUM_THRESHOLD) {
203 return mCharSetProbers[maxProber]->GetConfidence();
204 }
205 }
206 case eEscAscii:
207 break;
208 default: // pure ascii
209 ;
210 }
211 return MINIMUM_THRESHOLD;
212}
213
214nsProbingState nsUniversalDetector::GetState()
215{
216 if (mDone) {
217 return eFoundIt;
218 } else {
219 return eDetecting;
220 }
221}
222}
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:48:44 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.