KCodecs

kencodingprober.cpp
1/*
2 This file is part of the KDE libraries
3
4 SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
5
6 SPDX-License-Identifier: LGPL-2.0-or-later
7*/
8
9#include "kencodingprober.h"
10
11#include "probers/ChineseGroupProber.h"
12#include "probers/JapaneseGroupProber.h"
13#include "probers/UnicodeGroupProber.h"
14#include "probers/nsCharSetProber.h"
15#include "probers/nsMBCSGroupProber.h"
16#include "probers/nsSBCSGroupProber.h"
17#include "probers/nsUniversalDetector.h"
18
19#include <string.h>
20
21class KEncodingProberPrivate
22{
23public:
24 KEncodingProberPrivate()
25 : mProber(nullptr)
26 , mStart(true)
27 {
28 }
29 ~KEncodingProberPrivate()
30 {
31 delete mProber;
32 }
33 void setProberType(KEncodingProber::ProberType pType)
34 {
35 mProberType = pType;
36 /* handle multi-byte encodings carefully , because they're hard to detect,
37 * and have to use some Stastics methods.
38 * for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
39 * because encoding state machine can detect many such encodings.
40 */
41
42 delete mProber;
43
44 switch (mProberType) {
45 case KEncodingProber::None:
46 mProber = nullptr;
47 break;
48 case KEncodingProber::Arabic:
49 case KEncodingProber::Baltic:
50 case KEncodingProber::CentralEuropean:
51 case KEncodingProber::Cyrillic:
52 case KEncodingProber::Greek:
53 case KEncodingProber::Hebrew:
54 case KEncodingProber::NorthernSaami:
55 case KEncodingProber::Other:
56 case KEncodingProber::SouthEasternEurope:
57 case KEncodingProber::Thai:
58 case KEncodingProber::Turkish:
59 case KEncodingProber::WesternEuropean:
60 mProber = new kencodingprober::nsSBCSGroupProber();
61 break;
62 case KEncodingProber::ChineseSimplified:
63 case KEncodingProber::ChineseTraditional:
64 mProber = new kencodingprober::ChineseGroupProber();
65 break;
66 case KEncodingProber::Japanese:
67 mProber = new kencodingprober::JapaneseGroupProber();
68 break;
69 case KEncodingProber::Korean:
70 mProber = new kencodingprober::nsMBCSGroupProber();
71 break;
72 case KEncodingProber::Unicode:
73 mProber = new kencodingprober::UnicodeGroupProber();
74 break;
75 case KEncodingProber::Universal:
76 mProber = new kencodingprober::nsUniversalDetector();
77 break;
78 default:
79 mProber = nullptr;
80 }
81 }
82 void unicodeTest(const char *aBuf, int aLen)
83 {
84 if (mStart) {
85 mStart = false;
86 if (aLen > 3) {
87 switch (aBuf[0]) {
88 case '\xEF':
89 if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
90 // EF BB BF UTF-8 encoded BOM
91 {
92 mProberState = KEncodingProber::FoundIt;
93 }
94 break;
95 case '\xFE':
96 if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
97 // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
98 {
99 mProberState = KEncodingProber::FoundIt;
100 } else if ('\xFF' == aBuf[1])
101 // FE FF UTF-16, big endian BOM
102 {
103 mProberState = KEncodingProber::FoundIt;
104 }
105 break;
106 case '\x00':
107 if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
108 // 00 00 FE FF UTF-32, big-endian BOM
109 {
110 mProberState = KEncodingProber::FoundIt;
111 } else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
112 // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
113 {
114 mProberState = KEncodingProber::FoundIt;
115 }
116 break;
117 case '\xFF':
118 if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
119 // FF FE 00 00 UTF-32, little-endian BOM
120 {
121 mProberState = KEncodingProber::FoundIt;
122 } else if ('\xFE' == aBuf[1])
123 // FF FE UTF-16, little endian BOM
124 {
125 mProberState = KEncodingProber::FoundIt;
126 }
127 break;
128 } // switch
129 }
130 }
131 }
132 KEncodingProber::ProberType mProberType;
133 KEncodingProber::ProberState mProberState;
134 kencodingprober::nsCharSetProber *mProber;
135 bool mStart;
136};
137
138KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType)
139 : d(new KEncodingProberPrivate())
140{
141 setProberType(proberType);
142}
143
144KEncodingProber::~KEncodingProber() = default;
145
147{
148 d->mProberState = KEncodingProber::Probing;
149 d->mStart = true;
150}
151
153{
154 if (!d->mProber) {
155 return d->mProberState;
156 }
157 if (d->mProberState == Probing) {
158 if (d->mStart) {
159 d->unicodeTest(data.constData(), data.size());
160 if (d->mProberState == FoundIt) {
161 return d->mProberState;
162 }
163 }
164 d->mProber->HandleData(data.constData(), data.size());
165 switch (d->mProber->GetState()) {
166 case kencodingprober::eNotMe:
167 d->mProberState = NotMe;
168 break;
169 case kencodingprober::eFoundIt:
170 d->mProberState = FoundIt;
171 break;
172 default:
173 d->mProberState = Probing;
174 break;
175 }
176 }
177#ifdef DEBUG_PROBE
178 d->mProber->DumpStatus();
179#endif
180 return d->mProberState;
181}
182
184{
185 return d->mProberState;
186}
187
189{
190 if (!d->mProber) {
191 return QByteArray("UTF-8");
192 }
193
194 return QByteArray(d->mProber->GetCharSetName());
195}
196
198{
199 if (!d->mProber) {
200 return 0.0;
201 }
202
203 return d->mProber->GetConfidence();
204}
205
206KEncodingProber::ProberType KEncodingProber::proberType() const
207{
208 return d->mProberType;
209}
210
211void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
212{
213 d->setProberType(proberType);
214 reset();
215}
216
217KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString &lang)
218{
219 if (lang.isEmpty()) {
220 return KEncodingProber::Universal;
221 } else if (lang == tr("Disabled", "@item Text character set")) {
222 return KEncodingProber::None;
223 } else if (lang == tr("Universal", "@item Text character set")) {
224 return KEncodingProber::Universal;
225 } else if (lang == tr("Unicode", "@item Text character set")) {
226 return KEncodingProber::Unicode;
227 } else if (lang == tr("Cyrillic", "@item Text character set")) {
228 return KEncodingProber::Cyrillic;
229 } else if (lang == tr("Western European", "@item Text character set")) {
230 return KEncodingProber::WesternEuropean;
231 } else if (lang == tr("Central European", "@item Text character set")) {
232 return KEncodingProber::CentralEuropean;
233 } else if (lang == tr("Greek", "@item Text character set")) {
234 return KEncodingProber::Greek;
235 } else if (lang == tr("Hebrew", "@item Text character set")) {
236 return KEncodingProber::Hebrew;
237 } else if (lang == tr("Turkish", "@item Text character set")) {
238 return KEncodingProber::Turkish;
239 } else if (lang == tr("Japanese", "@item Text character set")) {
240 return KEncodingProber::Japanese;
241 } else if (lang == tr("Baltic", "@item Text character set")) {
242 return KEncodingProber::Baltic;
243 } else if (lang == tr("Chinese Traditional", "@item Text character set")) {
244 return KEncodingProber::ChineseTraditional;
245 } else if (lang == tr("Chinese Simplified", "@item Text character set")) {
246 return KEncodingProber::ChineseSimplified;
247 } else if (lang == tr("Korean", "@item Text character set")) {
248 return KEncodingProber::Korean;
249 } else if (lang == tr("Thai", "@item Text character set")) {
250 return KEncodingProber::Thai;
251 } else if (lang == tr("Arabic", "@item Text character set")) {
252 return KEncodingProber::Arabic;
253 }
254
255 return KEncodingProber::Universal;
256}
257
258QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
259{
260 switch (proberType) {
261 case KEncodingProber::None:
262 return tr("Disabled", "@item Text character set");
263 break;
264 case KEncodingProber::Universal:
265 return tr("Universal", "@item Text character set");
266 break;
267 case KEncodingProber::Arabic:
268 return tr("Arabic", "@item Text character set");
269 break;
270 case KEncodingProber::Baltic:
271 return tr("Baltic", "@item Text character set");
272 break;
273 case KEncodingProber::CentralEuropean:
274 return tr("Central European", "@item Text character set");
275 break;
276 case KEncodingProber::Cyrillic:
277 return tr("Cyrillic", "@item Text character set");
278 break;
279 case KEncodingProber::Greek:
280 return tr("Greek", "@item Text character set");
281 break;
282 case KEncodingProber::Hebrew:
283 return tr("Hebrew", "@item Text character set");
284 break;
285 case KEncodingProber::Japanese:
286 return tr("Japanese", "@item Text character set");
287 break;
288 case KEncodingProber::Turkish:
289 return tr("Turkish", "@item Text character set");
290 break;
291 case KEncodingProber::WesternEuropean:
292 return tr("Western European", "@item Text character set");
293 break;
294 case KEncodingProber::ChineseTraditional:
295 return tr("Chinese Traditional", "@item Text character set");
296 break;
297 case KEncodingProber::ChineseSimplified:
298 return tr("Chinese Simplified", "@item Text character set");
299 break;
300 case KEncodingProber::Korean:
301 return tr("Korean", "@item Text character set");
302 break;
303 case KEncodingProber::Thai:
304 return tr("Thai", "@item Text character set");
305 break;
306 case KEncodingProber::Unicode:
307 return tr("Unicode", "@item Text character set");
308 break;
309 default:
310 return QString();
311 }
312}
void reset()
Resets the prober's internal state and data.
KEncodingProber(ProberType proberType=Universal)
The default ProberType is Universal (detects all supported encodings).
static QString nameForProberType(ProberType proberType)
Maps a ProberType to its language string.
float confidence() const
ProberState state() const
void setProberType(ProberType proberType)
Changes the current prober's ProberType and resets the prober.
static ProberType proberTypeForName(const QString &lang)
ProberState feed(QByteArrayView data)
The main class method.
QByteArray encoding() const
@ Probing
Need more data to make a decision.
@ NotMe
The data is definitely not in any of the encodings supported by the current ProberType.
@ FoundIt
The encoding has been identified with certainty.
const_pointer constData() const const
qsizetype size() const const
bool isEmpty() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:48:44 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.