KItinerary

pricefinder.cpp
1/*
2 SPDX-FileCopyrightText: 2023 Volker Krause <vkrause@kde.org>
3 SPDX-License-Identifier: LGPL-2.0-or-later
4*/
5
6#include "pricefinder_p.h"
7
8#include <KItinerary/PriceUtil>
9
10#include <QDebug>
11#include <QLocale>
12#include <QRegularExpression>
13
14#include <cmath>
15#include <cstring>
16
17using namespace KItinerary;
18
19std::vector<PriceFinder::CurrencyData> PriceFinder::s_currencyData;
20
21// normalize currency symbols, as e.g. "wide Yen" and "normal Yen" should be considered the same
22static QString normalizeSymbol(QStringView str)
23{
24 QString out;
25 out.reserve(str.size());
26 for (const auto c : str) {
27 if (c.decompositionTag() == QChar::Wide) {
28 out.push_back(c.decomposition().at(0));
29 } else {
30 out.push_back(c);
31 }
32 }
33 return out;
34}
35
36static bool isCollidingSymbol(QStringView lhs, QStringView rhs)
37{
38 return lhs == rhs
39 || (lhs.size() == rhs.size() + 1 && lhs.back() == QLatin1Char('.') && lhs.startsWith(rhs))
40 || (rhs.size() == lhs.size() + 1 && rhs.back() == QLatin1Char('.') && rhs.startsWith(lhs));
41}
42
// Manual corrections applied on top of the currency data obtained from QLocale.
// A null symbol removes the symbol of that currency entirely.
// ### the table is searched with std::lower_bound, so keep it sorted by ISO code
namespace {
struct CurrencyOverride {
    const char isoCode[4];
    const char *symbol;
};

constexpr CurrencyOverride currency_data_overrides[] = {
    // BAM's symbol is "KM", which collides with distance values on train tickets too often
    { "BAM", nullptr },
    // FKP, GIP and SHP are practically GBP-equivalent using the pound sign, SSP has it wrongly assigned in QLocale
    { "GBP", "£" },
    // the Yen sign is also used by CNY and thus ambiguous, but the Japanese Yen symbol works
    { "JPY", "円" },
};
}
53
54PriceFinder::PriceFinder()
55{
56 if (!s_currencyData.empty()) {
57 return;
58 }
59
61 for (const auto &locale : allLocales) {
62 CurrencyData data{locale.currencySymbol(QLocale::CurrencyIsoCode), normalizeSymbol(locale.currencySymbol(QLocale::CurrencySymbol))};
63 if (data.isoCode.isEmpty()) {
64 continue;
65 }
66
67 // single letter symbols tend to be way too trigger-happy
68 if (data.symbol.size() == 1 && data.symbol[0].isLetter()) {
69 //qDebug() << "Dropping single letter symbol:" << data.symbol << data.isoCode;
70 data.symbol.clear();
71 }
72
73 s_currencyData.push_back(std::move(data));
74 }
75
76 // remove duplicates
77 const auto lessThanCurrencyData = [](const auto &lhs, const auto &rhs) {
78 return std::tie(lhs.isoCode, lhs.symbol) < std::tie(rhs.isoCode, rhs.symbol);
79 };
80 std::sort(s_currencyData.begin(), s_currencyData.end(), lessThanCurrencyData);
81 const auto compareCurrencyData = [](const auto &lhs, const auto &rhs) {
82 return lhs.isoCode == rhs.isoCode && lhs.symbol == rhs.symbol;
83 };
84 s_currencyData.erase(std::unique(s_currencyData.begin(), s_currencyData.end(), compareCurrencyData), s_currencyData.end());
85
86 // clear ambigious symbols
87 for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) {
88 if ((*it).symbol.isEmpty()) {
89 continue;
90 }
91 bool collision = false;
92 for (auto it2 = std::next(it); it2 != s_currencyData.end(); ++it2) {
93 if (!isCollidingSymbol((*it).symbol, (*it2).symbol)) {
94 continue;
95 }
96 (*it2).symbol.clear();
97 if (!collision) {
98 qDebug() << "Ambigious currency symbol:" << (*it).symbol;
99 }
100 collision = true;
101 }
102 if (collision) {
103 (*it).symbol.clear();
104 }
105 }
106
107 // apply our own overrides over QLocale
108 for (auto it = s_currencyData.begin(); it != s_currencyData.end(); ++it) {
109 const auto it2 = std::lower_bound(std::begin(currency_data_overrides), std::end(currency_data_overrides), (*it).isoCode, [](const auto &lhs, const auto &rhs) {
110 return std::strncmp(lhs.isoCode, rhs.toLatin1().constData(), 3) < 0;
111 });
112 if (it2 == std::end(currency_data_overrides) || std::strncmp((*it2).isoCode, (*it).isoCode.toLatin1().constData(), 3) != 0) {
113 continue;
114 }
115 (*it).symbol = (*it2).symbol ? QString::fromUtf8((*it2).symbol) : QString();
116 }
117}
118
// Defaulted destructor, defined out of line in this translation unit.
PriceFinder::~PriceFinder() = default;
120
121static bool isBoundaryChar(QChar c)
122{
123 return c != QLatin1Char('-') && (c.isSpace() || c.isPunct() || c.isSymbol());
124}
125
126void PriceFinder::findAll(QStringView text, std::vector<Result> &results) const
127{
128 static QRegularExpression rx(QStringLiteral(R"((?<=\s|[[:punct:]]|^)([^\d\s]{1,4})?[  ]*(\d(?:[\d,.  ]*\d)?)[  ]*([^\d\s]{1,4})?(?=\s|[[:punct:]]|$))"));
129
130 const auto prevResultSize = results.size();
131 qsizetype offset = 0;
132 while (true) {
133 const auto match = rx.matchView(text, offset);
134 if (!match.hasMatch()) {
135 break;
136 }
137 offset = match.capturedEnd(2);
138
139 const auto leadingCurrency = parseCurrency(match.capturedView(1), CurrencyPrefix);
140 const auto trailingCurrency = parseCurrency(match.capturedView(3), CurrencySuffix);
141 if ((leadingCurrency.isEmpty() && trailingCurrency.isEmpty()) || (!leadingCurrency.isEmpty() && !trailingCurrency.isEmpty() && leadingCurrency != trailingCurrency)) {
142 continue;
143 }
144
145 // additional boundary checks not covered by the regular expression
146 if (leadingCurrency.isEmpty() && match.capturedStart(2) > 0 && !isBoundaryChar(text[match.capturedStart(2) - 1])) {
147 continue;
148 }
149 if (trailingCurrency.isEmpty() && match.capturedEnd(2) < text.size() - 2 && !isBoundaryChar(text[match.capturedEnd(2)])) {
150 continue;
151 }
152
153 Result r;
154 r.start = leadingCurrency.isEmpty() ? match.capturedStart(2) : match.capturedStart();
155 r.end = trailingCurrency.isEmpty() ? match.capturedEnd(2) : match.capturedEnd();
156 r.currency = leadingCurrency.isEmpty() ? trailingCurrency : leadingCurrency;
157
158 r.value = parseValue(match.capturedView(2), r.currency);
159 if (std::isnan(r.value)) {
160 continue;
161 }
162
163 results.push_back(std::move(r));
164 }
165
166 // check for overlapping results: in those case we have to assume the entire result is invalid
167 if (results.size() <= 1 + prevResultSize) {
168 return;
169 }
170 for (auto it = results.begin() + prevResultSize; it != std::prev(results.end()); ++it) {
171 if ((*it).end >= (*std::next(it)).start) {
172 qDebug() << "overlapping price data, discarding result";
173 results.erase(results.begin() + prevResultSize, results.end());
174 return;
175 }
176 }
177}
178
179PriceFinder::Result PriceFinder::findHighest(QStringView text) const
180{
181 std::vector<Result> results;
182 findAll(text, results);
183 return highest(results);
184}
185
186bool PriceFinder::isSingleCurrency(const std::vector<Result> &results) const
187{
188 if (results.empty()) {
189 return false;
190 }
191
192 const auto isoCode = results.front().currency;
193 return std::all_of(results.begin(), results.end(), [&isoCode](const auto &r) { return r.currency == isoCode; });
194}
195
196PriceFinder::Result PriceFinder::highest(std::vector<Result> &results) const
197{
198 if (!isSingleCurrency(results)) {
199 return {};
200 }
201
202 std::sort(results.begin(), results.end(), [](const auto &lhs, const auto &rhs) { return lhs.value > rhs.value; });
203 if (results.size() == 1) {
204 return results.front();
205 }
206
207 // check for extremely large differences between the max and max - 1
208 // this can be caused by the fine print containing company capital statements (common e.g. in France)
209 if (results[1].value > 0 && results[0].value / results[1].value > 1000) {
210 // TODO is this reliable enough to return results[1] here?
211 return {};
212 }
213 return results.front();
214}
215
216static bool equalIgnoreDiacritics(QStringView lhs, QStringView rhs)
217{
218 if (lhs.size() != rhs.size()) {
219 return false;
220 }
221
222 for (qsizetype i = 0; i < lhs.size(); ++i) {
223 auto l = lhs[i];
224 if (l.decompositionTag() == QChar::Canonical) {
225 l = l.decomposition().at(0);
226 }
227 auto r = rhs[i];
228 if (r.decompositionTag() == QChar::Canonical) {
229 r = r.decomposition().at(0);
230 }
231 if (l != r) {
232 return false;
233 }
234 }
235
236 return true;
237}
238
239QString PriceFinder::parseCurrency(QStringView s, CurrencyPosition pos) const
240{
241 // trim remaining boundary chars
242 if (s.isEmpty()) {
243 return {};
244 }
245
246 // valid currency ISO code
247 auto isoCandidate = s;
248 while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.last())) {
249 isoCandidate = isoCandidate.left(isoCandidate.size() - 1);
250 }
251 while (!isoCandidate.isEmpty() && isBoundaryChar(isoCandidate.front())) {
252 isoCandidate = isoCandidate.mid(1);
253 }
254 if (isoCandidate.size() == 3) {
255 const auto it = std::lower_bound(s_currencyData.begin(), s_currencyData.end(), isoCandidate, [](const auto &lhs, QStringView rhs) { return lhs.isoCode < rhs; });
256 if (it != s_currencyData.end() && (*it).isoCode == isoCandidate) {
257 return (*it).isoCode;
258 }
259 }
260
261 // currency symbol
262 const auto symbol = normalizeSymbol(s);
263 // exact match: we know there is only ever going to be one (see ctor)
264 const auto it = std::find_if(s_currencyData.begin(), s_currencyData.end(), [&symbol](const auto &data) { return data.symbol == symbol; });
265 if (it != s_currencyData.end())
266 return (*it).isoCode;
267
268 // partial match: needs to be unique
269 QString isoCode;
270 for (const auto &data : s_currencyData) {
271 if (data.symbol.isEmpty()) {
272 continue;
273 }
274
275 // match disregarding diacritics
276 if (equalIgnoreDiacritics(data.symbol, symbol)) {
277 if (!isoCode.isEmpty()) {
278 return {};
279 }
280 isoCode = data.isoCode;
281 }
282
283 // prefix or suffix match
284 if (pos == CurrencyPrefix) {
285 if (symbol.size() <= data.symbol.size() || !symbol.endsWith(data.symbol) || !isBoundaryChar(symbol.at(symbol.size() - data.symbol.size() - 1))) {
286 continue;
287 }
288 } else {
289 if (symbol.size() <= data.symbol.size() || !symbol.startsWith(data.symbol) || !isBoundaryChar(symbol.at(data.symbol.size()))) {
290 continue;
291 }
292 }
293 if (!isoCode.isEmpty()) {
294 return {};
295 }
296 isoCode = data.isoCode;
297 }
298 return isoCode;
299}
300
301double PriceFinder::parseValue(QStringView s, const QString &isoCode) const
302{
303 if (s.isEmpty() || !s[0].isDigit() || !s[s.size() - 1].isDigit()) {
304 return NAN;
305 }
306
307 // find potential decimal separator
308 QChar decimalSeparator;
309 qsizetype decimalSeparatorIndex = -1;
310 for (qsizetype i = s.size() - 1; i > 0; --i) {
311 if (s[i].isDigit()) {
312 continue;
313 }
314 if (!s[i].isSpace()) {
315 decimalSeparator = s[i];
316 decimalSeparatorIndex = i;
317 }
318 break;
319 }
320
321 // identify/validate group separators
322 QChar groupSeparator;
323 qsizetype lastGroupSeparatorIndex = -1;
324 for (qsizetype i = 0; i < s.size(); ++i) {
325 if (s[i].isDigit()) {
326 continue;
327 }
328 if (lastGroupSeparatorIndex > 0 && i - lastGroupSeparatorIndex != 4) { // separator interval is wrong
329 return NAN;
330 }
331 if (decimalSeparatorIndex > 0 && i == decimalSeparatorIndex) { // found the suspected decimal separator
332 break;
333 }
334 if (!groupSeparator.isNull() && s[i] != groupSeparator) { // inconsistent separators
335 return NAN;
336 }
337
338 lastGroupSeparatorIndex = i;
339 groupSeparator = s[i];
340 }
341
342 // we found both and they are the same: has to be the group separator
343 if (!decimalSeparator.isNull() && !groupSeparator.isNull() && decimalSeparator == groupSeparator) {
344 if ((s.size() - decimalSeparatorIndex) != 4) {
345 return NAN;
346 }
347 decimalSeparator = {};
348 decimalSeparatorIndex = -1;
349 }
350
351 // we found a decimal separator: verify the number of decimals is consistent with the currency's subdivision
352 // see https://en.wikipedia.org/wiki/List_of_circulating_currencies
353 if (!decimalSeparator.isNull()) {
354 const auto decimalCount = s.size() - decimalSeparatorIndex - 1;
355 const auto expectedDecimalCount = PriceUtil::decimalCount(isoCode);
356
357 // subdivision x1000 is ambigious if we don't have a group separator
358 if (decimalCount == expectedDecimalCount && decimalCount == 3 && groupSeparator.isNull()) {
359 return NAN;
360 }
361
362 // if decimal count is 3, assume group separator
363 else if (decimalCount != expectedDecimalCount && decimalCount == 3) {
364 if (groupSeparator.isNull()) {
365 groupSeparator = decimalSeparator;
366 decimalSeparator = {};
367 } else {
368 return NAN;
369 }
370 }
371
372 else if (decimalCount > expectedDecimalCount) {
373 return NAN;
374 }
375 }
376
377 // strip group separators, replace decimal separator
378 auto normalized = s.toString();
379 if (!groupSeparator.isNull()) {
380 normalized.remove(groupSeparator);
381 }
382 if (!decimalSeparator.isNull()) {
383 normalized.replace(decimalSeparator, QLatin1Char('.'));
384 }
385
386 bool ok = false;
387 const auto value = normalized.toDouble(&ok);
388 if (!ok) {
389 return NAN;
390 }
391 return value;
392}
static int decimalCount(QStringView currency)
Returns the number of decimals to represent the sub-unit of currency.
Definition priceutil.cpp:92
KCOREADDONS_EXPORT Result match(QStringView pattern, QStringView str)
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
bool isNull() const const
bool isPunct(char32_t ucs4)
bool isSpace(char32_t ucs4)
bool isSymbol(char32_t ucs4)
QList< QLocale > matchingLocales(QLocale::Language language, QLocale::Script script, QLocale::Territory territory)
QString fromUtf8(QByteArrayView str)
void push_back(QChar ch)
void reserve(qsizetype size)
QChar back() const const
bool isEmpty() const const
qsizetype size() const const
bool startsWith(QChar ch) const const
QString toString() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Mon Nov 18 2024 12:09:59 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.