KItinerary

nameoptimizer.cpp
1/*
2 SPDX-FileCopyrightText: 2022 Volker Krause <vkrause@kde.org>
3 SPDX-License-Identifier: LGPL-2.0-or-later
4*/
5
6#include "nameoptimizer_p.h"
7#include "stringutil.h"
8
9#include <KItinerary/Person>
10
11#include <QDebug>
12#include <QMetaProperty>
13#include <QRegularExpression>
14
15using namespace KItinerary;
16
// Regular expression templates used to recover a given name that was cut off
// in the source data. Each template takes two placeholders that are filled via
// arg(): %1 is the (possibly truncated) given name, %2 is the family name.
// Capture group 1 yields the given name extended by the following word characters.
static const char *const name_truncation_pattern[] = {
    "(?:^|\\s)(%1\\w+) %2(?:$|\\s)",      // "<given...> <family>"
    "(?:^|\\s)%2 ?/ ?(%1\\w+)(?:$|\\s)",  // "<family>/<given...>", optional spaces around the slash
    "(?:^|\\s)%2, (%1\\w+)(?:$|\\s)",     // "<family>, <given...>"
};
22
23Person NameOptimizer::optimizeName(const QString &text, const Person &person)
24{
25 Person p(person);
26 if (p.givenName().isEmpty() && p.familyName().isEmpty()) {
27 p.setName(optimizeNameString(text, p.name().trimmed()));
28 return p;
29 }
30
31 p.setFamilyName(optimizeNameString(text, p.familyName().trimmed()));
32 p.setGivenName(optimizeNameString(text, p.givenName().trimmed()));
33
34 // check for IATA BCBP truncation effects
35 // IATA BCBP has a 20 character size limit, with one character used for separating name parts
36 if (person.givenName() == p.givenName() && (person.familyName().size() + person.givenName().size()) == 19 && person.givenName().size() >= 3) {
37 for (auto pattern : name_truncation_pattern) {
39 QRegularExpression::escape(p.givenName()),
40 QRegularExpression::escape(p.familyName())),
42 const auto match = rx.match(text);
43 if (match.hasMatch()) {
44 p.setGivenName(match.captured(1));
45 break;
46 }
47 }
48 }
49
50 return p;
51}
52
// Honorifics recognized by isNamePrefix() and stripNamePrefix().
// Entries are upper case and compared as-is by those helpers.
static const char *const name_prefixes[] = {
    "DR", "MR", "MRS", "MS"
};
56
57static bool isNamePrefix(QStringView s)
58{
59 s = s.trimmed();
60 return std::any_of(
61 std::begin(name_prefixes), std::end(name_prefixes),
62 [s](const char *prefix) { return s == QLatin1StringView(prefix); });
63}
64
65static QStringView stripNamePrefix(QStringView s)
66{
67 for (auto prefix : name_prefixes) {
68 QLatin1StringView p(prefix);
69 if (s.endsWith(p) && s.size() > p.size() &&
70 s[s.size() - p.size() - 1] == QLatin1Char(' ')) {
71 return s.left(s.size() - p.size() - 1);
72 }
73 }
74
75 return s;
76}
77
78static bool isSameChar(QChar c1, QChar c2)
79{
80 if (c1 == c2) {
81 return true;
82 }
83
84 if (c1.decompositionTag() != c2.decompositionTag()) {
86 c1 = c1.decomposition().at(0);
87 }
89 c2 = c2.decomposition().at(0);
90 }
91 return c1 == c2;
92 }
93
94 return false;
95}
96
97QString NameOptimizer::optimizeNameString(const QString &text, const QString &name)
98{
99 if (name.size() < 2) {
100 return name;
101 }
102
103 for (int i = 0; i < text.size(); ++i) {
104 bool mismatch = false;
105 int nameLen = 0;
106 for (int j = 0; j < name.size(); ++j, ++nameLen) {
107 // reached the end of text
108 if (i + nameLen >= text.size()) {
109 // remainder is either a prefix placed as suffix (see below), or we are unsuccessful
110 if (!isNamePrefix(QStringView(name).mid(j))) {
111 mismatch = true;
112 }
113 break;
114 }
115
116 auto c1 = text.at(i+nameLen).toCaseFolded();
117 auto c2 = name.at(j).toCaseFolded();
118
119 if (isSameChar(c1, c2)) {
120 continue;
121 }
122
123 // expand spaces missing in name
124 if (nameLen > 0 && c1 == QLatin1Char(' ') && (i + nameLen + 1) < text.size() && isSameChar(text.at(i+nameLen+1).toCaseFolded(), c2)) {
125 ++nameLen;
126 continue;
127 }
128
129 // mismatch: check if the remainder is a name prefix (yes, those also occur frequently as suffixes of name parts in IATA BCBP for example)
130 if (isNamePrefix(QStringView(name).mid(j))) {
131 nameLen = QStringView(name).left(nameLen).trimmed().size();
132 break;
133 }
134
135 mismatch = true;
136 break;
137 }
138 if (mismatch) {
139 continue;
140 }
141
142 // test for word boundaries
143 if (i > 0 && text.at(i-1).isLetter()) {
144 continue;
145 }
146 if (i + nameLen < text.size() && text.at(i + nameLen).isLetter()) {
147 continue;
148 }
149
150 const auto betterName = QStringView(text).mid(i, nameLen).trimmed();
151 if (StringUtil::betterString(betterName, name) != name) {
152 return stripNamePrefix(betterName).toString();
153 }
154 }
155
156 return name;
157}
158
159QVariant NameOptimizer::optimizeNameRecursive(const QString &text, QVariant object)
160{
161 if (JsonLd::isA<Person>(object)) {
162 return optimizeName(text, object.value<Person>());
163 }
164
165 const auto mo = QMetaType(object.userType()).metaObject();
166 if (!mo) {
167 return object;
168 }
169
170 for (int i = 0; i < mo->propertyCount(); ++i) {
171 const auto prop = mo->property(i);
172 const auto subMo = QMetaType(prop.userType()).metaObject();
173 if (!prop.isStored() || prop.isEnumType()|| (!subMo && prop.userType() != QMetaType::QVariant)) {
174 continue;
175 }
176 const auto value = optimizeNameRecursive(text, prop.readOnGadget(object.constData()));
177 prop.writeOnGadget(object.data(), value);
178 }
179
180 return object;
181}
A person.
Definition person.h:20
KCOREADDONS_EXPORT Result match(QStringView pattern, QStringView str)
bool isA(const QVariant &value)
Returns true if value is of type T.
Definition datatypes.h:24
QStringView betterString(QStringView lhs, QStringView rhs)
Assuming both sides are describing the same thing, this tries to find the "better" string.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
QString decomposition(char32_t ucs4)
Decomposition decompositionTag(char32_t ucs4)
bool isLetter(char32_t ucs4)
char32_t toCaseFolded(char32_t ucs4)
const QMetaObject * metaObject() const const
QString escape(QStringView str)
const QChar at(qsizetype position) const const
qsizetype size() const const
QStringView left(qsizetype length) const const
QStringView mid(qsizetype start, qsizetype length) const const
bool endsWith(QChar ch) const const
qsizetype size() const const
QString toString() const const
QStringView trimmed() const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:50:01 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.