KItinerary

extractordocumentnodefactory.cpp
1/*
2 SPDX-FileCopyrightText: 2021 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "extractordocumentnodefactory.h"
8#include "extractordocumentnode.h"
9#include "extractordocumentprocessor.h"
10#include "logging.h"
11
12#include "processors/binarydocumentprocessor.h"
13#include "processors/dosipasdocumentprocessor.h"
14#include "processors/eradocumentprocessor.h"
15#include "processors/externalprocessor.h"
16#include "processors/htmldocumentprocessor.h"
17#include "processors/httpresponseprocessor.h"
18#include "processors/iatabcbpdocumentprocessor.h"
19#include "processors/icaldocumentprocessor.h"
20#include "processors/imagedocumentprocessor.h"
21#include "processors/jsonlddocumentprocessor.h"
22#include "processors/mimedocumentprocessor.h"
23#include "processors/pdfdocumentprocessor.h"
24#include "processors/pkpassdocumentprocessor.h"
25#include "processors/plistdocumentprocessor.h"
26#include "processors/textdocumentprocessor.h"
27#include "processors/uic9183documentprocessor.h"
28#include "processors/vdvdocumentprocessor.h"
29
30#include <QHash>
31#include <QMimeDatabase>
32
33using namespace KItinerary;
34
// Size limits (in bytes) applied to raw input before any processor is tried.
enum {
    MinDocumentSize = 4, // compared with "<=" by the caller, so exactly this size is still rejected
    MaxDocumentSize = 10000000, // compared with ">" by the caller, so exactly this size is still accepted
};
39
40namespace KItinerary {
41class ExtractorDocumentNodeFactoryStatic {
42public:
43 ExtractorDocumentNodeFactoryStatic();
44
45 void registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
46 std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {});
47
48 template <typename T>
49 inline void registerProcessor(QStringView canonicalMimeType, std::initializer_list<QStringView> aliasMimeTypes = {}, QStringView fallbackMimeType = {})
50 {
51 registerProcessor(std::make_unique<T>(), canonicalMimeType, aliasMimeTypes, fallbackMimeType);
52 }
53
54 void registerBuiltIn();
55 QStringView resolveAlias(QStringView mimeType) const;
56
57 struct ProcessorData {
58 QString mimeType;
59 const ExtractorDocumentProcessor* processor;
60 };
61 std::vector<ProcessorData> m_probeProcessors;
62 std::vector<ProcessorData> m_fallbackProbeProcessors;
63 std::vector<ProcessorData> m_mimetypeProcessorMap;
64 QHash<QString, QString> m_aliasMap;
65
66 // just for memory management
67 std::vector<std::unique_ptr<ExtractorDocumentProcessor>> processorPool;
68
69 static void insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap);
70};
71
72class ExtractorDocumentNodeFactoryPrivate {
73public:
74 ExtractorDocumentNodeFactoryStatic *s;
75 std::unique_ptr<ExtractorDocumentProcessor> interceptProcessor;
76};
77}
78
79ExtractorDocumentNodeFactoryStatic::ExtractorDocumentNodeFactoryStatic()
80{
81 registerBuiltIn();
82}
83
84void ExtractorDocumentNodeFactoryStatic::insertProcessor(const ExtractorDocumentProcessor *proc, QStringView mimeType, std::vector<ProcessorData> &procMap)
85{
86 if (mimeType.empty()) {
87 return;
88 }
89
90 const auto it = std::lower_bound(procMap.begin(), procMap.end(), mimeType, [](const auto &proc, auto mt) {
91 return proc.mimeType < mt;
92 });
93 if (it != procMap.end() && (*it).mimeType == mimeType) {
94 qCWarning(Log) << "Document processor already registered for mimetype:" << mimeType;
95 return;
96 }
97
98 procMap.insert(it, { mimeType.toString(), proc });
99}
100
101void ExtractorDocumentNodeFactoryStatic::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView canonicalMimeType,
102 std::initializer_list<QStringView> aliasMimeTypes, QStringView fallbackMimeType)
103{
104 insertProcessor(processor.get(), canonicalMimeType, m_probeProcessors);
105 insertProcessor(processor.get(), canonicalMimeType, m_mimetypeProcessorMap);
106 for (const auto mt : aliasMimeTypes) {
107 m_aliasMap.insert(mt.toString(), canonicalMimeType.isEmpty() ? fallbackMimeType.toString() : canonicalMimeType.toString());
108 }
109 if (!fallbackMimeType.isEmpty()) { // priorioty order matters for fallbacks, don't sort them!
110 m_fallbackProbeProcessors.push_back({ fallbackMimeType.toString(), processor.get() });
111 }
112 insertProcessor(processor.get(), fallbackMimeType, m_mimetypeProcessorMap);
113 processorPool.push_back(std::move(processor));
114}
115
116void ExtractorDocumentNodeFactoryStatic::registerBuiltIn()
117{
118 registerProcessor<PdfDocumentProcessor>(u"application/pdf");
119 registerProcessor<PkPassDocumentProcessor>(u"application/vnd.apple.pkpass");
120 registerProcessor<IcalEventProcessor>(u"internal/event");
121 registerProcessor<ImageDocumentProcessor>(u"internal/qimage", {u"image/png", u"image/jpeg", u"image/gif"});
122 registerProcessor<ElbDocumentProcessor>(u"internal/era-elb");
123 registerProcessor<SsbDocumentProcessor>(u"internal/era-ssb");
124 registerProcessor<IataBcbpDocumentProcessor>(u"internal/iata-bcbp");
125 registerProcessor<Uic9183DocumentProcessor>(u"internal/uic9183");
126 registerProcessor<DosipasDocumentProcessor>(u"internal/uic-dosipas");
127 registerProcessor<VdvDocumentProcessor>(u"internal/vdv");
128 registerProcessor<IcalCalendarProcessor>(u"text/calendar");
129 registerProcessor<PListDocumentProcessor>(u"application/x-plist");
130 registerProcessor<HttpResponseProcessor>(u"internal/http-response");
131 registerProcessor<HarDocumentProcessor>(u"internal/har-archive");
132
133 // fallback types that catch a very broad set of input types
134 // order matters particularly here, the broadest ones need to go last
135 registerProcessor<JsonLdDocumentProcessor>({}, {u"application/json"}, u"application/ld+json");
136 registerProcessor<MimeDocumentProcessor>({}, {u"application/mbox"}, u"message/rfc822");
137 registerProcessor<HtmlDocumentProcessor>({}, {u"application/xhtml+xml"}, u"text/html");
138 registerProcessor<TextDocumentProcessor>({}, {}, u"text/plain");
139 registerProcessor<BinaryDocumentProcessor>({}, {}, u"application/octet-stream");
140}
141
142QStringView ExtractorDocumentNodeFactoryStatic::resolveAlias(QStringView mimeType) const
143{
144 const auto it = m_aliasMap.find(mimeType.toString());
145 if (it != m_aliasMap.end()) {
146 return it.value();
147 }
148 return mimeType;
149}
150
151
152ExtractorDocumentNodeFactory::ExtractorDocumentNodeFactory()
153 : d(std::make_unique<ExtractorDocumentNodeFactoryPrivate>())
154{
155 static ExtractorDocumentNodeFactoryStatic s_factory;
156 d->s = &s_factory;
157}
158
159ExtractorDocumentNodeFactory::~ExtractorDocumentNodeFactory() = default;
160
162{
163 if (data.size() <= MinDocumentSize || data.size() > MaxDocumentSize) {
164 return {};
165 }
166
167 if (d->interceptProcessor && d->interceptProcessor->canHandleData(data, fileName)) {
168 auto node = d->interceptProcessor->createNodeFromData(data);
169 if (node.mimeType().isEmpty()) {
170 node.setMimeType(QStringLiteral("internal/external-process"));
171 }
172 node.setProcessor(d->interceptProcessor.get());
173 return node;
174 }
175
176 if (mimeType.isEmpty()) {
177 QMimeDatabase db;
178 QString autoDetectedMimeType;
179 if (fileName.isEmpty()) {
180 autoDetectedMimeType = db.mimeTypeForData(data).name();
181 } else {
182 autoDetectedMimeType = db.mimeTypeForFileNameAndData(fileName.toString(), data).name();
183 }
184 mimeType = d->s->resolveAlias(autoDetectedMimeType);
185
186 // let processors check themselves if they support this data, or whether the auto-detected mimetype matches
187 for (const auto &p : d->s->m_probeProcessors) {
188 if (p.processor->canHandleData(data, fileName) || (!mimeType.isEmpty() && p.mimeType == mimeType)) {
189 auto node = p.processor->createNodeFromData(data);
190 if (node.content().isNull()) {
191 continue;
192 }
193
194 node.setMimeType(p.mimeType);
195 node.setProcessor(p.processor);
196 return node;
197 }
198 }
199
200 // try the basic types that ultimately will accept anything
201 for (const auto &p : d->s->m_fallbackProbeProcessors) {
202 if (p.processor->canHandleData(data, fileName)) {
203 auto node = p.processor->createNodeFromData(data);
204 if (node.content().isNull()) {
205 continue;
206 }
207
208 node.setMimeType(p.mimeType);
209 node.setProcessor(p.processor);
210 return node;
211 }
212 }
213
214 // unreachable as application/octet-stream will always match
215 return {};
216 }
217
218 mimeType = d->s->resolveAlias(mimeType);
219 const auto it = std::lower_bound(d->s->m_mimetypeProcessorMap.begin(), d->s->m_mimetypeProcessorMap.end(), mimeType, [](const auto &proc, auto mt) {
220 return proc.mimeType < mt;
221 });
222 if (it == d->s->m_mimetypeProcessorMap.end() || (*it).mimeType != mimeType) {
223 qCDebug(Log) << "No document processor found for mimetype" << mimeType;
224 return {};
225 }
226
227 auto node = (*it).processor->createNodeFromData(data);
228 node.setMimeType((*it).mimeType);
229 node.setProcessor((*it).processor);
230 return node;
231}
232
234{
235 mimeType = d->s->resolveAlias(mimeType);
236 const auto it = std::lower_bound(d->s->m_mimetypeProcessorMap.begin(), d->s->m_mimetypeProcessorMap.end(), mimeType, [](const auto &proc, auto mt) {
237 return proc.mimeType < mt;
238 });
239 if (it == d->s->m_mimetypeProcessorMap.end() || (*it).mimeType != mimeType) {
240 qCDebug(Log) << "No document processor found for mimetype" << mimeType;
241 return {};
242 }
243
244 auto node = (*it).processor->createNodeFromContent(decodedData);
245 node.setMimeType((*it).mimeType);
246 node.setProcessor((*it).processor);
247 return node;
248}
249
250void ExtractorDocumentNodeFactory::registerProcessor(std::unique_ptr<ExtractorDocumentProcessor> &&processor, QStringView mimeType,
251 std::initializer_list<QStringView> aliasMimeTypes)
252{
253 d->s->registerProcessor(std::move(processor), mimeType, aliasMimeTypes);
254}
255
257{
258 if (separateProcess && !d->interceptProcessor) {
259 d->interceptProcessor = std::make_unique<ExternalProcessor>();
260 } else if (!separateProcess && d->interceptProcessor) {
261 d->interceptProcessor.reset();
262 }
263}
void registerProcessor(std::unique_ptr< ExtractorDocumentProcessor > &&processor, QStringView canonicalMimeType, std::initializer_list< QStringView > aliasMimeTypes={})
Register a new document processor.
ExtractorDocumentNode createNode(const QByteArray &data, QStringView fileName={}, QStringView mimeType={}) const
Create a new document node from data.
void setUseSeparateProcess(bool separateProcess)
Perform extraction of "risky" content such as PDF files in a separate process.
A node in the extracted document object tree.
Abstract base class of a document type processor.
KCALUTILS_EXPORT QString mimeType()
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
qsizetype size() const
QMimeType mimeTypeForData(QIODevice *device) const
QMimeType mimeTypeForFileNameAndData(const QString &fileName, QIODevice *device) const
bool isEmpty() const
bool isEmpty() const
QString toString() const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Apr 4 2025 12:02:17 by doxygen 1.13.2 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.