KItinerary

engine/extractorengine.cpp
1/*
2 SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "config-kitinerary.h"
8#include "extractorengine.h"
9
10#include "barcodedecoder.h"
11#include "abstractextractor.h"
12#include "extractordocumentnode.h"
13#include "extractordocumentnodefactory.h"
14#include "extractordocumentprocessor.h"
15#include "extractorresult.h"
16#include "extractorrepository.h"
17#include "extractorscriptengine_p.h"
18#include "jsonlddocument.h"
19#include "logging.h"
20
21#include <QDateTime>
22#include <QFile>
23#include <QFileInfo>
24#include <QJsonArray>
25#include <QJsonDocument>
26#include <QJsonObject>
27#include <QLocale>
28
29#include <cstring>
30
31using namespace KItinerary;
32
33namespace KItinerary {
34
35class ExtractorEnginePrivate {
36public:
37 void processNode(ExtractorDocumentNode &node);
38
39 ExtractorEngine *q = nullptr;
40 std::vector<const AbstractExtractor*> m_additionalExtractors;
41 ExtractorDocumentNode m_rootNode;
42 ExtractorDocumentNode m_contextNode;
43 ExtractorDocumentNodeFactory m_nodeFactory;
45 BarcodeDecoder m_barcodeDecoder;
46 ExtractorScriptEngine m_scriptEngine;
47 ExtractorEngine::Hints m_hints = ExtractorEngine::NoHint;
48};
49
50}
51
52void ExtractorEnginePrivate::processNode(ExtractorDocumentNode& node)
53{
54 if (node.isNull()) {
55 return;
56 }
57
58 node.processor()->expandNode(node, q);
59 for (auto c : node.childNodes()) {
60 processNode(c);
61 }
62 node.processor()->reduceNode(node);
63
64 node.processor()->preExtract(node, q);
65 std::vector<const AbstractExtractor*> extractors = m_additionalExtractors;
66 m_repo.extractorsForNode(node, extractors);
67
68 ExtractorResult nodeResult;
69 QString usedExtractor;
70 for (const auto &extractor : extractors) {
71 auto res = extractor->extract(node, q);
72 if (!res.isEmpty()) {
73 usedExtractor = extractor->name();
74 nodeResult.append(std::move(res));
75 }
76 }
77 if (!nodeResult.isEmpty()) {
78 node.setResult(std::move(nodeResult));
79 node.setUsedExtractor(usedExtractor);
80 }
81
82 node.processor()->postExtract(node, q);
83
84 // set modification time for all results that don't have it yet
85 if (node.contextDateTime().isValid()) {
86 auto result = node.result().jsonLdResult();
87 for (int i = 0; i < result.size(); ++i) {
88 auto res = result.at(i).toObject();
89 if (!res.contains(QLatin1StringView("modifiedTime"))) {
90 res.insert(QStringLiteral("modifiedTime"),
92 }
93 result[i] = res;
94 }
95 node.setResult(result);
96 }
97}
98
99
100ExtractorEngine::ExtractorEngine()
101 : d(new ExtractorEnginePrivate)
102{
103 d->q = this;
104}
105
106ExtractorEngine::ExtractorEngine(ExtractorEngine &&) noexcept = default;
107
109{
110 // ensure we destroy nodes before we destroy the node factory
111 clear();
112}
113
115{
116 d->m_rootNode = {};
117 d->m_contextNode = {};
118}
119
120void ExtractorEngine::setData(const QByteArray &data, QStringView fileName, QStringView mimeType)
121{
122 d->m_rootNode = d->m_nodeFactory.createNode(data, fileName, mimeType);
123}
124
126{
127 d->m_rootNode = d->m_nodeFactory.createNode(data, mimeType);
128}
129
131{
132 d->m_contextNode = d->m_nodeFactory.createNode(data, mimeType);
133}
134
136{
137 d->m_contextNode.setContextDateTime(dt);
138}
139
141{
142 return d->m_hints;
143}
144
146{
147 d->m_hints = hints;
148}
149
151{
152 d->m_rootNode.setParent(d->m_contextNode);
153 d->processNode(d->m_rootNode);
154 return d->m_rootNode.result().jsonLdResult();
155}
156
158{
159 d->m_nodeFactory.setUseSeparateProcess(separateProcess);
160}
161
162void ExtractorEngine::setAdditionalExtractors(std::vector<const AbstractExtractor*> &&extractors)
163{
164 d->m_additionalExtractors = std::move(extractors);
165}
166
168{
169 return d->m_rootNode.usedExtractor();
170}
171
173{
174 return &d->m_nodeFactory;
175}
176
178{
179 return &d->m_barcodeDecoder;
180}
181
182const ExtractorRepository* ExtractorEngine::extractorRepository() const
183{
184 return &d->m_repo;
185}
186
187const ExtractorScriptEngine* ExtractorEngine::scriptEngine() const
188{
189 d->m_scriptEngine.setExtractorEngine(const_cast<ExtractorEngine*>(this));
190 return &d->m_scriptEngine;
191}
192
193ExtractorDocumentNode ExtractorEngine::rootDocumentNode() const
194{
195 return d->m_rootNode;
196}
197
198void ExtractorEngine::processNode(ExtractorDocumentNode &node) const
199{
200 d->processNode(node);
201}
Barcode decoding with result caching.
Instantiates KItinerary::ExtractorDocumentNode instances using the type-specific document processor.
A node in the extracted document object tree.
QJsonArray result
Result access for QJSEngine.
void setResult(ExtractorResult &&result)
Replace the existing results by result.
QVariantList childNodes
Child nodes, for QJSEngine access.
QDateTime contextDateTime
The best known context date/time at this point in the document tree.
Semantic data extraction engine.
void setAdditionalExtractors(std::vector< const AbstractExtractor * > &&extractors)
Sets additional extractors to run on the given data.
Hints hints() const
The currently set extraction hints.
void setData(const QByteArray &data, QStringView fileName={}, QStringView mimeType={})
Set raw data to extract from.
void setContent(const QVariant &data, QStringView mimeType)
Already decoded data to extract from.
QString usedCustomExtractor() const
Returns the extractor id used to obtain the result.
void clear()
Resets the internal state, call before processing new input data.
void setContextDate(const QDateTime &dt)
Set the date the extracted document has been issued at.
void setHints(Hints hints)
Set extraction hints.
QJsonArray extract()
Perform the actual extraction, and return the JSON-LD data that has been found.
const BarcodeDecoder * barcodeDecoder() const
Barcode decoder for use by KItinerary::ExtractorDocumentProcessor.
void setContext(const QVariant &data, QStringView mimeType)
Provide a document part that is only used to determine which extractor to use, but not for extraction itself.
const ExtractorDocumentNodeFactory * documentNodeFactory() const
Factory for creating new document nodes.
void setUseSeparateProcess(bool separateProcess)
Perform extraction of "risky" content such as PDF files in a separate process.
Collection of all known data extractors.
void extractorsForNode(const ExtractorDocumentNode &node, std::vector< const AbstractExtractor * > &extractors) const
Finds matching extractors for the given document node.
Generic extraction result.
void append(ExtractorResult &&other)
Append another result to this one.
bool isEmpty() const
Checks if there is any relevant result set in here.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
bool isValid() const
QString toString(QStringView format, QCalendar cal) const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:50:00 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.