KItinerary

scriptextractor.h
1/*
2 SPDX-FileCopyrightText: 2017-2021 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#pragma once
8
9#include "abstractextractor.h"
10
11#include <memory>
12#include <vector>
13
14class QJsonObject;
15class QString;
16
17namespace KItinerary {
18class ExtractorFilter;
19class ScriptExtractorPrivate;
20
21/** A single unstructured data extraction rule set.
22 *
23 * These rules are loaded from JSON meta-data files in a compiled-in qrc file,
24 * or from $XDG_DATA_DIRS/kitinerary/extractors.
25 *
26 * @section extractor_metadata Meta Data Format
27 *
28 * The meta-data files either contain a single JSON object or an array of JSON objects
29 * with the following content:
30 * - \c mimeType: The MIME type of the extractor, \c text if not specified.
31 * - \c filter: An array of filters that are used to select this extractor for a given input file.
32 * - \c script: A JavaScript file to execute.
33 * - \c function: The entry point in the above mentioned script, @c main if not specified.
34 *
35 * The following extractor types are supported:
36 * - \c text/plain: plain text, the argument to the script function is a single string.
37 * - \c text/html: HTML documents, the argument to the script function is a KItinerary::HtmlDocument instance.
38 * - \c application/pdf: PDF documents, the argument to the script function is a KItinerary::PdfDocument instance.
39 * - \c application/vnd.apple.pkpass: Apple Wallet passes, the argument to the script function is a KPkPass::Pass instance.
40 * - \c internal/event: iCalendar events, the argument to the script function is a KCalendarCore::Event instance.
41 *
42 * Filter definitions have the following fields:
43 * - \c mimeType: The MIME type of the document part this filter can match against.
44 * - \c field: The name of the field to match against. This can be a field id in an Apple Wallet pass,
45 * a MIME message header name, a property on a JSON-LD object or an iCal calendar or event.
46 * For plain text or binary content, this is ignored.
47 * - \c match: A regular expression that is matched against the specified value (see QRegularExpression).
48 * - \c scope: Specifies how the filter should be applied relative to the document node that is being extracted.
49 * One of @c Current, @c Parent, @c Children, @c Ancestors, @c Descendants (@c Current is the default).
50 *
51 * Example:
52 * @code
53 * [
54 * {
55 * "mimeType": "application/pdf",
56 * "filter": [ { "field": "From", "match": "@swiss.com", "mimeType": "message/rfc822", "scope": "Ancestors" } ],
57 * "script": "swiss.js",
58 * "function": "parsePdf"
59 * },
60 * {
61 * "mimeType": "application/vnd.apple.pkpass",
62 * "filter": [ { "field": "passTypeIdentifier", "match": "pass.booking.swiss.com", "mimeType": "application/vnd.apple.pkpass", "scope": "Current" } ],
63 * "script": "swiss.js",
64 * "function": "parsePkPass"
65 * }
66 * ]
67 * @endcode
68 *
69 * @section extractor_development Development
70 *
71 * For development it's convenient to symlink the extractors source folder to
72 * $XDG_DATA_DIRS/kitinerary/extractors, so you can re-run a changed extractor
73 * script without recompiling or restarting the application.
74 *
75 */
76class KITINERARY_EXPORT ScriptExtractor : public AbstractExtractor
77{
78public:
79 explicit ScriptExtractor();
81
82 QString name() const override;
83 bool canHandle(const ExtractorDocumentNode &node) const override;
84 ExtractorResult extract(const ExtractorDocumentNode &node, const ExtractorEngine *engine) const override;
85
86 /** The JS script containing the code of the extractor. */
87 QString scriptFileName() const;
88 /** The JS function entry point for this extractor, @c main if empty. */
89 QString scriptFunction() const;
90 /** Mime type this script extractor supports. */
91 QString mimeType() const;
92 /** Returns the filters deciding whether this extractor should be applied. */
93 const std::vector<ExtractorFilter> &filters() const;
94
95 ///@cond internal
96 /** Load meta data from the given JSON object. */
97 bool load(const QJsonObject &obj, const QString &fileName, int index = -1);
98 /** Save extractor meta data to a JSON object. */
99 QJsonObject toJson() const;
100
101 /** Source file name. */
102 QString fileName() const;
103
104 void setMimeType(const QString &mimeType);
105 void setScriptFileName(const QString &script);
106 void setScriptFunction(const QString &func);
107 void setFilters(std::vector<ExtractorFilter> &&filters);
108 void setFilters(const std::vector<ExtractorFilter> &filters);
109 ///@endcond
110
111private:
112 std::unique_ptr<ScriptExtractorPrivate> d;
113};
114
115}
116
Abstract base class for data extractors.
A node in the extracted document object tree.
Semantic data extraction engine.
Generic extraction result.
A single unstructured data extraction rule set.
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:50:00 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.