KFileMetaData

epubextractor.cpp
1/*
2 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
3 SPDX-FileCopyrightText: 2016 Christoph Cullmann <cullmann@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.1-or-later
6*/
7
8
9#include "epubextractor.h"
10#include "kfilemetadata_debug.h"
11
12#include <epub.h>
13
14#include <QDateTime>
15#include <QRegularExpression>
16
17using namespace KFileMetaData;
18
19EPubExtractor::EPubExtractor(QObject* parent)
20 : ExtractorPlugin(parent)
21{
22
23}
24
25namespace
26{
27static const QStringList supportedMimeTypes = {
28 QStringLiteral("application/epub+zip"),
29};
30
31const QStringList fetchMetadata(struct epub* e, const epub_metadata& type)
32{
33 int size = 0;
34 unsigned char** data = epub_get_metadata(e, type, &size);
35 if (data) {
36 QStringList strList;
37 strList.reserve(size);
38 for (int i = 0; i < size; i++) {
39 // skip nullptr entries, can happen for broken xml files
40 // also skip empty entries
41 if (!data[i] || !data[i][0]) {
42 continue;
43 }
44
45 strList << QString::fromUtf8((char*)data[i]);
46 free(data[i]);
47 }
48 free(data);
49
50 return strList;
51 }
52 return QStringList();
53}
54}
55
56QStringList EPubExtractor::mimetypes() const
57{
58 return supportedMimeTypes;
59}
60
61void EPubExtractor::extract(ExtractionResult* result)
62{
63 // open epub, return on exit, file will be closed again at end of function
64 auto ePubDoc = epub_open(result->inputUrl().toUtf8().constData(), 1);
65 if (!ePubDoc) {
66 qCWarning(KFILEMETADATA_LOG) << "Invalid document";
67 return;
68 }
69
70 result->addType(Type::Document);
71
72 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
73
74 for (const QString& value : fetchMetadata(ePubDoc, EPUB_TITLE)) {
75 result->add(Property::Title, value);
76 }
77
78 for (const QString& value : fetchMetadata(ePubDoc, EPUB_SUBJECT)) {
79 result->add(Property::Subject, value);
80 }
81
82 for (QString value : fetchMetadata(ePubDoc, EPUB_CREATOR)) {
83 // Prefix added by libepub when no opf:role is specified
84 if (value.startsWith(QLatin1String("Author: "), Qt::CaseSensitive)) {
85 value = value.mid(8).simplified();
86 } else {
87 // Find 'opf:role' prefix added by libepub
88 int index = value.indexOf(QLatin1String(": "), Qt::CaseSensitive);
89 if (index > 0) {
90 value = value.mid(index + 2).simplified();
91 }
92 }
93
94 // Name is provided as "<name>(<file-as>)" when opf:file-as property
95 // is specified, "<name>(<name>)" otherwise. Strip the last part
96 int index = value.indexOf(QLatin1Char('('));
97 if (index > 0) {
98 value = value.mid(0, index);
99 }
100
101 result->add(Property::Author, value);
102 }
103
104 // The Contributor just seems to be mostly Calibre aka the Generator
105 /*
106 value = fetchMetadata(ePubDoc, EPUB_CONTRIB);
107 if( !value.isEmpty() ) {
108 SimpleResource con;
109 con.addType( NCO::Contact() );
110 con.addProperty( NCO::fullname(), value );
111
112 fileRes.addProperty( NCO::contributor(), con );
113 graph << con;
114 }*/
115
116 for (const QString& value : fetchMetadata(ePubDoc, EPUB_PUBLISHER)) {
117 result->add(Property::Publisher, value);
118 }
119
120 for (const QString& value : fetchMetadata(ePubDoc, EPUB_DESCRIPTION)) {
121 result->add(Property::Description, value);
122 }
123
124 for (QString value : fetchMetadata(ePubDoc, EPUB_DATE)) {
125 if (value.startsWith(QLatin1String("Unspecified:"), Qt::CaseInsensitive)) {
126 value = value.mid(12).simplified();
127 } else if (value.startsWith(QLatin1String("publication:"), Qt::CaseInsensitive)) {
128 value = value.mid(12).simplified();
129 } else {
130 continue;
131 }
133 if (!dt.isNull()) {
134 result->add(Property::CreationDate, dt);
135 result->add(Property::ReleaseYear, dt.date().year());
136 }
137 }
138 }
139
140 //
141 // Plain Text
142 //
143 if (result->inputFlags() & ExtractionResult::ExtractPlainText) {
144 if (auto iter = epub_get_iterator(ePubDoc, EITERATOR_SPINE, 0)) {
145 do {
146 char* curr = epub_it_get_curr(iter);
147 if (!curr) {
148 continue;
149 }
150
151 QString html = QString::fromUtf8(curr);
152 html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
153 result->append(html);
154 } while (epub_it_get_next(iter));
155
156 epub_free_iterator(iter);
157 }
158
159 auto tit = epub_get_titerator(ePubDoc, TITERATOR_NAVMAP, 0);
160 if (!tit) {
161 tit = epub_get_titerator(ePubDoc, TITERATOR_GUIDE, 0);
162 }
163 if (tit) {
164 if (epub_tit_curr_valid(tit)) {
165 do {
166 // get link, iterator handles freeing of it
167 char* clink = epub_tit_get_curr_link(tit);
168
169 // epub_get_data returns -1 on failure
170 char* data = nullptr;
171 const int size = epub_get_data(ePubDoc, clink, &data);
172 if (size >= 0 && data) {
173 QString html = QString::fromUtf8(data, size);
174 // strip html tags
175 html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
176
177 result->append(html);
178 free(data);
179 }
180 } while (epub_tit_next(tit));
181 }
182 epub_free_titerator(tit);
183 }
184 }
185
186 // close epub file again
187 epub_close(ePubDoc);
188}
189
190#include "moc_epubextractor.cpp"
The ExtractionResult class is where all the data extracted by the indexer is saved.
QString inputUrl() const
The input URL which the plugins will use to locate the file.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
The ExtractorPlugin is the base class for all file metadata extractors.
static QDateTime dateTimeFromString(const QString &dateString)
Tries to extract a valid date time from the string provided.
@ Subject
Refers to the subject of the file.
Definition properties.h:127
@ Title
Refers to the Title of the content of the file.
Definition properties.h:121
@ Author
The Author field indicated the primary creator of a document.
Definition properties.h:114
@ Description
Represents the description stored in the file.
Definition properties.h:351
@ CreationDate
The date the content of the file was created.
Definition properties.h:177
@ Publisher
The publisher of the content.
Definition properties.h:169
@ ReleaseYear
Indicates the year a track was released.
Definition properties.h:71
@ Document
Any file which counts as a document.
Definition types.h:63
The KFileMetaData namespace.
const char * constData() const
int year() const
QDate date() const
bool isNull() const
void reserve(qsizetype size)
QString fromUtf8(QByteArrayView str)
QString & remove(QChar ch, Qt::CaseSensitivity cs)
QByteArray toUtf8() const
CaseSensitive
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Fri Nov 22 2024 12:12:41 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.