KFileMetaData

epubextractor.cpp
1/*
2 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
3 SPDX-FileCopyrightText: 2016 Christoph Cullmann <cullmann@kde.org>
4
5 SPDX-License-Identifier: LGPL-2.1-or-later
6*/
7
8
9#include "datetimeparser_p.h"
10#include "epubextractor.h"
11#include "kfilemetadata_debug.h"
12
13#include <epub.h>
14
15#include <QDateTime>
16#include <QRegularExpression>
17
18using namespace KFileMetaData;
19
20EPubExtractor::EPubExtractor(QObject* parent)
21 : ExtractorPlugin(parent)
22{
23
24}
25
26namespace
27{
28static const QStringList supportedMimeTypes = {
29 QStringLiteral("application/epub+zip"),
30};
31
32const QStringList fetchMetadata(struct epub* e, const epub_metadata& type)
33{
34 int size = 0;
35 unsigned char** data = epub_get_metadata(e, type, &size);
36 if (data) {
37 QStringList strList;
38 strList.reserve(size);
39 for (int i = 0; i < size; i++) {
40 // skip nullptr entries, can happen for broken xml files
41 // also skip empty entries
42 if (!data[i] || !data[i][0]) {
43 continue;
44 }
45
46 strList << QString::fromUtf8((char*)data[i]);
47 free(data[i]);
48 }
49 free(data);
50
51 return strList;
52 }
53 return QStringList();
54}
55}
56
57QStringList EPubExtractor::mimetypes() const
58{
59 return supportedMimeTypes;
60}
61
62void EPubExtractor::extract(ExtractionResult* result)
63{
64 // open epub, return on exit, file will be closed again at end of function
65 auto ePubDoc = epub_open(result->inputUrl().toUtf8().constData(), 1);
66 if (!ePubDoc) {
67 qCWarning(KFILEMETADATA_LOG) << "Invalid document";
68 return;
69 }
70
71 result->addType(Type::Document);
72
73 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
74
75 for (const QString& value : fetchMetadata(ePubDoc, EPUB_TITLE)) {
76 result->add(Property::Title, value);
77 }
78
79 for (const QString& value : fetchMetadata(ePubDoc, EPUB_SUBJECT)) {
80 result->add(Property::Subject, value);
81 }
82
83 for (QString value : fetchMetadata(ePubDoc, EPUB_CREATOR)) {
84 // Prefix added by libepub when no opf:role is specified
85 if (value.startsWith(QLatin1String("Author: "), Qt::CaseSensitive)) {
86 value = value.mid(8).simplified();
87 } else {
88 // Find 'opf:role' prefix added by libepub
89 int index = value.indexOf(QLatin1String(": "), Qt::CaseSensitive);
90 if (index > 0) {
91 value = value.mid(index + 2).simplified();
92 }
93 }
94
95 // Name is provided as "<name>(<file-as>)" when opf:file-as property
96 // is specified, "<name>(<name>)" otherwise. Strip the last part
97 int index = value.indexOf(QLatin1Char('('));
98 if (index > 0) {
99 value = value.mid(0, index);
100 }
101
102 result->add(Property::Author, value);
103 }
104
105 // The Contributor just seems to be mostly Calibre aka the Generator
106 /*
107 value = fetchMetadata(ePubDoc, EPUB_CONTRIB);
108 if( !value.isEmpty() ) {
109 SimpleResource con;
110 con.addType( NCO::Contact() );
111 con.addProperty( NCO::fullname(), value );
112
113 fileRes.addProperty( NCO::contributor(), con );
114 graph << con;
115 }*/
116
117 for (const QString& value : fetchMetadata(ePubDoc, EPUB_PUBLISHER)) {
118 result->add(Property::Publisher, value);
119 }
120
121 for (const QString& value : fetchMetadata(ePubDoc, EPUB_DESCRIPTION)) {
122 result->add(Property::Description, value);
123 }
124
125 for (QString value : fetchMetadata(ePubDoc, EPUB_DATE)) {
126 if (value.startsWith(QLatin1String("Unspecified:"), Qt::CaseInsensitive)) {
127 value = value.mid(12).simplified();
128 } else if (value.startsWith(QLatin1String("publication:"), Qt::CaseInsensitive)) {
129 value = value.mid(12).simplified();
130 } else {
131 continue;
132 }
133 QDateTime dt = Parser::dateTimeFromString(value);
134 if (!dt.isNull()) {
135 result->add(Property::CreationDate, dt);
136 result->add(Property::ReleaseYear, dt.date().year());
137 }
138 }
139 }
140
141 //
142 // Plain Text
143 //
144 if (result->inputFlags() & ExtractionResult::ExtractPlainText) {
145 if (auto iter = epub_get_iterator(ePubDoc, EITERATOR_SPINE, 0)) {
146 do {
147 char* curr = epub_it_get_curr(iter);
148 if (!curr) {
149 continue;
150 }
151
152 QString html = QString::fromUtf8(curr);
153 html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
154 result->append(html);
155 } while (epub_it_get_next(iter));
156
157 epub_free_iterator(iter);
158 }
159
160 auto tit = epub_get_titerator(ePubDoc, TITERATOR_NAVMAP, 0);
161 if (!tit) {
162 tit = epub_get_titerator(ePubDoc, TITERATOR_GUIDE, 0);
163 }
164 if (tit) {
165 if (epub_tit_curr_valid(tit)) {
166 do {
167 // get link, iterator handles freeing of it
168 char* clink = epub_tit_get_curr_link(tit);
169
170 // epub_get_data returns -1 on failure
171 char* data = nullptr;
172 const int size = epub_get_data(ePubDoc, clink, &data);
173 if (size >= 0 && data) {
174 QString html = QString::fromUtf8(data, size);
175 // strip html tags
176 html.remove(QRegularExpression(QStringLiteral("<[^>]*>")));
177
178 result->append(html);
179 free(data);
180 }
181 } while (epub_tit_next(tit));
182 }
183 epub_free_titerator(tit);
184 }
185 }
186
187 // close epub file again
188 epub_close(ePubDoc);
189}
190
191#include "moc_epubextractor.cpp"
The ExtractionResult class is where all the data extracted by the indexer is saved.
QString inputUrl() const
The input URL which the plugins will use to locate the file.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
The ExtractorPlugin is the base class for all file metadata extractors.
@ Subject
Refers to the subject of the file.
Definition properties.h:127
@ Title
Refers to the Title of the content of the file.
Definition properties.h:121
@ Author
The Author field indicates the primary creator of a document.
Definition properties.h:114
@ Description
Represents the description stored in the file.
Definition properties.h:351
@ CreationDate
The date the content of the file was created.
Definition properties.h:177
@ Publisher
The publisher of the content.
Definition properties.h:169
@ ReleaseYear
Indicates the year a track was released.
Definition properties.h:71
@ Document
Any file which counts as a document.
Definition types.h:63
The KFileMetaData namespace.
const char * constData() const const
int year() const const
QDate date() const const
bool isNull() const const
void reserve(qsizetype size)
QString fromUtf8(QByteArrayView str)
QString & remove(QChar ch, Qt::CaseSensitivity cs)
QByteArray toUtf8() const const
CaseSensitive
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Feb 28 2025 11:50:27 by doxygen 1.13.2 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.