KFileMetaData

office2007extractor.cpp
1/*
2 SPDX-FileCopyrightText: 2013 Vishesh Handa <me@vhanda.in>
3
4 SPDX-License-Identifier: LGPL-2.1-or-later
5*/
6
7
8#include "kfilemetadata_debug.h"
9#include "office2007extractor.h"
10
11#include "dublincoreextractor.h"
12#include <memory>
13
14#include <KZip>
15
16#include <QDomDocument>
17#include <QXmlStreamReader>
18
19using namespace KFileMetaData;
20
21namespace {
22inline QString cpNS() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/metadata/core-properties"); }
23inline QString relNS() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/relationships"); }
24inline QString extPropNST() { return QStringLiteral("http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"); }
25inline QString extPropNSS() { return QStringLiteral("http://purl.oclc.org/ooxml/officeDocument/extendedProperties"); }
26
27inline QString coreProp() { return QStringLiteral("http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"); }
28inline QString extPropT() { return QStringLiteral("http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties"); }
29inline QString extPropS() { return QStringLiteral("http://purl.oclc.org/ooxml/officeDocument/relationships/extendedProperties"); }
30} // namespace
31
32Office2007Extractor::Office2007Extractor(QObject* parent)
33 : ExtractorPlugin(parent)
34{
35
36}
37
38const QStringList supportedMimeTypes = {
39 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
40 QStringLiteral("application/vnd.openxmlformats-officedocument.wordprocessingml.template"),
41 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
42 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slide"),
43 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
44 QStringLiteral("application/vnd.openxmlformats-officedocument.presentationml.template"),
45 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
46 QStringLiteral("application/vnd.openxmlformats-officedocument.spreadsheetml.template"),
47 QStringLiteral("application/vnd.ms-xpsdocument"),
48 QStringLiteral("application/oxps"),
49 QStringLiteral("model/3mf"),
50};
51
52QStringList Office2007Extractor::mimetypes() const
53{
54 return supportedMimeTypes;
55}
56
57void Office2007Extractor::extract(ExtractionResult* result)
58{
59 KZip zip(result->inputUrl());
60 if (!zip.open(QIODevice::ReadOnly)) {
61 qCWarning(KFILEMETADATA_LOG) << "Failed to open" << zip.fileName() << "-" << zip.errorString();
62 return;
63 }
64
65 const KArchiveDirectory* rootDir = zip.directory();
66 if (!rootDir) {
67 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (main directory is missing)";
68 return;
69 }
70
71 const bool extractMetaData = result->inputFlags() & ExtractionResult::ExtractMetaData;
72
73 // Resolve part relationships according to ECMA-376-2 (Open Packaging Conventions, OPC)
74 const QDomElement relationsElem = [rootDir]() {
75 const KArchiveFile *baseRels = rootDir->file(QStringLiteral("_rels/.rels"));
76 if (!baseRels) {
77 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure - missing package relationship";
78 return QDomElement{};
79 }
80
81 QDomDocument relationsDoc;
82 relationsDoc.setContent(baseRels->data(), QDomDocument::ParseOption::UseNamespaceProcessing);
83
84 auto relations = relationsDoc.firstChildElement(QStringLiteral("Relationships"));
85 if (relations.isNull()) {
86 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure - invalid package relationships";
87 }
88 return relations;
89 }();
90
91 auto targetByType = [&relationsElem](const QString &type, const QString &defVal = {}) -> QString {
92 for (auto rel = relationsElem.firstChildElement(); !rel.isNull(); rel = rel.nextSiblingElement()) {
93 if (rel.namespaceURI() == relNS() && rel.localName() == QStringLiteral("Relationship")
94 && rel.attribute(QStringLiteral("Type")) == type) {
95 return rel.attribute(QStringLiteral("Target"));
96 }
97 }
98 return defVal;
99 };
100
101 // Core Properties
102 const QString corePropertiesFile = targetByType(coreProp(), QStringLiteral("docProps/core.xml"));
103 if (const KArchiveFile *file = extractMetaData ? rootDir->file(corePropertiesFile) : nullptr; file) {
104 QDomDocument coreDoc(QStringLiteral("core"));
105 coreDoc.setContent(file->data(), QDomDocument::ParseOption::UseNamespaceProcessing);
106
107 QDomElement cpElem = coreDoc.documentElement();
108
109 if (!cpElem.isNull() && cpElem.namespaceURI() == cpNS()) {
110 DublinCoreExtractor::extract(result, cpElem);
111 }
112
113 auto elem = cpElem.firstChildElement(QStringLiteral("keywords"));
114 if (!elem.isNull() && elem.namespaceURI() == cpNS()) {
115 for (auto c = elem.firstChild(); !c.isNull(); c = c.nextSibling()) {
116 if (const auto childElem = c.toElement(); childElem.localName() == QStringLiteral("value") && !childElem.text().isEmpty()) {
117 result->add(Property::Keywords, childElem.text());
118 } else if (const auto tNode = c.toText(); !tNode.nodeValue().isEmpty()) {
119 result->add(Property::Keywords, tNode.nodeValue());
120 }
121 }
122 }
123 }
124
125 // Extended Properties - two valid relation types: "strict" (ECMA-376-1:2016) or "transitional" (ECMA-367-4:2016)
126 const QString extPropertiesFile = targetByType(extPropS(), targetByType(extPropT(), QStringLiteral("docProps/app.xml")));
127 if (const KArchiveFile *file = extractMetaData ? rootDir->file(extPropertiesFile) : nullptr; file) {
128 QDomDocument appDoc;
129 appDoc.setContent(file->data(), QDomDocument::ParseOption::UseNamespaceProcessing);
130
131 QDomElement propsElem = appDoc.documentElement();
132
133 for (auto prop = propsElem.firstChildElement(); !prop.isNull(); prop = prop.nextSiblingElement()) {
134 // Look for properties as specified in ECMA-376-1, Annex A.6.2 Extended Properties
135 bool ok;
136 if (prop.localName() == QStringLiteral("Pages")) {
137 if (int count = prop.text().toInt(&ok); ok == true) {
138 result->add(Property::PageCount, count);
139 }
140 } else if (prop.localName() == QStringLiteral("Slides")) {
141 if (int count = prop.text().toInt(&ok); ok == true) {
142 // Map number of slides to PageCount
143 result->add(Property::PageCount, count);
144 }
145 } else if (prop.localName() == QStringLiteral("Words")) {
146 if (int count = prop.text().toInt(&ok); ok == true) {
147 result->add(Property::WordCount, count);
148 }
149 } else if (prop.localName() == QStringLiteral("Lines")) {
150 if (int count = prop.text().toInt(&ok); ok == true) {
151 result->add(Property::LineCount, count);
152 }
153 } else if (prop.localName() == QStringLiteral("Application")) {
154 QString application = prop.text();
155 if (!application.isEmpty()) {
156 result->add(Property::Generator, application);
157 }
158 }
159 }
160 }
161
162 //
163 // Plain Text
164 //
165 bool extractPlainText = (result->inputFlags() & ExtractionResult::ExtractPlainText);
166
167 if (const auto wordEntry = rootDir->entry(QStringLiteral("word")); wordEntry) {
168 result->addType(Type::Document);
169
170 if (!extractPlainText) {
171 return;
172 }
173
174 if (!wordEntry->isDirectory()) {
175 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (word is not a directory)";
176 return;
177 }
178
179 const KArchiveDirectory* wordDirectory = dynamic_cast<const KArchiveDirectory*>(wordEntry);
180 const QStringList wordEntries = wordDirectory->entries();
181
182 if (wordEntries.contains(QStringLiteral("document.xml"))) {
183 const KArchiveFile* file = wordDirectory->file(QStringLiteral("document.xml"));
184
185 if (file) {
186 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
187 extractTextWithTag(contentIODevice.get(), QStringLiteral("w:t"), result);
188 }
189 }
190 }
191
192 else if (const auto xlEntry = rootDir->entry(QStringLiteral("xl")); xlEntry) {
193 result->addType(Type::Document);
194 result->addType(Type::Spreadsheet);
195
196 if (!extractPlainText) {
197 return;
198 }
199
200 if (!xlEntry->isDirectory()) {
201 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (xl is not a directory)";
202 return;
203 }
204
205 const auto xlDirectory = dynamic_cast<const KArchiveDirectory*>(xlEntry);
206 // TODO: Read the sheets from worksheets/*.xml, and dereference all cells
207 // values in order
208 const KArchiveFile* file = xlDirectory->file(QStringLiteral("sharedStrings.xml"));
209 if (!file) {
210 return;
211 }
212 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
213 extractTextWithTag(contentIODevice.get(), QStringLiteral("t"), result);
214 }
215
216 else if (const auto pptEntry = rootDir->entry(QStringLiteral("ppt")); pptEntry) {
217 result->addType(Type::Document);
219
220 if (!extractPlainText) {
221 return;
222 }
223
224 if (!pptEntry->isDirectory()) {
225 qCWarning(KFILEMETADATA_LOG) << "Invalid document structure (ppt is not a directory)";
226 return;
227 }
228
229 const auto pptDirectory = dynamic_cast<const KArchiveDirectory*>(pptEntry);
230 const auto slidesEntry = pptDirectory->entry(QStringLiteral("slides"));
231 if (!slidesEntry || !slidesEntry->isDirectory()) {
232 return;
233 }
234
235 const auto slidesDirectory = dynamic_cast<const KArchiveDirectory*>(slidesEntry);
236 QStringList entries = slidesDirectory->entries();
237 // TODO: Read the actual order from presentation.xml, and follow the
238 // references in ppt/_rels/presentation.xml.rel
239 std::sort(entries.begin(), entries.end());
240 for (const QString & entryName : std::as_const(entries)) {
241 const KArchiveFile* file = slidesDirectory->file(entryName);
242 if (!file) {
243 continue;
244 }
245 std::unique_ptr<QIODevice> contentIODevice{file->createDevice()};
246 extractTextWithTag(contentIODevice.get(), QStringLiteral("a:t"), result);
247 }
248 }
249
250 else if (!relationsElem.isNull()) {
251 // Any other document type likely following OPC
252 result->addType(Type::Document);
253 }
254}
255
256void Office2007Extractor::extractTextWithTag(QIODevice* device, const QString& tag, ExtractionResult* result)
257{
258 QXmlStreamReader xml(device);
259
260 while (!xml.atEnd()) {
261 xml.readNext();
262 if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) {
263 QString str = xml.readElementText(QXmlStreamReader::IncludeChildElements);
264
265 if (!str.isEmpty()) {
266 result->append(str);
267 }
268 }
269
270 if (xml.isEndDocument() || xml.hasError()) {
271 break;
272 }
273 }
274}
275
276#include "moc_office2007extractor.cpp"
QStringList entries() const
const KArchiveEntry * entry(const QString &name) const
const KArchiveFile * file(const QString &name) const
virtual QIODevice * createDevice() const
virtual QByteArray data() const
The ExtractionResult class is where all the data extracted by the indexer is saved.
QString inputUrl() const
The input URL which the plugins will use to locate the file.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
The ExtractorPlugin is the base class for all file metadata extractors.
Type type(const QSqlDatabase &db)
@ WordCount
The number of words in a document.
Definition properties.h:145
@ Generator
Refers to the Application used to create this file.
Definition properties.h:134
@ PageCount
The number of pages in a document.
Definition properties.h:139
@ LineCount
The number of lines in a document.
Definition properties.h:151
@ Keywords
The keywords used to represent the document.
Definition properties.h:183
@ Document
Any file which counts as a document.
Definition types.h:63
@ Presentation
A Presentation file.
Definition types.h:75
@ Spreadsheet
A SpreadSheet file.
Definition types.h:69
The KFileMetaData namespace.
QDomElement documentElement() const const
ParseResult setContent(QAnyStringView text, ParseOptions options)
QDomElement firstChildElement(const QString &tagName, const QString &namespaceURI) const const
bool isNull() const const
QString namespaceURI() const const
iterator begin()
iterator end()
bool isEmpty() const const
bool contains(QLatin1StringView str, Qt::CaseSensitivity cs) const const
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Apr 11 2025 11:55:34 by doxygen 1.13.2 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.