8#include "office2007extractor.h"
10#include "dublincoreextractor.h"
16#include <QDomDocument>
17#include <QXmlStreamReader>
22inline QString cpNS() {
return QStringLiteral(
"http://schemas.openxmlformats.org/package/2006/metadata/core-properties"); }
// Constructor of Office2007Extractor; receives the QObject parent pointer.
// NOTE(review): the initializer list and body are not visible in this excerpt.
25Office2007Extractor::Office2007Extractor(
QObject* parent)
// MIME type entries handled by this extractor: OpenXML word-processing
// (document, template), presentation (presentation, slide, slideshow,
// template) and spreadsheet (sheet, template) formats.
// NOTE(review): the declaration these entries initialise is not visible in
// this excerpt; presumably it is the `supportedMimeTypes` list returned
// below -- confirm.
32 QStringLiteral(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
33 QStringLiteral(
"application/vnd.openxmlformats-officedocument.wordprocessingml.template"),
34 QStringLiteral(
"application/vnd.openxmlformats-officedocument.presentationml.presentation"),
35 QStringLiteral(
"application/vnd.openxmlformats-officedocument.presentationml.slide"),
36 QStringLiteral(
"application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
37 QStringLiteral(
"application/vnd.openxmlformats-officedocument.presentationml.template"),
38 QStringLiteral(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
39 QStringLiteral(
"application/vnd.openxmlformats-officedocument.spreadsheetml.template"),
// Hands the list back to the caller (the enclosing function header is not
// visible in this excerpt).
44 return supportedMimeTypes;
// Fragment of the extraction routine. The visible lines open the document as
// an archive, read metadata from docProps/core.xml and docProps/app.xml when
// the ExtractMetaData flag is set, and feed text parts of word/, xl/ or ppt/
// to extractTextWithTag().
// NOTE(review): the function header, the conditions guarding the warnings and
// several statements are not visible in this excerpt.

// Warning for input that cannot be handled as a ZIP archive.
51 qWarning() <<
"Document is not a valid ZIP archive";
// Top-level directory of the archive.
55 const KArchiveDirectory* rootDir = zip.directory();
57 qWarning() <<
"Invalid document structure (main directory is missing)";
// The root must list a "docProps" entry.
61 const QStringList rootEntries = rootDir->
entries();
62 if (!rootEntries.
contains(QStringLiteral(
"docProps"))) {
63 qWarning() <<
"Invalid document structure (docProps is missing)";
// Look up docProps; the warning below covers the case where it is not a
// directory (the check itself is not visible here).
67 const KArchiveEntry* docPropEntry = rootDir->
entry(QStringLiteral(
"docProps"));
69 qWarning() <<
"Invalid document structure (docProps is not a directory)";
// Downcast the entry to a directory so its files can be looked up.
73 const KArchiveDirectory* docPropDirectory =
dynamic_cast<const KArchiveDirectory*
>(docPropEntry);
// Metadata is only read when the caller set the ExtractMetaData input flag.
75 const bool extractMetaData = result->
inputFlags() & ExtractionResult::ExtractMetaData;
// docProps/core.xml: handled only if the file exists and metadata is wanted.
77 const KArchiveFile* file = docPropDirectory->
file(QStringLiteral(
"core.xml"));
78 if (extractMetaData && file) {
79 QDomDocument coreDoc(QStringLiteral(
"core"));
// NOTE(review): the second argument `true` presumably enables namespace
// processing, which the namespaceURI() comparison below relies on -- confirm
// against the QDomDocument::setContent documentation.
80 coreDoc.setContent(file->
data(),
true);
82 QDomElement cpElem = coreDoc.documentElement();
// Pass the document element to DublinCoreExtractor together with the result.
85 DublinCoreExtractor::extract(result, cpElem);
// Only use `elem` when it exists and belongs to the core-properties namespace.
// NOTE(review): the statement that obtains `elem` is not visible here.
89 if (!elem.isNull() && elem.namespaceURI() == cpNS()) {
90 QString str = elem.text();
// docProps/app.xml: reuses `file`; parsed without the extra setContent flag.
97 file = docPropDirectory->
file(QStringLiteral(
"app.xml"));
98 if (extractMetaData && file) {
99 QDomDocument appDoc(QStringLiteral(
"app"));
100 appDoc.setContent(file->
data());
102 QDomElement docElem = appDoc.documentElement();
// Branch taken only for the word-processing document MIME type.
105 if (mimeType == QLatin1String(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
136 QString app = elem.
text();
// Plain-text extraction is governed by the ExtractPlainText input flag.
146 bool extractPlainText = (result->
inputFlags() & ExtractionResult::ExtractPlainText);
// Word-processing package: text comes from word/document.xml, tag "w:t".
148 if (rootEntries.
contains(QStringLiteral(
"word"))) {
// NOTE(review): body not visible; presumably leaves early when plain text
// was not requested -- confirm.
151 if (!extractPlainText) {
155 const KArchiveEntry* wordEntry = rootDir->
entry(QStringLiteral(
"word"));
157 qWarning() <<
"Invalid document structure (word is not a directory)";
161 const KArchiveDirectory* wordDirectory =
dynamic_cast<const KArchiveDirectory*
>(wordEntry);
162 const QStringList wordEntries = wordDirectory->
entries();
164 if (wordEntries.
contains(QStringLiteral(
"document.xml"))) {
// This declaration of `file` is separate from the earlier one used for
// core.xml / app.xml.
165 const KArchiveFile* file = wordDirectory->
file(QStringLiteral(
"document.xml"));
// The device returned by createDevice() is held in a unique_ptr, so it is
// released when contentIODevice goes out of scope.
168 std::unique_ptr<QIODevice> contentIODevice{file->
createDevice()};
169 extractTextWithTag(contentIODevice.get(), QStringLiteral(
"w:t"), result);
// Spreadsheet package: text comes from xl/sharedStrings.xml, tag "t".
174 else if (rootEntries.
contains(QStringLiteral(
"xl"))) {
178 if (!extractPlainText) {
182 const KArchiveEntry* xlEntry = rootDir->
entry(QStringLiteral(
"xl"));
184 qWarning() <<
"Invalid document structure (xl is not a directory)";
188 const auto xlDirectory =
dynamic_cast<const KArchiveDirectory*
>(xlEntry);
191 const KArchiveFile* file = xlDirectory->file(QStringLiteral(
"sharedStrings.xml"));
195 std::unique_ptr<QIODevice> contentIODevice{file->
createDevice()};
196 extractTextWithTag(contentIODevice.get(), QStringLiteral(
"t"), result);
// Presentation package: text comes from every file in ppt/slides, tag "a:t".
199 else if (rootEntries.
contains(QStringLiteral(
"ppt"))) {
203 if (!extractPlainText) {
207 const KArchiveEntry* pptEntry = rootDir->
entry(QStringLiteral(
"ppt"));
209 qWarning() <<
"Invalid document structure (ppt is not a directory)";
213 const auto pptDirectory =
dynamic_cast<const KArchiveDirectory*
>(pptEntry);
214 const auto slidesEntry = pptDirectory->entry(QStringLiteral(
"slides"));
// Guard against a missing "slides" entry or one that is not a directory.
215 if (!slidesEntry || !slidesEntry->isDirectory()) {
219 const auto slidesDirectory =
dynamic_cast<const KArchiveDirectory*
>(slidesEntry);
220 QStringList entries = slidesDirectory->entries();
// Sort the entry names so the slides are visited in a fixed order.
// NOTE(review): this is plain string ordering of the names, not numeric.
223 std::sort(entries.
begin(), entries.
end());
224 for (
const QString & entryName : std::as_const(entries)) {
225 const KArchiveFile* file = slidesDirectory->file(entryName);
229 std::unique_ptr<QIODevice> contentIODevice{file->
createDevice()};
230 extractTextWithTag(contentIODevice.get(), QStringLiteral(
"a:t"), result);
// Reads XML from `device` with a QXmlStreamReader and looks for start
// elements whose qualified name starts with `tag`.
//
// device: source of the XML data.
// tag:    qualified-name prefix to look for (callers pass "w:t", "t", "a:t").
// result: extraction result object; how it is used is not visible here.
//
// NOTE(review): the statements inside the loop and both conditionals, and the
// closing braces, are not visible in this excerpt.
235void Office2007Extractor::extractTextWithTag(QIODevice* device,
const QString& tag,
ExtractionResult* result)
237 QXmlStreamReader xml(device);
// Loop until the reader reports the end of its input.
239 while (!xml.atEnd()) {
// Prefix match, not equality: a tag of "t" also matches any start element
// whose qualified name merely begins with "t".
241 if (xml.qualifiedName().startsWith(tag) && xml.isStartElement()) {
// Condition for the end of the document or a reader error.
249 if (xml.isEndDocument() || xml.hasError()) {
255#include "moc_office2007extractor.cpp"
QStringList entries() const
const KArchiveEntry * entry(const QString &name) const
const KArchiveFile * file(const QString &name) const
virtual bool isDirectory() const
virtual QIODevice * createDevice() const
virtual QByteArray data() const
KCALUTILS_EXPORT QString mimeType()
QString text() const const
QDomElement firstChildElement(const QString &tagName, const QString &namespaceURI) const const
bool isNull() const const
QString namespaceURI() const const
bool isEmpty() const const
int toInt(bool *ok, int base) const const
bool contains(QLatin1StringView str, Qt::CaseSensitivity cs) const const