KItinerary

pdfdocument.cpp
1/*
2 SPDX-FileCopyrightText: 2018 Volker Krause <vkrause@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#include "config-kitinerary.h"
8#include "pdfdocument.h"
9#include "pdfdocument_p.h"
10#include "pdfextractoroutputdevice_p.h"
11#include "pdfimage_p.h"
12#include "popplerglobalparams_p.h"
13#include "popplerutils_p.h"
14#include "logging.h"
15
16#include <QDebug>
17#include <QImage>
18#include <QScopedValueRollback>
19#include <QTimeZone>
20
21#include <DateInfo.h>
22#include <PDFDoc.h>
23#include <PDFDocEncoding.h>
24#include <Stream.h>
25#include <UTF.h>
26
27#include <cmath>
28
29using namespace KItinerary;
30
31void PdfPagePrivate::load()
32{
33 if (m_loaded) {
34 return;
35 }
36
37 PopplerGlobalParams gp;
38 PdfExtractorOutputDevice device;
39 m_doc->m_popplerDoc->displayPageSlice(&device, m_pageNum + 1, 72, 72, 0, false, true, false, -1, -1, -1, -1);
40 m_doc->m_popplerDoc->processLinks(&device, m_pageNum + 1);
41 device.finalize();
42 const auto pageRect = m_doc->m_popplerDoc->getPage(m_pageNum + 1)->getCropBox();
43#if KPOPPLER_VERSION < QT_VERSION_CHECK(25, 1, 0)
44 std::unique_ptr<GooString> s(device.getText(pageRect->x1, pageRect->y1, pageRect->x2, pageRect->y2));
45 m_text = QString::fromUtf8(s->c_str());
46#else
47 const auto s = device.getText(pageRect->x1, pageRect->y1, pageRect->x2, pageRect->y2);
48 m_text = QString::fromUtf8(s.c_str());
49#endif
50
51 m_images = std::move(device.m_images);
52 for (auto it = m_images.begin(); it != m_images.end(); ++it) {
53 (*it).d->m_page = this;
54 }
55
56 m_links = std::move(device.m_links);
57 for (auto &link : m_links) {
58 link.convertToPageRect(pageRect);
59 }
60
61 m_loaded = true;
62}
63
64PdfPage::PdfPage()
65 : d(new PdfPagePrivate)
66{
67}
68
69PdfPage::PdfPage(const PdfPage&) = default;
70PdfPage::~PdfPage() = default;
71PdfPage& PdfPage::operator=(const PdfPage&) = default;
72
73QString PdfPage::text() const
74{
75 d->load();
76 return d->m_text;
77}
78
/**
 * Linearly interpolates between @p begin and @p end.
 *
 * @param begin value returned for a ratio of 0.
 * @param end value returned for a ratio of 1.
 * @param ratio interpolation factor; not clamped.
 * @return begin + (end - begin) * ratio
 */
static double ratio(double begin, double end, double ratio)
{
    const double span = end - begin;
    return begin + span * ratio;
}
83
84QString PdfPage::textInRect(double left, double top, double right, double bottom) const
85{
86 PopplerGlobalParams gp;
87
88 const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
89 const auto pageRect = page->getCropBox();
90
91 double l;
92 double t;
93 double r;
94 double b;
95 switch (page->getRotate()) {
96 case 0:
97 l = ratio(pageRect->x1, pageRect->x2, left);
98 t = ratio(pageRect->y1, pageRect->y2, top);
99 r = ratio(pageRect->x1, pageRect->x2, right);
100 b = ratio(pageRect->y1, pageRect->y2, bottom);
101 break;
102 case 90:
103 l = ratio(pageRect->y1, pageRect->y2, left);
104 t = ratio(pageRect->x1, pageRect->x2, top);
105 r = ratio(pageRect->y1, pageRect->y2, right);
106 b = ratio(pageRect->x1, pageRect->x2, bottom);
107 break;
108 default:
109 qCWarning(Log) << "Unsupported page rotation!" << page->getRotate();
110 return {};
111 }
112
113 TextOutputDev device(nullptr, false, 0, false, false);
114 d->m_doc->m_popplerDoc->displayPageSlice(&device, d->m_pageNum + 1, 72, 72, 0, false, true, false, -1, -1, -1, -1);
115#if KPOPPLER_VERSION <QT_VERSION_CHECK(25, 1, 0)
116 std::unique_ptr<GooString> s(device.getText(l, t, r, b));
117 return QString::fromUtf8(s->c_str());
118#else
119 const auto s = device.getText(l, t, r, b);
120 return QString::fromUtf8(s.c_str());
121#endif
122}
123
125{
126 d->load();
127 return d->m_images.size();
128}
129
130PdfImage PdfPage::image(int index) const
131{
132 d->load();
133 return d->m_images[index];
134}
135
136QVariantList PdfPage::imagesVariant() const
137{
138 d->load();
139 QVariantList l;
140 l.reserve(imageCount());
141 std::for_each(d->m_images.begin(), d->m_images.end(), [&l](const PdfImage& img) { l.push_back(QVariant::fromValue(img)); });
142 return l;
143}
144
145QVariantList PdfPage::imagesInRect(double left, double top, double right, double bottom) const
146{
147 d->load();
148 QVariantList l;
149 PopplerGlobalParams gp;
150 const auto pageRect = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1)->getCropBox();
151
152 for (const auto &img : d->m_images) {
153 if ((img.d->m_transform.dx() >= ratio(pageRect->x1, pageRect->x2, left) && img.d->m_transform.dx() <= ratio(pageRect->x1, pageRect->x2, right)) &&
154 (img.d->m_transform.dy() >= ratio(pageRect->y1, pageRect->y2, top) && img.d->m_transform.dy() <= ratio(pageRect->y1, pageRect->y2, bottom)))
155 {
156 l.push_back(QVariant::fromValue(img));
157 }
158 }
159 return l;
160}
161
163{
164 d->load();
165 return d->m_links.size();
166}
167
168PdfLink PdfPage::link(int index) const
169{
170 d->load();
171 return d->m_links[index];
172}
173
174QVariantList PdfPage::linksVariant() const
175{
176 d->load();
177 QVariantList l;
178 l.reserve(d->m_links.size());
179 std::transform(d->m_links.begin(), d->m_links.end(), std::back_inserter(l), [](const PdfLink &link) { return QVariant::fromValue(link); });
180 return l;
181}
182
183QVariantList PdfPage::linksInRect(double left, double top, double right, double bottom) const
184{
185 QRectF bbox(QPointF(left, top), QPointF(right, bottom));
186 d->load();
187
188 QVariantList l;
189 for (const auto &link : d->m_links) {
190 if (!link.area().intersects(bbox)) {
191 continue;
192 }
193 l.push_back(QVariant::fromValue(link));
194 }
195
196 std::sort(l.begin(), l.end(), [](const auto &lhs, const auto &rhs) {
197 const auto lhsLink = lhs.template value<PdfLink>();
198 const auto rhsLink = rhs.template value<PdfLink>();
199 if (lhsLink.area().top() == rhsLink.area().top()) {
200 return lhsLink.area().left() < rhsLink.area().left();
201 }
202 return lhsLink.area().top() < rhsLink.area().top();
203 });
204
205 return l;
206}
207
/**
 * Converts a length in points (72 per inch) to millimeters (25.4 per inch).
 *
 * @param points length in points.
 * @return the same length in millimeters.
 */
static constexpr inline double pdfToMM(double points)
{
    constexpr double mmPerInch = 25.4;
    constexpr double pointsPerInch = 72.0;
    return points * mmPerInch / pointsPerInch;
}
212
213int PdfPage::width() const
214{
215 const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
216 const auto rot = page->getRotate();
217 if (rot == 90 || rot == 270) {
218 return pdfToMM(page->getCropHeight());
219 }
220 return pdfToMM(page->getCropWidth());
221}
222
223int PdfPage::height() const
224{
225 const auto page = d->m_doc->m_popplerDoc->getPage(d->m_pageNum + 1);
226 const auto rot = page->getRotate();
227 if (rot == 90 || rot == 270) {
228 return pdfToMM(page->getCropWidth());
229 }
230 return pdfToMM(page->getCropHeight());
231}
232
233
234PdfDocument::PdfDocument(QObject *parent)
235 : QObject(parent)
236 , d(new PdfDocumentPrivate)
237{
238}
239
240PdfDocument::~PdfDocument() = default;
241
242QString PdfDocument::text() const
243{
244 QString text;
245 std::for_each(d->m_pages.begin(), d->m_pages.end(), [&text](const PdfPage &p) { text += p.text(); });
246 return text;
247}
248
249int PdfDocument::pageCount() const
250{
251 return d->m_popplerDoc->getNumPages();
252}
253
255{
256 return d->m_pages[index];
257}
258
260{
261 return d->m_pdfData.size();
262}
263
264static QDateTime parsePdfDateTime(const GooString *str)
265{
266 int year;
267 int month;
268 int day;
269 int hour;
270 int min;
271 int sec;
272 int tzHours;
273 int tzMins;
274 char tz;
275
276 if (!parseDateString(str, &year, &month, &day, &hour, &min, &sec, &tz, &tzHours, &tzMins)) {
277 return {};
278 }
279
280 QDate date(year, month, day);
281 QTime time(hour, min, sec);
282 if (!date.isValid() || !time.isValid()) {
283 return {};
284 }
285
286 int offset = tzHours * 3600 + tzMins * 60;
287 if (tz == '+') {
288 return QDateTime(date, time, QTimeZone::fromSecondsAheadOfUtc(offset));
289 } else if (tz == '-') {
290 return QDateTime(date, time, QTimeZone::fromSecondsAheadOfUtc(-offset));
291 }
292 return QDateTime(date, time, QTimeZone::UTC);
293}
294
295QDateTime PdfDocument::creationTime() const
296{
297 std::unique_ptr<GooString> dt(d->m_popplerDoc->getDocInfoCreatDate());
298 if (!dt) {
299 return {};
300 }
301 return parsePdfDateTime(dt.get());
302}
303
304QDateTime PdfDocument::modificationTime() const
305{
306 std::unique_ptr<GooString> dt(d->m_popplerDoc->getDocInfoModDate());
307 if (!dt) {
308 return {};
309 }
310 return parsePdfDateTime(dt.get());
311}
312
313
314QString gooStringToUnicode(const std::unique_ptr<GooString> &s)
315{
316 if (!s) {
317 return {};
318 }
319
320#if KPOPPLER_VERSION >= QT_VERSION_CHECK(24, 5, 0)
321 if (hasUnicodeByteOrderMark(s->toStr()) || hasUnicodeByteOrderMarkLE(s->toStr())) {
322#else
323 if (s->hasUnicodeMarker() || s->hasUnicodeMarkerLE()) {
324#endif
325 return QString::fromUtf16(reinterpret_cast<const char16_t*>(s->toStr().c_str()), s->toStr().size() / 2);
326 } else {
327 int len = 0;
328 std::unique_ptr<const char[]> utf16Data(pdfDocEncodingToUTF16(s->toStr(), &len));
329 return QString::fromUtf16(reinterpret_cast<const char16_t*>(utf16Data.get()), len / 2);
330 }
331
332 return QString::fromUtf8(s->c_str());
333}
334
335QString PdfDocument::title() const
336{
337 return gooStringToUnicode(d->m_popplerDoc->getDocInfoTitle());
338}
339
340QString PdfDocument::producer() const
341{
342 return gooStringToUnicode(d->m_popplerDoc->getDocInfoProducer());
343}
344
345QString PdfDocument::creator() const
346{
347 return gooStringToUnicode(d->m_popplerDoc->getDocInfoCreator());
348}
349
350QString PdfDocument::author() const
351{
352 return gooStringToUnicode(d->m_popplerDoc->getDocInfoAuthor());
353}
354
355QVariantList PdfDocument::pagesVariant() const
356{
357 QVariantList l;
358 l.reserve(pageCount());
359 std::for_each(d->m_pages.begin(), d->m_pages.end(), [&l](const PdfPage& p) { l.push_back(QVariant::fromValue(p)); });
360 return l;
361}
362
364{
365 PopplerGlobalParams gp;
366
367 std::unique_ptr<PdfDocument> doc(new PdfDocument(parent));
368 doc->d->m_pdfData = data;
369 // PDFDoc takes ownership of stream
370 auto stream = new MemStream(const_cast<char*>(doc->d->m_pdfData.constData()), 0, doc->d->m_pdfData.size(), Object());
371 std::unique_ptr<PDFDoc> popplerDoc(new PDFDoc(stream));
372 if (!popplerDoc->isOk()) {
373 qCWarning(Log) << "Got invalid PDF document!" << popplerDoc->getErrorCode();
374 return nullptr;
375 }
376
377 doc->d->m_pages.reserve(popplerDoc->getNumPages());
378 for (int i = 0; i < popplerDoc->getNumPages(); ++i) {
380 page.d->m_pageNum = i;
381 page.d->m_doc = doc->d.get();
382 doc->d->m_pages.push_back(page);
383 }
384
385 doc->d->m_popplerDoc = std::move(popplerDoc);
386 return doc.release();
387}
388
390{
391 return data.startsWith("%PDF");
392}
393
394#include "moc_pdfdocument.cpp"
PDF document for extraction.
Definition pdfdocument.h:92
PdfPage page(int index) const
The n-th page in this document.
int fileSize() const
File size of the entire document in bytes.
static PdfDocument * fromData(const QByteArray &data, QObject *parent=nullptr)
Creates a PdfDocument from the given raw data.
static bool maybePdf(const QByteArray &data)
Fast check whether data might be a PDF document.
An image in a PDF document.
Definition pdfimage.h:74
A page in a PDF document.
Definition pdfdocument.h:29
Q_INVOKABLE QVariantList linksInRect(double left, double top, double right, double bottom) const
Returns all links in the specified sub-rect of this page.
PdfImage image(int index) const
The n-th image found in this document.
int linkCount() const
The number of links found in this document.
Q_INVOKABLE QString textInRect(double left, double top, double right, double bottom) const
Returns the text in the specified sub-rect of this page.
PdfLink link(int index) const
The n-th link found in this document.
int imageCount() const
The number of images found in this document.
Q_INVOKABLE QVariantList imagesInRect(double left, double top, double right, double bottom) const
Returns the images in the specified sub-rect of this page.
KIOCORE_EXPORT CopyJob * link(const QList< QUrl > &src, const QUrl &destDir, JobFlags flags=DefaultFlags)
Classes for reservation/travel data models, data extraction and data augmentation.
Definition berelement.h:17
bool startsWith(QByteArrayView bv) const const
QObject * parent() const const
bool intersects(const QRectF &rectangle) const const
QString fromUtf16(const char16_t *unicode, qsizetype size)
QString fromUtf8(QByteArrayView str)
QTimeZone fromSecondsAheadOfUtc(int offset)
QVariant fromValue(T &&value)
This file is part of the KDE documentation.
Documentation copyright © 1996-2025 The KDE developers.
Generated on Fri Jan 3 2025 11:50:01 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.