KFileMetaData

plaintextextractor.cpp
1/*
2 SPDX-FileCopyrightText: 2012 Vishesh Handa <me@vhanda.in>
3
4 SPDX-License-Identifier: LGPL-2.1-or-later
5*/
6
7
8#include "plaintextextractor.h"
9
10#include <QDebug>
11#include <QFile>
12#include <QStringDecoder>
13#include <QtMinMax>
14
15#include <KEncodingProber>
16
17#if defined(Q_OS_LINUX) || defined(__GLIBC__)
18 #include <fcntl.h>
19#endif
20
21using namespace KFileMetaData;
22
23PlainTextExtractor::PlainTextExtractor(QObject* parent)
24 : ExtractorPlugin(parent)
25{
26
27}
28
29const QStringList supportedMimeTypes = {
30 QStringLiteral("text/plain"),
31};
32
33QStringList PlainTextExtractor::mimetypes() const
34{
35 return supportedMimeTypes;
36}
37
38void PlainTextExtractor::extract(ExtractionResult* result)
39{
40 QFile file(result->inputUrl());
41 bool isOpen = false;
42
43#ifdef O_NOATIME
44 const QByteArray filePath = QFile::encodeName(result->inputUrl());
45 int fd = open(filePath.constData(), O_RDONLY | O_NOATIME);
46 if (fd >= 0) {
48 } else
49#endif
50 {
51 isOpen = file.open(QIODevice::ReadOnly | QIODevice::Text);
52 }
53
54 if (!isOpen) {
55 return;
56 }
57
58 result->addType(Type::Text);
59 if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
60 return;
61 }
62
63 auto autodetectCodec = [](QFile &file) -> QStringDecoder {
64 const qint64 BUFFER_SIZE = 256 * 1024;
65 const auto buffer = file.read(BUFFER_SIZE);
66 file.seek(0);
67
68 // First 16 bytes for detecting by BOM.
69 const QByteArrayView bufferForBom(buffer.begin(), qMin(16, buffer.size()));
70
71 // first: try to get encoding by BOM handling
72 // If BOM has been found, trust it
73 if (auto encoding = QStringConverter::encodingForData(bufferForBom)) {
74 return QStringDecoder(encoding.value());
75 }
76
77 // second: try to get encoding by KEncodingProber
78 KEncodingProber prober(KEncodingProber::Universal);
79 prober.feed(buffer.constData());
80
81 // we found codec with some confidence?
82 if (prober.confidence() > 0.5) {
83 auto proberDecoder = QStringDecoder(prober.encoding().constData());
84 // rare case, but if not valid, do not return proberDecoder
85 if (proberDecoder.isValid()) {
86 return proberDecoder;
87 }
88 }
89
91 };
92
93 QStringDecoder codec = {autodetectCodec(file)};
94
95 int lines = 0;
96
97 while (!file.atEnd()) {
98 QString text = codec.decode(file.readLine());
99
100 if (codec.hasError()) {
101 qDebug() << "Invalid encoding. Ignoring" << result->inputUrl();
102 return;
103 }
104
105 // Newline '\n' can be first symbol in line in case UTF-16LE.
106 if (!text.isEmpty() && text.front() == QLatin1Char('\n')) {
107 text.removeFirst();
108 } else if (!text.isEmpty() && text.back() == QLatin1Char('\n')) {
109 text.removeLast();
110 }
111
112 // This case is possible for Little-Endian encodings
113 // when '\00' part of the newline character
114 // is mistakenly read here as a separate line.
115 if (file.atEnd() && text.isEmpty()) {
116 break;
117 }
118
119 result->append(text);
120
121 lines += 1;
122 }
123 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
124 result->add(Property::LineCount, lines);
125 }
126}
127
128#include "moc_plaintextextractor.cpp"
The ExtractionResult class is where all the data extracted by the indexer is saved.
QString inputUrl() const
The input URL which the plugins will use to locate the file.
virtual void addType(Type::Type type)=0
This function is called by the plugins.
virtual void add(Property::Property property, const QVariant &value)=0
This function is called by the plugins when they wish to add a key value pair which should be indexed...
Flags inputFlags() const
The flags which the extraction plugin should consider following when extracting metadata from the ...
virtual void append(const QString &text)=0
This function is called by plugins when they wish for some plain text to be indexed without any prope...
The ExtractorPlugin is the base class for all file metadata extractors.
@ LineCount
The number of lines in a document.
Definition properties.h:151
@ Text
Any file which contains text data (i.e.
Definition types.h:83
The KFileMetaData namespace.
const QList< QKeySequence > & open()
const char * constData() const const
QByteArray encodeName(const QString &fileName)
QChar & back()
QChar & front()
bool isEmpty() const const
QString & removeFirst()
QString & removeLast()
std::optional< Encoding > encodingForData(QByteArrayView data, char16_t expectedFirstCharacter)
bool hasError() const const
EncodedData< QByteArrayView > decode(QByteArrayView ba)
This file is part of the KDE documentation.
Documentation copyright © 1996-2024 The KDE developers.
Generated on Sat Dec 21 2024 16:59:41 by doxygen 1.12.0 written by Dimitri van Heesch, © 1997-2006

KDE's Doxygen guidelines are available online.