14#include "utf8validator.h"
21#include <QStringDecoder>
28#define STR_DIM(x) (sizeof(x) - 1)
// Constructs the public Lexer facade: allocates an Impl with the same
// (scursor, send, options) arguments and stores the pointer in `i`.
// NOTE(review): the constructor body and the matching release of `i` are not
// visible in this excerpt.
38Lexer::Lexer(
const char *scursor,
const char *send,
int options)
39 : i(new Impl(scursor, send, options))
// Const accessor that forwards to Impl::ignoreComments() through `i`.
49bool Lexer::ignoreComments()
const
52 return i->ignoreComments();
// Const accessor returning a reference to an Error object.
// NOTE(review): body not visible here; presumably forwards to the Impl the
// same way ignoreComments() does — confirm.
55const Error &Lexer::error()
const
// Const predicate on the lexer's position.
// NOTE(review): body not visible here; presumably forwards to Impl::atEnd(),
// which the Impl methods use to guard cursor dereferences — confirm.
61bool Lexer::atEnd()
const
// Const accessor returning an int column value.
// NOTE(review): body not visible here; presumably forwards to Impl::column()
// — confirm.
67int Lexer::column()
const
// Const accessor returning an int line value.
// NOTE(review): body not visible here; presumably forwards to Impl::line()
// — confirm.
73int Lexer::line()
const
// Forwards to Impl::nextToken(), handing `result` through unchanged so the
// implementation can fill it in, and returns the Token kind it reports.
91Lexer::Token Lexer::nextToken(
QString &result)
94 return i->nextToken(result);
// 16-byte (128-bit) bitmap consulted by isIText() via isOfSet(); one bit per
// character code 0..127. The initializer values are not visible in this excerpt.
99static const unsigned char iTextMap[16] = {
// 16-byte (128-bit) bitmap consulted by isDelim() via isOfSet(); one bit per
// character code 0..127. The initializer values are not visible in this excerpt.
120static const unsigned char delimMap[16] = {
// 128-bit bitmap of characters that isIllegal() rejects, one bit per character
// code 0..127. Bit order follows isOfSet(): byte = code / 8, and the most
// significant bit (0x80) of each byte is the lowest code in that group of eight.
// Example: the first two bytes (0xFF, 0x9B) mark codes 0-8, 11, 12, 14 and 15
// but leave TAB (9), LF (10) and CR (13) clear.
140static const unsigned char illegalMap[16] = {0xFF, 0x9B, 0xFF, 0xFF, 0x4F, 0x16, 0x00, 0x0F, 0x80, 0x00, 0x00, 0x0A, 0x80, 0x00, 0x00, 0x0A};
// Tests whether the bit for character `ch` is set in the bitmap `map`.
// The byte is selected by ch / 8 and the bit by ch % 8, counting from the most
// significant bit (0x80) downwards.
// A 16-byte map only covers ch in 0..127; the callers visible in this file
// (isIText, isDelim, isIllegal) range-check `ch` before calling.
142static inline bool isOfSet(
const unsigned char map[16],
unsigned char ch)
// By operator precedence this parses as map[ch / 8] & (0x80 >> (ch % 8));
// the non-zero/zero result converts to true/false.
145 return map[ch / 8] & 0x80 >> ch % 8;
// Returns true when `ch` is at most 'z' and its bit is set in iTextMap.
// The range test runs first, so isOfSet() is never consulted for codes above 'z'.
148static inline bool isIText(
unsigned char ch)
150 return ch <=
'z' && isOfSet(iTextMap, ch);
// Returns true when `ch` is at most '}' and its bit is set in delimMap.
// The range test runs first, so isOfSet() is never consulted for codes above '}'.
153static inline bool isDelim(
unsigned char ch)
155 return ch <=
'}' && isOfSet(delimMap, ch);
// Returns true when `ch` is an illegal character: every code from '~' upwards
// (which includes all values >= 128) is illegal outright; below that the
// illegalMap bitmap decides. The short-circuit `||` keeps isOfSet() within the
// 0..127 range of the 16-byte map.
158static inline bool isIllegal(
unsigned char ch)
160 return ch >=
'~' || isOfSet(illegalMap, ch);
// Predicate on a single character taken as `signed char`.
// NOTE(review): body not visible here; presumably reports whether the high bit
// is set (i.e. the signed value is negative) — confirm. parseQuotedString()
// uses it to find runs of bytes that it then validates as UTF-8.
163static inline bool is8Bit(
signed char ch)
173 const int e = CRLF ? 2 : LF ? 1 : 0;
// Initializes the lexer implementation from an input range and option flags.
//   scursor - start of the input; may be null
//   send    - end of the input; may be null
//   options - bit flags; IgnoreComments and IgnoreLineFeeds are tested here
191Lexer::Impl::Impl(
const char *scursor,
const char *send,
int options)
// If one of the two pointers is null, both the state and the end are taken
// from the other pointer, so start and end coincide.
192 : mState(scursor ? scursor : send)
193 , mEnd(send ? send : scursor)
194 , mIgnoreComments(options & IgnoreComments)
195 , mIgnoreLF(options & IgnoreLineFeeds)
// Special handling when either pointer is null; the branch body is not visible
// in this excerpt.
197 if (!scursor || !send) {
// Scans the next token at mState.cursor and returns its Token kind; `result`
// is handed to the parse* helpers to receive the token's payload.
// Only part of this function is visible in this excerpt (case labels and
// several statements are missing); the comments below describe the visible lines.
202Lexer::Token Lexer::Impl::nextToken(
QString &result)
// Remember the current line so line breaks skipped below can be detected.
208 const int oldLine = line();
// Skip whitespace; when comments are ignored they are skipped along with it.
210 const bool eatingWSSucceeded = ignoreComments() ? eatCWS() : eatWS();
// Unless line feeds are ignored, store the number of lines crossed in `result`.
212 if (!ignoreLineFeeds() && oldLine != line()) {
213 result.
setNum(line() - oldLine);
217 if (!eatingWSSucceeded) {
// Dispatch on the current character.
225 switch (*mState.cursor) {
// Comments are only expected here when they are not being skipped as whitespace.
227 assert(!ignoreComments());
230 parseHashComment(result, true);
234 assert(!ignoreComments());
// After a '/', a '*' is required to open a bracket comment.
236 if (atEnd() || *mState.cursor !=
'*') {
237 makeError(Error::SlashWithoutAsterisk);
238 return BracketComment;
242 makeError(Error::UnfinishedBracketComment);
243 return BracketComment;
245 parseBracketComment(result,
true);
246 return BracketComment;
// The reported column is one before the current one.
// NOTE(review): presumably because the cursor already moved past the
// offending character — confirm.
250 makeError(Error::UnexpectedCharacter, line(), column() - 1);
253 if (!isIText(*mState.cursor)) {
254 makeIllegalCharError(*mState.cursor);
261 parseQuotedString(result);
// Case-insensitive match of the "text:" prefix that introduces a multi-line
// string; STR_DIM yields the literal's length without the terminating NUL.
286 if (_strnicmp(mState.cursor,
"text:", STR_DIM(
"text:")) == 0) {
// Step over the prefix before parsing the multi-line body.
288 mState.cursor += STR_DIM(
"text:");
289 parseMultiLine(result);
292 return MultiLineString;
// NOTE(review): this branch reports Error::IllegalCharacter directly, whereas
// the similar check above goes through makeIllegalCharError() — confirm the
// difference is intended.
296 if (!isIText(*mState.cursor)) {
297 makeError(Error::IllegalCharacter);
300 parseIdentifier(result);
// Skips whitespace, dispatching on the current character. Only the switch
// header is visible in this excerpt. nextToken() treats the bool result as
// "eating whitespace succeeded".
305bool Lexer::Impl::eatWS()
308 switch (*mState.cursor) {
// Consumes a line ending at the cursor.
// Precondition (asserted): the cursor is on '\n' or '\r'.
// Only part of the body is visible in this excerpt.
328bool Lexer::Impl::eatCRLF()
331 assert(*mState.cursor ==
'\n' || *mState.cursor ==
'\r');
333 if (*mState.cursor ==
'\r') {
// A '\r' that is followed by end of input or by anything other than '\n' is
// reported as CRWithoutLF. (The cursor advance between the two checks is not
// visible here.)
335 if (atEnd() || *mState.cursor !=
'\n') {
337 makeError(Error::CRWithoutLF);
// Parses a '#' comment whose '#' has already been consumed.
//   result     - receives the comment text (NOTE(review): presumably only when
//                reallySave is true — the saving code is not visible; confirm)
//   reallySave - save flag supplied by the caller
// Precondition (asserted): the character before the cursor is '#'.
// Only part of the body is visible in this excerpt.
351bool Lexer::Impl::parseHashComment(
QString &result,
bool reallySave)
356 assert(*(mState.cursor - 1) ==
'#');
358 const char *
const commentStart = mState.cursor;
// The comment ends at a line break.
362 if (*mState.cursor ==
'\n' || *mState.cursor ==
'\r') {
// Last character of the comment: the one just before the cursor.
367 const char *
const commentEnd = mState.cursor - 1;
374 if (atEnd() || eatCRLF()) {
// commentEnd is inclusive, hence the +1.
375 const int commentLength = commentEnd - commentStart + 1;
376 if (commentLength > 0) {
// A non-empty comment must be valid UTF-8.
377 if (!isValidUtf8(commentStart, commentLength)) {
378 makeError(Error::InvalidUTF8);
// Parses a bracket comment whose opening "/*" has already been consumed.
//   result     - receives the comment text (NOTE(review): presumably only when
//                reallySave is true — the saving code is not visible; confirm)
//   reallySave - save flag supplied by the caller
// Precondition (asserted): the two characters before the cursor are '/' and '*'.
// Only part of the body is visible in this excerpt.
395bool Lexer::Impl::parseBracketComment(
QString &result,
bool reallySave)
400 assert(*(mState.cursor - 2) ==
'/');
401 assert(*(mState.cursor - 1) ==
'*');
403 const char *
const commentStart = mState.cursor;
// Record where the comment opened (two columns back, at the '/') so that
// "unfinished comment" errors point at its start.
404 const int commentCol = column() - 2;
405 const int commentLine = line();
411 makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
// Tail of a do/while loop whose opening is not visible here: pre-increment the
// cursor and keep looping until it lands on '/' or the input ends.
415 }
while (!atEnd() && *++mState.cursor !=
'/');
418 makeError(Error::UnfinishedBracketComment, commentLine, commentCol);
422 assert(*mState.cursor ==
'/');
// Length up to, but excluding, the character just before the closing '/'.
// NOTE(review): presumably that character is the closing '*' — confirm.
424 const int commentLength = mState.cursor - commentStart - 1;
425 if (commentLength > 0) {
// A non-empty comment must be valid UTF-8.
426 if (!isValidUtf8(commentStart, commentLength)) {
427 makeError(Error::InvalidUTF8);
// Dispatches on the current character to the matching comment parser
// (parseHashComment or parseBracketComment), passing `result` and `reallySave`
// through. Case labels are not visible in this excerpt.
440bool Lexer::Impl::parseComment(
QString &result,
bool reallySave)
444 switch (*mState.cursor) {
447 return parseHashComment(result, reallySave);
// A bracket comment needs at least two characters, the second being '*';
// anything else is reported as an illegal character.
449 if (charsLeft() < 2 || mState.cursor[1] !=
'*') {
450 makeError(Error::IllegalCharacter);
454 return parseBracketComment(result, reallySave);
// Skips whitespace together with comments, dispatching on the current
// character. Comments are parsed into a throwaway `dummy` string. Only part of
// the body is visible in this excerpt.
461bool Lexer::Impl::eatCWS()
466 switch (*mState.cursor) {
// parseComment() is called without the second argument, so its declared
// default for reallySave applies (declaration not visible here).
480 if (!parseComment(dummy)) {
// Parses an identifier starting at the cursor.
//   result - NOTE(review): presumably receives the identifier text; the
//            assignment is not visible in this excerpt — confirm.
// Precondition (asserted): the current character satisfies isIText().
492bool Lexer::Impl::parseIdentifier(
QString &result)
496 assert(isIText(*mState.cursor));
498 const char *
const identifierStart = mState.cursor;
// Identifiers must not begin with a digit.
501 if (isdigit(*mState.cursor)) {
502 makeError(Error::NoLeadingDigits);
// Consume the remaining identifier characters; the loop body is intentionally
// empty because all the work happens in the loop header.
507 for (++mState.cursor; !atEnd() && isIText(*mState.cursor); ++mState.cursor) { }
509 const int identifierLength = mState.cursor - identifierStart;
// The identifier must be followed by end of input or a delimiter; any other
// character is reported through makeIllegalCharError().
515 if (atEnd() || isDelim(*mState.cursor)) {
519 makeIllegalCharError(*mState.cursor);
// Parses a tag whose leading ':' has already been consumed, by delegating to
// parseIdentifier().
// Preconditions (asserted): the character before the cursor is ':' and the
// current character satisfies isIText().
523bool Lexer::Impl::parseTag(
QString &result)
528 assert(*(mState.cursor - 1) ==
':');
530 assert(isIText(*mState.cursor));
532 return parseIdentifier(result);
// Parses a number starting at the cursor.
//   result - NOTE(review): presumably receives the number text; the assignment
//            is not visible in this excerpt — confirm.
// Precondition (asserted): the current character is a digit.
// Only part of the body is visible in this excerpt.
535bool Lexer::Impl::parseNumber(
QString &result)
540 assert(isdigit(*mState.cursor));
// Consume the run of digits.
542 while (!atEnd() && isdigit(*mState.cursor)) {
// Digits followed directly by end of input or a delimiter form a complete number.
546 if (atEnd() || isDelim(*mState.cursor)) {
// Otherwise inspect the character after the digits; the case labels are not
// visible here.
550 switch (*mState.cursor) {
// NOTE(review): called without an argument here, while the definition in this
// file takes `char ch` — presumably the declaration supplies a default; confirm.
560 makeIllegalCharError();
565 if (atEnd() || isDelim(*mState.cursor)) {
568 makeIllegalCharError();
// Parses a multi-line string whose "text:" prefix has already been consumed.
//   result - NOTE(review): presumably receives the collected text; the
//            assignment is not visible in this excerpt — confirm.
// Precondition (asserted): the five characters before the cursor match "text:"
// case-insensitively.
// Only part of the body is visible in this excerpt.
572bool Lexer::Impl::parseMultiLine(
QString &result)
582 assert(_strnicmp(mState.cursor - 5,
"text:", STR_DIM(
"text:")) == 0);
// Remember where the construct began (5 columns back, at the start of "text:")
// so that premature-end errors point at its start.
584 const int mlBeginLine = line();
585 const int mlBeginCol = column() - 5;
588 switch (*mState.cursor) {
// A hash comment is parsed into a throwaway string; the second argument is
// left to the declaration's default.
596 if (!parseHashComment(dummy)) {
608 makeError(Error::NonCWSAfterTextColon);
615 makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
622 const char *
const oldBeginOfLine = beginOfLine();
// Length of the text between the saved line start and the cursor.
626 const int lineLength = mState.cursor - oldBeginOfLine;
627 if (lineLength > 0) {
// A non-empty line must be valid UTF-8.
628 if (!isValidUtf8(oldBeginOfLine, lineLength)) {
629 makeError(Error::InvalidUTF8);
643 makeError(Error::PrematureEndOfMultiLine, mlBeginLine, mlBeginCol);
647 assert(!lines.
empty());
// Parses a quoted string whose opening '"' has already been consumed.
//   result - NOTE(review): presumably receives the decoded string; the
//            assignment is not visible in this excerpt — confirm.
// Precondition (asserted): the character before the cursor is '"'.
// Only part of the body is visible in this excerpt.
653bool Lexer::Impl::parseQuotedString(
QString &result)
658 assert(*(mState.cursor - 1) ==
'"');
// Remember where the string opened (one column back, at the quote) so that
// the premature-end error points at its start.
660 const int qsBeginCol = column() - 1;
661 const int qsBeginLine = line();
665 switch (*mState.cursor) {
683 if (!is8Bit(*mState.cursor)) {
// A run of 8-bit characters is collected and validated as UTF-8 as a whole.
// (The loop that advances the cursor over the run is not visible here.)
686 const char *
const eightBitBegin = mState.cursor;
688 const int eightBitLen = mState.cursor - eightBitBegin;
689 assert(eightBitLen > 0);
690 if (isValidUtf8(eightBitBegin, eightBitLen)) {
// On invalid UTF-8, report the column where the 8-bit run started.
693 assert(column() >= eightBitLen);
694 makeError(Error::InvalidUTF8, line(), column() - eightBitLen);
701 makeError(Error::PrematureEndOfQuotedString, qsBeginLine, qsBeginCol);
// Records an error for the offending character `ch`: Error::IllegalCharacter
// when isIllegal(ch) is true, otherwise Error::UnexpectedCharacter.
705void Lexer::Impl::makeIllegalCharError(
char ch)
707 makeError(isIllegal(ch) ? Error::IllegalCharacter : Error::UnexpectedCharacter);
void error(QWidget *parent, const QString &text, const QString &title, const KGuiItem &buttonOk, Options options=Notify)
iterator erase(const_iterator begin, const_iterator end)
void push_back(parameter_type value)
bool endsWith(QChar c, Qt::CaseSensitivity cs) const const
QString fromLatin1(QByteArrayView str)
QString fromUtf8(QByteArrayView str)
QString left(qsizetype n) const const
qsizetype length() const const
QString mid(qsizetype position, qsizetype n) const const
QString & remove(QChar ch, Qt::CaseSensitivity cs)
QString & replace(QChar before, QChar after, Qt::CaseSensitivity cs)
QString & setNum(double n, char format, int precision)
bool startsWith(QChar c, Qt::CaseSensitivity cs) const const
QString join(QChar separator) const const
QTextStream & dec(QTextStream &stream)
QFuture< void > map(Iterator begin, Iterator end, MapFunctor &&function)