9#include <QCoreApplication>
13#include <QStandardPaths>
15#include "core_debug.h"
16#include "guesslanguage.h"
19#include "spellerplugin_p.h"
20#include "tokenizer_p.h"
47class GuessLanguagePrivate
50 GuessLanguagePrivate();
52 static QHash<QString, QHash<QString, int>> s_knownModels;
55 QList<QChar::Script> findRuns(
const QString &text);
56 QList<QString> createOrderedModel(
const QString &content);
57 int distance(
const QList<QString> &model,
const QHash<QString, int> &knownModel);
58 QStringList guessFromTrigrams(
const QString &sample,
const QStringList &langs);
59 QStringList identify(
const QString &sample,
const QList<QChar::Script> &scripts);
60 QString guessFromDictionaries(
const QString &sentence,
const QStringList &candidates);
62 static QSet<QString> s_knownDictionaries;
63 static QMultiHash<QChar::Script, QString> s_scriptLanguages;
64 static QMap<QString, QString> s_dictionaryNameMap;
68 double m_minConfidence;
71QHash<QString, QHash<QString, int>> GuessLanguagePrivate::s_knownModels;
72QSet<QString> GuessLanguagePrivate::s_knownDictionaries;
73QMultiHash<QChar::Script, QString> GuessLanguagePrivate::s_scriptLanguages;
74QMap<QString, QString> GuessLanguagePrivate::s_dictionaryNameMap;
80 locales.
reserve(matchingLocales.size());
81 for (
const QLocale &locale : matchingLocales) {
82 locales << locale.name();
87GuessLanguagePrivate::GuessLanguagePrivate()
92 if (!s_scriptLanguages.isEmpty()) {
96 const QStringList languages = Loader::openLoader()->languages();
99 for (
const QString &dictName : std::as_const(s_knownDictionaries)) {
102 qCWarning(SONNET_LOG_CORE) <<
"Unable to parse name for dictionary" << dictName;
105 dictionaryLanguages.
insert(languageName);
500 qCDebug(SONNET_LOG_CORE) <<
"Unhandled script" << script;
507 for (
const QString &name : std::as_const(names)) {
508 if (!dictionaryLanguages.contains(name)) {
520 for (
const QString &name : std::as_const(names)) {
521 s_scriptLanguages.insert(script, name);
526 if (!allLanguages.
contains(s_knownDictionaries)) {
527 QSet<QString> dicts(s_knownDictionaries);
528 dicts.subtract(allLanguages);
529 for (const QString &dictName : std::as_const(dicts)) {
530 QString languageName = QLocale(dictName).name();
531 if (languageName.isEmpty()) {
532 qCWarning(SONNET_LOG_CORE) <<
"Unable to parse language name" << dictName;
535 s_dictionaryNameMap[languageName] = dictName;
536 if (std::find(s_scriptLanguages.cbegin(), s_scriptLanguages.cend(), languageName) == s_scriptLanguages.cend()) {
537 qCWarning(SONNET_LOG_CORE) <<
"Unable to handle language from dictionary" << dictName << languageName;
544 : d(new GuessLanguagePrivate)
558 for (
const QString &suggestion : suggestionsListIn) {
559 if (d->s_knownDictionaries.contains(suggestion) && !suggestionsList.
contains(suggestion)) {
560 suggestionsList.
append(suggestion);
565 if (d->s_knownModels.isEmpty()) {
571 QStringList candidateLanguages = d->identify(text, scriptsList);
574 if (candidateLanguages.
isEmpty()) {
576 const auto languagesList = d->s_scriptLanguages.values(script);
577 for (
const QString &lang : languagesList) {
578 if (!d->s_knownModels.contains(lang)) {
579 candidateLanguages.
append(lang);
586 for (
int i = 0; i < candidateLanguages.
count(); i++) {
587 if (d->s_dictionaryNameMap.contains(candidateLanguages[i])) {
588 candidateLanguages[i] = d->s_dictionaryNameMap.
value(candidateLanguages[i]);
592 if (candidateLanguages.
count() == 1) {
593 return candidateLanguages.
first();
598 candidateLanguages.
append(suggestionsList);
600 QString identified = d->guessFromDictionaries(text, candidateLanguages);
605 qCDebug(SONNET_LOG_CORE()) <<
"Unable to identify string with dictionaries:" << text;
608 if (!suggestionsList.
isEmpty()) {
609 return suggestionsList.
first();
612 qCDebug(SONNET_LOG_CORE) <<
"Unable to find any suggestion for" << text;
620 d->m_maxItems = maxItems;
621 d->m_minConfidence = minConfidence;
624void GuessLanguagePrivate::loadModels()
627 const QString triMapFile = QStringLiteral(
":/org.kde.sonnet/trigrams.map");
628 qCDebug(SONNET_LOG_CORE) <<
"Loading trigrams from" << triMapFile;
630 QFile sin(triMapFile);
632 qCWarning(SONNET_LOG_CORE) <<
"Sonnet: Unable to load trigram models from file" << triMapFile;
642 while (iterator.hasNext()) {
644 if (iterator.value().count() < MAXGRAMS) {
645 qCWarning(SONNET_LOG_CORE) << iterator.key() <<
"is has only" << iterator.value().count() <<
"trigrams, expected" << MAXGRAMS;
647 availableLanguages.
insert(iterator.key());
649 QSet<QString> knownLanguages(s_scriptLanguages.constBegin(), s_scriptLanguages.constEnd());
650 knownLanguages.subtract(availableLanguages);
651 if (!knownLanguages.isEmpty()) {
652 qCDebug(SONNET_LOG_CORE) <<
"Missing trigrams for languages:" << knownLanguages;
656QList<QChar::Script> GuessLanguagePrivate::findRuns(
const QString &text)
658 QHash<QChar::Script, int> scriptCounts;
662 for (
const QChar c : text) {
673 scriptCounts[script]++;
677 QList<QChar::Script> relevantScripts;
679 if (totalCount == 0) {
680 return relevantScripts;
683 if (scriptCounts.
size() == 1) {
684 return {scriptCounts.
cbegin().key()};
687 for (
auto it = scriptCounts.
cbegin(); it != scriptCounts.
cend(); ++it) {
689 const int scriptCount = it.value();
690 const auto currentScript = it.key();
691 if (scriptCount * 100 / totalCount >= 40) {
692 relevantScripts << currentScript;
695 relevantScripts << currentScript;
699 return relevantScripts;
702QStringList GuessLanguagePrivate::identify(
const QString &sample,
const QList<QChar::Script> &scripts)
704 if (sample.
size() < MIN_LENGTH) {
705 return QStringList();
710 guesses.
append(guessFromTrigrams(sample, s_scriptLanguages.values(script)));
716QStringList GuessLanguagePrivate::guessFromTrigrams(
const QString &sample,
const QStringList &languages)
720 const QList<QString> sampleTrigrams = createOrderedModel(sample);
723 QMultiMap<int, QString> scores;
724 for (
const QString &language : languages) {
725 if (s_knownModels.contains(language)) {
726 scores.
insert(distance(sampleTrigrams, s_knownModels[language]), language);
732 qCDebug(SONNET_LOG_CORE) <<
"No scores for" << sample;
737 double confidence = 0;
739 QMultiMapIterator<int, QString> it(scores);
742 QString prevItem = it.value();
743 int prevScore = it.key();
745 while (it.hasNext() && counter < m_maxItems && confidence < m_minConfidence) {
748 confidence += (it.key() - prevScore) / (double)it.key();
750 prevItem = it.value();
751 prevScore = it.key();
753 if (counter < m_maxItems && confidence < m_minConfidence) {
760QList<QString> GuessLanguagePrivate::createOrderedModel(
const QString &content)
762 QHash<QString, int> trigramCounts;
766 for (
int i = 0; i < (content.
size() - 2); ++i) {
768 trigramCounts[tri]++;
772 QList<QPair<int, QString>> trigramFrequencyList;
773 trigramFrequencyList.
reserve(trigramCounts.
size());
776 for (; it != trigramCounts.
constEnd(); ++it) {
777 const QChar *data = it.key().constData();
778 bool hasTwoSpaces = (data[1].
isSpace() && (data[0].isSpace() || data[2].isSpace()));
781 const int freq = it.value();
782 const QString &trigram = it.key();
783 trigramFrequencyList.
append({freq, trigram});
788 std::sort(trigramFrequencyList.
begin(), trigramFrequencyList.
end(), [](
const QPair<int, QString> &a,
const QPair<int, QString> &b) {
789 return a.first > b.first;
792 QList<QString> orderedTrigrams;
793 orderedTrigrams.
reserve(trigramFrequencyList.
size());
794 for (
const auto &tri : std::as_const(trigramFrequencyList)) {
795 orderedTrigrams.
append(tri.second);
798 return orderedTrigrams;
801int GuessLanguagePrivate::distance(
const QList<QString> &model,
const QHash<QString, int> &knownModel)
806 for (
const QString &trigram : model) {
807 const int val = knownModel.
value(trigram, -1);
809 dist += qAbs(++counter - val);
814 if (counter == (MAXGRAMS - 1)) {
822QString GuessLanguagePrivate::guessFromDictionaries(
const QString &sentence,
const QStringList &candidates)
825 QList<QSharedPointer<SpellerPlugin>> spellers;
826 for (
const QString &lang : candidates) {
827 if (!Loader::openLoader()->
languages().contains(lang)) {
828 qCWarning(SONNET_LOG_CORE) <<
"Dictionary asked for invalid speller" << lang;
831 QSharedPointer<SpellerPlugin> plugin = Loader::openLoader()->cachedSpeller(lang);
842 QMap<QString, int> correctHits;
844 WordTokenizer tokenizer(sentence);
845 while (tokenizer.hasNext()) {
846 Token word = tokenizer.next();
847 if (!tokenizer.isSpellcheckable()) {
851 for (
int i = 0; i < spellers.
count(); ++i) {
852 if (spellers[i]->isCorrect(word.toString())) {
853 correctHits[spellers[i]->language()]++;
862 QMap<QString, int>::const_iterator max = correctHits.
constBegin();
863 for (QMap<QString, int>::const_iterator itr = correctHits.
constBegin(); itr != correctHits.
constEnd(); ++itr) {
864 if (itr.value() > max.value()) {
GuessLanguage()
Constructor. Creates a new GuessLanguage instance.
QString identify(const QString &text, const QStringList &suggestions=QStringList()) const
Returns the two-letter ISO 639-1 code for the language of the currently set text.
~GuessLanguage()
Destructor.
void setLimits(int maxItems, double minConfidence)
Sets limits on the number of languages returned by identify().
KEDUVOCDOCUMENT_EXPORT QStringList languages()
bool isSpace(char32_t ucs4)
const_iterator cbegin() const const
const_iterator cend() const const
const_iterator constBegin() const const
const_iterator constEnd() const const
void reserve(qsizetype size)
qsizetype size() const const
T value(const Key &key) const const
void append(QList< T > &&value)
const_iterator constBegin() const const
const_iterator constEnd() const const
qsizetype count() const const
bool isEmpty() const const
void reserve(qsizetype size)
qsizetype size() const const
T value(qsizetype i) const const
QList< QLocale > matchingLocales(QLocale::Language language, QLocale::Script script, QLocale::Territory territory)
QString name() const const
const_iterator constBegin() const const
const_iterator constEnd() const const
bool isEmpty() const const
const Key & firstKey() const const
iterator insert(const Key &key, const T &value)
bool isEmpty() const const
bool contains(const QSet< T > &other) const const
iterator insert(const T &value)
QSet< T > & unite(const QSet< T > &other)
bool isNull() const const
QString & insert(qsizetype position, QChar ch)
bool isEmpty() const const
QString mid(qsizetype position, qsizetype n) const const
qsizetype size() const const
QString toLower() const const
bool contains(QLatin1StringView str, Qt::CaseSensitivity cs) const const
qsizetype removeDuplicates()