cf12defd28
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
313 lines
10 KiB
C++
313 lines
10 KiB
C++
/*
    This file is part of the KDE libraries

    SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>

    SPDX-License-Identifier: LGPL-2.0-or-later
*/
|
|
|
|
#include "kencodingprober.h"
|
|
|
|
#include "probers/ChineseGroupProber.h"
|
|
#include "probers/JapaneseGroupProber.h"
|
|
#include "probers/UnicodeGroupProber.h"
|
|
#include "probers/nsCharSetProber.h"
|
|
#include "probers/nsMBCSGroupProber.h"
|
|
#include "probers/nsSBCSGroupProber.h"
|
|
#include "probers/nsUniversalDetector.h"
|
|
|
|
#include <string.h>
|
|
|
|
class KEncodingProberPrivate
|
|
{
|
|
public:
|
|
KEncodingProberPrivate()
|
|
: mProber(nullptr)
|
|
, mStart(true)
|
|
{
|
|
}
|
|
~KEncodingProberPrivate()
|
|
{
|
|
delete mProber;
|
|
}
|
|
void setProberType(KEncodingProber::ProberType pType)
|
|
{
|
|
mProberType = pType;
|
|
/* handle multi-byte encodings carefully , because they're hard to detect,
|
|
* and have to use some Stastics methods.
|
|
* for single-byte encodings (most western encodings), nsSBCSGroupProber is ok,
|
|
* because encoding state machine can detect many such encodings.
|
|
*/
|
|
|
|
delete mProber;
|
|
|
|
switch (mProberType) {
|
|
case KEncodingProber::None:
|
|
mProber = nullptr;
|
|
break;
|
|
case KEncodingProber::Arabic:
|
|
case KEncodingProber::Baltic:
|
|
case KEncodingProber::CentralEuropean:
|
|
case KEncodingProber::Cyrillic:
|
|
case KEncodingProber::Greek:
|
|
case KEncodingProber::Hebrew:
|
|
case KEncodingProber::NorthernSaami:
|
|
case KEncodingProber::Other:
|
|
case KEncodingProber::SouthEasternEurope:
|
|
case KEncodingProber::Thai:
|
|
case KEncodingProber::Turkish:
|
|
case KEncodingProber::WesternEuropean:
|
|
mProber = new kencodingprober::nsSBCSGroupProber();
|
|
break;
|
|
case KEncodingProber::ChineseSimplified:
|
|
case KEncodingProber::ChineseTraditional:
|
|
mProber = new kencodingprober::ChineseGroupProber();
|
|
break;
|
|
case KEncodingProber::Japanese:
|
|
mProber = new kencodingprober::JapaneseGroupProber();
|
|
break;
|
|
case KEncodingProber::Korean:
|
|
mProber = new kencodingprober::nsMBCSGroupProber();
|
|
break;
|
|
case KEncodingProber::Unicode:
|
|
mProber = new kencodingprober::UnicodeGroupProber();
|
|
break;
|
|
case KEncodingProber::Universal:
|
|
mProber = new kencodingprober::nsUniversalDetector();
|
|
break;
|
|
default:
|
|
mProber = nullptr;
|
|
}
|
|
}
|
|
void unicodeTest(const char *aBuf, int aLen)
|
|
{
|
|
if (mStart) {
|
|
mStart = false;
|
|
if (aLen > 3) {
|
|
switch (aBuf[0]) {
|
|
case '\xEF':
|
|
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
|
|
// EF BB BF UTF-8 encoded BOM
|
|
{
|
|
mProberState = KEncodingProber::FoundIt;
|
|
}
|
|
break;
|
|
case '\xFE':
|
|
if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
|
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
|
|
{
|
|
mProberState = KEncodingProber::FoundIt;
|
|
} else if ('\xFF' == aBuf[1])
|
|
// FE FF UTF-16, big endian BOM
|
|
{
|
|
mProberState = KEncodingProber::FoundIt;
|
|
}
|
|
break;
|
|
case '\x00':
|
|
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
|
|
// 00 00 FE FF UTF-32, big-endian BOM
|
|
{
|
|
mProberState = KEncodingProber::FoundIt;
|
|
} else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
|
|
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
|
{
|
|
mProberState = KEncodingProber::FoundIt;
|
|
}
|
|
break;
|
|
case '\xFF':
|
|
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
|
// FF FE 00 00 UTF-32, little-endian BOM
|
|
{
|
|
mProberState = KEncodingProber::FoundIt;
|
|
} else if ('\xFE' == aBuf[1])
|
|
// FF FE UTF-16, little endian BOM
|
|
{
|
|
mProberState = KEncodingProber::FoundIt;
|
|
}
|
|
break;
|
|
} // switch
|
|
}
|
|
}
|
|
}
|
|
KEncodingProber::ProberType mProberType;
|
|
KEncodingProber::ProberState mProberState;
|
|
kencodingprober::nsCharSetProber *mProber;
|
|
bool mStart;
|
|
};
|
|
|
|
// Creates the private data and then selects the detection engine for
// proberType; setProberType() also puts the prober into its initial state.
KEncodingProber::KEncodingProber(KEncodingProber::ProberType proberType)
    : d(new KEncodingProberPrivate)
{
    setProberType(proberType);
}
|
|
|
|
// Defaulted destructor: no explicit cleanup is done here.
// NOTE(review): presumably `d` is an owning smart pointer declared in the
// header, so the private data is released automatically - confirm.
KEncodingProber::~KEncodingProber() = default;
|
|
|
|
void KEncodingProber::reset()
|
|
{
|
|
d->mProberState = KEncodingProber::Probing;
|
|
d->mStart = true;
|
|
}
|
|
|
|
// Feeds one chunk of data to the prober and returns the resulting state.
//
// Without a detection engine the stored state is returned unchanged. While
// the state is Probing, the first chunk is checked for a byte order mark; if
// one is found the function returns FoundIt right away without handing the
// data to the engine. Otherwise the chunk is passed to the engine and the
// engine's state is mapped onto ProberState. Once the state is no longer
// Probing, further data is ignored.
KEncodingProber::ProberState KEncodingProber::feed(QByteArrayView data)
{
    if (d->mProber == nullptr) {
        return d->mProberState;
    }

    if (d->mProberState == Probing) {
        const char *bytes = data.constData();
        const auto length = data.size();

        if (d->mStart) {
            d->unicodeTest(bytes, length);
            if (d->mProberState == FoundIt) {
                return d->mProberState;
            }
        }

        d->mProber->HandleData(bytes, length);

        // Translate the engine's verdict into the public state enum.
        const auto engineState = d->mProber->GetState();
        if (engineState == kencodingprober::eNotMe) {
            d->mProberState = NotMe;
        } else if (engineState == kencodingprober::eFoundIt) {
            d->mProberState = FoundIt;
        } else {
            d->mProberState = Probing;
        }
    }

#ifdef DEBUG_PROBE
    d->mProber->DumpStatus();
#endif

    return d->mProberState;
}
|
|
|
|
// Returns the probing state stored by the last feed() or reset() call.
KEncodingProber::ProberState KEncodingProber::state() const
{
    const ProberState current = d->mProberState;
    return current;
}
|
|
|
|
// Returns the charset name reported by the detection engine, or "UTF-8"
// when no engine is active.
QByteArray KEncodingProber::encoding() const
{
    if (d->mProber != nullptr) {
        return QByteArray(d->mProber->GetCharSetName());
    }

    return QByteArray("UTF-8");
}
|
|
|
|
float KEncodingProber::confidence() const
|
|
{
|
|
if (!d->mProber) {
|
|
return 0.0;
|
|
}
|
|
|
|
return d->mProber->GetConfidence();
|
|
}
|
|
|
|
// Returns the prober type most recently selected via setProberType().
KEncodingProber::ProberType KEncodingProber::proberType() const
{
    const ProberType selected = d->mProberType;
    return selected;
}
|
|
|
|
// Switches to the detection engine for proberType and then resets the
// probing state, so that detection starts over with the new engine.
void KEncodingProber::setProberType(KEncodingProber::ProberType proberType)
{
    // Engine replacement first, state reset second.
    d->setProberType(proberType);
    reset();
}
|
|
|
|
// Maps a translated, user-visible character set name back to its prober
// type. The name is compared against the translated strings one after the
// other, in the order listed below. An empty or unrecognised name yields
// KEncodingProber::Universal.
KEncodingProber::ProberType KEncodingProber::proberTypeForName(const QString &lang)
{
    if (lang.isEmpty()) {
        return KEncodingProber::Universal;
    }

    // Each tr() call keeps its literal arguments so that string extraction
    // for translation keeps working.
    if (lang == tr("Disabled", "@item Text character set")) {
        return KEncodingProber::None;
    }
    if (lang == tr("Universal", "@item Text character set")) {
        return KEncodingProber::Universal;
    }
    if (lang == tr("Unicode", "@item Text character set")) {
        return KEncodingProber::Unicode;
    }
    if (lang == tr("Cyrillic", "@item Text character set")) {
        return KEncodingProber::Cyrillic;
    }
    if (lang == tr("Western European", "@item Text character set")) {
        return KEncodingProber::WesternEuropean;
    }
    if (lang == tr("Central European", "@item Text character set")) {
        return KEncodingProber::CentralEuropean;
    }
    if (lang == tr("Greek", "@item Text character set")) {
        return KEncodingProber::Greek;
    }
    if (lang == tr("Hebrew", "@item Text character set")) {
        return KEncodingProber::Hebrew;
    }
    if (lang == tr("Turkish", "@item Text character set")) {
        return KEncodingProber::Turkish;
    }
    if (lang == tr("Japanese", "@item Text character set")) {
        return KEncodingProber::Japanese;
    }
    if (lang == tr("Baltic", "@item Text character set")) {
        return KEncodingProber::Baltic;
    }
    if (lang == tr("Chinese Traditional", "@item Text character set")) {
        return KEncodingProber::ChineseTraditional;
    }
    if (lang == tr("Chinese Simplified", "@item Text character set")) {
        return KEncodingProber::ChineseSimplified;
    }
    if (lang == tr("Korean", "@item Text character set")) {
        return KEncodingProber::Korean;
    }
    if (lang == tr("Thai", "@item Text character set")) {
        return KEncodingProber::Thai;
    }
    if (lang == tr("Arabic", "@item Text character set")) {
        return KEncodingProber::Arabic;
    }

    // Nothing matched: fall back to the universal prober.
    return KEncodingProber::Universal;
}
|
|
|
|
// Returns the translated, user-visible name for proberType. Types without a
// case below (and any unknown value) yield a default-constructed QString.
QString KEncodingProber::nameForProberType(KEncodingProber::ProberType proberType)
{
    // Every case returns directly, so no break statements are needed.
    switch (proberType) {
    case KEncodingProber::None:
        return tr("Disabled", "@item Text character set");
    case KEncodingProber::Universal:
        return tr("Universal", "@item Text character set");
    case KEncodingProber::Arabic:
        return tr("Arabic", "@item Text character set");
    case KEncodingProber::Baltic:
        return tr("Baltic", "@item Text character set");
    case KEncodingProber::CentralEuropean:
        return tr("Central European", "@item Text character set");
    case KEncodingProber::Cyrillic:
        return tr("Cyrillic", "@item Text character set");
    case KEncodingProber::Greek:
        return tr("Greek", "@item Text character set");
    case KEncodingProber::Hebrew:
        return tr("Hebrew", "@item Text character set");
    case KEncodingProber::Japanese:
        return tr("Japanese", "@item Text character set");
    case KEncodingProber::Turkish:
        return tr("Turkish", "@item Text character set");
    case KEncodingProber::WesternEuropean:
        return tr("Western European", "@item Text character set");
    case KEncodingProber::ChineseTraditional:
        return tr("Chinese Traditional", "@item Text character set");
    case KEncodingProber::ChineseSimplified:
        return tr("Chinese Simplified", "@item Text character set");
    case KEncodingProber::Korean:
        return tr("Korean", "@item Text character set");
    case KEncodingProber::Thai:
        return tr("Thai", "@item Text character set");
    case KEncodingProber::Unicode:
        return tr("Unicode", "@item Text character set");
    default:
        return QString();
    }
}
|