Files
RedBear-OS/local/recipes/kde/kf6-kcodecs/source/src/probers/nsUniversalDetector.cpp
T
2026-04-14 10:51:06 +01:00

223 lines
6.1 KiB
C++

/* -*- C++ -*-
SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org>
SPDX-FileCopyrightText: 2008 Wang Kai <wkai@gmail.com>
SPDX-License-Identifier: MIT
*/
#include "nsUniversalDetector.h"
#include "nsEscCharsetProber.h"
#include "nsLatin1Prober.h"
#include "nsMBCSGroupProber.h"
#include "nsSBCSGroupProber.h"
namespace kencodingprober
{
/**
 * Builds a detector in its initial state: nothing detected, no data seen,
 * input classified as pure ASCII. No prober is allocated here; every prober
 * slot, including the escape-sequence prober, starts out as nullptr.
 */
nsUniversalDetector::nsUniversalDetector()
{
    // Prober slots are filled lazily by HandleData().
    for (unsigned int slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot) {
        mCharSetProbers[slot] = nullptr;
    }
    mEscCharSetProber = nullptr;

    // Input classification state.
    mInputState = ePureAscii;
    mLastChar = '\0';
    mStart = true;
    mInTag = false;
    mGotData = false;

    // Detection result state.
    mDetectedCharset = nullptr;
    mBestGuess = -1; // illegal value as signal
    mDone = false;
}
/**
 * Releases every prober held by the detector: first each entry of
 * mCharSetProbers, then the escape-sequence prober. Slots that were never
 * allocated hold nullptr, for which delete is a no-op.
 */
nsUniversalDetector::~nsUniversalDetector()
{
    unsigned int slot = 0;
    while (slot < NUM_OF_CHARSET_PROBERS) {
        delete mCharSetProbers[slot];
        ++slot;
    }

    delete mEscCharSetProber;
}
/**
 * Returns the detector to its initial state so it can be reused on new input.
 *
 * Probers that already exist are kept and reset in place rather than
 * destroyed; slots that were never allocated are left empty.
 */
void nsUniversalDetector::Reset()
{
    // Detection result state.
    mDone = false;
    mDetectedCharset = nullptr;
    mBestGuess = -1; // illegal value as signal

    // Input classification state.
    mGotData = false;
    mInTag = false;
    mStart = true;
    mLastChar = '\0';
    mInputState = ePureAscii;

    if (mEscCharSetProber != nullptr) {
        mEscCharSetProber->Reset();
    }

    for (unsigned int slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot) {
        if (mCharSetProbers[slot] == nullptr) {
            continue;
        }
        mCharSetProbers[slot]->Reset();
    }
}
//---------------------------------------------------------------------
// Confidence thresholds used by the detector.
//
// SHORTCUT_THRESHOLD is not referenced in the visible part of this file;
// NOTE(review): presumably a "confident enough to stop early" bound - confirm
// whether anything still uses it.
#define SHORTCUT_THRESHOLD (float)0.95
// MINIMUM_THRESHOLD is the confidence floor: GetCharSetName() and
// GetConfidence() only report a prober's result when its confidence is
// strictly greater than this value, and GetConfidence() returns this value
// when it has nothing better to report.
#define MINIMUM_THRESHOLD (float)0.20
/**
 * Feeds one chunk of raw bytes to the detector.
 *
 * The chunk is first scanned to classify the input seen so far (pure ASCII,
 * ASCII containing ESC or the HZ "~{" marker, or input containing high
 * bytes). It is then handed to the prober(s) matching that classification.
 *
 * @param aBuf  bytes to examine; read at indices [0, aLen)
 * @param aLen  number of bytes available in aBuf
 * @return eFoundIt as soon as a prober reports a definitive match (and on
 *         every later call); otherwise the state reported by the last prober
 *         consulted, or eDetecting when no prober was consulted.
 */
nsProbingState nsUniversalDetector::HandleData(const char *aBuf, unsigned int aLen)
{
    if (mDone) {
        return eFoundIt;
    }

    if (aLen > 0) {
        mGotData = true;
    }

    for (unsigned int i = 0; i < aLen; i++) {
        // other than 0xa0, if every other character is ascii, the page is ascii
        if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') { // Since many Ascii only page contains NBSP
            // we got a non-ascii byte (high-byte)
            if (mInputState != eHighbyte) {
                // adjust state
                mInputState = eHighbyte;

                // kill mEscCharSetProber if it is active
                delete mEscCharSetProber;
                mEscCharSetProber = nullptr;

                // start multibyte and singlebyte charset prober
                if (nullptr == mCharSetProbers[0]) {
                    mCharSetProbers[0] = new nsMBCSGroupProber;
                }
                if (nullptr == mCharSetProbers[1]) {
                    mCharSetProbers[1] = new nsSBCSGroupProber;
                }
                if (nullptr == mCharSetProbers[2]) {
                    mCharSetProbers[2] = new nsLatin1Prober;
                }
            }
        } else {
            // ok, just pure ascii so far
            if (ePureAscii == mInputState && (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~'))) {
                // found escape character or HZ "~{"
                mInputState = eEscAscii;
            }
            mLastChar = aBuf[i];
        }
    }

    // mDone is false here, so any value still held in mDetectedCharset is only
    // the provisional "UTF-8" guess stored by an earlier pure-ASCII chunk.
    // Drop it: if the input has since left the pure-ASCII state, keeping it
    // would make GetCharSetName()/GetConfidence() answer "UTF-8" without ever
    // consulting the probers. The pure-ASCII branch below sets it again.
    mDetectedCharset = nullptr;

    nsProbingState st = eDetecting;
    switch (mInputState) {
    case eEscAscii:
        if (nullptr == mEscCharSetProber) {
            mEscCharSetProber = new nsEscCharSetProber;
        }
        st = mEscCharSetProber->HandleData(aBuf, aLen);
        if (st == eFoundIt) {
            mDone = true;
            mDetectedCharset = mEscCharSetProber->GetCharSetName();
        }
        break;
    case eHighbyte:
        for (unsigned int i = 0; i < NUM_OF_CHARSET_PROBERS; ++i) {
            st = mCharSetProbers[i]->HandleData(aBuf, aLen);
            if (st == eFoundIt) {
                mDone = true;
                mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
                // Stop at the first definitive answer so that later probers
                // can neither overwrite the result nor the returned state.
                return st;
            }
        }
        break;
    default: // pure ascii
        mDetectedCharset = "UTF-8";
    }
    return st;
}
//---------------------------------------------------------------------
/**
 * Reports the detector's current answer.
 *
 * @return the stored charset name if one has been recorded; otherwise, when
 *         high bytes have been seen, the name given by the prober with the
 *         highest confidence, provided that confidence is strictly above
 *         MINIMUM_THRESHOLD; "UTF-8" in every other case.
 */
const char *nsUniversalDetector::GetCharSetName()
{
    if (mDetectedCharset != nullptr) {
        return mDetectedCharset;
    }

    if (mInputState == eHighbyte) {
        // Pick the prober with the strictly highest confidence; ties keep
        // the earliest slot.
        unsigned int bestSlot = 0;
        float bestScore = 0.0f;
        for (unsigned int slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot) {
            const float score = mCharSetProbers[slot]->GetConfidence();
            if (score > bestScore) {
                bestScore = score;
                bestSlot = slot;
            }
        }

        // A best score at or below the floor counts as "no answer".
        if (bestScore > MINIMUM_THRESHOLD) {
            return mCharSetProbers[bestSlot]->GetCharSetName();
        }
    }

    // Escape-sequence input, pure ASCII, or no sufficiently confident prober.
    return "UTF-8";
}
//---------------------------------------------------------------------
/**
 * Reports how confident the detector is in the answer given by
 * GetCharSetName().
 *
 * @return MINIMUM_THRESHOLD if no data has been fed yet; 0.99 if a charset
 *         name has been recorded; otherwise, when high bytes have been seen,
 *         the confidence of the best prober provided it is strictly above
 *         MINIMUM_THRESHOLD; MINIMUM_THRESHOLD in every other case.
 */
float nsUniversalDetector::GetConfidence()
{
    // Callers sometimes ask for a result before sending any data at all.
    if (!mGotData) {
        return MINIMUM_THRESHOLD;
    }

    if (mDetectedCharset != nullptr) {
        return 0.99f;
    }

    if (mInputState == eHighbyte) {
        // Pick the prober with the strictly highest confidence; ties keep
        // the earliest slot.
        unsigned int bestSlot = 0;
        float bestScore = 0.0f;
        for (unsigned int slot = 0; slot < NUM_OF_CHARSET_PROBERS; ++slot) {
            const float score = mCharSetProbers[slot]->GetConfidence();
            if (score > bestScore) {
                bestScore = score;
                bestSlot = slot;
            }
        }

        // A best score at or below the floor counts as "no answer".
        if (bestScore > MINIMUM_THRESHOLD) {
            return mCharSetProbers[bestSlot]->GetConfidence();
        }
    }

    // Escape-sequence input, pure ASCII, or no sufficiently confident prober.
    return MINIMUM_THRESHOLD;
}
/**
 * Reports whether detection has finished.
 *
 * @return eFoundIt once mDone has been set, eDetecting otherwise.
 */
nsProbingState nsUniversalDetector::GetState()
{
    return mDone ? eFoundIt : eDetecting;
}
}