mirror of
https://github.com/jellyfin/jellyfin.git
synced 2026-01-18 17:18:08 +00:00
remove trailing whitespace
This commit is contained in:
@@ -18,7 +18,7 @@ namespace NLangDetect.Core.Extensions
|
||||
if (end < 0) throw new ArgumentOutOfRangeException("end", "Argument must not be negative.");
|
||||
if (end > s.Length) throw new ArgumentOutOfRangeException("end", "Argument must not be greater than the input string's length.");
|
||||
if (start > end) throw new ArgumentOutOfRangeException("start", "Argument must not be greater than the 'end' argument.");
|
||||
|
||||
|
||||
return s.Substring(start, end - start);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
namespace Emby.Server.Implementations.TextEncoding
|
||||
{
|
||||
// Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com>
|
||||
//
|
||||
// https://www.autoitscript.com
|
||||
//
|
||||
// https://www.autoitscript.com
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
@@ -166,7 +166,7 @@
|
||||
return encoding;
|
||||
}
|
||||
|
||||
// Now try UTF16
|
||||
// Now try UTF16
|
||||
encoding = CheckUtf16NewlineChars(buffer, size);
|
||||
if (encoding != CharacterEncoding.None)
|
||||
{
|
||||
|
||||
@@ -41,7 +41,7 @@ using System.IO;
|
||||
namespace UniversalDetector
|
||||
{
|
||||
/// <summary>
|
||||
/// Default implementation of charset detection interface.
|
||||
/// Default implementation of charset detection interface.
|
||||
/// The detector can be fed by a System.IO.Stream:
|
||||
/// <example>
|
||||
/// <code>
|
||||
@@ -52,9 +52,9 @@ namespace UniversalDetector
|
||||
/// Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
|
||||
/// </code>
|
||||
/// </example>
|
||||
///
|
||||
///
|
||||
/// or by a byte array:
|
||||
///
|
||||
///
|
||||
/// <example>
|
||||
/// <code>
|
||||
/// byte[] buff = new byte[1024];
|
||||
@@ -64,23 +64,23 @@ namespace UniversalDetector
|
||||
/// cdet.DataEnd();
|
||||
/// Console.WriteLine("{0}, {1}", cdet.Charset, cdet.Confidence);
|
||||
/// </code>
|
||||
/// </example>
|
||||
/// </summary>
|
||||
/// </example>
|
||||
/// </summary>
|
||||
public class CharsetDetector : Core.UniversalDetector, ICharsetDetector
|
||||
{
|
||||
private string charset;
|
||||
|
||||
|
||||
private float confidence;
|
||||
|
||||
|
||||
//public event DetectorFinished Finished;
|
||||
|
||||
|
||||
/// <summary>
/// Creates a detector, passing FILTER_ALL to the base detector constructor.
/// </summary>
public CharsetDetector()
    : base(FILTER_ALL)
{
    // Nothing else to initialise here.
}
|
||||
|
||||
public void Feed(Stream stream)
|
||||
{
|
||||
{
|
||||
byte[] buff = new byte[1024];
|
||||
int read;
|
||||
while ((read = stream.Read(buff, 0, buff.Length)) > 0 && !done)
|
||||
@@ -88,19 +88,19 @@ namespace UniversalDetector
|
||||
Feed(buff, 0, read);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns the current value of the detector's <c>done</c> flag.
/// </summary>
public bool IsDone() => done;
|
||||
|
||||
|
||||
/// <summary>
/// Clears the stored charset name and confidence, then resets the base detector.
/// </summary>
public override void Reset()
{
    charset = null;
    confidence = 0.0f;

    // Base reset runs last, after the local result fields are cleared.
    base.Reset();
}
|
||||
|
||||
|
||||
/// <summary>
/// Gets the stored charset name (null after <see cref="Reset"/>).
/// </summary>
public string Charset => charset;
|
||||
@@ -108,7 +108,7 @@ namespace UniversalDetector
|
||||
/// <summary>
/// Gets the stored confidence value.
/// </summary>
public float Confidence => confidence;
|
||||
|
||||
|
||||
protected override void Report(string charset, float confidence)
|
||||
{
|
||||
this.charset = charset;
|
||||
@@ -118,7 +118,7 @@ namespace UniversalDetector
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//public delegate void DetectorFinished(string charset, float confidence);
|
||||
|
||||
}
|
||||
|
||||
@@ -44,12 +44,12 @@ namespace UniversalDetector.Core
|
||||
private CodingStateMachine codingSM;
|
||||
private BIG5DistributionAnalyser distributionAnalyser;
|
||||
private byte[] lastChar = new byte[2];
|
||||
|
||||
|
||||
/// <summary>
/// Builds the Big5 coding state machine and distribution analyser,
/// then puts the prober into its initial state via <see cref="Reset"/>.
/// </summary>
public Big5Prober()
{
    codingSM = new CodingStateMachine(new BIG5SMModel());
    distributionAnalyser = new BIG5DistributionAnalyser();
    Reset();
}
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
@@ -73,7 +73,7 @@ namespace UniversalDetector.Core
|
||||
lastChar[1] = buf[offset];
|
||||
distributionAnalyser.HandleOneChar(lastChar, 0, charLen);
|
||||
} else {
|
||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -84,23 +84,23 @@ namespace UniversalDetector.Core
|
||||
state = ProbingState.FoundIt;
|
||||
return state;
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
/// Resets the coding state machine, sets the state back to Detecting
/// and resets the distribution analyser.
/// </summary>
public override void Reset()
{
    this.codingSM.Reset();
    this.state = ProbingState.Detecting;
    this.distributionAnalyser.Reset();
}
|
||||
|
||||
|
||||
/// <summary>
/// Returns the charset name reported by this prober.
/// </summary>
/// <returns>The literal string "Big-5".</returns>
public override string GetCharsetName() => "Big-5";
|
||||
|
||||
|
||||
/// <summary>
/// Returns the confidence computed by the distribution analyser.
/// </summary>
public override float GetConfidence()
{
    float distributionConfidence = distributionAnalyser.GetConfidence();
    return distributionConfidence;
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,15 +43,15 @@ namespace UniversalDetector.Core
|
||||
// Layout values for tables of fixed-width units packed into 32-bit ints.
// Unpack combines them as:
//   (data[i >> indexShift] >> ((i & shiftMask) << bitShift)) & unitMask
// They are declared readonly so that shared layout values cannot be
// reassigned at runtime by any code that can see this class.

// Right shift that maps a unit index to the int holding it.
public static readonly int INDEX_SHIFT_4BITS = 3;
public static readonly int INDEX_SHIFT_8BITS = 2;
public static readonly int INDEX_SHIFT_16BITS = 1;

// Mask that extracts the unit's position inside its int.
public static readonly int SHIFT_MASK_4BITS = 7;
public static readonly int SHIFT_MASK_8BITS = 3;
public static readonly int SHIFT_MASK_16BITS = 1;

// Left shift that converts a position inside the int into a bit offset.
public static readonly int BIT_SHIFT_4BITS = 2;
public static readonly int BIT_SHIFT_8BITS = 3;
public static readonly int BIT_SHIFT_16BITS = 4;

// Mask that isolates a single unit once it is shifted into the low bits.
public static readonly int UNIT_MASK_4BITS = 0x0000000F;
public static readonly int UNIT_MASK_8BITS = 0x000000FF;
public static readonly int UNIT_MASK_16BITS = 0x0000FFFF;
|
||||
@@ -61,7 +61,7 @@ namespace UniversalDetector.Core
|
||||
private int bitShift;
|
||||
private int unitMask;
|
||||
private int[] data;
|
||||
|
||||
|
||||
public BitPackage(int indexShift, int shiftMask,
|
||||
int bitShift, int unitMask, int[] data)
|
||||
{
|
||||
@@ -71,27 +71,27 @@ namespace UniversalDetector.Core
|
||||
this.unitMask = unitMask;
|
||||
this.data = data;
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
/// Combines two values into one int: <paramref name="b"/> shifted left by
/// 16 bits, OR-ed with <paramref name="a"/>.
/// </summary>
/// <param name="a">Value placed in the low bits.</param>
/// <param name="b">Value shifted into the high bits.</param>
/// <returns>The packed int.</returns>
public static int Pack16bits(int a, int b)
{
    int upper = b << 16;
    return upper | a;
}
|
||||
|
||||
|
||||
/// <summary>
/// Packs four values into one int, lowest position first, by pairing them
/// into two halves and delegating to <see cref="Pack16bits"/>.
/// </summary>
/// <returns>The packed int.</returns>
public static int Pack8bits(int a, int b, int c, int d)
{
    int lowHalf = (b << 8) | a;
    int highHalf = (d << 8) | c;
    return Pack16bits(lowHalf, highHalf);
}
|
||||
|
||||
/// <summary>
/// Packs eight values into one int, lowest position first, by pairing
/// neighbours and delegating to <see cref="Pack8bits"/>.
/// </summary>
/// <returns>The packed int.</returns>
public static int Pack4bits(int a, int b, int c, int d,
                            int e, int f, int g, int h)
{
    int pair0 = (b << 4) | a;
    int pair1 = (d << 4) | c;
    int pair2 = (f << 4) | e;
    int pair3 = (h << 4) | g;
    return Pack8bits(pair0, pair1, pair2, pair3);
}
|
||||
|
||||
|
||||
/// <summary>
/// Reads the unit at index <paramref name="i"/> from the packed table.
/// </summary>
/// <param name="i">Index of the unit to read.</param>
/// <returns>The unit value, masked with the configured unit mask.</returns>
public int Unpack(int i)
{
    // Locate the int holding the unit, then its bit offset inside that int.
    int word = data[i >> indexShift];
    int offset = (i & shiftMask) << bitShift;
    return (word >> offset) & unitMask;
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -40,10 +40,10 @@ using System.IO;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
/// <summary>
/// States a prober can report.
/// </summary>
public enum ProbingState
{
    /// <summary>No sure answer yet, but caller can ask for confidence.</summary>
    Detecting = 0,

    /// <summary>Positive answer.</summary>
    FoundIt = 1,

    /// <summary>Negative answer.</summary>
    NotMe = 2
}
|
||||
|
||||
public abstract class CharsetProber
|
||||
@@ -51,16 +51,16 @@ namespace UniversalDetector.Core
|
||||
protected const float SHORTCUT_THRESHOLD = 0.95F;
|
||||
|
||||
protected ProbingState state;
|
||||
|
||||
|
||||
// ASCII codes
|
||||
private const byte SPACE = 0x20;
|
||||
private const byte CAPITAL_A = 0x41;
|
||||
private const byte CAPITAL_Z = 0x5A;
|
||||
private const byte SMALL_A = 0x61;
|
||||
private const byte SMALL_Z = 0x7A;
|
||||
private const byte LESS_THAN = 0x3C;
|
||||
private const byte LESS_THAN = 0x3C;
|
||||
private const byte GREATER_THAN = 0x3E;
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Feed data to the prober
|
||||
/// </summary>
|
||||
@@ -71,44 +71,44 @@ namespace UniversalDetector.Core
|
||||
/// A <see cref="ProbingState"/>
|
||||
/// </returns>
|
||||
public abstract ProbingState HandleData(byte[] buf, int offset, int len);
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Reset prober state
|
||||
/// </summary>
|
||||
public abstract void Reset();
|
||||
|
||||
public abstract string GetCharsetName();
|
||||
|
||||
|
||||
public abstract float GetConfidence();
|
||||
|
||||
|
||||
/// <summary>
/// Returns the prober's current <see cref="ProbingState"/>.
/// </summary>
public virtual ProbingState GetState() => state;
|
||||
|
||||
/// <summary>
/// Overridable hook; this base implementation does nothing.
/// </summary>
public virtual void SetOption()
{
    // Intentionally empty.
}
|
||||
|
||||
/// <summary>
/// Overridable hook; this base implementation does nothing.
/// </summary>
public virtual void DumpStatus()
{
    // Intentionally empty.
}
|
||||
|
||||
//
|
||||
// Helper functions used in the Latin1 and Group probers
|
||||
//
|
||||
/// <summary>
|
||||
///
|
||||
///
|
||||
/// </summary>
|
||||
/// <returns>filtered buffer</returns>
|
||||
protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int len)
|
||||
protected static byte[] FilterWithoutEnglishLetters(byte[] buf, int offset, int len)
|
||||
{
|
||||
byte[] result = null;
|
||||
|
||||
using (MemoryStream ms = new MemoryStream(buf.Length)) {
|
||||
|
||||
|
||||
bool meetMSB = false;
|
||||
int max = offset + len;
|
||||
int prev = offset;
|
||||
@@ -140,8 +140,8 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Do filtering to reduce load to probers (Remove ASCII symbols,
|
||||
/// collapse spaces). This filter applies to all scripts which contain
|
||||
/// Do filtering to reduce load to probers (Remove ASCII symbols,
|
||||
/// collapse spaces). This filter applies to all scripts which contain
|
||||
/// both English characters and upper ASCII characters.
|
||||
/// </summary>
|
||||
/// <returns>a filtered copy of the input buffer</returns>
|
||||
@@ -150,16 +150,16 @@ namespace UniversalDetector.Core
|
||||
byte[] result = null;
|
||||
|
||||
using (MemoryStream ms = new MemoryStream(buf.Length)) {
|
||||
|
||||
|
||||
bool inTag = false;
|
||||
int max = offset + len;
|
||||
int prev = offset;
|
||||
int cur = offset;
|
||||
|
||||
while (cur < max) {
|
||||
|
||||
|
||||
byte b = buf[cur];
|
||||
|
||||
|
||||
if (b == GREATER_THAN)
|
||||
inTag = false;
|
||||
else if (b == LESS_THAN)
|
||||
@@ -177,7 +177,7 @@ namespace UniversalDetector.Core
|
||||
cur++;
|
||||
}
|
||||
|
||||
// If the current segment contains more than just a symbol
|
||||
// If the current segment contains more than just a symbol
|
||||
// and it is not inside a tag then keep it.
|
||||
if (!inTag && cur > prev)
|
||||
ms.Write(buf, prev, cur - prev);
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
*
|
||||
* Contributor(s):
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -40,47 +40,47 @@ namespace UniversalDetector.Core
|
||||
public static class Charsets
|
||||
{
|
||||
public const string ASCII = "ASCII";
|
||||
|
||||
|
||||
public const string UTF8 = "UTF-8";
|
||||
|
||||
|
||||
public const string UTF16_LE = "UTF-16LE";
|
||||
|
||||
|
||||
public const string UTF16_BE = "UTF-16BE";
|
||||
|
||||
|
||||
public const string UTF32_BE = "UTF-32BE";
|
||||
|
||||
|
||||
public const string UTF32_LE = "UTF-32LE";
|
||||
|
||||
/// <summary>
|
||||
/// Unusual BOM (3412 order)
|
||||
/// </summary>
|
||||
public const string UCS4_3412 = "X-ISO-10646-UCS-4-3412";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Unusual BOM (2413 order)
|
||||
/// </summary>
|
||||
public const string UCS4_2413 = "X-ISO-10646-UCS-4-2413";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Cyrillic (based on bulgarian and russian data)
|
||||
/// </summary>
|
||||
public const string WIN1251 = "windows-1251";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Latin-1, almost identical to ISO-8859-1
|
||||
/// </summary>
|
||||
public const string WIN1252 = "windows-1252";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Greek
|
||||
/// </summary>
|
||||
public const string WIN1253 = "windows-1253";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Logical hebrew (includes ISO-8859-8-I and most of x-mac-hebrew)
|
||||
/// </summary>
|
||||
public const string WIN1255 = "windows-1255";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Traditional chinese
|
||||
/// </summary>
|
||||
@@ -89,7 +89,7 @@ namespace UniversalDetector.Core
|
||||
public const string EUCKR = "EUC-KR";
|
||||
|
||||
public const string EUCJP = "EUC-JP";
|
||||
|
||||
|
||||
public const string EUCTW = "EUC-TW";
|
||||
|
||||
/// <summary>
|
||||
@@ -98,11 +98,11 @@ namespace UniversalDetector.Core
|
||||
public const string GB18030 = "gb18030";
|
||||
|
||||
public const string ISO2022_JP = "ISO-2022-JP";
|
||||
|
||||
|
||||
public const string ISO2022_CN = "ISO-2022-CN";
|
||||
|
||||
|
||||
public const string ISO2022_KR = "ISO-2022-KR";
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Simplified chinese
|
||||
/// </summary>
|
||||
@@ -111,15 +111,15 @@ namespace UniversalDetector.Core
|
||||
public const string SHIFT_JIS = "Shift-JIS";
|
||||
|
||||
public const string MAC_CYRILLIC = "x-mac-cyrillic";
|
||||
|
||||
|
||||
public const string KOI8R = "KOI8-R";
|
||||
|
||||
|
||||
public const string IBM855 = "IBM855";
|
||||
|
||||
|
||||
public const string IBM866 = "IBM866";
|
||||
|
||||
/// <summary>
|
||||
/// East-Europe. Disabled because too similar to windows-1252
|
||||
/// East-Europe. Disabled because too similar to windows-1252
|
||||
/// (latin-1). Should use tri-grams models to discriminate between
|
||||
/// these two charsets.
|
||||
/// </summary>
|
||||
@@ -141,9 +141,9 @@ namespace UniversalDetector.Core
|
||||
public const string ISO8859_8 = "ISO-8859-8";
|
||||
|
||||
/// <summary>
|
||||
/// Thai. This recognizer is not enabled yet.
|
||||
/// Thai. This recognizer is not enabled yet.
|
||||
/// </summary>
|
||||
public const string TIS620 = "TIS620";
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Kohei TAKETA <k-tak@void.in> (Java port)
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -48,8 +48,8 @@ namespace UniversalDetector.Core
|
||||
private SMModel model;
|
||||
private int currentCharLen;
|
||||
private int currentBytePos;
|
||||
|
||||
public CodingStateMachine(SMModel model)
|
||||
|
||||
public CodingStateMachine(SMModel model)
|
||||
{
|
||||
this.currentState = SMModel.START;
|
||||
this.model = model;
|
||||
@@ -57,34 +57,34 @@ namespace UniversalDetector.Core
|
||||
|
||||
/// <summary>
/// Feeds one byte to the state machine and returns the resulting state.
/// </summary>
/// <param name="b">The next input byte.</param>
/// <returns>The state after consuming <paramref name="b"/>.</returns>
public int NextState(byte b)
{
    int cls = model.GetClass(b);

    // At the start state this byte is the first of a character, so its
    // class also determines the character length.
    bool atStart = currentState == SMModel.START;
    if (atStart)
    {
        currentBytePos = 0;
        currentCharLen = model.charLenTable[cls];
    }

    // The state table is indexed by (state, class).
    int tableIndex = currentState * model.ClassFactor + cls;
    currentState = model.stateTable.Unpack(tableIndex);
    currentBytePos += 1;
    return currentState;
}
|
||||
|
||||
/// <summary>
/// Puts the state machine back into <c>SMModel.START</c>.
/// </summary>
public void Reset()
{
    this.currentState = SMModel.START;
}
|
||||
|
||||
/// <summary>
/// Gets the character length recorded the last time a byte was consumed
/// from the start state.
/// </summary>
public int CurrentCharLen => currentCharLen;
|
||||
|
||||
/// <summary>
/// Gets the name of the underlying state machine model.
/// </summary>
public string ModelName => model.Name;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,25 +43,25 @@ namespace UniversalDetector.Core
|
||||
private EUCJPContextAnalyser contextAnalyser;
|
||||
private EUCJPDistributionAnalyser distributionAnalyser;
|
||||
private byte[] lastChar = new byte[2];
|
||||
|
||||
|
||||
/// <summary>
/// Builds the EUC-JP coding state machine plus the distribution and
/// context analysers, then calls <see cref="Reset"/>.
/// </summary>
public EUCJPProber()
{
    this.codingSM = new CodingStateMachine(new EUCJPSMModel());
    this.distributionAnalyser = new EUCJPDistributionAnalyser();
    this.contextAnalyser = new EUCJPContextAnalyser();
    this.Reset();
}
|
||||
|
||||
/// <summary>
/// Returns the charset name reported by this prober.
/// </summary>
/// <returns>The literal string "EUC-JP".</returns>
public override string GetCharsetName() => "EUC-JP";
|
||||
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
int codingState;
|
||||
int max = offset + len;
|
||||
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
@@ -83,7 +83,7 @@ namespace UniversalDetector.Core
|
||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
lastChar[0] = buf[max-1];
|
||||
if (state == ProbingState.Detecting)
|
||||
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
@@ -93,18 +93,18 @@ namespace UniversalDetector.Core
|
||||
|
||||
/// <summary>
/// Resets the coding state machine, sets the state back to Detecting
/// and resets both analysers.
/// </summary>
public override void Reset()
{
    this.codingSM.Reset();
    this.state = ProbingState.Detecting;
    this.contextAnalyser.Reset();
    this.distributionAnalyser.Reset();
}
|
||||
|
||||
|
||||
/// <summary>
/// Returns the context analyser's confidence when it is strictly greater
/// than the distribution analyser's; otherwise the distribution analyser's.
/// </summary>
public override float GetConfidence()
{
    float fromContext = contextAnalyser.GetConfidence();
    float fromDistribution = distributionAnalyser.GetConfidence();

    if (fromContext > fromDistribution)
    {
        return fromContext;
    }

    return fromDistribution;
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,15 +46,15 @@ namespace UniversalDetector.Core
|
||||
/// <summary>
/// Builds the EUC-KR coding state machine and distribution analyser,
/// then calls <see cref="Reset"/>.
/// </summary>
public EUCKRProber()
{
    this.codingSM = new CodingStateMachine(new EUCKRSMModel());
    this.distributionAnalyser = new EUCKRDistributionAnalyser();
    this.Reset();
}
|
||||
|
||||
|
||||
/// <summary>
/// Returns the charset name reported by this prober.
/// </summary>
/// <returns>The literal string "EUC-KR".</returns>
public override string GetCharsetName() => "EUC-KR";
|
||||
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
int codingState;
|
||||
@@ -81,12 +81,12 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
}
|
||||
lastChar[0] = buf[max-1];
|
||||
|
||||
|
||||
if (state == ProbingState.Detecting)
|
||||
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
state = ProbingState.FoundIt;
|
||||
return state;
|
||||
|
||||
|
||||
}
|
||||
|
||||
public override float GetConfidence()
|
||||
@@ -96,7 +96,7 @@ namespace UniversalDetector.Core
|
||||
|
||||
public override void Reset()
|
||||
{
|
||||
codingSM.Reset();
|
||||
codingSM.Reset();
|
||||
state = ProbingState.Detecting;
|
||||
distributionAnalyser.Reset();
|
||||
//mContextAnalyser.Reset();
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -50,7 +50,7 @@ namespace UniversalDetector.Core
|
||||
this.distributionAnalyser = new EUCTWDistributionAnalyser();
|
||||
this.Reset();
|
||||
}
|
||||
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
int codingState;
|
||||
@@ -77,21 +77,21 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
}
|
||||
lastChar[0] = buf[max-1];
|
||||
|
||||
|
||||
if (state == ProbingState.Detecting)
|
||||
if (distributionAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
state = ProbingState.FoundIt;
|
||||
return state;
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
/// Returns the charset name reported by this prober.
/// </summary>
/// <returns>The literal string "x-euc-tw".</returns>
public override string GetCharsetName() => "x-euc-tw";
|
||||
|
||||
|
||||
/// <summary>
/// Resets the coding state machine, sets the state back to Detecting
/// and resets the distribution analyser.
/// </summary>
public override void Reset()
{
    this.codingSM.Reset();
    this.state = ProbingState.Detecting;
    this.distributionAnalyser.Reset();
}
|
||||
@@ -100,7 +100,7 @@ namespace UniversalDetector.Core
|
||||
{
|
||||
return distributionAnalyser.GetConfidence();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,19 +41,19 @@ namespace UniversalDetector.Core
|
||||
{
|
||||
private const int CHARSETS_NUM = 4;
|
||||
private string detectedCharset;
|
||||
private CodingStateMachine[] codingSM;
|
||||
private CodingStateMachine[] codingSM;
|
||||
int activeSM;
|
||||
|
||||
/// <summary>
/// Creates one coding state machine per escape-based model
/// (HZ, ISO-2022-CN, ISO-2022-JP, ISO-2022-KR, in that slot order)
/// and then calls <see cref="Reset"/>.
/// </summary>
public EscCharsetProber()
{
    codingSM = new CodingStateMachine[CHARSETS_NUM]
    {
        new CodingStateMachine(new HZSMModel()),
        new CodingStateMachine(new ISO2022CNSMModel()),
        new CodingStateMachine(new ISO2022JPSMModel()),
        new CodingStateMachine(new ISO2022KRSMModel())
    };
    Reset();
}
|
||||
|
||||
|
||||
public override void Reset()
|
||||
{
|
||||
state = ProbingState.Detecting;
|
||||
@@ -66,7 +66,7 @@ namespace UniversalDetector.Core
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
int max = offset + len;
|
||||
|
||||
|
||||
for (int i = offset; i < max && state == ProbingState.Detecting; i++) {
|
||||
for (int j = activeSM - 1; j >= 0; j--) {
|
||||
// byte is fed to all active state machines
|
||||
@@ -94,12 +94,12 @@ namespace UniversalDetector.Core
|
||||
|
||||
/// <summary>
/// Returns the value currently held in <c>detectedCharset</c>.
/// </summary>
public override string GetCharsetName() => detectedCharset;
|
||||
|
||||
|
||||
/// <summary>
/// Returns this prober's fixed confidence value.
/// </summary>
/// <returns>Always 0.99.</returns>
public override float GetConfidence() => 0.99f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,59 +44,59 @@ namespace UniversalDetector.Core
|
||||
public class HZSMModel : SMModel
|
||||
{
|
||||
private readonly static int[] HZ_cls = {
|
||||
BitPackage.Pack4bits(1,0,0,0,0,0,0,0), // 00 - 07
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 28 - 2f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
BitPackage.Pack4bits(0,0,0,4,0,5,2,0), // 78 - 7f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 80 - 87
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 88 - 8f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 90 - 97
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 98 - 9f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // a0 - a7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // a8 - af
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b0 - b7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b8 - bf
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // c0 - c7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // c8 - cf
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // d0 - d7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // d8 - df
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // e0 - e7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // e8 - ef
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // f0 - f7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1) // f8 - ff
|
||||
BitPackage.Pack4bits(1,0,0,0,0,0,0,0), // 00 - 07
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 28 - 2f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 40 - 47
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
BitPackage.Pack4bits(0,0,0,4,0,5,2,0), // 78 - 7f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 80 - 87
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 88 - 8f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 90 - 97
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // 98 - 9f
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // a0 - a7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // a8 - af
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b0 - b7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // b8 - bf
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // c0 - c7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // c8 - cf
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // d0 - d7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // d8 - df
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // e0 - e7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // e8 - ef
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1), // f0 - f7
|
||||
BitPackage.Pack4bits(1,1,1,1,1,1,1,1) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] HZ_st = {
|
||||
BitPackage.Pack4bits(START, ERROR, 3, START, START, START, ERROR, ERROR),//00-07
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME),//08-0f
|
||||
BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, START, START, 4, ERROR),//10-17
|
||||
BitPackage.Pack4bits( 5, ERROR, 6, ERROR, 5, 5, 4, ERROR),//18-1f
|
||||
BitPackage.Pack4bits( 4, ERROR, 4, 4, 4, ERROR, 4, ERROR),//20-27
|
||||
BitPackage.Pack4bits( 4, ITSME, START, START, START, START, START, START) //28-2f
|
||||
BitPackage.Pack4bits(START, ERROR, 3, START, START, START, ERROR, ERROR),//00-07
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR, ERROR, ITSME, ITSME, ITSME, ITSME),//08-0f
|
||||
BitPackage.Pack4bits(ITSME, ITSME, ERROR, ERROR, START, START, 4, ERROR),//10-17
|
||||
BitPackage.Pack4bits( 5, ERROR, 6, ERROR, 5, 5, 4, ERROR),//18-1f
|
||||
BitPackage.Pack4bits( 4, ERROR, 4, 4, 4, ERROR, 4, ERROR),//20-27
|
||||
BitPackage.Pack4bits( 4, ITSME, START, START, START, START, START, START) //28-2f
|
||||
};
|
||||
|
||||
private readonly static int[] HZCharLenTable = {0, 0, 0, 0, 0, 0};
|
||||
|
||||
|
||||
public HZSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, HZ_cls),
|
||||
6,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, HZ_st),
|
||||
HZCharLenTable, "HZ-GB-2312")
|
||||
@@ -104,65 +104,65 @@ namespace UniversalDetector.Core
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class ISO2022CNSMModel : SMModel
|
||||
{
|
||||
private readonly static int[] ISO2022CN_cls = {
|
||||
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
|
||||
BitPackage.Pack4bits(0,3,0,0,0,0,0,0), // 28 - 2f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
BitPackage.Pack4bits(0,0,0,4,0,0,0,0), // 40 - 47
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
|
||||
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 20 - 27
|
||||
BitPackage.Pack4bits(0,3,0,0,0,0,0,0), // 28 - 2f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
BitPackage.Pack4bits(0,0,0,4,0,0,0,0), // 40 - 47
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] ISO2022CN_st = {
|
||||
BitPackage.Pack4bits(START, 3,ERROR,START,START,START,START,START),//00-07
|
||||
BitPackage.Pack4bits(START,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//08-0f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME),//10-17
|
||||
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR),//18-1f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//20-27
|
||||
BitPackage.Pack4bits( 5, 6,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//28-2f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//30-37
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f
|
||||
BitPackage.Pack4bits(START, 3,ERROR,START,START,START,START,START),//00-07
|
||||
BitPackage.Pack4bits(START,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//08-0f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ITSME,ITSME,ITSME,ITSME,ITSME,ITSME),//10-17
|
||||
BitPackage.Pack4bits(ITSME,ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR),//18-1f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//20-27
|
||||
BitPackage.Pack4bits( 5, 6,ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//28-2f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//30-37
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ERROR,ITSME,ERROR,START) //38-3f
|
||||
};
|
||||
|
||||
private readonly static int[] ISO2022CNCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
public ISO2022CNSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, ISO2022CN_cls),
|
||||
9,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, ISO2022CN_st),
|
||||
ISO2022CNCharLenTable, "ISO-2022-CN")
|
||||
@@ -170,130 +170,130 @@ namespace UniversalDetector.Core
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class ISO2022JPSMModel : SMModel
|
||||
{
|
||||
private readonly static int[] ISO2022JP_cls = {
|
||||
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,2,2), // 08 - 0f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
|
||||
BitPackage.Pack4bits(0,0,0,0,7,0,0,0), // 20 - 27
|
||||
BitPackage.Pack4bits(3,0,0,0,0,0,0,0), // 28 - 2f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
BitPackage.Pack4bits(6,0,4,0,8,0,0,0), // 40 - 47
|
||||
BitPackage.Pack4bits(0,9,5,0,0,0,0,0), // 48 - 4f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
|
||||
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,2,2), // 08 - 0f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
|
||||
BitPackage.Pack4bits(0,0,0,0,7,0,0,0), // 20 - 27
|
||||
BitPackage.Pack4bits(3,0,0,0,0,0,0,0), // 28 - 2f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
BitPackage.Pack4bits(6,0,4,0,8,0,0,0), // 40 - 47
|
||||
BitPackage.Pack4bits(0,9,5,0,0,0,0,0), // 48 - 4f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] ISO2022JP_st = {
|
||||
BitPackage.Pack4bits(START, 3, ERROR,START,START,START,START,START),//00-07
|
||||
BitPackage.Pack4bits(START, START, ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//08-0f
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//10-17
|
||||
BitPackage.Pack4bits(ITSME, ITSME, ITSME,ITSME,ITSME,ITSME,ERROR,ERROR),//18-1f
|
||||
BitPackage.Pack4bits(ERROR, 5, ERROR,ERROR,ERROR, 4,ERROR,ERROR),//20-27
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR, 6,ITSME,ERROR,ITSME,ERROR),//28-2f
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//30-37
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//38-3f
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47
|
||||
BitPackage.Pack4bits(START, 3, ERROR,START,START,START,START,START),//00-07
|
||||
BitPackage.Pack4bits(START, START, ERROR,ERROR,ERROR,ERROR,ERROR,ERROR),//08-0f
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//10-17
|
||||
BitPackage.Pack4bits(ITSME, ITSME, ITSME,ITSME,ITSME,ITSME,ERROR,ERROR),//18-1f
|
||||
BitPackage.Pack4bits(ERROR, 5, ERROR,ERROR,ERROR, 4,ERROR,ERROR),//20-27
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR, 6,ITSME,ERROR,ITSME,ERROR),//28-2f
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ERROR,ERROR,ITSME,ITSME),//30-37
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ITSME,ERROR,ERROR,ERROR,ERROR),//38-3f
|
||||
BitPackage.Pack4bits(ERROR, ERROR, ERROR,ERROR,ITSME,ERROR,START,START) //40-47
|
||||
};
|
||||
|
||||
private readonly static int[] ISO2022JPCharLenTable = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
public ISO2022JPSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, ISO2022JP_cls),
|
||||
10,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, ISO2022JP_st),
|
||||
ISO2022JPCharLenTable, "ISO-2022-JP")
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
public class ISO2022KRSMModel : SMModel
|
||||
{
|
||||
{
|
||||
private readonly static int[] ISO2022KR_cls = {
|
||||
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
|
||||
BitPackage.Pack4bits(0,0,0,0,3,0,0,0), // 20 - 27
|
||||
BitPackage.Pack4bits(0,4,0,0,0,0,0,0), // 28 - 2f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
BitPackage.Pack4bits(0,0,0,5,0,0,0,0), // 40 - 47
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
|
||||
BitPackage.Pack4bits(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
BitPackage.Pack4bits(0,0,0,1,0,0,0,0), // 18 - 1f
|
||||
BitPackage.Pack4bits(0,0,0,0,3,0,0,0), // 20 - 27
|
||||
BitPackage.Pack4bits(0,4,0,0,0,0,0,0), // 28 - 2f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
BitPackage.Pack4bits(0,0,0,5,0,0,0,0), // 40 - 47
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
BitPackage.Pack4bits(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 80 - 87
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 88 - 8f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 90 - 97
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // 98 - 9f
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a0 - a7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // a8 - af
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b0 - b7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // b8 - bf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c0 - c7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // c8 - cf
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d0 - d7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // d8 - df
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e0 - e7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // e8 - ef
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2), // f0 - f7
|
||||
BitPackage.Pack4bits(2,2,2,2,2,2,2,2) // f8 - ff
|
||||
};
|
||||
|
||||
private readonly static int[] ISO2022KR_st = {
|
||||
BitPackage.Pack4bits(START, 3,ERROR,START,START,START,ERROR,ERROR),//00-07
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
|
||||
BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR,ERROR),//10-17
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR, 5,ERROR,ERROR,ERROR),//18-1f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27
|
||||
BitPackage.Pack4bits(START, 3,ERROR,START,START,START,ERROR,ERROR),//00-07
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR,ITSME,ITSME,ITSME,ITSME),//08-0f
|
||||
BitPackage.Pack4bits(ITSME,ITSME,ERROR,ERROR,ERROR, 4,ERROR,ERROR),//10-17
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ERROR, 5,ERROR,ERROR,ERROR),//18-1f
|
||||
BitPackage.Pack4bits(ERROR,ERROR,ERROR,ITSME,START,START,START,START) //20-27
|
||||
};
|
||||
|
||||
private readonly static int[] ISO2022KRCharLenTable = {0, 0, 0, 0, 0, 0};
|
||||
|
||||
public ISO2022KRSMModel() : base(
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, ISO2022KR_cls),
|
||||
6,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
new BitPackage(BitPackage.INDEX_SHIFT_4BITS,
|
||||
BitPackage.SHIFT_MASK_4BITS,
|
||||
BitPackage.BIT_SHIFT_4BITS,
|
||||
BitPackage.UNIT_MASK_4BITS, ISO2022KR_st),
|
||||
ISO2022KRCharLenTable, "ISO-2022-KR")
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -38,7 +38,7 @@
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
// We use gb18030 to replace gb2312, because 18030 is a superset.
|
||||
// We use gb18030 to replace gb2312, because 18030 is a superset.
|
||||
public class GB18030Prober : CharsetProber
|
||||
{
|
||||
private CodingStateMachine codingSM;
|
||||
@@ -52,18 +52,18 @@ namespace UniversalDetector.Core
|
||||
analyser = new GB18030DistributionAnalyser();
|
||||
Reset();
|
||||
}
|
||||
|
||||
|
||||
public override string GetCharsetName()
|
||||
{
|
||||
return "gb18030";
|
||||
return "gb18030";
|
||||
}
|
||||
|
||||
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
int codingState = SMModel.START;
|
||||
int max = offset + len;
|
||||
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
@@ -91,18 +91,18 @@ namespace UniversalDetector.Core
|
||||
if (analyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
state = ProbingState.FoundIt;
|
||||
}
|
||||
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
|
||||
public override float GetConfidence()
|
||||
{
|
||||
return analyser.GetConfidence();
|
||||
}
|
||||
|
||||
|
||||
public override void Reset()
|
||||
{
|
||||
codingSM.Reset();
|
||||
codingSM.Reset();
|
||||
state = ProbingState.Detecting;
|
||||
analyser.Reset();
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -43,40 +43,40 @@ using System;
|
||||
*
|
||||
* Four main charsets exist in Hebrew:
|
||||
* "ISO-8859-8" - Visual Hebrew
|
||||
* "windows-1255" - Logical Hebrew
|
||||
* "windows-1255" - Logical Hebrew
|
||||
* "ISO-8859-8-I" - Logical Hebrew
|
||||
* "x-mac-hebrew" - ?? Logical Hebrew ??
|
||||
*
|
||||
* Both "ISO" charsets use a completely identical set of code points, whereas
|
||||
* "windows-1255" and "x-mac-hebrew" are two different proper supersets of
|
||||
* "windows-1255" and "x-mac-hebrew" are two different proper supersets of
|
||||
* these code points. windows-1255 defines additional characters in the range
|
||||
* 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
|
||||
* 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
|
||||
* diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
|
||||
* x-mac-hebrew defines similar additional code points but with a different
|
||||
* x-mac-hebrew defines similar additional code points but with a different
|
||||
* mapping.
|
||||
*
|
||||
* As far as an average Hebrew text with no diacritics is concerned, all four
|
||||
* charsets are identical with respect to code points. Meaning that for the
|
||||
* main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
|
||||
* As far as an average Hebrew text with no diacritics is concerned, all four
|
||||
* charsets are identical with respect to code points. Meaning that for the
|
||||
* main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
|
||||
* (including final letters).
|
||||
*
|
||||
* The dominant difference between these charsets is their directionality.
|
||||
* "Visual" directionality means that the text is ordered as if the renderer is
|
||||
* not aware of a BIDI rendering algorithm. The renderer sees the text and
|
||||
* draws it from left to right. The text itself when ordered naturally is read
|
||||
* not aware of a BIDI rendering algorithm. The renderer sees the text and
|
||||
* draws it from left to right. The text itself when ordered naturally is read
|
||||
* backwards. A buffer of Visual Hebrew generally looks like so:
|
||||
* "[last word of first line spelled backwards] [whole line ordered backwards
|
||||
* and spelled backwards] [first word of first line spelled backwards]
|
||||
* and spelled backwards] [first word of first line spelled backwards]
|
||||
* [end of line] [last word of second line] ... etc' "
|
||||
* adding punctuation marks, numbers and English text to visual text is
|
||||
* naturally also "visual" and from left to right.
|
||||
*
|
||||
*
|
||||
* "Logical" directionality means the text is ordered "naturally" according to
|
||||
* the order it is read. It is the responsibility of the renderer to display
|
||||
* the text from right to left. A BIDI algorithm is used to place general
|
||||
* the order it is read. It is the responsibility of the renderer to display
|
||||
* the text from right to left. A BIDI algorithm is used to place general
|
||||
* punctuation marks, numbers and English text in the text.
|
||||
*
|
||||
* Texts in x-mac-hebrew are almost impossible to find on the Internet. From
|
||||
* Texts in x-mac-hebrew are almost impossible to find on the Internet. From
|
||||
* what little evidence I could find, it seems that its general directionality
|
||||
* is Logical.
|
||||
*
|
||||
@@ -84,17 +84,17 @@ using System;
|
||||
* charsets:
|
||||
* Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
|
||||
* backwards while line order is natural. For charset recognition purposes
|
||||
* the line order is unimportant (In fact, for this implementation, even
|
||||
* the line order is unimportant (In fact, for this implementation, even
|
||||
* word order is unimportant).
|
||||
* Logical Hebrew - "windows-1255" - normal, naturally ordered text.
|
||||
*
|
||||
* "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
|
||||
* "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
|
||||
* specifically identified.
|
||||
* "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
|
||||
* that contains special punctuation marks or diacritics is displayed with
|
||||
* some unconverted characters showing as question marks. This problem might
|
||||
* be corrected using another model prober for x-mac-hebrew. Due to the fact
|
||||
* that x-mac-hebrew texts are so rare, writing another model prober isn't
|
||||
* that x-mac-hebrew texts are so rare, writing another model prober isn't
|
||||
* worth the effort and performance hit.
|
||||
*
|
||||
* *** The Prober ***
|
||||
@@ -136,7 +136,7 @@ using System;
|
||||
*/
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// This prober doesn't actually recognize a language or a charset.
|
||||
/// It is a helper prober for the use of the Hebrew model probers
|
||||
@@ -165,49 +165,49 @@ namespace UniversalDetector.Core
|
||||
|
||||
protected const string VISUAL_HEBREW_NAME = "ISO-8859-8";
|
||||
protected const string LOGICAL_HEBREW_NAME = "windows-1255";
|
||||
|
||||
|
||||
// owned by the group prober.
|
||||
protected CharsetProber logicalProber, visualProber;
|
||||
protected int finalCharLogicalScore, finalCharVisualScore;
|
||||
|
||||
protected int finalCharLogicalScore, finalCharVisualScore;
|
||||
|
||||
// The two last bytes seen in the previous buffer.
|
||||
protected byte prev, beforePrev;
|
||||
|
||||
|
||||
public HebrewProber()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
|
||||
public void SetModelProbers(CharsetProber logical, CharsetProber visual)
|
||||
{
|
||||
logicalProber = logical;
|
||||
visualProber = visual;
|
||||
|
||||
public void SetModelProbers(CharsetProber logical, CharsetProber visual)
|
||||
{
|
||||
logicalProber = logical;
|
||||
visualProber = visual;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
/**
|
||||
* Final letter analysis for logical-visual decision.
|
||||
* Look for evidence that the received buffer is either logical Hebrew or
|
||||
* Look for evidence that the received buffer is either logical Hebrew or
|
||||
* visual Hebrew.
|
||||
* The following cases are checked:
|
||||
* 1) A word longer than 1 letter, ending with a final letter. This is an
|
||||
* indication that the text is laid out "naturally" since the final letter
|
||||
* 1) A word longer than 1 letter, ending with a final letter. This is an
|
||||
* indication that the text is laid out "naturally" since the final letter
|
||||
* really appears at the end. +1 for logical score.
|
||||
* 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
|
||||
* Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
|
||||
* the Non-Final form of that letter. Exceptions to this rule are mentioned
|
||||
* in IsNonFinal(). This is an indication that the text is laid out
|
||||
* backwards. +1 for visual score
|
||||
* 3) A word longer than 1 letter, starting with a final letter. Final letters
|
||||
* should not appear at the beginning of a word. This is an indication that
|
||||
* 3) A word longer than 1 letter, starting with a final letter. Final letters
|
||||
* should not appear at the beginning of a word. This is an indication that
|
||||
* the text is laid out backwards. +1 for visual score.
|
||||
*
|
||||
* The visual score and logical score are accumulated throughout the text and
|
||||
* The visual score and logical score are accumulated throughout the text and
|
||||
* are finally checked against each other in GetCharsetName().
|
||||
* No checking for final letters in the middle of words is done since that case
|
||||
* is not an indication for either Logical or Visual text.
|
||||
*
|
||||
* The input buffer should not contain any white spaces that are not (' ')
|
||||
* or any low-ascii punctuation marks.
|
||||
* or any low-ascii punctuation marks.
|
||||
*/
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
@@ -218,31 +218,31 @@ namespace UniversalDetector.Core
|
||||
int max = offset + len;
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
|
||||
|
||||
byte b = buf[i];
|
||||
|
||||
|
||||
// a word just ended
|
||||
if (b == 0x20) {
|
||||
// beforePrev was not a space, so prev is not a 1-letter word
|
||||
if (beforePrev != 0x20) {
|
||||
// case (1) [-2:not space][-1:final letter][cur:space]
|
||||
if (IsFinal(prev))
|
||||
if (IsFinal(prev))
|
||||
finalCharLogicalScore++;
|
||||
// case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
||||
// case (2) [-2:not space][-1:Non-Final letter][cur:space]
|
||||
else if (IsNonFinal(prev))
|
||||
finalCharVisualScore++;
|
||||
}
|
||||
|
||||
|
||||
} else {
|
||||
// case (3) [-2:space][-1:final letter][cur:not space]
|
||||
if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' '))
|
||||
if ((beforePrev == 0x20) && (IsFinal(prev)) && (b != ' '))
|
||||
++finalCharVisualScore;
|
||||
}
|
||||
beforePrev = prev;
|
||||
prev = b;
|
||||
}
|
||||
|
||||
// Forever detecting, till the end or until both model probers
|
||||
// Forever detecting, till the end or until both model probers
|
||||
// return NotMe (handled above).
|
||||
return ProbingState.Detecting;
|
||||
}
|
||||
@@ -252,7 +252,7 @@ namespace UniversalDetector.Core
|
||||
{
|
||||
// If the final letter score distance is dominant enough, rely on it.
|
||||
int finalsub = finalCharLogicalScore - finalCharVisualScore;
|
||||
if (finalsub >= MIN_FINAL_CHAR_DISTANCE)
|
||||
if (finalsub >= MIN_FINAL_CHAR_DISTANCE)
|
||||
return LOGICAL_HEBREW_NAME;
|
||||
if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE))
|
||||
return VISUAL_HEBREW_NAME;
|
||||
@@ -263,9 +263,9 @@ namespace UniversalDetector.Core
|
||||
return LOGICAL_HEBREW_NAME;
|
||||
if (modelsub < -(MIN_MODEL_DISTANCE))
|
||||
return VISUAL_HEBREW_NAME;
|
||||
|
||||
|
||||
// Still no good, back to final letter distance, maybe it'll save the day.
|
||||
if (finalsub < 0)
|
||||
if (finalsub < 0)
|
||||
return VISUAL_HEBREW_NAME;
|
||||
|
||||
// (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
|
||||
@@ -280,10 +280,10 @@ namespace UniversalDetector.Core
|
||||
beforePrev = 0x20;
|
||||
}
|
||||
|
||||
public override ProbingState GetState()
|
||||
public override ProbingState GetState()
|
||||
{
|
||||
// Remain active as long as any of the model probers are active.
|
||||
if (logicalProber.GetState() == ProbingState.NotMe &&
|
||||
if (logicalProber.GetState() == ProbingState.NotMe &&
|
||||
visualProber.GetState() == ProbingState.NotMe)
|
||||
return ProbingState.NotMe;
|
||||
return ProbingState.Detecting;
|
||||
@@ -293,31 +293,31 @@ namespace UniversalDetector.Core
|
||||
{
|
||||
//Console.WriteLine(" HEB: {0} - {1} [Logical-Visual score]", finalCharLogicalScore, finalCharVisualScore);
|
||||
}
|
||||
|
||||
|
||||
public override float GetConfidence()
|
||||
{
|
||||
{
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
|
||||
protected static bool IsFinal(byte b)
|
||||
{
|
||||
return (b == FINAL_KAF || b == FINAL_MEM || b == FINAL_NUN
|
||||
|| b == FINAL_PE || b == FINAL_TSADI);
|
||||
return (b == FINAL_KAF || b == FINAL_MEM || b == FINAL_NUN
|
||||
|| b == FINAL_PE || b == FINAL_TSADI);
|
||||
}
|
||||
|
||||
|
||||
protected static bool IsNonFinal(byte b)
|
||||
{
|
||||
// The normal Tsadi is not a good Non-Final letter due to words like
|
||||
// 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||
// apostrophe is converted to a space in FilterWithoutEnglishLetters causing
|
||||
// the Non-Final tsadi to appear at an end of a word even though this is not
|
||||
// The normal Tsadi is not a good Non-Final letter due to words like
|
||||
// 'lechotet' (to chat) containing an apostrophe after the tsadi. This
|
||||
// apostrophe is converted to a space in FilterWithoutEnglishLetters causing
|
||||
// the Non-Final tsadi to appear at an end of a word even though this is not
|
||||
// the case in the original text.
|
||||
// The letters Pe and Kaf rarely display a related behavior of not being a
|
||||
// good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
|
||||
// example legally end with a Non-Final Pe or Kaf. However, the benefit of
|
||||
// these letters as Non-Final letters outweighs the damage since these words
|
||||
// are quite rare.
|
||||
return (b == NORMAL_KAF || b == NORMAL_MEM || b == NORMAL_NUN
|
||||
// The letters Pe and Kaf rarely display a related behavior of not being a
|
||||
// good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
|
||||
// example legally end with a Non-Final Pe or Kaf. However, the benefit of
|
||||
// these letters as Non-Final letters outweighs the damage since these words
|
||||
// are quite rare.
|
||||
return (b == NORMAL_KAF || b == NORMAL_MEM || b == NORMAL_NUN
|
||||
|| b == NORMAL_PE);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -48,7 +48,7 @@ namespace UniversalDetector.Core
|
||||
|
||||
// hiragana frequency category table
|
||||
// This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||
protected static byte[,] jp2CharContext = {
|
||||
protected static byte[,] jp2CharContext = {
|
||||
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
|
||||
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
|
||||
{ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,},
|
||||
@@ -133,35 +133,35 @@ namespace UniversalDetector.Core
|
||||
{ 0,4,0,4,0,4,0,3,0,4,4,3,4,2,4,3,2,0,4,4,4,3,5,3,5,3,3,2,4,2,4,3,4,3,1,4,0,2,3,4,4,4,3,3,3,4,4,4,3,4,1,3,4,3,2,1,2,1,3,3,3,4,4,3,3,5,0,4,0,3,0,4,3,3,3,2,1,0,3,0,0,3,3,},
|
||||
{ 0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1,},
|
||||
};
|
||||
|
||||
|
||||
// category counters, each integer counts sequence in its category
|
||||
int[] relSample = new int[CATEGORIES_NUM];
|
||||
|
||||
// total sequence received
|
||||
int totalRel;
|
||||
|
||||
|
||||
// The order of previous char
|
||||
int lastCharOrder;
|
||||
|
||||
// if last byte in current buffer is not the last byte of a character,
|
||||
// if last byte in current buffer is not the last byte of a character,
|
||||
// we need to know how many bytes to skip in the next buffer.
|
||||
int needToSkipCharNum;
|
||||
|
||||
// If this flag is set to true, detection is done and conclusion has
|
||||
// If this flag is set to true, detection is done and conclusion has
|
||||
// been made
|
||||
bool done;
|
||||
|
||||
|
||||
public JapaneseContextAnalyser()
|
||||
{
|
||||
Reset();
|
||||
Reset();
|
||||
}
|
||||
|
||||
|
||||
public float GetConfidence()
|
||||
{
|
||||
// This is just one way to calculate confidence. It works well for me.
|
||||
if (totalRel > MINIMUM_DATA_THRESHOLD)
|
||||
return ((float)(totalRel - relSample[0]))/totalRel;
|
||||
else
|
||||
else
|
||||
return DONT_KNOW;
|
||||
}
|
||||
|
||||
@@ -170,15 +170,15 @@ namespace UniversalDetector.Core
|
||||
|
||||
int charLen = 0;
|
||||
int max = offset + len;
|
||||
|
||||
|
||||
if (done)
|
||||
return;
|
||||
|
||||
// The buffer we got is byte oriented, and a character may span
|
||||
// The buffer we got is byte oriented, and a character may span
|
||||
// more than one buffer. In case the last one or two byte in last
|
||||
// buffer is not complete, we record how many byte needed to
|
||||
// buffer is not complete, we record how many byte needed to
|
||||
// complete that character and skip these bytes here. We can choose
|
||||
// to record those bytes as well and analyse the character once it
|
||||
// to record those bytes as well and analyse the character once it
|
||||
// is complete, but since a character will not make much difference,
|
||||
// skipping it will simplify our logic and improve performance.
|
||||
for (int i = needToSkipCharNum+offset; i < max; ) {
|
||||
@@ -200,14 +200,14 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void HandleOneChar(byte[] buf, int offset, int charLen)
|
||||
{
|
||||
if (totalRel > MAX_REL_THRESHOLD)
|
||||
if (totalRel > MAX_REL_THRESHOLD)
|
||||
done = true;
|
||||
if (done)
|
||||
if (done)
|
||||
return;
|
||||
|
||||
|
||||
// Only 2-bytes characters are of our interest
|
||||
int order = (charLen == 2) ? GetOrder(buf, offset) : -1;
|
||||
if (order != -1 && lastCharOrder != -1) {
|
||||
@@ -217,7 +217,7 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
lastCharOrder = order;
|
||||
}
|
||||
|
||||
|
||||
public void Reset()
|
||||
{
|
||||
totalRel = 0;
|
||||
@@ -228,18 +228,18 @@ namespace UniversalDetector.Core
|
||||
done = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
protected abstract int GetOrder(byte[] buf, int offset, out int charLen);
|
||||
|
||||
|
||||
protected abstract int GetOrder(byte[] buf, int offset);
|
||||
|
||||
public bool GotEnoughData()
|
||||
|
||||
public bool GotEnoughData()
|
||||
{
|
||||
return totalRel > ENOUGH_REL_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
public class SJISContextAnalyser : JapaneseContextAnalyser
|
||||
{
|
||||
private const byte HIRAGANA_FIRST_BYTE = 0x82;
|
||||
@@ -247,10 +247,10 @@ namespace UniversalDetector.Core
|
||||
protected override int GetOrder(byte[] buf, int offset, out int charLen)
|
||||
{
|
||||
//find out current char's byte length
|
||||
if (buf[offset] >= 0x81 && buf[offset] <= 0x9F
|
||||
if (buf[offset] >= 0x81 && buf[offset] <= 0x9F
|
||||
|| buf[offset] >= 0xe0 && buf[offset] <= 0xFC)
|
||||
charLen = 2;
|
||||
else
|
||||
else
|
||||
charLen = 1;
|
||||
|
||||
// return its order if it is hiragana
|
||||
@@ -259,7 +259,7 @@ namespace UniversalDetector.Core
|
||||
if (low >= 0x9F && low <= 0xF1)
|
||||
return low - 0x9F;
|
||||
}
|
||||
return -1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
protected override int GetOrder(byte[] buf, int offset)
|
||||
@@ -274,15 +274,15 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public class EUCJPContextAnalyser : JapaneseContextAnalyser
|
||||
{
|
||||
private const byte HIRAGANA_FIRST_BYTE = 0xA4;
|
||||
|
||||
|
||||
protected override int GetOrder(byte[] buf, int offset, out int charLen)
|
||||
{
|
||||
byte high = buf[offset];
|
||||
|
||||
|
||||
//find out current char's byte length
|
||||
if (high == 0x8E || high >= 0xA1 && high <= 0xFE)
|
||||
charLen = 2;
|
||||
@@ -297,9 +297,9 @@ namespace UniversalDetector.Core
|
||||
if (low >= 0xA1 && low <= 0xF3)
|
||||
return low - 0xA1;
|
||||
}
|
||||
return -1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
protected override int GetOrder(byte[] buf, int offset)
|
||||
{
|
||||
// We are only interested in Hiragana
|
||||
@@ -309,7 +309,7 @@ namespace UniversalDetector.Core
|
||||
return low - 0xA1;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -37,15 +37,15 @@
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
{
|
||||
public abstract class BulgarianModel : SequenceModel
|
||||
{
|
||||
//Model Table:
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 96.9392%
|
||||
//first 1024 sequences:3.0618%
|
||||
//rest sequences: 0.2992%
|
||||
//negative sequences: 0.0020%
|
||||
//negative sequences: 0.0020%
|
||||
private static byte[] BULGARIAN_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||
@@ -175,15 +175,15 @@ namespace UniversalDetector.Core
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
|
||||
|
||||
};
|
||||
|
||||
public BulgarianModel(byte[] charToOrderMap, string name)
|
||||
public BulgarianModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, BULGARIAN_LANG_MODEL, 0.969392f, false, name)
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Latin5BulgarianModel : BulgarianModel
|
||||
{
|
||||
//255: Control characters that usually does not exist in any text
|
||||
@@ -191,7 +191,7 @@ namespace UniversalDetector.Core
|
||||
//253: symbol (punctuation) that does not belong to word
|
||||
//252: 0 - 9
|
||||
// Character Mapping Table:
|
||||
// this table is modified base on win1251BulgarianCharToOrderMap, so
|
||||
// this table is modified base on win1251BulgarianCharToOrderMap, so
|
||||
// only number <64 is sure valid
|
||||
private static byte[] LATIN5_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
@@ -209,14 +209,14 @@ namespace UniversalDetector.Core
|
||||
39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, //c0
|
||||
1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //d0
|
||||
7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, //e0
|
||||
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
|
||||
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
|
||||
};
|
||||
|
||||
|
||||
public Latin5BulgarianModel() : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Win1251BulgarianModel : BulgarianModel
|
||||
{
|
||||
private static byte[] WIN1251__CHAR_TO_ORDER_MAP = {
|
||||
@@ -236,8 +236,8 @@ namespace UniversalDetector.Core
|
||||
39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,252, 60, 56, //d0
|
||||
1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //e0
|
||||
7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,253, 42, 16, //f0
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
public Win1251BulgarianModel() : base(WIN1251__CHAR_TO_ORDER_MAP, "windows-1251")
|
||||
{
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -40,12 +40,12 @@ namespace UniversalDetector.Core
|
||||
{
|
||||
public abstract class CyrillicModel : SequenceModel
|
||||
{
|
||||
// Model Table:
|
||||
// Model Table:
|
||||
// total sequences: 100%
|
||||
// first 512 sequences: 97.6601%
|
||||
// first 1024 sequences: 2.3389%
|
||||
// rest sequences: 0.1237%
|
||||
// negative sequences: 0.0009%
|
||||
// negative sequences: 0.0009%
|
||||
protected readonly static byte[] RUSSIAN_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||
@@ -176,13 +176,13 @@ namespace UniversalDetector.Core
|
||||
0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
|
||||
};
|
||||
|
||||
public CyrillicModel(byte[] charToOrderMap, string name)
|
||||
|
||||
public CyrillicModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, RUSSIAN_LANG_MODEL, 0.976601f, false, name)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Koi8rModel : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] KOI8R_CHAR_TO_ORDER_MAP = {
|
||||
@@ -203,12 +203,12 @@ namespace UniversalDetector.Core
|
||||
59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, //e0
|
||||
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
|
||||
};
|
||||
|
||||
|
||||
public Koi8rModel() : base(KOI8R_CHAR_TO_ORDER_MAP, "KOI8-R")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Win1251Model : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] WIN1251_CHAR_TO_ORDER_MAP = {
|
||||
@@ -229,12 +229,12 @@ namespace UniversalDetector.Core
|
||||
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
};
|
||||
|
||||
|
||||
public Win1251Model() : base(WIN1251_CHAR_TO_ORDER_MAP, "windows-1251")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Latin5Model : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] LATIN5_CHAR_TO_ORDER_MAP = {
|
||||
@@ -254,13 +254,13 @@ namespace UniversalDetector.Core
|
||||
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
public Latin5Model() : base(LATIN5_CHAR_TO_ORDER_MAP, "ISO-8859-5")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class MacCyrillicModel : CyrillicModel
|
||||
{
|
||||
private readonly static byte[] MACCYRILLIC_CHAR_TO_ORDER_MAP = {
|
||||
@@ -281,7 +281,7 @@ namespace UniversalDetector.Core
|
||||
3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15,
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
||||
};
|
||||
|
||||
|
||||
public MacCyrillicModel() : base(MACCYRILLIC_CHAR_TO_ORDER_MAP,
|
||||
"x-mac-cyrillic")
|
||||
{
|
||||
@@ -308,7 +308,7 @@ namespace UniversalDetector.Core
|
||||
43, 9, 45, 7, 32, 6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249,
|
||||
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
||||
};
|
||||
|
||||
|
||||
public Ibm855Model() : base(IBM855_BYTE_TO_ORDER_MAP, "IBM855")
|
||||
{
|
||||
}
|
||||
@@ -334,12 +334,12 @@ namespace UniversalDetector.Core
|
||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||
};
|
||||
|
||||
|
||||
public Ibm866Model() : base(IBM866_CHAR_TO_ORDER_MAP, "IBM866")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -37,15 +37,15 @@
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
{
|
||||
public abstract class GreekModel : SequenceModel
|
||||
{
|
||||
// Model Table:
|
||||
// Model Table:
|
||||
// total sequences: 100%
|
||||
// first 512 sequences: 98.2851%
|
||||
// first 1024 sequences:1.7001%
|
||||
// rest sequences: 0.0359%
|
||||
// negative sequences: 0.0148%
|
||||
// negative sequences: 0.0148%
|
||||
private readonly static byte[] GREEK_LANG_MODEL = {
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
@@ -177,12 +177,12 @@ namespace UniversalDetector.Core
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
public GreekModel(byte[] charToOrderMap, string name)
|
||||
public GreekModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, GREEK_LANG_MODEL, 0.982851f, false, name)
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Latin7Model : GreekModel
|
||||
{
|
||||
/****************************************************************
|
||||
@@ -210,12 +210,12 @@ namespace UniversalDetector.Core
|
||||
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0
|
||||
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0
|
||||
};
|
||||
|
||||
|
||||
public Latin7Model() : base(LATIN7_CHAR_TO_ORDER_MAP, "ISO-8859-7")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Win1253Model : GreekModel
|
||||
{
|
||||
private readonly static byte[] WIN1253__CHAR_TO_ORDER_MAP = {
|
||||
@@ -235,8 +235,8 @@ namespace UniversalDetector.Core
|
||||
35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, //d0
|
||||
124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, //e0
|
||||
9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, //f0
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
public Win1253Model() : base(WIN1253__CHAR_TO_ORDER_MAP, "windows-1253")
|
||||
{
|
||||
}
|
||||
|
||||
@@ -37,15 +37,15 @@
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
{
|
||||
public abstract class HebrewModel : SequenceModel
|
||||
{
|
||||
//Model Table:
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 98.4004%
|
||||
//first 1024 sequences: 1.5981%
|
||||
//rest sequences: 0.087%
|
||||
//negative sequences: 0.0015%
|
||||
//negative sequences: 0.0015%
|
||||
private readonly static byte[] HEBREW_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||
@@ -177,12 +177,12 @@ namespace UniversalDetector.Core
|
||||
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
||||
};
|
||||
|
||||
public HebrewModel(byte[] charToOrderMap, string name)
|
||||
public HebrewModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, HEBREW_LANG_MODEL, 0.984004f, false, name)
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Win1255Model : HebrewModel
|
||||
{
|
||||
/*
|
||||
@@ -192,7 +192,7 @@ namespace UniversalDetector.Core
|
||||
252: 0 - 9
|
||||
*/
|
||||
//Windows-1255 language model
|
||||
//Character Mapping Table:
|
||||
//Character Mapping Table:
|
||||
private readonly static byte[] WIN1255_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@@ -211,7 +211,7 @@ namespace UniversalDetector.Core
|
||||
9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23,
|
||||
12, 19, 13, 26, 18, 27, 21, 17, 7, 10, 5,251,252,128, 96,253,
|
||||
};
|
||||
|
||||
|
||||
public Win1255Model() : base(WIN1255_CHAR_TO_ORDER_MAP, "windows-1255")
|
||||
{
|
||||
}
|
||||
|
||||
@@ -36,15 +36,15 @@
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
{
|
||||
public abstract class HungarianModel : SequenceModel
|
||||
{
|
||||
//Model Table:
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 94.7368%
|
||||
//first 1024 sequences:5.2623%
|
||||
//rest sequences: 0.8894%
|
||||
//negative sequences: 0.0009%
|
||||
//negative sequences: 0.0009%
|
||||
private readonly static byte[] HUNGARIAN_LANG_MODEL = {
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||
@@ -176,13 +176,13 @@ namespace UniversalDetector.Core
|
||||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
public HungarianModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, HUNGARIAN_LANG_MODEL, 0.947368f,
|
||||
public HungarianModel(byte[] charToOrderMap, string name)
|
||||
: base(charToOrderMap, HUNGARIAN_LANG_MODEL, 0.947368f,
|
||||
false, name)
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Latin2HungarianModel : HungarianModel
|
||||
{
|
||||
private readonly static byte[] LATIN2_CHAR_TO_ORDER_MAP = {
|
||||
@@ -203,12 +203,12 @@ namespace UniversalDetector.Core
|
||||
82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85,
|
||||
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||
};
|
||||
|
||||
|
||||
public Latin2HungarianModel() : base(LATIN2_CHAR_TO_ORDER_MAP, "ISO-8859-2")
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public class Win1250HungarianModel : HungarianModel
|
||||
{
|
||||
private readonly static byte[] WIN1250_CHAR_TO_ORDER_MAP = {
|
||||
@@ -229,7 +229,7 @@ namespace UniversalDetector.Core
|
||||
84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87,
|
||||
245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||
};
|
||||
|
||||
|
||||
public Win1250HungarianModel() : base(WIN1250_CHAR_TO_ORDER_MAP, "windows-1250")
|
||||
{
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -37,7 +37,7 @@
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
{
|
||||
public class ThaiModel : SequenceModel
|
||||
{
|
||||
/****************************************************************
|
||||
@@ -46,7 +46,7 @@ namespace UniversalDetector.Core
|
||||
253: symbol (punctuation) that does not belong to word
|
||||
252: 0 - 9
|
||||
*****************************************************************/
|
||||
// The following result for thai was collected from a limited sample (1M)
|
||||
// The following result for thai was collected from a limited sample (1M)
|
||||
private readonly static byte[] TIS620_CHAR_TO_ORDER_MAP = {
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
@@ -66,12 +66,12 @@ namespace UniversalDetector.Core
|
||||
68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253,
|
||||
};
|
||||
|
||||
//Model Table:
|
||||
//Model Table:
|
||||
//total sequences: 100%
|
||||
//first 512 sequences: 92.6386%
|
||||
//first 1024 sequences:7.3177%
|
||||
//rest sequences: 1.0230%
|
||||
//negative sequences: 0.0436%
|
||||
//negative sequences: 0.0436%
|
||||
private readonly static byte[] THAI_LANG_MODEL = {
|
||||
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
||||
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
||||
@@ -203,11 +203,11 @@ namespace UniversalDetector.Core
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
public ThaiModel(byte[] charToOrderMap, string name)
|
||||
: base(TIS620_CHAR_TO_ORDER_MAP, THAI_LANG_MODEL,
|
||||
public ThaiModel(byte[] charToOrderMap, string name)
|
||||
: base(TIS620_CHAR_TO_ORDER_MAP, THAI_LANG_MODEL,
|
||||
0.926386f, false, "TIS-620")
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -40,7 +40,7 @@ using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
// TODO: Using trigrams the detector should be able to discriminate between
|
||||
// TODO: Using trigrams the detector should be able to discriminate between
|
||||
// latin-1 and iso8859-2
|
||||
public class Latin1Prober : CharsetProber
|
||||
{
|
||||
@@ -54,9 +54,9 @@ namespace UniversalDetector.Core
|
||||
private const int ACO = 5; // accent capital other
|
||||
private const int ASV = 6; // accent small vowel
|
||||
private const int ASO = 7; // accent small other
|
||||
|
||||
|
||||
private const int CLASS_NUM = 8; // total classes
|
||||
|
||||
|
||||
private readonly static byte[] Latin1_CharToClass = {
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
|
||||
@@ -92,36 +92,36 @@ namespace UniversalDetector.Core
|
||||
ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
|
||||
};
|
||||
|
||||
/* 0 : illegal
|
||||
1 : very unlikely
|
||||
2 : normal
|
||||
/* 0 : illegal
|
||||
1 : very unlikely
|
||||
2 : normal
|
||||
3 : very likely
|
||||
*/
|
||||
private readonly static byte[] Latin1ClassModel = {
|
||||
/* UDF OTH ASC ASS ACV ACO ASV ASO */
|
||||
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
/*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
|
||||
/*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
|
||||
/*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
|
||||
/*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
|
||||
/*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
|
||||
/*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
|
||||
/*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
|
||||
/*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
|
||||
/*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
|
||||
/*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
|
||||
};
|
||||
|
||||
private byte lastCharClass;
|
||||
private int[] freqCounter = new int[FREQ_CAT_NUM];
|
||||
|
||||
|
||||
public Latin1Prober()
|
||||
{
|
||||
Reset();
|
||||
}
|
||||
|
||||
public override string GetCharsetName()
|
||||
public override string GetCharsetName()
|
||||
{
|
||||
return "windows-1252";
|
||||
}
|
||||
|
||||
|
||||
public override void Reset()
|
||||
{
|
||||
state = ProbingState.Detecting;
|
||||
@@ -129,12 +129,12 @@ namespace UniversalDetector.Core
|
||||
for (int i = 0; i < FREQ_CAT_NUM; i++)
|
||||
freqCounter[i] = 0;
|
||||
}
|
||||
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
byte[] newbuf = FilterWithEnglishLetters(buf, offset, len);
|
||||
byte charClass, freq;
|
||||
|
||||
|
||||
for (int i = 0; i < newbuf.Length; i++) {
|
||||
charClass = Latin1_CharToClass[newbuf[i]];
|
||||
freq = Latin1ClassModel[lastCharClass * CLASS_NUM + charClass];
|
||||
@@ -152,21 +152,21 @@ namespace UniversalDetector.Core
|
||||
{
|
||||
if (state == ProbingState.NotMe)
|
||||
return 0.01f;
|
||||
|
||||
|
||||
float confidence = 0.0f;
|
||||
int total = 0;
|
||||
for (int i = 0; i < FREQ_CAT_NUM; i++) {
|
||||
total += freqCounter[i];
|
||||
}
|
||||
|
||||
|
||||
if (total <= 0) {
|
||||
confidence = 0.0f;
|
||||
} else {
|
||||
confidence = freqCounter[3] * 1.0f / total;
|
||||
confidence -= freqCounter[1] * 20.0f / total;
|
||||
}
|
||||
|
||||
// lower the confidence of latin1 so that other more accurate detector
|
||||
|
||||
// lower the confidence of latin1 so that other more accurate detector
|
||||
// can take priority.
|
||||
return confidence < 0.0f ? 0.0f : confidence * 0.5f;
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -46,13 +46,13 @@ namespace UniversalDetector.Core
|
||||
public class MBCSGroupProber : CharsetProber
|
||||
{
|
||||
private const int PROBERS_NUM = 7;
|
||||
private readonly static string[] ProberName =
|
||||
private readonly static string[] ProberName =
|
||||
{ "UTF8", "SJIS", "EUCJP", "GB18030", "EUCKR", "Big5", "EUCTW" };
|
||||
private CharsetProber[] probers = new CharsetProber[PROBERS_NUM];
|
||||
private bool[] isActive = new bool[PROBERS_NUM];
|
||||
private int bestGuess;
|
||||
private int activeNum;
|
||||
|
||||
|
||||
public MBCSGroupProber()
|
||||
{
|
||||
probers[0] = new UTF8Prober();
|
||||
@@ -62,7 +62,7 @@ namespace UniversalDetector.Core
|
||||
probers[4] = new EUCKRProber();
|
||||
probers[5] = new Big5Prober();
|
||||
probers[6] = new EUCTWProber();
|
||||
Reset();
|
||||
Reset();
|
||||
}
|
||||
|
||||
public override string GetCharsetName()
|
||||
@@ -99,7 +99,7 @@ namespace UniversalDetector.Core
|
||||
//assume previous is not ascii, it will do no harm except add some noise
|
||||
bool keepNext = true;
|
||||
int max = offset + len;
|
||||
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
if ((buf[i] & 0x80) != 0) {
|
||||
highbyteBuf[hptr++] = buf[i];
|
||||
@@ -112,9 +112,9 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
ProbingState st = ProbingState.NotMe;
|
||||
|
||||
|
||||
for (int i = 0; i < probers.Length; i++) {
|
||||
if (!isActive[i])
|
||||
continue;
|
||||
@@ -139,7 +139,7 @@ namespace UniversalDetector.Core
|
||||
{
|
||||
float bestConf = 0.0f;
|
||||
float cf = 0.0f;
|
||||
|
||||
|
||||
if (state == ProbingState.FoundIt) {
|
||||
return 0.99f;
|
||||
} else if (state == ProbingState.NotMe) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -43,11 +43,11 @@ namespace UniversalDetector.Core
|
||||
public class SBCSGroupProber : CharsetProber
|
||||
{
|
||||
private const int PROBERS_NUM = 13;
|
||||
private CharsetProber[] probers = new CharsetProber[PROBERS_NUM];
|
||||
private CharsetProber[] probers = new CharsetProber[PROBERS_NUM];
|
||||
private bool[] isActive = new bool[PROBERS_NUM];
|
||||
private int bestGuess;
|
||||
private int activeNum;
|
||||
|
||||
|
||||
public SBCSGroupProber()
|
||||
{
|
||||
probers[0] = new SingleByteCharSetProber(new Win1251Model());
|
||||
@@ -62,24 +62,24 @@ namespace UniversalDetector.Core
|
||||
probers[9] = new SingleByteCharSetProber(new Win1251BulgarianModel());
|
||||
HebrewProber hebprober = new HebrewProber();
|
||||
probers[10] = hebprober;
|
||||
// Logical
|
||||
probers[11] = new SingleByteCharSetProber(new Win1255Model(), false, hebprober);
|
||||
// Logical
|
||||
probers[11] = new SingleByteCharSetProber(new Win1255Model(), false, hebprober);
|
||||
// Visual
|
||||
probers[12] = new SingleByteCharSetProber(new Win1255Model(), true, hebprober);
|
||||
probers[12] = new SingleByteCharSetProber(new Win1255Model(), true, hebprober);
|
||||
hebprober.SetModelProbers(probers[11], probers[12]);
|
||||
// disable latin2 before latin1 is available, otherwise all latin1
|
||||
// disable latin2 before latin1 is available, otherwise all latin1
|
||||
// will be detected as latin2 because of their similarity.
|
||||
//probers[13] = new SingleByteCharSetProber(new Latin2HungarianModel());
|
||||
//probers[14] = new SingleByteCharSetProber(new Win1250HungarianModel());
|
||||
//probers[14] = new SingleByteCharSetProber(new Win1250HungarianModel());
|
||||
Reset();
|
||||
}
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
ProbingState st = ProbingState.NotMe;
|
||||
|
||||
|
||||
//apply filter to original buffer, and we got new buffer back
|
||||
//depend on what script it is, we will feed them the new buffer
|
||||
//depend on what script it is, we will feed them the new buffer
|
||||
//we got after applying proper filter
|
||||
//this is done without any consideration to KeepEnglishLetters
|
||||
//of each prober since as of now, there are no probers here which
|
||||
@@ -87,12 +87,12 @@ namespace UniversalDetector.Core
|
||||
byte[] newBuf = FilterWithoutEnglishLetters(buf, offset, len);
|
||||
if (newBuf.Length == 0)
|
||||
return state; // Nothing to see here, move on.
|
||||
|
||||
|
||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
||||
if (!isActive[i])
|
||||
continue;
|
||||
st = probers[i].HandleData(newBuf, 0, newBuf.Length);
|
||||
|
||||
|
||||
if (st == ProbingState.FoundIt) {
|
||||
bestGuess = i;
|
||||
state = ProbingState.FoundIt;
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
*
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
@@ -51,11 +51,11 @@ namespace UniversalDetector.Core
|
||||
private const int NUMBER_OF_SEQ_CAT = 4;
|
||||
private const int POSITIVE_CAT = NUMBER_OF_SEQ_CAT-1;
|
||||
private const int NEGATIVE_CAT = 0;
|
||||
|
||||
|
||||
protected SequenceModel model;
|
||||
|
||||
// true if we need to reverse every pair in the model lookup
|
||||
bool reversed;
|
||||
|
||||
// true if we need to reverse every pair in the model lookup
|
||||
bool reversed;
|
||||
|
||||
// char order of last character
|
||||
byte lastOrder;
|
||||
@@ -63,38 +63,38 @@ namespace UniversalDetector.Core
|
||||
int totalSeqs;
|
||||
int totalChar;
|
||||
int[] seqCounters = new int[NUMBER_OF_SEQ_CAT];
|
||||
|
||||
|
||||
// characters that fall in our sampling range
|
||||
int freqChar;
|
||||
|
||||
|
||||
// Optional auxiliary prober for name decision. created and destroyed by the GroupProber
|
||||
CharsetProber nameProber;
|
||||
|
||||
public SingleByteCharSetProber(SequenceModel model)
|
||||
CharsetProber nameProber;
|
||||
|
||||
public SingleByteCharSetProber(SequenceModel model)
|
||||
: this(model, false, null)
|
||||
{
|
||||
|
||||
|
||||
}
|
||||
|
||||
public SingleByteCharSetProber(SequenceModel model, bool reversed,
|
||||
|
||||
public SingleByteCharSetProber(SequenceModel model, bool reversed,
|
||||
CharsetProber nameProber)
|
||||
{
|
||||
this.model = model;
|
||||
this.reversed = reversed;
|
||||
this.nameProber = nameProber;
|
||||
Reset();
|
||||
Reset();
|
||||
}
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
int max = offset + len;
|
||||
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
byte order = model.GetOrder(buf[i]);
|
||||
|
||||
if (order < SYMBOL_CAT_ORDER)
|
||||
totalChar++;
|
||||
|
||||
|
||||
if (order < SAMPLE_SIZE) {
|
||||
freqChar++;
|
||||
|
||||
@@ -120,7 +120,7 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
return state;
|
||||
}
|
||||
|
||||
|
||||
public override void DumpStatus()
|
||||
{
|
||||
//Console.WriteLine(" SBCS: {0} [{1}]", GetConfidence(), GetCharsetName());
|
||||
@@ -146,9 +146,9 @@ namespace UniversalDetector.Core
|
||||
r = 0.99f;
|
||||
return r;
|
||||
}
|
||||
return 0.01f;
|
||||
return 0.01f;
|
||||
}
|
||||
|
||||
|
||||
public override void Reset()
|
||||
{
|
||||
state = ProbingState.Detecting;
|
||||
@@ -159,12 +159,12 @@ namespace UniversalDetector.Core
|
||||
totalChar = 0;
|
||||
freqChar = 0;
|
||||
}
|
||||
|
||||
public override string GetCharsetName()
|
||||
|
||||
public override string GetCharsetName()
|
||||
{
|
||||
return (nameProber == null) ? model.CharsetName
|
||||
: nameProber.GetCharsetName();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -50,25 +50,25 @@ namespace UniversalDetector.Core
|
||||
private SJISContextAnalyser contextAnalyser;
|
||||
private SJISDistributionAnalyser distributionAnalyser;
|
||||
private byte[] lastChar = new byte[2];
|
||||
|
||||
|
||||
public SJISProber()
|
||||
{
|
||||
codingSM = new CodingStateMachine(new SJISSMModel());
|
||||
distributionAnalyser = new SJISDistributionAnalyser();
|
||||
contextAnalyser = new SJISContextAnalyser();
|
||||
contextAnalyser = new SJISContextAnalyser();
|
||||
Reset();
|
||||
}
|
||||
|
||||
|
||||
public override string GetCharsetName()
|
||||
{
|
||||
return "Shift-JIS";
|
||||
return "Shift-JIS";
|
||||
}
|
||||
|
||||
|
||||
public override ProbingState HandleData(byte[] buf, int offset, int len)
|
||||
{
|
||||
int codingState;
|
||||
int max = offset + len;
|
||||
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
if (codingState == SMModel.ERROR) {
|
||||
@@ -90,7 +90,7 @@ namespace UniversalDetector.Core
|
||||
distributionAnalyser.HandleOneChar(buf, i-1, charLen);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
lastChar[0] = buf[max-1];
|
||||
if (state == ProbingState.Detecting)
|
||||
if (contextAnalyser.GotEnoughData() && GetConfidence() > SHORTCUT_THRESHOLD)
|
||||
@@ -100,12 +100,12 @@ namespace UniversalDetector.Core
|
||||
|
||||
public override void Reset()
|
||||
{
|
||||
codingSM.Reset();
|
||||
codingSM.Reset();
|
||||
state = ProbingState.Detecting;
|
||||
contextAnalyser.Reset();
|
||||
distributionAnalyser.Reset();
|
||||
}
|
||||
|
||||
|
||||
public override float GetConfidence()
|
||||
{
|
||||
float contxtCf = contextAnalyser.GetConfidence();
|
||||
|
||||
@@ -52,9 +52,9 @@ namespace UniversalDetector.Core
|
||||
public BitPackage classTable;
|
||||
public BitPackage stateTable;
|
||||
public int[] charLenTable;
|
||||
|
||||
|
||||
private string name;
|
||||
|
||||
|
||||
public string Name {
|
||||
get { return name; }
|
||||
}
|
||||
@@ -74,10 +74,10 @@ namespace UniversalDetector.Core
|
||||
this.charLenTable = charLenTable;
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
|
||||
public int GetClass(byte b)
|
||||
{
|
||||
return classTable.Unpack((int)b);
|
||||
{
|
||||
return classTable.Unpack((int)b);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -39,36 +39,36 @@
|
||||
using System;
|
||||
|
||||
namespace UniversalDetector.Core
|
||||
{
|
||||
{
|
||||
public abstract class SequenceModel
|
||||
{
|
||||
// [256] table used to find a char's order
|
||||
protected byte[] charToOrderMap;
|
||||
|
||||
// [SAMPLE_SIZE][SAMPLE_SIZE] table to find a 2-char sequence's
|
||||
// frequency
|
||||
|
||||
// [SAMPLE_SIZE][SAMPLE_SIZE] table to find a 2-char sequence's
|
||||
// frequency
|
||||
protected byte[] precedenceMatrix;
|
||||
|
||||
|
||||
// freqSeqs / totalSeqs
|
||||
protected float typicalPositiveRatio;
|
||||
|
||||
|
||||
public float TypicalPositiveRatio {
|
||||
get { return typicalPositiveRatio; }
|
||||
}
|
||||
|
||||
// not used
|
||||
|
||||
// not used
|
||||
protected bool keepEnglishLetter;
|
||||
|
||||
|
||||
public bool KeepEnglishLetter {
|
||||
get { return keepEnglishLetter; }
|
||||
}
|
||||
|
||||
|
||||
protected String charsetName;
|
||||
|
||||
public string CharsetName {
|
||||
get { return charsetName; }
|
||||
}
|
||||
|
||||
|
||||
public SequenceModel(
|
||||
byte[] charToOrderMap,
|
||||
byte[] precedenceMatrix,
|
||||
@@ -82,16 +82,16 @@ namespace UniversalDetector.Core
|
||||
this.keepEnglishLetter = keepEnglishLetter;
|
||||
this.charsetName = charsetName;
|
||||
}
|
||||
|
||||
|
||||
public byte GetOrder(byte b)
|
||||
{
|
||||
return charToOrderMap[b];
|
||||
}
|
||||
|
||||
|
||||
public byte GetPrecedence(int pos)
|
||||
{
|
||||
return precedenceMatrix[pos];
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -46,11 +46,11 @@ namespace UniversalDetector.Core
|
||||
|
||||
public UTF8Prober()
|
||||
{
|
||||
numOfMBChar = 0;
|
||||
numOfMBChar = 0;
|
||||
codingSM = new CodingStateMachine(new UTF8SMModel());
|
||||
Reset();
|
||||
}
|
||||
|
||||
|
||||
public override string GetCharsetName() {
|
||||
return "UTF-8";
|
||||
}
|
||||
@@ -66,7 +66,7 @@ namespace UniversalDetector.Core
|
||||
{
|
||||
int codingState = SMModel.START;
|
||||
int max = offset + len;
|
||||
|
||||
|
||||
for (int i = offset; i < max; i++) {
|
||||
|
||||
codingState = codingSM.NextState(buf[i]);
|
||||
@@ -97,7 +97,7 @@ namespace UniversalDetector.Core
|
||||
{
|
||||
float unlike = 0.99f;
|
||||
float confidence = 0.0f;
|
||||
|
||||
|
||||
if (numOfMBChar < 6) {
|
||||
for (int i = 0; i < numOfMBChar; i++)
|
||||
unlike *= ONE_CHAR_PROB;
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
* Contributor(s):
|
||||
* Shy Shalom <shooshX@gmail.com>
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -41,7 +41,7 @@ namespace UniversalDetector.Core
|
||||
|
||||
enum InputState { PureASCII=0, EscASCII=1, Highbyte=2 };
|
||||
|
||||
public abstract class UniversalDetector
|
||||
public abstract class UniversalDetector
|
||||
{
|
||||
protected const int FILTER_CHINESE_SIMPLIFIED = 1;
|
||||
protected const int FILTER_CHINESE_TRADITIONAL = 2;
|
||||
@@ -49,12 +49,12 @@ namespace UniversalDetector.Core
|
||||
protected const int FILTER_KOREAN = 8;
|
||||
protected const int FILTER_NON_CJK = 16;
|
||||
protected const int FILTER_ALL = 31;
|
||||
protected static int FILTER_CHINESE =
|
||||
protected static int FILTER_CHINESE =
|
||||
FILTER_CHINESE_SIMPLIFIED | FILTER_CHINESE_TRADITIONAL;
|
||||
protected static int FILTER_CJK =
|
||||
FILTER_JAPANESE | FILTER_KOREAN | FILTER_CHINESE_SIMPLIFIED
|
||||
protected static int FILTER_CJK =
|
||||
FILTER_JAPANESE | FILTER_KOREAN | FILTER_CHINESE_SIMPLIFIED
|
||||
| FILTER_CHINESE_TRADITIONAL;
|
||||
|
||||
|
||||
protected const float SHORTCUT_THRESHOLD = 0.95f;
|
||||
protected const float MINIMUM_THRESHOLD = 0.20f;
|
||||
|
||||
@@ -70,16 +70,16 @@ namespace UniversalDetector.Core
|
||||
protected CharsetProber escCharsetProber;
|
||||
protected string detectedCharset;
|
||||
|
||||
public UniversalDetector(int languageFilter) {
|
||||
public UniversalDetector(int languageFilter) {
|
||||
this.start = true;
|
||||
this.inputState = InputState.PureASCII;
|
||||
this.lastChar = 0x00;
|
||||
this.lastChar = 0x00;
|
||||
this.bestGuess = -1;
|
||||
this.languageFilter = languageFilter;
|
||||
}
|
||||
|
||||
public virtual void Feed(byte[] buf, int offset, int len)
|
||||
{
|
||||
{
|
||||
if (done) {
|
||||
return;
|
||||
}
|
||||
@@ -125,7 +125,7 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
|
||||
|
||||
// other than 0xa0, if every other character is ascii, the page is ascii
|
||||
if ((buf[i] & 0x80) != 0 && buf[i] != 0xA0) {
|
||||
// we got a non-ascii byte (high-byte)
|
||||
@@ -143,9 +143,9 @@ namespace UniversalDetector.Core
|
||||
if (charsetProbers[1] == null)
|
||||
charsetProbers[1] = new SBCSGroupProber();
|
||||
if (charsetProbers[2] == null)
|
||||
charsetProbers[2] = new Latin1Prober();
|
||||
charsetProbers[2] = new Latin1Prober();
|
||||
}
|
||||
} else {
|
||||
} else {
|
||||
if (inputState == InputState.PureASCII &&
|
||||
(buf[i] == 0x33 || (buf[i] == 0x7B && lastChar == 0x7E))) {
|
||||
// found escape character or HZ "~{"
|
||||
@@ -154,9 +154,9 @@ namespace UniversalDetector.Core
|
||||
lastChar = buf[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
ProbingState st = ProbingState.NotMe;
|
||||
|
||||
|
||||
switch (inputState) {
|
||||
case InputState.EscASCII:
|
||||
if (escCharsetProber == null) {
|
||||
@@ -172,18 +172,18 @@ namespace UniversalDetector.Core
|
||||
for (int i = 0; i < PROBERS_NUM; i++) {
|
||||
if (charsetProbers[i] != null) {
|
||||
st = charsetProbers[i].HandleData(buf, offset, len);
|
||||
#if DEBUG
|
||||
#if DEBUG
|
||||
charsetProbers[i].DumpStatus();
|
||||
#endif
|
||||
#endif
|
||||
if (st == ProbingState.FoundIt) {
|
||||
done = true;
|
||||
detectedCharset = charsetProbers[i].GetCharsetName();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
default:
|
||||
// pure ascii
|
||||
break;
|
||||
}
|
||||
@@ -191,13 +191,13 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Notify detector that no further data is available.
|
||||
/// Notify detector that no further data is available.
|
||||
/// </summary>
|
||||
public virtual void DataEnd()
|
||||
{
|
||||
if (!gotData) {
|
||||
// we haven't got any data yet, return immediately
|
||||
// the caller sometimes calls DataEnd before anything has
|
||||
// we haven't got any data yet, return immediately
|
||||
// the caller sometimes calls DataEnd before anything has
|
||||
// been sent to the detector
|
||||
return;
|
||||
}
|
||||
@@ -206,7 +206,7 @@ namespace UniversalDetector.Core
|
||||
done = true;
|
||||
Report(detectedCharset, 1.0f);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (inputState == InputState.Highbyte) {
|
||||
float proberConfidence = 0.0f;
|
||||
@@ -221,22 +221,22 @@ namespace UniversalDetector.Core
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (maxProberConfidence > MINIMUM_THRESHOLD) {
|
||||
Report(charsetProbers[maxProber].GetCharsetName(), maxProberConfidence);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else if (inputState == InputState.PureASCII) {
|
||||
Report("ASCII", 1.0f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Clear internal state of charset detector.
|
||||
/// In the original interface this method is protected.
|
||||
/// In the original interface this method is protected.
|
||||
/// </summary>
|
||||
public virtual void Reset()
|
||||
{
|
||||
public virtual void Reset()
|
||||
{
|
||||
done = false;
|
||||
start = true;
|
||||
detectedCharset = null;
|
||||
@@ -250,7 +250,7 @@ namespace UniversalDetector.Core
|
||||
if (charsetProbers[i] != null)
|
||||
charsetProbers[i].Reset();
|
||||
}
|
||||
|
||||
|
||||
protected abstract void Report(string charset, float confidence);
|
||||
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
*
|
||||
* Contributor(s):
|
||||
* Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either of the GNU General Public License Version 2 or later (the "GPL"),
|
||||
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
@@ -40,26 +40,26 @@ namespace UniversalDetector
|
||||
{
|
||||
/// <summary>
|
||||
/// Indicates how confident the detection module is about the returned result.
|
||||
///
|
||||
/// NoAnswerYet: the detector has not found an answer yet based on
|
||||
///
|
||||
/// NoAnswerYet: the detector has not found an answer yet based on
|
||||
/// the data it received.
|
||||
///
|
||||
/// BestAnswer: the answer the detector returned is the best one within
|
||||
/// the knowledge of the detector. In other words, the tests of all
|
||||
///
|
||||
/// BestAnswer: the answer the detector returned is the best one within
|
||||
/// the knowledge of the detector. In other words, the tests of all
|
||||
/// other candidates fail.
|
||||
/// For example, the (Shift_JIS/EUC-JP/ISO-2022-JP) detection
|
||||
/// module may return this with answer "Shift_JIS" if it receives
|
||||
/// bytes > 0x80 (which make the ISO-2022-JP test fail) and byte
|
||||
/// module may return this with answer "Shift_JIS" if it receives
|
||||
/// bytes > 0x80 (which make the ISO-2022-JP test fail) and byte
|
||||
/// 0x82 (which makes the EUC-JP test fail)
|
||||
///
|
||||
/// SureAnswer: the detector is 100% sure about the answer.
|
||||
///
|
||||
///
|
||||
/// Example 1: the Shift_JIS/ISO-2022-JP/EUC-JP detector returns
|
||||
/// this w/ ISO-2022-JP when it hits one of the following ESC sequences
|
||||
/// ESC ( J
|
||||
/// ESC $ @
|
||||
/// ESC $ B
|
||||
///
|
||||
///
|
||||
/// Example 2: the detector which can detect UCS2 returns w/ UCS2
|
||||
/// when the first 2 bytes are a BOM mark.
|
||||
/// Example 3: the Korean detector returns ISO-2022-KR when it
|
||||
|
||||
@@ -47,31 +47,31 @@ namespace UniversalDetector
|
||||
/// The detected charset. It can be null.
|
||||
/// </summary>
|
||||
string Charset { get; }
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// The confidence of the detected charset, if any
|
||||
/// The confidence of the detected charset, if any
|
||||
/// </summary>
|
||||
float Confidence { get; }
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Feed a block of bytes to the detector.
|
||||
/// Feed a block of bytes to the detector.
|
||||
/// </summary>
|
||||
/// <param name="buf">input buffer</param>
|
||||
/// <param name="offset">offset into buffer</param>
|
||||
/// <param name="len">number of available bytes</param>
|
||||
void Feed(byte[] buf, int offset, int len);
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Feed a byte stream to the detector.
|
||||
/// Feed a byte stream to the detector.
|
||||
/// </summary>
|
||||
/// <param name="stream">an input stream</param>
|
||||
void Feed(Stream stream);
|
||||
|
||||
/// <summary>
|
||||
/// Resets the state of the detector.
|
||||
/// </summary>
|
||||
/// Resets the state of the detector.
|
||||
/// </summary>
|
||||
void Reset();
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Returns true if the detector has found a result and it is sure about it.
|
||||
/// </summary>
|
||||
@@ -83,6 +83,6 @@ namespace UniversalDetector
|
||||
/// decision.
|
||||
/// </summary>
|
||||
void DataEnd();
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user