Visual Studio Reformat: Emby.Server.Implementations Part T-T

This commit is contained in:
Erwin de Haan
2019-01-13 20:22:56 +01:00
parent 0efc699e3d
commit 25f0315e91
39 changed files with 1054 additions and 892 deletions

View File

@@ -1,10 +1,8 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using NLangDetect.Core.Utils;
using MediaBrowser.Model.Serialization;
using System.Linq;
using MediaBrowser.Model.Serialization;
using NLangDetect.Core.Utils;
namespace NLangDetect.Core
{

View File

@@ -1,15 +1,15 @@
namespace NLangDetect.Core
{
public enum ErrorCode
{
NoTextError,
FormatError,
FileLoadError,
DuplicateLangError,
NeedLoadProfileError,
CantDetectError,
CantOpenTrainData,
TrainDataFormatError,
InitParamError,
}
/// <summary>
/// Identifies the kind of failure carried by <see cref="NLangDetectException"/>
/// (exposed through its <c>ErrorCode</c> property).
/// </summary>
/// <remarks>
/// No explicit values are assigned, so members are numbered in declaration order starting at 0.
/// </remarks>
public enum ErrorCode
{
NoTextError,
FormatError,
FileLoadError,
DuplicateLangError,
NeedLoadProfileError,
CantDetectError,
CantOpenTrainData,
TrainDataFormatError,
InitParamError,
}
}

View File

@@ -2,13 +2,13 @@
namespace NLangDetect.Core.Extensions
{
public static class CharExtensions
{
private const int MIN_CODE_POINT = 0x000000;
private const int MAX_CODE_POINT = 0x10ffff;
public static class CharExtensions
{
private const int MIN_CODE_POINT = 0x000000;
private const int MAX_CODE_POINT = 0x10ffff;
private static readonly int[] _unicodeBlockStarts =
{
private static readonly int[] _unicodeBlockStarts =
{
#region Unicode block starts
0x0000, // Basic Latin
@@ -165,8 +165,8 @@ namespace NLangDetect.Core.Extensions
#endregion
};
private static readonly UnicodeBlock?[] _unicodeBlocks =
{
private static readonly UnicodeBlock?[] _unicodeBlocks =
{
#region Unicode blocks
UnicodeBlock.BasicLatin,
UnicodeBlock.Latin1Supplement,
@@ -322,53 +322,53 @@ namespace NLangDetect.Core.Extensions
#endregion
};
#region Public methods
#region Public methods
/// <remarks>
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
/// </remarks>
public static UnicodeBlock? GetUnicodeBlock(this char ch)
{
int codePoint = ch;
if (!IsValidCodePoint(codePoint))
{
throw new ArgumentException("Argument is not a valid code point.", nameof(ch));
}
int top, bottom, current;
bottom = 0;
top = _unicodeBlockStarts.Length;
current = top / 2;
// invariant: top > current >= bottom && codePoint >= unicodeBlockStarts[bottom]
while (top - bottom > 1)
{
if (codePoint >= _unicodeBlockStarts[current])
/// <remarks>
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
/// </remarks>
public static UnicodeBlock? GetUnicodeBlock(this char ch)
{
bottom = current;
}
else
{
top = current;
int codePoint = ch;
if (!IsValidCodePoint(codePoint))
{
throw new ArgumentException("Argument is not a valid code point.", nameof(ch));
}
int top, bottom, current;
bottom = 0;
top = _unicodeBlockStarts.Length;
current = top / 2;
// invariant: top > current >= bottom && codePoint >= unicodeBlockStarts[bottom]
while (top - bottom > 1)
{
if (codePoint >= _unicodeBlockStarts[current])
{
bottom = current;
}
else
{
top = current;
}
current = (top + bottom) / 2;
}
return _unicodeBlocks[current];
}
current = (top + bottom) / 2;
}
#endregion
return _unicodeBlocks[current];
#region Private helper methods
/// <summary>
/// Reports whether <paramref name="codePoint"/> lies inside the inclusive range
/// bounded by <c>MIN_CODE_POINT</c> and <c>MAX_CODE_POINT</c>.
/// </summary>
/// <param name="codePoint">Numeric code point to test.</param>
/// <returns><c>true</c> when the value is within both bounds; otherwise <c>false</c>.</returns>
private static bool IsValidCodePoint(int codePoint)
{
    bool belowRange = codePoint < MIN_CODE_POINT;
    bool aboveRange = codePoint > MAX_CODE_POINT;

    return !(belowRange || aboveRange);
}
#endregion
}
#endregion
#region Private helper methods
private static bool IsValidCodePoint(int codePoint)
{
return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT;
}
#endregion
}
}

View File

@@ -2,50 +2,50 @@
namespace NLangDetect.Core.Extensions
{
public static class RandomExtensions
{
private const double _Epsilon = 2.22044604925031E-15;
private static readonly object _mutex = new object();
private static double _nextNextGaussian;
private static bool _hasNextNextGaussian;
/// <summary>
/// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
/// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
/// </summary>
/// <remarks>
/// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
/// </remarks>
public static double NextGaussian(this Random random)
public static class RandomExtensions
{
lock (_mutex)
{
if (_hasNextNextGaussian)
private const double _Epsilon = 2.22044604925031E-15;
private static readonly object _mutex = new object();
private static double _nextNextGaussian;
private static bool _hasNextNextGaussian;
/// <summary>
/// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
/// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
/// </summary>
/// <remarks>
/// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
/// </remarks>
public static double NextGaussian(this Random random)
{
_hasNextNextGaussian = false;
lock (_mutex)
{
if (_hasNextNextGaussian)
{
_hasNextNextGaussian = false;
return _nextNextGaussian;
return _nextNextGaussian;
}
double v1, v2, s;
do
{
v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
s = v1 * v1 + v2 * v2;
}
while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);
double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
_nextNextGaussian = v2 * multiplier;
_hasNextNextGaussian = true;
return v1 * multiplier;
}
}
double v1, v2, s;
do
{
v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
s = v1 * v1 + v2 * v2;
}
while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon);
double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
_nextNextGaussian = v2 * multiplier;
_hasNextNextGaussian = true;
return v1 * multiplier;
}
}
}
}

View File

@@ -1,131 +1,131 @@
namespace NLangDetect.Core.Extensions
{
public enum UnicodeBlock
{
BasicLatin,
Latin1Supplement,
LatinExtendedA,
LatinExtendedB,
IpaExtensions,
SpacingModifierLetters,
CombiningDiacriticalMarks,
Greek,
Cyrillic,
CyrillicSupplementary,
Armenian,
Hebrew,
Arabic,
Syriac,
Thaana,
Devanagari,
Bengali,
Gurmukhi,
Gujarati,
Oriya,
Tamil,
Telugu,
Kannada,
Malayalam,
Sinhala,
Thai,
Lao,
Tibetan,
Myanmar,
Georgian,
HangulJamo,
Ethiopic,
Cherokee,
UnifiedCanadianAboriginalSyllabics,
Ogham,
Runic,
Tagalog,
Hanunoo,
Buhid,
Tagbanwa,
Khmer,
Mongolian,
Limbu,
TaiLe,
KhmerSymbols,
PhoneticExtensions,
LatinExtendedAdditional,
GreekExtended,
GeneralPunctuation,
SuperscriptsAndSubscripts,
CurrencySymbols,
CombiningMarksForSymbols,
LetterlikeSymbols,
NumberForms,
Arrows,
MathematicalOperators,
MiscellaneousTechnical,
ControlPictures,
OpticalCharacterRecognition,
EnclosedAlphanumerics,
BoxDrawing,
BlockElements,
GeometricShapes,
MiscellaneousSymbols,
Dingbats,
MiscellaneousMathematicalSymbolsA,
SupplementalArrowsA,
BraillePatterns,
SupplementalArrowsB,
MiscellaneousMathematicalSymbolsB,
SupplementalMathematicalOperators,
MiscellaneousSymbolsAndArrows,
CjkRadicalsSupplement,
KangxiRadicals,
IdeographicDescriptionCharacters,
CjkSymbolsAndPunctuation,
Hiragana,
Katakana,
Bopomofo,
HangulCompatibilityJamo,
Kanbun,
BopomofoExtended,
KatakanaPhoneticExtensions,
EnclosedCjkLettersAndMonths,
CjkCompatibility,
CjkUnifiedIdeographsExtensionA,
YijingHexagramSymbols,
CjkUnifiedIdeographs,
YiSyllables,
YiRadicals,
HangulSyllables,
HighSurrogates,
HighPrivateUseSurrogates,
LowSurrogates,
PrivateUseArea,
CjkCompatibilityIdeographs,
AlphabeticPresentationForms,
ArabicPresentationFormsA,
VariationSelectors,
CombiningHalfMarks,
CjkCompatibilityForms,
SmallFormVariants,
ArabicPresentationFormsB,
HalfwidthAndFullwidthForms,
Specials,
LinearBSyllabary,
LinearBIdeograms,
AegeanNumbers,
OldItalic,
Gothic,
Ugaritic,
Deseret,
Shavian,
Osmanya,
CypriotSyllabary,
ByzantineMusicalSymbols,
MusicalSymbols,
TaiXuanJingSymbols,
MathematicalAlphanumericSymbols,
CjkUnifiedIdeographsExtensionB,
CjkCompatibilityIdeographsSupplement,
Tags,
VariationSelectorsSupplement,
SupplementaryPrivateUseAreaA,
SupplementaryPrivateUseAreaB,
}
/// <summary>
/// Names of the Unicode blocks known to this library. A nullable value of this type is
/// what <c>CharExtensions.GetUnicodeBlock</c> returns for a character.
/// </summary>
/// <remarks>
/// No explicit values are assigned, so members are numbered in declaration order starting at 0.
/// </remarks>
public enum UnicodeBlock
{
BasicLatin,
Latin1Supplement,
LatinExtendedA,
LatinExtendedB,
IpaExtensions,
SpacingModifierLetters,
CombiningDiacriticalMarks,
Greek,
Cyrillic,
CyrillicSupplementary,
Armenian,
Hebrew,
Arabic,
Syriac,
Thaana,
Devanagari,
Bengali,
Gurmukhi,
Gujarati,
Oriya,
Tamil,
Telugu,
Kannada,
Malayalam,
Sinhala,
Thai,
Lao,
Tibetan,
Myanmar,
Georgian,
HangulJamo,
Ethiopic,
Cherokee,
UnifiedCanadianAboriginalSyllabics,
Ogham,
Runic,
Tagalog,
Hanunoo,
Buhid,
Tagbanwa,
Khmer,
Mongolian,
Limbu,
TaiLe,
KhmerSymbols,
PhoneticExtensions,
LatinExtendedAdditional,
GreekExtended,
GeneralPunctuation,
SuperscriptsAndSubscripts,
CurrencySymbols,
CombiningMarksForSymbols,
LetterlikeSymbols,
NumberForms,
Arrows,
MathematicalOperators,
MiscellaneousTechnical,
ControlPictures,
OpticalCharacterRecognition,
EnclosedAlphanumerics,
BoxDrawing,
BlockElements,
GeometricShapes,
MiscellaneousSymbols,
Dingbats,
MiscellaneousMathematicalSymbolsA,
SupplementalArrowsA,
BraillePatterns,
SupplementalArrowsB,
MiscellaneousMathematicalSymbolsB,
SupplementalMathematicalOperators,
MiscellaneousSymbolsAndArrows,
CjkRadicalsSupplement,
KangxiRadicals,
IdeographicDescriptionCharacters,
CjkSymbolsAndPunctuation,
Hiragana,
Katakana,
Bopomofo,
HangulCompatibilityJamo,
Kanbun,
BopomofoExtended,
KatakanaPhoneticExtensions,
EnclosedCjkLettersAndMonths,
CjkCompatibility,
CjkUnifiedIdeographsExtensionA,
YijingHexagramSymbols,
CjkUnifiedIdeographs,
YiSyllables,
YiRadicals,
HangulSyllables,
HighSurrogates,
HighPrivateUseSurrogates,
LowSurrogates,
PrivateUseArea,
CjkCompatibilityIdeographs,
AlphabeticPresentationForms,
ArabicPresentationFormsA,
VariationSelectors,
CombiningHalfMarks,
CjkCompatibilityForms,
SmallFormVariants,
ArabicPresentationFormsB,
HalfwidthAndFullwidthForms,
Specials,
LinearBSyllabary,
LinearBIdeograms,
AegeanNumbers,
OldItalic,
Gothic,
Ugaritic,
Deseret,
Shavian,
Osmanya,
CypriotSyllabary,
ByzantineMusicalSymbols,
MusicalSymbols,
TaiXuanJingSymbols,
MathematicalAlphanumericSymbols,
CjkUnifiedIdeographsExtensionB,
CjkCompatibilityIdeographsSupplement,
Tags,
VariationSelectorsSupplement,
SupplementaryPrivateUseAreaA,
SupplementaryPrivateUseAreaB,
}
}

View File

@@ -1,67 +1,67 @@
using System;
using System.IO;
using System.IO.Compression;
using System.Xml;
using NLangDetect.Core.Utils;
using System.IO;
namespace NLangDetect.Core
{
// TODO IMM HI: xml reader not tested
public static class GenProfile
{
#region Public methods
public static LangProfile load(string lang, string file)
// TODO IMM HI: xml reader not tested
public static class GenProfile
{
LangProfile profile = new LangProfile(lang);
TagExtractor tagextractor = new TagExtractor("abstract", 100);
Stream inputStream = null;
#region Public methods
try
{
inputStream = File.OpenRead(file);
string extension = Path.GetExtension(file) ?? "";
if (extension.ToUpper() == ".GZ")
public static LangProfile load(string lang, string file)
{
inputStream = new GZipStream(inputStream, CompressionMode.Decompress);
}
LangProfile profile = new LangProfile(lang);
TagExtractor tagextractor = new TagExtractor("abstract", 100);
Stream inputStream = null;
using (XmlReader xmlReader = XmlReader.Create(inputStream))
{
while (xmlReader.Read())
{
switch (xmlReader.NodeType)
try
{
case XmlNodeType.Element:
tagextractor.SetTag(xmlReader.Name);
break;
inputStream = File.OpenRead(file);
case XmlNodeType.Text:
tagextractor.Add(xmlReader.Value);
break;
string extension = Path.GetExtension(file) ?? "";
case XmlNodeType.EndElement:
tagextractor.CloseTag(profile);
break;
if (extension.ToUpper() == ".GZ")
{
inputStream = new GZipStream(inputStream, CompressionMode.Decompress);
}
using (XmlReader xmlReader = XmlReader.Create(inputStream))
{
while (xmlReader.Read())
{
switch (xmlReader.NodeType)
{
case XmlNodeType.Element:
tagextractor.SetTag(xmlReader.Name);
break;
case XmlNodeType.Text:
tagextractor.Add(xmlReader.Value);
break;
case XmlNodeType.EndElement:
tagextractor.CloseTag(profile);
break;
}
}
}
}
finally
{
if (inputStream != null)
{
inputStream.Close();
}
}
}
}
}
finally
{
if (inputStream != null)
{
inputStream.Close();
}
}
Console.WriteLine(lang + ": " + tagextractor.Count);
Console.WriteLine(lang + ": " + tagextractor.Count);
return profile;
return profile;
}
#endregion
}
#endregion
}
}

View File

@@ -2,21 +2,21 @@
namespace NLangDetect.Core
{
[Serializable]
public class InternalException : Exception
{
#region Constructor(s)
public InternalException(string message, Exception innerException)
: base(message, innerException)
[Serializable]
public class InternalException : Exception
{
}
#region Constructor(s)
public InternalException(string message)
: this(message, null)
{
}
/// <summary>
/// Creates the exception from a message and the exception that caused it.
/// Both arguments are forwarded unchanged to the <see cref="Exception"/> base constructor.
/// </summary>
/// <param name="message">Text describing the failure.</param>
/// <param name="innerException">Underlying cause passed to the base constructor.</param>
public InternalException(string message, Exception innerException)
: base(message, innerException)
{
}
#endregion
}
/// <summary>
/// Creates the exception from a message only, by delegating to the
/// two-argument constructor with a <c>null</c> inner exception.
/// </summary>
/// <param name="message">Text describing the failure.</param>
public InternalException(string message)
: this(message, null)
{
}
#endregion
}
}

View File

@@ -2,44 +2,44 @@ using System.Globalization;
namespace NLangDetect.Core
{
// TODO IMM HI: name??
public class Language
{
#region Constructor(s)
public Language(string name, double probability)
// TODO IMM HI: name??
public class Language
{
Name = name;
Probability = probability;
#region Constructor(s)
/// <summary>
/// Creates a result entry by storing both arguments in the corresponding properties.
/// </summary>
/// <param name="name">Value assigned to <see cref="Name"/>.</param>
/// <param name="probability">Value assigned to <see cref="Probability"/>.</param>
public Language(string name, double probability)
{
    this.Probability = probability;
    this.Name = name;
}
#endregion
#region Object overrides
/// <summary>
/// Formats the entry as <c>Name:Probability</c>, with the probability printed to six
/// decimal places using the invariant culture's number format.
/// </summary>
/// <returns>
/// An empty string when <see cref="Name"/> is <c>null</c>; otherwise the formatted text.
/// </returns>
public override string ToString()
{
    return Name == null
        ? ""
        : string.Format(
            CultureInfo.InvariantCulture.NumberFormat,
            "{0}:{1:0.000000}",
            Name,
            Probability);
}
#endregion
#region Properties
public string Name { get; set; }
public double Probability { get; set; }
#endregion
}
#endregion
#region Object overrides
public override string ToString()
{
if (Name == null)
{
return "";
}
return
string.Format(
CultureInfo.InvariantCulture.NumberFormat,
"{0}:{1:0.000000}",
Name,
Probability);
}
#endregion
#region Properties
public string Name { get; set; }
public double Probability { get; set; }
#endregion
}
}

View File

@@ -2,22 +2,22 @@
namespace NLangDetect.Core
{
public class NLangDetectException : Exception
{
#region Constructor(s)
public NLangDetectException(string message, ErrorCode errorCode)
: base(message)
public class NLangDetectException : Exception
{
ErrorCode = errorCode;
#region Constructor(s)
/// <summary>
/// Creates the exception from a message and an error code.
/// </summary>
/// <param name="message">Text forwarded to the <see cref="Exception"/> base constructor.</param>
/// <param name="errorCode">Value stored in the <see cref="ErrorCode"/> property.</param>
public NLangDetectException(string message, ErrorCode errorCode)
: base(message)
{
ErrorCode = errorCode;
}
#endregion
#region Properties
public ErrorCode ErrorCode { get; private set; }
#endregion
}
#endregion
#region Properties
public ErrorCode ErrorCode { get; private set; }
#endregion
}
}

View File

@@ -3,33 +3,33 @@ using System.Collections.Generic;
namespace NLangDetect.Core
{
public class ProbVector
{
private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();
public double this[int key]
public class ProbVector
{
get
{
double value;
private readonly Dictionary<int, double> _dict = new Dictionary<int, double>();
return _dict.TryGetValue(key, out value) ? value : 0.0;
}
set
{
if (Math.Abs(value) < double.Epsilon)
public double this[int key]
{
if (_dict.ContainsKey(key))
{
_dict.Remove(key);
}
get
{
double value;
return;
return _dict.TryGetValue(key, out value) ? value : 0.0;
}
set
{
if (Math.Abs(value) < double.Epsilon)
{
if (_dict.ContainsKey(key))
{
_dict.Remove(key);
}
return;
}
_dict[key] = value;
}
}
_dict[key] = value;
}
}
}
}

View File

@@ -1,10 +1,9 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Reflection;
using System.Text.RegularExpressions;
using System.Linq;
using System;
using System.Text.RegularExpressions;
namespace NLangDetect.Core.Utils
{
@@ -29,7 +28,7 @@ namespace NLangDetect.Core.Utils
private static Dictionary<string, string> LoadMessages()
{
var manifestName = typeof(Messages).Assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1) ;
var manifestName = typeof(Messages).Assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1);
Stream messagesStream =
typeof(Messages).Assembly

View File

@@ -6,14 +6,14 @@ using NLangDetect.Core.Extensions;
namespace NLangDetect.Core.Utils
{
public class NGram
{
public const int GramsCount = 3;
public class NGram
{
public const int GramsCount = 3;
private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
private static readonly string[] CjkClass =
{
private static readonly string[] CjkClass =
{
#region CJK classes
Messages.getString("NGram.KANJI_1_0"),
@@ -146,185 +146,185 @@ namespace NLangDetect.Core.Utils
#endregion
};
private static readonly Dictionary<char, char> _cjkMap;
private static readonly Dictionary<char, char> _cjkMap;
private StringBuilder _grams;
private bool _capitalword;
private StringBuilder _grams;
private bool _capitalword;
#region Constructor(s)
#region Constructor(s)
static NGram()
{
_cjkMap = new Dictionary<char, char>();
foreach (string cjk_list in CjkClass)
{
char representative = cjk_list[0];
for (int i = 0; i < cjk_list.Length; i++)
static NGram()
{
_cjkMap.Add(cjk_list[i], representative);
}
}
}
_cjkMap = new Dictionary<char, char>();
public NGram()
{
_grams = new StringBuilder(" ");
_capitalword = false;
}
#endregion
#region Public methods
public static char Normalize(char ch)
{
UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();
if (!unicodeBlock.HasValue)
{
return ch;
}
switch (unicodeBlock.Value)
{
case UnicodeBlock.BasicLatin:
{
if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
foreach (string cjk_list in CjkClass)
{
return ' ';
char representative = cjk_list[0];
for (int i = 0; i < cjk_list.Length; i++)
{
_cjkMap.Add(cjk_list[i], representative);
}
}
break;
}
case UnicodeBlock.Latin1Supplement:
{
if (Latin1Excluded.IndexOf(ch) >= 0)
{
return ' ';
}
break;
}
case UnicodeBlock.GeneralPunctuation:
{
return ' ';
}
case UnicodeBlock.Arabic:
{
if (ch == '\u06cc')
{
return '\u064a';
}
break;
}
case UnicodeBlock.LatinExtendedAdditional:
{
if (ch >= '\u1ea0')
{
return '\u1ec3';
}
break;
}
case UnicodeBlock.Hiragana:
{
return '\u3042';
}
case UnicodeBlock.Katakana:
{
return '\u30a2';
}
case UnicodeBlock.Bopomofo:
case UnicodeBlock.BopomofoExtended:
{
return '\u3105';
}
case UnicodeBlock.CjkUnifiedIdeographs:
{
if (_cjkMap.ContainsKey(ch))
{
return _cjkMap[ch];
}
break;
}
case UnicodeBlock.HangulSyllables:
{
return '\uac00';
}
}
return ch;
}
public void AddChar(char ch)
{
ch = Normalize(ch);
char lastchar = _grams[_grams.Length - 1];
if (lastchar == ' ')
{
_grams = new StringBuilder(" ");
_capitalword = false;
if (ch == ' ') return;
}
else if (_grams.Length >= GramsCount)
{
_grams.Remove(0, 1);
}
_grams.Append(ch);
if (char.IsUpper(ch))
{
if (char.IsUpper(lastchar)) _capitalword = true;
}
else
{
_capitalword = false;
}
}
public string Get(int n)
{
if (_capitalword)
{
return null;
}
int len = _grams.Length;
if (n < 1 || n > 3 || len < n)
{
return null;
}
if (n == 1)
{
char ch = _grams[len - 1];
if (ch == ' ')
{
return null;
}
return ch.ToString();
}
public NGram()
{
_grams = new StringBuilder(" ");
_capitalword = false;
}
// TODO IMM HI: is ToString() here effective?
return _grams.ToString().SubSequence(len - n, len);
#endregion
#region Public methods
/// <summary>
/// Maps a character to the representative character used when building n-grams.
/// </summary>
/// <remarks>
/// The mapping depends on the Unicode block reported by <c>GetUnicodeBlock</c>:
/// <list type="bullet">
/// <item><description>Basic Latin: anything outside <c>A-Z</c> / <c>a-z</c> becomes a space.</description></item>
/// <item><description>Latin-1 Supplement: characters listed in <c>Latin1Excluded</c> become a space.</description></item>
/// <item><description>General Punctuation: always a space.</description></item>
/// <item><description>Arabic: U+06CC is replaced by U+064A.</description></item>
/// <item><description>Latin Extended Additional: characters at or above U+1EA0 become U+1EC3.</description></item>
/// <item><description>Hiragana, Katakana, Bopomofo (incl. Extended) and Hangul Syllables:
/// every character collapses to one fixed character per block (U+3042, U+30A2, U+3105, U+AC00).</description></item>
/// <item><description>CJK Unified Ideographs: replaced by the entry in <c>_cjkMap</c> when one exists.</description></item>
/// </list>
/// Characters with no block, or not matched by any rule above, are returned unchanged.
/// </remarks>
/// <param name="ch">Character to normalize.</param>
/// <returns>The normalized character.</returns>
public static char Normalize(char ch)
{
    UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();

    if (!unicodeBlock.HasValue)
    {
        return ch;
    }

    switch (unicodeBlock.Value)
    {
        case UnicodeBlock.BasicLatin:
            {
                // Only ASCII letters survive; everything else acts as a separator.
                if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
                {
                    return ' ';
                }

                break;
            }

        case UnicodeBlock.Latin1Supplement:
            {
                if (Latin1Excluded.IndexOf(ch) >= 0)
                {
                    return ' ';
                }

                break;
            }

        case UnicodeBlock.GeneralPunctuation:
            {
                return ' ';
            }

        case UnicodeBlock.Arabic:
            {
                if (ch == '\u06cc')
                {
                    return '\u064a';
                }

                break;
            }

        case UnicodeBlock.LatinExtendedAdditional:
            {
                if (ch >= '\u1ea0')
                {
                    return '\u1ec3';
                }

                break;
            }

        case UnicodeBlock.Hiragana:
            {
                return '\u3042';
            }

        case UnicodeBlock.Katakana:
            {
                return '\u30a2';
            }

        case UnicodeBlock.Bopomofo:
        case UnicodeBlock.BopomofoExtended:
            {
                return '\u3105';
            }

        case UnicodeBlock.CjkUnifiedIdeographs:
            {
                // Single lookup instead of ContainsKey followed by the indexer.
                char representative;

                if (_cjkMap.TryGetValue(ch, out representative))
                {
                    return representative;
                }

                break;
            }

        case UnicodeBlock.HangulSyllables:
            {
                return '\uac00';
            }
    }

    return ch;
}
/// <summary>
/// Normalizes <paramref name="ch"/> and appends it to the current gram buffer,
/// updating the capital-word flag.
/// </summary>
/// <remarks>
/// When the buffer currently ends in a space it is reset to a single space first, and a
/// normalized space is then dropped. Otherwise, once the buffer already holds
/// <c>GramsCount</c> characters, the oldest one is removed before appending.
/// The capital-word flag is set when both the previous and the new character are
/// upper case, and cleared whenever the new character is not upper case.
/// </remarks>
/// <param name="ch">Raw character to add.</param>
public void AddChar(char ch)
{
    char normalized = Normalize(ch);
    char previous = _grams[_grams.Length - 1];

    if (previous == ' ')
    {
        // Start a fresh window at a word boundary.
        _grams = new StringBuilder(" ");
        _capitalword = false;

        if (normalized == ' ')
        {
            return;
        }
    }
    else if (_grams.Length >= GramsCount)
    {
        _grams.Remove(0, 1);
    }

    _grams.Append(normalized);

    if (!char.IsUpper(normalized))
    {
        _capitalword = false;
    }
    else if (char.IsUpper(previous))
    {
        _capitalword = true;
    }
}
/// <summary>
/// Returns the last <paramref name="n"/> characters of the gram buffer.
/// </summary>
/// <param name="n">Requested gram length; values outside 1..3 yield <c>null</c>.</param>
/// <returns>
/// <c>null</c> when the capital-word flag is set, when <paramref name="n"/> is out of range
/// or larger than the buffer, or when a 1-gram would be a single space; otherwise the gram text.
/// </returns>
public string Get(int n)
{
    int length = _grams.Length;
    bool sizeRejected = n < 1 || n > 3 || length < n;

    if (_capitalword || sizeRejected)
    {
        return null;
    }

    if (n > 1)
    {
        // TODO IMM HI: is ToString() here effective?
        return _grams.ToString().SubSequence(length - n, length);
    }

    char last = _grams[length - 1];

    return last == ' ' ? null : last.ToString();
}
#endregion
}
#endregion
}
}

View File

@@ -2,75 +2,75 @@ using System.Text;
namespace NLangDetect.Core.Utils
{
public class TagExtractor
{
// TODO IMM HI: do they really need to be internal?
internal string Target;
internal int Threshold;
internal StringBuilder StringBuilder;
internal string Tag;
#region Constructor(s)
public TagExtractor(string tag, int threshold)
public class TagExtractor
{
Target = tag;
Threshold = threshold;
Count = 0;
Clear();
}
// TODO IMM HI: do they really need to be internal?
internal string Target;
internal int Threshold;
internal StringBuilder StringBuilder;
internal string Tag;
#endregion
#region Constructor(s)
#region Public methods
public void Clear()
{
StringBuilder = new StringBuilder();
Tag = null;
}
public void SetTag(string tag)
{
Tag = tag;
}
public void Add(string line)
{
if (Tag == Target && line != null)
{
StringBuilder.Append(line);
}
}
public void CloseTag(LangProfile profile)
{
if (profile != null && Tag == Target && StringBuilder.Length > Threshold)
{
var gram = new NGram();
for (int i = 0; i < StringBuilder.Length; i++)
public TagExtractor(string tag, int threshold)
{
gram.AddChar(StringBuilder[i]);
for (int n = 1; n <= NGram.GramsCount; n++)
{
profile.Add(gram.Get(n));
}
Target = tag;
Threshold = threshold;
Count = 0;
Clear();
}
Count++;
}
#endregion
Clear();
#region Public methods
/// <summary>
/// Resets the extractor's working state: forgets the current tag and
/// replaces the text buffer with a new, empty one.
/// </summary>
public void Clear()
{
    Tag = null;
    StringBuilder = new StringBuilder();
}
/// <summary>
/// Records <paramref name="tag"/> as the current tag.
/// </summary>
/// <param name="tag">Tag name to store in <c>Tag</c>.</param>
public void SetTag(string tag)
{
    this.Tag = tag;
}
/// <summary>
/// Appends <paramref name="line"/> to the text buffer, but only while the
/// current tag equals the target tag.
/// </summary>
/// <param name="line">Text to append; <c>null</c> is ignored.</param>
public void Add(string line)
{
    if (Tag != Target || line == null)
    {
        return;
    }

    StringBuilder.Append(line);
}
/// <summary>
/// Finishes the current tag. When a profile is supplied, the current tag equals the
/// target tag and the buffered text is longer than <c>Threshold</c>, every character of
/// the buffer is fed through an <c>NGram</c> and the resulting grams of length
/// 1..<c>NGram.GramsCount</c> are added to the profile; <c>Count</c> is then incremented.
/// The extractor state is cleared in every case.
/// </summary>
/// <param name="profile">Profile receiving the grams; <c>null</c> skips the extraction.</param>
public void CloseTag(LangProfile profile)
{
    bool qualifies =
        profile != null
        && Tag == Target
        && StringBuilder.Length > Threshold;

    if (qualifies)
    {
        NGram window = new NGram();
        int position = 0;

        while (position < StringBuilder.Length)
        {
            window.AddChar(StringBuilder[position]);

            for (int size = 1; size <= NGram.GramsCount; size++)
            {
                profile.Add(window.Get(size));
            }

            position++;
        }

        Count += 1;
    }

    Clear();
}
#endregion
#region Properties
public int Count { get; private set; }
#endregion
}
#endregion
#region Properties
public int Count { get; private set; }
#endregion
}
}