From fd88e403764081285c30eab8bb1d1ff1c84c1ccc Mon Sep 17 00:00:00 2001 From: Oceania2018 Date: Fri, 26 Oct 2018 08:09:44 -0500 Subject: [PATCH] remove NLP dependency --- BotSharp.Core.UnitTest/ConversationTest.cs | 3 +- .../Accounts/AccountDbInitializer.cs | 2 +- BotSharp.Core/BotSharp.Core.csproj | 2 +- .../BotSharp/BotSharpIntentClassifier.cs | 4 +- BotSharp.Core/Engines/BotSharp/BotSharpNER.cs | 2 +- .../Engines/BotSharp/BotSharpTagger.cs | 8 +- .../Engines/BotSharp/BotSharpTokenizer.cs | 4 +- .../Engines/Jieba.NET/JiebaTagger.cs | 4 +- .../Engines/Jieba.NET/JiebaTokenizer.cs | 4 +- BotSharp.Core/Engines/NlpDoc.cs | 2 +- .../BotSharp.NLP.UnitTest.csproj | 30 - BotSharp.NLP.UnitTest/DefaultTaggerTest.cs | 32 - .../Featuring/CountFeatureExtractorTest.cs | 63 -- BotSharp.NLP.UnitTest/NGramTaggerTest.cs | 118 --- .../NaiveBayesClassifierTest.cs | 185 ---- BotSharp.NLP.UnitTest/README.rst | 8 - BotSharp.NLP.UnitTest/RegexStemmerTest.cs | 25 - BotSharp.NLP.UnitTest/SVMClassifierTest.cs | 98 -- BotSharp.NLP.UnitTest/TestEssential.cs | 28 - .../Tokenize/RegexTokenizerTest.cs | 99 -- .../Tokenize/TreebankTokenizerTest.cs | 121 --- .../Vector/OneHotEncodingTest.cs | 42 - BotSharp.NLP.UnitTest/Vector/Word2VecTest.cs | 31 - BotSharp.NLP/BotSharp.NLP.csproj | 54 -- BotSharp.NLP/Classify/ClassifierFactory.cs | 69 -- BotSharp.NLP/Classify/ClassifyOptions.cs | 30 - BotSharp.NLP/Classify/IClassifier.cs | 29 - BotSharp.NLP/Classify/IEstimator.cs | 10 - .../Classify/ITextFeatureExtractor.cs | 17 - BotSharp.NLP/Classify/NaiveBayesClassifier.cs | 177 ---- BotSharp.NLP/Classify/SVMClassifier.cs | 235 ----- .../Classify/SentenceFeatureExtractor.cs | 24 - BotSharp.NLP/Classify/WordFeatureExtractor.cs | 23 - BotSharp.NLP/Corpus/ConllReader.cs | 53 -- BotSharp.NLP/Corpus/FasttextDataReader.cs | 52 - .../Corpus/LabeledPerFileNameReader.cs | 39 - BotSharp.NLP/Corpus/ReaderOptions.cs | 15 - .../Featuring/CountFeatureExtractor.cs | 92 -- BotSharp.NLP/Featuring/IFeatureExtractor.cs | 45 - .../Featuring/TfIdfFeatureExtractor.cs | 201 ---- .../Featuring/Word2VecFeatureExtractor.cs | 56 -- .../Models/Entropy/AbstractDataIndexer.cs | 230 ----- .../Models/Entropy/BasicContextGenerator.cs | 70 -- .../Models/Entropy/BasicEventReader.cs | 125 --- .../Models/Entropy/ComparableEvent.cs | 220 ----- BotSharp.NLP/Models/Entropy/GisModel.cs | 308 ------ BotSharp.NLP/Models/Entropy/GisTrainer.cs | 886 ----------------- .../Models/Entropy/IContextGenerator.cs | 71 -- .../Models/Entropy/IMaximumEntropyModel.cs | 151 --- .../Models/Entropy/IO/BinaryGisModelReader.cs | 175 ---- .../Models/Entropy/IO/BinaryGisModelWriter.cs | 186 ---- .../Models/Entropy/IO/GisModelReader.cs | 347 ------- .../Models/Entropy/IO/GisModelWriter.cs | 313 ------ .../Models/Entropy/IO/IGisModelReader.cs | 87 -- .../Entropy/IO/JavaBinaryGisModelReader.cs | 123 --- .../Entropy/IO/JavaBinaryGisModelWriter.cs | 140 --- .../Entropy/IO/PlainTextGisModelReader.cs | 111 --- .../Entropy/IO/PlainTextGisModelWriter.cs | 134 --- .../Models/Entropy/ITrainingDataIndexer.cs | 86 -- .../Models/Entropy/ITrainingDataReader.cs | 74 -- .../Models/Entropy/ITrainingEventReader.cs | 66 -- .../Models/Entropy/OnePassDataIndexer.cs | 212 ----- .../Models/Entropy/PatternedPredicate.cs | 118 --- .../Entropy/PlainTextByLineDataReader.cs | 86 -- BotSharp.NLP/Models/Entropy/TrainingEvent.cs | 94 -- .../Models/Entropy/TwoPassDataIndexer.cs | 282 ------ BotSharp.NLP/Models/WordNet/DataFileEngine.cs | 511 ---------- BotSharp.NLP/Models/WordNet/IndexWord.cs | 56 -- .../Morph/AbstractDelegatingOperation.cs | 74 -- .../WordNet/Morph/DetachSuffixesOperation.cs | 67 -- .../Models/WordNet/Morph/IOperation.cs | 46 - .../Morph/LookupExceptionsOperation.cs | 57 -- .../WordNet/Morph/LookupIndexWordOperation.cs | 49 - .../WordNet/Morph/TokenizerOperation.cs | 181 ---- BotSharp.NLP/Models/WordNet/Morph/Util.cs | 89 -- BotSharp.NLP/Models/WordNet/Relation.cs | 85 -- BotSharp.NLP/Models/WordNet/RelationType.cs | 72 -- BotSharp.NLP/Models/WordNet/Synset.cs | 113 --- BotSharp.NLP/Models/WordNet/Tokenizer.cs | 49 - BotSharp.NLP/Models/WordNet/WordNetEngine.cs | 151 --- BotSharp.NLP/NER/README.md | 5 - BotSharp.NLP/README.rst | 41 - BotSharp.NLP/Sentence.cs | 18 - BotSharp.NLP/Stem/IStemmer.cs | 23 - BotSharp.NLP/Stem/RegexStemmer.cs | 47 - BotSharp.NLP/Stem/StemOptions.cs | 14 - BotSharp.NLP/Stem/StemmerFactory.cs | 35 - BotSharp.NLP/SupportedLanguage.cs | 60 -- BotSharp.NLP/Tag/DefaultTagger.cs | 25 - BotSharp.NLP/Tag/ITagger.cs | 24 - BotSharp.NLP/Tag/NGramTagger.cs | 138 --- BotSharp.NLP/Tag/TagOptions.cs | 33 - BotSharp.NLP/Tag/TaggerFactory.cs | 49 - BotSharp.NLP/Tokenize/ITokenizer.cs | 21 - BotSharp.NLP/Tokenize/README.rst | 1 - BotSharp.NLP/Tokenize/RegexTokenizer.cs | 124 --- BotSharp.NLP/Tokenize/Token.cs | 73 -- BotSharp.NLP/Tokenize/TokenizationOptions.cs | 36 - BotSharp.NLP/Tokenize/TokenizerBase.cs | 22 - BotSharp.NLP/Tokenize/TokenizerFactory.cs | 78 -- BotSharp.NLP/Tokenize/TreebankTokenizer.cs | 165 ---- BotSharp.NLP/Txt2Vec/Decoder.cs | 211 ---- BotSharp.NLP/Txt2Vec/Encoder.cs | 900 ------------------ BotSharp.NLP/Txt2Vec/Model.cs | 313 ------ BotSharp.NLP/Txt2Vec/OneHotEncoder.cs | 63 -- BotSharp.NLP/Txt2Vec/Shrink.cs | 93 -- BotSharp.NLP/Txt2Vec/VectorGenerator.cs | 230 ----- BotSharp.WebHost/BotSharp.WebHost.csproj | 2 +- BotSharp.sln | 103 +- 109 files changed, 43 insertions(+), 11335 deletions(-) delete mode 100644 BotSharp.NLP.UnitTest/BotSharp.NLP.UnitTest.csproj delete mode 100644 BotSharp.NLP.UnitTest/DefaultTaggerTest.cs delete mode 100644 BotSharp.NLP.UnitTest/Featuring/CountFeatureExtractorTest.cs delete mode 100644 BotSharp.NLP.UnitTest/NGramTaggerTest.cs delete mode 100644 BotSharp.NLP.UnitTest/NaiveBayesClassifierTest.cs delete mode 100644 BotSharp.NLP.UnitTest/README.rst delete mode 100644 BotSharp.NLP.UnitTest/RegexStemmerTest.cs delete mode 100644 BotSharp.NLP.UnitTest/SVMClassifierTest.cs delete mode 100644 BotSharp.NLP.UnitTest/TestEssential.cs delete mode 100644 BotSharp.NLP.UnitTest/Tokenize/RegexTokenizerTest.cs delete mode 100644 BotSharp.NLP.UnitTest/Tokenize/TreebankTokenizerTest.cs delete mode 100644 BotSharp.NLP.UnitTest/Vector/OneHotEncodingTest.cs delete mode 100644 BotSharp.NLP.UnitTest/Vector/Word2VecTest.cs delete mode 100644 BotSharp.NLP/BotSharp.NLP.csproj delete mode 100644 BotSharp.NLP/Classify/ClassifierFactory.cs delete mode 100644 BotSharp.NLP/Classify/ClassifyOptions.cs delete mode 100644 BotSharp.NLP/Classify/IClassifier.cs delete mode 100644 BotSharp.NLP/Classify/IEstimator.cs delete mode 100644 BotSharp.NLP/Classify/ITextFeatureExtractor.cs delete mode 100644 BotSharp.NLP/Classify/NaiveBayesClassifier.cs delete mode 100644 BotSharp.NLP/Classify/SVMClassifier.cs delete mode 100644 BotSharp.NLP/Classify/SentenceFeatureExtractor.cs delete mode 100644 BotSharp.NLP/Classify/WordFeatureExtractor.cs delete mode 100644 BotSharp.NLP/Corpus/ConllReader.cs delete mode 100644 BotSharp.NLP/Corpus/FasttextDataReader.cs delete mode 100644 BotSharp.NLP/Corpus/LabeledPerFileNameReader.cs delete mode 100644 BotSharp.NLP/Corpus/ReaderOptions.cs delete mode 100644 BotSharp.NLP/Featuring/CountFeatureExtractor.cs delete mode 100644 BotSharp.NLP/Featuring/IFeatureExtractor.cs delete mode 100644 BotSharp.NLP/Featuring/TfIdfFeatureExtractor.cs delete mode 100644 BotSharp.NLP/Featuring/Word2VecFeatureExtractor.cs delete mode 100644 BotSharp.NLP/Models/Entropy/AbstractDataIndexer.cs delete mode 100644 BotSharp.NLP/Models/Entropy/BasicContextGenerator.cs delete mode 100644 BotSharp.NLP/Models/Entropy/BasicEventReader.cs delete mode 100644 BotSharp.NLP/Models/Entropy/ComparableEvent.cs delete mode 100644 BotSharp.NLP/Models/Entropy/GisModel.cs delete mode 100644 BotSharp.NLP/Models/Entropy/GisTrainer.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IContextGenerator.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IMaximumEntropyModel.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IO/BinaryGisModelReader.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IO/BinaryGisModelWriter.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IO/GisModelReader.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IO/GisModelWriter.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IO/IGisModelReader.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IO/JavaBinaryGisModelReader.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IO/JavaBinaryGisModelWriter.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IO/PlainTextGisModelReader.cs delete mode 100644 BotSharp.NLP/Models/Entropy/IO/PlainTextGisModelWriter.cs delete mode 100644 BotSharp.NLP/Models/Entropy/ITrainingDataIndexer.cs delete mode 100644 BotSharp.NLP/Models/Entropy/ITrainingDataReader.cs delete mode 100644 BotSharp.NLP/Models/Entropy/ITrainingEventReader.cs delete mode 100644 BotSharp.NLP/Models/Entropy/OnePassDataIndexer.cs delete mode 100644 BotSharp.NLP/Models/Entropy/PatternedPredicate.cs delete mode 100644 BotSharp.NLP/Models/Entropy/PlainTextByLineDataReader.cs delete mode 100644 BotSharp.NLP/Models/Entropy/TrainingEvent.cs delete mode 100644 BotSharp.NLP/Models/Entropy/TwoPassDataIndexer.cs delete mode 100644 BotSharp.NLP/Models/WordNet/DataFileEngine.cs delete mode 100644 BotSharp.NLP/Models/WordNet/IndexWord.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Morph/AbstractDelegatingOperation.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Morph/DetachSuffixesOperation.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Morph/IOperation.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Morph/LookupExceptionsOperation.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Morph/LookupIndexWordOperation.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Morph/TokenizerOperation.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Morph/Util.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Relation.cs delete mode 100644 BotSharp.NLP/Models/WordNet/RelationType.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Synset.cs delete mode 100644 BotSharp.NLP/Models/WordNet/Tokenizer.cs delete mode 100644 BotSharp.NLP/Models/WordNet/WordNetEngine.cs delete mode 100644 BotSharp.NLP/NER/README.md delete mode 100644 BotSharp.NLP/README.rst delete mode 100644 BotSharp.NLP/Sentence.cs delete mode 100644 BotSharp.NLP/Stem/IStemmer.cs delete mode 100644 BotSharp.NLP/Stem/RegexStemmer.cs delete mode 100644 BotSharp.NLP/Stem/StemOptions.cs delete mode 100644 BotSharp.NLP/Stem/StemmerFactory.cs delete mode 100644 BotSharp.NLP/SupportedLanguage.cs delete mode 100644 BotSharp.NLP/Tag/DefaultTagger.cs delete mode 100644 BotSharp.NLP/Tag/ITagger.cs delete mode 100644 BotSharp.NLP/Tag/NGramTagger.cs delete mode 100644 BotSharp.NLP/Tag/TagOptions.cs delete mode 100644 BotSharp.NLP/Tag/TaggerFactory.cs delete mode 100644 BotSharp.NLP/Tokenize/ITokenizer.cs delete mode 100644 BotSharp.NLP/Tokenize/README.rst delete mode 100644 BotSharp.NLP/Tokenize/RegexTokenizer.cs delete mode 100644 BotSharp.NLP/Tokenize/Token.cs delete mode 100644 BotSharp.NLP/Tokenize/TokenizationOptions.cs delete mode 100644 BotSharp.NLP/Tokenize/TokenizerBase.cs delete mode 100644 BotSharp.NLP/Tokenize/TokenizerFactory.cs delete mode 100644 BotSharp.NLP/Tokenize/TreebankTokenizer.cs delete mode 100644 BotSharp.NLP/Txt2Vec/Decoder.cs delete mode 100644 BotSharp.NLP/Txt2Vec/Encoder.cs delete mode 100644 BotSharp.NLP/Txt2Vec/Model.cs delete mode 100644 BotSharp.NLP/Txt2Vec/OneHotEncoder.cs delete mode 100644 BotSharp.NLP/Txt2Vec/Shrink.cs delete mode 100644 BotSharp.NLP/Txt2Vec/VectorGenerator.cs diff --git a/BotSharp.Core.UnitTest/ConversationTest.cs b/BotSharp.Core.UnitTest/ConversationTest.cs index 8a6b5c4ea..bb9f8f9be 100644 --- a/BotSharp.Core.UnitTest/ConversationTest.cs +++ b/BotSharp.Core.UnitTest/ConversationTest.cs @@ -1,5 +1,4 @@ -ï»¿using BotSharp.NLP; -using Microsoft.VisualStudio.TestTools.UnitTesting; +ï»¿using Microsoft.VisualStudio.TestTools.UnitTesting; using System; using System.Collections.Generic; using System.Linq; diff --git a/BotSharp.Core/Accounts/AccountDbInitializer.cs b/BotSharp.Core/Accounts/AccountDbInitializer.cs index 5f90383b2..dd066622e 100644 --- a/BotSharp.Core/Accounts/AccountDbInitializer.cs +++ b/BotSharp.Core/Accounts/AccountDbInitializer.cs @@ -1,5 +1,5 @@ ï»¿using BotSharp.Core.Abstractions; -using BotSharp.NLP.Tokenize; +using CherubNLP.Tokenize; using EntityFrameworkCore.BootKit; using Newtonsoft.Json; using System; diff --git a/BotSharp.Core/BotSharp.Core.csproj b/BotSharp.Core/BotSharp.Core.csproj index 1b75e69e7..016b8c7c5 100644 --- a/BotSharp.Core/BotSharp.Core.csproj +++ b/BotSharp.Core/BotSharp.Core.csproj @@ -85,7 +85,7 @@ If you feel that this project is helpful to you, please Star on the project, we - + diff --git a/BotSharp.Core/Engines/BotSharp/BotSharpIntentClassifier.cs b/BotSharp.Core/Engines/BotSharp/BotSharpIntentClassifier.cs index 4306effa4..86fca2e42 100644 --- a/BotSharp.Core/Engines/BotSharp/BotSharpIntentClassifier.cs +++ b/BotSharp.Core/Engines/BotSharp/BotSharpIntentClassifier.cs @@ -1,6 +1,6 @@ ï»¿using BotSharp.Core.Abstractions; -using BotSharp.NLP; -using BotSharp.NLP.Classify; +using CherubNLP; +using CherubNLP.Classify; using BotSharp.Platform.Models; using BotSharp.Platform.Models.AiResponse; using BotSharp.Platform.Models.MachineLearning; diff --git a/BotSharp.Core/Engines/BotSharp/BotSharpNER.cs b/BotSharp.Core/Engines/BotSharp/BotSharpNER.cs index 284529e7e..2bc6e2f09 100644 --- a/BotSharp.Core/Engines/BotSharp/BotSharpNER.cs +++ b/BotSharp.Core/Engines/BotSharp/BotSharpNER.cs @@ -2,7 +2,7 @@ using Bigtree.Algorithm.CRFLite; using Bigtree.Algorithm.CRFLite.Decoder; using Bigtree.Algorithm.CRFLite.Encoder; -using BotSharp.NLP.Tokenize; +using CherubNLP.Tokenize; using Microsoft.Extensions.Configuration; using System; using System.Collections.Generic; diff --git a/BotSharp.Core/Engines/BotSharp/BotSharpTagger.cs b/BotSharp.Core/Engines/BotSharp/BotSharpTagger.cs index 30d251506..4276e21ee 100644 --- a/BotSharp.Core/Engines/BotSharp/BotSharpTagger.cs +++ b/BotSharp.Core/Engines/BotSharp/BotSharpTagger.cs @@ -1,8 +1,8 @@ ï»¿using BotSharp.Core.Abstractions; -using BotSharp.NLP; -using BotSharp.NLP.Corpus; -using BotSharp.NLP.Tag; -using BotSharp.NLP.Tokenize; +using CherubNLP; +using CherubNLP.Corpus; +using CherubNLP.Tag; +using CherubNLP.Tokenize; using BotSharp.Platform.Models; using BotSharp.Platform.Models.MachineLearning; using Microsoft.Extensions.Configuration; diff --git a/BotSharp.Core/Engines/BotSharp/BotSharpTokenizer.cs b/BotSharp.Core/Engines/BotSharp/BotSharpTokenizer.cs index 70eebbe3d..254a92fd7 100644 --- a/BotSharp.Core/Engines/BotSharp/BotSharpTokenizer.cs +++ b/BotSharp.Core/Engines/BotSharp/BotSharpTokenizer.cs @@ -1,6 +1,6 @@ ï»¿using BotSharp.Core.Abstractions; -using BotSharp.NLP; -using BotSharp.NLP.Tokenize; +using CherubNLP; +using CherubNLP.Tokenize; using BotSharp.Platform.Models; using BotSharp.Platform.Models.AiResponse; using BotSharp.Platform.Models.MachineLearning; diff --git a/BotSharp.Core/Engines/Jieba.NET/JiebaTagger.cs b/BotSharp.Core/Engines/Jieba.NET/JiebaTagger.cs index 8507be0da..71cc3bc53 100644 --- a/BotSharp.Core/Engines/Jieba.NET/JiebaTagger.cs +++ b/BotSharp.Core/Engines/Jieba.NET/JiebaTagger.cs @@ -1,6 +1,6 @@ ï»¿using BotSharp.Core.Abstractions; -using BotSharp.NLP; -using BotSharp.NLP.Tag; +using CherubNLP; +using CherubNLP.Tag; using JiebaNet.Segmenter; using JiebaNet.Segmenter.PosSeg; using Microsoft.Extensions.Configuration; diff --git a/BotSharp.Core/Engines/Jieba.NET/JiebaTokenizer.cs b/BotSharp.Core/Engines/Jieba.NET/JiebaTokenizer.cs index ef1862ee8..f467bfb55 100644 --- a/BotSharp.Core/Engines/Jieba.NET/JiebaTokenizer.cs +++ b/BotSharp.Core/Engines/Jieba.NET/JiebaTokenizer.cs @@ -1,5 +1,5 @@ ï»¿using BotSharp.Core.Abstractions; -using BotSharp.NLP.Tokenize; +using CherubNLP.Tokenize; using JiebaNet.Segmenter; using Microsoft.Extensions.Configuration; using System; @@ -8,7 +8,7 @@ using System.Linq; using System.Text; using System.Threading.Tasks; -using Token = BotSharp.NLP.Tokenize.Token; +using Token = CherubNLP.Tokenize.Token; namespace BotSharp.Core.Engines.Jieba.NET { diff --git a/BotSharp.Core/Engines/NlpDoc.cs b/BotSharp.Core/Engines/NlpDoc.cs index 17dafd33e..ea0f05aac 100644 --- a/BotSharp.Core/Engines/NlpDoc.cs +++ b/BotSharp.Core/Engines/NlpDoc.cs @@ -1,5 +1,5 @@ ï»¿using BotSharp.Core.Abstractions; -using BotSharp.NLP.Tokenize; +using CherubNLP.Tokenize; using BotSharp.Platform.Models.AiResponse; using BotSharp.Platform.Models.Entities; using System; diff --git a/BotSharp.NLP.UnitTest/BotSharp.NLP.UnitTest.csproj b/BotSharp.NLP.UnitTest/BotSharp.NLP.UnitTest.csproj deleted file mode 100644 index 23efcdacb..000000000 --- a/BotSharp.NLP.UnitTest/BotSharp.NLP.UnitTest.csproj +++ /dev/null @@ -1,30 +0,0 @@ -ï»¿ - - - netcoreapp2.1 - - false - - AnyCPU;x64 - - Debug;Release;RASA NLU;DIALOGFLOW;RASA;ARTICULATE - - - - - - - - - - - - - - - - - - - - diff --git a/BotSharp.NLP.UnitTest/DefaultTaggerTest.cs b/BotSharp.NLP.UnitTest/DefaultTaggerTest.cs deleted file mode 100644 index a829a8940..000000000 --- a/BotSharp.NLP.UnitTest/DefaultTaggerTest.cs +++ /dev/null @@ -1,32 +0,0 @@ -ï»¿using BotSharp.NLP.Corpus; -using BotSharp.NLP.Tag; -using BotSharp.NLP.Tokenize; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.UnitTest -{ - [TestClass] - public class DefaultTaggerTest - { - [TestMethod] - public void TagInCoNLL2000() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("How are you doing?"); - - var tagger = new TaggerFactory(new TagOptions - { - Tag = "NN" - }, SupportedLanguage.English); - - tagger.GetTagger(); - - tagger.Tag(new Sentence { Words = tokens }); - } - } -} \ No newline at end of file diff --git a/BotSharp.NLP.UnitTest/Featuring/CountFeatureExtractorTest.cs b/BotSharp.NLP.UnitTest/Featuring/CountFeatureExtractorTest.cs deleted file mode 100644 index 90177712f..000000000 --- a/BotSharp.NLP.UnitTest/Featuring/CountFeatureExtractorTest.cs +++ /dev/null @@ -1,63 +0,0 @@ -ï»¿using BotSharp.NLP.Featuring; -using BotSharp.NLP.Tokenize; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.UnitTest.Featuring -{ - [TestClass] - public class CountFeatureExtractorTest : TestEssential - { - [TestMethod] - public void TestVectorizer() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var extractor = new CountFeatureExtractor(); - extractor.Sentences = tokenizer.Tokenize(Corpus()); - extractor.Vectorize(new List()); - - var vectors = Vectors(); - - for (int i = 0; i < extractor.Sentences.Count; i++) - { - var sentence = extractor.Sentences[i]; - - for(int j = 0; j < extractor.Features.Count; j++) - { - var word = sentence.Words.Find(w => w.Lemma == extractor.Features[j]); - - if(word != null) - { - Assert.IsTrue(word.Vector == vectors[i][j]); - } - } - } - } - - public List Corpus() - { - return new List - { - "This is the first document.", - "This document is the second document.", - "And this is the third one.", - "Is this the first document?" - }; - } - - public int[][] Vectors() - { - return new int[4][] - { - new int []{ 0, 1, 1, 1, 0, 0, 1, 0, 1 }, - new int []{ 0, 2, 0, 1, 0, 1, 1, 0, 1 }, - new int []{ 1, 0, 0, 1, 1, 0, 1, 1, 1 }, - new int []{ 0, 1, 1, 1, 0, 0, 1, 0, 1 } - }; - } - } -} diff --git a/BotSharp.NLP.UnitTest/NGramTaggerTest.cs b/BotSharp.NLP.UnitTest/NGramTaggerTest.cs deleted file mode 100644 index c4161ba60..000000000 --- a/BotSharp.NLP.UnitTest/NGramTaggerTest.cs +++ /dev/null @@ -1,118 +0,0 @@ -ï»¿using BotSharp.NLP.Corpus; -using BotSharp.NLP.Tag; -using BotSharp.NLP.Tokenize; -using Microsoft.Extensions.Configuration; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.IO; -using System.Text; - -namespace BotSharp.NLP.UnitTest -{ - [TestClass] - public class NGramTaggerTest : TestEssential - { - [TestMethod] - public void UniGramInCoNLL2000() - { - // tokenization - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - Pattern = RegexTokenizer.WORD_PUNC - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment"); - - // test tag - var tagger = new TaggerFactory(new TagOptions - { - CorpusDir = Configuration.GetValue("BotSharp.NLP:dataDir"), - NGram = 1, - Tag = "NN" - }, SupportedLanguage.English); - - tagger.GetTagger(); - - var watch = Stopwatch.StartNew(); - tagger.Tag(new Sentence { Words = tokens }); - watch.Stop(); - var elapsedMs1 = watch.ElapsedMilliseconds; - - Assert.IsTrue(tokens[0].Pos == "NNP"); - Assert.IsTrue(tokens[1].Pos == "IN"); - Assert.IsTrue(tokens[2].Pos == "DT"); - Assert.IsTrue(tokens[3].Pos == "NNP"); - - // test if model is loaded repeatly. - watch = Stopwatch.StartNew(); - tagger.Tag(new Sentence { Words = tokens }); - watch.Stop(); - var elapsedMs2 = watch.ElapsedMilliseconds; - - Assert.IsTrue(elapsedMs1 > elapsedMs2 * 100); - } - - [TestMethod] - public void BiGramInCoNLL2000() - { - // tokenization - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - Pattern = RegexTokenizer.WORD_PUNC - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment"); - - // test tag - var tagger = new TaggerFactory(new TagOptions - { - CorpusDir = Configuration.GetValue("BotSharp.NLP:dataDir"), - NGram = 2, - Tag = "NN" - }, SupportedLanguage.English); - - tagger.GetTagger(); - - tagger.Tag(new Sentence { Words = tokens }); - - Assert.IsTrue(tokens[0].Pos == "NNP"); - Assert.IsTrue(tokens[1].Pos == "IN"); - Assert.IsTrue(tokens[2].Pos == "DT"); - Assert.IsTrue(tokens[3].Pos == "NNP"); - } - - [TestMethod] - public void TriGramInCoNLL2000() - { - // tokenization - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - Pattern = RegexTokenizer.WORD_PUNC - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment"); - - // test tag - var tagger = new TaggerFactory(new TagOptions - { - CorpusDir = Configuration.GetValue("BotSharp.NLP:dataDir"), - NGram = 3, - Tag = "NN" - }, SupportedLanguage.English); - - tagger.GetTagger(); - - tagger.Tag(new Sentence { Words = tokens }); - - Assert.IsTrue(tokens[0].Pos == "NNP"); - Assert.IsTrue(tokens[1].Pos == "IN"); - Assert.IsTrue(tokens[2].Pos == "DT"); - Assert.IsTrue(tokens[3].Pos == "NNP"); - } - } -} diff --git a/BotSharp.NLP.UnitTest/NaiveBayesClassifierTest.cs b/BotSharp.NLP.UnitTest/NaiveBayesClassifierTest.cs deleted file mode 100644 index b77d14eaf..000000000 --- a/BotSharp.NLP.UnitTest/NaiveBayesClassifierTest.cs +++ /dev/null @@ -1,185 +0,0 @@ -ï»¿using BotSharp.NLP.Classify; -using BotSharp.NLP.Corpus; -using BotSharp.NLP.Tokenize; -using Microsoft.Extensions.Configuration; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using Bigtree.Algorithm.Extensions; -using BotSharp.NLP.Txt2Vec; - -namespace BotSharp.NLP.UnitTest -{ - [TestClass] - public class NaiveBayesClassifierTest : TestEssential - { - [TestMethod] - public void CookingTest() - { - var reader = new FasttextDataReader(); - var sentences = reader.Read(new ReaderOptions - { - DataDir = Path.Combine(Configuration.GetValue("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"), - FileName = "cooking.stackexchange.txt" - }); - - var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var newSentences = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList()); - for(int i = 0; i < newSentences.Count; i++) - { - newSentences[i].Label = sentences[i].Label; - } - sentences = newSentences.ToList(); - - sentences.Shuffle(); - - var options = new ClassifyOptions - { - ModelFilePath = Path.Combine(Configuration.GetValue("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange", "nb.model"), - TrainingCorpusDir = Path.Combine(Configuration.GetValue("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"), - Dimension = 100 - }; - var classifier = new ClassifierFactory(options, SupportedLanguage.English); - - var dataset = sentences.Split(0.7M); - classifier.Train(dataset.Item1); - - int correct = 0; - int total = 0; - dataset.Item2.ForEach(td => - { - var classes = classifier.Classify(td); - if (td.Label == classes[0].Item1) - { - correct++; - } - total++; - }); - - var accuracy = (float)correct / total; - - Assert.IsTrue(accuracy > 0.5); - } - - [TestMethod] - public void GenderTest() - { - var options = new ClassifyOptions - { - TrainingCorpusDir = Path.Combine(Configuration.GetValue("MachineLearning:dataDir"), "Gender") - }; - var classifier = new ClassifierFactory(options, SupportedLanguage.English); - - var corpus = GetLabeledCorpus(options); - - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - Pattern = RegexTokenizer.WORD_PUNC - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - corpus.ForEach(x => x.Words = tokenizer.Tokenize(x.Text)); - - classifier.Train(corpus); - string text = "Bridget"; - classifier.Classify(new Sentence { Text = text, Words = tokenizer.Tokenize(text) }); - - corpus.Shuffle(); - var trainingData = corpus.Skip(2000).ToList(); - classifier.Train(trainingData); - - var testData = corpus.Take(2000).ToList(); - int correct = 0; - testData.ForEach(td => - { - var classes = classifier.Classify(td); - if(td.Label == classes[0].Item1) - { - correct++; - } - }); - - var accuracy = (float)correct / testData.Count; - } - - private List GetLabeledCorpus(ClassifyOptions options) - { - var reader = new LabeledPerFileNameReader(); - - var genders = new List(); - - var female = reader.Read(new ReaderOptions - { - DataDir = options.TrainingCorpusDir, - FileName = "female.txt" - }); - - genders.AddRange(female); - - var male = reader.Read(new ReaderOptions - { - DataDir = options.TrainingCorpusDir, - FileName = "male.txt" - }); - - genders.AddRange(male); - - return genders; - } - - [TestMethod] - public void SpotifyTest() - { - var reader = new FasttextDataReader(); - var sentences = reader.Read(new ReaderOptions - { - DataDir = Path.Combine(Configuration.GetValue("MachineLearning:dataDir"), "Text Classification", "spotify"), - FileName = "spotify.txt" - }); - - var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var newSentences = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList()); - for (int i = 0; i < newSentences.Count; i++) - { - newSentences[i].Label = sentences[i].Label; - } - sentences = newSentences.ToList(); - - sentences.Shuffle(); - - var options = new ClassifyOptions - { - ModelFilePath = Path.Combine(Configuration.GetValue("MachineLearning:dataDir"), "Text Classification", "spotify", "nb.model"), - TrainingCorpusDir = Path.Combine(Configuration.GetValue("MachineLearning:dataDir"), "Text Classification", "spotify") - }; - var classifier = new ClassifierFactory(options, SupportedLanguage.English); - - var dataset = sentences.Split(0.7M); - classifier.Train(dataset.Item1); - - int correct = 0; - int total = 0; - dataset.Item2.ForEach(td => - { - var classes = classifier.Classify(td); - if (td.Label == classes[0].Item1) - { - correct++; - } - total++; - }); - - var accuracy = (float)correct / total; - - Assert.IsTrue(accuracy > 0.6); - } - - } -} diff --git a/BotSharp.NLP.UnitTest/README.rst b/BotSharp.NLP.UnitTest/README.rst deleted file mode 100644 index 4a8da75f0..000000000 --- a/BotSharp.NLP.UnitTest/README.rst +++ /dev/null @@ -1,8 +0,0 @@ -Before you run the UnitTest, please config the environment variables in the app.json which is located under Settings directory. -:: - - { - "BotSharp.NLP": { - "dataDir": "D:\\Projects\\BotSharp.NLP\\Data" - } - } \ No newline at end of file diff --git a/BotSharp.NLP.UnitTest/RegexStemmerTest.cs b/BotSharp.NLP.UnitTest/RegexStemmerTest.cs deleted file mode 100644 index 8bf0fc989..000000000 --- a/BotSharp.NLP.UnitTest/RegexStemmerTest.cs +++ /dev/null @@ -1,25 +0,0 @@ -ï»¿using BotSharp.NLP.Stem; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.UnitTest -{ - [TestClass] - public class RegexStemmerTest - { - [TestMethod] - public void StemInDefault() - { - var stemmer = new StemmerFactory(new StemOptions - { - Pattern = RegexStemmer.DEFAULT - }, SupportedLanguage.English); - - var stem = stemmer.Stem("doing"); - - Assert.IsTrue(stem == "do"); - } - } -} diff --git a/BotSharp.NLP.UnitTest/SVMClassifierTest.cs b/BotSharp.NLP.UnitTest/SVMClassifierTest.cs deleted file mode 100644 index 19e5a32b3..000000000 --- a/BotSharp.NLP.UnitTest/SVMClassifierTest.cs +++ /dev/null @@ -1,98 +0,0 @@ -ï»¿using BotSharp.NLP.Classify; -using BotSharp.NLP.Corpus; -using BotSharp.NLP.Tokenize; -using Microsoft.Extensions.Configuration; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.IO; -using System.Text; -using Txt2Vec; - -namespace BotSharp.NLP.UnitTest -{ - [TestClass] - public class SVMClassifierTest : TestEssential - { - [TestMethod] - public void TFIDFTest() - { - string[] documents = - { - "Hello, how are you!", - "Hi Bolo!", - "Hey Haiping!", - "Hello Haiping", - "hi, how do you do?", - "goodbye Haiping", - "see you Bolo", - "byebye Haiping" - }; - /*TFIDFGenerator tfidfGenerator = new TFIDFGenerator(); - List> weights = tfidfGenerator.TFIDFWeightVectorsForSentences(documents);*/ - } - - [TestMethod] - public void Doc2VectorTest() - { - List sentences = new List(); - sentences.Add("The sun in the sky is bright."); - sentences.Add("We can see the shining sun, the bright sun."); - Args args = new Args(); - args.ModelFile = "C:\\Users\\bpeng\\Desktop\\BoloReborn\\BotSharp\\BotSharp.WebHost\\App_Data\\wordvec_enu.bin"; - VectorGenerator vg = new VectorGenerator(args); - var list = vg.Sentence2Vec(sentences); - } - - [TestMethod] - public void similarityTest() - { - List sentences = new List(); - sentences.Add("How's it going"); - sentences.Add("How's your day"); - sentences.Add("How's everything"); - sentences.Add("Good morning"); - sentences.Add("Good afternoon"); - sentences.Add("Good evening"); - sentences.Add("I appreciate it"); - sentences.Add("Thanks a lot"); - sentences.Add("Thank you"); - - - Args args = new Args(); - args.ModelFile = "C:\\Users\\bpeng\\Desktop\\BoloReborn\\BotSharp.NLP\\BotSharp.NLP.UnitTest\\wordvec_enu.bin"; - VectorGenerator vg = new VectorGenerator(args); - var list = vg.Sentence2Vec(sentences); - Vec vec1 = vg.SingleSentence2Vec("Good morning"); - Vec vec2 = vg.SingleSentence2Vec("How's it going"); - double score = vg.Similarity(vec1, vec2); - Console.WriteLine("Similarity score: {0}", score); - - vec1 = vg.SingleSentence2Vec("Good morning"); - vec2 = vg.SingleSentence2Vec("How's your day"); - double score1 = vg.Similarity(vec1, vec2); - Console.WriteLine("Similarity score: {0}", score1); - - vec1 = vg.SingleSentence2Vec("Good morning"); - vec2 = vg.SingleSentence2Vec("How's everything"); - double score2 = vg.Similarity(vec1, vec2); - Console.WriteLine("Similarity score: {0}", score2); - - - vec1 = vg.SingleSentence2Vec("Good morning"); - vec2 = vg.SingleSentence2Vec("Good afternoon"); - double score3 = vg.Similarity(vec1, vec2); - Console.WriteLine("Similarity score: {0}", score3); - - vec1 = vg.SingleSentence2Vec("Good morning"); - vec2 = vg.SingleSentence2Vec("I appreciate"); - double score4 = vg.Similarity(vec1, vec2); - Console.WriteLine("Similarity score: {0}", score4); - - vec1 = vg.SingleSentence2Vec("Good morning"); - vec2 = vg.SingleSentence2Vec("Thanks a lot"); - double score5 = vg.Similarity(vec1, vec2); - Console.WriteLine("Similarity score: {0}", score5); - } - } -} diff --git a/BotSharp.NLP.UnitTest/TestEssential.cs b/BotSharp.NLP.UnitTest/TestEssential.cs deleted file mode 100644 index 033a8735b..000000000 --- a/BotSharp.NLP.UnitTest/TestEssential.cs +++ /dev/null @@ -1,28 +0,0 @@ -ï»¿using Microsoft.Extensions.Configuration; -using System; -using System.IO; -using System.Linq; - -namespace BotSharp.NLP.UnitTest -{ - public abstract class TestEssential - { - protected IConfiguration Configuration { get; } - - public TestEssential() - { - var rootDir = Path.GetFullPath($"{Directory.GetCurrentDirectory()}{Path.DirectorySeparatorChar}..{Path.DirectorySeparatorChar}..{Path.DirectorySeparatorChar}..{Path.DirectorySeparatorChar}..{Path.DirectorySeparatorChar}"); - var settingsDir = Path.Combine(rootDir, "BotSharp.WebHost", "Settings"); - - ConfigurationBuilder configurationBuilder = new ConfigurationBuilder(); - var settings = Directory.GetFiles(settingsDir, "*.json"); - settings.ToList().ForEach(setting => - { - configurationBuilder.AddJsonFile(setting, optional: false, reloadOnChange: true); - }); - Configuration = configurationBuilder.Build(); - } - } - - -} diff --git a/BotSharp.NLP.UnitTest/Tokenize/RegexTokenizerTest.cs b/BotSharp.NLP.UnitTest/Tokenize/RegexTokenizerTest.cs deleted file mode 100644 index e305ca204..000000000 --- a/BotSharp.NLP.UnitTest/Tokenize/RegexTokenizerTest.cs +++ /dev/null @@ -1,99 +0,0 @@ -using BotSharp.NLP.Tokenize; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System.Collections.Generic; - -namespace BotSharp.NLP.UnitTest.Tokenize -{ - [TestClass] - public class RegexTokenizerTest - { - [TestMethod] - public void TokenizeInWhiteSpace() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - Pattern = RegexTokenizer.WHITE_SPACE - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?"); - - Assert.IsTrue(tokens[0].Start == 0); - Assert.IsTrue(tokens[0].Text == "Chop"); - - Assert.IsTrue(tokens[1].Start == 5); - Assert.IsTrue(tokens[1].Text == "into"); - - Assert.IsTrue(tokens[2].Start == 10); - Assert.IsTrue(tokens[2].Text == "pieces,"); - - Assert.IsTrue(tokens[3].Start == 18); - Assert.IsTrue(tokens[3].Text == "isn't"); - - Assert.IsTrue(tokens[4].Start == 24); - Assert.IsTrue(tokens[4].Text == "it?"); - } - - [TestMethod] - public void TokenizeInWordPunctuation() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - Pattern = RegexTokenizer.WORD_PUNC, - SpecialWords = new List { "n't" } - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?"); - - Assert.IsTrue(tokens[0].Start == 0); - Assert.IsTrue(tokens[0].Text == "Chop"); - - Assert.IsTrue(tokens[1].Start == 5); - Assert.IsTrue(tokens[1].Text == "into"); - - Assert.IsTrue(tokens[2].Start == 10); - Assert.IsTrue(tokens[2].Text == "pieces"); - - Assert.IsTrue(tokens[3].Start == 16); - Assert.IsTrue(tokens[3].Text == ","); - - Assert.IsTrue(tokens[4].Start == 18); - Assert.IsTrue(tokens[4].Text == "is"); - - Assert.IsTrue(tokens[5].Start == 20); - Assert.IsTrue(tokens[5].Text == "n't"); - - Assert.IsTrue(tokens[6].Start == 24); - Assert.IsTrue(tokens[6].Text == "it"); - - Assert.IsTrue(tokens[7].Start == 26); - Assert.IsTrue(tokens[7].Text == "?"); - } - - [TestMethod] - public void TokenizeInBlankLine() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - Pattern = RegexTokenizer.BLANK_LINE - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize(@"Chop into pieces, - -isn't - -it?"); - - Assert.IsTrue(tokens[0].Start == 0); - Assert.IsTrue(tokens[0].Text == "Chop into pieces,"); - - Assert.IsTrue(tokens[1].Start == 18); - Assert.IsTrue(tokens[1].Text == "isn't"); - - Assert.IsTrue(tokens[2].Start == 28); - Assert.IsTrue(tokens[2].Text == "it?"); - } - } -} diff --git a/BotSharp.NLP.UnitTest/Tokenize/TreebankTokenizerTest.cs b/BotSharp.NLP.UnitTest/Tokenize/TreebankTokenizerTest.cs deleted file mode 100644 index ef72b1234..000000000 --- a/BotSharp.NLP.UnitTest/Tokenize/TreebankTokenizerTest.cs +++ /dev/null @@ -1,121 +0,0 @@ -ï»¿using BotSharp.NLP.Tokenize; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.UnitTest.Tokenize -{ - [TestClass] - public class TreebankTokenizerTest - { - [TestMethod] - public void ReplaceStartingQuoting() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("Â«Hello!"); - - Assert.IsTrue(tokens[0].Text == "Â«"); - Assert.IsTrue(tokens[0].Start == 0); - - Assert.IsTrue(tokens[1].Text == "Hello"); - Assert.IsTrue(tokens[1].Start == 1); - - Assert.IsTrue(tokens[2].Text == "!"); - Assert.IsTrue(tokens[2].Start == 6); - } - - [TestMethod] - public void ReplaceEndingQuoting() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("Aren't you"); - - Assert.IsTrue(tokens[0].Text == "Are"); - Assert.IsTrue(tokens[0].Start == 0); - - Assert.IsTrue(tokens[1].Text == "n't"); - Assert.IsTrue(tokens[1].Start == 3); - - Assert.IsTrue(tokens[2].Text == "you"); - Assert.IsTrue(tokens[2].Start == 7); - } - - [TestMethod] - public void ReplacePunctuation() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("Hello World..."); - - Assert.IsTrue(tokens[0].Text == "Hello"); - Assert.IsTrue(tokens[0].Start == 0); - - Assert.IsTrue(tokens[1].Text == "World"); - Assert.IsTrue(tokens[1].Start == 6); - - Assert.IsTrue(tokens[2].Text == "..."); - Assert.IsTrue(tokens[2].Start == 11); - } - - [TestMethod] - public void ReplaceBrackets() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize(""); - - Assert.IsTrue(tokens[0].Text == "<"); - Assert.IsTrue(tokens[0].Start == 0); - - Assert.IsTrue(tokens[1].Text == "Hello"); - Assert.IsTrue(tokens[1].Start == 1); - - Assert.IsTrue(tokens[2].Text == "."); - Assert.IsTrue(tokens[2].Start == 6); - - Assert.IsTrue(tokens[3].Text == ">"); - Assert.IsTrue(tokens[3].Start == 7); - } - - [TestMethod] - public void ReplaceConventions() - { - var tokenizer = new TokenizerFactory(new TokenizationOptions - { - }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var tokens = tokenizer.Tokenize("I cannot jump."); - - Assert.IsTrue(tokens[0].Text == "I"); - Assert.IsTrue(tokens[0].Start == 0); - - Assert.IsTrue(tokens[1].Text == "can"); - Assert.IsTrue(tokens[1].Start == 2); - - Assert.IsTrue(tokens[2].Text == "not"); - Assert.IsTrue(tokens[2].Start == 5); - - Assert.IsTrue(tokens[3].Text == "jump"); - Assert.IsTrue(tokens[3].Start == 9); - - Assert.IsTrue(tokens[4].Text == "."); - Assert.IsTrue(tokens[4].Start == 13); - } - } -} diff --git a/BotSharp.NLP.UnitTest/Vector/OneHotEncodingTest.cs b/BotSharp.NLP.UnitTest/Vector/OneHotEncodingTest.cs deleted file mode 100644 index 7c02ea5ae..000000000 --- a/BotSharp.NLP.UnitTest/Vector/OneHotEncodingTest.cs +++ /dev/null @@ -1,42 +0,0 @@ -ï»¿using BotSharp.NLP.Corpus; -using BotSharp.NLP.Tokenize; -using BotSharp.NLP.Txt2Vec; -using Microsoft.Extensions.Configuration; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; - -namespace BotSharp.NLP.UnitTest.Vector -{ - [TestClass] - public class OneHotEncodingTest : TestEssential - { - [TestMethod] - public void OneHotTest() - { - var reader = new FasttextDataReader(); - var sentences = reader.Read(new ReaderOptions - { - DataDir = Path.Combine(Configuration.GetValue("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"), - FileName = "cooking.stackexchange.txt" - }); - - var tokenizer = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English); - tokenizer.GetTokenizer(); - - var newSentences = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList()); - for (int i = 0; i < newSentences.Count; i++) - { - newSentences[i].Label = sentences[i].Label; - } - sentences = newSentences.ToList(); - - var encoder = new OneHotEncoder(); - encoder.Sentences = sentences; - encoder.EncodeAll(); - } - } -} diff --git a/BotSharp.NLP.UnitTest/Vector/Word2VecTest.cs b/BotSharp.NLP.UnitTest/Vector/Word2VecTest.cs deleted file mode 100644 index 994fa78f5..000000000 --- a/BotSharp.NLP.UnitTest/Vector/Word2VecTest.cs +++ /dev/null @@ -1,31 +0,0 @@ -ï»¿using BotSharp.NLP.Txt2Vec; -using Microsoft.Extensions.Configuration; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using Txt2Vec; - -namespace BotSharp.NLP.UnitTest.Vector -{ - public class Word2VecTest - { - [TestClass] - public class OneHotEncodingTest : TestEssential - { - [TestMethod] - public void Word2VecTest() - { - string sentence = "stop this song"; - List words = sentence.Split(' ').ToList(); - Args args = new Args(); - args.ModelFile = @"C:\Users\bpeng\Desktop\BoloReborn\Txt2VecDemo\wordvec_enu.bin"; - VectorGenerator vg = new VectorGenerator(args); - - vg.Distance(words); - } - } - } -} diff --git a/BotSharp.NLP/BotSharp.NLP.csproj b/BotSharp.NLP/BotSharp.NLP.csproj deleted file mode 100644 index 99ce811d9..000000000 --- a/BotSharp.NLP/BotSharp.NLP.csproj +++ /dev/null @@ -1,54 +0,0 @@ -ï»¿ - - - netstandard2.0 - AnyCPU;x64 - true - 0.5.0 - Botsharp.NLP is a set of tools for building C# programs to work with human language data. It can be used in common tasks like POS, NER and text classification in the NLP or NLU field. - -BotSharp.NLP has implemented below machine learning algorithms: - -Conditional Random Field (CRF) -Support Vector Machine (SVM) -N-Gram Tagger -Regex Tokenizer -Naive Bayes Classifier - MIT - https://github.com/Oceania2018/BotSharp - Github - https://github.com/Oceania2018/BotSharp - Haiping Chen, Bo Peng - Personal - Added Penn Treebank tokenize standard. - https://raw.githubusercontent.com/Oceania2018/BotSharp/master/BotSharp.WebHost/wwwroot/images/BotSharp.png - BotSharp, NLP, NLU, POS, NER, LSTM, CRF, SVM, Tagger, NaiveBayes - Debug;Release;RASA NLU;DIALOGFLOW;RASA;ARTICULATE - - - - TRACE;DEBUG - bin\RASA - false - - - - TRACE;DEBUG - false - - - - TRACE;DEBUG - false - - - - DEBUG;TRACE - - - - - - - - diff --git a/BotSharp.NLP/Classify/ClassifierFactory.cs b/BotSharp.NLP/Classify/ClassifierFactory.cs deleted file mode 100644 index 0fc3117ac..000000000 --- a/BotSharp.NLP/Classify/ClassifierFactory.cs +++ /dev/null @@ -1,69 +0,0 @@ -ï»¿using Bigtree.Algorithm.Features; -using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Reflection; -using System.Text; - -namespace BotSharp.NLP.Classify -{ - public class ClassifierFactory - where IFeatureExtractor : ITextFeatureExtractor, new() - { - private SupportedLanguage _lang; - - private IClassifier _classifier; - - private ClassifyOptions _options; - - private IFeatureExtractor featureExtractor; - - public ClassifierFactory(ClassifyOptions options, SupportedLanguage lang) - { - _lang = lang; - _options = options; - featureExtractor = new IFeatureExtractor(); - } - - public IClassifier GetClassifer(string name) - { - List types = new List(); - - types.AddRange(Assembly.Load(new AssemblyName("BotSharp.Core")) - .GetTypes().Where(x => !x.IsAbstract && !x.FullName.StartsWith("<>f__AnonymousType")).ToList()); - - types.AddRange(Assembly.Load(new AssemblyName("BotSharp.NLP")) - .GetTypes().Where(x => !x.IsAbstract && !x.FullName.StartsWith("<>f__AnonymousType")).ToList()); - - Type type = types.FirstOrDefault(x => x.Name == name); - var instance = (IClassifier)Activator.CreateInstance(type); - - return _classifier = instance; - } - - public void Train(List sentences) - { - _classifier.Train(sentences, _options); - _classifier.SaveModel(_options); - } - - public List> Classify(Sentence sentence) - { - var options = new ClassifyOptions - { - ModelFilePath = _options.ModelFilePath, - ModelDir = _options.ModelDir, - ModelName = _options.ModelName - }; - - _classifier.LoadModel(options); - - var classes = _classifier.Classify(sentence, options); - - classes = classes.OrderByDescending(x => x.Item2).ToList(); - - return classes; - } - } -} diff --git a/BotSharp.NLP/Classify/ClassifyOptions.cs b/BotSharp.NLP/Classify/ClassifyOptions.cs deleted file mode 100644 index f6ec27513..000000000 --- a/BotSharp.NLP/Classify/ClassifyOptions.cs +++ /dev/null @@ -1,30 +0,0 @@ -ï»¿using Bigtree.Algorithm.SVM; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Classify -{ - public class ClassifyOptions - { - public string TrainingCorpusDir { get; set; } - public string ModelFilePath { get; set; } - public string ModelDir { get; set; } - public string ModelName { get; set; } - public string Word2VecFilePath { get; set; } - - public string FeaturesFileName { get; set; } - public string FeaturesInTfIdfFileName { get; set; } - public string DictionaryFileName { get; set; } - public string CategoriesFileName { get; set; } - - public string PrediceOutputFile { get; set; } - public string TransformFilePath { get; set; } - public RangeTransform Transform { get; set; } - - ///

- /// Feature dimension - ///

- public int Dimension { get; set; } - } -} diff --git a/BotSharp.NLP/Classify/IClassifier.cs b/BotSharp.NLP/Classify/IClassifier.cs deleted file mode 100644 index 85e8f96a1..000000000 --- a/BotSharp.NLP/Classify/IClassifier.cs +++ /dev/null @@ -1,29 +0,0 @@ -ï»¿using Bigtree.Algorithm.Features; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Classify -{ - public interface IClassifier - { - ///

- /// Training by feature vector - ///

- /// - /// - void Train(List sentences, ClassifyOptions options); - - ///

- /// Predict by feature vector - ///

- /// - /// - /// - List> Classify(Sentence sentence, ClassifyOptions options); - - String SaveModel(ClassifyOptions options); - - Object LoadModel(ClassifyOptions options); - } -} diff --git a/BotSharp.NLP/Classify/IEstimator.cs b/BotSharp.NLP/Classify/IEstimator.cs deleted file mode 100644 index 305193e14..000000000 --- a/BotSharp.NLP/Classify/IEstimator.cs +++ /dev/null @@ -1,10 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Classify -{ - public interface IEstimator - { - } -} diff --git a/BotSharp.NLP/Classify/ITextFeatureExtractor.cs b/BotSharp.NLP/Classify/ITextFeatureExtractor.cs deleted file mode 100644 index ab4ce8f5b..000000000 --- a/BotSharp.NLP/Classify/ITextFeatureExtractor.cs +++ /dev/null @@ -1,17 +0,0 @@ -ï»¿using Bigtree.Algorithm.Features; -using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Classify -{ - ///

- /// Featuring text - ///

- public interface ITextFeatureExtractor - { - List GetFeatures(List words); - - } -} diff --git a/BotSharp.NLP/Classify/NaiveBayesClassifier.cs b/BotSharp.NLP/Classify/NaiveBayesClassifier.cs deleted file mode 100644 index 21c002cd7..000000000 --- a/BotSharp.NLP/Classify/NaiveBayesClassifier.cs +++ /dev/null @@ -1,177 +0,0 @@ -ï»¿/* - * BotSharp.NLP Library - * Copyright (C) 2018 Haiping Chen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -using Bigtree.Algorithm; -using Bigtree.Algorithm.Bayes; -using Bigtree.Algorithm.Estimators; -using Bigtree.Algorithm.Extensions; -using Bigtree.Algorithm.Features; -using Bigtree.Algorithm.Statistics; -using BotSharp.NLP.Featuring; -using BotSharp.NLP.Txt2Vec; -using Newtonsoft.Json; -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace BotSharp.NLP.Classify -{ - ///

- /// This is a simple (naive) classification method based on Bayes rule. - /// It relies on a very simple representation of the document (called the bag of words representation) - /// This technique works well for topic classification; - /// say we have a set of academic papers, and we want to classify them into different topics (computer science, biology, mathematics). - /// Naive Bayes is best for Less training data - ///

- public class NaiveBayesClassifier : IClassifier - { - private List labelDist; - - private MultinomiaNaiveBayes nb = new MultinomiaNaiveBayes(); - - private Dictionary condProbDictionary = new Dictionary(); - - private List words; - private double[] features = new double[] { 0, 1 }; - - public void Train(List sentences, ClassifyOptions options) - { - var tfidf = new TfIdfFeatureExtractor(); - tfidf.Dimension = options.Dimension; - tfidf.Sentences = sentences; - tfidf.CalBasedOnCategory(); - - var encoder = new OneHotEncoder(); - encoder.Sentences = sentences; - encoder.Words = tfidf.Keywords(); - words = encoder.EncodeAll(); - - var featureSets = sentences.Select(x => new Tuple(x.Label, x.Vector)).ToList(); - - labelDist = featureSets.GroupBy(x => x.Item1) - .Select(x => new Probability - { - Value = x.Key, - Freq = x.Count() - }) - .OrderBy(x => x.Value) - .ToList(); - - nb.LabelDist = labelDist; - nb.FeatureSet = featureSets; - - // calculate prior prob - labelDist.ForEach(l => l.Prob = nb.CalPriorProb(l.Value)); - - // calculate posterior prob - // loop features - var featureCount = nb.FeatureSet[0].Item2.Length; - - labelDist.ForEach(label => - { - for (int x = 0; x < featureCount; x++) - { - for (int v = 0; v < features.Length; v++) - { - string key = $"{label.Value} f{x} {features[v]}"; - condProbDictionary[key] = nb.CalCondProb(x, label.Value, features[v]); - } - } - }); - } - - public List> Classify(Sentence sentence, ClassifyOptions options) - { - var encoder = new OneHotEncoder(); - encoder.Words = words; - encoder.Encode(sentence); - - var results = new List>(); - - // calculate prop - labelDist.ForEach(lf => - { - var prob = nb.CalPosteriorProb(lf.Value, sentence.Vector, lf.Prob, condProbDictionary); - results.Add(new Tuple(lf.Value, prob)); - }); - - /*Parallel.ForEach(labelDist, (lf) => - { - nb.Y = lf.Value; - lf.Prob = nb.PosteriorProb(); - });*/ - - double total = results.Select(x => x.Item2).Sum(); - return results.Select(x => new Tuple(x.Item1, x.Item2 / total)).ToList(); - } - - public string SaveModel(ClassifyOptions options) - { - // save the model - var model = new MultinomiaNaiveBayesModel - { - LabelDist = labelDist, - CondProbDictionary = condProbDictionary, - Values = words - }; - - //save the file - using (var bw = new BinaryWriter(new FileStream(options.ModelFilePath, FileMode.Create))) - { - var bytes = Encoding.UTF8.GetBytes(JsonConvert.SerializeObject(model)); - bw.Write(bytes); - } - - return options.ModelFilePath; - } - - public Object LoadModel(ClassifyOptions options) - { - string json = String.Empty; - - //read the file - using (var br = new BinaryReader(new FileStream(options.ModelFilePath, FileMode.Open))) - { - byte[] bytes = br.ReadBytes((int)br.BaseStream.Length); - - json = Encoding.UTF8.GetString(bytes); - } - - var model = JsonConvert.DeserializeObject(json); - - labelDist = model.LabelDist; - condProbDictionary = model.CondProbDictionary; - words = model.Values; - - return model; - } - } - - public class FeaturesWithLabel - { - public List Features { get; set; } - public string Label { get; set; } - public FeaturesWithLabel() - { - this.Features = new List(); - } - } -} diff --git a/BotSharp.NLP/Classify/SVMClassifier.cs b/BotSharp.NLP/Classify/SVMClassifier.cs deleted file mode 100644 index 330a006ac..000000000 --- a/BotSharp.NLP/Classify/SVMClassifier.cs +++ /dev/null @@ -1,235 +0,0 @@ -/* - * BotSharp.NLP Library - * Copyright (C) 2018 Bo Peng - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using Bigtree.Algorithm.Features; -using Bigtree.Algorithm.SVM; -using BotSharp.NLP.Featuring; -using BotSharp.NLP.Txt2Vec; -using Newtonsoft.Json; -using Txt2Vec; - -namespace BotSharp.NLP.Classify -{ - ///

- /// This is a simple (naive) classification method based on Support Vector Machine (SVM) - ///

- public class SVMClassifier : IClassifier - { - private List features; - private List> dictionary; - private List categories; - private RangeTransform transform; - private Bigtree.Algorithm.SVM.Model model; - private List featuresInTfIdf; - - public void Train(List sentences, ClassifyOptions options) - { - SVMClassifierTrain(sentences, options); - } - - public void SVMClassifierTrain(List sentences, ClassifyOptions options, SvmType svm = SvmType.C_SVC, KernelType kernel = KernelType.RBF, bool probability = true, string outputFile = null) - { - var tfidf = new TfIdfFeatureExtractor(); - tfidf.Dimension = options.Dimension; - tfidf.Sentences = sentences; - tfidf.CalBasedOnCategory(); - featuresInTfIdf = tfidf.Keywords(); - - // copy test multiclass Model - Problem train = new Problem(); - train.X = GetData(sentences, options).ToArray(); - train.Y = GetLabels(sentences).ToArray(); - train.Count = train.X.Count(); - train.MaxIndex = train.X[0].Count();//int.MaxValue; - - Parameter param = new Parameter(); - transform = RangeTransform.Compute(train); - Problem scaled = transform.Scale(train); - param.Gamma = 1.0 / 3; - param.SvmType = svm; - param.KernelType = kernel; - param.Probability = probability; - - int numberOfClasses = train.Y.OrderBy(x => x).Distinct().Count(); - if (numberOfClasses == 1) - { - Console.Write("Number of classes must greater than one!"); - } - - if (svm == SvmType.C_SVC) - { - for (int i = 0; i < numberOfClasses; i++) - param.Weights[i] = 1; - } - - model = Training.Train(scaled, param); - - Console.Write("Training finished!"); - } - - public List> Classify(Sentence sentence, ClassifyOptions options) - { - var categoryList = new List>(); - - var result = Predict(sentence, options).FirstOrDefault(); - - for(int i = 0; i < result.Length; i++) - { - categoryList.Add(new Tuple(categories[i], result[i])); - } - - return categoryList; - } - - public double[][] Predict(Sentence sentence, ClassifyOptions options) - { - Problem predict = new Problem(); - predict.X = GetData(new List { sentence }, options).ToArray(); - predict.Y = new double[1]; - predict.Count = predict.X.Count(); - predict.MaxIndex = features.Count; - - transform = options.Transform; - Problem scaled = transform.Scale(predict); - - return Prediction.PredictLabelsProbability(model, scaled); - } - - public List GetLabels(List sentences) - { - categories = sentences.Select(x => x.Label).Distinct().OrderBy(x => x).ToList(); - List labels = new List(); - - foreach (var sentence in sentences) - { - var labelId = categories.IndexOf(sentence.Label).ToString(); - labels.Add(double.Parse(labelId)); - } - - return labels; - } - - public List GetData(List sentences, ClassifyOptions options) - { - var extractor = new CountFeatureExtractor(); - //var extractor = new Word2VecFeatureExtractor(); - extractor.ModelFile = options.Word2VecFilePath; - extractor.Sentences = sentences; - if(features != null) - { - extractor.Features = features; - } - - if(dictionary != null) - { - extractor.Dictionary = dictionary; - } - - extractor.Vectorize(featuresInTfIdf); - - if(features == null) - { - features = extractor.Features; - } - - if(dictionary == null) - { - dictionary = extractor.Dictionary; - } - - List datas = new List(); - - foreach (var sentence in sentences) - { - List curNodes = new List(); - - for(int i = 0; i < extractor.Features.Count; i++) - { - - int name = i; - /*var xx = sentence.Words.Find(x => x.Lemma == extractor.Features[i]); - - if (xx == null) - { - curNodes.Add(new Node(name, 0)); - } - else - { - curNodes.Add(new Node(name, xx.Vector)); - }*/ - - curNodes.Add(new Node(i, sentence.Vector[i])); - } - - datas.Add(curNodes.ToArray()); - } - return datas; - } - - public string SaveModel(ClassifyOptions options) - { - options.TransformFilePath = Path.Combine(options.ModelDir, "transform"); - options.FeaturesFileName = Path.Combine(options.ModelDir, "features"); - options.DictionaryFileName = Path.Combine(options.ModelDir, "dictionary"); - options.CategoriesFileName = Path.Combine(options.ModelDir, "categories"); - options.FeaturesInTfIdfFileName = Path.Combine(options.ModelDir, "featuresInTfIdf"); - - File.WriteAllText(options.FeaturesFileName, JsonConvert.SerializeObject(features)); - - File.WriteAllText(options.FeaturesInTfIdfFileName, JsonConvert.SerializeObject(featuresInTfIdf)); - - File.WriteAllText(options.DictionaryFileName, JsonConvert.SerializeObject(dictionary)); - - File.WriteAllText(options.CategoriesFileName, JsonConvert.SerializeObject(categories)); - - RangeTransform.Write(options.TransformFilePath, transform); - Bigtree.Algorithm.SVM.Model.Write(options.ModelFilePath, model); - - return options.ModelFilePath; - } - - object IClassifier.LoadModel(ClassifyOptions options) - { - options.FeaturesFileName = Path.Combine(options.ModelDir, "features"); - options.DictionaryFileName = Path.Combine(options.ModelDir, "dictionary"); - options.ModelFilePath = Path.Combine(options.ModelDir, options.ModelName); - options.TransformFilePath = Path.Combine(options.ModelDir, "transform"); - options.CategoriesFileName = Path.Combine(options.ModelDir, "categories"); - options.FeaturesInTfIdfFileName = Path.Combine(options.ModelDir, "featuresInTfIdf"); - - features = JsonConvert.DeserializeObject>(File.ReadAllText(options.FeaturesFileName)); - - featuresInTfIdf = JsonConvert.DeserializeObject>(File.ReadAllText(options.FeaturesInTfIdfFileName)); - - dictionary = JsonConvert.DeserializeObject>>(File.ReadAllText(options.DictionaryFileName)); - - categories = JsonConvert.DeserializeObject>(File.ReadAllText(options.CategoriesFileName)); - - model = Bigtree.Algorithm.SVM.Model.Read(options.ModelFilePath); - - options.Transform = RangeTransform.Read(options.TransformFilePath); - - return model; - } - } -} diff --git a/BotSharp.NLP/Classify/SentenceFeatureExtractor.cs b/BotSharp.NLP/Classify/SentenceFeatureExtractor.cs deleted file mode 100644 index 15d6c2982..000000000 --- a/BotSharp.NLP/Classify/SentenceFeatureExtractor.cs +++ /dev/null @@ -1,24 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using Bigtree.Algorithm.Features; -using BotSharp.NLP.Tokenize; - -namespace BotSharp.NLP.Classify -{ - public class SentenceFeatureExtractor : ITextFeatureExtractor - { - public List GetFeatures(List words) - { - var features = new List(); - - words.Where(x => x.IsAlpha) - .Distinct() - .ToList() - .ForEach(w => features.Add(new Feature($"contains {w.Text.ToLower()}", "True"))); - - return features; - } - } -} diff --git a/BotSharp.NLP/Classify/WordFeatureExtractor.cs b/BotSharp.NLP/Classify/WordFeatureExtractor.cs deleted file mode 100644 index 22161df12..000000000 --- a/BotSharp.NLP/Classify/WordFeatureExtractor.cs +++ /dev/null @@ -1,23 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; -using Bigtree.Algorithm.Features; -using BotSharp.NLP.Tokenize; - -namespace BotSharp.NLP.Classify -{ - public class WordFeatureExtractor : ITextFeatureExtractor - { - public List GetFeatures(List words) - { - string text = words[0].Text; - var features = new List(); - - features.Add(new Feature("alwayson", "True")); - features.Add(new Feature("startswith", text[0].ToString().ToLower())); - features.Add(new Feature("endswith", text[text.Length - 1].ToString().ToLower())); - - return features; - } - } -} diff --git a/BotSharp.NLP/Corpus/ConllReader.cs b/BotSharp.NLP/Corpus/ConllReader.cs deleted file mode 100644 index ef971f4f7..000000000 --- a/BotSharp.NLP/Corpus/ConllReader.cs +++ /dev/null @@ -1,53 +0,0 @@ -ï»¿using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.IO; -using System.Text; - -namespace BotSharp.NLP.Corpus -{ - ///

- /// A corpus reader for CoNLL-style files. These files consist of a - /// series of sentences, separated by blank lines.Each sentence is - /// encoded using a table(or "grid") of values, where each line - /// corresponds to a single word, and each column corresponds to an - /// annotation type.The set of columns used by CoNLL-style files can - /// vary from corpus to corpus; - ///

- public class CoNLLReader - { - public List Read(ReaderOptions options) - { - var sentences = new List(); - using(StreamReader reader = new StreamReader(Path.Combine(options.DataDir, options.FileName))) - { - string line = reader.ReadLine(); - var sentence = new Sentence { Words = new List { } }; - - while (!reader.EndOfStream) - { - if (String.IsNullOrEmpty(line)) - { - sentences.Add(sentence); - sentence = new Sentence { Words = new List { } }; - } - else - { - var columns = line.Split(' '); - - sentence.Words.Add(new Token - { - Text = columns[0], - Pos = columns[1] - }); - } - - line = reader.ReadLine(); - } - - } - - return sentences; - } - } -} diff --git a/BotSharp.NLP/Corpus/FasttextDataReader.cs b/BotSharp.NLP/Corpus/FasttextDataReader.cs deleted file mode 100644 index 92306a244..000000000 --- a/BotSharp.NLP/Corpus/FasttextDataReader.cs +++ /dev/null @@ -1,52 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using System.Text.RegularExpressions; - -namespace BotSharp.NLP.Corpus -{ - ///

- /// Fasttext labeled data reader - ///

- public class FasttextDataReader - { - public List Read(ReaderOptions options) - { - if (String.IsNullOrEmpty(options.LabelPrefix)) - { - options.LabelPrefix = "__label__"; - } - - var sentences = new List(); - using (StreamReader reader = new StreamReader(Path.Combine(options.DataDir, options.FileName))) - { - while (!reader.EndOfStream) - { - string line = reader.ReadLine(); - if (!String.IsNullOrEmpty(line)) - { - var ms = Regex.Matches(line, options.LabelPrefix + @"\S+") - .Cast() - .ToList(); - - var text = line.Substring(ms.Last().Index + ms.Last().Length + 1); - - ms.ForEach(m => - { - sentences.Add(new Sentence - { - Label = m.Value.Substring(options.LabelPrefix.Length), - Text = text - }); - }); - - } - } - } - - return sentences; - } - } -} diff --git a/BotSharp.NLP/Corpus/LabeledPerFileNameReader.cs b/BotSharp.NLP/Corpus/LabeledPerFileNameReader.cs deleted file mode 100644 index 17bf3cab6..000000000 --- a/BotSharp.NLP/Corpus/LabeledPerFileNameReader.cs +++ /dev/null @@ -1,39 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.IO; -using System.Text; - -namespace BotSharp.NLP.Corpus -{ - ///

- /// It used to read labeled data which is seperated by file. - /// The same category data is in one file. - /// File name is the label. - ///

- public class LabeledPerFileNameReader - { - public List Read(ReaderOptions options) - { - string label = options.FileName.Split('.')[0]; - - var sentences = new List(); - using (StreamReader reader = new StreamReader(Path.Combine(options.DataDir, options.FileName))) - { - while (!reader.EndOfStream) - { - string line = reader.ReadLine(); - if (!String.IsNullOrEmpty(line)) - { - sentences.Add(new Sentence - { - Label = label, - Text = line - }); - } - } - } - - return sentences; - } - } -} diff --git a/BotSharp.NLP/Corpus/ReaderOptions.cs b/BotSharp.NLP/Corpus/ReaderOptions.cs deleted file mode 100644 index b60f6101a..000000000 --- a/BotSharp.NLP/Corpus/ReaderOptions.cs +++ /dev/null @@ -1,15 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Corpus -{ - public class ReaderOptions - { - public string DataDir { get; set; } - - public string FileName { get; set; } - - public string LabelPrefix { get; set; } - } -} diff --git a/BotSharp.NLP/Featuring/CountFeatureExtractor.cs b/BotSharp.NLP/Featuring/CountFeatureExtractor.cs deleted file mode 100644 index a22b4518e..000000000 --- a/BotSharp.NLP/Featuring/CountFeatureExtractor.cs +++ /dev/null @@ -1,92 +0,0 @@ -ï»¿/* - * BotSharp.NLP Library - * Copyright (C) 2018 Haiping Chen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -//using Bigtree.Algorithm.Matrix; -using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace BotSharp.NLP.Featuring -{ - ///

- /// Convert a collection of text documents to a matrix of token counts - ///

- public class CountFeatureExtractor : IFeatureExtractor - { - public int Dimension { get; set; } - public List Sentences { get; set; } - - public List> Dictionary { get; set; } - public List Features { get; set; } - //public Shape Shape { get; set; } - public string ModelFile { get; set; } - - public void Vectorize(List features) - { - CalculateDictionary(); - - int[][] vec = new int[Sentences.Count][]; - - Sentences.ForEach(s => - { - s.Vector = new double[Features.Count]; - for (int i = 0; i < Features.Count; i++) - { - s.Vector[i] = s.Words.Count(w => w.Lemma == Features[i]); - } - - for (int i = 0; i < s.Words.Count; i++) - { - var dic = Dictionary.Find(x => x.Item1 == s.Words[i].Lemma); - if(dic != null) - { - s.Words[i].Vector = s.Words.Count(w => w.Lemma == dic.Item1); - } - } - }); - } - - private void CalculateDictionary() - { - if (Dictionary == null) - { - List allWords = new List(); - - Sentences.ForEach(s => - { - allWords.AddRange(s.Words); - }); - - Features = allWords.Where(w => w.IsAlpha).Select(x => x.Lemma).Distinct().OrderBy(x => x).ToList(); - - Dictionary = new List>(); - - allWords.Select(x => x.Lemma) - .Distinct() - .OrderBy(x => x) - .ToList() - .ForEach(word => - { - Dictionary.Add(new Tuple(word, allWords.Count(x => x.Lemma == word))); - }); - } - } - } -} diff --git a/BotSharp.NLP/Featuring/IFeatureExtractor.cs b/BotSharp.NLP/Featuring/IFeatureExtractor.cs deleted file mode 100644 index fa5eda4f7..000000000 --- a/BotSharp.NLP/Featuring/IFeatureExtractor.cs +++ /dev/null @@ -1,45 +0,0 @@ -ï»¿//using Bigtree.Algorithm.Matrix; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Featuring -{ - public interface IFeatureExtractor - { - ///

- /// Feature dimension size - ///

- int Dimension { get; set; } - - ///

- /// The whole corpus - ///

- List Sentences { get; set; } - - ///

- /// Feature names - ///

- List Features { get; set; } - - ///

- /// All words and frequency - ///

- List> Dictionary { get; set; } - - ///

- /// Vectorize sentence - ///

- void Vectorize(List features); - - ///

- /// Array shape - ///

- //Shape Shape { get; set; } - - ///

- /// Pre-trained model file path - ///

- string ModelFile { get; set; } - } -} diff --git a/BotSharp.NLP/Featuring/TfIdfFeatureExtractor.cs b/BotSharp.NLP/Featuring/TfIdfFeatureExtractor.cs deleted file mode 100644 index 5de1ce1ab..000000000 --- a/BotSharp.NLP/Featuring/TfIdfFeatureExtractor.cs +++ /dev/null @@ -1,201 +0,0 @@ -ï»¿/* - * BotSharp.NLP Library - * Copyright (C) 2018 Haiping Chen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -//using Bigtree.Algorithm.Matrix; -using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Runtime.Serialization.Formatters.Binary; -using System.Text; -using System.Text.RegularExpressions; - -namespace BotSharp.NLP.Featuring -{ - public class TfIdfFeatureExtractor : IFeatureExtractor - { - public List Sentences { get; set; } - - private List> tfs; - - private List Categories { get; set; } - public int Dimension { get; set; } - - public List> Dictionary { get; set; } - public List Features { get; set; } - //public Shape Shape { get; set; } - public string ModelFile { get; set; } - - public void Extract(Sentence sentence) - { - - } - - public List Keywords() - { - if(Dimension == 0) - { - Dimension = Categories.Count * 3; - - if(Dimension > 300) - { - Dimension = 300; - } - - if(Dimension < 30) - { - Dimension = 30; - } - } - - var tfs2 = tfs.OrderByDescending(x => x.Item2) - .Select(x => x.Item1) - .Distinct() - .Take(Dimension) - .OrderBy(x => x) - .ToList(); - - return tfs2; - } - - public void CalBasedOnSentence() - { - Categories = Sentences.Select(x => x.Label).Distinct().ToList(); - - tfs = new List>(); - - Sentences.ForEach(sent => - { - sent.Words.Where(x => x.IsAlpha).ToList().ForEach(word => - { - // TF - int c1 = sent.Words.Count(x => x.Lemma == word.Lemma); - double tf = (c1 + 1.0) / sent.Words.Count(); - - // IDF - var c2 = Sentences.Count(s => s.Words.Select(x => x.Lemma).Contains(word.Lemma)); - double idf = Math.Log(Sentences.Count / (c2 + 1.0)); - - word.Vector = tf * idf; - - tfs.Add(new Tuple(word.Lemma, word.Vector)); - }); - }); - } - - public void CalBasedOnCategory() - { - tfs = new List>(); - - Categories = Sentences.Select(x => x.Label).Distinct().ToList(); - - List> allTextByCategory = new List>(); - - Categories.ForEach(label => - { - var allTokens = new List(); - Sentences.Where(x => x.Label == label) - .ToList() - .ForEach(s => allTokens.AddRange(s.Words)); - allTextByCategory.Add(new Tuple(label, String.Join(" ", allTokens.Where(x => x.IsAlpha).Select(x => x.Lemma)))); - }); - - Categories.ForEach(label => - { - var allTokens = new List(); - Sentences.Where(x => x.Label == label) - .ToList() - .ForEach(s => allTokens.AddRange(s.Words)); - - allTokens.Where(x => x.IsAlpha).Select(x => x.Lemma).Distinct() - .ToList() - .ForEach(word => - { - // TF - int c1 = allTokens.Count(x => x.Lemma == word); - double tf = (c1 + 1.0) / allTokens.Count(); - - // IDF - var c2 = 0; - allTextByCategory.ForEach(all => - { - if(Regex.IsMatch(all.Item2, word)) - { - c2++; - } - }); - - double idf = Math.Log(Categories.Count / (c2 + 1.0)); - - tfs.Add(new Tuple(word, tf * idf)); - }); - }); - } - - ///

- /// Normalizes a TF*IDF array of vectors using L2-Norm. - /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2) - ///

- /// List> - /// List> - public static List> Normalize(List> vectors) - { - // Normalize the vectors using L2-Norm. - List> normalizedVectors = new List>(); - foreach (var vector in vectors) - { - var normalized = Normalize(vector); - normalizedVectors.Add(normalized); - } - - return normalizedVectors; - } - - ///

- /// Normalizes a TF*IDF vector using L2-Norm. - /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2) - ///

- /// List - /// List - public static List Normalize(List vector) - { - List result = new List(); - - double sumSquared = 0; - foreach (var value in vector) - { - sumSquared += value * value; - } - - double SqrtSumSquared = Math.Sqrt(sumSquared); - - foreach (var value in vector) - { - // L2-norm: Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2) - result.Add(value / SqrtSumSquared); - } - return result; - } - - public void Vectorize(List features) - { - throw new NotImplementedException(); - } - } -} diff --git a/BotSharp.NLP/Featuring/Word2VecFeatureExtractor.cs b/BotSharp.NLP/Featuring/Word2VecFeatureExtractor.cs deleted file mode 100644 index 08f5fe3d6..000000000 --- a/BotSharp.NLP/Featuring/Word2VecFeatureExtractor.cs +++ /dev/null @@ -1,56 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; -//using Bigtree.Algorithm.Matrix; -using Txt2Vec; - -namespace BotSharp.NLP.Featuring -{ - public class Word2VecFeatureExtractor : IFeatureExtractor - { - public int Dimension { get; set; } - public List Sentences { get; set; } - public List> Dictionary { get; set; } - public List Features { get; set; } - //public Shape Shape { get; set; } - public VectorGenerator Vg { get; set; } - public int SentenceVectorSize { get; set; } - public string ModelFile { get; set; } - - public void Vectorize(List features) - { - Init(); - - Sentences.ForEach(s => { - List wordLemmas = new List(); - s.Words.ForEach(word => { - if (features.Contains(word.Lemma)) - { - wordLemmas.Add(word.Lemma); - } - }); - Vec sentenceVec = Vg.Sent2Vec(wordLemmas); - - s.Vector = sentenceVec.VecNodes.ToArray(); - }); - - - } - - private void Init() - { - if(Vg == null) - { - Args args = new Args(); - args.ModelFile = ModelFile; - Vg = new VectorGenerator(args); - SentenceVectorSize = this.Vg.Model.VectorSize; - Features = new List(); - for (int i = 0; i < SentenceVectorSize; i++) - { - Features.Add($"f-{i}"); - } - } - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/AbstractDataIndexer.cs b/BotSharp.NLP/Models/Entropy/AbstractDataIndexer.cs deleted file mode 100644 index 224b408ec..000000000 --- a/BotSharp.NLP/Models/Entropy/AbstractDataIndexer.cs +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the AbstractDataIndexer.java source file found in the -//original java implementation of MaxEnt. - -using System; -using System.Collections.Generic; - -namespace BotSharp.Models -{ - ///

- /// Abstract base for DataIndexer implementations. - ///

- /// - /// Tom Morton - /// - /// - /// Richard J. Northedge - /// - public abstract class AbstractDataIndexer : ITrainingDataIndexer - { - private int[][] mContexts; - private int[] mOutcomeList; - private int[] mNumTimesEventsSeen; - private string[] mPredicateLabels; - private string[] mOutcomeLabels; - - ///

- /// Gets an array of context data calculated from the training data. - ///

- /// - /// Array of integer arrays, each containing the context data for an event. - /// - public virtual int[][] GetContexts() - { - return mContexts; - } - - ///

- /// Sets the array of context data calculated from the training data. - ///

- /// - /// Array of integer arrays, each containing the context data for an event. - /// - protected internal void SetContexts(int[][] newContexts) - { - mContexts = newContexts; - } - - ///

- /// Gets an array indicating how many times each event is seen. - ///

- /// - /// Integer array with event frequencies. - /// - public virtual int[] GetNumTimesEventsSeen() - { - return mNumTimesEventsSeen; - } - - ///

- /// Sets an array indicating how many times each event is seen. - ///

- /// - /// Integer array with event frequencies. - /// - protected internal void SetNumTimesEventsSeen(int[] newNumTimesEventsSeen) - { - mNumTimesEventsSeen = newNumTimesEventsSeen; - } - - ///

- /// Gets an outcome list. - ///

- /// - /// Integer array of outcomes. - /// - public virtual int[] GetOutcomeList() - { - return mOutcomeList; - } - - ///

- /// Sets an outcome list. - ///

- /// - /// Integer array of outcomes. - /// - protected internal void SetOutcomeList(int[] newOutcomeList) - { - mOutcomeList = newOutcomeList; - } - - ///

- /// Gets an array of predicate labels. - ///

- /// - /// Array of predicate labels. - /// - public virtual string[] GetPredicateLabels() - { - return mPredicateLabels; - } - - ///

- /// Sets an array of predicate labels. - ///

- /// - /// Array of predicate labels. - /// - protected internal void SetPredicateLabels(string[] newPredicateLabels) - { - mPredicateLabels = newPredicateLabels; - } - - ///

- /// Gets an array of outcome labels. - ///

- /// - /// Array of outcome labels. - /// - public virtual string[] GetOutcomeLabels() - { - return mOutcomeLabels; - } - - ///

- /// Sets an array of outcome labels. - ///

- /// - /// Array of outcome labels. - /// - protected internal void SetOutcomeLabels(string[] newOutcomeLabels) - { - mOutcomeLabels = newOutcomeLabels; - } - - ///

- /// Sorts and uniques the array of comparable events. This method - /// will alter the eventsToCompare array -- it does an in place - /// sort, followed by an in place edit to remove duplicates. - ///

- /// - /// a List of ComparableEvent values - /// - protected internal virtual void SortAndMerge(List eventsToCompare) - { - eventsToCompare.Sort(); - int eventCount = eventsToCompare.Count; - int uniqueEventCount = 1; // assertion: eventsToCompare.length >= 1 - - if (eventCount <= 1) - { - return; // nothing to do; edge case (see assertion) - } - - ComparableEvent comparableEvent = eventsToCompare[0]; - for (int currentEvent = 1; currentEvent < eventCount; currentEvent++) - { - ComparableEvent eventToCompare = eventsToCompare[currentEvent]; - - if (comparableEvent.Equals(eventToCompare)) - { - comparableEvent.SeenCount++; // increment the seen count - eventsToCompare[currentEvent] = null; // kill the duplicate - } - else - { - comparableEvent = eventToCompare; // a new champion emerges... - uniqueEventCount++; // increment the # of unique events - } - } - - //NotifyProgress("done. Reduced " + eventCount + " events to " + uniqueEventCount + "."); - - mContexts = new int[uniqueEventCount][]; - mOutcomeList = new int[uniqueEventCount]; - mNumTimesEventsSeen = new int[uniqueEventCount]; - - for (int currentEvent = 0, currentStoredEvent = 0; currentEvent < eventCount; currentEvent++) - { - ComparableEvent eventToStore = eventsToCompare[currentEvent]; - if (null == eventToStore) - { - continue; // this was a dupe, skip over it. - } - mNumTimesEventsSeen[currentStoredEvent] = eventToStore.SeenCount; - mOutcomeList[currentStoredEvent] = eventToStore.Outcome; - mContexts[currentStoredEvent] = eventToStore.GetPredicateIndexes(); - ++currentStoredEvent; - } - } - - ///

- /// Utility method for creating a string[] array from a dictionary whose - /// keys are labels (strings) to be stored in the array and whose - /// values are the indices (integers) at which the corresponding - /// labels should be inserted. - ///

- /// - /// a Dictionary value - /// - /// - /// a string[] value - /// - protected internal static string[] ToIndexedStringArray(Dictionary labelToIndexMap) - { - string[] indexedArray = new string[labelToIndexMap.Count]; - int[] indices = new int[labelToIndexMap.Count]; - labelToIndexMap.Keys.CopyTo(indexedArray, 0); - labelToIndexMap.Values.CopyTo(indices, 0); - Array.Sort(indices, indexedArray); - return indexedArray; - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/BasicContextGenerator.cs b/BotSharp.NLP/Models/Entropy/BasicContextGenerator.cs deleted file mode 100644 index 6963a0d5d..000000000 --- a/BotSharp.NLP/Models/Entropy/BasicContextGenerator.cs +++ /dev/null @@ -1,70 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the BasicContextGenerator.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// Generate contexts for maxent decisions, assuming that the input - /// given to the GetContext() method is a string containing contextual - /// predicates separated by spaces, e.g: - ///

- /// cp_1 cp_2 ... cp_n - ///

- ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// based on BasicContextGenerator.java, $Revision: 1.2 $, $Date: 2002/04/30 08:48:35 $ - /// - public class BasicContextGenerator : IContextGenerator - { - ///

- /// Builds up the list of contextual predicates given a string. - ///

- /// - /// string with contextual predicates separated by spaces. - /// - /// string array of contextual predicates. - public virtual string[] GetContext(string input) - { - return input.Split(' '); - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/BasicEventReader.cs b/BotSharp.NLP/Models/Entropy/BasicEventReader.cs deleted file mode 100644 index a1a19532e..000000000 --- a/BotSharp.NLP/Models/Entropy/BasicEventReader.cs +++ /dev/null @@ -1,125 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the BasicEventStream.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// An object which can deliver a stream of training events assuming - /// that each event is represented as a space separated list containing - /// all the contextual predicates, with the last item being the - /// outcome, e.g.: - /// - ///

cp_1 cp_2 ... cp_n outcome

- ///

- public class BasicEventReader : ITrainingEventReader - { - private IContextGenerator mContext; - private ITrainingDataReader mDataReader; - private TrainingEvent mNextEvent; - - ///

- /// Constructor sets up the training event reader based on a stream of training data. - ///

- /// - /// Stream of training data. - /// - public BasicEventReader(ITrainingDataReader dataReader) - { - mContext = new BasicContextGenerator(); - - mDataReader = dataReader; - if (mDataReader.HasNext()) - { - mNextEvent = CreateEvent(mDataReader.NextToken()); - } - } - - ///

- /// Returns the next Event object held in this EventReader. Each call to ReadNextEvent advances the EventReader. - ///

- /// - /// the Event object which is next in this EventReader - /// - public virtual TrainingEvent ReadNextEvent() - { - while (mNextEvent == null && mDataReader.HasNext()) - { - mNextEvent = CreateEvent(mDataReader.NextToken()); - } - - TrainingEvent currentEvent = mNextEvent; - if (mDataReader.HasNext()) - { - mNextEvent = CreateEvent(mDataReader.NextToken()); - } - else - { - mNextEvent = null; - } - return currentEvent; - } - - ///

- /// Test whether there are any Events remaining in this EventReader. - ///

- /// - /// true if this EventReader has more Events - /// - public virtual bool HasNext() - { - while (mNextEvent == null && mDataReader.HasNext()) - { - mNextEvent = CreateEvent(mDataReader.NextToken()); - } - return mNextEvent != null; - } - - private TrainingEvent CreateEvent(string observation) - { - int lastSpace = observation.LastIndexOf((char)' '); - if (lastSpace == -1) - { - return null; - } - else - { - return new TrainingEvent(observation.Substring(lastSpace + 1), mContext.GetContext(observation.Substring(0, (lastSpace) - (0)))); - } - } - } -} - diff --git a/BotSharp.NLP/Models/Entropy/ComparableEvent.cs b/BotSharp.NLP/Models/Entropy/ComparableEvent.cs deleted file mode 100644 index 9c2786335..000000000 --- a/BotSharp.NLP/Models/Entropy/ComparableEvent.cs +++ /dev/null @@ -1,220 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the ComparableEvent.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.Text; - -namespace BotSharp.Models -{ - ///

- /// A Maximum Entropy event representation which we can use to sort based on the - /// predicates indexes contained in the events. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on ComparableEvent.java, $Revision: 1.2 $, $Date: 2001/12/27 19:20:26 $ - /// - public class ComparableEvent : IComparable - { - private int mOutcome; - private int[] mPredicateIndexes ; - private int mSeenCount = 1; - - ///

- /// The outcome ID of this event. - ///

- public int Outcome - { - get - { - return mOutcome; - } - set - { - mOutcome = value; - } - } - - ///

- /// Returns an array containing the indexes of the predicates in this event. - ///

- /// - /// Integer array of predicate indexes. - /// - public int[] GetPredicateIndexes() - { - return mPredicateIndexes; - } - - ///

- /// Sets the array containing the indices of the predicates in this event. - ///

- /// - /// Integer array of predicate indexes. - /// - public void SetPredicateIndexes(int[] predicateIndexes) - { - mPredicateIndexes = predicateIndexes; - } - - ///

- /// The number of times this event - /// has been seen. - ///

- public int SeenCount - { - get - { - return mSeenCount; - } - set - { - mSeenCount = value; - } - } - - ///

- /// Constructor for the ComparableEvent. - ///

- /// - /// The ID of the outcome for this event. - /// - /// - /// Array of indexes for the predicates in this event. - /// - public ComparableEvent(int outcome, int[] predicateIndexes) - { - mOutcome = outcome; - System.Array.Sort(predicateIndexes); - mPredicateIndexes = predicateIndexes; - } - - ///

- /// Implementation of the IComparable interface. - ///

- /// - /// ComparableEvent to compare this event to. - /// - /// - /// A value indicating if the compared object is smaller, greater or the same as this event. - /// - public virtual int CompareTo(ComparableEvent eventToCompare) - { - if (mOutcome < eventToCompare.Outcome) - { - return - 1; - } - else if (mOutcome > eventToCompare.Outcome) - { - return 1; - } - - int smallerLength = (mPredicateIndexes .Length > eventToCompare.GetPredicateIndexes().Length ? eventToCompare.GetPredicateIndexes().Length : GetPredicateIndexes().Length); - - for (int currentIndex = 0; currentIndex < smallerLength; currentIndex++) - { - if (mPredicateIndexes [currentIndex] < eventToCompare.GetPredicateIndexes()[currentIndex]) - { - return - 1; - } - else if (mPredicateIndexes [currentIndex] > eventToCompare.GetPredicateIndexes()[currentIndex]) - { - return 1; - } - } - - if (mPredicateIndexes .Length < eventToCompare.GetPredicateIndexes().Length) - { - return - 1; - } - else if (mPredicateIndexes .Length > eventToCompare.GetPredicateIndexes().Length) - { - return 1; - } - - return 0; - } - - ///

- /// Tests if this event is equal to another object. - ///

- /// - /// Object to test against. - /// - /// - /// True if the objects are equal. - /// - public override bool Equals (object o) - { - if (!(o is ComparableEvent)) - { - return false; - } - return (this.CompareTo(o as ComparableEvent)== 0); - } - - ///

- /// Provides a hashcode for storing events in a dictionary or hashtable. - ///

- /// - /// A hashcode value. - /// - public override int GetHashCode() - { - return this.ToString().GetHashCode(); - } - - ///

- /// Override to provide a succint summary of the ComparableEvent object. - ///

- /// - /// string representation of the ComparableEvent object. - /// - public override string ToString() - { - StringBuilder stringBuilder = new StringBuilder(); - for (int currentIndex = 0; currentIndex < mPredicateIndexes.Length; currentIndex++) - { - stringBuilder.Append(" ").Append(mPredicateIndexes [currentIndex]); - } - return stringBuilder.ToString(); - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/GisModel.cs b/BotSharp.NLP/Models/Entropy/GisModel.cs deleted file mode 100644 index b00017946..000000000 --- a/BotSharp.NLP/Models/Entropy/GisModel.cs +++ /dev/null @@ -1,308 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the GISModel.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.Models -{ - ///

- /// A maximum entropy model which has been trained using the Generalized - /// Iterative Scaling procedure. - ///

- /// - /// Tom Morton and Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on GISModel.java, $Revision: 1.13 $, $Date: 2004/06/11 20:51:44 $ - /// - public sealed class GisModel : IMaximumEntropyModel - { - - private readonly IO.IGisModelReader _reader; - - private readonly string[] _outcomeNames; - - private readonly int _outcomeCount; - private readonly double _initialProbability; - private readonly double _correctionConstantInverse; - - private readonly int[] _featureCounts; - - ///

- /// Constructor for a maximum entropy model trained using the - /// Generalized Iterative Scaling procedure. - ///

- /// - /// A reader providing the data for the model. - /// - public GisModel(IO.IGisModelReader reader) - { - this._reader = reader; - _outcomeNames = reader.GetOutcomeLabels(); - CorrectionConstant = reader.CorrectionConstant; - CorrectionParameter = reader.CorrectionParameter; - - _outcomeCount = _outcomeNames.Length; - _initialProbability = Math.Log(1.0 / _outcomeCount); - _correctionConstantInverse = 1.0 / CorrectionConstant; - _featureCounts = new int[_outcomeCount]; - } - - // implementation of IMaxentModel ------- - - ///

- /// Returns the number of outcomes for this model. - ///

- /// - /// The number of outcomes. - /// - public int OutcomeCount - { - get - { - return (_outcomeCount); - } - } - - ///

- /// Evaluates a context. - ///

- /// - /// A list of string names of the contextual predicates - /// which are to be evaluated together. - /// - /// - /// An array of the probabilities for each of the different - /// outcomes, all of which sum to 1. - /// - public double[] Evaluate(string[] context) - { - return Evaluate(context, new double[_outcomeCount]); - } - - ///

- /// Use this model to evaluate a context and return an array of the - /// likelihood of each outcome given that context. - ///

- /// - /// The names of the predicates which have been observed at - /// the present decision point. - /// - /// - /// This is where the distribution is stored. - /// - /// - /// The normalized probabilities for the outcomes given the - /// context. The indexes of the double[] are the outcome - /// ids, and the actual string representation of the - /// outcomes can be obtained from the method - /// GetOutcome(int outcomeIndex). - /// - public double[] Evaluate(string[] context, double[] outcomeSums) - { - for (int outcomeIndex = 0; outcomeIndex < _outcomeCount; outcomeIndex++) - { - outcomeSums[outcomeIndex] = _initialProbability; - _featureCounts[outcomeIndex] = 0; - } - - foreach (string con in context) - { - _reader.GetPredicateData(con, _featureCounts, outcomeSums); - } - - double normal = 0.0; - for (int outcomeIndex = 0;outcomeIndex < _outcomeCount; outcomeIndex++) - { - outcomeSums[outcomeIndex] = Math.Exp((outcomeSums[outcomeIndex] * _correctionConstantInverse) + ((1.0 - (_featureCounts[outcomeIndex] / CorrectionConstant)) * CorrectionParameter)); - normal += outcomeSums[outcomeIndex]; - } - - for (int outcomeIndex = 0; outcomeIndex < _outcomeCount;outcomeIndex++) - { - outcomeSums[outcomeIndex] /= normal; - } - return outcomeSums; - } - - ///

- /// Return the name of the outcome corresponding to the highest likelihood - /// in the parameter outcomes. - ///

- /// - /// A double[] as returned by the Evaluate(string[] context) - /// method. - /// - /// - /// The name of the most likely outcome. - /// - public string GetBestOutcome(double[] outcomes) - { - int bestOutcomeIndex = 0; - for (int currentOutcome = 1; currentOutcome < outcomes.Length; currentOutcome++) - if (outcomes[currentOutcome] > outcomes[bestOutcomeIndex]) - { - bestOutcomeIndex = currentOutcome; - } - return _outcomeNames[bestOutcomeIndex]; - } - - ///

- /// Return a string matching all the outcome names with all the - /// probabilities produced by the Evaluate(string[] context) - /// method. - ///

- /// - /// A double[] as returned by the - /// eval(string[] context) - /// method. - /// - /// - /// string containing outcome names paired with the normalized - /// probability (contained in the double[] outcomes) - /// for each one. - /// - public string GetAllOutcomes(double[] outcomes) - { - if (outcomes.Length != _outcomeNames.Length) - { - throw new ArgumentException("The double array sent as a parameter to GisModel.GetAllOutcomes() must not have been produced by this model."); - } - else - { - var outcomeInfo = new StringBuilder(outcomes.Length * 2); - outcomeInfo.Append(_outcomeNames[0]).Append("[").Append(outcomes[0].ToString("0.0000", System.Globalization.CultureInfo.CurrentCulture)).Append("]"); - for (int currentOutcome = 1; currentOutcome < outcomes.Length; currentOutcome++) - { - outcomeInfo.Append(" ").Append(_outcomeNames[currentOutcome]).Append("[").Append(outcomes[currentOutcome].ToString("0.0000", System.Globalization.CultureInfo.CurrentCulture)).Append("]"); - } - return outcomeInfo.ToString(); - } - } - - ///

- /// Return the name of an outcome corresponding to an integer ID value. - ///

- /// - /// An outcome ID. - /// - /// - /// The name of the outcome associated with that ID. - /// - public string GetOutcomeName(int outcomeIndex) - { - return _outcomeNames[outcomeIndex]; - } - - ///

- /// Gets the index associated with the string name of the given outcome. - ///

- /// - /// the string name of the outcome for which the - /// index is desired - /// - /// - /// the index if the given outcome label exists for this - /// model, -1 if it does not. - /// - public int GetOutcomeIndex(string outcome) - { - for (int iCurrentOutcomeName = 0; iCurrentOutcomeName < _outcomeNames.Length; iCurrentOutcomeName++) - { - if (_outcomeNames[iCurrentOutcomeName] == outcome) - { - return iCurrentOutcomeName; - } - } - return - 1; - } - - ///

- /// Provides the predicates data structure which is part of the encoding of the maxent model - /// information. This method will usually only be needed by - /// GisModelWriters. - ///

- /// - /// Dictionary containing PatternedPredicate objects. - /// - public Dictionary GetPredicates() - { - return _reader.GetPredicates(); - } - - ///

- /// Provides the list of outcome patterns used by the predicates. This method will usually - /// only be needed by GisModelWriters. - ///

- /// - /// Array of outcome patterns. - /// - public int[][] GetOutcomePatterns() - { - return _reader.GetOutcomePatterns(); - } - - ///

- /// Provides the outcome names data structure which is part of the encoding of the maxent model - /// information. This method will usually only be needed by - /// GisModelWriters. - ///

- /// - /// Array containing the outcome names. - /// - public string[] GetOutcomeNames() - { - return _outcomeNames; - } - - ///

- /// Provides the model's correction constant. - /// This property will usually only be needed by GisModelWriters. - ///

- public int CorrectionConstant { get; private set; } - - ///

- /// Provides the model's correction parameter. - /// This property will usually only be needed by GisModelWriters. - ///

- public double CorrectionParameter { get; private set; } - - } -} diff --git a/BotSharp.NLP/Models/Entropy/GisTrainer.cs b/BotSharp.NLP/Models/Entropy/GisTrainer.cs deleted file mode 100644 index e7d0bba3f..000000000 --- a/BotSharp.NLP/Models/Entropy/GisTrainer.cs +++ /dev/null @@ -1,886 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the GISTrainer.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.Collections; -using System.Collections.Generic; - -namespace BotSharp.Models -{ - ///

- /// An implementation of Generalized Iterative Scaling. The reference paper - /// for this implementation was Adwait Ratnaparkhi's tech report at the - /// University of Pennsylvania's Institute for Research in Cognitive Science, - /// and is available at ftp://ftp.cis.upenn.edu/pub/ircs/tr/97-08.ps.Z. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J, Northedge - /// - /// - /// based on GISTrainer.java, $Revision: 1.15 $, $Date: 2004/06/14 20:52:41 $ - /// - public class GisTrainer : IO.IGisModelReader - { - private int mTokenCount; // # of event tokens - private int mPredicateCount; // # of predicates - private int mOutcomeCount; // # of mOutcomes - private int mTokenID; // global index variable for Tokens - private int mPredicateId; // global index variable for Predicates - private int mOutcomeId; // global index variable for Outcomes - - // records the array of predicates seen in each event - private int[][] mContexts; - - // records the array of outcomes seen in each event - private int[] mOutcomes; - - // records the num of times an event has been seen, paired to - // int[][] mContexts - private int[] mNumTimesEventsSeen; - - // stores the string names of the outcomes. The GIS only tracks outcomes - // as ints, and so this array is needed to save the model to disk and - // thereby allow users to know what the outcome was in human - // understandable terms. - private string[] mOutcomeLabels; - - // stores the string names of the predicates. The GIS only tracks - // predicates as ints, and so this array is needed to save the model to - // disk and thereby allow users to know what the outcome was in human - // understandable terms. - private string[] mPredicateLabels; - - // stores the observed expections of each of the events - private double[][] mObservedExpections; - - // stores the estimated parameter value of each predicate during iteration - private double[][] mParameters; - - // Stores the expected values of the features based on the current models - private double[][] mModelExpections; - - //The maximum number of features fired in an event. Usually referred to as C. - private int mMaximumFeatureCount; - - // stores inverse of constant, 1/C. - private double mMaximumFeatureCountInverse; - - // the correction parameter of the model - private double mCorrectionParameter; - - // observed expectation of correction feature - private double mCorrectionFeatureObservedExpectation; - - // a global variable to help compute the amount to modify the correction - // parameter - private double mCorrectionFeatureModifier; - - private const double mNearZero = 0.01; - private const double mLLThreshold = 0.0001; - - // Stores the output of the current model on a single event durring - // training. This will be reset for every event for every iteration. - private double[] mModelDistribution; - - // Stores the number of features that get fired per event - private int[] mFeatureCounts; - - // initial probability for all outcomes. - private double mInitialProbability; - - private Dictionary mPredicates; - private int[][] mOutcomePatterns; - - // smoothing algorithm (unused) -------- - -// internal class UpdateParametersWithSmoothingProcedure : Trove.IIntDoubleProcedure -// { - -// private double mdSigma = 2.0; - -// public UpdateParametersWithSmoothingProcedure(GisTrainer enclosingInstance) -// { -// moEnclosingInstance = enclosingInstance; -// } -// -// private GisTrainer moEnclosingInstance; -// -// public virtual bool Execute(int outcomeID, double input) -// { -// double x = 0.0; -// double x0 = 0.0; -// double tmp; -// double f; -// double fp; -// for (int i = 0; i < 50; i++) -// { -// // check what domain these parameters are in -// tmp = moEnclosingInstance.maoModelExpections[moEnclosingInstance.miPredicateID][outcomeID] * System.Math.Exp(moEnclosingInstance.miConstant * x0); -// f = tmp + (input + x0) / moEnclosingInstance.mdSigma - moEnclosingInstance.maoObservedExpections[moEnclosingInstance.miPredicateID][outcomeID]; -// fp = tmp * moEnclosingInstance.miConstant + 1 / moEnclosingInstance.mdSigma; -// if (fp == 0) -// { -// break; -// } -// x = x0 - f / fp; -// if (System.Math.Abs(x - x0) < 0.000001) -// { -// x0 = x; -// break; -// } -// x0 = x; -// } -// moEnclosingInstance.maoParameters[moEnclosingInstance.miPredicateID].Put(outcomeID, input + x0); -// return true; -// } -// } - - - // training progress event ----------- - - ///

- /// Used to provide informational messages regarding the - /// progress of the training algorithm. - ///

- public event TrainingProgressEventHandler TrainingProgress; - - ///

- /// Used to raise events providing messages with information - /// about training progress. - ///

- /// - /// Contains the message with information about the progress of - /// the training algorithm. - /// - protected virtual void OnTrainingProgress(TrainingProgressEventArgs e) - { - if (TrainingProgress != null) - { - TrainingProgress(this, e); - } - } - - private void NotifyProgress(string message) - { - OnTrainingProgress(new TrainingProgressEventArgs(message)); - } - - - // training options -------------- - - ///

- /// Sets whether this trainer will use smoothing while training the model. - /// This can improve model accuracy, though training will potentially take - /// longer and use more memory. Model size will also be larger. - ///

- /// - /// Initial testing indicates improvements for models built on small data sets and - /// few outcomes, but performance degradation for those with large data - /// sets and lots of outcomes. - /// - public bool Smoothing { get; set; } - - ///

- /// Sets whether this trainer will use slack parameters while training the model. - ///

- public bool UseSlackParameter { get; set; } - - ///

- /// If smoothing is in use, this value indicates the "number" of - /// times we want the trainer to imagine that it saw a feature that it - /// actually didn't see. Defaulted to 0.1. - ///

- public double SmoothingObservation { get; set; } - - ///

- /// Creates a new GisTrainer instance. - ///

- public GisTrainer() - { - Smoothing = false; - UseSlackParameter = false; - SmoothingObservation = 0.1; - } - - ///

- /// Creates a new GisTrainer instance. - ///

- /// - /// Sets whether this trainer will use slack parameters while training the model. - /// - public GisTrainer(bool useSlackParameter) - { - Smoothing = false; - UseSlackParameter = useSlackParameter; - SmoothingObservation = 0.1; - } - - ///

- /// Creates a new GisTrainer instance. - ///

- /// - /// If smoothing is in use, this value indicates the "number" of - /// times we want the trainer to imagine that it saw a feature that it - /// actually didn't see. Defaulted to 0.1. - /// - public GisTrainer(double smoothingObservation) - { - Smoothing = true; - UseSlackParameter = false; - SmoothingObservation = smoothingObservation; - } - - ///

- /// Creates a new GisTrainer instance. - ///

- /// - /// Sets whether this trainer will use slack parameters while training the model. - /// - /// - /// If smoothing is in use, this value indicates the "number" of - /// times we want the trainer to imagine that it saw a feature that it - /// actually didn't see. Defaulted to 0.1. - /// - public GisTrainer(bool useSlackParameter, double smoothingObservation) - { - Smoothing = true; - UseSlackParameter = useSlackParameter; - SmoothingObservation = smoothingObservation; - } - - - // alternative TrainModel signatures -------------- - - ///

- /// Train a model using the GIS algorithm. - ///

- /// - /// The ITrainingEventReader holding the data on which this model - /// will be trained. - /// - public virtual void TrainModel(ITrainingEventReader eventReader) - { - TrainModel(eventReader, 100, 0); - } - - ///

- /// Train a model using the GIS algorithm. - ///

- /// - /// The ITrainingEventReader holding the data on which this model will be trained - /// - /// The number of GIS iterations to perform - /// - /// The number of times a predicate must be seen in order - /// to be relevant for training. - /// - public virtual void TrainModel(ITrainingEventReader eventReader, int iterations, int cutoff) - { - TrainModel(iterations, new OnePassDataIndexer(eventReader, cutoff)); - } - - - // training algorithm ----------------------------- - - ///

- /// Train a model using the GIS algorithm. - ///

- /// - /// The number of GIS iterations to perform. - /// - /// - /// The data indexer used to compress events in memory. - /// - public virtual void TrainModel(int iterations, ITrainingDataIndexer dataIndexer) - { - int[] outcomeList; - - //incorporate all of the needed info - NotifyProgress("Incorporating indexed data for training..."); - mContexts = dataIndexer.GetContexts(); - mOutcomes = dataIndexer.GetOutcomeList(); - mNumTimesEventsSeen = dataIndexer.GetNumTimesEventsSeen(); - mTokenCount = mContexts.Length; - - // determine the correction constant and its inverse - mMaximumFeatureCount = mContexts[0].Length; - for (mTokenID = 1; mTokenID < mContexts.Length; mTokenID++) - { - if (mContexts[mTokenID].Length > mMaximumFeatureCount) - { - mMaximumFeatureCount = mContexts[mTokenID].Length; - } - } - mMaximumFeatureCountInverse = 1.0 / mMaximumFeatureCount; - - NotifyProgress("done."); - - mOutcomeLabels = dataIndexer.GetOutcomeLabels(); - outcomeList = dataIndexer.GetOutcomeList(); - mOutcomeCount = mOutcomeLabels.Length; - mInitialProbability = Math.Log(1.0 / mOutcomeCount); - - mPredicateLabels = dataIndexer.GetPredicateLabels(); - mPredicateCount = mPredicateLabels.Length; - - NotifyProgress("\tNumber of Event Tokens: " + mTokenCount); - NotifyProgress("\t Number of Outcomes: " + mOutcomeCount); - NotifyProgress("\t Number of Predicates: " + mPredicateCount); - - // set up feature arrays - var predicateCounts = new int[mPredicateCount][]; - for (mPredicateId = 0; mPredicateId < mPredicateCount; mPredicateId++) - { - predicateCounts[mPredicateId] = new int[mOutcomeCount]; - } - for (mTokenID = 0; mTokenID < mTokenCount; mTokenID++) - { - for (int currentContext = 0; currentContext < mContexts[mTokenID].Length; currentContext++) - { - predicateCounts[mContexts[mTokenID][currentContext]][outcomeList[mTokenID]] += mNumTimesEventsSeen[mTokenID]; - } - } - - // A fake "observation" to cover features which are not detected in - // the data. The default is to assume that we observed "1/10th" of a - // feature during training. - double smoothingObservation = SmoothingObservation; - - // Get the observed expectations of the features. Strictly speaking, - // we should divide the counts by the number of Tokens, but because of - // the way the model's expectations are approximated in the - // implementation, this is cancelled out when we compute the next - // iteration of a parameter, making the extra divisions wasteful. - mOutcomePatterns = new int[mPredicateCount][]; - mParameters = new double[mPredicateCount][]; - mModelExpections = new double[mPredicateCount][]; - mObservedExpections = new double[mPredicateCount][]; - - for (mPredicateId = 0; mPredicateId < mPredicateCount; mPredicateId++) - { - int activeOutcomeCount; - if (Smoothing) - { - activeOutcomeCount = mOutcomeCount; - } - else - { - activeOutcomeCount = 0; - for (mOutcomeId = 0; mOutcomeId < mOutcomeCount; mOutcomeId++) - { - if (predicateCounts[mPredicateId][mOutcomeId] > 0) - { - activeOutcomeCount++; - } - } - } - - mOutcomePatterns[mPredicateId] = new int[activeOutcomeCount]; - mParameters[mPredicateId] = new double[activeOutcomeCount]; - mModelExpections[mPredicateId] = new double[activeOutcomeCount]; - mObservedExpections[mPredicateId] = new double[activeOutcomeCount]; - - int currentOutcome = 0; - for (mOutcomeId = 0; mOutcomeId < mOutcomeCount; mOutcomeId++) - { - if (predicateCounts[mPredicateId][mOutcomeId] > 0) - { - mOutcomePatterns[mPredicateId][currentOutcome] = mOutcomeId; - mObservedExpections[mPredicateId][currentOutcome] = Math.Log(predicateCounts[mPredicateId][mOutcomeId]); - currentOutcome++; - } - else if (Smoothing) - { - mOutcomePatterns[mPredicateId][currentOutcome] = mOutcomeId; - mObservedExpections[mPredicateId][currentOutcome] = Math.Log(smoothingObservation); - currentOutcome++; - } - } - } - - // compute the expected value of correction - if (UseSlackParameter) - { - int correctionFeatureValueSum = 0; - for (mTokenID = 0; mTokenID < mTokenCount; mTokenID++) - { - for (int currentContext = 0; currentContext < mContexts[mTokenID].Length; currentContext++) - { - mPredicateId = mContexts[mTokenID][currentContext]; - - if ((!Smoothing) && predicateCounts[mPredicateId][mOutcomes[mTokenID]] == 0) - { - correctionFeatureValueSum += mNumTimesEventsSeen[mTokenID]; - } - } - correctionFeatureValueSum += (mMaximumFeatureCount - mContexts[mTokenID].Length) * mNumTimesEventsSeen[mTokenID]; - } - if (correctionFeatureValueSum == 0) - { - mCorrectionFeatureObservedExpectation = Math.Log(mNearZero); //nearly zero so log is defined - } - else - { - mCorrectionFeatureObservedExpectation = Math.Log(correctionFeatureValueSum); - } - - mCorrectionParameter = 0.0; - } - - NotifyProgress("...done."); - - mModelDistribution = new double[mOutcomeCount]; - mFeatureCounts = new int[mOutcomeCount]; - - //Find the parameters - NotifyProgress("Computing model parameters..."); - FindParameters(iterations); - - NotifyProgress("Converting to new predicate format..."); - ConvertPredicates(); - - } - - ///

- /// Estimate and return the model parameters. - ///

- /// - /// Number of iterations to run through. - /// - private void FindParameters(int iterations) - { - double previousLogLikelihood = 0.0; - NotifyProgress("Performing " + iterations + " iterations."); - for (int currentIteration = 1; currentIteration <= iterations; currentIteration++) - { - if (currentIteration < 10) - { - NotifyProgress(" " + currentIteration + ": "); - } - else if (currentIteration < 100) - { - NotifyProgress(" " + currentIteration + ": "); - } - else - { - NotifyProgress(currentIteration + ": "); - } - double currentLogLikelihood = NextIteration(); - if (currentIteration > 1) - { - if (previousLogLikelihood > currentLogLikelihood) - { - throw new SystemException("Model Diverging: loglikelihood decreased"); - } - if (currentLogLikelihood - previousLogLikelihood < mLLThreshold) - { - break; - } - } - previousLogLikelihood = currentLogLikelihood; - } - - // kill a bunch of these big objects now that we don't need them - mObservedExpections = null; - mModelExpections = null; - mNumTimesEventsSeen = null; - mContexts = null; - } - - ///

- /// Use this model to evaluate a context and return an array of the - /// likelihood of each outcome given that context. - ///

- /// - /// The integers of the predicates which have been - /// observed at the present decision point. - /// - /// - /// The normalized probabilities for the outcomes given the - /// context. The indexes of the double[] are the outcome - /// ids. - /// - protected virtual void Evaluate(int[] context, double[] outcomeSums) - { - for (int outcomeIndex = 0; outcomeIndex < mOutcomeCount; outcomeIndex++) - { - outcomeSums[outcomeIndex] = mInitialProbability; - mFeatureCounts[outcomeIndex] = 0; - } - int[] activeOutcomes; - int outcomeId; - int predicateId; - int currentActiveOutcome; - - for (int currentContext = 0; currentContext < context.Length; currentContext++) - { - predicateId = context[currentContext]; - activeOutcomes = mOutcomePatterns[predicateId]; - for (currentActiveOutcome = 0; currentActiveOutcome < activeOutcomes.Length; currentActiveOutcome++) - { - outcomeId = activeOutcomes[currentActiveOutcome]; - mFeatureCounts[outcomeId]++; - outcomeSums[outcomeId] += mMaximumFeatureCountInverse * mParameters[predicateId][currentActiveOutcome]; - } - } - - double sum = 0.0; - for (int currentOutcomeId = 0; currentOutcomeId < mOutcomeCount; currentOutcomeId++) - { - outcomeSums[currentOutcomeId] = System.Math.Exp(outcomeSums[currentOutcomeId]); - if (UseSlackParameter) - { - outcomeSums[currentOutcomeId] += ((1.0 - ((double) mFeatureCounts[currentOutcomeId] / mMaximumFeatureCount)) * mCorrectionParameter); - } - sum += outcomeSums[currentOutcomeId]; - } - - for (int currentOutcomeId = 0; currentOutcomeId < mOutcomeCount; currentOutcomeId++) - { - outcomeSums[currentOutcomeId] /= sum; - } - } - - ///

- /// Compute one iteration of GIS and retutn log-likelihood. - ///

- /// The log-likelihood. - private double NextIteration() - { - // compute contribution of p(a|b_i) for each feature and the new - // correction parameter - double logLikelihood = 0.0; - mCorrectionFeatureModifier = 0.0; - int eventCount = 0; - int numCorrect = 0; - int outcomeId; - - for (mTokenID = 0; mTokenID < mTokenCount; mTokenID++) - { - Evaluate(mContexts[mTokenID], mModelDistribution); - for (int currentContext = 0; currentContext < mContexts[mTokenID].Length; currentContext++) - { - mPredicateId = mContexts[mTokenID][currentContext]; - for (int currentActiveOutcome = 0; currentActiveOutcome < mOutcomePatterns[mPredicateId].Length; currentActiveOutcome++) - { - outcomeId = mOutcomePatterns[mPredicateId][currentActiveOutcome]; - mModelExpections[mPredicateId][currentActiveOutcome] += (mModelDistribution[outcomeId] * mNumTimesEventsSeen[mTokenID]); - - if (UseSlackParameter) - { - mCorrectionFeatureModifier += mModelDistribution[mOutcomeId] * mNumTimesEventsSeen[mTokenID]; - } - } - } - - if (UseSlackParameter) - { - mCorrectionFeatureModifier += (mMaximumFeatureCount - mContexts[mTokenID].Length) * mNumTimesEventsSeen[mTokenID]; - } - - logLikelihood += System.Math.Log(mModelDistribution[mOutcomes[mTokenID]]) * mNumTimesEventsSeen[mTokenID]; - eventCount += mNumTimesEventsSeen[mTokenID]; - - //calculation solely for the information messages - int max = 0; - for (mOutcomeId = 1; mOutcomeId < mOutcomeCount; mOutcomeId++) - { - if (mModelDistribution[mOutcomeId] > mModelDistribution[max]) - { - max = mOutcomeId; - } - } - if (max == mOutcomes[mTokenID]) - { - numCorrect += mNumTimesEventsSeen[mTokenID]; - } - } - NotifyProgress("."); - - // compute the new parameter values - for (mPredicateId = 0; mPredicateId < mPredicateCount; mPredicateId++) - { - for (int currentActiveOutcome = 0; currentActiveOutcome < mOutcomePatterns[mPredicateId].Length; currentActiveOutcome++) - { - outcomeId = mOutcomePatterns[mPredicateId][currentActiveOutcome]; - mParameters[mPredicateId][currentActiveOutcome] += (mObservedExpections[mPredicateId][currentActiveOutcome] - Math.Log(mModelExpections[mPredicateId][currentActiveOutcome])); - mModelExpections[mPredicateId][currentActiveOutcome] = 0.0;// re-initialize to 0.0's - } - } - - if (mCorrectionFeatureModifier > 0.0 && UseSlackParameter) - { - mCorrectionParameter += (mCorrectionFeatureObservedExpectation - Math.Log(mCorrectionFeatureModifier)); - } - - NotifyProgress(". logLikelihood=" + logLikelihood + "\t" + ((double) numCorrect / eventCount)); - return (logLikelihood); - } - - ///

- /// Convert the predicate data into the outcome pattern / patterned predicate format used by the GIS models. - ///

- private void ConvertPredicates() - { - var predicates = new PatternedPredicate[mParameters.Length]; - - for (mPredicateId = 0; mPredicateId < mPredicateCount; mPredicateId++) - { - double[] parameters = mParameters[mPredicateId]; - predicates[mPredicateId] = new PatternedPredicate(mPredicateLabels[mPredicateId], parameters); - } - - var comparer = new OutcomePatternComparer(); - Array.Sort(mOutcomePatterns, predicates, comparer); - - List outcomePatterns = new List(); - int currentPatternId = 0; - int predicatesInPattern = 0; - int[] currentPattern = mOutcomePatterns[0]; - - for (mPredicateId = 0; mPredicateId < mPredicateCount; mPredicateId++) - { - if (comparer.Compare(currentPattern, mOutcomePatterns[mPredicateId]) == 0) - { - predicates[mPredicateId].OutcomePattern = currentPatternId; - predicatesInPattern++; - } - else - { - int[] pattern = new int[currentPattern.Length + 1]; - pattern[0] = predicatesInPattern; - currentPattern.CopyTo(pattern, 1); - outcomePatterns.Add(pattern); - currentPattern = mOutcomePatterns[mPredicateId]; - currentPatternId++; - predicates[mPredicateId].OutcomePattern = currentPatternId; - predicatesInPattern = 1; - } - } - int[] finalPattern = new int[currentPattern.Length + 1]; - finalPattern[0] = predicatesInPattern; - currentPattern.CopyTo(finalPattern, 1); - outcomePatterns.Add(finalPattern); - - mOutcomePatterns = outcomePatterns.ToArray(); - mPredicates = new Dictionary(predicates.Length); - for (mPredicateId = 0; mPredicateId < mPredicateCount; mPredicateId++) - { - mPredicates.Add(predicates[mPredicateId].Name, predicates[mPredicateId]); - } - } - - - // IGisModelReader implementation -------------------- - - ///

- /// The correction constant for the model produced as a result of training. - ///

- public int CorrectionConstant - { - get - { - return mMaximumFeatureCount; - } - } - - ///

- /// The correction parameter for the model produced as a result of training. - ///

- public double CorrectionParameter - { - get - { - return mCorrectionParameter; - } - } - - ///

- /// Obtains the outcome labels for the model produced as a result of training. - ///

- /// - /// Array of outcome labels. - /// - public string[] GetOutcomeLabels() - { - return mOutcomeLabels; - } - - ///

- /// Obtains the outcome patterns for the model produced as a result of training. - ///

- /// - /// Array of outcome patterns. - /// - public int[][] GetOutcomePatterns() - { - return mOutcomePatterns; - } - - ///

- /// Obtains the predicate data for the model produced as a result of training. - ///

- /// - /// Dictionary containing PatternedPredicate objects. - /// - public Dictionary GetPredicates() - { - return mPredicates; - } - - ///

- /// Returns trained model information for a predicate, given the predicate label. - ///

- /// Compare two outcome patterns and determines which comes first, - /// based on the outcome ids (lower outcome ids first) - ///

- /// - /// First outcome pattern to compare. - /// - /// - /// Second outcome pattern to compare. - /// - /// - public virtual int Compare(int[] firstPattern, int[] secondPattern) - { - int smallerLength = (firstPattern.Length > secondPattern.Length ? secondPattern.Length : firstPattern.Length); - - for (int currentOutcome = 0; currentOutcome < smallerLength; currentOutcome++) - { - if (firstPattern[currentOutcome] < secondPattern[currentOutcome]) - { - return - 1; - } - else if (firstPattern[currentOutcome] > secondPattern[currentOutcome]) - { - return 1; - } - } - - if (firstPattern.Length < secondPattern.Length) - { - return - 1; - } - else if (firstPattern.Length > secondPattern.Length) - { - return 1; - } - - return 0; - } - } - } - - ///

- /// Event arguments class for training progress events. - ///

- public class TrainingProgressEventArgs : EventArgs - { - private string mMessage; - - ///

- /// Constructor for the training progress event arguments. - ///

- /// - /// Information message about the progress of training. - /// - public TrainingProgressEventArgs(string message) - { - mMessage = message; - } - - ///

- /// Information message about the progress of training. - ///

- public string Message - { - get - { - return mMessage; - } - } - } - - ///

- /// Event handler delegate for the training progress event. - ///

- public delegate void TrainingProgressEventHandler(object sender, TrainingProgressEventArgs e); - - -} diff --git a/BotSharp.NLP/Models/Entropy/IContextGenerator.cs b/BotSharp.NLP/Models/Entropy/IContextGenerator.cs deleted file mode 100644 index c79e51430..000000000 --- a/BotSharp.NLP/Models/Entropy/IContextGenerator.cs +++ /dev/null @@ -1,71 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the ContextGenerator.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// Generate contexts for maximum entropy decisions. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on ContextGenerator.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $ - /// - public interface IContextGenerator - { - ///

- /// Builds up the list of contextual predicates given an object. - ///

- string[] GetContext(object input); - } - - ///

- /// Generate contexts for maximum entropy decisions. - ///

- public interface IContextGenerator - { - ///

- /// Builds up the list of contextual predicates given an object of type T. - ///

- string[] GetContext(T input); - } - -} diff --git a/BotSharp.NLP/Models/Entropy/IMaximumEntropyModel.cs b/BotSharp.NLP/Models/Entropy/IMaximumEntropyModel.cs deleted file mode 100644 index d8e9494a6..000000000 --- a/BotSharp.NLP/Models/Entropy/IMaximumEntropyModel.cs +++ /dev/null @@ -1,151 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the MaxentModel.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// Interface for maximum entropy models. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on MaxentModel.java, $Revision: 1.4 $, $Date: 2003/12/09 23:13:53 $ - /// - public interface IMaximumEntropyModel - { - ///

- /// Returns the number of outcomes for this model. - ///

- /// - /// The number of outcomes. - /// - int OutcomeCount - { - get; - } - - ///

- /// Evaluates a context. - ///

- /// - /// A list of string names of the contextual predicates - /// which are to be evaluated together. - /// - /// - /// An array which is populated with the probabilities for each of the different - /// outcomes, all of which sum to 1. - /// - /// - /// an array of the probabilities for each of the different - /// outcomes, all of which sum to 1. The probabilities array is returned if it is appropiately sized. - /// - double[] Evaluate(string[] context, double[] probabilities); - - ///

- /// Simple function to return the outcome associated with the index - /// containing the highest probability in the double[]. - ///

- /// - /// A double[] as returned by the - /// Evaluate(string[] context) - /// method. - /// - /// - /// the string name of the best outcome - /// - string GetBestOutcome(double[] outcomes); - - ///

- /// Return a string matching all the outcome names with all the - /// probabilities produced by the

eval(string[]
-		/// context)

method. - ///

- /// Gets the string name of the outcome associated with the supplied index - ///

- /// - /// the index for which the name of the associated outcome is desired. - /// - /// - /// the string name of the outcome - /// - string GetOutcomeName(int index); - - ///

- /// Gets the index associated with the string name of the given - /// outcome. - ///

- /// - /// the string name of the outcome for which the - /// index is desired - /// - /// - /// the index if the given outcome label exists for this - /// model, -1 if it does not. - /// - int GetOutcomeIndex(string outcome); - } -} diff --git a/BotSharp.NLP/Models/Entropy/IO/BinaryGisModelReader.cs b/BotSharp.NLP/Models/Entropy/IO/BinaryGisModelReader.cs deleted file mode 100644 index d8b684ca8..000000000 --- a/BotSharp.NLP/Models/Entropy/IO/BinaryGisModelReader.cs +++ /dev/null @@ -1,175 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the BinaryGISModelReader.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.IO; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.Models.IO -{ - ///

- /// A reader for GIS models stored in a binary format. This format is not the one - /// used by the java version of MaxEnt. - /// It has two main differences, designed for performance when loading the data - /// from file: first, it uses big endian data values, which is native for C#, and secondly it - /// encodes the outcome patterns and values in a more efficient manner. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on BinaryGISModelReader.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $ - /// - public class BinaryGisModelReader : GisModelReader - { - private readonly Stream _input; - private readonly byte[] _buffer; - private int _stringLength = 0; - private readonly Encoding _encoding = Encoding.UTF8; - - ///

- /// Constructor which directly instantiates the Stream containing - /// the model contents. - ///

- /// - /// The Stream containing the model information. - /// - public BinaryGisModelReader(Stream dataInputStream) - { - using (_input = dataInputStream) - { - _buffer = new byte[256]; - base.ReadModel(); - } - } - - ///

- /// Constructor which takes a filename and creates a reader for it. - ///

- /// - /// The full path and name of the file in which the model is stored. - /// - public BinaryGisModelReader(string fileName) - { - using (_input = new FileStream(fileName, FileMode.Open, FileAccess.Read)) - { - _buffer = new byte[256]; - base.ReadModel(); - } - } - - ///

- /// Reads a 32-bit signed integer from the model file. - ///

- protected override int ReadInt32() - { - _input.Read(_buffer, 0, 4); - return BitConverter.ToInt32(_buffer, 0); - } - - ///

- /// Reads a double-precision floating point number from the model file. - ///

- protected override double ReadDouble() - { - _input.Read(_buffer, 0, 8); - return BitConverter.ToDouble(_buffer, 0); - } - - ///

- /// Reads a UTF-8 encoded string from the model file. - ///

- protected override string ReadString() - { - _stringLength = _input.ReadByte(); - _input.Read(_buffer, 0, _stringLength); - return _encoding.GetString(_buffer, 0, _stringLength); - } - - ///

- /// Reads the predicate data from the file in a more efficient format to that implemented by - /// GisModelReader. - ///

- /// - /// Jagged 2-dimensional array of integers that will contain the outcome patterns for the model - /// after this method is called. - /// - /// - /// Dictionary that will contain the predicate information for the model - /// after this method is called. - /// - protected override void ReadPredicates(out int[][] outcomePatterns, out Dictionary predicates) - { - //read from the model how many outcome patterns there are - int outcomePatternCount = ReadInt32(); - outcomePatterns = new int[outcomePatternCount][]; - //read from the model how many predicates there are - predicates = new Dictionary(ReadInt32()); - - //for each outcome pattern in the model - for (int currentOutcomePattern = 0; currentOutcomePattern < outcomePatternCount; currentOutcomePattern++) - { - //read the number of outcomes in this pattern. This number is 1 greater than the real number of outcomes - //in the pattern, because the 0th value contains the number of predicates that use this pattern. - var currentOutcomePatternLength = ReadInt32(); - outcomePatterns[currentOutcomePattern] = new int[currentOutcomePatternLength]; - //read in the outcomes for this pattern - for (int currentOutcome = 0; currentOutcome - /// A writer for GIS models that saves models in a binary format. This format is not the one - /// used by the java version of MaxEnt. - /// It has two main differences, designed for performance when loading the data - /// from file: first, it uses big endian data values, which is native for C#, and secondly it - /// encodes the outcome patterns and values in a more efficient manner. - /// - /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on BinaryGISModelWriter.java $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $ - /// - public class BinaryGisModelWriter : GisModelWriter - { - private Stream _output; - private byte[] _buffer = new byte[7]; - private readonly System.Text.Encoding _encoding = System.Text.Encoding.UTF8; - - ///

- /// Default constructor. - ///

- public BinaryGisModelWriter(){} - - ///

- /// Takes a GIS model and a file and - /// writes the model to that file. - ///

- /// - /// The GisModel which is to be persisted. - /// - /// - /// The full path and name of the file in which the model is to be persisted. - /// - public void Persist(GisModel model, string fileName) - { - using (_output = new FileStream(fileName, FileMode.Create)) - { - base.Persist(model); - } - } - - ///

- /// Takes a GIS model and a Stream and - /// writes the model to that Stream. - ///

- /// - /// The GIS model which is to be persisted. - /// - /// - /// The Stream which will be used to persist the model. - /// - public void Persist(GisModel model, Stream dataOutputStream) - { - using (_output = dataOutputStream) - { - base.Persist(model); - } - } - - ///

- /// Writes a UTF-8 encoded string to the model file. - ///

- /// - /// The string data to be persisted. - /// - protected override void WriteString(string data) - { - _output.WriteByte((byte)_encoding.GetByteCount(data)); - _output.Write(_encoding.GetBytes(data), 0, _encoding.GetByteCount(data)); - } - - ///

- /// Writes a 32-bit signed integer to the model file. - ///

- /// - /// The integer data to be persisted. - /// - protected override void WriteInt32(int data) - { - _buffer = BitConverter.GetBytes(data); - _output.Write(_buffer, 0, 4); - } - - ///

- /// Writes a double-precision floating point number to the model file. - ///

- /// - /// The floating point data to be persisted. - /// - protected override void WriteDouble(double data) - { - _buffer = BitConverter.GetBytes(data); - _output.Write(_buffer, 0, 8); - } - - ///

- /// Writes the predicate data to the file in a more efficient format to that implemented by - /// GisModelWriter. - ///

- /// - /// The GIS model containing the predicate data to be persisted. - /// - protected override void WritePredicates(GisModel model) - { - int[][] outcomePatterns = model.GetOutcomePatterns(); - PatternedPredicate[] predicates = GetPredicates(); - - //write the number of outcome patterns - WriteInt32(outcomePatterns.Length); - - //write the number of predicates - WriteInt32(predicates.Length); - - int currentPredicate = 0; - - for (int currentOutcomePattern = 0; currentOutcomePattern < outcomePatterns.Length; currentOutcomePattern++) - { - //write how many outcomes in this pattern - WriteInt32(outcomePatterns[currentOutcomePattern].Length); - - //write the outcomes in this pattern (the first value contains the number of predicates in the pattern - //rather than an outcome) - for (int currentOutcome = 0; currentOutcome < outcomePatterns[currentOutcomePattern].Length; currentOutcome++) - { - WriteInt32(outcomePatterns[currentOutcomePattern][currentOutcome]); - } - - //write predicates for this pattern - while (currentPredicate < predicates.Length && predicates[currentPredicate].OutcomePattern == currentOutcomePattern) - { - WriteString(predicates[currentPredicate].Name); - for (int currentParameter = 0; currentParameter < predicates[currentPredicate].ParameterCount; currentParameter++) - { - WriteDouble(predicates[currentPredicate].GetParameter(currentParameter)); - } - currentPredicate++; - } - } - } - - } -} diff --git a/BotSharp.NLP/Models/Entropy/IO/GisModelReader.cs b/BotSharp.NLP/Models/Entropy/IO/GisModelReader.cs deleted file mode 100644 index cbe9d1727..000000000 --- a/BotSharp.NLP/Models/Entropy/IO/GisModelReader.cs +++ /dev/null @@ -1,347 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the GISModelReader.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.Collections.Generic; - -namespace BotSharp.Models.IO -{ - ///

- /// Abstract parent class for readers of GIS models. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on GISModelReader.java, $Revision: 1.5 $, $Date: 2004/06/11 20:51:36 $ - /// - public abstract class GisModelReader : IGisModelReader - { - private char[] _spaces; - private int _correctionConstant; - private double _correctionParameter; - private string[] _outcomeLabels; - private int[][] _outcomePatterns; - private int _predicateCount; - private Dictionary _predicates; - - ///

- /// The number of predicates contained in the model. - ///

- protected int PredicateCount - { - get - { - return _predicateCount; - } - } - - ///

- /// Retrieve a model from disk. - /// - ///

This method delegates to worker methods for each part of this - /// sequence. If you are creating a reader that conforms largely to this - /// sequence but varies at one or more points, override the relevant worker - /// method(s) to achieve the required format.

- /// - ///

If you are creating a reader for a format which does not follow this - /// sequence at all, override this method and ignore the - /// other ReadX methods provided in this abstract class.

- ///

- /// - /// Thie method assumes that models are saved in the - /// following sequence: - /// - ///

GIS (model type identifier)

- ///

1. the correction constant (int)

- ///

2. the correction constant parameter (double)

- ///

3. outcomes

- ///

3a. number of outcomes (int)

- ///

3b. outcome names (string array - length specified in 3a)

- ///

4. predicates

- ///

4a. outcome patterns

- ///

4ai. number of outcome patterns (int)

- ///

4aii. outcome pattern values (each stored in a space delimited string)

- ///

4b. predicate labels

- ///

4bi. number of predicates (int)

- ///

4bii. predicate names (string array - length specified in 4bi)

- ///

4c. predicate parameters (double values)

- /// - protected virtual void ReadModel() - { - _spaces = new char[] {' '}; //cached constant to improve performance - CheckModelType(); - _correctionConstant = ReadCorrectionConstant(); - _correctionParameter = ReadCorrectionParameter(); - _outcomeLabels = ReadOutcomes(); - ReadPredicates(out _outcomePatterns, out _predicates); - } - - ///

- /// Checks the model file being read from begins with the sequence of characters - /// "GIS". - ///

- protected virtual void CheckModelType() - { - string modelType = ReadString(); - if (modelType != "GIS") - { - throw new ApplicationException("Error: attempting to load a " + modelType + " model as a GIS model." + " You should expect problems."); - } - } - - ///

- /// Reads the correction constant from the model file. - ///

- protected virtual int ReadCorrectionConstant() - { - return ReadInt32(); - } - - ///

- /// Reads the correction constant parameter from the model file. - ///

- protected virtual double ReadCorrectionParameter() - { - return ReadDouble(); - } - - ///

- /// Reads the outcome names from the model file. - ///

- protected virtual string[] ReadOutcomes() - { - int outcomeCount = ReadInt32(); - var outcomeLabels = new string[outcomeCount]; - for (int currentLabel = 0; currentLabel < outcomeCount; currentLabel++) - { - outcomeLabels[currentLabel] = ReadString(); - } - return outcomeLabels; - } - - ///

- /// Reads the predicate information from the model file, placing the data in two - /// structures - an array of outcome patterns, and a Dictionary of predicates - /// keyed by predicate name. - ///

- protected virtual void ReadPredicates(out int[][] outcomePatterns, out Dictionary predicates) - { - outcomePatterns = ReadOutcomePatterns(); - string[] asPredicateLabels = ReadPredicateLabels(); - predicates = ReadParameters(outcomePatterns, asPredicateLabels); - } - - ///

- /// Reads the outcome pattern information from the model file. - ///

- protected virtual int[][] ReadOutcomePatterns() - { - //get the number of outcome patterns (that is, the number of unique combinations of outcomes in the model) - int outcomePatternCount = ReadInt32(); - //initialize an array of outcome patterns. Each outcome pattern is itself an array of integers - var outcomePatterns = new int[outcomePatternCount][]; - //for each outcome pattern - for (int currentOutcomePattern = 0; currentOutcomePattern < outcomePatternCount; currentOutcomePattern++) - { - //read a space delimited string from the model file containing the information for the integer array. - //The first value in the integer array is the number of predicates related to this outcome pattern; the - //other values make up the outcome IDs for this pattern. - string[] tokens = ReadString().Split(_spaces); - //convert this string to the array of integers required for the pattern - var patternData = new int[tokens.Length]; - for (int currentPatternValue = 0; currentPatternValue < tokens.Length; currentPatternValue++) - { - patternData[currentPatternValue] = int.Parse(tokens[currentPatternValue], System.Globalization.CultureInfo.InvariantCulture); - } - outcomePatterns[currentOutcomePattern] = patternData; - } - return outcomePatterns; - } - - ///

- /// Reads the outcome labels from the model file. - ///

- protected virtual string[] ReadPredicateLabels() - { - _predicateCount = ReadInt32(); - var predicateLabels = new string[_predicateCount]; - for (int currentPredicate = 0; currentPredicate < _predicateCount; currentPredicate++) - { - predicateLabels[currentPredicate] = ReadString(); - } - return predicateLabels; - } - - ///

- /// Reads the predicate parameter information from the model file. - ///

- protected virtual Dictionary ReadParameters(int[][] outcomePatterns, string[] predicateLabels) - { - var predicates = new Dictionary(predicateLabels.Length); - int parameterIndex = 0; - - for (int currentOutcomePattern = 0; currentOutcomePattern < outcomePatterns.Length; currentOutcomePattern++) - { - for (int currentOutcomeInfo = 0; currentOutcomeInfo < outcomePatterns[currentOutcomePattern][0]; currentOutcomeInfo++) - { - var parameters = new double[outcomePatterns[currentOutcomePattern].Length - 1]; - for (int currentParameter = 0; currentParameter < outcomePatterns[currentOutcomePattern].Length - 1; currentParameter++) - { - parameters[currentParameter] = ReadDouble(); - } - predicates.Add(predicateLabels[parameterIndex], new PatternedPredicate(currentOutcomePattern, parameters)); - parameterIndex++; - } - } - return predicates; - } - - ///

- /// Implement as needed for the format the model is stored in. - ///

- protected abstract int ReadInt32(); - - ///

- /// Implement as needed for the format the model is stored in. - ///

- protected abstract double ReadDouble(); - - ///

- /// Implement as needed for the format the model is stored in. - ///

- protected abstract string ReadString(); - - ///

- /// The model's correction constant. - ///

- public int CorrectionConstant - { - get - { - return _correctionConstant; - } - } - - ///

- /// The model's correction constant parameter. - ///

- public double CorrectionParameter - { - get - { - return _correctionParameter; - } - } - - ///

- /// Returns the labels for all the outcomes in the model. - ///

- /// - /// string array containing outcome labels. - /// - public string[] GetOutcomeLabels() - { - return _outcomeLabels; - } - - ///

- /// Returns the outcome patterns in the model. - ///

- /// - /// Array of integer arrays containing the information for - /// each outcome pattern in the model. - /// - public int[][] GetOutcomePatterns() - { - return _outcomePatterns; - } - - ///

- /// Returns the predicates in the model. - ///

- /// - /// Dictionary containing PatternedPredicate objects keyed - /// by predicate label. - /// - public Dictionary GetPredicates() - { - return _predicates; - } - - ///

- /// Returns model information for a predicate, given the predicate label. - ///

- /// - /// The predicate label to fetch information for. - /// - /// - /// Array to be passed in to the method; it should have a length equal to the number of outcomes - /// in the model. The method increments the count of each outcome that is active in the specified - /// predicate. - /// - /// - /// Array to be passed in to the method; it should have a length equal to the number of outcomes - /// in the model. The method adds the parameter values for each of the active outcomes in the - /// predicate. - /// - public virtual void GetPredicateData(string predicateLabel, int[] featureCounts, double[] outcomeSums) - { - try - { - if (predicateLabel != null && _predicates.ContainsKey(predicateLabel)) - { - PatternedPredicate predicate = _predicates[predicateLabel]; - int[] activeOutcomes = _outcomePatterns[predicate.OutcomePattern]; - - for (int currentActiveOutcome = 1; currentActiveOutcome < activeOutcomes.Length; currentActiveOutcome++) - { - int outcomeIndex = activeOutcomes[currentActiveOutcome]; - featureCounts[outcomeIndex]++; - outcomeSums[outcomeIndex] += predicate.GetParameter(currentActiveOutcome - 1); - } - } - } - catch (ArgumentNullException ex) - { - throw new ArgumentException(string.Format("Try to find key '{0}' in predicates dictionary ({1} entries)", predicateLabel, _predicates.Count), ex); - } - } - - } -} diff --git a/BotSharp.NLP/Models/Entropy/IO/GisModelWriter.cs b/BotSharp.NLP/Models/Entropy/IO/GisModelWriter.cs deleted file mode 100644 index 1f0b829d6..000000000 --- a/BotSharp.NLP/Models/Entropy/IO/GisModelWriter.cs +++ /dev/null @@ -1,313 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the GISModeWriter.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.Models.IO -{ - ///

Abstract parent class for GIS model writers that save data to a single - /// file. It provides the persist method which takes care of the structure of a stored - /// document, and requires an extending class to define precisely how the data should - /// be stored. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on GISModelWriter.java, $Revision: 1.5 $, $Date: 2004/06/11 20:51:36 $ - /// - public abstract class GisModelWriter - { - private PatternedPredicate[] mPredicates; - - ///

- /// Implement as needed for the format the model is stored in. - ///

- /// - /// string data to be written to storage. - /// - protected abstract void WriteString(string data); - - ///

- /// Implement as needed for the format the model is stored in. - ///

- /// - /// Integer data to be written to storage. - /// - protected abstract void WriteInt32(int data); - - ///

- /// Implement as needed for the format the model is stored in. - ///

- /// - /// Double precision floating point data to be written to storage. - /// - protected abstract void WriteDouble(double data); - - ///

- /// Obtains a list of the predicates in the model to be written to storage. - ///

- /// - /// Array of PatternedPredicate objects containing the predicate data for the model. - /// - protected PatternedPredicate[] GetPredicates() - { - return mPredicates; - } - - ///

- /// Sets the list of predicates to be written to storage. - ///

- /// - /// Array of PatternedPredicate objects to be persisted. - /// - protected void SetPredicates(PatternedPredicate[] predicates) - { - mPredicates = predicates; - } - - ///

- /// Writes the model to persistent storage, using the writeX() methods - /// provided by extending classes. - /// - ///

This method delegates to worker methods for each part of this - /// sequence. If you are creating a writer that conforms largely to this - /// sequence but varies at one or more points, override the relevant worker - /// method(s) to achieve the required format.

- /// - ///

If you are creating a writer for a format which does not follow this - /// sequence at all, override this method and ignore the - /// other WriteX methods provided in this abstract class.

- ///

- /// - /// GIS model whose data is to be persisted. - /// - protected void Persist(GisModel model) - { - Initialize(model); - WriteModelType("GIS"); - WriteCorrectionConstant(model.CorrectionConstant); - WriteCorrectionParameter(model.CorrectionParameter); - WriteOutcomes(model.GetOutcomeNames()); - WritePredicates(model); - } - - ///

- /// Organises the data available in the GIS model into a structure that is easier to - /// persist from. - ///

- /// - /// The GIS model to be persisted. - /// - protected virtual void Initialize(GisModel model) - { - //read the predicates from the model - Dictionary predicates = model.GetPredicates(); - //build arrays of predicates and predicate names from the dictionary - mPredicates = new PatternedPredicate[predicates.Count]; - var predicateNames = new string[predicates.Count]; - predicates.Values.CopyTo(mPredicates, 0); - predicates.Keys.CopyTo(predicateNames, 0); - //give each PatternedPredicate in the array the name taken from the dictionary keys - for (int currentPredicate = 0; currentPredicate < predicates.Count; currentPredicate++) - { - mPredicates[currentPredicate].Name = predicateNames[currentPredicate]; - } - //sort the PatternedPredicate array based on the outcome pattern that each predicate uses - Array.Sort(mPredicates, new OutcomePatternIndexComparer()); - } - - ///

- /// Writes the model type identifier at the beginning of the file. - ///

- /// string identifying the model type. - protected virtual void WriteModelType(string modelType) - { - WriteString(modelType); - } - - ///

- /// Writes the value of the correction constant - ///

- /// the model's correction constant value. - protected virtual void WriteCorrectionConstant(int correctionConstant) - { - WriteInt32(correctionConstant); - } - - ///

- /// Writes the value of the correction constant parameter. - ///

- /// the model's correction constant parameter. - protected virtual void WriteCorrectionParameter(double correctionParameter) - { - WriteDouble(correctionParameter); - } - - ///

- /// Writes the outcome labels to the file. - ///

- /// string array of outcome labels. - protected virtual void WriteOutcomes(string[] outcomeLabels) - { - //write the number of outcomes - WriteInt32(outcomeLabels.Length); - - //write each label - foreach (string label in outcomeLabels) - { - WriteString(label); - } - } - - ///

- /// Writes the predicate information to the model file. - ///

- /// The GIS model to write the data from. - protected virtual void WritePredicates(GisModel model) - { - WriteOutcomePatterns(model.GetOutcomePatterns()); - WritePredicateNames(); - WriteParameters(); - } - - ///

- /// Writes the outcome pattern data to the file. - ///

- /// - /// Array of outcome patterns, each an integer array containing - /// the number of predicates using the pattern, and then the list of - /// outcome IDs in the pattern. - /// - protected void WriteOutcomePatterns(int[][] outcomePatterns) - { - //write the number of outcome patterns - WriteInt32(outcomePatterns.Length); - - //for each pattern - foreach (int[] pattern in outcomePatterns) - { - //build a string with the pattern values separated by spaces - var outcomePatternBuilder = new StringBuilder(); - for (int currentOutcome = 0; currentOutcome < pattern.Length; currentOutcome++) - { - if (currentOutcome > 0) - { - outcomePatternBuilder.Append(" "); - } - outcomePatternBuilder.Append(pattern[currentOutcome]); - } - //write the string containing pattern values to the file - WriteString(outcomePatternBuilder.ToString()); - } - } - - ///

- /// Write the names of the predicates to the model file. - ///

- protected void WritePredicateNames() - { - //write the number of predicates - WriteInt32(mPredicates.Length); - - //for each predicate, write its name to the file - foreach (PatternedPredicate predicate in mPredicates) - { - WriteString(predicate.Name); - } - } - - ///

- /// Writes out the parameter values for all the predicates to the model file. - ///

- protected void WriteParameters() - { - foreach (PatternedPredicate predicate in mPredicates) - { - for (int currentParameter = 0; currentParameter < predicate.ParameterCount; currentParameter++) - { - WriteDouble(predicate.GetParameter(currentParameter)); - } - } - } - - ///

- /// Class to enable sorting PatternedPredicates into order based on the - /// outcome pattern index. - ///

- private class OutcomePatternIndexComparer : IComparer - { - - ///

- /// Default constructor. - ///

- internal OutcomePatternIndexComparer(){} - - ///

- /// Implementation of the IComparer interface. - /// Compares two PatternedPredicate objects and returns a value indicating whether - /// one is less than, equal to or greater than the other. - ///

- /// - /// First object to compare. - /// - /// - /// Second object to compare. - /// - /// - /// -1 if the first PatternedPredicate has a lower outcome pattern index; - /// 1 if the second PatternedPredicate has a lower outcome pattern index; - /// 0 if they both have the same outcome pattern index. - /// - public virtual int Compare(PatternedPredicate firstPredicate, PatternedPredicate secondPredicate) - { - if (firstPredicate.OutcomePattern < secondPredicate.OutcomePattern) - { - return -1; - } - else if (firstPredicate.OutcomePattern > secondPredicate.OutcomePattern) - { - return 1; - } - return 0; - } - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/IO/IGisModelReader.cs b/BotSharp.NLP/Models/Entropy/IO/IGisModelReader.cs deleted file mode 100644 index 3632cb441..000000000 --- a/BotSharp.NLP/Models/Entropy/IO/IGisModelReader.cs +++ /dev/null @@ -1,87 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file has no equivalent in the java MaxEnt library, because the link -//between GISModel and GISModelReader is implemented differently there. This -//interface is designed so that GIS model reader classes can hold some or all of -//their data in persistent storage rather than in memory. - -using System; -using System.Collections.Generic; - -namespace BotSharp.Models.IO -{ - ///

- /// Interface for readers of GIS models. - ///

- public interface IGisModelReader - { - ///

- /// Returns the value of the model's correction constant. This property should - /// usually only be accessed by GIS model writer classes via the GisModel class. - ///

- int CorrectionConstant - { - get; - } - - ///

- /// Returns the value of the model's correction constant parameter. This property should - /// usually only be accessed by GIS model writer classes via the GisModel class. - ///

- double CorrectionParameter - { - get; - } - - ///

- /// Returns the model's outcome labels as a string array. This method should - /// usually only be accessed by GIS model writer classes via the GisModel class. - ///

- string[] GetOutcomeLabels(); - - ///

- /// Returns the model's outcome patterns. This method should - /// usually only be accessed by GIS model writer classes via the GisModel class. - ///

- int[][] GetOutcomePatterns(); - - ///

- /// Returns the model's predicates. This method should - /// usually only be accessed by GIS model writer classes via the GisModel class. - ///

- Dictionary GetPredicates(); - - ///

- /// Returns model information for a predicate, given the predicate label. - ///

- /// - /// The predicate label to fetch information for. - /// - /// - /// Array to be passed in to the method; it should have a length equal to the number of outcomes - /// in the model. The method increments the count of each outcome that is active in the specified - /// predicate. - /// - /// - /// Array to be passed in to the method; it should have a length equal to the number of outcomes - /// in the model. The method adds the parameter values for each of the active outcomes in the - /// predicate. - /// - void GetPredicateData(string predicateLabel, int[] featureCounts, double[] outcomeSums); - - } -} diff --git a/BotSharp.NLP/Models/Entropy/IO/JavaBinaryGisModelReader.cs b/BotSharp.NLP/Models/Entropy/IO/JavaBinaryGisModelReader.cs deleted file mode 100644 index 4b744014e..000000000 --- a/BotSharp.NLP/Models/Entropy/IO/JavaBinaryGisModelReader.cs +++ /dev/null @@ -1,123 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the BinaryGISModelReader.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.IO; - -namespace BotSharp.Models.IO -{ - ///

- /// A reader for GIS models stored in the binary format produced by the java version - /// of MaxEnt. This binary format stores data using big-endian values, which means - /// that the C# version must reverse the byte order of each value in turn, making it - /// less efficient. Use only for compatibility with the java MaxEnt library. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on BinaryGISModelReader.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $ - /// - public class JavaBinaryGisModelReader : GisModelReader - { - private readonly Stream _input; - private readonly byte[] _buffer; - private int _stringLength = 0; - private readonly System.Text.Encoding _encoding = System.Text.Encoding.UTF8; - - ///

- /// Constructor which directly instantiates the Stream containing - /// the model contents. - ///

- /// The Stream containing the model information. - /// - public JavaBinaryGisModelReader(Stream dataInputStream) - { - using (_input = dataInputStream) - { - _buffer = new byte[256]; - base.ReadModel(); - } - } - - ///

- /// Constructor which takes a filename and creates a reader for it. - ///

- /// The full path and name of the file in which the model is stored. - /// - public JavaBinaryGisModelReader(string fileName) - { - using (_input = new FileStream(fileName, FileMode.Open, FileAccess.Read)) - { - _buffer = new byte[256]; - base.ReadModel(); - } - } - - ///

- /// Reads a 32-bit signed integer from the model file. - ///

- protected override int ReadInt32() - { - _input.Read(_buffer, 0, 4); - Array.Reverse(_buffer, 0, 4); - return BitConverter.ToInt32(_buffer, 0); - } - - ///

- /// Reads a double-precision floating point number from the model file. - ///

- protected override double ReadDouble() - { - _input.Read(_buffer, 0, 8); - Array.Reverse(_buffer, 0, 8); - return BitConverter.ToDouble(_buffer, 0); - } - - ///

- /// Reads a UTF-8 encoded string from the model file. - ///

- protected override string ReadString() - { - //read string from binary file with UTF8 encoding - _stringLength = (_input.ReadByte() * 256) + _input.ReadByte(); - _input.Read(_buffer, 0, _stringLength); - return _encoding.GetString(_buffer, 0, _stringLength); - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/IO/JavaBinaryGisModelWriter.cs b/BotSharp.NLP/Models/Entropy/IO/JavaBinaryGisModelWriter.cs deleted file mode 100644 index 800b8b493..000000000 --- a/BotSharp.NLP/Models/Entropy/IO/JavaBinaryGisModelWriter.cs +++ /dev/null @@ -1,140 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the BinaryGISModelWriter.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.IO; - -namespace BotSharp.Models.IO -{ - ///

- /// A writer for GIS models that saves models in the binary format used by the java - /// version of MaxEnt. This binary format stores data using big-endian values, which means - /// that the C# version must reverse the byte order of each value in turn, making it - /// less efficient. Use only for compatibility with the java MaxEnt library. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on BinaryGISModelWriter.java $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $ - /// - public class JavaBinaryGisModelWriter : GisModelWriter - { - private Stream mOutput; - private byte[] mBuffer = new byte[7]; - private System.Text.Encoding mEncoding = System.Text.Encoding.UTF8; - - ///

- /// Default constructor. - ///

- public JavaBinaryGisModelWriter() - { - } - - ///

Takes a GisModel and a File and - /// writes the model to that file. - ///

- /// The GisModel which is to be persisted. - /// - /// The name of the file in which the model is to be persisted. - /// - public void Persist(GisModel model, string fileName) - { - using (mOutput = new FileStream(fileName, FileMode.Create)) - { - base.Persist(model); - } - } - - ///

- /// Takes a GisModel and a Stream and writes the model to that stream. - ///

- /// - /// The GIS model which is to be persisted. - /// - /// - /// The Stream which will be used to persist the model. - /// - public void Persist(GisModel model, Stream dataOutputStream) - { - using (mOutput = dataOutputStream) - { - base.Persist(model); - } - } - - ///

- /// Writes a UTF-8 encoded string to the model file. - ///

- /// /// - /// The string data to be persisted. - /// - protected override void WriteString(string data) - { - mOutput.WriteByte((byte)(mEncoding.GetByteCount(data) / 256)); - mOutput.WriteByte((byte)(mEncoding.GetByteCount(data) % 256)); - mOutput.Write(mEncoding.GetBytes(data), 0, mEncoding.GetByteCount(data)); - } - - ///

- /// Writes a 32-bit signed integer to the model file. - ///

- /// /// - /// The integer data to be persisted. - /// - protected override void WriteInt32(int data) - { - mBuffer = BitConverter.GetBytes(data); - Array.Reverse(mBuffer); - mOutput.Write(mBuffer, 0, 4); - } - - ///

- /// Writes a double-precision floating point number to the model file. - ///

- /// /// - /// The floating point data to be persisted. - /// - protected override void WriteDouble(double data) - { - mBuffer = BitConverter.GetBytes(data); - Array.Reverse(mBuffer); - mOutput.Write(mBuffer, 0, 8); - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/IO/PlainTextGisModelReader.cs b/BotSharp.NLP/Models/Entropy/IO/PlainTextGisModelReader.cs deleted file mode 100644 index 1d5682c34..000000000 --- a/BotSharp.NLP/Models/Entropy/IO/PlainTextGisModelReader.cs +++ /dev/null @@ -1,111 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the PlainTextGISModelReader.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.IO; - -namespace BotSharp.Models.IO -{ - ///

- /// A reader for GIS models stored in plain text format. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on PlainTextGISModelReader.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $ - /// - public class PlainTextGisModelReader : GisModelReader - { - private StreamReader mInput; - - ///

- /// Constructor which directly instantiates the StreamReader containing - /// the model contents. - ///

- /// - /// The StreamReader containing the model information. - /// - public PlainTextGisModelReader(StreamReader reader) - { - using (mInput = reader) - { - base.ReadModel(); - } - } - - ///

- /// Constructor which takes a file and creates a reader for it. - ///

- /// - /// The full path and file name in which the model is stored. - /// - public PlainTextGisModelReader(string fileName) - { - using (mInput = new StreamReader(fileName, System.Text.Encoding.UTF7)) - { - base.ReadModel(); - } - } - - ///

- /// Reads a 32-bit signed integer from the model file. - ///

- protected override int ReadInt32() - { - return int.Parse(mInput.ReadLine(), System.Globalization.CultureInfo.InvariantCulture); - } - - ///

- /// Reads a double-precision floating point number from the model file. - ///

- protected override double ReadDouble() - { - return double.Parse(mInput.ReadLine(), System.Globalization.CultureInfo.InvariantCulture); - } - - ///

- /// Reads a string from the model file. - ///

- protected override string ReadString() - { - return mInput.ReadLine(); - } - - } -} diff --git a/BotSharp.NLP/Models/Entropy/IO/PlainTextGisModelWriter.cs b/BotSharp.NLP/Models/Entropy/IO/PlainTextGisModelWriter.cs deleted file mode 100644 index 193c8b84e..000000000 --- a/BotSharp.NLP/Models/Entropy/IO/PlainTextGisModelWriter.cs +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the PlainTextGISModelReader.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.IO; - -namespace BotSharp.Models.IO -{ - ///

- /// Model writer that saves models in plain text format. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on PlainTextGISModelWriter.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $ - /// - public class PlainTextGisModelWriter : GisModelWriter - { - private StreamWriter mOutput; - - ///

- /// Default constructor. - ///

- public PlainTextGisModelWriter() - { - } - - ///

- /// Takes a GIS model and a file and writes the model to that file. - ///

- /// - /// The GisModel which is to be persisted. - /// - /// - /// The name of the file in which the model is to be persisted. - /// - public void Persist(GisModel model, string fileName) - { - using (mOutput = new StreamWriter(fileName, false, System.Text.Encoding.UTF7)) - { - base.Persist(model); - } - } - - ///

- /// Takes a GisModel and a stream and writes the model to that stream. - ///

- /// - /// The GisModel which is to be persisted. - /// - /// - /// The StreamWriter which will be used to persist the model. - /// - public void Persist(GisModel model, StreamWriter writer) - { - using (mOutput = writer) - { - base.Persist(model); - } - } - - ///

- /// Writes a string to the model file. - ///

- /// /// - /// The string data to be persisted. - /// - protected override void WriteString(string data) - { - mOutput.Write(data); - mOutput.WriteLine(); - } - - ///

- /// Writes a 32-bit signed integer to the model file. - ///

- /// - /// The integer data to be persisted. - /// - protected override void WriteInt32(int data) - { - mOutput.Write(data.ToString(System.Globalization.CultureInfo.InvariantCulture)); - mOutput.WriteLine(); - } - - ///

- /// Writes a double-precision floating point number to the model file. - ///

- /// - /// The floating point data to be persisted. - /// - protected override void WriteDouble(double data) - { - mOutput.Write(data.ToString(System.Globalization.CultureInfo.InvariantCulture)); - mOutput.WriteLine(); - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/ITrainingDataIndexer.cs b/BotSharp.NLP/Models/Entropy/ITrainingDataIndexer.cs deleted file mode 100644 index 4e9b1f802..000000000 --- a/BotSharp.NLP/Models/Entropy/ITrainingDataIndexer.cs +++ /dev/null @@ -1,86 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the DataIndexer.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -//Copyright (C) 2003 Thomas Morton -// -//This library is free software; you can redistribute it and/or -//modify it under the terms of the GNU Lesser General Public -//License as published by the Free Software Foundation; either -//version 2.1 of the License, or (at your option) any later version. -// -//This library is distributed in the hope that it will be useful, -//but WITHOUT ANY WARRANTY; without even the implied warranty of -//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//GNU General Public License for more details. -// -//You should have received a copy of the GNU Lesser General Public -//License along with this program; if not, write to the Free Software -//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// Object that compresses events in memory and performs feature selection. - ///

- public interface ITrainingDataIndexer - { - - ///

- /// Gets an array of context data calculated from the training data. - ///

- /// - /// Array of integer arrays, each containing the context data for an event. - /// - int[][] GetContexts(); - - ///

- /// Gets an array indicating how many times each event is seen. - ///

- /// - /// Integer array with event frequencies. - /// - int[] GetNumTimesEventsSeen(); - - ///

- /// Gets an outcome list. - ///

- /// - /// Integer array of outcomes. - /// - int[] GetOutcomeList(); - - ///

- /// Gets an array of predicate labels. - ///

- /// - /// Array of predicate labels. - /// - string[] GetPredicateLabels(); - - ///

- /// Gets an array of outcome labels. - ///

- /// - /// Array of outcome labels. - /// - string[] GetOutcomeLabels(); - } -} diff --git a/BotSharp.NLP/Models/Entropy/ITrainingDataReader.cs b/BotSharp.NLP/Models/Entropy/ITrainingDataReader.cs deleted file mode 100644 index 842f20e5d..000000000 --- a/BotSharp.NLP/Models/Entropy/ITrainingDataReader.cs +++ /dev/null @@ -1,74 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the DataStream.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// A interface for objects which can deliver a stream of training data to be - /// supplied to an ITrainingEventReader. It is not necessary to use a ITrainingDataReader in a - /// SharpEntropy application, but it can be used to support a wider variety of formats - /// in which your training data can be held. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on DataStream.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $ - /// - public interface ITrainingDataReader - { - ///

- /// Returns the next slice of data held in this ITrainingDataReader. - ///

- /// - /// the object representing the data which is next in this - /// ITrainingDataReader - /// - T NextToken(); - - ///

- /// Test whether there are any training data items remaining in this ITrainingDataReader. - ///

- /// - /// true if this ITrainingDataReader has more data tokens - /// - bool HasNext(); - } -} diff --git a/BotSharp.NLP/Models/Entropy/ITrainingEventReader.cs b/BotSharp.NLP/Models/Entropy/ITrainingEventReader.cs deleted file mode 100644 index 9b1861025..000000000 --- a/BotSharp.NLP/Models/Entropy/ITrainingEventReader.cs +++ /dev/null @@ -1,66 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the EventStream.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// An object which can deliver a stream of training events for the GIS - /// procedure (or others such as IIS if and when they are implemented). - /// TrainingEventReaders don't need to use SharpEntropy.ITrainingDataReader, but doing so - /// would provide greater flexibility for producing events from data stored in - /// different formats. - ///

- public interface ITrainingEventReader - { - - ///

- /// Returns the next TrainingEvent object held in this TrainingEventReader. - ///

- /// - /// the TrainingEvent object which is next in this TrainingEventReader - /// - TrainingEvent ReadNextEvent(); - - ///

- /// Test whether there are any TrainingEvents remaining in this TrainingEventReader. - ///

- /// - /// true if this TrainingEventReader has more TrainingEvents - /// - bool HasNext(); - } -} diff --git a/BotSharp.NLP/Models/Entropy/OnePassDataIndexer.cs b/BotSharp.NLP/Models/Entropy/OnePassDataIndexer.cs deleted file mode 100644 index 219da7144..000000000 --- a/BotSharp.NLP/Models/Entropy/OnePassDataIndexer.cs +++ /dev/null @@ -1,212 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the OnePassDataIndexer.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.Collections.Generic; - -namespace BotSharp.Models -{ - ///

- /// An indexer for maxent model data which handles cutoffs for uncommon - /// contextual predicates and provides a unique integer index for each of the - /// predicates. The data structures built in the constructor of this class are - /// used by the GIS trainer. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on OnePassDataIndexer.java, $Revision: 1.1 $, $Date: 2003/12/13 16:41:29 $ - /// - public class OnePassDataIndexer : AbstractDataIndexer - { - ///

- /// One argument constructor for OnePassDataIndexer which calls the two argument - /// constructor assuming no cutoff. - ///

- /// - /// An ITrainingEventReader which contains the a list of all the Events - /// seen in the training data. - /// - public OnePassDataIndexer(ITrainingEventReader eventReader) : this(eventReader, 0) - { - } - - ///

- /// Two argument constructor for OnePassDataIndexer. - ///

- /// - /// An ITrainingEventReader which contains the a list of all the Events - /// seen in the training data. - /// - /// - /// The minimum number of times a predicate must have been - /// observed in order to be included in the model. - /// - public OnePassDataIndexer(ITrainingEventReader eventReader, int cutoff) - { - Dictionary predicateIndex; - List events; - List eventsToCompare; - - predicateIndex = new Dictionary(); - //NotifyProgress("Indexing events using cutoff of " + cutoff + "\n"); - - //NotifyProgress("\tComputing event counts... "); - events = ComputeEventCounts(eventReader, predicateIndex, cutoff); - //NotifyProgress("done. " + events.Count + " events"); - - //NotifyProgress("\tIndexing... "); - eventsToCompare = Index(events, predicateIndex); - - //NotifyProgress("done."); - - //NotifyProgress("Sorting and merging oEvents... "); - SortAndMerge(eventsToCompare); - //NotifyProgress("Done indexing."); - } - - ///

- /// Reads events from eventReader into a List<TrainingEvent>. The - /// predicates associated with each event are counted and any which - /// occur at least cutoff times are added to the - /// predicatesInOut dictionary along with a unique integer index. - ///

- /// - /// an ITrainingEventReader value - /// - /// - /// a Dictionary value - /// - /// - /// an int value - /// - /// - /// an List of TrainingEvents value - /// - private List ComputeEventCounts(ITrainingEventReader eventReader, Dictionary predicatesInOut, int cutoff) - { - var counter = new Dictionary(); - var events = new List(); - int predicateIndex = 0; - while (eventReader.HasNext()) - { - TrainingEvent trainingEvent = eventReader.ReadNextEvent(); - events.Add(trainingEvent); - string[] eventContext = trainingEvent.Context; - for (int currentEventContext = 0; currentEventContext < eventContext.Length; currentEventContext++) - { - if (!predicatesInOut.ContainsKey(eventContext[currentEventContext])) - { - if (counter.ContainsKey(eventContext[currentEventContext])) - { - counter[eventContext[currentEventContext]]++; - } - else - { - counter.Add(eventContext[currentEventContext], 1); - } - if (counter[eventContext[currentEventContext]] >= cutoff) - { - predicatesInOut.Add(eventContext[currentEventContext], predicateIndex++); - counter.Remove(eventContext[currentEventContext]); - } - } - } - } - return events; - } - - private List Index(List events, Dictionary predicateIndex) - { - var map = new Dictionary(); - - int eventCount = events.Count; - int outcomeCount = 0; - - var eventsToCompare = new List(eventCount); - var indexedContext = new List(); - - for (int eventIndex = 0; eventIndex < eventCount; eventIndex++) - { - TrainingEvent currentTrainingEvent = events[eventIndex]; - string[] eventContext = currentTrainingEvent.Context; - ComparableEvent comparableEvent; - - int outcomeIndex; - - string outcome = currentTrainingEvent.Outcome; - - if (map.ContainsKey(outcome)) - { - outcomeIndex = map[outcome]; - } - else - { - outcomeIndex = outcomeCount++; - map.Add(outcome, outcomeIndex); - } - - for (int currentEventContext = 0; currentEventContext < eventContext.Length; currentEventContext++) - { - string predicate = eventContext[currentEventContext]; - if (predicateIndex.ContainsKey(predicate)) - { - indexedContext.Add(predicateIndex[predicate]); - } - } - - // drop events with no active features - if (indexedContext.Count > 0) - { - comparableEvent = new ComparableEvent(outcomeIndex, indexedContext.ToArray()); - eventsToCompare.Add(comparableEvent); - } - else - { - //"Dropped event " + oEvent.Outcome + ":" + oEvent.Context); - } - // recycle the list - indexedContext.Clear(); - } - SetOutcomeLabels(ToIndexedStringArray(map)); - SetPredicateLabels(ToIndexedStringArray(predicateIndex)); - return eventsToCompare; - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/PatternedPredicate.cs b/BotSharp.NLP/Models/Entropy/PatternedPredicate.cs deleted file mode 100644 index 01ab1fdae..000000000 --- a/BotSharp.NLP/Models/Entropy/PatternedPredicate.cs +++ /dev/null @@ -1,118 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// Object containing predicate data, where the parameters are matched to - /// the outcomes in an outcome pattern. - ///

- /// - /// Richard J. Northedge - /// - public class PatternedPredicate - { - private int mOutcomePattern; - private double[] mParameters; - private string mName; - - ///

- /// Creates a PatternedPredicate object. - ///

- /// - /// Index into the outcome pattern array, specifying which outcome pattern relates to - /// this predicate. - /// - /// - /// Array of parameters for this predicate. - /// - protected internal PatternedPredicate(int outcomePattern, double[] parameters) - { - mOutcomePattern = outcomePattern; - mParameters = parameters; - } - - ///

- /// Creates a PatternedPredicate object. - ///

- /// - /// The predicate name. - /// - /// - /// Array of parameters for this predicate. - /// - protected internal PatternedPredicate(string name, double[] parameters) - { - mName = name; - mParameters = parameters; - } - - ///

- /// Index into array of outcome patterns. - ///

- public int OutcomePattern - { - get - { - return mOutcomePattern; - } - set // for trainer - { - mOutcomePattern = value; - } - } - - ///

- /// Gets the value of a parameter from this predicate. - ///

- /// - /// index into the parameter array. - /// - /// - public double GetParameter(int index) - { - return mParameters[index]; - } - - ///

- /// Number of parameters associated with this predicate. - ///

- public int ParameterCount - { - get - { - return mParameters.Length; - } - } - - ///

- /// Name of the predicate. - ///

- public string Name - { - get - { - return mName; - } - set - { - mName = value; - } - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/PlainTextByLineDataReader.cs b/BotSharp.NLP/Models/Entropy/PlainTextByLineDataReader.cs deleted file mode 100644 index 004669ed6..000000000 --- a/BotSharp.NLP/Models/Entropy/PlainTextByLineDataReader.cs +++ /dev/null @@ -1,86 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the PlainTextByLineDataStream.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.IO; - -namespace BotSharp.Models -{ - ///

- /// This ITrainingDataReader implementation will take care of reading a plain text file - /// and returning the strings between each new line character, which is what - /// many SharpEntropy applications need in order to create ITrainingEventReaders. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on PlainTextByLineDataStream.java, $Revision: 1.1.1.1 $, $Date: 2001/10/23 14:06:53 $ - /// - public class PlainTextByLineDataReader : ITrainingDataReader - { - private readonly StreamReader _dataReader; - private string _nextLine; - - ///

- /// Creates a training data reader for reading text lines from a file or other text stream - ///

- /// StreamReader containing the source of the training data - public PlainTextByLineDataReader(StreamReader dataSource) - { - _dataReader = dataSource; - _nextLine = _dataReader.ReadLine(); - } - - ///

Gets the next text line from the training data

- /// Next text line from the training data - public virtual string NextToken() - { - string currentLine = _nextLine; - _nextLine = _dataReader.ReadLine(); - return currentLine; - } - - ///

Checks if there is any more training data

- /// true if there is more training data to be read - public virtual bool HasNext() - { - return (_nextLine != null); - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/TrainingEvent.cs b/BotSharp.NLP/Models/Entropy/TrainingEvent.cs deleted file mode 100644 index 5128c7787..000000000 --- a/BotSharp.NLP/Models/Entropy/TrainingEvent.cs +++ /dev/null @@ -1,94 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the Event.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -// Copyright (C) 2001 Jason Baldridge and Gann Bierner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// The context of a decision point during training. This includes - /// contextual predicates and an outcome. - ///

- /// - /// Jason Baldridge - /// - /// - /// Richard J. Northedge - /// - /// - /// based on Event.java, $Revision: 1.3 $, $Date: 2003/12/09 23:13:08 $ - /// - public class TrainingEvent - { - ///

- /// The outcome label for this training event. - ///

- public string Outcome { get; private set; } - - ///

- /// The context for this training event. - ///

- /// - /// A string array of context values for this training event. - /// - public string[] Context { get; private set; } - - ///

- /// Constructor for a training event. - ///

- /// - /// the outcome label - /// - /// - /// array containing context values - /// - public TrainingEvent(string outcome, string[] context) - { - Outcome = outcome; - Context = context; - } - - ///

- /// Override providing text summary of the training event. - ///

- /// - /// Summary of the training event. - /// - public override string ToString() - { - return Outcome + " " + string.Join(", ", Context); - } - } -} diff --git a/BotSharp.NLP/Models/Entropy/TwoPassDataIndexer.cs b/BotSharp.NLP/Models/Entropy/TwoPassDataIndexer.cs deleted file mode 100644 index f451039e7..000000000 --- a/BotSharp.NLP/Models/Entropy/TwoPassDataIndexer.cs +++ /dev/null @@ -1,282 +0,0 @@ -//Copyright (C) 2005 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the TwoPassDataIndexer.java source file found in the -//original java implementation of MaxEnt. That source file contains the following header: - -//Copyright (C) 2003 Thomas Morton -// -//This library is free software; you can redistribute it and/or -//modify it under the terms of the GNU Lesser General Public -//License as published by the Free Software Foundation; either -//version 2.1 of the License, or (at your option) any later version. -// -//This library is distributed in the hope that it will be useful, -//but WITHOUT ANY WARRANTY; without even the implied warranty of -//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//GNU General Public License for more details. -// -//You should have received a copy of the GNU Lesser General Public -//License along with this program; if not, write to the Free Software -//Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.Collections.Generic; -using System.IO; -using System.Text; - -namespace BotSharp.Models -{ - ///

- /// Collecting event and context counts by making two passes over the events. - /// The first pass determines which contexts will be used by the model, and the second - /// pass creates the events in memory containing only the contexts which will be used. - /// This greatly reduces the amount of memory required for storing the events. - /// During the first pass a temporary event file is created which is read during the second pass. - ///

- /// /// - /// Tom Morton - /// - /// /// /// - /// Richard J. Northedge - /// - public class TwoPassDataIndexer : AbstractDataIndexer - { - ///

- /// One argument constructor for DataIndexer which calls the two argument - /// constructor assuming no cutoff. - ///

- /// - /// An ITrainingEventReader which contains the list of all the events - /// seen in the training data. - /// - public TwoPassDataIndexer(ITrainingEventReader eventReader): this(eventReader, 0){} - - ///

- /// Two argument constructor for TwoPassDataIndexer. - ///

- /// - /// An ITrainingEventReader which contains the a list of all the events - /// seen in the training data. - /// - /// - /// The minimum number of times a predicate must have been - /// observed in order to be included in the model. - /// - public TwoPassDataIndexer(ITrainingEventReader eventReader, int cutoff) - { - List eventsToCompare; - - var predicateIndex = new Dictionary(); - //NotifyProgress("Indexing events using cutoff of " + cutoff + "\n"); - - //NotifyProgress("\tComputing event counts... "); - - string tempFile = new FileInfo(Path.GetTempFileName()).FullName; - - int eventCount = ComputeEventCounts(eventReader, tempFile, predicateIndex, cutoff); - //NotifyProgress("done. " + eventCount + " events"); - - //NotifyProgress("\tIndexing... "); - - using (var fileEventReader = new FileEventReader(tempFile)) - { - eventsToCompare = Index(eventCount, fileEventReader, predicateIndex); - } - - if (File.Exists(tempFile)) - { - File.Delete(tempFile); - } - - //NotifyProgress("done."); - - //NotifyProgress("Sorting and merging events... "); - SortAndMerge(eventsToCompare); - //NotifyProgress("Done indexing."); - } - - ///

- /// Reads events from eventStream into a dictionary. The - /// predicates associated with each event are counted and any which - /// occur at least cutoff times are added to the - /// predicatesInOut map along with a unique integer index. - ///

- /// - /// an ITrainingEventReader value - /// - /// - /// a file name to which the events are written to for later processing. - /// - /// - /// a Dictionary value - /// - /// - /// an int value - /// - private int ComputeEventCounts(ITrainingEventReader eventReader, string eventStoreFile, Dictionary predicatesInOut, int cutoff) - { - var counter = new Dictionary(); - int predicateIndex = 0; - int eventCount = 0; - - using (var eventStoreWriter = new StreamWriter(eventStoreFile)) - { - while (eventReader.HasNext()) - { - TrainingEvent currentTrainingEvent = eventReader.ReadNextEvent(); - eventCount++; - eventStoreWriter.Write(FileEventReader.ToLine(currentTrainingEvent)); - string[] eventContext = currentTrainingEvent.Context; - for (int currentPredicate = 0; currentPredicate < eventContext.Length; currentPredicate++) - { - if (!predicatesInOut.ContainsKey(eventContext[currentPredicate])) - { - if (counter.ContainsKey(eventContext[currentPredicate])) - { - counter[eventContext[currentPredicate]]++; - } - else - { - counter.Add(eventContext[currentPredicate], 1); - } - if (counter[eventContext[currentPredicate]] >= cutoff) - { - predicatesInOut.Add(eventContext[currentPredicate], predicateIndex++); - counter.Remove(eventContext[currentPredicate]); - } - } - } - } - } - return eventCount; - } - - private List Index(int eventCount, ITrainingEventReader eventReader, Dictionary predicateIndex) - { - var outcomeMap = new Dictionary(); - int outcomeCount = 0; - var eventsToCompare = new List(eventCount); - var indexedContext = new List(); - while (eventReader.HasNext()) - { - TrainingEvent currentTrainingEvent = eventReader.ReadNextEvent(); - string[] eventContext = currentTrainingEvent.Context; - ComparableEvent comparableEvent; - - int outcomeId; - string outcome = currentTrainingEvent.Outcome; - - if (outcomeMap.ContainsKey(outcome)) - { - outcomeId = outcomeMap[outcome]; - } - else - { - outcomeId = outcomeCount++; - outcomeMap.Add(outcome, outcomeId); - } - - for (int currentPredicate = 0; currentPredicate < eventContext.Length; currentPredicate++) - { - string predicate = eventContext[currentPredicate]; - if (predicateIndex.ContainsKey(predicate)) - { - indexedContext.Add(predicateIndex[predicate]); - } - } - - // drop events with no active features - if (indexedContext.Count > 0) - { - comparableEvent = new ComparableEvent(outcomeId, indexedContext.ToArray()); - eventsToCompare.Add(comparableEvent); - } - else - { - //"Dropped event " + currentTrainingEvent.Outcome + ":" + currentTrainingEvent.Context); - } - // recycle the list - indexedContext.Clear(); - } - SetOutcomeLabels(ToIndexedStringArray(outcomeMap)); - SetPredicateLabels(ToIndexedStringArray(predicateIndex)); - return eventsToCompare; - } - } - - class FileEventReader : ITrainingEventReader, IDisposable - { - private StreamReader mReader; - private string mCurrentLine; - - private char[] mWhitespace; - - public FileEventReader(string fileName) - { - mReader = new StreamReader(fileName, Encoding.UTF7); - mWhitespace = new char[] {'\t', '\n', '\r', ' '}; - } - - public virtual bool HasNext() - { - mCurrentLine = mReader.ReadLine(); - return (mCurrentLine != null); - } - - public virtual TrainingEvent ReadNextEvent() - { - string[] tokens = mCurrentLine.Split(mWhitespace); - string outcome = tokens[0]; - var context = new string[tokens.Length - 1]; - Array.Copy(tokens, 1, context, 0, tokens.Length - 1); - - return (new TrainingEvent(outcome, context)); - } - - public static string ToLine(TrainingEvent eventToConvert) - { - var lineBuilder = new StringBuilder(); - lineBuilder.Append(eventToConvert.Outcome); - string[] context = eventToConvert.Context; - for (int contextIndex = 0, contextLength = context.Length; contextIndex < contextLength; contextIndex++) - { - lineBuilder.Append(" " + context[contextIndex]); - } - lineBuilder.Append(System.Environment.NewLine); - return lineBuilder.ToString(); - } - - public void Dispose() - { - Dispose(true); - GC.SuppressFinalize(this); - } - - protected virtual void Dispose(bool disposing) - { - if (disposing) - { - mReader.Close(); - } - } - - ~FileEventReader() - { - Dispose (false); - } - } -} diff --git a/BotSharp.NLP/Models/WordNet/DataFileEngine.cs b/BotSharp.NLP/Models/WordNet/DataFileEngine.cs deleted file mode 100644 index 640dc70b9..000000000 --- a/BotSharp.NLP/Models/WordNet/DataFileEngine.cs +++ /dev/null @@ -1,511 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.IO; -using System.Collections.Generic; - -namespace BotSharp.Models -{ - ///

- /// Summary description for DataFileEngine. - ///

- public class DataFileEngine : WordNetEngine - { - private readonly string _dataFolder; - private readonly Dictionary _dataFileDictionary; - private string[] _lexicographerFiles; - private Dictionary _relationTypeDictionary; - - - // Public Methods (class specific) ------------------ - public string DataFolder - { - get - { - return _dataFolder; - } - } - - public DataFileEngine(string dataFolder) - { - _dataFolder = dataFolder; - - _dataFileDictionary = new Dictionary(4) - { - {"noun", new PosDataFileSet(dataFolder, "noun")}, - {"verb", new PosDataFileSet(dataFolder, "verb")}, - {"adjective", new PosDataFileSet(dataFolder, "adj")}, - {"adverb", new PosDataFileSet(dataFolder, "adv")} - }; - - InitializeLexicographerFiles(); - - InitializeRelationTypes(); - } - - - // abstract methods implementation ------------------ - - public override string[] GetPartsOfSpeech() - { - return new List(_dataFileDictionary.Keys).ToArray(); - } - - public override string[] GetPartsOfSpeech(string lemma) - { - var partsOfSpeech = new List(); - foreach (string partOfSpeech in _dataFileDictionary.Keys) - { - if (BinarySearch(lemma, _dataFileDictionary[partOfSpeech].IndexFile) != null) - { - partsOfSpeech.Add(partOfSpeech); - } - } - return partsOfSpeech.ToArray(); - } - - public override IndexWord[] GetAllIndexWords(string partOfSpeech) - { - StreamReader searchFile = _dataFileDictionary[partOfSpeech].IndexFile; - string line; - string space = " "; - var indexWords = new List(); - searchFile.DiscardBufferedData(); - searchFile.BaseStream.Position = 0; - while (!searchFile.EndOfStream) - { - line = searchFile.ReadLine(); - if (!line.StartsWith(space)) - { - indexWords.Add(CreateIndexWord(partOfSpeech, line)); - } - } - return indexWords.ToArray(); - } - - public override IndexWord GetIndexWord(string lemma, string partOfSpeech) - { - string line = BinarySearch(lemma, _dataFileDictionary[partOfSpeech].IndexFile); - if (line != null) - { - return CreateIndexWord(partOfSpeech, line); - } - return null; - } - - public override Synset[] GetSynsets(string lemma) - { - var synsets = new List(); - - foreach (string partOfSpeech in _dataFileDictionary.Keys) - { - IndexWord indexWord = GetIndexWord(lemma, partOfSpeech); - - if (indexWord != null) - { - foreach (int synsetOffset in indexWord.SynsetOffsets) - { - Synset synset = CreateSynset(partOfSpeech, synsetOffset); - synsets.Add(synset); - } - } - } - return synsets.ToArray(); - } - - public override Synset[] GetSynsets(string lemma, string partOfSpeech) - { - var synsets = new List(); - - IndexWord indexWord = GetIndexWord(lemma, partOfSpeech); - - if (indexWord != null) - { - foreach (int synsetOffset in indexWord.SynsetOffsets) - { - Synset synset = CreateSynset(partOfSpeech, synsetOffset); - synsets.Add(synset); - } - } - - return synsets.ToArray(); - } - - public override RelationType[] GetRelationTypes(string lemma, string partOfSpeech) - { - IndexWord indexWord = GetIndexWord(lemma, partOfSpeech); - - if (indexWord != null) - { - if (indexWord.RelationTypes != null) - { - int relationTypeCount = indexWord.RelationTypes.Length; - var relationTypes = new RelationType[relationTypeCount]; - for (int currentRelationType = 0; currentRelationType < relationTypeCount; currentRelationType++) - { - relationTypes[currentRelationType] = _relationTypeDictionary[indexWord.RelationTypes[currentRelationType]]; - } - return relationTypes; - } - return null; - } - return null; - } - - public override Synset GetSynset(string lemma, string partOfSpeech, int senseNumber) - { - if (senseNumber < 1) - { - throw new ArgumentOutOfRangeException("senseNumber", senseNumber, "cannot be less than 1"); - } - - IndexWord indexWord = GetIndexWord(lemma, partOfSpeech); - - if (indexWord != null) - { - if (senseNumber > (indexWord.SynsetOffsets.Length + 1)) - { - return (null); - } - Synset synset = CreateSynset(partOfSpeech, indexWord.SynsetOffsets[senseNumber - 1]); - return (synset); - } - return null; - } - - - // Private Methods---------------------------------- - - private string BinarySearch(string searchKey, StreamReader searchFile) - { - if (searchKey.Length == 0) - { - return null; - } - - int c,n; - long top,bot,mid,diff; - string line,key; - diff = 666; - line = ""; - bot = searchFile.BaseStream.Seek(0, SeekOrigin.End); - top = 0; - mid = (bot-top)/2; - - do - { - searchFile.DiscardBufferedData(); - searchFile.BaseStream.Position = mid - 1; - if (mid != 1) - { - while ((c = searchFile.Read()) != '\n' && c != -1) { } - } - line = searchFile.ReadLine(); - if (line == null) - { - return null; - } - n = line.IndexOf(' '); - key = line.Substring(0,n); - key=key.Replace("-"," ").Replace("_"," "); - if (string.CompareOrdinal(key, searchKey) < 0) - { - top = mid; - diff = (bot - top)/2; - mid = top + diff; - } - if (string.CompareOrdinal(key, searchKey) > 0) - { - bot = mid; - diff = (bot - top)/2; - mid = top + diff; - } - } while (key!=searchKey && diff!=0); - - if (key == searchKey) - { - return line; - } - return null; - } - - private IndexWord CreateIndexWord(string partOfSpeech, string line) - { - var tokenizer = new Tokenizer(line); - string word = tokenizer.NextToken().Replace('_', ' '); - string redundantPartOfSpeech = tokenizer.NextToken(); - int senseCount = int.Parse(tokenizer.NextToken()); - - int relationTypeCount = int.Parse(tokenizer.NextToken()); - string[] relationTypes = null; - if (relationTypeCount > 0) - { - relationTypes = new string[relationTypeCount]; - for (int currentRelationType = 0; currentRelationType < relationTypeCount; currentRelationType++) - { - relationTypes[currentRelationType] = tokenizer.NextToken(); - } - } - int redundantSenseCount = int.Parse(tokenizer.NextToken()); - int tagSenseCount = int.Parse(tokenizer.NextToken()); - - int[] synsetOffsets = null; - if (senseCount > 0) - { - synsetOffsets = new int[senseCount]; - for (int currentOffset = 0; currentOffset < senseCount; currentOffset++) - { - synsetOffsets[currentOffset] = int.Parse(tokenizer.NextToken()); - } - } - return new IndexWord(word, partOfSpeech, relationTypes, synsetOffsets, tagSenseCount); - } - - protected internal override Synset CreateSynset(string partOfSpeech, int synsetOffset) - { - StreamReader dataFile = _dataFileDictionary[partOfSpeech].DataFile; - dataFile.DiscardBufferedData(); - dataFile.BaseStream.Seek(synsetOffset, SeekOrigin.Begin); - string record = dataFile.ReadLine(); - - var tokenizer = new Tokenizer(record); - var nextToken = tokenizer.NextToken(); - int offset = int.Parse(nextToken); - - - var nt = int.Parse(tokenizer.NextToken()); - string lexicographerFile = _lexicographerFiles[nt]; - string synsetType = tokenizer.NextToken(); - int wordCount = int.Parse(tokenizer.NextToken(), System.Globalization.NumberStyles.HexNumber); - - var words = new string[wordCount]; - for (int iCurrentWord = 0; iCurrentWord < wordCount; iCurrentWord++) - { - words[iCurrentWord] = tokenizer.NextToken().Replace("_", " "); - int uniqueID = int.Parse(tokenizer.NextToken(), System.Globalization.NumberStyles.HexNumber); - } - - int relationCount = int.Parse(tokenizer.NextToken()); - var relations = new Relation[relationCount]; - for (int currentRelation = 0; currentRelation < relationCount; currentRelation++) - { - string relationTypeKey = tokenizer.NextToken(); -// if (fpos.name=="adj" && sstype==AdjSynSetType.DontKnow) -// { -// if (ptrs[j].ptp.mnemonic=="ANTPTR") -// sstype = AdjSynSetType.DirectAnt; -// else if (ptrs[j].ptp.mnemonic=="PERTPTR") -// sstype = AdjSynSetType.Pertainym; -// } - int targetSynsetOffset = int.Parse(tokenizer.NextToken()); - string targetPartOfSpeech = tokenizer.NextToken(); - switch (targetPartOfSpeech) - { - case "n": - targetPartOfSpeech = "noun"; - break; - case "v": - targetPartOfSpeech = "verb"; - break; - case "a": - case "s": - targetPartOfSpeech = "adjective"; - break; - case "r": - targetPartOfSpeech = "adverb"; - break; - } - - int sourceTarget = int.Parse(tokenizer.NextToken(), System.Globalization.NumberStyles.HexNumber); - if (sourceTarget == 0) - { - relations[currentRelation] = new Relation(this, (RelationType)_relationTypeDictionary[relationTypeKey], targetSynsetOffset, targetPartOfSpeech); - } - else - { - int sourceWord = sourceTarget >> 8; - int targetWord = sourceTarget & 0xff; - relations[currentRelation] = new Relation(this, (RelationType)_relationTypeDictionary[relationTypeKey], targetSynsetOffset, targetPartOfSpeech, sourceWord, targetWord); - } - } - string frameData = tokenizer.NextToken(); - if (frameData != "|") - { - int frameCount = int.Parse(frameData); - for (int currentFrame = 0; currentFrame < frameCount; currentFrame++) - { - frameData = tokenizer.NextToken(); // + - int frameNumber = int.Parse(tokenizer.NextToken()); - int wordID = int.Parse(tokenizer.NextToken(), System.Globalization.NumberStyles.HexNumber); - } - frameData = tokenizer.NextToken(); - } - string gloss = record.Substring(record.IndexOf('|') + 1); - - var synset = new Synset(synsetOffset, gloss, words, lexicographerFile, relations); - return synset; - } - - protected internal override string[] GetExceptionForms(string lemma, string partOfSpeech) - { - string line = BinarySearch(lemma, _dataFileDictionary[partOfSpeech].ExceptionFile); - if (line != null) - { - var exceptionForms = new List(); - var tokenizer = new Tokenizer(line); - string skipWord = tokenizer.NextToken(); - string word = tokenizer.NextToken(); - while (word != null) - { - exceptionForms.Add(word); - word = tokenizer.NextToken(); - } - return exceptionForms.ToArray(); - } - return mEmpty; - } - - private void InitializeLexicographerFiles() - { - _lexicographerFiles = new string[45]; - - _lexicographerFiles[0] = "adj.all - all adjective clusters"; - _lexicographerFiles[1] = "adj.pert - relational adjectives (pertainyms)"; - _lexicographerFiles[2] = "adv.all - all adverbs"; - _lexicographerFiles[3] = "noun.Tops - unique beginners for nouns"; - _lexicographerFiles[4] = "noun.act - nouns denoting acts or actions"; - _lexicographerFiles[5] = "noun.animal - nouns denoting animals"; - _lexicographerFiles[6] = "noun.artifact - nouns denoting man-made objects"; - _lexicographerFiles[7] = "noun.attribute - nouns denoting attributes of people and objects"; - _lexicographerFiles[8] = "noun.body - nouns denoting body parts"; - _lexicographerFiles[9] = "noun.cognition - nouns denoting cognitive processes and contents"; - _lexicographerFiles[10] = "noun.communication - nouns denoting communicative processes and contents"; - _lexicographerFiles[11] = "noun.event - nouns denoting natural events"; - _lexicographerFiles[12] = "noun.feeling - nouns denoting feelings and emotions"; - _lexicographerFiles[13] = "noun.food - nouns denoting foods and drinks"; - _lexicographerFiles[14] = "noun.group - nouns denoting groupings of people or objects"; - _lexicographerFiles[15] = "noun.location - nouns denoting spatial position"; - _lexicographerFiles[16] = "noun.motive - nouns denoting goals"; - _lexicographerFiles[17] = "noun.object - nouns denoting natural objects (not man-made)"; - _lexicographerFiles[18] = "noun.person - nouns denoting people"; - _lexicographerFiles[19] = "noun.phenomenon - nouns denoting natural phenomena"; - _lexicographerFiles[20] = "noun.plant - nouns denoting plants"; - _lexicographerFiles[21] = "noun.possession - nouns denoting possession and transfer of possession"; - _lexicographerFiles[22] = "noun.process - nouns denoting natural processes"; - _lexicographerFiles[23] = "noun.quantity - nouns denoting quantities and units of measure"; - _lexicographerFiles[24] = "noun.relation - nouns denoting relations between people or things or ideas"; - _lexicographerFiles[25] = "noun.shape - nouns denoting two and three dimensional shapes"; - _lexicographerFiles[26] = "noun.state - nouns denoting stable states of affairs"; - _lexicographerFiles[27] = "noun.substance - nouns denoting substances"; - _lexicographerFiles[28] = "noun.time - nouns denoting time and temporal relations"; - _lexicographerFiles[29] = "verb.body - verbs of grooming, dressing and bodily care"; - _lexicographerFiles[30] = "verb.change - verbs of size, temperature change, intensifying, etc."; - _lexicographerFiles[31] = "verb.cognition - verbs of thinking, judging, analyzing, doubting"; - _lexicographerFiles[32] = "verb.communication - verbs of telling, asking, ordering, singing"; - _lexicographerFiles[33] = "verb.competition - verbs of fighting, athletic activities"; - _lexicographerFiles[34] = "verb.consumption - verbs of eating and drinking"; - _lexicographerFiles[35] = "verb.contact - verbs of touching, hitting, tying, digging"; - _lexicographerFiles[36] = "verb.creation - verbs of sewing, baking, painting, performing"; - _lexicographerFiles[37] = "verb.emotion - verbs of feeling"; - _lexicographerFiles[38] = "verb.motion - verbs of walking, flying, swimming"; - _lexicographerFiles[39] = "verb.perception - verbs of seeing, hearing, feeling"; - _lexicographerFiles[40] = "verb.possession - verbs of buying, selling, owning"; - _lexicographerFiles[41] = "verb.social - verbs of political and social activities and events"; - _lexicographerFiles[42] = "verb.stative - verbs of being, having, spatial relations"; - _lexicographerFiles[43] = "verb.weather - verbs of raining, snowing, thawing, thundering"; - _lexicographerFiles[44] = "adj.ppl - participial adjectives"; - - } - - private void InitializeRelationTypes() - { - _relationTypeDictionary = new Dictionary(30) - { - {"!", new RelationType("Antonym", new string[] {"noun", "verb", "adjective", "adverb"})}, - {"@", new RelationType("Hypernym", new string[] {"noun", "verb"})}, - {"@i", new RelationType("Instance Hypernym", new string[] {"noun"})}, - {"~", new RelationType("Hyponym", new string[] {"noun", "verb"})}, - {"~i", new RelationType("Instance Hyponym", new string[] {"noun"})}, - {"#m", new RelationType("Member holonym", new string[] {"noun"})}, - {"#s", new RelationType("Substance holonym", new string[] {"noun"})}, - {"#p", new RelationType("Part holonym", new string[] {"noun"})}, - {"%m", new RelationType("Member meronym", new string[] {"noun"})}, - {"%s", new RelationType("Substance meronym", new string[] {"noun"})}, - {"%p", new RelationType("Part meronym", new string[] {"noun"})}, - {"=", new RelationType("Attribute", new string[] {"noun", "adjective"})}, - {"+", new RelationType("Derivationally related form", new string[] {"noun", "verb"})}, - {";c", new RelationType("Domain of synset - TOPIC", new string[] {"noun", "verb", "adjective", "adverb"})}, - {"-c", new RelationType("Member of this domain - TOPIC", new string[] {"noun"})}, - {";r", new RelationType("Domain of synset - REGION", new string[] {"noun", "verb", "adjective", "adverb"})}, - {"-r", new RelationType("Member of this domain - REGION", new string[] {"noun"})}, - {";u", new RelationType("Domain of synset - USAGE", new string[] {"noun", "verb", "adjective", "adverb"})}, - {"-u", new RelationType("Member of this domain - USAGE", new string[] {"noun"})}, - {"*", new RelationType("Entailment", new string[] {"verb"})}, - {">", new RelationType("Cause", new string[] {"verb"})}, - {"^", new RelationType("Also see", new string[] {"verb", "adjective"})}, - {"$", new RelationType("Verb Group", new string[] {"verb"})}, - {"&", new RelationType("Similar to", new string[] {"adjective"})}, - {"<", new RelationType("Participle of verb", new string[] {"adjective"})}, - {@"\", new RelationType("Pertainym", new string[] {"adjective", "adverb"})} - }; - - //moRelationTypeDictionary.Add(";", new RelationType("Domain of synset", new string[] {"noun", "verb", "adjective", "adverb"})); - //moRelationTypeDictionary.Add("-", new RelationType("Member of this domain", new string[] {"noun"})); - - } - - private class PosDataFileSet - { - private readonly StreamReader _indexFile; - private readonly StreamReader _dataFile; - private readonly StreamReader _exceptionFile; - - public StreamReader IndexFile - { - get - { - return _indexFile; - } - } - - public StreamReader DataFile - { - get - { - return _dataFile; - } - } - - public StreamReader ExceptionFile - { - get - { - return _exceptionFile; - } - } - - public PosDataFileSet(string dataFolder, string partOfSpeech) - { - _indexFile = new StreamReader(Path.Combine(dataFolder, "index." + partOfSpeech)); - _dataFile = new StreamReader(Path.Combine(dataFolder, "data." + partOfSpeech)); - _exceptionFile = new StreamReader(Path.Combine(dataFolder, partOfSpeech + ".exc")); - } - } - - - } -} diff --git a/BotSharp.NLP/Models/WordNet/IndexWord.cs b/BotSharp.NLP/Models/WordNet/IndexWord.cs deleted file mode 100644 index ebeee561d..000000000 --- a/BotSharp.NLP/Models/WordNet/IndexWord.cs +++ /dev/null @@ -1,56 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.Linq; - -namespace BotSharp.Models -{ - ///

- /// Summary description for IndexWord. - ///

- public class IndexWord - { - // Properties ------------------------ - - public string PartOfSpeech { get; private set; } - - public int[] SynsetOffsets { get; private set; } - - public string Lemma { get; private set; } - - public int SenseCount - { - get { return this.SynsetOffsets != null ? this.SynsetOffsets.Count() : 0; } - } - - public int TagSenseCount { get; private set; } - - public string[] RelationTypes { get; private set; } - - - // Constructors -------------------- - - public IndexWord(string lemma, string partOfSpeech, string[] relationTypes, int[] synsetOffsets, int tagSenseCount) - { - this.Lemma = lemma; - this.PartOfSpeech = partOfSpeech; - this.RelationTypes = relationTypes; - this.SynsetOffsets = synsetOffsets; - this.TagSenseCount = tagSenseCount; - } - } -} diff --git a/BotSharp.NLP/Models/WordNet/Morph/AbstractDelegatingOperation.cs b/BotSharp.NLP/Models/WordNet/Morph/AbstractDelegatingOperation.cs deleted file mode 100644 index 85ef931d9..000000000 --- a/BotSharp.NLP/Models/WordNet/Morph/AbstractDelegatingOperation.cs +++ /dev/null @@ -1,74 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the AbstractDelegatingOperation.java source file found in -//the Java WordNet Library (JWNL). That source file is licensed under BSD. - -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.Models.Morph -{ - public abstract class AbstractDelegatingOperation : IOperation - { - private Dictionary mOperationSets; - - public virtual void AddDelegate(string key, IOperation[] operations) - { - if (!mOperationSets.ContainsKey(key)) - { - mOperationSets.Add(key, operations); - } - else - { - mOperationSets[key] = operations; - } - } - - protected internal AbstractDelegatingOperation() - { - mOperationSets = new Dictionary(); - } - - //protected internal abstract AbstractDelegatingOperation getInstance(System.Collections.IDictionary params_Renamed); - - protected internal virtual bool HasDelegate(string key) - { - return mOperationSets.ContainsKey(key); - } - - protected internal virtual bool ExecuteDelegate(string lemma, string partOfSpeech, ListbaseForms, string key) - { - IOperation[] operations = mOperationSets[key]; - bool result = false; - for (int currentOperation = 0; currentOperation < operations.Length; currentOperation++) - { - if (operations[currentOperation].Execute(lemma, partOfSpeech, baseForms)) - { - result = true; - } - } - return result; - } - - #region IOperation Members - - public abstract bool Execute(string lemma, string partOfSpeech, List baseForms); - - #endregion - } -} diff --git a/BotSharp.NLP/Models/WordNet/Morph/DetachSuffixesOperation.cs b/BotSharp.NLP/Models/WordNet/Morph/DetachSuffixesOperation.cs deleted file mode 100644 index e69ad11cc..000000000 --- a/BotSharp.NLP/Models/WordNet/Morph/DetachSuffixesOperation.cs +++ /dev/null @@ -1,67 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the DetachSuffixesOperation.java source file found in -//the Java WordNet Library (JWNL). That source file is licensed under BSD. - -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.Models.Morph -{ - ///

- /// Remove all applicable suffixes from the word(s) and do a look-up. - ///

- public class DetachSuffixesOperation : AbstractDelegatingOperation - { - public const string Operations = "operations"; - - private Dictionary mSuffixMap; - - public DetachSuffixesOperation(Dictionary suffixMap) - { - mSuffixMap = suffixMap; - } - - #region IOperation Members - - public override bool Execute(string lemma, string partOfSpeech, List baseForms) - { - if (!mSuffixMap.ContainsKey(partOfSpeech)) - { - return false; - } - string[][] suffixArray = mSuffixMap[partOfSpeech]; - - bool addedBaseForm = false; - for (int currentSuffix = 0; currentSuffix < suffixArray.Length; currentSuffix++) - { - if (lemma.EndsWith(suffixArray[currentSuffix][0])) - { - string stem = lemma.Substring(0, (lemma.Length - suffixArray[currentSuffix][0].Length) - (0)) + suffixArray[currentSuffix][1]; - if (ExecuteDelegate(stem, partOfSpeech, baseForms, Operations)) - { - addedBaseForm = true; - } - } - } - return addedBaseForm; - } - - #endregion - } -} diff --git a/BotSharp.NLP/Models/WordNet/Morph/IOperation.cs b/BotSharp.NLP/Models/WordNet/Morph/IOperation.cs deleted file mode 100644 index c7f41e1f8..000000000 --- a/BotSharp.NLP/Models/WordNet/Morph/IOperation.cs +++ /dev/null @@ -1,46 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the Operation.java source file found in -//the Java WordNet Library (JWNL). That source file is licensed under BSD. - -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.Models.Morph -{ - public interface IOperation - { - ///

- /// Execute the operation. - ///

- /// - /// input lemma to look up - /// - /// - /// part of speech of the lemma to look up - /// - /// - /// List to which all discovered base forms should be added. - /// - /// - /// True if at least one base form was discovered by the operation and - /// added to baseForms. - /// - bool Execute(string lemma, string partOfSpeech, List baseForms); - } -} diff --git a/BotSharp.NLP/Models/WordNet/Morph/LookupExceptionsOperation.cs b/BotSharp.NLP/Models/WordNet/Morph/LookupExceptionsOperation.cs deleted file mode 100644 index 41419617d..000000000 --- a/BotSharp.NLP/Models/WordNet/Morph/LookupExceptionsOperation.cs +++ /dev/null @@ -1,57 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the LookupExceptionsOperation.java source file found in -//the Java WordNet Library (JWNL). That source file is licensed under BSD. - -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.Models.Morph -{ - ///

Lookup the word in the exceptions file of the given part-of-speech.

- public class LookupExceptionsOperation : IOperation - { - private WordNetEngine mEngine; - - public LookupExceptionsOperation(WordNetEngine engine) - { - mEngine = engine; - } - - #region IOperation Members - - public bool Execute(string lemma, string partOfSpeech, List baseForms) - { - bool addedBaseForm = false; - string[] exceptionForms = mEngine.GetExceptionForms(lemma, partOfSpeech); - - foreach (string exceptionForm in exceptionForms) - { - if (!baseForms.Contains(exceptionForm)) - { - baseForms.Add(exceptionForm); - addedBaseForm = true; - } - } - - return addedBaseForm; - } - - #endregion - } -} diff --git a/BotSharp.NLP/Models/WordNet/Morph/LookupIndexWordOperation.cs b/BotSharp.NLP/Models/WordNet/Morph/LookupIndexWordOperation.cs deleted file mode 100644 index 10f311df2..000000000 --- a/BotSharp.NLP/Models/WordNet/Morph/LookupIndexWordOperation.cs +++ /dev/null @@ -1,49 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the LookupIndexWordOperation.java source file found in -//the Java WordNet Library (JWNL). That source file is licensed under BSD. - -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.Models.Morph -{ - public class LookupIndexWordOperation : IOperation - { - private WordNetEngine mEngine; - - public LookupIndexWordOperation(WordNetEngine engine) - { - mEngine = engine; - } - - #region IOperation Members - - public bool Execute(string lemma, string partOfSpeech, List baseForms) - { - if (!baseForms.Contains(lemma) && mEngine.GetIndexWord(lemma, partOfSpeech) != null) - { - baseForms.Add(lemma); - return true; - } - return false; - } - - #endregion - } -} diff --git a/BotSharp.NLP/Models/WordNet/Morph/TokenizerOperation.cs b/BotSharp.NLP/Models/WordNet/Morph/TokenizerOperation.cs deleted file mode 100644 index c2d3c94da..000000000 --- a/BotSharp.NLP/Models/WordNet/Morph/TokenizerOperation.cs +++ /dev/null @@ -1,181 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the TokenizerOperation.java source file found in -//the Java WordNet Library (JWNL). That source file is licensed under BSD. - -using System; -using System.Collections.Generic; -using System.Text; -using System.Collections; - -namespace BotSharp.Models.Morph -{ - public class TokenizerOperation : AbstractDelegatingOperation - { - ///

- /// Parameter that determines the operations this operation - /// will perform on the tokens. - ///

- public const string TokenOperations = "token_operations"; - ///

- /// Parameter that determines the operations this operation - /// will perform on the phrases. - ///

- public const string PhraseOperations = "phrase_operations"; - ///

- /// Parameter list that determines the delimiters this - /// operation will use to concatenate tokens. - ///

- public const string Delimiters = "delimiters"; - - private WordNetEngine mEngine; - - private string[] mDelimiters; - - public TokenizerOperation(WordNetEngine engine) - { - mEngine = engine; - } - - public TokenizerOperation(WordNetEngine engine, string[] delimiters) - { - mEngine = engine; - mDelimiters = delimiters; - } - - #region IOperation Members - - public override bool Execute(string lemma, string partOfSpeech, List baseForms) - { - string[] tokens = Util.Split(lemma); - List[] tokenForms = new List[tokens.Length]; - - if (!HasDelegate(TokenOperations)) - { - AddDelegate(TokenOperations, new IOperation[] { new LookupIndexWordOperation(mEngine) }); - } - if (!HasDelegate(PhraseOperations)) - { - AddDelegate(PhraseOperations, new IOperation[] { new LookupIndexWordOperation(mEngine) }); - } - - for (int currentToken = 0; currentToken < tokens.Length; currentToken++) - { - tokenForms[currentToken] = new List(); - tokenForms[currentToken].Add(tokens[currentToken]); - ExecuteDelegate(tokens[currentToken], partOfSpeech, tokenForms[currentToken], TokenOperations); - } - bool foundForms = false; - for (int currentTokenForm = 0; currentTokenForm < tokenForms.Length; currentTokenForm++) - { - for (int tokenFormToCompare = tokenForms.Length - 1; tokenFormToCompare >= currentTokenForm; tokenFormToCompare--) - { - if (TryAllCombinations(partOfSpeech, tokenForms, currentTokenForm, tokenFormToCompare, baseForms)) - { - foundForms = true; - } - } - } - return foundForms; - } - - #endregion - - private bool TryAllCombinations(string partOfSpeech, List[] tokenForms, int startIndex, int endIndex, List baseForms) - { - int length = endIndex - startIndex + 1; - int[] indexArray = new int[length]; - int[] endArray = new int[length]; - for (int i = 0; i < indexArray.Length; i++) - { - indexArray[i] = 0; - endArray[i] = tokenForms[startIndex + i].Count - 1; - } - - bool foundForms = false; - for (; ; ) - { - string[] tokens = new string[length]; - for (int i = 0; i < length; i++) - { - tokens[i] = tokenForms[i + startIndex][indexArray[i]]; - } - for (int i = 0; i < mDelimiters.Length; i++) - { - if (TryAllCombinations(partOfSpeech, tokens, mDelimiters[i], baseForms)) - { - foundForms = true; - } - } - - if (IsArrayEqual(indexArray, endArray)) - { - break; - } - - for (int i = length - 1; i >= 0; i--) - { - if (indexArray[i] == endArray[i]) - { - indexArray[i] = 0; - } - else - { - indexArray[i]++; - break; - } - } - } - return foundForms; - } - - private bool TryAllCombinations(string partOfSpeech, string[] tokens, string delimiter, List baseForms) - { - BitArray bits = new BitArray(64); - int size = tokens.Length - 1; - - bool foundForms = false; - do - { - string lemma = Util.GetLemma(tokens, bits, delimiter); - if (ExecuteDelegate(lemma, partOfSpeech, baseForms, PhraseOperations)) - { - foundForms = true; - } - } - while (Util.Increment(bits, size)); - - return foundForms; - } - - private bool IsArrayEqual(int[] array1, int[] array2) - { - if (array1.Length != array2.Length) - { - return false; - } - for (int i = 0; i < array1.Length; i++) - { - if (array1[i] != array2[i]) - { - return false; - } - } - return true; - } - } -} diff --git a/BotSharp.NLP/Models/WordNet/Morph/Util.cs b/BotSharp.NLP/Models/WordNet/Morph/Util.cs deleted file mode 100644 index fe65911da..000000000 --- a/BotSharp.NLP/Models/WordNet/Morph/Util.cs +++ /dev/null @@ -1,89 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -//This file is based on the Util.java source file found in -//the Java WordNet Library (JWNL). That source file is licensed under BSD. - -using System; -using System.Collections; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.Models.Morph -{ - public class Util - { - public static string GetLemma(string[] tokens, BitArray bits, string delimiter) - { - StringBuilder buf = new StringBuilder(); - for (int i = 0; i < tokens.Length; i++) - { - if (i != 0 && !bits.Get(i - 1)) - { - buf.Append(delimiter); - } - buf.Append(tokens[i]); - } - return buf.ToString(); - } - - public static bool Increment(BitArray bits, int size) - { - int i = size - 1; - while (i >= 0 && bits.Get(i)) - { - bits.Set(i--, false); - } - if (i < 0) - { - return false; - } - bits.Set(i, true); - return true; - } - - public static string[] Split(string str) - { - char[] chars = str.ToCharArray(); - List tokens = new List(); - StringBuilder buf = new StringBuilder(); - for (int i = 0; i < chars.Length; i++) - { - if ((chars[i] >= 'a' && chars[i] <= 'z') || chars[i] == '\'') - { - buf.Append(chars[i]); - } - else - { - if (buf.Length > 0) - { - tokens.Add(buf.ToString()); - buf = new StringBuilder(); - } - } - } - if (buf.Length > 0) - { - tokens.Add(buf.ToString()); - } - return (tokens.ToArray()); - } - - private Util() - { - } - } -} diff --git a/BotSharp.NLP/Models/WordNet/Relation.cs b/BotSharp.NLP/Models/WordNet/Relation.cs deleted file mode 100644 index 8b201086f..000000000 --- a/BotSharp.NLP/Models/WordNet/Relation.cs +++ /dev/null @@ -1,85 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// Summary description for Relation. - ///

- public class Relation - { - private WordNetEngine mWordNetEngine; - - private RelationType mRelationType; - - private int mTargetSynsetOffset; - private string mTargetSynsetPartOfSpeech; - - private Synset mTargetSynset; - - private int miSourceWord; - private int miTargetWord; - - public RelationType SynsetRelationType - { - get - { - return mRelationType; - } - } - - public int TargetSynsetOffset - { - get - { - return mTargetSynsetOffset; - } - } - - public Synset TargetSynset - { - get - { - if (mTargetSynset == null) - { - mTargetSynset = mWordNetEngine.CreateSynset(mTargetSynsetPartOfSpeech, mTargetSynsetOffset); - } - return mTargetSynset; - } - } - - private Relation() - { - } - - protected internal Relation(WordNetEngine wordNetEngine, RelationType relationType, int targetSynsetOffset, string targetSynsetPartOfSpeech) - { - mWordNetEngine = wordNetEngine; - mRelationType = relationType; - - mTargetSynsetOffset = targetSynsetOffset; - mTargetSynsetPartOfSpeech = targetSynsetPartOfSpeech; - } - - protected internal Relation(WordNetEngine wordNetEngine, RelationType relationType, int targetSynsetOffset, string targetSynsetPartOfSpeech, int sourceWord, int targetWord) : this(wordNetEngine, relationType, targetSynsetOffset, targetSynsetPartOfSpeech) - { - miSourceWord = sourceWord; - miTargetWord = targetWord; - } - } -} diff --git a/BotSharp.NLP/Models/WordNet/RelationType.cs b/BotSharp.NLP/Models/WordNet/RelationType.cs deleted file mode 100644 index fb03d63ec..000000000 --- a/BotSharp.NLP/Models/WordNet/RelationType.cs +++ /dev/null @@ -1,72 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// Summary description for RelationType. - ///

- public class RelationType - { - private string mName; - private RelationType mOpposite; - private string[] mPartsOfSpeech; - - public string Name - { - get - { - return mName; - } - } - - public RelationType Opposite - { - get - { - return mOpposite; - } - } - - public string GetPartOfSpeech(int index) - { - return mPartsOfSpeech[index]; - } - - public int PartsOfSpeechCount - { - get - { - return mPartsOfSpeech.Length; - } - } - - protected internal RelationType(string name, string[] partsOfSpeech) - { - mName = name; - mPartsOfSpeech = partsOfSpeech; - } - - protected internal RelationType(string name, RelationType opposite, string[] partsOfSpeech) - { - mName = name; - mOpposite = opposite; - mPartsOfSpeech = partsOfSpeech; - } - } -} diff --git a/BotSharp.NLP/Models/WordNet/Synset.cs b/BotSharp.NLP/Models/WordNet/Synset.cs deleted file mode 100644 index 85595190f..000000000 --- a/BotSharp.NLP/Models/WordNet/Synset.cs +++ /dev/null @@ -1,113 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// Summary description for Synset. - ///

- public class Synset - { - private int mOffset; - private string mGloss; - private string[] mWordList; - private string mLexicographerFile; - private Relation[] mRelations; - - private Synset() - { - } - - internal Synset(int offset, string gloss, string[] wordList, string lexicographerFile, Relation[] relations) - { - mOffset = offset; - mGloss = gloss; - mWordList = wordList; - mLexicographerFile = lexicographerFile; - mRelations = relations; - } - - public int Offset - { - get - { - return mOffset; - } - } - - public string Gloss - { - get - { - return mGloss; - } - } - - public string GetWord(int wordIndex) - { - return mWordList[wordIndex]; - } - - public int WordCount - { - get - { - return mWordList.Length; - } - } - - public string LexicographerFile - { - get - { - return mLexicographerFile; - } - } - - public Relation GetRelation(int relationIndex) - { - return mRelations[relationIndex]; - } - - public int RelationCount - { - get - { - return mRelations.Length; - } - } - - public override string ToString() - { - System.Text.StringBuilder oOutput = new System.Text.StringBuilder(); - - for (int iCurrentWord = 0; iCurrentWord < mWordList.Length; iCurrentWord++) - { - oOutput.Append(mWordList[iCurrentWord]); - if (iCurrentWord < mWordList.Length - 1) - { - oOutput.Append(", "); - } - } - - oOutput.Append(" -- ").Append(mGloss); - - return oOutput.ToString(); - } - } -} diff --git a/BotSharp.NLP/Models/WordNet/Tokenizer.cs b/BotSharp.NLP/Models/WordNet/Tokenizer.cs deleted file mode 100644 index ac351d70f..000000000 --- a/BotSharp.NLP/Models/WordNet/Tokenizer.cs +++ /dev/null @@ -1,49 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; - -namespace BotSharp.Models -{ - ///

- /// Summary description for Tokenizer. - ///

- public class Tokenizer - { - private readonly string[] _tokens; - int _position; - - public Tokenizer(string input, params char[] separators) - { - _tokens = input.Split(separators); - _position = 0; - } - - public string NextToken() - { - while (_position < _tokens.Length) - { - if ((_tokens[_position].Length > 0)) - { - return _tokens[_position++]; - } - _position++; - } - return null; - } - - } -} diff --git a/BotSharp.NLP/Models/WordNet/WordNetEngine.cs b/BotSharp.NLP/Models/WordNet/WordNetEngine.cs deleted file mode 100644 index 2256d4c81..000000000 --- a/BotSharp.NLP/Models/WordNet/WordNetEngine.cs +++ /dev/null @@ -1,151 +0,0 @@ -//Copyright (C) 2006 Richard J. Northedge -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -using System; -using System.Collections.Generic; - -namespace BotSharp.Models -{ - ///

- /// Summary description for WordNetEngine. - ///

- public abstract class WordNetEngine - { - private Morph.IOperation[] mDefaultOperations; - - protected string[] mEmpty = new string[0]; - - public abstract string[] GetPartsOfSpeech(); - - public abstract string[] GetPartsOfSpeech(string lemma); - - public abstract IndexWord[] GetAllIndexWords(string partOfSpeech); - - public abstract IndexWord GetIndexWord(string lemma, string partOfSpeech); - - public abstract Synset[] GetSynsets(string lemma); - - public abstract Synset[] GetSynsets(string lemma, string partOfSpeech); - - public abstract RelationType[] GetRelationTypes(string lemma, string partOfSpeech); - - public abstract Synset GetSynset(string lemma, string partOfSpeech, int senseNumber); - - public delegate void MorphologicalProcessOperation (string lemma, string partOfSpeech, ListbaseForms); - - public string[] GetBaseForms(string lemma, string partOfSpeech, MorphologicalProcessOperation morphologicalProcess) - { - var baseForms = new List(); - morphologicalProcess(lemma, partOfSpeech, baseForms); - return baseForms.ToArray(); - } - - public string[] GetBaseForms(string lemma, string partOfSpeech, Morph.IOperation[] operations) - { - var baseForms = new List(); - foreach (Morph.IOperation operation in operations) - { - operation.Execute(lemma, partOfSpeech, baseForms); - } - return baseForms.ToArray(); - } - - public string[] GetBaseForms(string lemma, string partOfSpeech) - { - if (mDefaultOperations == null) - { - var suffixMap = new Dictionary - { - { - "noun", new string[][] - { - new string[] {"s", ""}, new string[] {"ses", "s"}, new string[] {"xes", "x"}, - new string[] {"zes", "z"}, new string[] {"ches", "ch"}, new string[] {"shes", "sh"}, - new string[] {"men", "man"}, new string[] {"ies", "y"} - } - }, - { - "verb", new string[][] - { - new string[] {"s", ""}, new string[] {"ies", "y"}, new string[] {"es", "e"}, - new string[] {"es", ""}, new string[] {"ed", "e"}, new string[] {"ed", ""}, - new string[] {"ing", "e"}, new string[] {"ing", ""} - } - }, - { - "adjective", new string[][] - { - new string[] {"er", ""}, new string[] {"est", ""}, new string[] {"er", "e"}, - new string[] {"est", "e"} - } - } - }; - var tokDso = new Morph.DetachSuffixesOperation(suffixMap); - tokDso.AddDelegate(Morph.DetachSuffixesOperation.Operations, new Morph.IOperation[] - { - new Morph.LookupIndexWordOperation(this), new Morph.LookupExceptionsOperation(this) - }); - var tokOp = new Morph.TokenizerOperation(this, new string[] { " ", "-" }); - tokOp.AddDelegate(Morph.TokenizerOperation.TokenOperations, new Morph.IOperation[] - { - new Morph.LookupIndexWordOperation(this), new Morph.LookupExceptionsOperation(this), tokDso - }); - var morphDso = new Morph.DetachSuffixesOperation(suffixMap); - morphDso.AddDelegate(Morph.DetachSuffixesOperation.Operations, new Morph.IOperation[] - { - new Morph.LookupIndexWordOperation(this), new Morph.LookupExceptionsOperation(this) - }); - mDefaultOperations = new Morph.IOperation[] { new Morph.LookupExceptionsOperation(this), morphDso, tokOp }; - } - return GetBaseForms(lemma, partOfSpeech, mDefaultOperations); - } - - public MorphologicalProcessOperation LookupExceptionsOperation - { - get - { - return delegate(string lemma, string partOfSpeech, List baseForms) - { - string[] exceptionForms = GetExceptionForms(lemma, partOfSpeech); - foreach (string exceptionForm in exceptionForms) - { - if (!baseForms.Contains(exceptionForm)) - { - baseForms.Add(exceptionForm); - } - } - }; - } - } - - public MorphologicalProcessOperation LookupIndexWordOperation - { - get - { - return delegate(string lemma, string partOfSpeech, List baseForms) - { - if (!baseForms.Contains(lemma) && GetIndexWord(lemma, partOfSpeech) != null) - { - baseForms.Add(lemma); - } - }; - } - } - - protected internal abstract Synset CreateSynset(string partOfSpeech, int synsetOffset); - protected internal abstract string[] GetExceptionForms(string lemma, string partOfSpeech); - } -} diff --git a/BotSharp.NLP/NER/README.md b/BotSharp.NLP/NER/README.md deleted file mode 100644 index c391ff17d..000000000 --- a/BotSharp.NLP/NER/README.md +++ /dev/null @@ -1,5 +0,0 @@ -ï»¿IOB tagging - -B-{CHUNK_TYPE} â€“ for the word in the Beginning chunk -I-{CHUNK_TYPE} â€“ for words Inside the chunk -O â€“ Outside any chunk \ No newline at end of file diff --git a/BotSharp.NLP/README.rst b/BotSharp.NLP/README.rst deleted file mode 100644 index 819aaf0b7..000000000 --- a/BotSharp.NLP/README.rst +++ /dev/null @@ -1,41 +0,0 @@ -BotSharp.NLP -############ -Botsharp.NLP is a set of tools for building C# programs to work with human language data. It is the NLP low-level processing library of the BotSharp robot construction platform. It provides a separate installation package for downloading. It can be used as a common POS, NER and text classification service in the NLP field, providing a variety of machine learning algorithms to switch freely. - - -How to install -============== -1. Install through NuGet -:: - - PM> Install-Package BotSharp.NL - -2. Download source code -:: - - git clone https://github.com/Oceania2018/BotSharp.NLP - -Start to use -============ -:: - - public void TokenizeInWhiteSpace() - { - // use RegexTokenizer - var tokenizer = new TokenizerFactory(); - - // tokenize and return tokens - var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?", - new TokenizationOptions - { - // use built-in regex pattern - Pattern = RegexTokenizer.WHITE_SPACE - }); - - // test result - Assert.IsTrue(tokens[0].Offset == 0); - Assert.IsTrue(tokens[0].Text == "Chop"); - - Assert.IsTrue(tokens[1].Offset == 5); - Assert.IsTrue(tokens[1].Text == "into"); - } \ No newline at end of file diff --git a/BotSharp.NLP/Sentence.cs b/BotSharp.NLP/Sentence.cs deleted file mode 100644 index d7ac684e9..000000000 --- a/BotSharp.NLP/Sentence.cs +++ /dev/null @@ -1,18 +0,0 @@ -ï»¿using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP -{ - public class Sentence - { - public List Words { get; set; } - - public String Label { get; set; } - - public String Text { get; set; } - - public double[] Vector { get; set; } - } -} diff --git a/BotSharp.NLP/Stem/IStemmer.cs b/BotSharp.NLP/Stem/IStemmer.cs deleted file mode 100644 index 6441039f0..000000000 --- a/BotSharp.NLP/Stem/IStemmer.cs +++ /dev/null @@ -1,23 +0,0 @@ -ï»¿using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Stem -{ - ///

- /// Stemmer is used to remove morphological affixes from words, leaving only the word stem. - /// Stemming algorithms aim to remove those affixes leaving only the stem of the word. - /// IStemmer defines a standard interface for stemmers. - ///

- public interface IStemmer - { - ///

- /// Strip affixes from the token and return the stem. - ///

- /// - /// - /// - string Stem(string word, StemOptions options); - } -} diff --git a/BotSharp.NLP/Stem/RegexStemmer.cs b/BotSharp.NLP/Stem/RegexStemmer.cs deleted file mode 100644 index 4d98c21e3..000000000 --- a/BotSharp.NLP/Stem/RegexStemmer.cs +++ /dev/null @@ -1,47 +0,0 @@ -ï»¿/* - * BotSharp.NLP Library - * Copyright (C) 2018 Haiping Chen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Text.RegularExpressions; -using BotSharp.NLP.Tokenize; - -namespace BotSharp.NLP.Stem -{ - ///

- /// A stemmer that uses regular expressions to identify morphological affixes. - /// Any substrings that match the regular expressions will be removed. - ///

- public class RegexStemmer : IStemmer - { - public const string DEFAULT = "ing$|s$|e$|able$"; - - private Regex _regex; - - public string Stem(string word, StemOptions options) - { - _regex = new Regex(options.Pattern); - - var match = _regex.Matches(word).Cast().FirstOrDefault(); - - return match == null ? word : word.Substring(0, match.Index); - } - } -} diff --git a/BotSharp.NLP/Stem/StemOptions.cs b/BotSharp.NLP/Stem/StemOptions.cs deleted file mode 100644 index 6022730cb..000000000 --- a/BotSharp.NLP/Stem/StemOptions.cs +++ /dev/null @@ -1,14 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Stem -{ - public class StemOptions - { - ///

- /// Regex pattern - ///

- public string Pattern { get; set; } - } -} diff --git a/BotSharp.NLP/Stem/StemmerFactory.cs b/BotSharp.NLP/Stem/StemmerFactory.cs deleted file mode 100644 index be7b4f700..000000000 --- a/BotSharp.NLP/Stem/StemmerFactory.cs +++ /dev/null @@ -1,35 +0,0 @@ -ï»¿using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Stem -{ - ///

- /// BotSharp Stemmer Factory - /// In linguistic morphology and information retrieval, - /// stemming is the process of reducing inflected (or sometimes derived) words to their word stem, - /// base or root formâ€”generally a written word form. - ///

- /// - public class StemmerFactory where IStem : IStemmer, new() - { - private SupportedLanguage _lang { get; set; } - - private IStem _stemmer; - - private StemOptions _options; - - public StemmerFactory(StemOptions options, SupportedLanguage lang) - { - _lang = lang; - _options = options; - _stemmer = new IStem(); - } - - public string Stem(string word) - { - return _stemmer.Stem(word, _options); - } - } -} diff --git a/BotSharp.NLP/SupportedLanguage.cs b/BotSharp.NLP/SupportedLanguage.cs deleted file mode 100644 index 51472d1d9..000000000 --- a/BotSharp.NLP/SupportedLanguage.cs +++ /dev/null @@ -1,60 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP -{ - public class SupportedLanguage - { - public static readonly SupportedLanguage English = new SupportedLanguage("en"); - public static readonly SupportedLanguage Russian = new SupportedLanguage("ru"); - public static readonly SupportedLanguage German = new SupportedLanguage("de"); - public static readonly SupportedLanguage Portuguese = new SupportedLanguage("pt"); - public static readonly SupportedLanguage PortugueseBrazil = new SupportedLanguage("pt-BR"); - public static readonly SupportedLanguage Spanish = new SupportedLanguage("es"); - public static readonly SupportedLanguage French = new SupportedLanguage("fr"); - public static readonly SupportedLanguage Italian = new SupportedLanguage("it"); - public static readonly SupportedLanguage Dutch = new SupportedLanguage("nl"); - public static readonly SupportedLanguage Japanese = new SupportedLanguage("ja"); - public static readonly SupportedLanguage ChineseChina = new SupportedLanguage("zh-CN"); - public static readonly SupportedLanguage ChineseHongKong = new SupportedLanguage("zh-HK"); - public static readonly SupportedLanguage ChineseTaiwan = new SupportedLanguage("zh-TW"); - - private static readonly SupportedLanguage[] AllLangs = - { - English, - Russian, - German, - Portuguese, - PortugueseBrazil, - Spanish, - French, - Italian, - Dutch, - Japanese, - ChineseChina, - ChineseHongKong, - ChineseTaiwan - }; - - public readonly string code; - - private SupportedLanguage(string code) - { - this.code = code; - } - - public static SupportedLanguage FromLanguageTag(string languageTag) - { - foreach (var item in AllLangs) - { - if (string.Equals(item.code, languageTag, StringComparison.OrdinalIgnoreCase)) - { - return item; - } - } - - return English; - } - } -} diff --git a/BotSharp.NLP/Tag/DefaultTagger.cs b/BotSharp.NLP/Tag/DefaultTagger.cs deleted file mode 100644 index 48abb43c2..000000000 --- a/BotSharp.NLP/Tag/DefaultTagger.cs +++ /dev/null @@ -1,25 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; -using BotSharp.NLP.Tokenize; - -namespace BotSharp.NLP.Tag -{ - ///

- /// The simplest possible tagger assigns the same tag to each token. - /// This may seem to be a rather banal step, but it establishes an important baseline for tagger performance. - /// In order to get the best result, we tag each word with the most likely tag. - ///

- public class DefaultTagger : ITagger - { - public void Tag(Sentence sentence, TagOptions options) - { - - } - - public void Train(List sentences, TagOptions options) - { - - } - } -} diff --git a/BotSharp.NLP/Tag/ITagger.cs b/BotSharp.NLP/Tag/ITagger.cs deleted file mode 100644 index 71aa4ffb2..000000000 --- a/BotSharp.NLP/Tag/ITagger.cs +++ /dev/null @@ -1,24 +0,0 @@ -ï»¿using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Tag -{ - ///

- /// Part-Of-Speech tagging (or POS tagging, for short) is one of the main components of almost any NLP analysis. - /// The task of POS-tagging simply implies labelling words with their appropriate Part-Of-Speech (Noun, Verb, Adjective, Adverb, Pronoun, â€¦). - ///

- public interface ITagger - { - ///

- /// - ///

- /// A tagged corpus. Each item should be a list of tokens. - /// - /// - void Train(List sentences, TagOptions options); - - void Tag(Sentence sentence, TagOptions options); - } -} diff --git a/BotSharp.NLP/Tag/NGramTagger.cs b/BotSharp.NLP/Tag/NGramTagger.cs deleted file mode 100644 index 2b9da1cf2..000000000 --- a/BotSharp.NLP/Tag/NGramTagger.cs +++ /dev/null @@ -1,138 +0,0 @@ -ï»¿/* - * BotSharp.NLP Library - * Copyright (C) 2018 Haiping Chen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using BotSharp.NLP.Corpus; -using BotSharp.NLP.Tokenize; - -namespace BotSharp.NLP.Tag -{ - ///

- /// N-Gramm taggers are based on a simple statistical algorithm: - /// for each token, assign the tag that is most likely for that particular token. - ///

- public class NGramTagger : ITagger - { - private List _contextMapping { get; set; } - - public void Tag(Sentence sentence, TagOptions options) - { - // need training to generate model - if(_contextMapping == null) - { - var corpus = new CoNLLReader().Read(new ReaderOptions - { - DataDir = Path.Combine(options.CorpusDir, "CoNLL"), - FileName = "conll2000_chunking_train.txt" - }); - - Train(corpus, options); - } - - Fill(sentence, options); - - for (int pos = options.NGram - 1; pos < sentence.Words.Count; pos++) - { - sentence.Words[pos].Pos = _contextMapping.FirstOrDefault(x => x.Context == GetContext(pos, sentence.Words, options))?.Tag; - - // set default tag - if(sentence.Words[pos].Pos == null) - { - sentence.Words[pos].Pos = options.Tag; - } - } - - for(int pos = 0; pos < options.NGram - 1; pos++) - { - sentence.Words.RemoveAt(0); - } - } - - public void Train(List sentences, TagOptions options) - { - var cache = new List(); - - for (int idx = 0; idx < sentences.Count; idx++) - { - var sent = sentences[idx]; - - Fill(sent, options); - - for (int pos = options.NGram - 1; pos < sent.Words.Count; pos++) - { - var freq = new NGramFreq - { - Context = GetContext(pos, sent.Words, options), - Tag = sent.Words[pos].Pos, - Count = 1 - }; - - cache.Add(freq); - } - } - - _contextMapping = (from c in cache - group c by new { c.Context, c.Tag } into g - select new NGramFreq - { - Context = g.Key.Context, - Tag = g.Key.Tag, - Count = g.Count() - }).OrderByDescending(x => x.Count) - .ToList(); - } - - private string GetContext(int pos, List words, TagOptions options) - { - string context = words[pos].Text; - for (int ngram = options.NGram - 1; ngram > 0; ngram--) - { - context = words[pos - ngram].Pos + " " + context; - } - - return context; - } - - private void Fill(Sentence sent, TagOptions options) - { - for (int ngram = 1; ngram < options.NGram; ngram++) - { - sent.Words.Insert(0, new Token { Text = "NIL", Pos = options.Tag, Start = (ngram - 1) * 3 }); - } - } - - private class NGramFreq - { - ///

- /// Current token tag - ///

- public string Tag { get; set; } - - ///

- /// Occurence frequency - ///

- public int Count { get; set; } - - public string Context { get; set; } - } - } -} diff --git a/BotSharp.NLP/Tag/TagOptions.cs b/BotSharp.NLP/Tag/TagOptions.cs deleted file mode 100644 index 2ea6e8578..000000000 --- a/BotSharp.NLP/Tag/TagOptions.cs +++ /dev/null @@ -1,33 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Tag -{ - public class TagOptions - { - ///

- /// Display some stats, if requested. - ///

- public bool Verbose { get; set; } - - ///

- /// Default Tag - /// Used in DefaultTagger - ///

- public string Tag { get; set; } - - ///

- /// N-Gram number - ///

- public int NGram { get; set; } - - public string CorpusDir { get; set; } - - public TagOptions() - { - NGram = 1; - Tag = "NN"; - } - } -} diff --git a/BotSharp.NLP/Tag/TaggerFactory.cs b/BotSharp.NLP/Tag/TaggerFactory.cs deleted file mode 100644 index aae92224d..000000000 --- a/BotSharp.NLP/Tag/TaggerFactory.cs +++ /dev/null @@ -1,49 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Linq; -using System.Reflection; -using System.Text; - -namespace BotSharp.NLP.Tag -{ - public class TaggerFactory - { - private SupportedLanguage _lang; - - private ITagger _tagger; - - private TagOptions _options; - - public TaggerFactory(TagOptions options, SupportedLanguage lang) - { - _lang = lang; - _options = options; - } - - public ITagger GetTagger() where ITag : ITagger, new() - { - return _tagger = new ITag(); - } - - public ITagger GetTagger(string name) - { - List types = new List(); - - types.AddRange(Assembly.Load(new AssemblyName("BotSharp.Core")) - .GetTypes().Where(x => !x.IsAbstract && !x.FullName.StartsWith("<>f__AnonymousType")).ToList()); - - types.AddRange(Assembly.Load(new AssemblyName("BotSharp.NLP")) - .GetTypes().Where(x => !x.IsAbstract && !x.FullName.StartsWith("<>f__AnonymousType")).ToList()); - - Type type = types.FirstOrDefault(x => x.Name == name); - var instance = (ITagger)Activator.CreateInstance(type); - - return _tagger = instance; - } - - public void Tag(Sentence sentence) - { - _tagger.Tag(sentence, _options); - } - } -} diff --git a/BotSharp.NLP/Tokenize/ITokenizer.cs b/BotSharp.NLP/Tokenize/ITokenizer.cs deleted file mode 100644 index ca12b0084..000000000 --- a/BotSharp.NLP/Tokenize/ITokenizer.cs +++ /dev/null @@ -1,21 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Tokenize -{ - ///

- /// A tokenizer is a component used for dividing text intotokens. - /// A tokenizer is language specific and takes into account the peculiarities of the language, e.g. donâ€™t in English is tokenized as two tokens. - ///

- public interface ITokenizer - { - ///

- /// Tokenize - ///

- /// input sentence - /// Options such as: regex expression - /// - List Tokenize(string sentence, TokenizationOptions options); - } -} diff --git a/BotSharp.NLP/Tokenize/README.rst b/BotSharp.NLP/Tokenize/README.rst deleted file mode 100644 index 5f282702b..000000000 --- a/BotSharp.NLP/Tokenize/README.rst +++ /dev/null @@ -1 +0,0 @@ -ï»¿ \ No newline at end of file diff --git a/BotSharp.NLP/Tokenize/RegexTokenizer.cs b/BotSharp.NLP/Tokenize/RegexTokenizer.cs deleted file mode 100644 index e28e9a333..000000000 --- a/BotSharp.NLP/Tokenize/RegexTokenizer.cs +++ /dev/null @@ -1,124 +0,0 @@ -ï»¿/* - * BotSharp.NLP Library - * Copyright (C) 2018 Haiping Chen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Text.RegularExpressions; - -namespace BotSharp.NLP.Tokenize -{ - ///

- /// Regular-Expression Tokenizers - ///

- public class RegexTokenizer : ITokenizer - { - ///

- /// Tokenize a text into a sequence of alphabetic and non-alphabetic characters - ///

- public const string WORD_PUNC = @"[^\w\s]+|\w+"; - - ///

- /// Tokenize a string, treating any sequence of blank lines as a delimiter. - /// Blank lines are defined as lines containing no characters, except for space or tab characters. - /// options.IsGap = true - ///

- public const string BLANK_LINE = @"\s*\n\s*\n\s*"; - - ///

- /// Tokenize a string on whitespace (space, tab, newline). - /// In general, users should use the string ``split()`` method instead. - /// options.IsGap = true - ///

- public const string WHITE_SPACE = @"\s+"; - - private Regex _regex; - - public List Tokenize(string sentence, TokenizationOptions options) - { - string pattern = options.Pattern; - if (options.SpecialWords != null) - { - options.SpecialWords.ForEach(r => - { - sentence = Regex.Replace(sentence, r, " " + r); - }); - - pattern = String.Join("|", options.SpecialWords) + "|" + pattern; - } - - _regex = new Regex(pattern); - - var matches = _regex.Matches(sentence).Cast().ToArray(); - - options.IsGap = new string[] { WHITE_SPACE, BLANK_LINE }.Contains(pattern); - - if (options.IsGap) - { - int pos = 0; - var tokens = new Token[matches.Length + 1]; - - for (int span = 0; span <= matches.Length; span++) - { - var token = new Token - { - Text = (span == matches.Length) ? sentence.Substring(pos) : sentence.Substring(pos, matches[span].Index - pos), - Start = pos - }; - - token.Text = token.Text.Trim(); - - tokens[span] = token; - - if (span < matches.Length) - { - pos = matches[span].Index + 1; - } - } - - return tokens.ToList(); - } - else - { - var m = matches.Select(x => new Token - { - Text = x.Value, - Start = x.Index - }).ToList(); - - if(options.SpecialWords != null) - { - int offset = 0; - m.ForEach(t => - { - if (options.SpecialWords.Contains(t.Text)) - { - offset++; - } - - t.Start = t.Start - offset; - }); - } - - - return m; - } - } - } -} diff --git a/BotSharp.NLP/Tokenize/Token.cs b/BotSharp.NLP/Tokenize/Token.cs deleted file mode 100644 index f19ef2ee0..000000000 --- a/BotSharp.NLP/Tokenize/Token.cs +++ /dev/null @@ -1,73 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; -using System.Text.RegularExpressions; - -namespace BotSharp.NLP.Tokenize -{ - public class Token - { - ///

- /// The original word text. - ///

- public string Text { get; set; } - - ///

- /// The offset of word - ///

- public int Start { get; set; } - - ///

- /// The simple part-of-speech tag. - /// Not widely used, Tag is more general. - ///

- public string Pos { get; set; } - - ///

- /// The detailed part-of-speech tag. - /// https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html - ///

- public string Tag { get; set; } - - ///

- /// The base form of the word. - ///

- public string Lemma { get; set; } - - ///

- /// The word shape â€“ capitalisation, punctuation, digits. - ///

- public string Shape { get; set; } - - ///

- /// Is the token an alpha character? - ///

- public bool IsAlpha - { - get - { - return Regex.IsMatch(Text, @"^[a-zA-Z]+|[\u4e00-\u9fa5]+$"); - } - } - - ///

- /// Is the token part of a stop list, i.e. the most common words of the language? - ///

- public bool IsStop { get; set; } - - public int End - { - get - { - return Start + Text.Length; - } - } - - public override string ToString() - { - return $"{Text} {Start} {Pos}"; - } - - public double Vector { get; set; } - } -} diff --git a/BotSharp.NLP/Tokenize/TokenizationOptions.cs b/BotSharp.NLP/Tokenize/TokenizationOptions.cs deleted file mode 100644 index 4501b888c..000000000 --- a/BotSharp.NLP/Tokenize/TokenizationOptions.cs +++ /dev/null @@ -1,36 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Tokenize -{ - public class TokenizationOptions - { - ///

- /// Regex pattern - ///

- public string Pattern { get; set; } - - ///

- /// True if this tokenizer's pattern should be used to find separators between tokens; - /// False if this tokenizer's pattern should be used to find the tokens themselves. - ///

- public bool IsGap { get; set; } - - ///

- /// True if any empty tokens generated by the tokenizer should be discarded. - /// Empty tokens can only be generated if `IsGap == True` - ///

- public bool IgnoreEmpty { get; set; } - - ///

- /// Split "isn't" into "is", "n't" - ///

- public List SpecialWords { get; set; } - - ///

- /// Convert bracket-like characters to avoid confusion with parse brackets. - ///

- public bool ConvertParentheses { get; set; } - } -} diff --git a/BotSharp.NLP/Tokenize/TokenizerBase.cs b/BotSharp.NLP/Tokenize/TokenizerBase.cs deleted file mode 100644 index 2753394fa..000000000 --- a/BotSharp.NLP/Tokenize/TokenizerBase.cs +++ /dev/null @@ -1,22 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Text; - -namespace BotSharp.NLP.Tokenize -{ - public abstract class TokenizerBase - { - protected void CorrectTokenPosition(string sentence, List tokens) - { - int startPos = 0; - - for (int i = 0; i < tokens.Count; i++) - { - var token = tokens[i]; - token.Start = sentence.IndexOf(token.Text, startPos); - - startPos = token.End; - } - } - } -} diff --git a/BotSharp.NLP/Tokenize/TokenizerFactory.cs b/BotSharp.NLP/Tokenize/TokenizerFactory.cs deleted file mode 100644 index d2fe93efd..000000000 --- a/BotSharp.NLP/Tokenize/TokenizerFactory.cs +++ /dev/null @@ -1,78 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Linq; -using System.Reflection; -using System.Text; -using System.Threading.Tasks; - -namespace BotSharp.NLP.Tokenize -{ - ///

- /// BotSharp Tokenizer Factory - /// Tokenizers divide strings into lists of substrings. - /// The particular tokenizer requires implement interface - /// models to be installed.BotSharp.NLP also provides a simpler, regular-expression based tokenizer, which splits text on whitespace and punctuation. - ///

- public class TokenizerFactory - { - private SupportedLanguage _lang; - - private ITokenizer _tokenizer; - - private TokenizationOptions _options; - - public ITokenizer GetTokenizer() where ITokenize : ITokenizer, new() - { - return _tokenizer = new ITokenize(); - } - - public ITokenizer GetTokenizer(string name) - { - List types = new List(); - - types.AddRange(Assembly.Load(new AssemblyName("BotSharp.Core")) - .GetTypes().Where(x => !x.IsAbstract && !x.FullName.StartsWith("<>f__AnonymousType")).ToList()); - - types.AddRange(Assembly.Load(new AssemblyName("BotSharp.NLP")) - .GetTypes().Where(x => !x.IsAbstract && !x.FullName.StartsWith("<>f__AnonymousType")).ToList()); - - Type type = types.FirstOrDefault(x => x.Name == name); - var instance = (ITokenizer)Activator.CreateInstance(type); - - return _tokenizer = instance; - } - - public TokenizerFactory(TokenizationOptions options, SupportedLanguage lang) - { - _lang = lang; - _options = options; - } - - public List Tokenize(string sentence) - { - var tokens = _tokenizer.Tokenize(sentence, _options); - tokens.ForEach(x => x.Lemma = x.Text.ToLower()); - return tokens; - } - - public List Tokenize(List sentences) - { - var sents = sentences.Select(s => new Sentence { Text = s }).ToList(); - - Parallel.ForEach(sents, (sentence) => - { - sentence.Words = Tokenize(sentence.Text); - sentence.Words.ForEach(x => x.Lemma = x.Text.ToLower()); - }); - - return sents; - } - - private class ParallelToken - { - public String Text { get; set; } - - public List Tokens { get; set; } - } - } -} diff --git a/BotSharp.NLP/Tokenize/TreebankTokenizer.cs b/BotSharp.NLP/Tokenize/TreebankTokenizer.cs deleted file mode 100644 index 4eecc6a51..000000000 --- a/BotSharp.NLP/Tokenize/TreebankTokenizer.cs +++ /dev/null @@ -1,165 +0,0 @@ -ï»¿/* - * BotSharp.NLP Library - * Copyright (C) 2018 Haiping Chen - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Text.RegularExpressions; - -namespace BotSharp.NLP.Tokenize -{ - ///

- /// Penn Treebank Tokenizer - /// The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. - /// This implementation is a port of the tokenizer sed script written by Robert McIntyre - /// and available at ftp://ftp.cis.upenn.edu/pub/treebank/public_html/tokenizer.sed, - /// or reference ftp://ftp.cis.upenn.edu/pub/treebank/public_html/tokenization.html. - ///

- public class TreebankTokenizer : TokenizerBase, ITokenizer - { - private List> STARTING_QUOTES = new List>(); - private List> PUNCTUATION = new List>(); - private List> PARENS_BRACKETS = new List>(); - private List> CONVERT_PARENTHESES = new List>(); - private List> ENDING_QUOTES = new List>(); - private List> CONVENTIONS = new List>(); - - public TreebankTokenizer() - { - Init(); - } - - public List Tokenize(string sentence, TokenizationOptions options) - { - string text = sentence; - - // starting quoting replace - STARTING_QUOTES.ForEach(x => - { - text = Regex.Replace(text, x.Item1, x.Item2); - }); - - // replace PUNCTUATION - PUNCTUATION.ForEach(x => - { - text = Regex.Replace(text, x.Item1, x.Item2); - }); - - // Handles parentheses. - PARENS_BRACKETS.ForEach(x => - { - text = Regex.Replace(text, x.Item1, x.Item2); - }); - - // convert parentheses - if (options.ConvertParentheses) - { - CONVERT_PARENTHESES.ForEach(x => - { - text = Regex.Replace(text, x.Item1, x.Item2); - }); - } - - // Handles repeated dash. - text = Regex.Replace(text, "(-{2,})", " $1 ").Trim(); - - // replace ending quotes - ENDING_QUOTES.ForEach(x => - { - text = Regex.Replace(text, x.Item1, x.Item2); - }); - - // replace ending quotes - CONVENTIONS.ForEach(x => - { - text = Regex.Replace(text, x.Item1, x.Item2); - }); - - // remove duplicated spaces - text = Regex.Replace(text, "\\s+", " ") + " "; - - // split - int pos = 0; - - var tokens = Regex.Matches(text, "\\s") - .Cast() - .Select(x => { - - var token = new Token - { - Start = pos, - Text = text.Substring(pos, x.Index - pos) - }; - - pos = x.Index + 1; - - return token; - - }).ToList(); - - // correct token position - CorrectTokenPosition(sentence, tokens); - - return tokens; - } - - private void Init() - { - STARTING_QUOTES.Add(new Tuple(@"([Â«â€œâ€˜â€ž]|[`]+)", " $1 ")); - STARTING_QUOTES.Add(new Tuple("^\"", "``")); - STARTING_QUOTES.Add(new Tuple(@"(``)", " $1 ")); - STARTING_QUOTES.Add(new Tuple("([ ([{<])(\" | '{2})", "$1 `` ")); - - PUNCTUATION.Add(new Tuple(@"([^\.])(\.)([\]\)}>" + "\"" + @"\\'Â»â€â€™ ]*)\s*$", "$1 $2 $3 ")); - PUNCTUATION.Add(new Tuple(@"([:,])([^\d])", " $1 $2")); - PUNCTUATION.Add(new Tuple(@"([:,])$", " $1 ")); - PUNCTUATION.Add(new Tuple(@"(\.\.\.)", " $1 ")); - PUNCTUATION.Add(new Tuple(@"([;@#$%&])", " $1 ")); - PUNCTUATION.Add(new Tuple(@"([^\.])(\.)([\]\)}>" + "\"" + @"']*)\s*$", "$1 $2 $3 ")); - PUNCTUATION.Add(new Tuple(@"([?!])", " $1 ")); - PUNCTUATION.Add(new Tuple(@"([^'])' ", "$1 ' ")); - - PARENS_BRACKETS.Add(new Tuple(@"([\]\[\{\}\<\>])", " $1 ")); - - CONVERT_PARENTHESES.Add(new Tuple(@"$", "-LRB-")); - CONVERT_PARENTHESES.Add(new Tuple(@"$", "-RRB-")); - CONVERT_PARENTHESES.Add(new Tuple(@"\[", "-LSB-")); - CONVERT_PARENTHESES.Add(new Tuple(@"\]", "-RRB-")); - CONVERT_PARENTHESES.Add(new Tuple(@"\{", "-LCB-")); - CONVERT_PARENTHESES.Add(new Tuple(@"\}", "-RCB-")); - - ENDING_QUOTES.Add(new Tuple(@"([Â»â€â€™])", " $1 ")); - ENDING_QUOTES.Add(new Tuple("\"", " '' ")); - ENDING_QUOTES.Add(new Tuple(@"(\S)(\'\')", "$1 $2 ")); - ENDING_QUOTES.Add(new Tuple(@"('[sS]|'[mM]|'[dD]|') ", " $1 ")); - ENDING_QUOTES.Add(new Tuple(@"('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", " $1 ")); - - CONVENTIONS.Add(new Tuple(@"(?i)\b(can)(?#X)(not)\b", "$1 $2 ")); - CONVENTIONS.Add(new Tuple(@"(?i)\b(d)(?#X)('ye)\b", "$1 $2 ")); - CONVENTIONS.Add(new Tuple(@"(?i)\b(gim)(?#X)(me)\b", "$1 $2 ")); - CONVENTIONS.Add(new Tuple(@"(?i)\b(gon)(?#X)(na)\b", "$1 $2 ")); - CONVENTIONS.Add(new Tuple(@"(?i)\b(got)(?#X)(ta)\b", "$1 $2 ")); - CONVENTIONS.Add(new Tuple(@"(?i)\b(lem)(?#X)(me)\b", "$1 $2 ")); - CONVENTIONS.Add(new Tuple(@"(?i)\b(mor)(?#X)('n)\b", "$1 $2 ")); - CONVENTIONS.Add(new Tuple(@"(?i)\b(wan)(?#X)(na)\s", "$1 $2 ")); - CONVENTIONS.Add(new Tuple(@"(?i) ('t)(?#X)(is)\b", "$1 $2 ")); - CONVENTIONS.Add(new Tuple(@"(?i) ('t)(?#X)(was)\b", "$1 $2 ")); - } - } -} diff --git a/BotSharp.NLP/Txt2Vec/Decoder.cs b/BotSharp.NLP/Txt2Vec/Decoder.cs deleted file mode 100644 index a200e029e..000000000 --- a/BotSharp.NLP/Txt2Vec/Decoder.cs +++ /dev/null @@ -1,211 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using System.IO; - -namespace Txt2Vec -{ - public enum TermOperation { ADD, SUB }; - - public class TermOP - { - public string strTerm; - public TermOperation operation; - } - - public class Result : IComparable - { - public string strTerm; - public double score; - - public Result() - { - strTerm = null; - score = -1; - } - - int IComparable.CompareTo(Result other) - { - return other.score.CompareTo(score); - } - } - - public class Decoder - { - int BLOCK_N = 16; - Model model; - object locker = new object(); - - ParallelOptions parallelOption; - public Decoder(Model m) - { - parallelOption = new ParallelOptions(); - model = m; - } - - public double[] ToVector(string[] termList, int N = 40) - { - List termOPList = new List(); - foreach (string term in termList) - { - TermOP termOP = new TermOP(); - termOP.strTerm = term; - termOP.operation = TermOperation.ADD; - - termOPList.Add(termOP); - } - - double[] vec = GetVector(termOPList); - return vec; - } - - public double[] GetVector(List termList) - { - double[] vec = new double[model.VectorSize]; - - //Calculate input terms' vector - for (int b = 0; b < termList.Count; b++) - { - Term term = model.GetTerm(termList[b].strTerm); - if (term == null) - { - continue; - } - - if (termList[b].operation == TermOperation.ADD) - { - for (int a = 0; a < model.VectorSize; a++) - { - vec[a] += term.vector[a]; - } - } - else if (termList[b].operation == TermOperation.SUB) - { - for (int a = 0; a < model.VectorSize; a++) - { - vec[a] -= term.vector[a]; - } - } - } - return vec; - } - - private List GenerateTermOP(string[] termList) - { - List termOPList = new List(); - foreach (string term in termList) - { - TermOP termOP = new TermOP(); - termOP.strTerm = term; - termOP.operation = TermOperation.ADD; - - termOPList.Add(termOP); - } - - return termOPList; - } - - public double Similarity(string[] tokens1, string[] tokens2) - { - double score = 0; - List termOPList1 = GenerateTermOP(tokens1); - List termOPList2 = GenerateTermOP(tokens2); - double[] vec1 = GetVector(termOPList1); - double[] vec2 = GetVector(termOPList2); - - //Cosine distance - for (int i = 0; i < model.VectorSize; i++) - { - score += vec1[i] * vec2[i]; - } - - return score; - } - - public List Distance(string strTerm, int N = 40) - { - string[] termList = new string[1]; - termList[0] = strTerm; - - return Distance(termList, N); - } - - //N is the number of closest words that will be shown - public List Distance(string[] termList, int N = 40) - { - List termOPList = new List(); - foreach (string term in termList) - { - TermOP termOP = new TermOP(); - termOP.strTerm = term; - termOP.operation = TermOperation.ADD; - - termOPList.Add(termOP); - } - - return Distance(termOPList, N); - } - - public List Distance(List termList, int N = 40) - { - long termCount = termList.Count; - - for (int i = 0; i < termCount; i++) - { - if (model.GetTerm(termList[i].strTerm) == null) - { - //The term is OOV, no result - return null; - } - } - - //Calculate input terms' vector - double[] vec = GetVector(termList); - - int candidateWordCount = model.Vocabulary.Count; - //Calculate the distance betweens words in parallel - int size_per_block = candidateWordCount / BLOCK_N; - List rstList = new List(); - Parallel.For>(0, BLOCK_N + 1, parallelOption, () => new List(), (k, loop, subtotal) => - { - for (int c = (int)(k * size_per_block); c < (k + 1) * size_per_block && c < candidateWordCount; c++) - { - //Calculate the distance - double dist = 0; - for (int a = 0; a < model.VectorSize; a++) - { - dist += vec[a] * model.Vocabulary[c].vector[a]; - } - - //Save the result - Result rst = new Result(); - rst.strTerm = model.Vocabulary[c].strTerm; - rst.score = dist; - - subtotal.Add(rst); - } - - return subtotal; - }, - (subtotal) => // lock free accumulator - { - //Mereg the result from different threads - lock (locker) - { - rstList.AddRange(subtotal); - } - }); - - //Sort the result according the distance - rstList.Sort(); - - int maxN = Math.Min(N, rstList.Count); - - - return rstList.GetRange(0, maxN); - } - - } -} diff --git a/BotSharp.NLP/Txt2Vec/Encoder.cs b/BotSharp.NLP/Txt2Vec/Encoder.cs deleted file mode 100644 index c4ddc28f6..000000000 --- a/BotSharp.NLP/Txt2Vec/Encoder.cs +++ /dev/null @@ -1,900 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using System.IO; -using System.Threading; -//using AdvUtils; - -namespace Txt2Vec -{ - public enum WORD_SOURCE - { - CORPUS, - PRETRAINED_MODEL - } - - public class vocab_word - { - public string word; - public int cnt; - public WORD_SOURCE source; - } - - - public class Encoder - { - const int EXP_TABLE_SIZE = 1000; - const int MAX_EXP = 6; - const int MAX_CODE_LENGTH = 40; - - StreamReader srTrainCorpus = null; - Dictionary word2id; - List vocab; - int vocab_size = 0; - - long train_words = 0; - long word_count_actual = 0; - long next_save_step = 10000000; - long next_save_trained_words = 10000000; - long sentence_count = 0; - - public long iter = 5; - public int layer1_size = 200; - public double starting_alpha = 0.025; - public double sample = 0; - public int min_count = 5; - public int num_threads = 1; - public int cbow = 1, window = 5; - public int classes = 1; - public int debug_mode = 0; - public long savestep = 100000000; - public int negative = 5; - public string strPreTrainedModelFileName = null; - public int onlyUpdateCorpusWord = 0; - - double[] syn0; - double[] syn1; - double[] totalNeu_e; - double[] expTable; - object[] syn0Locker; - object[] syn1Locker; - - Random rand = new Random(DateTime.Today.Millisecond); - - public int[] accFreqTable; - public int accTotalFreq = 0; - public int accFactor = 1; - - void InitAccTermFreq() - { - //Logger.WriteLine("Initializing acculumate term frequency..."); - accFreqTable = new int[vocab_size]; - accTotalFreq = 0; - - //Keep accTotalFreq is less than int.MaxValue - accFactor = 1 + (int)(train_words / int.MaxValue); - //Logger.WriteLine("Acculumate factor: {0}", accFactor); - - int i = 0; - foreach (vocab_word word in vocab) - { - accTotalFreq += (word.cnt / accFactor); - accFreqTable[i] = accTotalFreq; - i++; - } - - //Logger.WriteLine("Acculumated total frequency : {0}", accTotalFreq); - } - - int SearchAccTermTable(int freq) - { - int mid = vocab_size >> 1; - int left = 0, right = vocab_size - 1; - - while (true) - { - if (accFreqTable[mid] < freq) - { - left = mid + 1; - } - else if (accFreqTable[mid] > freq) - { - if (mid == 0) - { - return 0; - } - - if (accFreqTable[mid - 1] < freq) - { - return mid; - } - - right = mid - 1; - } - else - { - return mid; - } - - mid = (left + right) >> 1; - } - } - - - public Encoder() - { - word2id = new Dictionary(); - vocab = new List(); - - expTable = new double[EXP_TABLE_SIZE + 1]; - - for (int i = 0; i < EXP_TABLE_SIZE; i++) - { - expTable[i] = Math.Exp((i / (double)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table - expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) - } - - } - - // Returns position of a word in the vocabulary; if the word is not found, returns -1 - int SearchVocab(string word) - { - if (word2id.ContainsKey(word) == false) - { - return -1; - } - return word2id[word]; - } - - public class VocabComparer : IComparer - { - public int Compare(vocab_word x, vocab_word y) - { - return y.cnt.CompareTo(x.cnt); - } - } - - // Shrink the vocabulary by frequency using word counts - void ShrinkVocab() - { - // Sort the vocabulary - vocab.Sort(new VocabComparer()); - - word2id.Clear(); - int size = vocab_size; - train_words = 0; - for (int a = 0; a < size; a++) - { - // Words occuring less than min_count times will be discarded from the vocab - if (vocab[a].cnt < min_count) - { - vocab_size--; - vocab[a].word = null; - } - else - { - word2id.Add(vocab[a].word, a); - train_words += vocab[a].cnt; - } - } - - vocab.RemoveRange(vocab_size, vocab.Count - vocab_size); - } - - void LoadPreTrainModelSyn(string strModelFileName, double[] syn) - { - Model preTrainedModel = new Model(); - preTrainedModel.LoadModel(strModelFileName, false); - - - if (preTrainedModel.VectorSize != layer1_size) - { - throw new Exception("The layer size is inconsistent between given parameter and pre-trained model."); - } - - string[] allTerms = preTrainedModel.GetAllTerms(); - foreach (string strTerm in allTerms) - { - int wordId = SearchVocab(strTerm); - if (wordId < 0) - { - //Ingore the dropped term - continue; - } - - float[] vector = preTrainedModel.GetVector(strTerm); - for (int i = 0; i < layer1_size;i++) - { - syn[i + wordId * layer1_size] = vector[i]; - } - } - - } - - void LoadVocabFromPreTrainModel(string strModelFileName) - { - Model preTrainedModel = new Model(); - preTrainedModel.LoadModel(strModelFileName, false); - - layer1_size = preTrainedModel.VectorSize; - //Logger.WriteLine("Apply the following options from pr-trained model file {0}", preTrainedModel); - //Logger.WriteLine("Vector Size: {0}", layer1_size); - - string[] allTerms = preTrainedModel.GetAllTerms(); - foreach (string strTerm in allTerms) - { - //Add terms in pre-trained model into vocabulary - //If the term is already added from corpus or given dictionary, we ignore it - if (word2id.ContainsKey(strTerm) == false) - { - Term term = preTrainedModel.GetTerm(strTerm); - vocab_word word = new vocab_word(); - word.word = strTerm; - word.cnt = 0; - word.source = WORD_SOURCE.PRETRAINED_MODEL; - - word2id.Add(word.word, vocab_size); - vocab.Add(word); - - vocab_size++; - } - } - } - - public void LoadVocabFromFile(string vocab_file) - { - StreamReader sr = new StreamReader(vocab_file); - string strLine = null; - - word2id = new Dictionary(); - vocab = new List(); - vocab_size = 0; - - while ((strLine = sr.ReadLine()) != null) - { - string[] items = strLine.Split('\t'); - vocab_word word = new vocab_word(); - word.word = items[0]; - word.source = WORD_SOURCE.CORPUS; - - word2id.Add(word.word, vocab_size); - vocab.Add(word); - - vocab_size++; - - } - - sr.Close(); - } - - - private void GetTrainWordSize(string train_file) - { - StreamReader fin = new StreamReader(train_file); - string strLine = null; - train_words = 0; - - foreach (vocab_word vw in vocab) - { - vw.cnt = 0; - } - - while ((strLine = fin.ReadLine()) != null) - { - //Append the end of sentence - strLine = strLine.Trim(); - string[] items = strLine.Split(); - foreach (string item in items) - { - int wordId = SearchVocab(item); - if (wordId >= 0) - { - vocab[wordId].cnt++; - - if (vocab[wordId].source == WORD_SOURCE.PRETRAINED_MODEL && onlyUpdateCorpusWord == 1) - { - continue; - } - - train_words++; - if (debug_mode > 0 && train_words % 1000000 == 0) - { - //Logger.WriteLine("{0}M... ", train_words / 1000000); - } - } - } - } - - fin.Close(); - } - - public void LearnVocabFromTrainFile(string train_file) - { - StreamReader fin = new StreamReader(train_file); - string strLine = null; - - vocab_size = 0; - int i = 0; - while ((strLine = fin.ReadLine()) != null) - { - //Append the end of sentence - strLine = strLine.Trim(); - string[] items = strLine.Split(); - - foreach (string word in items) - { - //This term is normal word - train_words++; - if (debug_mode > 0 && train_words % 1000000 == 0) - { - //Logger.WriteLine("{0}M... ", train_words / 1000000); - } - - i = SearchVocab(word); - if (i == -1) - { - word2id.Add(word, vocab_size); - - vocab_word voc_word = new vocab_word(); - voc_word.word = word; - voc_word.cnt = 1; - voc_word.source = WORD_SOURCE.CORPUS; - - vocab.Add(voc_word); - vocab_size++; - - } - else - { - vocab[i].cnt++; - } - } - } - - fin.Close(); - } - - public void SaveVocab(string save_vocab_file) - { - StreamWriter fo = new StreamWriter(save_vocab_file); - for (int i = 0; i < vocab_size; i++) - { - fo.WriteLine("{0}\t{1}", vocab[i].word, vocab[i].cnt); - } - fo.Close(); - } - - void InitNet() - { - syn0Locker = new object[vocab_size]; - syn1Locker = new object[vocab_size]; - for (int i = 0; i < vocab_size; i++) - { - syn0Locker[i] = new object(); - syn1Locker[i] = new object(); - } - - totalNeu_e = new double[layer1_size]; - - syn0 = new double[vocab_size * layer1_size]; - for (long b = 0; b < layer1_size; b++) - { - for (long a = 0; a < vocab_size; a++) - { - syn0[a * layer1_size + b] = (rand.NextDouble() - 0.5) / layer1_size; - } - } - syn1 = new double[vocab_size * layer1_size]; - - - } - - object rdlocker = new object(); - object wrlocker = new object(); - object locker_rand = new object(); - int RandNext(int max) - { - lock (locker_rand) - { - return rand.Next(max); - } - } - - double RandNextDouble() - { - lock (locker_rand) - { - return rand.NextDouble(); - } - } - - void TrainModelThread() - { - int word_count = 0, last_word_count = 0; - double alpha = starting_alpha * (1 - word_count_actual / (double)(iter * train_words + 1)); - - while (true) - { - if (word_count - last_word_count > 10000) - { - last_word_count = word_count; - if (debug_mode > 0) - { - - double sumErr = 0; - for (int i = 0; i < layer1_size; i++) - { - sumErr += (totalNeu_e[i] / word_count_actual); - } - - //Logger.WriteLine("Alpha: {0:0.0000} Prog: {1:0.00}% Words: {2}K Sent: {3}K Error: {4}", alpha, - //word_count_actual / (double)(iter * train_words + 1) * 100, word_count_actual / 1024, sentence_count / 1024, sumErr); - } - - if (word_count_actual > next_save_trained_words) - { - long old_next_save_trained_words = next_save_trained_words; - lock (rdlocker) - { - if (old_next_save_trained_words == next_save_trained_words) - { - //Logger.WriteLine("Saving temporary word vector into file..."); - - Model.SaveModel("vector_tmp.bin", vocab_size, layer1_size, vocab, syn0); - Model.SaveModel("vector_tmp_bin.syn1", vocab_size, layer1_size, vocab, syn1); - - next_save_trained_words += next_save_step; - } - } - } - - } - - alpha = starting_alpha * (1 - word_count_actual / (double)(iter * train_words + 1)); - if (alpha < starting_alpha * 0.0001) - { - alpha = starting_alpha * 0.0001; - } - - //Read a line from training corpus - string strLine = ""; - lock (rdlocker) - { - strLine = srTrainCorpus.ReadLine(); - } - if (strLine == null) - { - break; - } - - Interlocked.Increment(ref sentence_count); - //Parse each word in current sentence - string[] strWords = strLine.Split(); - - bool bIgnore = true; - for (int i = 0; i < strWords.Length; i++) - { - int wordId = SearchVocab(strWords[i]); - if (wordId < 0) - { - continue; - } - - if (vocab[wordId].source == WORD_SOURCE.CORPUS) - { - bIgnore = false; - break; - } - - if (vocab[wordId].source == WORD_SOURCE.PRETRAINED_MODEL && onlyUpdateCorpusWord == 0) - { - bIgnore = false; - break; - } - } - - if (bIgnore == true) - { - continue; - } - - for (int sentence_position = 0; sentence_position < strWords.Length; sentence_position++) - { - string strPredictedWord = strWords[sentence_position]; - int word = SearchVocab(strPredictedWord); - if (word < 0) - { - //Ingore the dropped term - continue; - } - if (vocab[word].source == WORD_SOURCE.CORPUS || - (vocab[word].source == WORD_SOURCE.PRETRAINED_MODEL && onlyUpdateCorpusWord == 0)) - { - word_count++; - Interlocked.Increment(ref word_count_actual); - } - - int rnd_window = RandNext(window); - if (cbow != 0) - { - TrainByCBOW(sentence_position, rnd_window, strWords, alpha, word); - } - else - { - TrainBySkipGram(sentence_position, rnd_window, strWords, alpha, word); - } - } - } - } - - private void TrainBySkipGram(int sentence_position, int b, string[] sen, double alpha, int word) - { - double[] neu1e = new double[layer1_size]; - - //train skip-gram - for (int a = b; a < window * 2 + 1 - b; a++) - { - int c = sentence_position - window + a; - if (c < 0 || c >= sen.Length || c == sentence_position) - { - //Invalidated position. out of sentence boundary - continue; - } - - string strNGram = sen[c]; - int wordId = SearchVocab(strNGram); - if (wordId == -1) - { - continue; - } - - - int l1 = wordId * layer1_size; - for (c = 0; c < layer1_size; c++) - { - neu1e[c] = 0; - } - - lock (syn0Locker[wordId]) - { - //Negative sampling - int target = 0; - int label = 1; - for (int d = 0; d < negative + 1; d++) - { - if (d == 0) - { - target = word; - label = 1; - } - else - { - target = SearchAccTermTable(RandNext(accTotalFreq)); - if (target == word) - { - continue; - } - label = 0; - } - - - long l2 = target * layer1_size; - double f = 0; - double g; - - lock (syn1Locker[target]) - { - for (c = 0; c < layer1_size; c++) - { - f += syn0[c + l1] * syn1[c + l2]; - } - if (f > MAX_EXP) - { - g = (label - 1) * alpha; - } - else if (f < -MAX_EXP) - { - g = (label - 0) * alpha; - } - else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; - - for (c = 0; c < layer1_size; c++) - { - neu1e[c] += g * syn1[c + l2]; - } - - if (onlyUpdateCorpusWord == 1 && vocab[target].source == WORD_SOURCE.PRETRAINED_MODEL) - { - continue; - } - - for (c = 0; c < layer1_size; c++) - { - syn1[c + l2] += g * syn0[c + l1]; - } - } - } - - - if (onlyUpdateCorpusWord == 1 && vocab[wordId].source == WORD_SOURCE.PRETRAINED_MODEL) - { - continue; - } - - // Learn weights input -> hidden - for (c = 0; c < layer1_size; c++) - { - syn0[c + l1] += neu1e[c]; - } - } - - - for (int i = 0; i < neu1e.Length; i++) - { - totalNeu_e[i] += Math.Abs(neu1e[i] / (window * 2 + 1 - b * 2)); - } - - } - } - - private void TrainByCBOW(int sentence_position, int b, string[] sen, double alpha, int word) - { - double[] neu1 = new double[layer1_size]; - double[] neu1e = new double[layer1_size]; - int cw = 0; - List wordIdList = new List(); - - //train the cbow architecture - // in -> hidden - for (int a = b; a < window * 2 + 1 - b; a++) - { - - int c = sentence_position - window + a; - if (c < 0 || c >= sen.Length || c == sentence_position) - { - //Invalidated position. out of sentence boundary - continue; - } - - //Generate ngram string and word id - string strNGram = null; - int wordId = -1; - - strNGram = sen[c]; - - wordId = SearchVocab(strNGram); - if (wordId < 0) - { - //Ingore the dropped term - continue; - } - - //The subsampling randomly discards frequent words while keeping the ranking same - if (sample > 0) - { - double ran = (Math.Sqrt(vocab[wordId].cnt / (sample * train_words)) + 1) * (sample * train_words) / vocab[wordId].cnt; - if (ran < RandNextDouble()) - { - continue; - } - } - - if (onlyUpdateCorpusWord == 0 || (onlyUpdateCorpusWord == 1 && vocab[wordId].source == WORD_SOURCE.CORPUS)) - { - //Terms that need to update their syn0 - wordIdList.Add(wordId); - } - - lock (syn0Locker[wordId]) - { - for (int t = 0; t < layer1_size; t++) - { - neu1[t] += syn0[t + wordId * layer1_size]; - } - } - cw++; - } - - if (wordIdList.Count == 0) - { - //No term need to update its syn, return - return; - } - - double synUpdateFactor = (double)(cw) / (double)(wordIdList.Count); - for (int c = 0; c < layer1_size; c++) - { - neu1[c] /= cw; - } - - - int target = 0; - int label = 1; - for (int d = 0; d < negative + 1; d++) - { - if (d == 0) - { - target = word; - label = 1; - } - else - { - target = SearchAccTermTable(RandNext(accTotalFreq)); - if (target == word) - { - continue; - } - label = 0; - } - - long l2 = target * layer1_size; - double f = 0; - - lock (syn1Locker[target]) - { - for (int c = 0; c < layer1_size; c++) - { - f += neu1[c] * syn1[c + l2]; - } - double g = 0; - if (f > MAX_EXP) g = (label - 1) * alpha; - else if (f < -MAX_EXP) g = (label - 0) * alpha; - else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; - - for (int c = 0; c < layer1_size; c++) - { - neu1e[c] += g * syn1[c + l2]; - } - - - if (onlyUpdateCorpusWord == 1 && vocab[target].source == WORD_SOURCE.PRETRAINED_MODEL) - { - continue; - } - - for (int c = 0; c < layer1_size; c++) - { - syn1[c + l2] += g * neu1[c]; - } - } - } - - - // hidden -> in - foreach (int wordId in wordIdList) - { - lock (syn0Locker[wordId]) - { - for (int c = 0; c < layer1_size; c++) - { - syn0[c + wordId * layer1_size] += (neu1e[c] * synUpdateFactor); - } - } - } - - for (int i = 0; i < neu1e.Length; i++) - { - totalNeu_e[i] += Math.Abs(neu1e[i]); - } - } - - public void TrainModel(string train_file, string output_file, string vocab_file) - { - if (debug_mode > 0) - { - //Logger.WriteLine("Starting training using file {0}", train_file); - } - - if ((vocab_file != null && File.Exists(vocab_file) == true) || - strPreTrainedModelFileName != null) - { - if (vocab_file != null && File.Exists(vocab_file) == true) - { - //Logger.WriteLine("Loading vocabulary {0} from file...", vocab_file); - LoadVocabFromFile(vocab_file); - } - - if (strPreTrainedModelFileName != null) - { - //Logger.WriteLine("Load vocabulary from pre-trained model file {0}", strPreTrainedModelFileName); - LoadVocabFromPreTrainModel(strPreTrainedModelFileName); - } - - //Vocaburary is loaded from given dict, then we need to calculate how many words need to be train - //Logger.WriteLine("Calculating how many words need to be train..."); - GetTrainWordSize(train_file); - //Logger.WriteLine("Total training words : {0}", train_words); - } - else - { - //We have no input vocabulary, so we get vocabulary from training corpus - //Logger.WriteLine("Generate vocabulary from training corpus {0}...", train_file); - LearnVocabFromTrainFile(train_file); - } - - //filter out words which frequenct is lower - ShrinkVocab(); - - //If vocabulary is specified in parameter list, but not existed in folder, we need to create it - if (vocab_file != null && vocab_file.Length > 0 && File.Exists(vocab_file) == false) - { - if (debug_mode > 0) - { - //Logger.WriteLine("Saving vocabulary into file..."); - } - SaveVocab(vocab_file); - } - - next_save_step = savestep; - next_save_trained_words = next_save_step; - - if (output_file == null) - { - //Logger.WriteLine("No specified output file name"); - return; - } - //Initialize neural network - InitNet(); - - //Generate word's frequency distribution for negative samping - InitAccTermFreq(); - - - //Load pre-trained model syn0 - if (strPreTrainedModelFileName != null) - { - //Logger.WriteLine("Loading syn0 from pre-trained model..."); - LoadPreTrainModelSyn(strPreTrainedModelFileName, syn0); - - //Logger.WriteLine("Loading syn1 from pre-trained model..."); - LoadPreTrainModelSyn(strPreTrainedModelFileName + ".syn1", syn1); - } - - if (File.Exists(train_file) == true) - { - string strCurTrainFile = train_file; - for (int j = 0; j < iter; j++) - { - totalNeu_e = new double[layer1_size]; - - //Logger.WriteLine("Starting training iteration {0}/{1}...", j + 1, iter); - srTrainCorpus = new StreamReader(strCurTrainFile, Encoding.UTF8, true, 102400000); - List threadList = new List(); - for (int i = 0; i < num_threads; i++) - { - Thread thread = new Thread(new ThreadStart(TrainModelThread)); - thread.Start(); - threadList.Add(thread); - } - - //Wait all threads finish their jobs - for (int i = 0; i < num_threads; i++) - { - threadList[i].Join(); - } - - srTrainCorpus.Close(); - - double sumErr = 0; - for (int i = 0; i < layer1_size; i++) - { - sumErr += (totalNeu_e[i] / word_count_actual); - } - - //Logger.WriteLine("Error: {0}", sumErr); - } - } - else - { - //Logger.WriteLine("Train train file isn't existed."); - return; - } - - - Model.SaveModel(output_file, vocab_size, layer1_size, vocab, syn0); - Model.SaveModel(output_file + ".syn1", vocab_size, layer1_size, vocab, syn1); - } - } -} diff --git a/BotSharp.NLP/Txt2Vec/Model.cs b/BotSharp.NLP/Txt2Vec/Model.cs deleted file mode 100644 index 2fc6447cd..000000000 --- a/BotSharp.NLP/Txt2Vec/Model.cs +++ /dev/null @@ -1,313 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using System.IO; -using Bigtree.Algorithm.CRFLite.Utils; - -namespace Txt2Vec -{ - public class Term - { - public string strTerm; - public float[] vector; - public byte[] vectorVQ; - } - - public class Model - { - private Dictionary term2vector; - private List entireTermList; - private int vectorSize; - private double[][] codebooks; - - public List Vocabulary { get { return entireTermList; } } - public int VectorSize { get { return vectorSize; } } - public string[] GetAllTerms() - { - return term2vector.Keys.ToArray(); - } - - public Term GetTerm(string strTerm) - { - if (term2vector.ContainsKey(strTerm) == false) - { - return null; - } - - return term2vector[strTerm]; - } - - public void LoadModel(string strFileName, bool bTextFormat) - { - if (bTextFormat == true) - { - LoadTextModel(strFileName); - } - else - { - LoadBinaryModel(strFileName); - } - } - - public bool DumpModel(string strFileName) - { - if (entireTermList == null || entireTermList.Count == 0) - { - return false; - } - - StreamWriter sw = new StreamWriter(strFileName); - foreach (Term term in entireTermList) - { - StringBuilder sb = new StringBuilder(); - sb.Append(term.strTerm); - sb.Append("\t"); - - foreach (double v in term.vector) - { - sb.Append(v); - sb.Append("\t"); - } - - sw.WriteLine(sb.ToString().Trim()); - } - sw.Close(); - - return true; - } - - public void LoadTextModel(string strFileName) - { - term2vector = new Dictionary(); - entireTermList = new List(); - vectorSize = 0; - - StreamReader sr = new StreamReader(strFileName); - string strLine = null; - while ((strLine = sr.ReadLine()) != null) - { - //the format is "word \t vector - //eah dim of vector is splitted by \t - Term term = new Term(); - string[] items = strLine.Split('\t'); - int vSize = items.Length - 1; - if (vectorSize > 0 && vectorSize != vSize) - { - throw new InvalidDataException(String.Format("Invalidated data : {0} . The length of vector must be fixed (current length {1} != previous length {2}).", strLine, vSize, vectorSize)); - } - - term.strTerm = items[0]; - term.vector = new float[vSize]; - for (int i = 0; i < vSize; i++) - { - term.vector[i] = float.Parse(items[i + 1]); - } - - vectorSize = vSize; - term.vector = NormalizeVector(term.vector); - term2vector.Add(term.strTerm, term); - entireTermList.Add(term); - } - - sr.Close(); - } - - public float[] GetVector(string strTerm) - { - if (term2vector.ContainsKey(strTerm) == true) - { - return term2vector[strTerm].vector; - } - - return null; - } - - private float[] NormalizeVector(float[] vec) - { - //Normalize the vector - double len = 0; - for (int a = 0; a < vectorSize; a++) - { - len += vec[a] * vec[a]; - } - len = Math.Sqrt(len); - for (int a = 0; a < vectorSize; a++) - { - vec[a] = (float)(vec[a] / len); - } - - return vec; - } - - public void LoadBinaryModel(string strFileName) - { - StreamReader sr = new StreamReader(strFileName); - BinaryReader br = new BinaryReader(sr.BaseStream); - - //The number of words - int words = br.ReadInt32(); - //The size of vector - vectorSize = br.ReadInt32(); - int vqSize = br.ReadInt32(); - - term2vector = new Dictionary(); - entireTermList = new List(); - - // Logger.WriteLine("vocabulary size: {0}, vector size: {1}, VQ size: {2}", words, vectorSize, vqSize); - - codebooks = null; - if (vqSize > 0) - { - //Read code books - codebooks = new double[vectorSize][]; - for (int i = 0; i < vectorSize; i++) - { - codebooks[i] = new double[vqSize]; - for (int j = 0; j < vqSize; j++) - { - codebooks[i][j] = br.ReadDouble(); - } - } - } - - for (int b = 0; b < words; b++) - { - Term term = new Term(); - term.strTerm = br.ReadString(); - term.vector = new float[vectorSize]; - if (codebooks != null) - { - term.vectorVQ = new byte[vectorSize]; - } - else - { - term.vectorVQ = null; - } - - for (int i = 0; i < vectorSize; i++) - { - if (codebooks == null) - { - term.vector[i] = br.ReadSingle(); - } - else - { - byte idx = br.ReadByte(); - term.vector[i] = (float)codebooks[i][idx]; - term.vectorVQ[i] = idx; - } - } - - term.vector = NormalizeVector(term.vector); - term2vector.Add(term.strTerm, term); - entireTermList.Add(term); - - } - sr.Close(); - } - - public static void SaveModel(string strFileName, int vocab_size, int vector_size, List vocab, double[] syn) - { - StreamWriter fo = new StreamWriter(strFileName); - BinaryWriter bw = new BinaryWriter(fo.BaseStream); - - // Logger.WriteLine("Saving term and vector into model file..."); - // Save the word vectors - bw.Write(vocab_size); - bw.Write(vector_size); - bw.Write(0); //no VQ - - for (int i = 0; i < vocab_size; i++) - { - //term string - bw.Write(vocab[i].word); - - //term vector - for (int j = 0; j < vector_size; j++) - { - bw.Write((float)(syn[i * vector_size + j])); - } - } - - bw.Flush(); - fo.Flush(); - fo.Close(); - } - - public bool BuildVQModel(string strFileName) - { - int vqSize = 256; - if (entireTermList == null || entireTermList.Count == 0) - { - return false; - } - - StreamWriter fo = new StreamWriter(strFileName); - BinaryWriter bw = new BinaryWriter(fo.BaseStream); - - // Save the word vectors - bw.Write(entireTermList.Count); //Vocabulary size - bw.Write(vectorSize); //Vector size - bw.Write(vqSize); //VQ size - - // Logger.WriteLine("vocabulary size: {0}, vector size: {1}, vq size: {2}", entireTermList.Count, vectorSize, vqSize); - - //Create word and VQ values mapping table - Dictionary> vqResult = new Dictionary>(); - foreach (Term term in entireTermList) - { - vqResult.Add(term.strTerm, new List()); - } - - // Logger.WriteLine("Dims Distortion:"); - for (int i = 0; i < vectorSize; i++) - { - //Generate VQ values for each dimension - VectorQuantization vq = new VectorQuantization(); - for (int j = 0; j < entireTermList.Count; j++) - { - vq.Add(entireTermList[j].vector[i]); - } - double distortion = vq.BuildCodebook(vqSize); - // Logger.WriteLine("Dim {0}: {1}", i, distortion); - - for (int j = 0; j < entireTermList.Count; j++) - { - byte vqValue = (byte)vq.ComputeVQ(entireTermList[j].vector[i]); - vqResult[entireTermList[j].strTerm].Add(vqValue); - } - - //Save VQ codebook into model file - for (int j = 0; j < vqSize; j++) - { - bw.Write(vq.CodeBook[j]); - } - } - - foreach (KeyValuePair> pair in vqResult) - { - if (pair.Value.Count != vectorSize) - { - throw new Exception(String.Format("word {0} has inconsistent vector size: orginial size is {1}, vq size is {2}", - pair.Key, vectorSize, pair.Value.Count)); - } - - //term string - bw.Write(pair.Key); - //term vector - for (int b = 0; b < pair.Value.Count; b++) - { - bw.Write(pair.Value[b]); - } - } - - bw.Flush(); - fo.Flush(); - fo.Close(); - - return true; - } - } -} diff --git a/BotSharp.NLP/Txt2Vec/OneHotEncoder.cs b/BotSharp.NLP/Txt2Vec/OneHotEncoder.cs deleted file mode 100644 index d220c426c..000000000 --- a/BotSharp.NLP/Txt2Vec/OneHotEncoder.cs +++ /dev/null @@ -1,63 +0,0 @@ -ï»¿using BotSharp.NLP.Tokenize; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace BotSharp.NLP.Txt2Vec -{ - ///

- /// A one hot encoding is a representation of categorical variables as binary vectors. - /// Each integer value is represented as a binary vector that is all zero values except the index of the integer, which is marked with a 1. - ///

- public class OneHotEncoder - { - public List Sentences { get; set; } - - public List Words { get; set; } - - public void Encode(Sentence sentence) - { - InitDictionary(); - - var vector = Words.Select(x => 0D).ToArray(); - - sentence.Words.ForEach(w => - { - int index = Words.IndexOf(w.Lemma); - if(index > 0) - { - vector[index] = 1; - } - }); - - sentence.Vector = vector; - } - - public List EncodeAll() - { - InitDictionary(); - - Sentences.ForEach(sent => Encode(sent)); - //Parallel.ForEach(Sentences, sent => Encode(sent)); - - return Words; - } - - private List InitDictionary() - { - if (Words == null) - { - Words = new List(); - Sentences.ForEach(x => - { - Words.AddRange(x.Words.Where(w => w.IsAlpha).Select(w => w.Lemma)); - }); - Words = Words.Distinct().OrderBy(x => x).ToList(); - } - - return Words; - } - } -} diff --git a/BotSharp.NLP/Txt2Vec/Shrink.cs b/BotSharp.NLP/Txt2Vec/Shrink.cs deleted file mode 100644 index 555c188d3..000000000 --- a/BotSharp.NLP/Txt2Vec/Shrink.cs +++ /dev/null @@ -1,93 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using System.IO; - -namespace Txt2Vec -{ - public class Shrink - { - public void Run(string strModelFileName, string strNewModelFileName, string strDictFileName) - { - string strLine = null; - - //Load lexical dictionary - // Logger.WriteLine("Load lexical dictionary..."); - StreamReader sr = new StreamReader(strDictFileName); - HashSet setTerm = new HashSet(); - while ((strLine = sr.ReadLine()) != null) - { - string[] items = strLine.Split('\t'); - setTerm.Add(items[0]); - } - sr.Close(); - - - //Load raw model - // Logger.WriteLine("Loading raw model..."); - sr = new StreamReader(strModelFileName); - BinaryReader br = new BinaryReader(sr.BaseStream); - - int words = br.ReadInt32(); - int size = br.ReadInt32(); - int vqSize = br.ReadInt32(); - - // Logger.WriteLine("vocabulary size: {0}, vector size: {1}, VQ size: {2}", words, size, vqSize); - if (vqSize != 0) - { - // Logger.WriteLine(Logger.Level.err, "Currently, we don't support to shrink vector quantization model."); - return; - } - - Dictionary vocab = new Dictionary(); - Dictionary rev_vocab = new Dictionary(); - List termList = new List(); - double []M = new double[words * size]; - - int newwords = 0; - for (int b = 0; b < words; b++) - { - string strTerm = br.ReadString(); - if (setTerm.Contains(strTerm) == true) - { - termList.Add(strTerm); - for (int a = 0; a < size; a++) - { - M[a + newwords * size] = br.ReadSingle(); - } - newwords++; - } - else - { - //Skip the vectors of this word - for (int a = 0; a < size; a++) - { - br.ReadSingle(); - } - } - } - sr.Close(); - - //Save the shrinked model - // Logger.WriteLine("Saving shrinked model..."); - StreamWriter sw = new StreamWriter(strNewModelFileName); - BinaryWriter bw = new BinaryWriter(sw.BaseStream); - - bw.Write(newwords); - bw.Write(size); - bw.Write(vqSize); - - for (int i = 0; i < newwords; i++) - { - bw.Write(termList[i]); - for (int j = 0; j < size; j++) - { - bw.Write((float)M[j + i * size]); - } - } - sw.Close(); - } - } -} diff --git a/BotSharp.NLP/Txt2Vec/VectorGenerator.cs b/BotSharp.NLP/Txt2Vec/VectorGenerator.cs deleted file mode 100644 index b54bb6429..000000000 --- a/BotSharp.NLP/Txt2Vec/VectorGenerator.cs +++ /dev/null @@ -1,230 +0,0 @@ -ï»¿using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using System.IO; -using System.Threading; -//using AdvUtils; - -namespace Txt2Vec -{ - public class VectorGenerator - { - public Model Model { get; set; } - // Txt2Vec.Model model = new Txt2Vec.Model(); - Dictionary dict = new Dictionary(); - - public VectorGenerator(Args args) - { - this.Model = new Txt2Vec.Model(); - bool bTxtFormat = false; - string strModelFileName = args.ModelFile; - - if (strModelFileName == null) - { - Console.Write("Failed: must to set the model file name"); - throw new IOException(); - } - if (System.IO.File.Exists(strModelFileName) == false) - { - Console.Write("Failed: model file {0} isn't existed.", strModelFileName); - throw new IOException(); - } - - this.Model.LoadModel(strModelFileName, bTxtFormat); - } - - public List Sentence2Vec(List sentences, WeightingScheme weightingScheme = WeightingScheme.AVG) - { - // Inplementing TF-IDF - // TFIDFGenerator tfidfGenerator = new TFIDFGenerator(); - List> weights = null;// tfidfGenerator.TFIDFWeightVectorsForSentences(sentences.ToArray()); - - List> matixList = new List>(); - List sentenceVectorList = new List(); - sentences.ForEach (sentence=>{ - //List sentenceVectorList = new List(); - //string[] words = sentence.Split(' '); - //foreach (string word in words) - //{ - // Vec vec = Word2Vec(word.ToLower()); - // sentenceVectorList.Add(vec); - //} - //matixList.Add(sentenceVectorList); - }); - - return sentenceVectorList; - /* - List vectorList = new List(); - // Traverse each sentence - for (int i = 0; i < sentences.Count; i++) - { - Vec sentenceVector = null; - List curVecList = matixList[i]; - if (weightingScheme == WeightingScheme.TFIDF) - { - // Get this sentence - List weight = weights[i]; - sentenceVector = TFIDFMultiply(curVecList, weight); - } - if (weightingScheme == WeightingScheme.AVG) - { - int dim = curVecList[0].VecNodes.Count; - sentenceVector = new Vec(); - double nodeTotalValue; - for (int k = 0; k < dim; k++) - { - nodeTotalValue = 0; - for (int j = 0; j < curVecList.Count; j++) - { - Vec curWordVec = curVecList[j]; - double curNodeVal = curWordVec.VecNodes[k]; - nodeTotalValue += curNodeVal; - } - sentenceVector.VecNodes.Add(nodeTotalValue / dim); - - } - - } - vectorList.Add(sentenceVector); - } - for (int i = 0; i < vectorList.Count; i++) - { - if (this.dict.ContainsKey(sentences[i])) - { - continue; - } - else - { - this.dict.Add(sentences[i], vectorList[i]); - } - - } - return vectorList; - */ - } - - public Vec SingleSentence2Vec(string sentence, WeightingScheme weightingScheme = WeightingScheme.AVG) - { - - Vec sentenceVector = new Vec(); - List sentenceVectorList = new List(); - string[] words = sentence.Split(' '); - foreach (string word in words) - { - Vec vec = Word2Vec(word.ToLower()); - sentenceVectorList.Add(vec); - } - if (weightingScheme == WeightingScheme.AVG) - { - int dim = sentenceVectorList[0].VecNodes.Count; - double nodeTotalValue; - for (int k = 0; k < dim; k++) - { - nodeTotalValue = 0; - for (int j = 0; j < sentenceVectorList.Count; j++) - { - Vec curWordVec = sentenceVectorList[j]; - double curNodeVal = curWordVec.VecNodes[k]; - nodeTotalValue += curNodeVal; - } - sentenceVector.VecNodes.Add(nodeTotalValue / dim); - } - - } - return sentenceVector; - } - - public Vec TFIDFMultiply(List curVecList, List weight) - { - int dim = curVecList[0].VecNodes.Count; - int sentenceWordsCount = curVecList.Count; - Vec res = new Vec(); - for (int k = 0; k < dim; k++) - { - double nodeTotalValue = 0; - for (int i = 0; i < curVecList.Count; i++) - { - Vec curWordVec = curVecList[i]; - double curNodeVal = curWordVec.VecNodes[k]; - double curWeight = weight[i]; - nodeTotalValue += curNodeVal * curWeight; - } - res.VecNodes.Add(nodeTotalValue / sentenceWordsCount); - } - - return res; - } - - public Vec Word2Vec(string word) - { - Vec vec= new Vec(); - - Txt2Vec.Decoder decoder = new Txt2Vec.Decoder(Model); - string[] termList = new string[1]; - termList[0] = word; - vec.VecNodes = decoder.ToVector(termList).ToList(); - - return vec; - } - - public Vec Sent2Vec(List words) - { - Vec vec = new Vec(); - - Txt2Vec.Decoder decoder = new Txt2Vec.Decoder(Model); - string[] termList = words.ToArray(); - vec.VecNodes = decoder.ToVector(termList).ToList(); - - return vec; - } - - public void Distance(List words) - { - - Txt2Vec.Decoder decoder = new Txt2Vec.Decoder(Model); - words.ForEach(word=> { - Console.WriteLine($"current word: {word}"); - List