Merge pull request #74 from rodrigopivi/dev

Prepare 2.3.0 release
rodrigopivi · Jun 26, 2019 · 83af292 · 83af292
2 parents 60c7cc2 + 25f95be
commit 83af292
Show file tree

Hide file tree

Showing 17 changed files with 1,270 additions and 228 deletions.
diff --git a/examples/citySearch_medium.chatito b/examples/citySearch_medium.chatito
@@ -31,15 +31,15 @@
     places to eat
     where to eat
 
-~[newYork]
+~[newYork]('synonym': 'true')
     new york ~[city?]
     ny ~[city?]
 
-~[sanFrancisco]
+~[sanFrancisco]('synonym': 'true')
     san francisco
     san francisco city
 
-~[atlanta]
+~[atlanta]('synonym': 'true')
     atlanta
     atlanta city
 

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "chatito",
-  "version": "2.2.2",
+  "version": "2.3.0",
   "description": "Generate training datasets for NLU chatbots using a simple DSL",
   "bin": {
     "chatito": "./dist/bin.js"
@@ -49,7 +49,8 @@
   "homepage": "https://github.com/rodrigopivi/Chatito",
   "dependencies": {
     "chance": "1.0.18",
-    "minimist": "1.2.0"
+    "minimist": "1.2.0",
+    "wink-tokenizer": "5.2.1"
   },
   "jest": {
     "transform": {
@@ -82,6 +83,7 @@
     "@types/react-dom": "16.8.4",
     "@types/react-helmet": "5.0.8",
     "@types/react-router-dom": "4.3.3",
+    "@types/wink-tokenizer": "4.0.0",
     "babel-loader": "8.0.5",
     "babel-plugin-import": "1.11.0",
     "babel-plugin-styled-components": "1.10.0",

diff --git a/parser/chatito.pegjs b/parser/chatito.pegjs
@@ -4,7 +4,7 @@ Start = (ImportFile/TopLevelStatement/CommentLine)+
 TopLevelStatement = od:(IntentDefinition/SlotDefinition/AliasDefinition) { return od; }
 
 // ============= Probability operator =============
-ProbabilityOperatorDefinition = "*[" probability:BasicKeywordLiteral "]" { return probability; }
+ProbabilityOperatorDefinition = "*[" probability:Number percent:"%"? "]" { return `${probability}${percent || ''}`; }
 // ============= Entities =============
 EntityOpt = "?"
 EntityBody = "[" value:EntityKeywordLiteral "]" { return value }
@@ -45,11 +45,12 @@ SlotDefinition = EOL? o:EntitySlotDefinition EOL
     { return { type: o.type, key: o.value, args: o.args, location: o.location, inner: s, variation: o.variation } }
 
 // Alias
-EntityAliasDefinition = "~" value:EntityBody { return { value: value, type: "AliasDefinition", location: location() } }
+EntityAliasDefinition = "~" value:EntityBody args:EntityArguments?
+    { return { value: value, type: "AliasDefinition", location: location(), args: args } }
 OptionalAlias = "~" op:EntityOptionalBody { return { value: op.value, type: "Alias", opt: op.opt } }
 AliasDefinition = EOL? o:EntityAliasDefinition EOL
     Indent s:IntentAndSlotInnerStatements Dedent
-    { return { type: o.type, key: o.value, location: o.location, inner: s } }
+    { return { type: o.type, key: o.value, location: o.location, inner: s, args: o.args } }
 
 // ============= Indentation =============
 Samedent "correct indentation" = s:" "* &{ return s.length === level * STEP; }
@@ -69,7 +70,15 @@ BasicKeywordLiteral "entity name" = v:(t:((!"\r\n")(!"\n")(!"]") .) { return t.j
 EntityKeywordLiteral "entity name" = v:(t:((!"\r\n")(!"\n")(!"]")(!"?") .) { return t.join(""); })+ { return v.join(""); }
 SlotKeywordLiteral "entity name" = v:(t:((!"\r\n")(!"\n")(!"#")(!"]")(!"?") .) { return t.join(""); })+ { return v.join(""); }
 
-Integer "integer" = [0-9]+ { return parseInt(text(), 10); }
+// Number
+Number "number" = int frac? { return parseFloat(text()); }
+DecimalPoint = "."
+Digit1_9 = [1-9]
+Digit0_9  = [0-9]
+frac = DecimalPoint Digit0_9+
+int = zero / (Digit1_9 Digit0_9*)
+zero = "0"
+
 EOS "end of sentence" = EOL / EOF
 EOL "end of line "= (EOLNonWindows/EOLWindows)+
 EOLNonWindows "non windows end of line" = "\n"

diff --git a/readme.md b/readme.md
@@ -28,7 +28,9 @@ This project contains the:
 For the full language specification and documentation, please refer to the [DSL spec document](https://github.com/rodrigopivi/Chatito/blob/master/spec.md).
 
 ### Adapters
-The language is independent from the generated output format and because each model can receive different parameters and settings, there are 3 data format adapters provided. This section describes the adapters, their specific behaviors and use cases:
+The language is independent of the generated output format. Because each model can receive different parameters and settings, these are the currently implemented data formats; if your provider is not listed, the Tools and resources section has more information on how to support more formats.
+
+NOTE: Samples may not be shuffled between intents, for easier review.
 
 #### Default format
 Use the default format if you plan to train a custom model or if you are writing a custom adapter. This is the most flexible format because you can annotate `Slots` and `Intents` with custom entity arguments, and they all will be present at the generated output, so for example, you could also include dialog/response generation logic with the DSL. E.g.:
@@ -46,7 +48,7 @@ Custom entities like 'context', 'required' and 'type' will be available at the o
 
 #### [Rasa NLU](https://rasa.com/docs/nlu/)
 [Rasa NLU](https://rasa.com/docs/nlu/) is a great open source framework for training NLU models.
-One particular behavior of the Rasa adapter is that when a slot definition sentence only contains one alias, the generated Rasa dataset will map the alias as a synonym. e.g.:
+One particular behavior of the Rasa adapter is that when a slot definition sentence only contains one alias, and that alias defines the 'synonym' argument with 'true', the generated Rasa dataset will map the alias as a synonym. e.g.:
 
 ```
 %[some intent]('training': '1')
@@ -55,13 +57,20 @@ One particular behavior of the Rasa adapter is that when a slot definition sente
 @[some slot]
     ~[some slot synonyms]
 
-~[some slot synonyms]
+~[some slot synonyms]('synonym': 'true')
     synonym 1
     synonym 2
 ```
 
 In this example, the generated Rasa dataset will contain the `entity_synonyms` of `synonym 1` and `synonym 2` mapping to `some slot synonyms`.
 
+#### [Flair](https://github.com/zalandoresearch/flair)
+[Flair](https://github.com/zalandoresearch/flair) is a very simple framework for state-of-the-art NLP, developed by Zalando Research. It provides state-of-the-art (GPT, BERT, ELMo, etc.) pre-trained models and embeddings for many languages that work out of the box. This adapter supports the `text classification` dataset in FastText format and the `named entity recognition` dataset in two-column [BIO](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) annotated words, as documented in the [flair corpus documentation](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md). These two data formats are very common and work with many other providers or models.
+
+The NER dataset requires a word tokenization step that is currently done using the [wink-tokenizer](https://github.com/winkjs/wink-tokenizer) npm package. Extending the adapter to add PoS tagging can be explored in the future, but it's not implemented.
+
+NOTE: Flair adapter is only available for the NodeJS NPM CLI package, not for the IDE.
+
 #### [LUIS](https://www.luis.ai/)
 [LUIS](https://www.luis.ai/) is part of Microsoft's Cognitive services. Chatito supports training a LUIS NLU model through its [batch add labeled utterances endpoint](https://westus.dev.cognitive.microsoft.com/docs/services/5890b47c39e2bb17b84a55ff/operations/5890b47c39e2bb052c5b9c09), and its [batch testing api](https://docs.microsoft.com/en-us/azure/cognitive-services/LUIS/luis-how-to-batch-test).
 
@@ -87,14 +96,10 @@ In the previous example, all `@[date]` values will be tagged with the `snips/dat
 
 Chatito supports Node.js `v8.11.2 LTS` or higher.
 
-Install it globally:
-```
-npm i chatito -g
-```
-Or locally:
+Install it with yarn or npm:
 ```
 npm i chatito --save
-````
+```
 
 Then create a definition file (e.g.: `trainClimateBot.chatito`) with your code.
 
@@ -112,7 +117,7 @@ npx chatito <pathToFileOrDirectory> --format=<format> --formatOptions=<formatOpt
 ```
 
  - `<pathToFileOrDirectory>` path to a `.chatito` file or a directory that contains chatito files. If it is a directory, will search recursively for all `*.chatito` files inside and use them to generate the dataset. e.g.: `lightsChange.chatito` or `./chatitoFilesFolder`
- - `<format>` Optional. `default`, `rasa` or `snips`
+ - `<format>` Optional. `default`, `rasa`, `luis`, `flair` or `snips`.
  - `<formatOptions>` Optional. Path to a .json file that each adapter optionally can use
  - `<outputPath>` Optional. The directory where to save the generated datasets. Uses the current directory as default.
 - `<trainingFileName>` Optional. The name of the generated training dataset file. Do not forget to add a .json extension at the end. Uses `<format>`_dataset_training.json as default file name.
@@ -122,5 +127,15 @@ npx chatito <pathToFileOrDirectory> --format=<format> --formatOptions=<formatOpt
 
 [Overfitting](https://en.wikipedia.org/wiki/Overfitting) is a problem that can be prevented if we use Chatito correctly. The idea behind this tool is to have an intersection between data augmentation and a probabilistic description of possible sentence combinations. It is not intended to generate deterministic datasets; you should avoid generating all possible combinations.
 
+### Tools and resources
+
+- [Visual Studio Code syntax highlighting plugin](https://marketplace.visualstudio.com/items?itemName=nimfin.chatito) Thanks to [Yuri Golobokov](https://github.com/nimf) for his [work on this](https://github.com/nimf/chatito-vscode).
+
+- [AI Blueprints: How to build and deploy AI business projects](https://books.google.com.pe/books?id=sR2CDwAAQBAJ) implements practical full chatbot examples using chatito in chapter 7.
+
+- [3 steps to convert chatbot training data between different NLP Providers](https://medium.com/@benoit.alvarez/3-steps-to-convert-chatbot-training-data-between-different-nlp-providers-fa235f67617c) details a simple way to convert the data format for providers that do not have an implemented adapter. You can use a generated dataset with providers like DialogFlow, Wit.ai and Watson.
+
+- [Aida-nlp](https://github.com/rodrigopivi/aida) is a tiny experimental NLP deep learning library for text classification and NER. Built with Tensorflow.js, Keras and Chatito. Implemented in JS and Python.
+
 ### Author and maintainer
 Rodrigo Pimentel