Update README (#100)

* Update README * update * Use trunk-action * fix * fix
daac-tools · May 29, 2023 · 163a6ee · 163a6ee
1 parent adb6bcd
commit 163a6ee
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 79 deletions.
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -21,18 +21,19 @@ jobs:
     - name: Download model
       working-directory: ./examples/wasm
       run: |
-        wget 'https://github.com/daac-tools/vaporetto/releases/download/v0.5.0/bccwj-suw+unidic+tag-huge.tar.xz'
-        tar xf ./bccwj-suw+unidic+tag-huge.tar.xz
-        mv ./bccwj-suw+unidic+tag-huge/bccwj-suw+unidic+tag-huge.model.zst ./src/
+        wget 'https://github.com/daac-tools/vaporetto-models/releases/download/v0.5.0/bccwj-suw+unidic_pos+pron.tar.xz'
+        tar xf 'bccwj-suw+unidic_pos+pron.tar.xz'
+        mv ./bccwj-suw+unidic_pos+pron/bccwj-suw+unidic_pos+pron.model.zst ./src/
 
-    - name: Install environment
-      run: |
-        rustup target add wasm32-unknown-unknown
-        cargo install trunk
+    - uses: jetli/trunk-action@v0.4.0
+      with:
+        version: 'latest'
 
     - name: Build
       working-directory: ./examples/wasm
-      run: trunk build --release
+      run: |
+        rustup target add wasm32-unknown-unknown
+        trunk build --release
 
     - name: Publish to Cloudflare Pages
       id: cloudflare_pages_deploy

diff --git a/README-ja.md b/README-ja.md
@@ -23,21 +23,21 @@ Vaporetto はトークン化モデルを生成するための方法を3つ用意
 #### 配布モデルをダウンロードする
 
 1つ目は最も単純な方法で、学習済みモデルをダウンロードすることです。
-モデルファイルは[ここ](https://github.com/daac-tools/vaporetto/releases)にあります。
+モデルファイルは[ここ](https://github.com/daac-tools/vaporetto-models/releases)にあります。
 
-`bccwj-suw+unidic+tag` を選びました。
+`bccwj-suw+unidic_pos+pron` を選びました。
 ```
-% wget https://github.com/daac-tools/vaporetto/releases/download/v0.5.0/bccwj-suw+unidic+tag.tar.xz
+% wget https://github.com/daac-tools/vaporetto-models/releases/download/v0.5.0/bccwj-suw+unidic_pos+pron.tar.xz
 ```
 
 各ファイルはモデルファイルとライセンス条項が含まれた圧縮ファイルなので、ダウンロードしたファイルを展開する必要があります。
 ```
-% tar xf ./bccwj-suw+unidic+tag.tar.xz
+% tar xf ./bccwj-suw+unidic_pos+pron.tar.xz
 ```
 
 トークン化には、以下のコマンドを実行します。
 ```
-% echo 'ヴェネツィアはイタリアにあります。' | cargo run --release -p predict -- --model path/to/bccwj-suw+unidic+tag.model.zst
+% echo 'ヴェネツィアはイタリアにあります。' | cargo run --release -p predict -- --model path/to/bccwj-suw+unidic_pos+pron.model.zst
 ```
 
 以下が出力されます。
@@ -52,7 +52,7 @@ Vaporetto はトークン化モデルを生成するための方法を3つ用意
 
 ```rust
 // zstd クレートまたは ruzstd クレートが必要
-let reader = zstd::Decoder::new(File::open("path/to/model.bin.zst")?)?;
+let reader = zstd::Decoder::new(File::open("path/to/model.zst")?)?;
 let model = Model::read(reader)?;
 ```
 
@@ -137,18 +137,18 @@ Vaporetto は2種類のコーパス（フルアノテーションコーパスと
 例えば、以下のコマンドで `外国人参政権` は誤ったトークンに分割されます。
 `--scores` オプションを使って、各文字間のスコアを出力します。
 ```
-% echo '外国人参政権と政権交代' | cargo run --release -p predict -- --scores --model path/to/bccwj-suw+unidic.model.zst
+% echo '外国人参政権と政権交代' | cargo run --release -p predict -- --scores --model path/to/bccwj-suw+unidic_pos+pron.model.zst
 外国 人 参 政権 と 政権 交代
-0:外国 -11785
-1:国人 16634
-2:人参 5450
-3:参政 4480
-4:政権 -3697
-5:権と 17702
-6:と政 18699
-7:政権 -12742
-8:権交 14578
-9:交代 -7658
+0:外国 -10784
+1:国人 17935
+2:人参 5308
+3:参政 3833
+4:政権 -3299
+5:権と 14635
+6:と政 17653
+7:政権 -12705
+8:権交 11611
+9:交代 -5794
 ```
 
 正しくは `外国 人 参政 権` です。
@@ -169,11 +169,11 @@ Vaporetto は2種類のコーパス（フルアノテーションコーパスと
 
    Vaporetto は、重みの合計が正の値になった際にテキストを分割するので、以下のように新しいエントリを追加します。
    ```diff
-    参撾,3167 -6074 3790,
-    参政,3167 -6074 3790,
+    参撾,3328 -5545 3514,
+    参政,3328 -5545 3514,
    +参政権,0 -10000 10000 0,参政/権
-    参朝,3167 -6074 3790,
-    参校,3167 -6074 3790,
+    参朝,3328 -5545 3514,
+    参校,3328 -5545 3514,
    ```
 
    この場合、 `参` と `政` の間に `-10000` が、 `政` と `権` の間に `10000` が加算されます。
@@ -186,23 +186,23 @@ Vaporetto は2種類のコーパス（フルアノテーションコーパスと
 
 3. モデルファイルの重みを置換します。
    ```
-   % cargo run --release -p manipulate_model -- --model-in path/to/bccwj-suw+unidic.model.zst --replace-dict path/to/dictionary.csv --model-out path/to/bccwj-suw+unidic-new.model.zst
+   % cargo run --release -p manipulate_model -- --model-in path/to/bccwj-suw+unidic_pos+pron.model.zst --replace-dict path/to/dictionary.csv --model-out path/to/bccwj-suw+unidic_pos+pron-new.model.zst
    ```
 
 これで `外国人参政権` が正しいトークンに分割されます。
 ```
-% echo '外国人参政権と政権交代' | cargo run --release -p predict -- --scores --model path/to/bccwj-suw+unidic-new.model.zst
+% echo '外国人参政権と政権交代' | cargo run --release -p predict -- --scores --model path/to/bccwj-suw+unidic_pos+pron-new.model.zst
 外国 人 参政 権 と 政権 交代
-0:外国 -11785
-1:国人 16634
-2:人参 5450
-3:参政 -5520
-4:政権 6303
-5:権と 17702
-6:と政 18699
-7:政権 -12742
-8:権交 14578
-9:交代 -7658
+0:外国 -10784
+1:国人 17935
+2:人参 5308
+3:参政 -6167
+4:政権 6701
+5:権と 14635
+6:と政 17653
+7:政権 -12705
+8:権交 11611
+9:交代 -5794
 ```
 
 ### 品詞推定

diff --git a/README.md b/README.md
@@ -25,21 +25,21 @@ Vaporetto provides three ways to generate tokenization models:
 #### Download Distribution Model
 
 The first is the simplest way, which is to download a model we have trained.
-Models are available [here](https://github.com/daac-tools/vaporetto/releases).
+Models are available [here](https://github.com/daac-tools/vaporetto-models/releases).
 
-We chose `bccwj-suw+unidic+tag`:
+We chose `bccwj-suw+unidic_pos+pron`:
 ```
-% wget https://github.com/daac-tools/vaporetto/releases/download/v0.5.0/bccwj-suw+unidic+tag.tar.xz
+% wget https://github.com/daac-tools/vaporetto-models/releases/download/v0.5.0/bccwj-suw+unidic_pos+pron.tar.xz
 ```
 
 Each file is a compressed file containing a model file and license terms, so you need to decompress the downloaded file as shown in the following command:
 ```
-% tar xf ./bccwj-suw+unidic+tag.tar.xz
+% tar xf ./bccwj-suw+unidic_pos+pron.tar.xz
 ```
 
 To perform tokenization, run the following command:
 ```
-% echo 'ヴェネツィアはイタリアにあります。' | cargo run --release -p predict -- --model path/to/bccwj-suw+unidic+tag.model.zst
+% echo 'ヴェネツィアはイタリアにあります。' | cargo run --release -p predict -- --model path/to/bccwj-suw+unidic_pos+pron.model.zst
 ```
 
 The following will be output:
@@ -55,7 +55,7 @@ you must decompress them outside of the API.
 
 ```rust
 // Requires zstd crate or ruzstd crate
-let reader = zstd::Decoder::new(File::open("path/to/model.bin.zst")?)?;
+let reader = zstd::Decoder::new(File::open("path/to/model.zst")?)?;
 let model = Model::read(reader)?;
 ```
 
@@ -143,26 +143,26 @@ Sometimes, your model will output different results than what you expect.
 For example, `外国人参政権` is split into wrong tokens in the following command.
 We use the `--scores` option to show the score of each character boundary:
 ```
-% echo '外国人参政権と政権交代' | cargo run --release -p predict -- --scores --model path/to/bccwj-suw+unidic.model.zst
+% echo '外国人参政権と政権交代' | cargo run --release -p predict -- --scores --model path/to/bccwj-suw+unidic_pos+pron.model.zst
 外国 人 参 政権 と 政権 交代
-0:外国 -11785
-1:国人 16634
-2:人参 5450
-3:参政 4480
-4:政権 -3697
-5:権と 17702
-6:と政 18699
-7:政権 -12742
-8:権交 14578
-9:交代 -7658
+0:外国 -10784
+1:国人 17935
+2:人参 5308
+3:参政 3833
+4:政権 -3299
+5:権と 14635
+6:と政 17653
+7:政権 -12705
+8:権交 11611
+9:交代 -5794
 ```
 
-The correct is `外国 人 参政 権`.
+The correct tokenization is `外国 人 参政 権`.
 To split `外国人参政権` into correct tokens, manipulate the model in the following steps so that the sign of score of `参政権` becomes inverted:
 
 1. Dump a dictionary by the following command:
    ```
-   % cargo run --release -p manipulate_model -- --model-in path/to/bccwj-suw+unidic.model.zst --dump-dict path/to/dictionary.csv
+   % cargo run --release -p manipulate_model -- --model-in path/to/bccwj-suw+unidic_pos+pron.model.zst --dump-dict path/to/dictionary.csv
    ```
 
 2. Edit the dictionary.
@@ -175,11 +175,11 @@ To split `外国人参政権` into correct tokens, manipulate the model in the f
 
    Vaporetto splits a text when the total weight of the boundary is a positive number, so we add a new entry as follows:
    ```diff
-    参撾,3167 -6074 3790,
-    参政,3167 -6074 3790,
+    参撾,3328 -5545 3514,
+    参政,3328 -5545 3514,
    +参政権,0 -10000 10000 0,参政/権
-    参朝,3167 -6074 3790,
-    参校,3167 -6074 3790,
+    参朝,3328 -5545 3514,
+    参校,3328 -5545 3514,
    ```
 
    In this case, `-10000` will be added between `参` and `政`, and `10000` will be added between `政` and `権`.
@@ -192,23 +192,23 @@ To split `外国人参政権` into correct tokens, manipulate the model in the f
 
-3. Replaces weight data of a model file
+3. Replace the weight data of the model file:
    ```
-   % cargo run --release -p manipulate_model -- --model-in path/to/bccwj-suw+unidic.model.zst --replace-dict path/to/dictionary.csv --model-out path/to/bccwj-suw+unidic-new.model.zst
+   % cargo run --release -p manipulate_model -- --model-in path/to/bccwj-suw+unidic_pos+pron.model.zst --replace-dict path/to/dictionary.csv --model-out path/to/bccwj-suw+unidic_pos+pron-new.model.zst
    ```
 
 Now `外国人参政権` is split into correct tokens.
 ```
-% echo '外国人参政権と政権交代' | cargo run --release -p predict -- --scores --model path/to/bccwj-suw+unidic-new.model.zst
+% echo '外国人参政権と政権交代' | cargo run --release -p predict -- --scores --model path/to/bccwj-suw+unidic_pos+pron-new.model.zst
 外国 人 参政 権 と 政権 交代
-0:外国 -11785
-1:国人 16634
-2:人参 5450
-3:参政 -5520
-4:政権 6303
-5:権と 17702
-6:と政 18699
-7:政権 -12742
-8:権交 14578
-9:交代 -7658
+0:外国 -10784
+1:国人 17935
+2:人参 5308
+3:参政 -6167
+4:政権 6701
+5:権と 14635
+6:と政 17653
+7:政権 -12705
+8:権交 11611
+9:交代 -5794
 ```
 
 ### Tagging

diff --git a/examples/wasm/README.md b/examples/wasm/README.md
@@ -14,9 +14,9 @@ cargo install trunk
 cargo install wasm-bindgen-cli
 
 # Downloads and extracts the model file
-wget https://github.com/daac-tools/vaporetto/releases/download/v0.5.0/bccwj-suw+unidic+tag-huge.tar.xz
-tar xf ./bccwj-suw+unidic+tag-huge.tar.xz
-mv ./bccwj-suw+unidic+tag-huge/bccwj-suw+unidic+tag-huge.model.zst ./src/
+wget https://github.com/daac-tools/vaporetto-models/releases/download/v0.5.0/bccwj-suw+unidic_pos+pron.tar.xz
+tar xf ./bccwj-suw+unidic_pos+pron.tar.xz
+mv ./bccwj-suw+unidic_pos+pron/bccwj-suw+unidic_pos+pron.model.zst ./src/
 
 # Builds and launches the server
 # Note: We recommend using --release flag to reduce loading time.

diff --git a/examples/wasm/src/lib.rs b/examples/wasm/src/lib.rs
@@ -50,7 +50,7 @@ impl Worker for VaporettoWorker {
     type Output = (Vec<Token>, usize);
 
     fn create(_scope: &WorkerScope<Self>) -> Self {
-        let model_data = include_bytes!("bccwj-suw+unidic+tag-huge.model.zst");
+        let model_data = include_bytes!("bccwj-suw+unidic_pos+pron.model.zst");
         let mut decoder = ruzstd::StreamingDecoder::new(model_data.as_slice()).unwrap();
         let mut buff = vec![];
         decoder.read_to_end(&mut buff).unwrap();

diff --git a/predict/src/main.rs b/predict/src/main.rs
@@ -136,11 +136,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                 s_orig.tags_mut().clone_from_slice(s.tags());
                 s_orig.write_tokenized_text(&mut buf);
                 out.write_all(buf.as_bytes())?;
+                out.write_all(b"\n")?;
                 if args.scores {
                     print_scores(&s, &mut out)?;
                 }
+            } else {
+                out.write_all(b"\n")?;
             }
-            out.write_all(b"\n")?;
             if is_tty {
                 out.flush()?;
             }