Skip to content

Commit

Permalink
Use cedarwood for user's kana-trie.
Browse files Browse the repository at this point in the history
  • Loading branch information
tokuhirom committed Jan 14, 2023
1 parent b1ead97 commit d86ce29
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 14 deletions.
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion ibus-akaza/ibus-akaza-debug.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ exec 2>&1

export RUST_BACKTRACE=4

exec $BASEDIR/../target/debug/ibus-akaza --ibus -v
exec $BASEDIR/../target/debug/ibus-akaza --ibus -vv
1 change: 1 addition & 0 deletions libakaza/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ kelp = "0.3.0"
xdg = "2.4.1"
crawdad = "0.4.0"
encoding_rs = "0.8.31"
cedarwood = "0.4.6"

[build-dependencies]

Expand Down
68 changes: 68 additions & 0 deletions libakaza/src/kana_trie/cedarwood_kana_trie.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
use cedarwood::Cedar;

use crate::kana_trie::base::KanaTrie;

/// Kana trie backed by a cedarwood [`Cedar`] instance.
///
/// The trie itself only stores an `i32` per key, so the key strings are kept
/// alongside it in `words`; the value stored for a key is the position of that
/// key inside `words`.
pub struct CedarwoodKanaTrie {
    /// Trie mapping each registered reading to its index in `words`.
    cedar: Cedar,
    /// Registered readings, addressed by the values stored in `cedar`.
    words: Vec<String>,
}

impl Default for CedarwoodKanaTrie {
    /// Creates an empty trie: a fresh `Cedar` and no stored words.
    fn default() -> Self {
        Self {
            cedar: Cedar::new(),
            words: Vec::new(),
        }
    }
}

impl CedarwoodKanaTrie {
    /// Builds a trie containing every key in `keys`.
    ///
    /// Empty keys and keys that were already registered earlier in `keys` are
    /// skipped, so `words` holds each reading exactly once.
    pub fn build(keys: Vec<String>) -> CedarwoodKanaTrie {
        let mut trie = CedarwoodKanaTrie {
            cedar: Cedar::new(),
            words: Vec::new(),
        };
        for key in keys {
            if trie.is_insertable(&key) {
                trie.push_word(key);
            }
        }
        trie
    }

    /// Returns `true` when `key` itself (not merely a prefix of it) is registered.
    pub fn contains(&self, key: &str) -> bool {
        self.cedar.exact_match_search(key).is_some()
    }

    /// Registers `key` in the trie.
    ///
    /// This is a no-op for an empty key and for a key that is already
    /// registered, so calling it repeatedly never grows `words` with
    /// duplicates.
    pub fn update(&mut self, key: &str) {
        // Check before allocating the owned copy.
        if self.is_insertable(key) {
            self.push_word(key.to_string());
        }
    }

    /// Decides whether `key` should get a new slot in `words`.
    fn is_insertable(&self, key: &str) -> bool {
        // NOTE(review): cedarwood's insert path panics on a zero-length key in
        // the versions I checked; confirm against the pinned 0.4.x release.
        // The emptiness test runs first so the trie is never queried with "".
        !key.is_empty() && !self.contains(key)
    }

    /// Appends `key` to `words` and stores its index as the trie value.
    ///
    /// Callers must have checked `is_insertable` first.
    ///
    /// # Panics
    ///
    /// Panics if the trie already holds more than `i32::MAX` words, because
    /// the index could no longer be represented as the trie's `i32` value.
    fn push_word(&mut self, key: String) {
        let index = self.words.len();
        // Fail loudly instead of letting the `as` cast wrap to a negative
        // value that would later be used as a bogus index into `words`.
        assert!(
            index <= i32::MAX as usize,
            "kana trie is full: cannot index more than i32::MAX words"
        );
        self.cedar.update(key.as_str(), index as i32);
        self.words.push(key);
    }
}

impl KanaTrie for CedarwoodKanaTrie {
    /// Collects the registered words reported by the trie's common-prefix
    /// iteration over `query`, in the order the iterator yields them.
    fn common_prefix_search(&self, query: &str) -> Vec<String> {
        let mut found = Vec::new();
        // Each hit carries the stored value (an index into `words`); the
        // second tuple element is not needed here.
        for (word_id, _) in self.cedar.common_prefix_iter(query) {
            found.push(self.words[word_id as usize].clone());
        }
        found
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Common-prefix search returns every registered word that prefixes the query.
    #[test]
    fn hello() {
        // `build` returns the trie directly (not a `Result`), so no `?` here.
        let trie = CedarwoodKanaTrie::build(vec![
            "わたし".to_string(),
            "わた".to_string(),
            "わし".to_string(),
            "ほげほげ".to_string(),
        ]);
        assert_eq!(
            trie.common_prefix_search("わたしのきもち"),
            vec!("わた", "わたし")
        );
    }

    /// A word becomes visible to `contains` once it has been added with `update`.
    #[test]
    fn update_then_contains() {
        let mut trie = CedarwoodKanaTrie::default();
        assert!(!trie.contains("わたし"));
        trie.update("わたし");
        assert!(trie.contains("わたし"));
        assert!(!trie.contains("ほげ"));
    }
}
1 change: 1 addition & 0 deletions libakaza/src/kana_trie/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
* 何を利用するかを入れ替えられるようにしようかな、と。
*/
pub mod base;
pub mod cedarwood_kana_trie;
pub mod crawdad_kana_trie;
pub mod marisa_kana_trie;
61 changes: 48 additions & 13 deletions libakaza/src/user_side_data/user_data.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
use std::collections::HashMap;
use std::path::Path;
use std::sync::Mutex;
use std::time::SystemTime;

use crate::graph::word_node::WordNode;
use anyhow::Result;
use log::{info, warn};
use log::{info, trace, warn};

use crate::kana_trie::crawdad_kana_trie::CrawdadKanaTrie;
use crate::graph::word_node::WordNode;
use crate::kana_trie::cedarwood_kana_trie::CedarwoodKanaTrie;
use crate::user_side_data::bigram_user_stats::BiGramUserStats;
use crate::user_side_data::unigram_user_stats::UniGramUserStats;
use crate::user_side_data::user_stats_utils::{read_user_stats_file, write_user_stats_file};
Expand All @@ -17,9 +18,10 @@ use crate::user_side_data::user_stats_utils::{read_user_stats_file, write_user_s
#[derive(Default)]
pub struct UserData {
/// 読み仮名のトライ。入力変換時に共通接頭辞検索するために使用。
// ここで MARISA ではなく Crawdad を採用しているのは、FFI していると std::marker::Send を実装できなくて
// スレッドをまたいだ処理が困難になるから、以上の理由はないです。
kana_trie: Mutex<CrawdadKanaTrie>,
// ここで MARISA ではなく Cedarwood を採用しているのは
// - FFI していると std::marker::Send を実装できなくてスレッドをまたいだ処理が困難になるから
// - 更新可能なトライ構造だから
kana_trie: Mutex<CedarwoodKanaTrie>,

unigram_user_stats: UniGramUserStats,
bigram_user_stats: BiGramUserStats,
Expand Down Expand Up @@ -98,13 +100,29 @@ impl UserData {
}
};

let kana_trie = match CrawdadKanaTrie::load(kana_trie_path) {
Ok(trie) => trie,
Err(err) => {
warn!("Cannot load kana trie: {} {}", kana_trie_path, err);
CrawdadKanaTrie::default()
}
};
// let kana_trie = match CedarwoodKanaTrie::load(kana_trie_path) {
// Ok(trie) => trie,
// Err(err) => {
// warn!("Cannot load kana trie: {} {}", kana_trie_path, err);
// CedarwoodKanaTrie::default()
// }
// };

// cedarwood トライを構築する。
// キャッシュせずに動的に構築する方向性。
let t1 = SystemTime::now();
let yomis = unigram_user_stats
.word_count
.keys()
.filter_map(|it| it.split_once('/'))
.map(|(_, yomi)| yomi.to_string())
.collect::<Vec<_>>();
let kana_trie = CedarwoodKanaTrie::build(yomis);
let t2 = SystemTime::now();
info!(
"Built kana trie in {}msec",
t2.duration_since(t1).unwrap().as_millis()
);

UserData {
unigram_user_stats,
Expand All @@ -122,6 +140,21 @@ impl UserData {
/// Records the given entries in the user statistics and the kana trie.
///
/// Each entry is forwarded to the unigram and bigram statistics as-is.
/// For the trie, only the part after the first `/` of an entry (the
/// reading) is used; entries without a `/` are ignored, and readings that
/// are already registered are not added again. Finally the data is marked
/// as needing a save.
pub fn record_entries(&mut self, kanji_kanas: &[String]) {
    self.unigram_user_stats.record_entries(kanji_kanas);
    self.bigram_user_stats.record_entries(kanji_kanas);

    // `&mut self` gives exclusive access, so the mutex is reached through
    // `get_mut` rather than by locking it.
    let trie = self.kana_trie.get_mut().unwrap();
    let readings = kanji_kanas
        .iter()
        .filter_map(|entry| entry.split_once('/'))
        .map(|(_, reading)| reading);
    for reading in readings {
        if trie.contains(reading) {
            trace!("Skip word: {}", reading);
        } else {
            trace!("Record word to kana_trie: {}", reading);
            trie.update(reading);
        }
    }

    self.need_save = true;
}

pub fn write_user_stats_file(&self) -> Result<()> {
Expand All @@ -132,6 +165,8 @@ impl UserData {
if let Some(bigram_path) = &self.bigram_path {
write_user_stats_file(bigram_path, &self.bigram_user_stats.word_count)?;
}
// ↓ TODO ここ更新しないと意味ない
// self.need_save = false;
Ok(())
}

Expand Down

0 comments on commit d86ce29

Please sign in to comment.