Skip to content

Commit

Permalink
Get dictionary path from environment variables.
Browse files Browse the repository at this point in the history
  • Loading branch information
tokuhirom committed Jan 16, 2023
1 parent d0792b1 commit 1bca98f
Show file tree
Hide file tree
Showing 9 changed files with 62 additions and 53 deletions.
6 changes: 1 addition & 5 deletions akaza-data/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,6 @@ struct EvaluateArgs {
load_user_config: bool,
/// コーパスが格納されているディレクトリ
corpus_dir: String,
/// 評価に利用するシステムデータのディレクトリ
system_data_dir: String,
}

/// 動作確認する
Expand Down Expand Up @@ -206,9 +204,7 @@ fn main() -> anyhow::Result<()> {
Commands::MakeSystemDict(opt) => {
make_system_dict(&opt.txt_file, Some(opt.vocab_file.as_str()), opt.corpus)
}
Commands::Evaluate(opt) => {
evaluate(&opt.corpus_dir, &opt.system_data_dir, opt.load_user_config)
}
Commands::Evaluate(opt) => evaluate(&opt.corpus_dir, opt.load_user_config),
Commands::Check(opt) => check(&opt.yomi, opt.expected, opt.user_data),
Commands::LearnCorpus(opts) => learn_corpus(
opts.delta,
Expand Down
4 changes: 1 addition & 3 deletions akaza-data/src/subcmd/check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,12 @@ use libakaza::engine::bigram_word_viterbi_engine::BigramWordViterbiEngineBuilder
use libakaza::user_side_data::user_data::UserData;

pub fn check(yomi: &str, expected: Option<String>, user_data: bool) -> anyhow::Result<()> {
let datadir = env!("CARGO_MANIFEST_DIR").to_string() + "/data/";

let dict = merge_dict(vec![
read_skkdict(Path::new("skk-dev-dict/SKK-JISYO.L"), EUC_JP)?,
read_skkdict(Path::new("data/SKK-JISYO.akaza"), UTF_8)?,
]);

let mut builder = BigramWordViterbiEngineBuilder::new(&datadir, Some(dict), None);
let mut builder = BigramWordViterbiEngineBuilder::new(Some(dict), None);
if user_data {
info!("Enabled user data");
let user_data = UserData::load_from_default_path()?;
Expand Down
8 changes: 2 additions & 6 deletions akaza-data/src/subcmd/evaluate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,7 @@ impl SaigenRitsu {
/// にのっている評価方法を採用。
///
/// なぜこうしているかというと、mozc の論文にのっている BLEU を使用する方式より実装が楽だからです!
pub fn evaluate(
corpus_dir: &String,
system_data_dir: &str,
load_user_config: bool,
) -> anyhow::Result<()> {
pub fn evaluate(corpus_dir: &String, load_user_config: bool) -> anyhow::Result<()> {
/*
# corpus.0.txt デバッグ用のファイル
# corpus.1.txt メイン(候補割り当ても含む)
Expand All @@ -75,7 +71,7 @@ pub fn evaluate(
read_skkdict(Path::new("data/SKK-JISYO.akaza"), UTF_8)?,
]);

let akaza = BigramWordViterbiEngineBuilder::new(system_data_dir, Some(dicts), None)
let akaza = BigramWordViterbiEngineBuilder::new(Some(dicts), None)
.load_user_config(load_user_config)
.build()?;

Expand Down
7 changes: 1 addition & 6 deletions ibus-akaza/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,14 @@ DATADIR ?= $(PREFIX)/share
DESTDIR ?=
AKAZA_DATA_DIR ?= $(DATADIR)/akaza/

all: akaza.xml config.h src/config.rs
all: akaza.xml config.h

# ibus_akaza/config.py: ibus_akaza/config.py.in
# sed -e "s:@SYSCONFDIR@:$(SYSCONFDIR):g" \
# -e "s:@MODELDIR@:$(DESTDIR)/$(DATADIR)/akaza-data/:g" \
# -e "s:@DICTIONARYDIR@:$(DESTDIR)/$(DATADIR)/ibus-akaza/dictionary:g" \
# $< > $@

src/config.rs: src/config.rs.in
sed \
-e "s:@AKAZA_DATA_DIR@:$(AKAZA_DATA_DIR):g" \
-e "s:@DATADIR@:$(DATADIR)/:g" $< > $@


config.h: config.h.in
sed \
Expand Down
2 changes: 2 additions & 0 deletions ibus-akaza/ibus-akaza-debug.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ umask 077
exec 1>> ~/.ibus-akaza.log
exec 2>&1

export AKAZA_DATA_DIR="$BASEDIR/../akaza-data/data/"

export RUST_BACKTRACE=4

exec $BASEDIR/../target/release/ibus-akaza --ibus -vv
1 change: 0 additions & 1 deletion ibus-akaza/src/config.rs.in

This file was deleted.

4 changes: 1 addition & 3 deletions ibus-akaza/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use anyhow::Result;
use clap::Parser;
use log::{error, info, warn};

use crate::config::AKAZA_DATA_DIR;
use ibus_sys::core::ibus_main;
use ibus_sys::engine::IBusEngine;
use ibus_sys::glib::{gchar, guint};
Expand All @@ -22,7 +21,6 @@ use crate::context::AkazaContext;
use crate::wrapper_bindings::{ibus_akaza_init, ibus_akaza_set_callback};

mod commands;
mod config;
mod context;
mod input_mode;
mod keymap;
Expand Down Expand Up @@ -103,7 +101,7 @@ fn main() -> Result<()> {
unsafe {
let sys_time = SystemTime::now();
let user_data = load_user_data();
let akaza = BigramWordViterbiEngineBuilder::new(AKAZA_DATA_DIR, None, None)
let akaza = BigramWordViterbiEngineBuilder::new(None, None)
.user_data(user_data.clone())
.load_user_config(true)
.build()?;
Expand Down
80 changes: 52 additions & 28 deletions libakaza/src/engine/bigram_word_viterbi_engine.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
use std::collections::vec_deque::VecDeque;
use std::collections::HashMap;
use std::env;
use std::ops::Range;
use std::path::{Path, PathBuf};
use std::rc::Rc;
use std::sync::{Arc, Mutex};
use std::time::SystemTime;

use anyhow::Result;
use anyhow::{bail, Result};
use encoding_rs::UTF_8;
use log::{info, warn};

use crate::config::Config;
use crate::dict::loader::load_dicts;
use crate::dict::merge_dict::merge_dict;
use crate::dict::skk::read::read_skkdict;
use crate::engine::base::HenkanEngine;
use crate::graph::graph_builder::GraphBuilder;
use crate::graph::graph_resolver::{Candidate, GraphResolver};
Expand All @@ -23,27 +27,6 @@ use crate::lm::system_unigram_lm::MarisaSystemUnigramLM;
use crate::romkan::RomKanConverter;
use crate::user_side_data::user_data::UserData;

/// Pair of system language models, as populated by `SystemDataLoader::load`.
pub struct SystemDataLoader {
/// System unigram language model (read from `stats-vibrato-unigram.trie` by `load`).
pub system_unigram_lm: MarisaSystemUnigramLM,
/// System bigram language model (read from `stats-vibrato-bigram.trie` by `load`).
pub system_bigram_lm: MarisaSystemBigramLM,
}

impl SystemDataLoader {
    /// Loads both system language models from `system_data_dir`.
    ///
    /// The model files are `stats-vibrato-unigram.trie` and
    /// `stats-vibrato-bigram.trie`; each file name is appended to
    /// `system_data_dir` with a `/` separator.
    ///
    /// # Errors
    ///
    /// Propagates the error of whichever model fails to load; the unigram
    /// model is loaded first.
    pub fn load(system_data_dir: &str) -> Result<SystemDataLoader> {
        let unigram_path = format!("{}/stats-vibrato-unigram.trie", system_data_dir);
        let bigram_path = format!("{}/stats-vibrato-bigram.trie", system_data_dir);

        // Field initializers run in source order, so the unigram model is
        // still loaded (and may still fail) before the bigram model.
        Ok(SystemDataLoader {
            system_unigram_lm: MarisaSystemUnigramLM::load(unigram_path.as_str())?,
            system_bigram_lm: MarisaSystemBigramLM::load(bigram_path.as_str())?,
        })
    }
}

/// バイグラムのビタビベースかな漢字変換エンジンです。
/// 単語バイグラムを採用しています。
pub struct BigramWordViterbiEngine<U: SystemUnigramLM, B: SystemBigramLM> {
Expand Down Expand Up @@ -139,7 +122,6 @@ impl<U: SystemUnigramLM, B: SystemBigramLM> BigramWordViterbiEngine<U, B> {
}

pub struct BigramWordViterbiEngineBuilder {
system_data_dir: String,
user_data: Option<Arc<Mutex<UserData>>>,
load_user_config: bool,
dicts: Option<HashMap<String, Vec<String>>>,
Expand All @@ -148,12 +130,10 @@ pub struct BigramWordViterbiEngineBuilder {

impl BigramWordViterbiEngineBuilder {
pub fn new(
system_data_dir: &str,
dicts: Option<HashMap<String, Vec<String>>>,
single_term: Option<HashMap<String, Vec<String>>>,
) -> BigramWordViterbiEngineBuilder {
BigramWordViterbiEngineBuilder {
system_data_dir: system_data_dir.to_string(),
user_data: None,
load_user_config: false,
dicts,
Expand All @@ -176,7 +156,19 @@ impl BigramWordViterbiEngineBuilder {
pub fn build(
&self,
) -> Result<BigramWordViterbiEngine<MarisaSystemUnigramLM, MarisaSystemBigramLM>> {
let system_data_loader = SystemDataLoader::load(self.system_data_dir.as_str())?;
let system_unigram_lm = MarisaSystemUnigramLM::load(
Self::try_load("stats-vibrato-unigram.trie")?
.to_string_lossy()
.to_string()
.as_str(),
)?;
let system_bigram_lm = MarisaSystemBigramLM::load(
Self::try_load("stats-vibrato-bigram.trie")?
.to_string_lossy()
.to_string()
.as_str(),
)?;
let system_dict = read_skkdict(Self::try_load("SKK-JISYO.akaza")?.as_path(), UTF_8)?;

let user_data = if let Some(d) = &self.user_data {
d.clone()
Expand All @@ -193,6 +185,7 @@ impl BigramWordViterbiEngineBuilder {
Config::default()
};
let dicts = load_dicts(&config.dicts)?;
let dicts = merge_dict(vec![system_dict, dicts]);
let single_term = if let Some(st) = &config.single_term {
load_dicts(st)?
} else {
Expand Down Expand Up @@ -235,8 +228,8 @@ impl BigramWordViterbiEngineBuilder {
dict,
single_term,
user_data.clone(),
Rc::new(system_data_loader.system_unigram_lm),
Rc::new(system_data_loader.system_bigram_lm),
Rc::new(system_unigram_lm),
Rc::new(system_bigram_lm),
);

let graph_resolver = GraphResolver::default();
Expand Down Expand Up @@ -273,4 +266,35 @@ impl BigramWordViterbiEngineBuilder {
);
Ok(config)
}

/// Resolves the on-disk path of the system data file named `file_name`.
///
/// Lookup order:
/// 1. When compiled with `cfg(test)`: `$CARGO_MANIFEST_DIR/../akaza-data/data/`.
/// 2. When the `AKAZA_DATA_DIR` environment variable is set: that directory.
/// 3. Otherwise: the XDG data directories, under the `akaza` prefix.
///
/// Only the first applicable location is consulted; there is no fall-through
/// to a later one when the file is missing.
///
/// # Errors
///
/// Returns an error when the file does not exist in the selected location,
/// or when constructing `xdg::BaseDirectories` fails.
pub fn try_load(file_name: &str) -> Result<PathBuf> {
    if cfg!(test) {
        let path = Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("../akaza-data/data/")
            .join(file_name);
        if !path.exists() {
            bail!("There's no {} for testing.", path.to_string_lossy());
        }
        return Ok(path);
    }

    // `env::var_os` instead of `env::var`: `env::var` yields `Err` for a value
    // that is set but not valid Unicode, which would silently discard an
    // explicitly configured directory and fall through to the XDG lookup.
    if let Some(dir) = env::var_os("AKAZA_DATA_DIR") {
        let dir = PathBuf::from(dir);
        let file = dir.join(file_name);
        if !file.exists() {
            bail!(
                "There's no {} in AKAZA_DATA_DIR({:?})",
                file.to_string_lossy(),
                dir,
            );
        }
        return Ok(file);
    }

    match xdg::BaseDirectories::with_prefix("akaza")?.find_data_file(file_name) {
        Some(path) => Ok(path),
        None => bail!("There's no {} in XDG_DATA_DIRS", file_name),
    }
}
}
3 changes: 2 additions & 1 deletion libakaza/tests/wnn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#[cfg(feature = "it")]
mod tests {
use std::collections::vec_deque::VecDeque;
use std::env;
use std::path::Path;

use anyhow::Result;
Expand All @@ -21,8 +22,8 @@ mod tests {
{
let datadir = env!("CARGO_MANIFEST_DIR").to_string() + "/../akaza-data/data/";
assert!(Path::new(datadir.as_str()).exists());
env::set_var("AKAZA_DATA_DIR", datadir);
BigramWordViterbiEngineBuilder::new(
datadir.as_str(),
Some(read_skkdict(
Path::new(
(env!("CARGO_MANIFEST_DIR").to_string()
Expand Down

0 comments on commit 1bca98f

Please sign in to comment.