diff --git a/README.md b/README.md
index 74a762f..13fabf5 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,86 @@
-# Lexicon-NN
+# LGN
 
-Code for an EMNLP long paper (Under review).
+PyTorch implementation of "A Lexicon-Based Graph Neural Network for Chinese NER".
+
+The code is partially based on https://github.com/jiesutd/LatticeLSTM.
+
+## Requirements
+
+* Python 3.6 or higher
+* PyTorch 0.4.1 or higher
+
+## Input Format
+
+BMES tag scheme, with one character and its label per line. Sentences are separated by a blank line.
+
+    印 B-LOC
+    度 M-LOC
+    河 E-LOC
+    流 O
+    经 O
+    印 B-GPE
+    度 E-GPE
+
+## Usage
+
+* Training
+
+      python main.py --status train \
+             --train data/onto4ner.cn/train.char.bmes \
+             --dev data/onto4ner.cn/dev.char.bmes \
+             --test data/onto4ner.cn/test.char.bmes \
+             --saved_model saved_model/model_onto4ner \
+             --saved_set data/onto4ner.cn/saved.dset
+
+* Testing
+
+      python main.py --status test \
+             --test data/onto4ner.cn/test.char.bmes \
+             --saved_model saved_model/model_onto4ner \
+             --saved_set data/onto4ner.cn/saved.dset
+
+* Decoding (the raw file can be either labeled or unlabeled)
+
+      python main.py --status decode \
+             --raw data/onto4ner.cn/test.char.bmes \
+             --output tagged_file.txt \
+             --saved_model saved_model/model_onto4ner \
+             --saved_set data/onto4ner.cn/saved.dset
+
+## Data Downloads
+
+The pretrained character and word embeddings can be downloaded from [Lattice LSTM](https://github.com/jiesutd/LatticeLSTM).
+
+Datasets including OntoNotes, MSRA, Weibo and Resume are available at Google Drive or Baidu Pan.
+
+## Pretrained Model Downloads
+
+We also provide pretrained models on the four datasets, which are the same models as reported in the paper.
+If you retrain models from scratch under the same hyper-parameter settings, you may obtain a slightly
+lower or higher F1 score than the one reported in the paper (in our experiments we selected the model that performed best).
+
+Pretrained models and the corresponding hyper-parameter settings are available at Google Drive or Baidu Pan.
+
+When running main.py in test mode with the pretrained models, you should obtain the following results:
+
+| Datasets       | Precision | Recall | F1    |
+|:--------------:|:---------:|:------:|:-----:|
+| OntoNotes dev  | 74.00     | 70.03  | 71.96 |
+| OntoNotes test | 76.13     | 73.68  | 74.89 |
+| MSRA dev       | -         | -      | -     |
+| MSRA test      | 94.19     | 92.73  | 93.46 |
+| Weibo dev      | 66.09     | 59.13  | 62.42 |
+| Weibo test     | 65.71     | 55.56  | 60.21 |
+| Resume dev     | 94.27     | 94.59  | 94.43 |
+| Resume test    | 95.28     | 95.46  | 95.37 |
+
+## Cite
+
+    @inproceedings{gui2019lexicon,
+        title={A Lexicon-Based Graph Neural Network for Chinese NER},
+        author={Gui, Tao and Zou, Yicheng and Zhang, Qi and Peng, Minlong and
+                Fu, Jinlan and Wei, Zhongyu and Huang, Xuanjing},
+        booktitle={2019 Conference on Empirical Methods in Natural Language Processing and
+                   9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
+        year={2019}
+    }
\ No newline at end of file
diff --git a/main.py b/main.py
index e660f3f..0a3b846 100644
--- a/main.py
+++ b/main.py
@@ -87,6 +87,29 @@ def recover_label(pred_variable, gold_variable, mask_variable, label_alphabet):
     return pred_label, gold_label
 
 
+def print_args(args):
+    print("CONFIG SUMMARY:")
+    print(" Batch size: %s" % (args.batch_size))
+    print(" If use GPU: %s" % (args.use_gpu))
+    print(" If use CRF: %s" % (args.use_crf))
+    print(" Epoch number: %s" % (args.num_epoch))
+    print(" Learning rate: %s" % (args.lr))
+    print(" L2 normalization rate: %s" % (args.weight_decay))
+    print(" If use edge embedding: %s" % (args.use_edge))
+    print(" If use global node: %s" % (args.use_global))
+    print(" Bidirectional digraph: %s" % (args.bidirectional))
+    print(" Update step number: %s" % (args.iters))
+    print(" Attention dropout rate: %s" % (args.tf_drop_rate))
+    print(" Embedding dropout rate: %s" % (args.emb_drop_rate))
+    print(" Hidden state dimension: %s" % (args.hidden_dim))
+    print(" Learning rate decay ratio: %s" % (args.lr_decay))
+    print(" Aggregation module dropout rate: %s" % (args.cell_drop_rate))
+    print(" Head number of attention: %s" % (args.num_head))
+    print(" Head dimension of attention: %s" % (args.head_dim))
+    print("CONFIG SUMMARY END.")
+    sys.stdout.flush()
+
+
 def evaluate(data, args, model, name):
     if name == "train":
         instances = data.train_Ids
@@ -342,7 +365,7 @@ def load_model_decode(model_dir, data, args, name):
     parser.add_argument('--raw', help='Raw file for decoding.')
     parser.add_argument('--output', help='Output results for decoding.')
     parser.add_argument('--saved_set', help='Path of saved data set.', default='data/onto4ner.cn/saved.dset')
-    parser.add_argument('--saved_model', help='Path of saved model.', default="saved_model/model_ontonote")
+    parser.add_argument('--saved_model', help='Path of saved model.', default="saved_model/model_onto4ner")
     parser.add_argument('--char_emb', help='Path of character embedding file.', default="data/gigaword_chn.all.a2b.uni.ite50.vec")
     parser.add_argument('--word_emb', help='Path of word embedding file.', default="data/ctb.50d.vec")
@@ -352,7 +375,7 @@
     parser.add_argument('--bidirectional', type=str2bool, default=True, help='If use bidirectional digraph.')
     parser.add_argument('--seed', help='Random seed', default=1023, type=int)
-    parser.add_argument('--batch_size', help='Batch size. ', default=1, type=int)
+    parser.add_argument('--batch_size', help='Batch size.', default=1, type=int)
     parser.add_argument('--num_epoch', default=100, type=int, help="Epoch number.")
     parser.add_argument('--iters', default=4, type=int, help='The number of Graph iterations.')
     parser.add_argument('--hidden_dim', default=50, type=int, help='Hidden state size.')
@@ -378,7 +401,6 @@
     torch.manual_seed(seed_num)
     np.random.seed(seed_num)
 
-
     train_file = args.train
     dev_file = args.dev
     test_file = args.test
@@ -412,6 +434,7 @@
         args.label_alphabet_size = data.label_alphabet.size()
         args.char_dim = data.char_emb_dim
         args.word_dim = data.word_emb_dim
+        print_args(args)
 
         train(data, args, saved_model_path)
     elif status == 'test':
@@ -426,6 +449,8 @@
         data.generate_instance_with_words(test_file, 'test')
         with open(saved_model_path + "_best_HP.config", "rb") as f:
             args = pickle.load(f)
+        data.show_data_summary()
+        print_args(args)
         load_model_decode(saved_model_path, data, args, "test")
 
     elif status == 'decode':
@@ -440,6 +465,8 @@
         data.generate_instance_with_words(raw_file, 'raw')
         with open(saved_model_path + "_best_HP.config", "rb") as f:
             args = pickle.load(f)
+        data.show_data_summary()
+        print_args(args)
         decode_results = load_model_decode(saved_model_path, data, args, "raw")
         data.write_decoded_results(output_file, decode_results, 'raw')
     else:
diff --git a/utils/functions.py b/utils/functions.py
index 4f951f5..2038b74 100644
--- a/utils/functions.py
+++ b/utils/functions.py
@@ -3,7 +3,6 @@
 # @Last Modified by: Yicheng Zou,  Contact: yczou18@fudan.edu.cn
 
 import numpy as np
-import re
 
 
 def normalize_word(word):
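
As a supplementary illustration (not part of the patch above), the BMES input format described in the README hunk can be parsed with a few lines of Python. This is a minimal sketch assuming UTF-8 files with one whitespace-separated "character label" pair per line and a blank line between sentences; the helper name `read_bmes_file` is hypothetical and does not exist in the repository.

```python
# Illustrative sketch only: `read_bmes_file` is a hypothetical helper,
# not a function defined in this diff or in the repository.
def read_bmes_file(path):
    """Return a list of sentences, each a list of (character, label) pairs."""
    sentences, current = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:          # a blank line closes the current sentence
                if current:
                    sentences.append(current)
                    current = []
                continue
            char, label = line.split()[:2]
            current.append((char, label))
    if current:                   # handle a file without a trailing blank line
        sentences.append(current)
    return sentences


# Example usage (path taken from the README's usage commands):
# sentences = read_bmes_file("data/onto4ner.cn/dev.char.bmes")
# print(sentences[0])  # e.g. [('印', 'B-LOC'), ('度', 'M-LOC'), ('河', 'E-LOC'), ...]
```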