Skip to content

Commit

Permalink
Added server command line option for cased tokenization, as required …
Browse files Browse the repository at this point in the history
…to keep the casing and accented letters in the pretrained multilingual cased model and other models.
  • Loading branch information
PeterisP committed May 13, 2019
1 parent 7a033e1 commit 9da8f9c
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 1 deletion.
3 changes: 2 additions & 1 deletion server/bert_serving/server/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ def __init__(self, id, args, worker_address_list, sink_address, device_id, graph
self.device_id = device_id
self.logger = set_logger(colored('WORKER-%d' % self.worker_id, 'yellow'), args.verbose)
self.max_seq_len = args.max_seq_len
self.do_lower_case = args.do_lower_case
self.mask_cls_sep = args.mask_cls_sep
self.daemon = True
self.exit_flag = multiprocessing.Event()
Expand Down Expand Up @@ -532,10 +533,10 @@ def input_fn_builder(self, socks, tf, sink):
from .bert.tokenization import FullTokenizer

def gen():
tokenizer = FullTokenizer(vocab_file=os.path.join(self.model_dir, 'vocab.txt'))
# Windows does not support logger in MP environment, thus get a new logger
# inside the process for better compatibility
logger = set_logger(colored('WORKER-%d' % self.worker_id, 'yellow'), self.verbose)
tokenizer = FullTokenizer(vocab_file=os.path.join(self.model_dir, 'vocab.txt'), do_lower_case=self.do_lower_case)

poller = zmq.Poller()
for sock in socks:
Expand Down
3 changes: 3 additions & 0 deletions server/bert_serving/server/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ def get_args_parser():
group2.add_argument('-max_seq_len', type=check_max_seq_len, default=25,
help='maximum length of a sequence, longer sequence will be trimmed on the right side. '
'set it to NONE for dynamically using the longest sequence in a (mini)batch.')
# Inverted flag: passing -cased_tokenization stores False into args.do_lower_case
# (default True), so lowercasing stays enabled unless the flag is given.
group2.add_argument('-cased_tokenization', dest='do_lower_case', action='store_false', default=True,
                    help='Whether tokenizer should skip the default lowercasing and accent removal. '
                         'Should be used for e.g. the multilingual cased pretrained BERT model.')
group2.add_argument('-pooling_layer', type=int, nargs='+', default=[-2],
help='the encoder layer(s) that receives pooling. \
Give a list in order to concatenate several layers into one')
Expand Down

0 comments on commit 9da8f9c

Please sign in to comment.