diff --git a/README.md b/README.md
index fd9c2ab..ebcf5af 100644
--- a/README.md
+++ b/README.md
@@ -42,19 +42,19 @@ For tf_idf prediction, you need to first calculate the idf score for your datase
 
 ## Pre-training
 To run pretraining:
-`bash sentence_encoders/scripts/pretrain_bert.sh --model-type [model type]`
+`bash olfmlm/scripts/pretrain_bert.sh --model-type [model type]`
 
 Where model type is the name of the model you want to train. If model type is one of the modes, it will train using mlm and that mode (if model type is mlm, it will train using just mlm). The --modes argument will override this default behaviour. If model type is not a specified mode, the --modes argument is required.
 
 ## Distributed Pretraining
 Use pretrain_bert_distributed.sh instead.
-`bash sentence_encoders/scripts/pretrain_bert_distributed.sh --model-type [model type]`
+`bash olfmlm/scripts/pretrain_bert_distributed.sh --model-type [model type]`
 
 ## Evaluation
 To run evaluation:
 You will need to convert the saved state dict of the required model using the convert_state_dict.py file. Then run:
-`python3 -m sentence_encoders.evaluate.main --exp_name [experiment name]`
+`python3 -m olfmlm.evaluate.main --exp_name [experiment name]`
 
 Where experiment name is the same as the model type above. If using a saved checkpoint instead of the best model, use the --checkpoint argument.
diff --git a/arguments.py b/arguments.py
index 9ab8e47..373bee8 100644
--- a/arguments.py
+++ b/arguments.py
@@ -19,7 +19,7 @@ import os
 import re
 import torch
-from sentence_encoders.paths import bert_config_file, pretrained_path
+from olfmlm.paths import bert_config_file, pretrained_path
 
 def str2bool(v):
     if isinstance(v, bool):
diff --git a/configure_data.py b/configure_data.py
index 1205d03..f8949de 100644
--- a/configure_data.py
+++ b/configure_data.py
@@ -17,7 +17,7 @@ import copy
 import torch
-from sentence_encoders import data_utils
+from olfmlm import data_utils
 
 class DataConfig:
diff --git a/data_utils/__init__.py b/data_utils/__init__.py
index 7265fdd..dc1ddb6 100644
--- a/data_utils/__init__.py
+++ b/data_utils/__init__.py
@@ -16,12 +16,12 @@ import os
 import math
-from sentence_encoders.data_utils.samplers import DistributedBatchSampler
-from sentence_encoders.data_utils.datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, \
+from olfmlm.data_utils.samplers import DistributedBatchSampler
+from olfmlm.data_utils.datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, \
     bert_dataset
-from sentence_encoders.data_utils.lazy_loader import exists_lazy, make_lazy, lazy_array_loader
-from sentence_encoders.data_utils.tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, make_tokenizer
-import sentence_encoders.data_utils.corpora
+from olfmlm.data_utils.lazy_loader import exists_lazy, make_lazy, lazy_array_loader
+from olfmlm.data_utils.tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, make_tokenizer
+import olfmlm.data_utils.corpora
 
 TRAIN_DATA = 0
 VAL_DATA = 1
diff --git a/data_utils/corpora.py b/data_utils/corpora.py
index 4bc5981..99220b5 100755
--- a/data_utils/corpora.py
+++ b/data_utils/corpora.py
@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
"""several datasets with preset arguments""" -from sentence_encoders.data_utils.datasets import json_dataset, csv_dataset +from olfmlm.data_utils.datasets import json_dataset, csv_dataset -from sentence_encoders.paths import train_data_path +from olfmlm.paths import train_data_path import os class wikipedia(json_dataset): diff --git a/data_utils/datasets.py b/data_utils/datasets.py index 2b1873f..d6ff534 100644 --- a/data_utils/datasets.py +++ b/data_utils/datasets.py @@ -38,8 +38,8 @@ nltk.download('punkt') from nltk import tokenize -from sentence_encoders.data_utils.lazy_loader import lazy_array_loader, exists_lazy, make_lazy -from sentence_encoders.data_utils.tokenization import Tokenization +from olfmlm.data_utils.lazy_loader import lazy_array_loader, exists_lazy, make_lazy +from olfmlm.data_utils.tokenization import Tokenization def clean_tokens(text): """Runs basic whitespace cleaning and splitting on a piece of text.""" diff --git a/data_utils/make_dataset.py b/data_utils/make_dataset.py index 0894804..e6d5e95 100644 --- a/data_utils/make_dataset.py +++ b/data_utils/make_dataset.py @@ -12,7 +12,7 @@ """ -from sentence_encoders import data_utils +from olfmlm import data_utils from multiprocessing import Pool from blingfire import text_to_sentences from torch.utils import data diff --git a/data_utils/tokenization.py b/data_utils/tokenization.py index 072adbe..95482d2 100755 --- a/data_utils/tokenization.py +++ b/data_utils/tokenization.py @@ -23,7 +23,7 @@ from nltk import tokenize as nltk_tokenize import sentencepiece as spm -from sentence_encoders.data_utils.wordpiece import BertTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP +from olfmlm.data_utils.wordpiece import BertTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP def make_tokenizer(tokenizer_type, corpus, model_path=None, vocab_size=None, model_type='bpe', pad_token=0, character_coverage=1.0, command_tokens=None, type_tokens=None, **kwargs): """ diff --git a/data_utils/wordpiece.py b/data_utils/wordpiece.py index 8d54d6e..ed0ece5 100755 --- a/data_utils/wordpiece.py +++ b/data_utils/wordpiece.py @@ -22,7 +22,7 @@ import unicodedata from io import open -from sentence_encoders.data_utils.file_utils import cached_path +from olfmlm.data_utils.file_utils import cached_path logger = logging.getLogger(__name__) diff --git a/evaluate/utils/config.py b/evaluate/utils/config.py index b666c5a..c86b258 100644 --- a/evaluate/utils/config.py +++ b/evaluate/utils/config.py @@ -121,7 +121,7 @@ def params_from_file(config_files: Union[str, Iterable[str]], overrides: str = N if isinstance(config_files, str): config_files = [config_files] for config_file in config_files: - config_file = os.path.join("sentence_encoders/evaluate/config", config_file) + config_file = os.path.join("olfmlm/evaluate/config", config_file) with open(config_file) as fd: log.info("Loading config from %s", config_file) config_string += fd.read() diff --git a/idf.py b/idf.py index deb86b7..c37375b 100644 --- a/idf.py +++ b/idf.py @@ -2,7 +2,7 @@ Script to calculate the inverse document frequency (idf) used in tf-idf labels of a dataset. 
""" -from sentence_encoders import data_utils +from olfmlm import data_utils import numpy as np from math import ceil, log from multiprocessing import Pool diff --git a/model/model.py b/model/model.py index e279994..95bc098 100755 --- a/model/model.py +++ b/model/model.py @@ -17,10 +17,10 @@ import torch -from sentence_encoders.model.modeling import BertConfig -from sentence_encoders.model.modeling import BertLayerNorm +from olfmlm.model.modeling import BertConfig +from olfmlm.model.modeling import BertLayerNorm -from sentence_encoders.model.new_models import Bert +from olfmlm.model.new_models import Bert def get_params_for_weight_decay_optimization(module): diff --git a/model/modeling.py b/model/modeling.py index 42ffa7d..1a2f530 100644 --- a/model/modeling.py +++ b/model/modeling.py @@ -34,7 +34,7 @@ from torch.utils.checkpoint import checkpoint -from sentence_encoders.data_utils.file_utils import cached_path +from olfmlm.data_utils.file_utils import cached_path logger = logging.getLogger(__name__) diff --git a/model/new_models.py b/model/new_models.py index 72f8e7e..10f987c 100644 --- a/model/new_models.py +++ b/model/new_models.py @@ -1,7 +1,7 @@ import numpy as np import torch from torch.nn import CrossEntropyLoss -from sentence_encoders.model.modeling import * +from olfmlm.model.modeling import * class BertSentHead(nn.Module): def __init__(self, config, num_classes=2): diff --git a/optim/__init__.py b/optim/__init__.py index 668b11e..e3c9c74 100755 --- a/optim/__init__.py +++ b/optim/__init__.py @@ -23,4 +23,4 @@ from torch.optim import RMSprop from torch.optim import Optimizer from torch.optim import LBFGS -from sentence_encoders.optim.adam import Adam +from olfmlm.optim.adam import Adam diff --git a/pretrain_bert.py b/pretrain_bert.py index 5068ee9..45cc3d0 100755 --- a/pretrain_bert.py +++ b/pretrain_bert.py @@ -24,16 +24,16 @@ import psutil import torch -from sentence_encoders.arguments import get_args -from sentence_encoders.configure_data import configure_data -from sentence_encoders.learning_rates import AnnealingLR -from sentence_encoders.model import BertModel -from sentence_encoders.model import get_params_for_weight_decay_optimization -from sentence_encoders.model import DistributedDataParallel as DDP -from sentence_encoders.optim import Adam -from sentence_encoders.utils import Timers -from sentence_encoders.utils import save_checkpoint -from sentence_encoders.utils import load_checkpoint +from olfmlm.arguments import get_args +from olfmlm.configure_data import configure_data +from olfmlm.learning_rates import AnnealingLR +from olfmlm.model import BertModel +from olfmlm.model import get_params_for_weight_decay_optimization +from olfmlm.model import DistributedDataParallel as DDP +from olfmlm.optim import Adam +from olfmlm.utils import Timers +from olfmlm.utils import save_checkpoint +from olfmlm.utils import load_checkpoint def get_model(tokenizer, args): diff --git a/scripts/pretrain_bert.sh b/scripts/pretrain_bert.sh index 9d92f9d..156d756 100755 --- a/scripts/pretrain_bert.sh +++ b/scripts/pretrain_bert.sh @@ -3,4 +3,4 @@ RANK=0 WORLD_SIZE=1 -python3 -m sentence_encoders.pretrain_bert "$@" +python3 -m olfmlm.pretrain_bert "$@" diff --git a/scripts/pretrain_bert_distributed.sh b/scripts/pretrain_bert_distributed.sh index b165337..27b944d 100755 --- a/scripts/pretrain_bert_distributed.sh +++ b/scripts/pretrain_bert_distributed.sh @@ -9,7 +9,7 @@ NODE_RANK=0 DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR 
--master_port $MASTER_PORT" -python -m torch.distributed.launch $DISTRIBUTED_ARGS sentence_encoders/pretrain_bert.py "$@" +python -m torch.distributed.launch $DISTRIBUTED_ARGS olfmlm/pretrain_bert.py "$@"
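
For a concrete sense of the renamed entry points, here is a short usage sketch based on the README hunk above, assuming the `mlm` model type (any other supported mode name works the same way):

```bash
# Pretrain with the default mlm objective (single node).
bash olfmlm/scripts/pretrain_bert.sh --model-type mlm

# Multi-GPU pretraining uses the distributed wrapper instead.
bash olfmlm/scripts/pretrain_bert_distributed.sh --model-type mlm

# After converting the saved state dict (see convert_state_dict.py),
# run evaluation; the experiment name matches the model type.
python3 -m olfmlm.evaluate.main --exp_name mlm
```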