
changes required for repo name change
StephAO committed Sep 23, 2020
1 parent 6cc3852 commit 50d42fc
Showing 18 changed files with 37 additions and 37 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -42,19 +42,19 @@ For tf_idf prediction, you need to first calculate the idf score for your dataset

## Pre-training
To run pretraining:
-`bash sentence_encoders/scripts/pretrain_bert.sh --model-type [model type]`
+`bash olfmlm/scripts/pretrain_bert.sh --model-type [model type]`
Where model type is the name of the model you want to train. If the model type is one of the specified modes, it will train using mlm and that mode (if the model type is mlm, it will train using just mlm).
The --modes argument will override this default behaviour. If the model type is not a specified mode, the --modes argument is required.
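For example, to pretrain with just the masked language modelling objective (using the mlm model type mentioned above):
`bash olfmlm/scripts/pretrain_bert.sh --model-type mlm`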

## Distributed Pretraining
Use pretrain_bert_distributed.sh instead.
-`bash sentence_encoders/scripts/pretrain_bert_distributed.sh --model-type [model type]`
+`bash olfmlm/scripts/pretrain_bert_distributed.sh --model-type [model type]`
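For example, the same mlm run launched in distributed mode:
`bash olfmlm/scripts/pretrain_bert_distributed.sh --model-type mlm`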

## Evaluation
To run evaluation:
First, convert the saved state dict of the required model using convert_state_dict.py.
Then run:
-`python3 -m sentence_encoders.evaluate.main --exp_name [experiment name]`
+`python3 -m olfmlm.evaluate.main --exp_name [experiment name]`
Where the experiment name is the same as the model type above. If using a saved checkpoint instead of the best model, use the --checkpoint argument.
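For example, to evaluate the mlm model pretrained above:
`python3 -m olfmlm.evaluate.main --exp_name mlm`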


2 changes: 1 addition & 1 deletion arguments.py
@@ -19,7 +19,7 @@
import os
import re
import torch
-from sentence_encoders.paths import bert_config_file, pretrained_path
+from olfmlm.paths import bert_config_file, pretrained_path

def str2bool(v):
    if isinstance(v, bool):
2 changes: 1 addition & 1 deletion configure_data.py
@@ -17,7 +17,7 @@

import copy
import torch
-from sentence_encoders import data_utils
+from olfmlm import data_utils

class DataConfig:

10 changes: 5 additions & 5 deletions data_utils/__init__.py
@@ -16,12 +16,12 @@
import os
import math

-from sentence_encoders.data_utils.samplers import DistributedBatchSampler
-from sentence_encoders.data_utils.datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, \
+from olfmlm.data_utils.samplers import DistributedBatchSampler
+from olfmlm.data_utils.datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, \
     bert_dataset
-from sentence_encoders.data_utils.lazy_loader import exists_lazy, make_lazy, lazy_array_loader
-from sentence_encoders.data_utils.tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, make_tokenizer
-import sentence_encoders.data_utils.corpora
+from olfmlm.data_utils.lazy_loader import exists_lazy, make_lazy, lazy_array_loader
+from olfmlm.data_utils.tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, make_tokenizer
+import olfmlm.data_utils.corpora

TRAIN_DATA = 0
VAL_DATA = 1
4 changes: 2 additions & 2 deletions data_utils/corpora.py
@@ -13,9 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""several datasets with preset arguments"""
-from sentence_encoders.data_utils.datasets import json_dataset, csv_dataset
+from olfmlm.data_utils.datasets import json_dataset, csv_dataset

-from sentence_encoders.paths import train_data_path
+from olfmlm.paths import train_data_path
import os

class wikipedia(json_dataset):
4 changes: 2 additions & 2 deletions data_utils/datasets.py
@@ -38,8 +38,8 @@
nltk.download('punkt')
from nltk import tokenize

-from sentence_encoders.data_utils.lazy_loader import lazy_array_loader, exists_lazy, make_lazy
-from sentence_encoders.data_utils.tokenization import Tokenization
+from olfmlm.data_utils.lazy_loader import lazy_array_loader, exists_lazy, make_lazy
+from olfmlm.data_utils.tokenization import Tokenization

def clean_tokens(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
2 changes: 1 addition & 1 deletion data_utils/make_dataset.py
@@ -12,7 +12,7 @@
"""


-from sentence_encoders import data_utils
+from olfmlm import data_utils
from multiprocessing import Pool
from blingfire import text_to_sentences
from torch.utils import data
2 changes: 1 addition & 1 deletion data_utils/tokenization.py
@@ -23,7 +23,7 @@
from nltk import tokenize as nltk_tokenize
import sentencepiece as spm

-from sentence_encoders.data_utils.wordpiece import BertTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP
+from olfmlm.data_utils.wordpiece import BertTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP

def make_tokenizer(tokenizer_type, corpus, model_path=None, vocab_size=None, model_type='bpe', pad_token=0, character_coverage=1.0, command_tokens=None, type_tokens=None, **kwargs):
    """
2 changes: 1 addition & 1 deletion data_utils/wordpiece.py
@@ -22,7 +22,7 @@
import unicodedata
from io import open

-from sentence_encoders.data_utils.file_utils import cached_path
+from olfmlm.data_utils.file_utils import cached_path

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion evaluate/utils/config.py
@@ -121,7 +121,7 @@ def params_from_file(config_files: Union[str, Iterable[str]], overrides: str = N
    if isinstance(config_files, str):
        config_files = [config_files]
    for config_file in config_files:
-        config_file = os.path.join("sentence_encoders/evaluate/config", config_file)
+        config_file = os.path.join("olfmlm/evaluate/config", config_file)
        with open(config_file) as fd:
            log.info("Loading config from %s", config_file)
            config_string += fd.read()
2 changes: 1 addition & 1 deletion idf.py
@@ -2,7 +2,7 @@
Script to calculate the inverse document frequency (idf) used in tf-idf labels of a dataset.
"""

-from sentence_encoders import data_utils
+from olfmlm import data_utils
import numpy as np
from math import ceil, log
from multiprocessing import Pool
6 changes: 3 additions & 3 deletions model/model.py
@@ -17,10 +17,10 @@

import torch

-from sentence_encoders.model.modeling import BertConfig
-from sentence_encoders.model.modeling import BertLayerNorm
+from olfmlm.model.modeling import BertConfig
+from olfmlm.model.modeling import BertLayerNorm

-from sentence_encoders.model.new_models import Bert
+from olfmlm.model.new_models import Bert

def get_params_for_weight_decay_optimization(module):

2 changes: 1 addition & 1 deletion model/modeling.py
@@ -34,7 +34,7 @@

from torch.utils.checkpoint import checkpoint

-from sentence_encoders.data_utils.file_utils import cached_path
+from olfmlm.data_utils.file_utils import cached_path

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion model/new_models.py
@@ -1,7 +1,7 @@
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
-from sentence_encoders.model.modeling import *
+from olfmlm.model.modeling import *

class BertSentHead(nn.Module):
    def __init__(self, config, num_classes=2):
2 changes: 1 addition & 1 deletion optim/__init__.py
@@ -23,4 +23,4 @@
from torch.optim import RMSprop
from torch.optim import Optimizer
from torch.optim import LBFGS
-from sentence_encoders.optim.adam import Adam
+from olfmlm.optim.adam import Adam
20 changes: 10 additions & 10 deletions pretrain_bert.py
@@ -24,16 +24,16 @@
import psutil
import torch

-from sentence_encoders.arguments import get_args
-from sentence_encoders.configure_data import configure_data
-from sentence_encoders.learning_rates import AnnealingLR
-from sentence_encoders.model import BertModel
-from sentence_encoders.model import get_params_for_weight_decay_optimization
-from sentence_encoders.model import DistributedDataParallel as DDP
-from sentence_encoders.optim import Adam
-from sentence_encoders.utils import Timers
-from sentence_encoders.utils import save_checkpoint
-from sentence_encoders.utils import load_checkpoint
+from olfmlm.arguments import get_args
+from olfmlm.configure_data import configure_data
+from olfmlm.learning_rates import AnnealingLR
+from olfmlm.model import BertModel
+from olfmlm.model import get_params_for_weight_decay_optimization
+from olfmlm.model import DistributedDataParallel as DDP
+from olfmlm.optim import Adam
+from olfmlm.utils import Timers
+from olfmlm.utils import save_checkpoint
+from olfmlm.utils import load_checkpoint


def get_model(tokenizer, args):
2 changes: 1 addition & 1 deletion scripts/pretrain_bert.sh
@@ -3,4 +3,4 @@
RANK=0
WORLD_SIZE=1

-python3 -m sentence_encoders.pretrain_bert "$@"
+python3 -m olfmlm.pretrain_bert "$@"
2 changes: 1 addition & 1 deletion scripts/pretrain_bert_distributed.sh
@@ -9,7 +9,7 @@ NODE_RANK=0

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

-python -m torch.distributed.launch $DISTRIBUTED_ARGS sentence_encoders/pretrain_bert.py "$@"
+python -m torch.distributed.launch $DISTRIBUTED_ARGS olfmlm/pretrain_bert.py "$@"



