Skip to content

Commit

Permalink
add README introduction
Browse files Browse the repository at this point in the history
  • Loading branch information
WarruzuEndo committed Dec 6, 2022
1 parent 0a338b3 commit 602665e
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 8 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@

## Introduction

MindNLP is an open source NLP library based on MindSpore.
MindNLP is an open source NLP library based on MindSpore. It provides a platform for solving natural language processing tasks and includes many common NLP approaches. It helps researchers and developers construct and train models more conveniently and rapidly.

The master branch works with **MindSpore master**.

### Major Features

- feature1: ...
- **Comprehensive data processing**: Several classical NLP datasets, such as Multi30k, SQuAD, and CoNLL, are packaged into user-friendly modules for easy use.
- **Friendly NLP model toolset**: MindNLP provides various configurable components, making it easy to customize models.
- **Easy-to-use engine**: MindNLP simplifies the complicated training process in MindSpore. It provides Trainer and Evaluator interfaces to train and evaluate models easily.

## Quick Links

Expand Down
14 changes: 9 additions & 5 deletions examples/sentiment_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,12 @@ def construct(self, text):
lr = 0.001

# load datasets
imdb_train, imdb_test = load('imdb', shuffle=True)
print(imdb_train.get_col_names())
imdb_train, imdb_test = load('imdb')
embedding, vocab = Glove.from_pretrained('6B', 100, special_tokens=["<unk>", "<pad>"], dropout=drop)

lookup_op = ds.text.Lookup(vocab, unknown_token='<unk>')
pad_op = ds.transforms.PadEnd([500], pad_value=vocab.tokens_to_ids('<pad>'))
type_cast_op = ds.transforms.TypeCast(ms.float32)

embedding, vocab = Glove.from_pretrained('6B', 100, special_tokens=["<unk>", "<pad>"], dropout=drop)
tokenizer = BasicTokenizer(True)
Expand All @@ -86,9 +90,9 @@ def construct(self, text):
dropout=drop, bidirectional=bidirectional)
sentiment_encoder = RNNEncoder(embedding, lstm_layer)
sentiment_head = Head(hidden_size, output_size, drop)

net = SentimentClassification(sentiment_encoder, sentiment_head)
loss = nn.BCELoss(reduction='mean')

loss = nn.BCEWithLogitsLoss(reduction='mean')
optimizer = nn.Adam(net.trainable_params(), learning_rate=lr)

# define metrics
Expand All @@ -98,4 +102,4 @@ def construct(self, text):
trainer = Trainer(network=net, train_dataset=imdb_train, eval_dataset=imdb_valid, metrics=metric,
epochs=5, loss_fn=loss, optimizer=optimizer)
trainer.run(tgt_columns="label", jit=False)
print("end train")
print("end train")
148 changes: 148 additions & 0 deletions mindnlp/abc/pretrained_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Abstract class for Pretrained models.
"""
import json
import os
from typing import Union, Optional
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore import nn

class PretrainedConfig:
    """
    Abstract class for Pretrained models config.

    Stores the options shared by pretrained model configurations and offers
    class-method constructors that build a config from a JSON file.
    """

    def __init__(self, **kwargs):
        """
        Initialize the shared options.

        Args:
            **kwargs: Optional settings. The keys consumed here are
                `finetuning_task` (default None), `num_labels` (default 2),
                `output_attentions` (default False) and
                `output_hidden_states` (default False). Any other key is
                not stored by this base class.
        """
        self.finetuning_task = kwargs.pop('finetuning_task', None)
        self.num_labels = kwargs.pop('num_labels', 2)
        self.output_attentions = kwargs.pop('output_attentions', False)
        self.output_hidden_states = kwargs.pop('output_hidden_states', False)

    @classmethod
    def from_json(cls, file_path):
        """
        Load a config from a JSON file.

        Every top-level key of the JSON object is set as an attribute on a
        default-constructed instance of `cls`, overriding the defaults.

        Args:
            file_path: Path of a UTF-8 encoded JSON file whose root is an object.

        Returns:
            An instance of `cls` populated from the file.

        Raises:
            ValueError: If the file is not valid JSON or its root is not an object.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            config_map = json.load(file)

        # A list/scalar root would otherwise fail later with an unrelated
        # AttributeError on `.items()`; report the real problem instead.
        if not isinstance(config_map, dict):
            raise ValueError(
                f"config file {file_path} must contain a JSON object, "
                f"got {type(config_map).__name__}")

        config = cls()
        for key, value in config_map.items():
            setattr(config, key, value)
        return config

    @classmethod
    def load(cls, pretrained_model_name_or_path):
        """
        Load a config from a local path.

        Args:
            pretrained_model_name_or_path: Path of an existing JSON config file.

        Returns:
            An instance of `cls` populated from the file.

        Raises:
            ValueError: If the path does not exist.
        """
        if not os.path.exists(pretrained_model_name_or_path):
            raise ValueError(
                f"unable to parse {pretrained_model_name_or_path} as a local path or model name")

        return cls.from_json(pretrained_model_name_or_path)

class PretrainedModel(nn.Cell):
    """
    Abstract class for Pretrained models.

    Subclasses set `config_class` to their config type and override the
    methods below that raise NotImplementedError.
    """
    # Config type used by `load` when no ready-made config object is supplied.
    config_class = None

    def __init__(self, config):
        """
        Args:
            config: Configuration object kept on the instance as `self.config`.
        """
        super().__init__()
        self.config = config

    def init_model_weights(self):
        """
        initialize model weights.
        """
        raise NotImplementedError

    def get_input_embeddings(self) -> "nn.Cell":
        """
        Returns the model's input embeddings.
        Returns:
            :obj:`nn.Cell`: A mindspore cell mapping vocabulary to hidden states.
        """
        raise NotImplementedError

    def set_input_embeddings(self, value: "nn.Cell"):
        """
        Set model's input embeddings.
        Args:
            value (:obj:`nn.Cell`): A mindspore cell mapping vocabulary to hidden states.
        """
        raise NotImplementedError

    def resize_position_embeddings(self, new_num_position_embeddings: int):
        """
        resize the model position embeddings if necessary
        """
        raise NotImplementedError(
            f"`resize_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should "
            f"overwrite this method in the class {self.__class__}"
        )

    def get_position_embeddings(self):
        """
        get the model position embeddings if necessary
        """
        raise NotImplementedError(
            f"`get_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should "
            f"overwrite this method in the class {self.__class__}"
        )

    def save(self, save_dir: Union[str, os.PathLike]):
        "save pretrain model"
        raise NotImplementedError

    @classmethod
    def load(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
             *args, **kwargs):
        """
        Build a model and load a local pre-trained checkpoint into it.

        Args:
            pretrained_model_name_or_path: Path of an existing checkpoint file.
            *args: Extra positional arguments forwarded to the model constructor.
            **kwargs: Extra keyword arguments forwarded to the model constructor.
                The optional `config` key is consumed here: a
                `PretrainedConfig` instance is used as is; any other non-None
                value is treated as a path handed to `config_class.load`.
                When `config` is absent, `pretrained_model_name_or_path`
                itself is handed to `config_class.load`.

        Returns:
            The model instance with the checkpoint parameters loaded.

        Raises:
            ValueError: If the path does not exist, is not a regular file, or
                cannot be read as a checkpoint.
            KeyError: If the number of parameters that could not be loaded
                equals the number of trainable parameters of the model.
        """
        config = kwargs.pop("config", None)
        # load config
        if not isinstance(config, PretrainedConfig):
            config_path = config if config is not None else pretrained_model_name_or_path
            config = cls.config_class.load(config_path)
        model = cls(config, *args, **kwargs)

        if not os.path.exists(pretrained_model_name_or_path):
            # Something unknown
            raise ValueError(
                f"unable to parse {pretrained_model_name_or_path} as a local path or model name")

        # File exists.
        model_file = os.fspath(pretrained_model_name_or_path)
        # Explicit check instead of `assert`: assertions are removed under
        # `python -O`, which would let a directory path slip through.
        if not os.path.isfile(model_file):
            raise ValueError(f"{model_file} is not a file, please check the path.")

        # load ckpt
        try:
            param_dict = load_checkpoint(model_file)
        except Exception as exc:
            raise ValueError(f"File {model_file} is not a checkpoint file, "
                             f"please check the path.") from exc

        load_result = load_param_into_net(model, param_dict)
        # NOTE(review): newer MindSpore releases document the return value as a
        # (param_not_load, ckpt_not_load) pair rather than a single list -
        # confirm against the targeted version. Accept both shapes.
        param_not_load = load_result[0] if isinstance(load_result, tuple) else load_result
        if len(param_not_load) == len(model.trainable_params()):
            raise KeyError(f"The following weights in model are not found: {param_not_load}")

        return model
2 changes: 1 addition & 1 deletion mindnlp/modules/embeddings/glove_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def __init__(self, vocab: Vocab, init_embed, requires_grad: bool = True, dropout

@classmethod
def from_pretrained(cls, name='6B', dims=300, root=DEFAULT_ROOT,
special_tokens=("<pad>", "<unk>"), special_first=True, **kwargs):
special_tokens=("<pad>", "<unk>"), special_first=False, **kwargs):
r"""
Creates Embedding instance from given 2-dimensional FloatTensor.
Expand Down

0 comments on commit 602665e

Please sign in to comment.