add README introduction

mindspore-lab · Dec 6, 2022 · 602665e · 602665e
1 parent 0a338b3
commit 602665e
Show file tree

Hide file tree

Showing 4 changed files with 162 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -27,13 +27,15 @@
 
 ## Introduction
 
-MindNLP is an open source NLP library based on MindSpore.
+MindNLP is an open source NLP library based on MindSpore. It provides a platform for solving natural language processing tasks and includes many common NLP approaches. It helps researchers and developers construct and train models more conveniently and rapidly.
 
 The master branch works with **MindSpore master**.
 
 ### Major Features
 
-- feature1: ...
+- **Comprehensive data processing**: Several classical NLP datasets, such as Multi30k, SQuAD, and CoNLL, are packaged into friendly modules for easy use.
+- **Friendly NLP model toolset**: MindNLP provides various configurable components, making it easy to customize models.
+- **Easy-to-use engine**: MindNLP simplifies the complicated training process in MindSpore. It provides Trainer and Evaluator interfaces to train and evaluate models easily.
 
 ## Quick Links
 

diff --git a/examples/sentiment_classification.py b/examples/sentiment_classification.py
@@ -72,8 +72,12 @@ def construct(self, text):
 lr = 0.001
 
 # load datasets
-imdb_train, imdb_test = load('imdb', shuffle=True)
-print(imdb_train.get_col_names())
+imdb_train, imdb_test = load('imdb')
+embedding, vocab = Glove.from_pretrained('6B', 100, special_tokens=["<unk>", "<pad>"], dropout=drop)
+
+lookup_op = ds.text.Lookup(vocab, unknown_token='<unk>')
+pad_op = ds.transforms.PadEnd([500], pad_value=vocab.tokens_to_ids('<pad>'))
+type_cast_op = ds.transforms.TypeCast(ms.float32)
 
 embedding, vocab = Glove.from_pretrained('6B', 100, special_tokens=["<unk>", "<pad>"], dropout=drop)
 tokenizer = BasicTokenizer(True)
@@ -86,9 +90,9 @@ def construct(self, text):
                      dropout=drop, bidirectional=bidirectional)
 sentiment_encoder = RNNEncoder(embedding, lstm_layer)
 sentiment_head = Head(hidden_size, output_size, drop)
-
 net = SentimentClassification(sentiment_encoder, sentiment_head)
-loss = nn.BCELoss(reduction='mean')
+
+loss = nn.BCEWithLogitsLoss(reduction='mean')
 optimizer = nn.Adam(net.trainable_params(), learning_rate=lr)
 
 # define metrics
@@ -98,4 +102,4 @@ def construct(self, text):
 trainer = Trainer(network=net, train_dataset=imdb_train, eval_dataset=imdb_valid, metrics=metric,
                   epochs=5, loss_fn=loss, optimizer=optimizer)
 trainer.run(tgt_columns="label", jit=False)
-print("end train")
+print("end train")
diff --git a/mindnlp/abc/pretrained_model.py b/mindnlp/abc/pretrained_model.py
@@ -0,0 +1,148 @@
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Abstract class for Pretrained models.
+"""
+import json
+import os
+from typing import Union, Optional
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore import nn
+
+class PretrainedConfig:
+    """
+    Abstract class for Pretrained models config.
+    """
+    def __init__(self, **kwargs):
+        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.output_attentions = kwargs.pop('output_attentions', False)
+        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
+
+    @classmethod
+    def from_json(cls, file_path):
+        """load config from json."""
+        with open(file_path, "r", encoding="utf-8") as file:
+            text = file.read()
+        config_map = json.loads(text)
+        config = cls()
+        for key, value in config_map.items():
+            setattr(config, key, value)
+        return config
+
+    @classmethod
+    def load(cls, pretrained_model_name_or_path):
+        """load config."""
+        if os.path.exists(pretrained_model_name_or_path):
+            config_file = pretrained_model_name_or_path
+        else:
+            raise ValueError(
+                f"unable to parse {pretrained_model_name_or_path} as a local path or model name")
+
+        config = cls.from_json(config_file)
+
+        return config
+
+class PretrainedModel(nn.Cell):
+    """
+    Abstract class for Pretrained models
+    """
+    config_class = None
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+    def init_model_weights(self):
+        """
+        initialize model weights.
+        """
+        raise NotImplementedError
+
+    def get_input_embeddings(self) -> "nn.Cell":
+        """
+        Returns the model's input embeddings.
+
+        Returns:
+            :obj:`nn.Cell`: A mindspore cell mapping vocabulary to hidden states.
+        """
+        raise NotImplementedError
+
+    def set_input_embeddings(self, value: "nn.Cell"):
+        """
+        Set model's input embeddings.
+
+        Args:
+            value (:obj:`nn.Cell`): A mindspore cell mapping vocabulary to hidden states.
+        """
+        raise NotImplementedError
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        resize the model position embeddings if necessary
+        """
+        raise NotImplementedError(
+            f"`resize_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should "
+            f"overwrite this method in the class {self.__class__}"
+        )
+
+    def get_position_embeddings(self):
+        """
+        get the model position embeddings if necessary
+        """
+        raise NotImplementedError(
+            f"`get_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should "
+            f"overwrite this method in the class {self.__class__}"
+        )
+
+    def save(self, save_dir: Union[str, os.PathLike]):
+        "save pretrain model"
+        raise NotImplementedError
+
+    @classmethod
+    def load(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+             *args, **kwargs):
+        """
+        Load a pre-trained checkpoint from a pre-trained model file or url,
+        download and cache the pre-trained model file if model name in model list.
+
+        Params:
+            pretrained_model_name_or_path:
+        """
+        config = kwargs.pop("config", None)
+        # load config
+        if not isinstance(config, PretrainedConfig):
+            config_path = config if config is not None else pretrained_model_name_or_path
+            config = cls.config_class.load(config_path)
+        model = cls(config, *args, **kwargs)
+        if os.path.exists(pretrained_model_name_or_path):
+            # File exists.
+            model_file = os.path.join(pretrained_model_name_or_path)
+            assert os.path.isfile(model_file)
+        else:
+            # Something unknown
+            raise ValueError(
+                f"unable to parse {pretrained_model_name_or_path} as a local path or model name")
+        # load ckpt
+        try:
+            param_dict = load_checkpoint(model_file)
+        except Exception as exc:
+            raise ValueError(f"File {model_file} is not a checkpoint file, "
+                             f"please check the path.") from exc
+
+        param_not_load = load_param_into_net(model, param_dict)
+        if len(param_not_load) == len(model.trainable_params()):
+            raise KeyError(f"The following weights in model are not found: {param_not_load}")
+
+        return model
diff --git a/mindnlp/modules/embeddings/glove_embedding.py b/mindnlp/modules/embeddings/glove_embedding.py
@@ -74,7 +74,7 @@ def __init__(self, vocab: Vocab, init_embed, requires_grad: bool = True, dropout
 
     @classmethod
     def from_pretrained(cls, name='6B', dims=300, root=DEFAULT_ROOT,
-                        special_tokens=("<pad>", "<unk>"), special_first=True, **kwargs):
+                        special_tokens=("<pad>", "<unk>"), special_first=False, **kwargs):
         r"""
         Creates Embedding instance from given 2-dimensional FloatTensor.