From abb6036e444fdbd5cdf969b93619b2d35eec1897 Mon Sep 17 00:00:00 2001 From: Zhaocheng Zhu Date: Fri, 11 Oct 2019 21:51:29 -0400 Subject: [PATCH] release v0.2.0 --fixup --- CHANGELOG.md | 34 + README.md | 75 +- conda/graphvite-mini/meta.yaml | 3 +- conda/graphvite/meta.yaml | 3 +- conda/requirements.txt | 1 + config/demo/math.yaml | 40 + config/{ => demo}/quick_start.yaml | 15 +- config/graph/deepwalk_flickr.yaml | 2 +- config/graph/deepwalk_friendster-small.yaml | 2 +- config/graph/deepwalk_friendster.yaml | 2 +- config/graph/deepwalk_youtube.yaml | 2 +- config/graph/line_flickr.yaml | 2 +- config/graph/line_friendster-small.yaml | 2 +- config/graph/line_friendster.yaml | 2 +- config/graph/line_youtube.yaml | 2 +- config/graph/node2vec_youtube.yaml | 2 +- config/knowledge_graph/complex_fb15k-237.yaml | 4 +- config/knowledge_graph/complex_fb15k.yaml | 6 +- config/knowledge_graph/complex_wn18.yaml | 6 +- config/knowledge_graph/complex_wn18rr.yaml | 6 +- .../knowledge_graph/distmult_fb15k-237.yaml | 6 +- config/knowledge_graph/distmult_fb15k.yaml | 4 +- config/knowledge_graph/distmult_wn18.yaml | 4 +- config/knowledge_graph/distmult_wn18rr.yaml | 4 +- config/knowledge_graph/rotate_fb15k-237.yaml | 4 +- config/knowledge_graph/rotate_fb15k.yaml | 4 +- config/knowledge_graph/rotate_wn18.yaml | 6 +- config/knowledge_graph/rotate_wn18rr.yaml | 4 +- config/knowledge_graph/simple_fb15k-237.yaml | 4 +- config/knowledge_graph/simple_fb15k.yaml | 4 +- config/knowledge_graph/simple_wn18.yaml | 2 +- config/knowledge_graph/simple_wn18rr.yaml | 2 +- config/knowledge_graph/transe_fb15k-237.yaml | 4 +- config/knowledge_graph/transe_fb15k.yaml | 4 +- config/knowledge_graph/transe_wn18.yaml | 4 +- config/knowledge_graph/transe_wn18rr.yaml | 4 +- config/template/graph.yaml | 105 + config/template/knowledge_graph.yaml | 107 + config/template/visualization.yaml | 129 + config/template/word_graph.yaml | 77 + config/word_graph/line_wikipedia.yaml | 4 +- doc/source/benchmark.rst | 104 +- doc/source/developer/framework.rst | 9 +- doc/source/developer/model.rst | 127 +- doc/source/developer/routine.rst | 89 + doc/source/developer/solver.rst | 14 +- doc/source/faq.rst | 26 +- doc/source/index.rst | 2 + doc/source/install.rst | 46 +- doc/source/introduction.rst | 36 +- doc/source/quick_start.rst | 47 +- doc/source/user/auto.rst | 2 +- doc/source/user/command_line.rst | 41 +- doc/source/user/configuration.rst | 81 +- doc/source/user/format.rst | 60 + doc/source/user/python.rst | 14 +- include/base/alias_table.cuh | 15 +- include/base/memory.h | 59 +- include/base/vector.h | 16 +- include/bind.h | 40 +- include/core/graph.h | 3 +- include/core/optimizer.h | 26 +- include/core/solver.h | 588 ++++- include/gpu/knowledge_graph.cuh | 2266 ----------------- include/{ => instance}/gpu/graph.cuh | 250 +- include/instance/gpu/knowledge_graph.cuh | 355 +++ include/{ => instance}/gpu/visualization.cuh | 208 +- include/instance/graph.cuh | 110 +- include/instance/knowledge_graph.cuh | 125 +- include/instance/model/graph.h | 108 + include/instance/model/knowledge_graph.h | 547 ++++ include/instance/model/visualization.h | 88 + include/instance/visualization.cuh | 83 +- include/util/common.h | 2 +- include/util/gpu.cuh | 23 +- include/util/io.h | 16 +- include/util/math.h | 77 + python/graphvite/application/application.py | 511 +++- python/graphvite/application/network.py | 17 +- python/graphvite/base.py | 5 +- python/graphvite/cmd.py | 80 +- python/graphvite/dataset.py | 194 +- python/graphvite/util.py | 77 + python/setup.py | 2 +- 
src/CMakeLists.txt | 2 +- src/graphvite.cu | 11 +- 86 files changed, 4004 insertions(+), 3295 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 config/demo/math.yaml rename config/{ => demo}/quick_start.yaml (67%) create mode 100644 config/template/graph.yaml create mode 100644 config/template/knowledge_graph.yaml create mode 100644 config/template/visualization.yaml create mode 100644 config/template/word_graph.yaml create mode 100644 doc/source/developer/routine.rst create mode 100644 doc/source/user/format.rst delete mode 100644 include/gpu/knowledge_graph.cuh rename include/{ => instance}/gpu/graph.cuh (55%) create mode 100644 include/instance/gpu/knowledge_graph.cuh rename include/{ => instance}/gpu/visualization.cuh (57%) create mode 100644 include/instance/model/graph.h create mode 100644 include/instance/model/knowledge_graph.h create mode 100644 include/instance/model/visualization.h create mode 100644 include/util/math.h diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..6459956 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,34 @@ +Change log +========== + +Here we list all notable changes to the GraphVite library. + +v0.2.0 - 2019-10-11 +------------------- +- Add scalable multi-GPU prediction for node embedding and knowledge graph embedding. + Evaluation on link prediction is 4.6x faster than v0.1.0. +- New demo dataset `math` and entity prediction evaluation for knowledge graph. +- Support Kepler and Turing GPU architectures. +- Automatically choose the best episode size with regard to the RAM limit. +- Add template config files for applications. +- Change the update of global embeddings from average to accumulation. Fix a serious + numeric problem in the update. +- Move file format settings from graph to application. Now one can customize formats + and use comments in evaluation files. Add documentation for the data format. +- Separate GPU implementation into training routines and models. Routines are in + `include/instance/gpu/*` and models are in `include/instance/model/*`. + +v0.1.0 - 2019-08-05 +------------------- +- Multi-GPU training of large-scale graph embedding +- 3 applications: node embedding, knowledge graph embedding and graph & + high-dimensional data visualization +- Node embedding + - Model: DeepWalk, LINE, node2vec + - Evaluation: node classification, link prediction +- Knowledge graph embedding + - Model: TransE, DistMult, ComplEx, SimplE, RotatE + - Evaluation: link prediction +- Graph & High-dimensional data visualization + - Model: LargeVis + - Evaluation: visualization (2D / 3D), animation (3D), hierarchy (2D) \ No newline at end of file diff --git a/README.md b/README.md index 821b155..1e85166 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Here is a summary of the training time of GraphVite along with the best open-sou implementations on 3 applications. All the time is reported based on a server with 24 CPU threads and 4 V100 GPUs. -Node embedding on [Youtube] dataset. +Training time of node embedding on [Youtube] dataset. | Model | Existing Implementation | GraphVite | Speedup | |------------|-------------------------------|-----------|---------| @@ -50,12 +50,12 @@ Node embedding on [Youtube] dataset. [2]: https://github.com/tangjianpku/LINE [3]: https://github.com/aditya-grover/node2vec -Knowledge graph embedding on [FB15k] dataset. +Training / evaluation time of knowledge graph embedding on [FB15k] dataset. 
-| Model | Existing Implementation | GraphVite | Speedup | -|-----------------|-------------------------------|-----------|---------| -| [TransE] | [1.31 hrs (1 GPU)][3] | 14.8 mins | 5.30x | -| [RotatE] | [3.69 hrs (1 GPU)][4] | 27.0 mins | 8.22x | +| Model | Existing Implementation | GraphVite | Speedup | +|-----------------|-----------------------------------|--------------------|---------------| +| [TransE] | [1.31 hrs / 1.75 mins (1 GPU)][3] | 13.5 mins / 54.3 s | 5.82x / 1.93x | +| [RotatE] | [3.69 hrs / 4.19 mins (1 GPU)][4] | 28.1 mins / 55.8 s | 7.88x / 4.50x | [FB15k]: http://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data.pdf [TransE]: http://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data.pdf @@ -63,11 +63,11 @@ Knowledge graph embedding on [FB15k] dataset. [3]: https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding [4]: https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding -High-dimensional data visualization on [MNIST] dataset. +Training time of high-dimensional data visualization on [MNIST] dataset. | Model | Existing Implementation | GraphVite | Speedup | |--------------|-------------------------------|-----------|---------| -| [LargeVis] | [15.3 mins (CPU parallel)][5] | 15.1 s | 60.8x | +| [LargeVis] | [15.3 mins (CPU parallel)][5] | 13.9 s | 66.8x | [MNIST]: http://yann.lecun.com/exdb/publis/pdf/lecun-01a.pdf [LargeVis]: https://arxiv.org/pdf/1602.00370.pdf @@ -85,19 +85,15 @@ Installation ### From Conda ### -GraphVite can be installed through conda with only one line. - ```bash -conda install -c milagraph graphvite cudatoolkit=x.x +conda install -c milagraph graphvite cudatoolkit=$(nvcc -V | grep -Po "(?<=V)\d+.\d+") ``` -where `x.x` is your CUDA version, e.g. 9.2 or 10.0. - If you only need embedding training without evaluation, you can use the following alternative with minimal dependencies. ```bash -conda install -c milagraph graphvite-mini cudatoolkit=x.x +conda install -c milagraph graphvite-mini cudatoolkit=$(nvcc -V | grep -Po "(?<=V)\d+.\d+") ``` ### From Source ### @@ -113,6 +109,24 @@ cd build && cmake .. && make && cd - cd python && python setup.py install && cd - ``` +### On Colab ### + +```bash +!wget -c https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +!chmod +x Miniconda3-latest-Linux-x86_64.sh +!./Miniconda3-latest-Linux-x86_64.sh -b -p /usr/local -f + +!conda install -y -c milagraph -c conda-forge graphvite \ + python=3.6 cudatoolkit=$(nvcc -V | grep -Po "(?<=V)\d+\.\d+") +!conda install -y wurlitzer ipykernel +``` + +```python +import site +site.addsitedir("/usr/local/lib/python3.6/site-packages") +%reload_ext wurlitzer +``` + Quick Start ----------- @@ -126,10 +140,14 @@ Typically, the example takes no more than 1 minute. You will obtain some output ``` Batch id: 6000 -loss = 0.371641 +loss = 0.371041 + +------------- link prediction -------------- +AUC: 0.899933 -macro-F1@20%: 0.236794 -micro-F1@20%: 0.388110 +----------- node classification ------------ +macro-F1@20%: 0.242114 +micro-F1@20%: 0.391342 ``` Baseline Benchmark @@ -139,13 +157,30 @@ To reproduce a baseline benchmark, you only need to specify the keywords of the experiment. e.g. model and dataset. ```bash -graphvite baseline [keyword ...] [--no-eval] [--gpu n] [--cpu m] +graphvite baseline [keyword ...] [--no-eval] [--gpu n] [--cpu m] [--epoch e] ``` You may also set the number of GPUs and the number of CPUs per GPU. Use ``graphvite list`` to get a list of available baselines. 
+Custom Experiment +----------------- + +Create a YAML configuration scaffold for graph, knowledge graph, visualization or +word graph. + +```bash +graphvite new [application ...] [--file f] +``` + +Fill in the necessary entries in the configuration following the instructions. You +can then run the configuration with + +```bash +graphvite run [config] [--no-eval] [--gpu n] [--cpu m] [--epoch e] +``` + High-dimensional Data Visualization ----------------------------------- @@ -156,8 +191,8 @@ GraphVite. graphvite visualize [file] [--label label_file] [--save save_file] [--perplexity n] [--3d] ``` -The file can be either in numpy dump or text format. For the save file, we recommend -to use a `png` format, while `pdf` is also supported. +The file can be either a numpy dump `*.npy` or a text matrix `*.txt`. For the save +file, we recommend using the `png` format, while `pdf` is also supported. Contributing ------------ diff --git a/conda/graphvite-mini/meta.yaml b/conda/graphvite-mini/meta.yaml index cd3b6f3..f60e705 100644 --- a/conda/graphvite-mini/meta.yaml +++ b/conda/graphvite-mini/meta.yaml @@ -1,6 +1,6 @@ package: name: graphvite-mini - version: 0.1.0 + version: 0.2.0 source: path: ../.. @@ -39,6 +39,7 @@ requirements: - easydict - six - future + - psutil build: string: diff --git a/conda/graphvite/meta.yaml b/conda/graphvite/meta.yaml index 988202e..7fad22d 100644 --- a/conda/graphvite/meta.yaml +++ b/conda/graphvite/meta.yaml @@ -1,6 +1,6 @@ package: name: graphvite - version: 0.1.0 + version: 0.2.0 source: path: ../.. @@ -40,6 +40,7 @@ requirements: - six - future - imageio + - psutil - scipy - matplotlib - pytorch diff --git a/conda/requirements.txt b/conda/requirements.txt index cfdc208..087c2ac 100644 --- a/conda/requirements.txt +++ b/conda/requirements.txt @@ -17,6 +17,7 @@ conda-forge::easydict six future imageio +psutil scipy matplotlib pytorch diff --git a/config/demo/math.yaml b/config/demo/math.yaml new file mode 100644 index 0000000..c9b9144 --- /dev/null +++ b/config/demo/math.yaml @@ -0,0 +1,40 @@ +application: + knowledge graph + +resource: + gpus: [0] + cpu_per_gpu: 8 + dim: 512 + +graph: + file_name: + +build: + optimizer: + type: Adam + lr: 5.0e-3 + weight_decay: 0 + num_partition: auto + num_negative: 8 + batch_size: 100000 + episode_size: 100 + +train: + model: RotatE + num_epoch: 2000 + margin: 9 + sample_batch_size: 2000 + adversarial_temperature: 2 + log_frequency: 100 + +evaluate: + task: link prediction + file_name: + filter_files: + - + - + - + target: tail + +save: + file_name: rotate_math.pkl \ No newline at end of file diff --git a/config/quick_start.yaml b/config/demo/quick_start.yaml similarity index 67% rename from config/quick_start.yaml rename to config/demo/quick_start.yaml index d1b51e6..b84e5e7 100644 --- a/config/quick_start.yaml +++ b/config/demo/quick_start.yaml @@ -6,6 +6,10 @@ resource: cpu_per_gpu: 8 dim: 128 +format: + delimiters: " \t\r\n" + comment: "#" + graph: file_name: as_undirected: true @@ -30,10 +34,13 @@ train: log_frequency: 1000 evaluate: - task: node classification - file_name: - portions: [0.2] - times: 1 + - task: link prediction + file_name: + filter_file: + - task: node classification + file_name: + portions: [0.2] + times: 1 save: file_name: line_blogcatalog.pkl \ No newline at end of file diff --git a/config/graph/deepwalk_flickr.yaml b/config/graph/deepwalk_flickr.yaml index 181ddb8..00f7c57 100644 --- a/config/graph/deepwalk_flickr.yaml +++ b/config/graph/deepwalk_flickr.yaml @@ -7,7 +7,7 @@ resource: dim: 128 graph: -
file_name: as_undirected: true build: diff --git a/config/graph/deepwalk_friendster-small.yaml b/config/graph/deepwalk_friendster-small.yaml index e49150d..2e89515 100644 --- a/config/graph/deepwalk_friendster-small.yaml +++ b/config/graph/deepwalk_friendster-small.yaml @@ -7,7 +7,7 @@ resource: dim: 128 graph: - file_name: + file_name: as_undirected: true build: diff --git a/config/graph/deepwalk_friendster.yaml b/config/graph/deepwalk_friendster.yaml index 5ddfdd6..ab8f73f 100644 --- a/config/graph/deepwalk_friendster.yaml +++ b/config/graph/deepwalk_friendster.yaml @@ -7,7 +7,7 @@ resource: dim: 96 graph: - file_name: + file_name: as_undirected: true build: diff --git a/config/graph/deepwalk_youtube.yaml b/config/graph/deepwalk_youtube.yaml index feeb2cb..71e6b74 100644 --- a/config/graph/deepwalk_youtube.yaml +++ b/config/graph/deepwalk_youtube.yaml @@ -7,7 +7,7 @@ resource: dim: 128 graph: - file_name: + file_name: as_undirected: true build: diff --git a/config/graph/line_flickr.yaml b/config/graph/line_flickr.yaml index a3d0ef1..d995b7c 100644 --- a/config/graph/line_flickr.yaml +++ b/config/graph/line_flickr.yaml @@ -7,7 +7,7 @@ resource: dim: 128 graph: - file_name: + file_name: as_undirected: true build: diff --git a/config/graph/line_friendster-small.yaml b/config/graph/line_friendster-small.yaml index ab1ec64..b83cdff 100644 --- a/config/graph/line_friendster-small.yaml +++ b/config/graph/line_friendster-small.yaml @@ -7,7 +7,7 @@ resource: dim: 128 graph: - file_name: + file_name: as_undirected: true build: diff --git a/config/graph/line_friendster.yaml b/config/graph/line_friendster.yaml index 26594ef..ac370bf 100644 --- a/config/graph/line_friendster.yaml +++ b/config/graph/line_friendster.yaml @@ -7,7 +7,7 @@ resource: dim: 96 graph: - file_name: + file_name: as_undirected: true build: diff --git a/config/graph/line_youtube.yaml b/config/graph/line_youtube.yaml index a2461d1..215f006 100644 --- a/config/graph/line_youtube.yaml +++ b/config/graph/line_youtube.yaml @@ -7,7 +7,7 @@ resource: dim: 128 graph: - file_name: + file_name: as_undirected: true build: diff --git a/config/graph/node2vec_youtube.yaml b/config/graph/node2vec_youtube.yaml index cae070c..8066580 100644 --- a/config/graph/node2vec_youtube.yaml +++ b/config/graph/node2vec_youtube.yaml @@ -7,7 +7,7 @@ resource: dim: 128 graph: - file_name: + file_name: as_undirected: true build: diff --git a/config/knowledge_graph/complex_fb15k-237.yaml b/config/knowledge_graph/complex_fb15k-237.yaml index 68c6f61..e848197 100644 --- a/config/knowledge_graph/complex_fb15k-237.yaml +++ b/config/knowledge_graph/complex_fb15k-237.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 5.0e-4 + lr: 2.0e-5 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/complex_fb15k.yaml b/config/knowledge_graph/complex_fb15k.yaml index e04beb9..e757df1 100644 --- a/config/knowledge_graph/complex_fb15k.yaml +++ b/config/knowledge_graph/complex_fb15k.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 5.0e-4 + lr: 2.0e-4 weight_decay: 0 num_partition: auto num_negative: 64 @@ -22,7 +22,7 @@ build: train: model: ComplEx num_epoch: 1000 - l3_regularization: 2.0e-3 + l3_regularization: 1.0e-3 sample_batch_size: 2000 adversarial_temperature: 2 log_frequency: 100 diff --git a/config/knowledge_graph/complex_wn18.yaml 
b/config/knowledge_graph/complex_wn18.yaml index 9d30f90..ce02b47 100644 --- a/config/knowledge_graph/complex_wn18.yaml +++ b/config/knowledge_graph/complex_wn18.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 5.0e-5 + lr: 1.0e-5 weight_decay: 0 num_partition: auto num_negative: 64 @@ -22,7 +22,7 @@ build: train: model: ComplEx num_epoch: 4000 - l3_regularization: 1.0e-3 + l3_regularization: 5.0e-5 sample_batch_size: 2000 adversarial_temperature: 2 log_frequency: 100 diff --git a/config/knowledge_graph/complex_wn18rr.yaml b/config/knowledge_graph/complex_wn18rr.yaml index 3351804..333be6f 100644 --- a/config/knowledge_graph/complex_wn18rr.yaml +++ b/config/knowledge_graph/complex_wn18rr.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 2.0e-5 + lr: 1.0e-5 weight_decay: 0 num_partition: auto num_negative: 64 @@ -22,7 +22,7 @@ build: train: model: ComplEx num_epoch: 6000 - l3_regularization: 5.0e-3 + l3_regularization: 5.0e-6 sample_batch_size: 2000 adversarial_temperature: 2 log_frequency: 100 diff --git a/config/knowledge_graph/distmult_fb15k-237.yaml b/config/knowledge_graph/distmult_fb15k-237.yaml index 416b3cc..fe8df62 100644 --- a/config/knowledge_graph/distmult_fb15k-237.yaml +++ b/config/knowledge_graph/distmult_fb15k-237.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 5.0e-5 + lr: 2.0e-5 weight_decay: 0 num_partition: auto num_negative: 64 @@ -22,7 +22,7 @@ build: train: model: DistMult num_epoch: 1000 - l3_regularization: 2.0e-3 + l3_regularization: 5.0e-3 sample_batch_size: 2000 adversarial_temperature: 2 log_frequency: 100 diff --git a/config/knowledge_graph/distmult_fb15k.yaml b/config/knowledge_graph/distmult_fb15k.yaml index edc0adb..947475d 100644 --- a/config/knowledge_graph/distmult_fb15k.yaml +++ b/config/knowledge_graph/distmult_fb15k.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -22,7 +22,7 @@ build: train: model: DistMult num_epoch: 1000 - l3_regularization: 5.0e-4 + l3_regularization: 1.0e-3 sample_batch_size: 2000 adversarial_temperature: 2 log_frequency: 100 diff --git a/config/knowledge_graph/distmult_wn18.yaml b/config/knowledge_graph/distmult_wn18.yaml index 9990798..61e4d43 100644 --- a/config/knowledge_graph/distmult_wn18.yaml +++ b/config/knowledge_graph/distmult_wn18.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -22,7 +22,7 @@ build: train: model: DistMult num_epoch: 4000 - l3_regularization: 2.0e-3 + l3_regularization: 1.0e-3 sample_batch_size: 2000 adversarial_temperature: 2 log_frequency: 100 diff --git a/config/knowledge_graph/distmult_wn18rr.yaml b/config/knowledge_graph/distmult_wn18rr.yaml index f0a18c8..4b37861 100644 --- a/config/knowledge_graph/distmult_wn18rr.yaml +++ b/config/knowledge_graph/distmult_wn18rr.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 5.0e-5 + lr: 2.0e-5 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/rotate_fb15k-237.yaml b/config/knowledge_graph/rotate_fb15k-237.yaml index e7e27cd..b9f8d72 100644 --- a/config/knowledge_graph/rotate_fb15k-237.yaml +++ b/config/knowledge_graph/rotate_fb15k-237.yaml @@ 
-1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 1.0e-5 + lr: 2.0e-6 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/rotate_fb15k.yaml b/config/knowledge_graph/rotate_fb15k.yaml index e36b446..0b6f384 100644 --- a/config/knowledge_graph/rotate_fb15k.yaml +++ b/config/knowledge_graph/rotate_fb15k.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 1.0e-4 + lr: 2.0e-4 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/rotate_wn18.yaml b/config/knowledge_graph/rotate_wn18.yaml index 4f90ce5..32c1065 100644 --- a/config/knowledge_graph/rotate_wn18.yaml +++ b/config/knowledge_graph/rotate_wn18.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 1.0e-4 + lr: 5.0e-6 weight_decay: 0 num_partition: auto num_negative: 64 @@ -22,7 +22,7 @@ build: train: model: RotatE num_epoch: 4000 - margin: 12 + margin: 9 sample_batch_size: 2000 adversarial_temperature: 2 log_frequency: 100 diff --git a/config/knowledge_graph/rotate_wn18rr.yaml b/config/knowledge_graph/rotate_wn18rr.yaml index f7b82c1..07ec39c 100644 --- a/config/knowledge_graph/rotate_wn18rr.yaml +++ b/config/knowledge_graph/rotate_wn18rr.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 2.0e-5 + lr: 5.0e-6 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/simple_fb15k-237.yaml b/config/knowledge_graph/simple_fb15k-237.yaml index 9deaa92..6bb6c98 100644 --- a/config/knowledge_graph/simple_fb15k-237.yaml +++ b/config/knowledge_graph/simple_fb15k-237.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 1.0e-4 + lr: 2.0e-5 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/simple_fb15k.yaml b/config/knowledge_graph/simple_fb15k.yaml index 11a1e0f..e6f972e 100644 --- a/config/knowledge_graph/simple_fb15k.yaml +++ b/config/knowledge_graph/simple_fb15k.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 5.0e-5 + lr: 2.0e-5 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/simple_wn18.yaml b/config/knowledge_graph/simple_wn18.yaml index 7fb8d43..c6ec638 100644 --- a/config/knowledge_graph/simple_wn18.yaml +++ b/config/knowledge_graph/simple_wn18.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] diff --git a/config/knowledge_graph/simple_wn18rr.yaml b/config/knowledge_graph/simple_wn18rr.yaml index 86db73d..1b0c385 100644 --- a/config/knowledge_graph/simple_wn18rr.yaml +++ b/config/knowledge_graph/simple_wn18rr.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] diff --git a/config/knowledge_graph/transe_fb15k-237.yaml b/config/knowledge_graph/transe_fb15k-237.yaml index fc88665..a0161c7 100644 --- a/config/knowledge_graph/transe_fb15k-237.yaml +++ b/config/knowledge_graph/transe_fb15k-237.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: 
type: Adam - lr: 1.0e-5 + lr: 2.0e-6 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/transe_fb15k.yaml b/config/knowledge_graph/transe_fb15k.yaml index 7fda879..15063bd 100644 --- a/config/knowledge_graph/transe_fb15k.yaml +++ b/config/knowledge_graph/transe_fb15k.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 5.0e-5 + lr: 1.0e-5 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/transe_wn18.yaml b/config/knowledge_graph/transe_wn18.yaml index 0ffde0b..5fcf133 100644 --- a/config/knowledge_graph/transe_wn18.yaml +++ b/config/knowledge_graph/transe_wn18.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 1.0e-5 + lr: 5.0e-6 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/knowledge_graph/transe_wn18rr.yaml b/config/knowledge_graph/transe_wn18rr.yaml index bce1ed8..34df908 100644 --- a/config/knowledge_graph/transe_wn18rr.yaml +++ b/config/knowledge_graph/transe_wn18rr.yaml @@ -1,5 +1,5 @@ application: - knowledge_graph + knowledge graph resource: gpus: [] @@ -12,7 +12,7 @@ graph: build: optimizer: type: Adam - lr: 2.0e-6 + lr: 1.0e-6 weight_decay: 0 num_partition: auto num_negative: 64 diff --git a/config/template/graph.yaml b/config/template/graph.yaml new file mode 100644 index 0000000..468175a --- /dev/null +++ b/config/template/graph.yaml @@ -0,0 +1,105 @@ +########################################################### +# Node embedding configuration file +########################################################### + +application: + graph + +resource: + # List of GPU ids. Default is all GPUs + gpus: [] + # Memory limit for each GPU in bytes. Default is all available memory. + gpu_memory_limit: auto + # Number of CPU thread per GPU. Default is all CPUs. + cpu_per_gpu: auto + # Dimension of the embeddings. + dim: 128 + +format: + # String of delimiter characters. Change it if your node name contains blank character. + delimiters: " \t\r\n" + # Prefix of comment strings. Change it if you use comment style other than Python. + comment: "#" + +graph: + # Path to edge list file. Each line should be one of the following + # [node 1] [delimiter] [node 2] [comment]... + # [node 1] [delimiter] [node 2] [delimiter] [weight] [comment]... + # [comment]... + # For standard datasets, you can specify them by <[dataset].[split]>. + file_name: + # Symmetrize the graph or not. True is recommended. + as_undirected: true + # Normalize the adjacency matrix or not. This may influence the performance a little. + normalization: false + +build: + optimizer: + # Optimizer. + type: SGD + # Learning rate. Default is usually reasonable. + lr: 0.025 + # Weight decay. + weight_decay: 0.005 + # Learning rate schedule, can be "linear" or "constant". Linear is recommended. + schedule: linear + # Number of partitions. Auto is recommended. + num_partition: auto + # Number of negative samples per positive sample. + # Larger value results in slower training. + # The performance may be influenced by num_negative * negative_weight. + num_negative: 1 + # Batch size of samples in CPU-GPU transfer. Default is recommended. + batch_size: 100000 + # Number of batches in a partition block. + # Default is recommended. + episode_size: auto + +train: + # Model, can be DeepWalk, LINE or node2vec. + model: DeepWalk + # Number of epochs. 
Default is usually reasonable for sparse graphs. + # For dense graphs (|E| / |V| > 100), you may use smaller values. + num_epoch: 2000 + # Weight of negative samples. Values larger than 10 may cause unstable training. + negative_weight: 5 + # Exponent of degrees in negative sampling. Default is recommended. + negative_sample_exponent: 0.75 + # Augmentation step. Need to be tuned on the validation set. + # Larger value is needed for sparser graphs. + augmentation_step: 5 + # Return parameter and in-out parameters (node2vec). Need to be tuned on the validation set. + p: 1 + q: 1 + # Length of each random walk. Default is recommended. + random_walk_length: 40 + # Batch size of random walks in samplers. Default is recommended. + random_walk_batch_size: 100 + # Log every n batches. + log_frequency: 1000 + +# Comment out this section if not needed. +evaluate: + # Comment out any task if not needed. + - task: node classification + # Path to node label file. Each line should be one of the following + # [node] [delimiter] [label] [comment]... + # [comment]... + file_name: + # Portions of data used for training. Each of them corresponds to one evaluation. + portions: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + # Number of trials repeated. Change it to 1 if your evaluation set is large enough. + times: 5 + + - task: link prediction + # Path to link prediction file. Each line should be + # [node 1] [delimiter] [node 2] [delimiter] [label] + # where label is 1 for positive and 0 for negative. + file_name: + # Path to filter file. If you aren't sure that training data is excluded in evaluation, + # you can specify the training edge list here. + filter_file: + +# Comment out this section if not needed. +save: + file_name: graph.pkl \ No newline at end of file diff --git a/config/template/knowledge_graph.yaml b/config/template/knowledge_graph.yaml new file mode 100644 index 0000000..ead33d0 --- /dev/null +++ b/config/template/knowledge_graph.yaml @@ -0,0 +1,107 @@ +########################################################### +# Knowledge graph embedding configuration file +########################################################### + +application: + knowledge graph + +resource: + # List of GPU ids. Default is all GPUs + gpus: [] + # Memory limit for each GPU in bytes. Default is all available memory. + gpu_memory_limit: auto + # Number of CPU thread per GPU. Default is all CPUs. + cpu_per_gpu: auto + # Dimension of the embeddings. + dim: 1024 + +format: + # String of delimiter characters. Change it if your node name contains blank character. + delimiters: " \t\r\n" + # Prefix of comment strings. Change it if you use comment style other than Python. + comment: "#" + +graph: + # Path to triplet file. Each line should be one of the following + # [head] [delimiter] [relation] [tail] [comment]... + # [head] [delimiter] [relation] [tail] [delimiter] [weight] [comment]... + # [comment]... + # For standard datasets, you can specify them by <[dataset].[split]>. + file_name: + # Normalize the adjacency matrix or not. This may influence the performance a little. + normalization: false + +build: + optimizer: + # Optimizer. + type: Adam + # Learning rate. Default is usually reasonable. + lr: 5.0e-5 + # Weight decay. + weight_decay: 0 + # Learning rate schedule, can be "linear" or "constant". Linear is recommended. + schedule: linear + # Number of partitions. Auto is recommended. + num_partition: auto + # Number of negative samples per positive sample. + # Larger value results in slower training. 
+ num_negative: 64 + # Batch size of samples in CPU-GPU transfer. Default is recommended. + batch_size: 100000 + # Number of batches in a partition block. + # Default is recommended. + episode_size: auto + +train: + # Model, can be TransE, DistMult, ComplEx, SimplE or RotatE + model: TransE + # Number of epochs. Default is usually reasonable. + num_epoch: 2000 + # L3 regularization (DistMult, ComplEx and SimplE). Need to be tuned on the validation set. + l3_regularization: 2.0e-3 + # Margin (TransE, RotatE). Need to be tuned on the validation set. + margin: 12 + # Batch size of samples in samplers. Default is recommended. + sample_batch_size: 2000 + # Temperature for self-adversarial negative sampling. Default is usually reasonable. + adversarial_temperature: 2 + # Log every n batches. + log_frequency: 100 + +# Comment out this section if not needed. +evaluate: + # Comment out any task if not needed. + - task: link prediction + # Path to triplet file. Each line should be one of the following + # [head] [delimiter] [relation] [tail] [comment]... + # [head] [delimiter] [relation] [tail] [delimiter] [weight] [comment]... + # [comment]... + file_name: + # List of paths to filter files. + # Specify all dataset splits for filtered ranking. Comment out for unfiltered ranking. + filter_files: + # Target entity to rank, can be head, tail or both. + target: both + # Number of samples to be evaluated. Comment out for precise evaluation. + # fast_mode: 3000 + # Backend, can be graphvite or torch + backend: graphvite + + - task: entity prediction + # Path to triplet file. Each line should be one of the following + # [head] [delimiter] [relation] [tail] [comment]... + # [head] [delimiter] [relation] [tail] [delimiter] [weight] [comment]... + # [comment]... + file_name: + # Path to save file, can be "*.txt" or "*.pkl". + save_file: + # Target entity to predict, can be head or tail. + target: tail + # Top-k recalls will be returned. + k: 10 + # Backend, can be graphvite or torch. + backend: graphvite + +# Comment out this section if not needed. +save: + file_name: knowledge_graph.pkl \ No newline at end of file diff --git a/config/template/visualization.yaml b/config/template/visualization.yaml new file mode 100644 index 0000000..2425c9e --- /dev/null +++ b/config/template/visualization.yaml @@ -0,0 +1,129 @@ +########################################################### +# High-dimensional data visualization configuration file +########################################################### + +application: + visualization + +resource: + # List of GPU ids. Multiple GPUs will cause unstable results. + gpus: [0] + # Memory limit for each GPU in bytes. Default is all available memory. + gpu_memory_limit: auto + # Number of CPU thread per GPU. Default is all CPUs. + cpu_per_gpu: auto + # Dimension of the embeddings. + dim: 2 + +format: + # String of delimiter characters. Change it if your node name contains blank character. + delimiters: " \t\r\n" + # Prefix of comment strings. Change it if you use comment style other than Python. + comment: "#" + +graph: + # Path to vector file. Each line should be one of the following + # [value] [delimiter] [value] [delimiter]... [comment]... + # [comment]... + # For standard datasets, you can specify them by <[dataset].[split]>. + vector_file: + # Number of neighbors for each node. Default is usually reasonable. + num_neighbor: 200 + # Perplexity for the neighborhood of each node. + # Typical values are between 5 and 50. Need to be tuned for best results. 
+ # Larger value focuses on global difference and results in larger clusters. + perplexity: 30 + # Normalize the input vectors or not. True is recommended. + vector_normalization: true + +build: + optimizer: + # Optimizer. + type: Adam + # Learning rate. Default is usually reasonable. + lr: 0.5 + # Weight decay. Default is usually reasonable. + weight_decay: 1.0e-5 + # Learning rate schedule, can be "linear" or "constant". Linear is recommended. + schedule: linear + # Number of partitions. Auto is recommended. + num_partition: auto + # Number of negative samples per positive sample. + # Larger value results in slower training. + # The performance may be influenced by num_negative * negative_weight. + num_negative: 5 + # Batch size of samples in CPU-GPU transfer. Default is recommended. + batch_size: 100000 + # Number of batches in a partition block. + # Default is recommended. + episode_size: auto + +train: + # Model, can be LargeVis. + model: LargeVis + # Number of epochs. Default is recommended. + num_epoch: 50 + # Weight of negative samples. Values larger than 10 may cause unstable training. + negative_weight: 3 + # Exponent of degrees in negative sampling. Default is recommended. + negative_sample_exponent: 0.75 + # Batch size of samples in samplers. Default is recommended. + sample_batch_size: 2000 + # Log every n batches. + log_frequency: 1000 + +# Comment out this section if not needed. +evaluate: + # Comment out any task if not needed. + - task: visualization + # Path to label file. Each line should be one of the following + # [label] [comment]... + # [comment]... + # The file is assumed to have the same order as input vectors. + file_name: + # Path to save file, can be either "*.png" or "*.pdf". + # If not provided, show the figure in window. + save_file: + # Size of the figure. + figure_size: 10 + # Size of points. Recommend to use figure_size / 5. + scale: 2 + + # This task only works for dim = 3. + - task: animation + # Path to label file. Each line should be one of the following + # [label] [comment]... + # [comment]... + file_name: + # Path to save file, can be "*.gif". + save_file: + # Size of the figure. + figure_size: 5 + # Size of points. Recommend to use figure_size / 5. + scale: 1 + # Elevation angle. Default is recommended. + elevation: 30 + # Number of frames. Default is recommended. + num_frame: 700 + + - task: hierarchy + # Path to hierarchical label file. Each line should be one of the following + # [label] [delimiter] [label] [delimiter]... [comment]... + # [comment]... + # Labels should be ordered in ascending depth, i.e. the first label corresponds to the root in the hierarchy. + # The file is assumed to have the same order as input vectors. + file_name: + # Target class to be visualized. + target: + # Path to save file, can be "*.gif". + save_file: + # Size of the figure. + figure_size: 10 + # Size of points. Recommend to use figure_size / 5. + scale: 2 + # Duration of each frame in seconds. Default is recommended. + duration: 3 + +# Comment out this section if not needed. +save: + file_name: visualization.pkl \ No newline at end of file diff --git a/config/template/word_graph.yaml b/config/template/word_graph.yaml new file mode 100644 index 0000000..446ff6c --- /dev/null +++ b/config/template/word_graph.yaml @@ -0,0 +1,77 @@ +########################################################### +# Word embedding configuration file +########################################################### + +application: + word graph + +resource: + # List of GPU ids. 
Default is all GPUs + gpus: [] + # Memory limit for each GPU in bytes. Default is all available memory. + gpu_memory_limit: auto + # Number of CPU thread per GPU. Default is all CPUs. + cpu_per_gpu: auto + # Dimension of the embeddings. + dim: 128 + +graph: + # Path to corpus file. Each line should be one of the following + # [word] [delimiter] [word] [delimiter]... [comment]... + # [comment]... + # For standard datasets, you can specify them by <[dataset].[split]>. + file_name: + # Word pairs with distance <= window are counted as edges. Default is recommended. + window: 5 + # Words with occurrence <= min_count are discarded. + min_count: 5 + # Normalize the adjacency matrix or not. This may influence the performance a little. + normalization: false + # String of delimiter characters. Change it if your node name contains blank character. + delimiters: " \t\r\n" + # Prefix of comment strings. Change it if you use comment style other than Python. + comment: "#" + +build: + optimizer: + # Optimizer. + type: SGD + # Learning rate. Default is usually reasonable. + lr: 0.025 + # Weight decay. + weight_decay: 0.005 + # Learning rate schedule, can be "linear" or "constant". Linear is recommended. + schedule: linear + # Number of partitions. Auto is recommended. + num_partition: auto + # Number of negative samples per positive sample. + # Larger value results in slower training. + # The performance may be influenced by num_negative * negative_weight. + num_negative: 1 + # Batch size of samples in CPU-GPU transfer. Default is recommended. + batch_size: 100000 + # Number of batches in a partition block. + # Default is recommended, unless it overflows the memory (std::bad_alloc). + episode_size: auto + +train: + # Model, can be LINE. + model: LINE + # Number of epochs. Default is usually reasonable. + num_epoch: 80 + # Weight of negative samples. Values larger than 10 may cause unstable training. + negative_weight: 5 + # Exponent of degrees in negative sampling. Default is recommended. + negative_sample_exponent: 0.75 + # Augmentation step. Default is recommended. + augmentation_step: 1 + # Length of each random walk. Default is recommended. + random_walk_length: 40 + # Batch size of random walks in samplers. Default is recommended. + random_walk_batch_size: 100 + # Log every n batches. + log_frequency: 1000 + +# Comment out this section if not needed. +save: + file_name: word_graph.pkl \ No newline at end of file diff --git a/config/word_graph/line_wikipedia.yaml b/config/word_graph/line_wikipedia.yaml index c248bf6..b4c2b86 100644 --- a/config/word_graph/line_wikipedia.yaml +++ b/config/word_graph/line_wikipedia.yaml @@ -1,5 +1,5 @@ application: - word_graph + word graph resource: gpus: [] @@ -7,7 +7,7 @@ resource: dim: 128 graph: - file_name: + file_name: window: 5 min_count: 5 diff --git a/doc/source/benchmark.rst b/doc/source/benchmark.rst index d83dad7..1e6e6d8 100644 --- a/doc/source/benchmark.rst +++ b/doc/source/benchmark.rst @@ -99,20 +99,20 @@ Knowledge Graph Embedding ------------------------- For knowledge graph embedding, we benchmark `TransE`_, `DistMult`_, `ComplEx`_ and -`RotatE`_ on 4 standard datasets. The training time and resource of `RotatE`_ on -these datasets is given in the following table. 
- -+---------------------+-------+-------+-------+---------------+-----------------+ -| Dataset | \|V\| | \|E\| | \|R\| | Training Time | GPU memory cost | -+=====================+=======+=======+=======+===============+=================+ -| `FB15k`_ | 15K | 483K | 1.3K | 27.0 mins | 4 * 785 MiB | -+---------------------+-------+-------+-------+---------------+-----------------+ -| `FB15k-237`_ | 15K | 272K | 237 | 14.3 mins | 4 * 745 MiB | -+---------------------+-------+-------+-------+---------------+-----------------+ -| `WN18`_ | 41K | 141K | 18 | 15.3 mins | 4 * 761 MiB | -+---------------------+-------+-------+-------+---------------+-----------------+ -| `WN18RR`_ | 41K | 87K | 11 | 13.8 mins | 4 * 761 MiB | -+---------------------+-------+-------+-------+---------------+-----------------+ +`RotatE`_ on 4 standard datasets. The training time, evaluation time and resource of +`RotatE`_ on these datasets are given in the following table. + ++---------------------+-------+-------+-------+----------------------------+-----------------+ +| Dataset | \|V\| | \|E\| | \|R\| | Training / Evaluation Time | GPU memory cost | ++=====================+=======+=======+=======+============================+=================+ +| `FB15k`_ | 15K | 483K | 1.3K | 28.1 mins / 55.8 s | 4 * 788 MiB | ++---------------------+-------+-------+-------+----------------------------+-----------------+ +| `FB15k-237`_ | 15K | 272K | 237 | 15.3 mins / 20.8 s | 4 * 758 MiB | ++---------------------+-------+-------+-------+----------------------------+-----------------+ +| `WN18`_ | 41K | 141K | 18 | 16.4 mins / 12.0 s | 4 * 776 MiB | ++---------------------+-------+-------+-------+----------------------------+-----------------+ +| `WN18RR`_ | 41K | 87K | 11 | 14.8 mins / 8.98 s | 4 * 776 MiB | ++---------------------+-------+-------+-------+----------------------------+-----------------+ To evaluate the knowledge graph embeddings, we test them on the link prediction task. We report the results for each model on the test set, where ranking metrics @@ -121,15 +121,15 @@ are computed based on filtered results. +-------------+-----+-------+---------+---------+----------+ | `FB15k`_ | MR | MRR | HITS\@1 | HITS\@3 | HITS\@10 | +=============+=====+=======+=========+=========+==========+ -| `TransE`_ | 42 | 0.694 | 0.576 | 0.789 | 0.868 | +| `TransE`_ | 57 | 0.614 | 0.459 | 0.739 | 0.849 | +-------------+-----+-------+---------+---------+----------+ -| `DistMult`_ | 136 | 0.747 | 0.684 | 0.793 | 0.849 | +| `DistMult`_ | 70 | 0.765 | 0.698 | 0.812 | 0.874 | +-------------+-----+-------+---------+---------+----------+ -| `ComplEx`_ | 50 | 0.678 | 0.571 | 0.755 | 0.857 | +| `ComplEx`_ | 236 | 0.679 | 0.596 | 0.735 | 0.821 | +-------------+-----+-------+---------+---------+----------+ -| `SimplE`_ | 74 | 0.779 | 0.721 | 0.818 | 0.876 | +| `SimplE`_ | 87 | 0.791 | 0.737 | 0.828 | 0.882 | +-------------+-----+-------+---------+---------+----------+ -| `RotatE`_ | 44 | 0.740 | 0.654 | 0.805 | 0.875 | +| `RotatE`_ | 36 | 0.767 | 0.687 | 0.830 | 0.891 | +-------------+-----+-------+---------+---------+----------+ .. seealso:: @@ -143,15 +143,15 @@ are computed based on filtered results. 
+--------------+-----+-------+---------+---------+----------+ | `FB15k-237`_ | MR | MRR | HITS\@1 | HITS\@3 | HITS\@10 | +==============+=====+=======+=========+=========+==========+ -| `TransE`_ | 157 | 0.294 | 0.193 | 0.328 | 0.502 | +| `TransE`_ | 172 | 0.288 | 0.190 | 0.324 | 0.487 | +--------------+-----+-------+---------+---------+----------+ -| `DistMult`_ | 272 | 0.281 | 0.182 | 0.312 | 0.490 | +| `DistMult`_ | 224 | 0.295 | 0.204 | 0.329 | 0.478 | +--------------+-----+-------+---------+---------+----------+ -| `ComplEx`_ | 193 | 0.311 | 0.212 | 0.348 | 0.513 | +| `ComplEx`_ | 372 | 0.271 | 0.184 | 0.301 | 0.447 | +--------------+-----+-------+---------+---------+----------+ -| `SimplE`_ | 176 | 0.298 | 0.198 | 0.333 | 0.504 | +| `SimplE`_ | 253 | 0.284 | 0.196 | 0.315 | 0.462 | +--------------+-----+-------+---------+---------+----------+ -| `RotatE`_ | 176 | 0.314 | 0.217 | 0.347 | 0.511 | +| `RotatE`_ | 201 | 0.314 | 0.218 | 0.348 | 0.506 | +--------------+-----+-------+---------+---------+----------+ .. seealso:: @@ -162,19 +162,19 @@ are computed based on filtered results. :download:`simple_fb15k-237.yaml<../../config/knowledge_graph/simple_fb15k-237.yaml>` :download:`rotate_fb15k-237.yaml<../../config/knowledge_graph/rotate_fb15k-237.yaml>` -+-------------+-----+-------+---------+---------+----------+ -| `WN18`_ | MR | MRR | HITS\@1 | HITS\@3 | HITS\@10 | -+=============+=====+=======+=========+=========+==========+ -| `TransE`_ | 234 | 0.608 | 0.306 | 0.916 | 0.952 | -+-------------+-----+-------+---------+---------+----------+ -| `DistMult`_ | 355 | 0.819 | 0.711 | 0.923 | 0.954 | -+-------------+-----+-------+---------+---------+----------+ -| `ComplEx`_ | 760 | 0.940 | 0.936 | 0.943 | 0.946 | -+-------------+-----+-------+---------+---------+----------+ -| `SimplE`_ | 412 | 0.948 | 0.944 | 0.950 | 0.954 | -+-------------+-----+-------+---------+---------+----------+ -| `RotatE`_ | 226 | 0.945 | 0.938 | 0.950 | 0.958 | -+-------------+-----+-------+---------+---------+----------+ ++-------------+------+-------+---------+---------+----------+ +| `WN18`_ | MR | MRR | HITS\@1 | HITS\@3 | HITS\@10 | ++=============+======+=======+=========+=========+==========+ +| `TransE`_ | 522 | 0.545 | 0.211 | 0.881 | 0.933 | ++-------------+------+-------+---------+---------+----------+ +| `DistMult`_ | 661 | 0.819 | 0.717 | 0.918 | 0.945 | ++-------------+------+-------+---------+---------+----------+ +| `ComplEx`_ | 1262 | 0.877 | 0.857 | 0.892 | 0.909 | ++-------------+------+-------+---------+---------+----------+ +| `SimplE`_ | 487 | 0.944 | 0.941 | 0.946 | 0.949 | ++-------------+------+-------+---------+---------+----------+ +| `RotatE`_ | 303 | 0.948 | 0.924 | 0.950 | 0.957 | ++-------------+------+-------+---------+---------+----------+ .. seealso:: Configuration files: @@ -184,19 +184,19 @@ are computed based on filtered results. 
:download:`simple_wn18.yaml<../../config/knowledge_graph/simple_wn18.yaml>` :download:`rotate_wn18.yaml<../../config/knowledge_graph/rotate_wn18.yaml>` -+-------------+------+-------+---------+---------+----------+ -| `WN18RR`_ | MR | MRR | HITS\@1 | HITS\@3 | HITS\@10 | -+=============+======+=======+=========+=========+==========+ -| `TransE`_ | 2620 | 0.215 | 0.012 | 0.382 | 0.526 | -+-------------+------+-------+---------+---------+----------+ -| `DistMult`_ | 2954 | 0.467 | 0.416 | 0.489 | 0.562 | -+-------------+------+-------+---------+---------+----------+ -| `ComplEx`_ | 7131 | 0.425 | 0.405 | 0.431 | 0.460 | -+-------------+------+-------+---------+---------+----------+ -| `SimplE`_ | 4751 | 0.475 | 0.445 | 0.489 | 0.535 | -+-------------+------+-------+---------+---------+----------+ -| `RotatE`_ | 1845 | 0.490 | 0.439 | 0.508 | 0.589 | -+-------------+------+-------+---------+---------+----------+ ++-------------+-------+-------+---------+---------+----------+ +| `WN18RR`_ | MR | MRR | HITS\@1 | HITS\@3 | HITS\@10 | ++=============+=======+=======+=========+=========+==========+ +| `TransE`_ | 3834 | 0.219 | 0.015 | 0.398 | 0.523 | ++-------------+-------+-------+---------+---------+----------+ +| `DistMult`_ | 5065 | 0.449 | 0.419 | 0.466 | 0.504 | ++-------------+-------+-------+---------+---------+----------+ +| `ComplEx`_ | 12602 | 0.328 | 0.312 | 0.339 | 0.353 | ++-------------+-------+-------+---------+---------+----------+ +| `SimplE`_ | 5569 | 0.446 | 0.421 | 0.458 | 0.492 | ++-------------+-------+-------+---------+---------+----------+ +| `RotatE`_ | 2359 | 0.500 | 0.455 | 0.518 | 0.589 | ++-------------+-------+-------+---------+---------+----------+ .. seealso:: Configuration files: @@ -217,9 +217,9 @@ can be traded off with speed if necessary. +-------------+------------------+-------+------+---------------+-----------------+ | Dataset | Vector | N | dim | Training Time | GPU memory cost | +=============+==================+=======+======+===============+=================+ -| `MNIST`_ | Raw pixels | 70K | 784 | 15.1 s | 2.86 GiB | +| `MNIST`_ | Raw pixels | 70K | 784 | 13.9 s | 2.86 GiB | +-------------+------------------+-------+------+---------------+-----------------+ -| `ImageNet`_ | ResNet50 feature | 1.33M | 2048 | 16.6 mins | 15.1 GiB | +| `ImageNet`_ | ResNet50 feature | 1.33M | 2048 | 13.6 mins | 15.1 GiB | +-------------+------------------+-------+------+---------------+-----------------+ .. seealso:: diff --git a/doc/source/developer/framework.rst b/doc/source/developer/framework.rst index 10be8ac..6afa58c 100644 --- a/doc/source/developer/framework.rst +++ b/doc/source/developer/framework.rst @@ -19,15 +19,16 @@ vectors. This design enables dynamic data type in Python interface, as well as m compile-time optimization. The C++ interface is highly abstracted to faciliate further development on GraphVite. -Generally, by inheriting from the core interface, you can implement your graph deep -learning routine without caring about scheduling details. +Generally, by inheriting from the core interface, we can implement new graph embedding +instances without caring about any scheduling detail. The source code is organized as follows. 
- ``include/base/*`` implements basic data structures - - ``include/util/*`` implements basic utils - ``include/core/*`` implements optimizers, and core interface of graphs and solvers - - ``include/gpu/*`` implements forward & backward propagation for all models - ``include/instance/*`` implements instances of graphs and solvers + - ``include/instance/gpu/*`` implements GPU training & evaluation routines + - ``include/instance/model/*`` implements forward & backward propagation of models + - ``include/util/*`` implements basic utils - ``include/bind.h`` implements Python bindings - ``src/graphvite.cu`` instantiates all Python classes diff --git a/doc/source/developer/model.rst b/doc/source/developer/model.rst index cd714bc..f45b81e 100644 --- a/doc/source/developer/model.rst +++ b/doc/source/developer/model.rst @@ -1,90 +1,97 @@ Customize Models ================ -One common demand for graph embedding is to customize the model (i.e. loss function). -Here we will show you an example of adding a new loss function to the knowledge graph +One common demand for graph embedding is to customize the model (i.e. score function). +Here we will demonstrate an example of adding a new model to the knowledge graph solver. -Before start, it would be better if you know some basics about `the index and threads`_ -in CUDA. In GraphVite, the threads are arranged in a group of 32 (`warp`_). Threads in -a group works simultaneously on an edge sample, where each thread is responsible for -computation in some dimensions, according to the modulus of the dimension. +First, get into ``include/instance/model/knowledge_graph.h``. Fork an existing model class +(e.g. TransE) and change it to a new name. -.. _the index and threads: https://en.wikipedia.org/wiki/Thread_block_(CUDA_programming)#Indexing -.. _warp: https://en.wikipedia.org/wiki/Thread_block_(CUDA_programming)#Warps +.. code-block:: c++ -First, get into ``include/gpu/knowledge_graph.h``. Fork an existing loss function -(e.g. transe) and change it to your own name. + template + class TransE { + __host__ __device__ static void forward(...); -You will find 3 implementations of the loss function in the namespace. + template + __host__ __device__ static void backward(...); -.. code-block:: c++ + template + __host__ __device__ static void backward(...); - namespace transe { - __global__ void train(...); - __global__ void train_1_moment(...); - __global__ void train_2_moment(...); + template + __host__ __device__ static void backward(...); } -The three implementations correspond to 3 categories of optimizers. We are going to -modify one, and then do some copy-and-paste work to the others. +Here a model class contains a forward function and several overloads of the backward +function, which correspond to different categories of optimizers. We are going to +modify a forward and a backward function, and then do some copy-and-paste work to the +others. -Let's start from ``train_2_moment()``. Find the following two loops. -You can locate them by searching ``i = lane_id``. +Let's start from the forward function. This function takes a triplet of embedding +vectors, and outputs a score.
.. code-block:: c++ - for (int i = lane_id; i < dim; i += kWarpSize) { - x += ...; - } + void forward(const Vector &head, const Vector &tail, const Vector &relation, + Float &output, float margin) - for (int i = lane_id; i < dim; i += kWarpSize) { - head[i] -= (optimizer.*update)(head[i], ..., head_moment1[i], head_moment2[i], weight); - tail[i] -= (optimizer.*update)(tail[i], ..., tail_moment1[i], tail_moment2[i], weight); - Float relation_update = (optimizer.*update)(relation[i], ..., - relation_moment1[i], relation_moment2[i], weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } +The last argument is either the margin for latent distance models or the l3 regularization +for tensor decomposition models. For TransE, the function is implemented as + +.. code-block:: c++ -The first loop is the forward propagtion, which computes the score for each dimension. -The second loop is the backward propagation, which computes the gradient for each -dimension. + output = 0; + FOR(i, dim) + output += abs(head[i] + relation[i] - tail[i]); + output = margin - SUM(output); + +Here we need to replace this piece of code with our own formulas. Note that this +function should be compatible with both CPU and GPU. This can be easily achieved with the +helper macros defined in GraphVite. + +We just need to use the macro ``FOR(i, stop)`` instead of the conventional +``for (int i = 0; i < stop; i++)``. For any accumulator ``x`` inside the loop (e.g. +``output`` in this case), update it with ``x = SUM(x)`` after the loop to get the +correct value. + +The backward function takes additional arguments: the moment statistics, the head +gradient, the optimizer and the sample weight. For example, here is an overload with 1 moment +per embedding. + +.. code-block:: c++ -What you need to do is to replace the ellipsis with your own formulas. -Note the head gradient is already stored in ``gradient``, which you need to refer -in your back propagation. + template + void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + float margin, Float gradient, const Optimizer &optimizer, Float weight) -If you want to change the loss function over the logit -(e.g. change from margin loss to standard log-likelihood), you need also change -the code between these two loops, as the following fragment shows. +The backward function should compute the gradient for each embedding, and update them +with the optimizer. Typically, this is implemented as .. code-block:: c++ - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = ...; - if (label) { - gradient = ...; - weight = ...; - #ifdef USE_LOSS - sample_loss += ...; - #endif - } else { - gradient = ...; - weight = ...; - #ifdef USE_LOSS - sample_loss += ...; - #endif + auto update = get_update_function_1_moment(); + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + Float r = relation[i]; + Float s = h + r - t > 0 ? 1 : -1; + head[i] -= (optimizer.*update)(h, -gradient * s, head_moment1[i], weight); + tail[i] -= (optimizer.*update)(t, gradient * s, tail_moment1[i], weight); + relation[i] -= (optimizer.*update)(r, -gradient * s, relation_moment1[i], weight); } -Now you are almost there. Copy the modified fragment to ``train()`` and -``train_1_moment()``, and delete undeclared variables like ``head_moment2``. -Now your model supports all optimizers. +Here we modify this function according to the partial derivatives of our forward +function. Once we complete a backward function, we can copy it to the other +overloads. The only difference among overloads is that they use different update +functions and numbers of moment statistics.
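To make the link between the forward formula and its backward updates more concrete, here is a minimal, self-contained C++ sketch of the same pattern for a simple product-based score. It deliberately uses plain ``std::vector`` and a bare SGD step instead of GraphVite's ``Vector`` type, ``FOR`` / ``SUM`` macros and optimizer classes, so every name below is illustrative only and not the library's actual API.

.. code-block:: c++

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    using Float = float;

    // Forward: score(h, r, t) = sum_i h[i] * r[i] * t[i]
    Float forward(const std::vector<Float> &head, const std::vector<Float> &tail,
                  const std::vector<Float> &relation) {
        Float output = 0;
        for (std::size_t i = 0; i < head.size(); i++)
            output += head[i] * relation[i] * tail[i];
        return output;
    }

    // Backward: d score / d head[i] = relation[i] * tail[i], and similarly for
    // the other two factors. `gradient` is the head gradient d loss / d score,
    // and `lr` stands in for the optimizer update.
    void backward(std::vector<Float> &head, std::vector<Float> &tail,
                  std::vector<Float> &relation, Float gradient, Float lr) {
        for (std::size_t i = 0; i < head.size(); i++) {
            Float h = head[i], t = tail[i], r = relation[i];
            head[i] -= lr * gradient * r * t;
            tail[i] -= lr * gradient * h * r;
            relation[i] -= lr * gradient * h * t;
        }
    }

    int main() {
        std::vector<Float> h = {0.1f, 0.2f}, t = {0.3f, -0.1f}, r = {0.5f, 0.4f};
        Float before = forward(h, t, r);
        backward(h, t, r, /*gradient=*/-1, /*lr=*/0.01f);  // pretend loss = -score
        std::printf("score before: %g, after: %g\n", before, forward(h, t, r));
        return 0;
    }

In the real model class, the loop body would be wrapped in ``FOR(i, dim)`` and the bare SGD step replaced by ``(optimizer.*update)(...)``, exactly as in the TransE snippet above.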
Once we complete a backward function, we can copy them to the other +overloads. The only difference among overloads is that they use different update +function and numbers of moment statistics. -Finally, you have to let the solver know there is a new model. In +Finally, we have to let the solver know there is a new model. In ``instance/knowledge_graph.cuh``, add the name of your model in -``get_available_models()``. Also add run-time dispatch for optimizers in -``kernel_dispatch()``. +``get_available_models()``. Also add run-time dispatch of the new model in +``train_dispatch()`` and ``predict_dispatch()``. .. code-block:: c++ diff --git a/doc/source/developer/routine.rst b/doc/source/developer/routine.rst new file mode 100644 index 0000000..0bad145 --- /dev/null +++ b/doc/source/developer/routine.rst @@ -0,0 +1,89 @@ +Customize Routine +================= + +For advanced developers, GraphVite also supports customizing routines, such as +training and prediction. Here we will illustrate how to add a new routine to the +knowledge graph solver. + +Before we start, it would be better if you know some basics about +`the index and threads`_ in CUDA. In GraphVite, the threads are arranged in a group +of 32 (`warp`_). Threads in a group works simultaneously on an edge sample, where +each thread is responsible for computation in some dimensions, according to the +modulus of the dimension. + +.. _the index and threads: https://en.wikipedia.org/wiki/Thread_block_(CUDA_programming)#Indexing +.. _warp: https://en.wikipedia.org/wiki/Thread_block_(CUDA_programming)#Warps + +First, get into ``include/instance/gpu/knowledge_graph.h``. This file includes several +training functions and a prediction function. + +.. code-block:: c++ + + template class Model, OptimizerType optimizer_type> + __global__ void train(...) + + template class Model, OptimizerType optimizer_type> + __global__ void train_1_moment(...) + + template class Model, OptimizerType optimizer_type> + __global__ void train_2_moment(...) + + template class Model> + __global__ void predict(...) + +The 3 implementations correspond to 3 categories of optimizers, as we have seen in +:doc:`routine`. Routines with different numbers of moment statistics are separated +to achieve maximal compile-time optimization. + +Let's take a look at a training function. Generally, the function body looks like + +.. code-block:: c++ + + for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { + if (adversarial_temperature > kEpsilon) + for (int s = 0; s < num_negative; s++) + normalizer += ...; + + for (int s = 0; s <= num_negative; s++) { + model.forward(sample[s], logit); + prob = sigmoid(logit); + + gradient = ...; + weight = ...; + sample_loss += ...; + model.backward(sample[s], gradient); + } + } + +The outer loop iterates over all positive samples. For each positive sample and its +negative samples, we first compute the normalizer of self-adversarial negative +sampling, and then perform forward and backward propagation for each sample. + +For example, if we want to change the negative log likelihood to a mean square error, +we can change the following lines. + +.. code-block:: c++ + + gradient = 2 * (logit - label); + sample_loss += weight * (logit - label) * (logit - label); + +Or we can use a margin-based ranking loss like + +.. 
code-block:: c++ + + model.forward(sample[num_negative], positive_score); // the positive sample + + for (int s = 0; s < num_negative; s++) { + model.forward(sample[s], negative_score); + if (positive_score - negative_score < margin) { + sample_loss += negative_score - positive_score + margin; + gradient = 1; + model.backward(sample[s], gradient); + model.backward(sample[num_negative], -gradient); + } + } + +We may also add new hyperparameters or training routines. Note that if we change +the signature of the function, we should also update its calls accordingly. For +knowledge graph, they are in ``train_dispatch()`` and ``predict_dispatch()`` of file +``include/instance/knowledge_graph.cuh``. \ No newline at end of file
diff --git a/doc/source/developer/solver.rst b/doc/source/developer/solver.rst index bcc2ac1..1fb5a61 100644 --- a/doc/source/developer/solver.rst +++ b/doc/source/developer/solver.rst @@ -9,7 +9,7 @@ fits into the following paradigm. There may be additional attributes (e.g. labels) to edge samples. To support that, GraphVite provides a protocol interface and a series of abstract -classes. You only need to declare the protocols for your parameters, and fill in the +classes. We only need to declare the protocols for our parameters, and fill in the virtual member functions for the classes. Let's begin with the protocol interface. There are 3 main protocols for parameters. @@ -28,7 +28,7 @@ parameter matrix takes in-place update and doesn't need storage for gradients. The other is ``shared``, which implies the matrix is shared with the previous one. This may be used for tied weight case. -Each parameter matrix should also be specified with a shape. You can use ``auto`` +Each parameter matrix should also be specified with a shape. We can use ``auto`` if the shape can be inferred from the protocol and the graph structure. For example, knowledge graph embeddings take the following settings. @@ -39,7 +39,7 @@ For example, knowledge graph embeddings take the following settings. protocols = {head | in place, tail | in place | shared, global}; shapes = {auto, auto, graph->num_relation}; -If your learning routine also needs negative sampling, you should additionally +If the learning routine also needs negative sampling, we should additionally specify a negative sampler protocol. For knowledge graph embedding, this is .. code-block:: c++ @@ -57,19 +57,19 @@ detailed explanation of the algorithm, see section 3.2 in `GraphVite paper`_. is ``head`` or ``tail``. If all parameters are ``global``, GraphVite will schedule them by standard data parallel. -To implement a new solver, you need to implement ``get_protocols()``, +To implement a new solver, we need to implement ``get_protocols()``, ``get_sampler_protocol()`` and ``get_shapes()`` as above. Some additional helper functions may be required to complete the solver. A solver also contains a sampler and a worker class. By default, the sampler samples positive edges from the graph, with probability proportional to the weight of each -edge. You only need to specify the additional edge attributes in ``get_attributes()``. +edge. We only need to specify the additional edge attributes in ``get_attributes()``. For the worker, it will build the negative sampler according to the its protocol. -You need to specify the GPU implementation of models in ``kernel_dispatch()``. See +We need to specify the GPU implementation of models in ``train_dispatch()``. See :doc:`model` for how to do that. 
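+Conceptually, the dispatch maps the run-time model name and optimizer category to a
+compile-time kernel instantiation. A schematic fragment (the names and arguments below are
+illustrative; the real dispatch also forwards embeddings, samples, the optimizer and the
+CUDA stream) might read
+
+.. code-block:: c++
+
+    // schematic only: one branch per model and per optimizer category
+    if (solver->model == "TransE")
+        gpu::train_1_moment<Vector, Index, TransE, kAdaGrad><<<grid, block, 0, stream>>>(...);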
-Finally, to get your new solver appeared in Python, add a Python declaration for it in +Finally, to make our new solver appear in Python, add a Python declaration for it in ``include/bind.h``, and instantiate it in ``src/graphvite.cu``. See ``include/instance/*`` for all solver instances.
diff --git a/doc/source/faq.rst b/doc/source/faq.rst index 5fb5f33..35e6325 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -30,6 +30,22 @@ Config your conda with the following command, and try installation again. conda config --add channels conda-forge +Why is my CUDA driver version insufficient for CUDA runtime version? +-------------------------------------------------------------------- + +This is because you have installed a GraphVite build compiled for a later CUDA version. +You can check your CUDA version with ``nvcc -V``, and then install the corresponding +package by + +.. code-block:: bash + + conda install -c milagraph graphvite cudatoolkit=x.x + +where ``x.x`` is your CUDA version, e.g. 9.2 or 10.0. + +Note that GraphVite does not support CUDA versions earlier than 9.2, due to a failure of +older versions of ``nvcc``. + Why is there a compilation error for template deduction? -------------------------------------------------------- @@ -48,4 +64,12 @@ embeddings. embeddings = solver.vertex_embeddings -Now the access to ``embeddings`` should be good. \ No newline at end of file +Now the access to ``embeddings`` should be good. + +How can I speed up compilation? ------------------------------- + +The compilation can be accelerated by reducing the number of template instantiations. +You can pass ``-DFAST_COMPILE=True`` to cmake, which will only compile commonly used +embedding dimensions. You may also comment out unnecessary instantiations in +``src/graphvite.cu`` for further speed-up. \ No newline at end of file
diff --git a/doc/source/index.rst b/doc/source/index.rst index a8214cf..0b38ea4 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -22,6 +22,7 @@ GraphVite - graph embedding at high speed and large scale user/command_line user/configuration + user/format user/python user/auto @@ -31,6 +32,7 @@ GraphVite - graph embedding at high speed and large scale developer/framework developer/model + developer/routine developer/solver .. toctree::
diff --git a/doc/source/install.rst b/doc/source/install.rst index c95d544..dfcf340 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -1,26 +1,27 @@ Install ======= -There are 2 ways to install GraphVite. +GraphVite can be installed from either conda or source. You can also easily install +the library on `Google Colab`_ for demonstration. + +.. _Google Colab: https://colab.research.google.com/ Install from conda ------------------ -You can install GraphVite from ``conda`` with only one line. +To install GraphVite from ``conda``, you only need one line. .. code-block:: bash - conda install -c milagraph graphvite cudatoolkit=x.x - -where ``x.x`` is your CUDA version, e.g. 9.2 or 10.0. + conda install -c milagraph graphvite cudatoolkit=$(nvcc -V | grep -Po "(?<=V)\d+.\d+") By default, this will install all dependencies, including ``PyTorch`` and -``matplotlib``. If you only need embedding training without evaluation, you can take -the following alternative with minimum dependencies. +``matplotlib``. If you only need embedding training without evaluation, there is an +alternative with minimum dependencies. ..
code-block:: bash - conda install -c milagraph graphvite-mini cudatoolkit=x.x + conda install -c milagraph graphvite-mini cudatoolkit=$(nvcc -V | grep -Po "(?<=V)\d+.\d+") Install from source ------------------- @@ -50,4 +51,31 @@ Finally, install Python bindings. .. code-block:: bash - cd python && python setup.py install && cd - \ No newline at end of file + cd python && python setup.py install && cd - + +Install on Colab +---------------- + +First, install Miniconda on Colab. + +.. code-block:: bash + + !wget -c https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + !chmod +x Miniconda3-latest-Linux-x86_64.sh + !./Miniconda3-latest-Linux-x86_64.sh -b -p /usr/local -f + +Then we install GraphVite and some tools for Jupyter Notebook. + +.. code-block:: bash + + !conda install -y -c milagraph -c conda-forge graphvite \ + python=3.6 cudatoolkit=$(nvcc -V | grep -Po "(?<=V)\d+\.\d+") + !conda install -y wurlitzer ipykernel + +Load the installed packages. Now you are ready to go. + +.. code-block:: python + + import site + site.addsitedir("/usr/local/lib/python3.6/site-packages") + %reload_ext wurlitzer diff --git a/doc/source/introduction.rst b/doc/source/introduction.rst index 6af2343..a98255c 100644 --- a/doc/source/introduction.rst +++ b/doc/source/introduction.rst @@ -60,7 +60,7 @@ To give a brief idea of GraphVite's speed, we summarize the training time of GraphVite along with the best open-source implementations. All the time is reported based on a server with 24 CPU threads and 4 V100 GPUs. -Node embedding on `Youtube`_ dataset. +Training time of node embedding on `Youtube`_ dataset. +-------------+----------------------------+-----------+---------+ | Model | Existing Implementation | GraphVite | Speedup | @@ -76,25 +76,25 @@ Node embedding on `Youtube`_ dataset. .. _1.39 hrs (CPU parallel): https://github.com/tangjianpku/LINE .. _24.4 hrs (CPU parallel): https://github.com/aditya-grover/node2vec -Knowledge graph embedding on `FB15k`_ dataset. +Training / evaluation time of knowledge graph embedding on `FB15k`_ dataset. -+-----------+-------------------------+-----------+---------+ -| Model | Existing Implementation | GraphVite | Speedup | -+===========+=========================+===========+=========+ -| `TransE`_ | `1.31 hrs (1 GPU)`_ | 14.8 mins | 5.30x | -+-----------+-------------------------+-----------+---------+ -| `RotatE`_ | `3.69 hrs (1 GPU)`_ | 27.0 mins | 8.22x | -+-----------+-------------------------+-----------+---------+ ++-----------+---------------------------------+--------------------+---------------+ +| Model | Existing Implementation | GraphVite | Speedup | ++===========+=================================+====================+===============+ +| `TransE`_ | `1.31 hrs / 1.75 mins (1 GPU)`_ | 13.5 mins / 54.3 s | 5.82x / 1.93x | ++-----------+---------------------------------+--------------------+---------------+ +| `RotatE`_ | `3.69 hrs / 4.19 mins (1 GPU)`_ | 28.1 mins / 55.8 s | 7.88x / 4.50x | ++-----------+---------------------------------+--------------------+---------------+ -.. _1.31 hrs (1 GPU): https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding -.. _3.69 hrs (1 GPU): https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding +.. _1.31 hrs / 1.75 mins (1 GPU): https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding +.. _3.69 hrs / 4.19 mins (1 GPU): https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding -High-dimensional data visualization on `MNIST`_ dataset. 
+Training time of high-dimensional data visualization on `MNIST`_ dataset. +-------------+-----------------------------+-----------+---------+ | Model | Existing Implementation | GraphVite | Speedup | +=============+=============================+===========+=========+ -| `LargeVis`_ | `15.3 mins (CPU parallel)`_ | 15.1 s | 60.8x | +| `LargeVis`_ | `15.3 mins (CPU parallel)`_ | 13.9 s | 66.8x | +-------------+-----------------------------+-----------+---------+ .. _15.3 mins (CPU parallel): https://github.com/lferry007/LargeVis @@ -104,19 +104,19 @@ Comparison to concurrent work A work concurrent to GraphVite is `PyTorch-BigGraph`_, which aims at accelerating knowledge graph embedding on large-scale data. Here is an apple-to-apple comparison -of models implemented in both libraries on `FB15k`_, under the same hyperparameter -setting. +of models implemented in both libraries on `FB15k`_, under the same setting of +hyperparameters. .. _PyTorch-BigGraph: https://torchbiggraph.readthedocs.io +-------------+------------------+-----------+---------+ | Model | PyTorch-BigGraph | GraphVite | Speedup | +=============+==================+===========+=========+ -| `TransE`_ | 1.21 hrs | 11.1 mins | 6.57x | +| `TransE`_ | 1.21 hrs | 8.37 mins | 8.70x | +-------------+------------------+-----------+---------+ -| `DistMult`_ | 2.48 hrs | 25.0 mins | 5.93x | +| `DistMult`_ | 2.48 hrs | 20.3 mins | 7.33x | +-------------+------------------+-----------+---------+ -| `ComplEx`_ | 3.13 hrs | 20.6 mins | 9.13x | +| `ComplEx`_ | 3.13 hrs | 18.5 mins | 10.1x | +-------------+------------------+-----------+---------+ GraphVite surpasses its counterpart by a signficant margin. Besides, the framework of diff --git a/doc/source/quick_start.rst b/doc/source/quick_start.rst index 6cbb292..45b62f8 100644 --- a/doc/source/quick_start.rst +++ b/doc/source/quick_start.rst @@ -1,43 +1,58 @@ Quick Start =========== -Here is a quick-start example that illustrate the pipeline in GraphVite. If you don't -have ``pytorch`` installed, simply add ``--no-eval`` to skip the evaluation stage. +Here is a quick-start example that illustrate the pipeline in GraphVite. If ``pytorch`` +is not installed, we can simply add ``--no-eval`` to skip the evaluation stage. .. code-block:: bash graphvite baseline quick start The example will automatically download a social network dataset called BlogCatalog, -where nodes correspond to blog users. For each node, we learn an embedding vector, -and evaluate the embeddings by using them as features for multi-label node -classifcation. +where nodes correspond to blog users. For each node, we learn an embedding vector that +preserves its neighborhood structure, which is done by minimizing a reconstruction +loss. GraphVite will display the progress and the loss during training. -Typically, the example takes no more than 1 minute. You will obtain some output like +Once the training is done, the learned embeddings are evaluated on link prediction and +node classification tasks. For link prediction, we try to predict unseen edges with +the embeddings. For node classification, we use the embeddings as inputs for +multi-label classification of nodes. + +Typically, this example takes no more than 1 minute. We will obtain some output like .. 
code-block:: none Batch id: 6000 - loss = 0.371641 + loss = 0.371041 - macro-F1@20%: 0.236794 - micro-F1@20%: 0.388110 + ------------- link prediction -------------- + AUC: 0.899933 + + ----------- node classification ------------ + macro-F1@20%: 0.242114 + micro-F1@20%: 0.391342 -Note that the F1 scores may vary across different trials, -as only one random split is evaluated for quick demonstration here. +Note that the F1 scores may vary across different trials, as only one random split is +evaluated for quick demonstration here. -The learned embeddings are saved into a compressed numpy dump. -You can load them for further use +The learned embeddings are saved into a pickle dump. We can load them for further +use. >>> import pickle >>> with open("line_blogcatalog.pkl", "rb") as fin: >>> blogcatalog = pickle.load(fin) >>> names = blogcatalog.id2name >>> embeddings = blogcatalog.vertex_embeddings - >>> print(names[1024], embddings[1024]) + >>> print(names[1024], embeddings[1024]) + +Another interesting example is a synthetic math dataset of arithmetic operations. By +treating the operations as relations of a knowledge graph, we can learn embeddings +that generalize to unseen triplets (i.e. computation formulas). Check out this example +with + +.. code-block:: bash -As the embeddings might be further used in other downstream tasks, it would be -helpful if they can be obtained in the easiest way. + graphvite baseline math For a more in-depth tutorial about GraphVite, take a look at diff --git a/doc/source/user/auto.rst b/doc/source/user/auto.rst index 96e5ddd..968fd43 100644 --- a/doc/source/user/auto.rst +++ b/doc/source/user/auto.rst @@ -6,7 +6,7 @@ to help users focus on the most important part, GraphVite provides an auto deduc for many hyperparameters. Generally, auto deduction will maximize the speed of the system, while keep the performance loss as small as possible. -To invoke auto deduction, you can simply leave hyperparameters to their default +To invoke auto deduction, we can simply leave hyperparameters to their default values. An explicit way is to use ``auto`` in configuration files, or value ``gv.auto`` in Python. diff --git a/doc/source/user/command_line.rst b/doc/source/user/command_line.rst index 02224a0..9c8649d 100644 --- a/doc/source/user/command_line.rst +++ b/doc/source/user/command_line.rst @@ -1,22 +1,23 @@ Command Line ============ -As you see in :doc:`../quick_start`, GraphVite can be simply invoked from a command -line. Here are some other useful commands you may use. +As we have seen in :doc:`../quick_start`, GraphVite can be simply invoked from a +command line. Here are some other useful commands we can use. Reproduce baseline benchmarks ----------------------------- .. code-block:: bash - graphvite baseline [keyword ...] [--no-eval] [--gpu n] [--cpu m] + graphvite baseline [keyword ...] [--no-eval] [--gpu n] [--cpu m] [--epoch e] GraphVite provides a large number of baselines on standard datasets. To reproduce -a baseline benchmark, you only need to specify the keywords of the experiment, and -the library will do all the rest. +a baseline benchmark, we only need to specify the keywords of the experiment, and +the library will do the rest for us. -By default, baselines are configured to use all CPUs and GPUs. You may override this -behavior by specifying the number of GPUs and the number of CPUs per GPU. +By default, baselines are configured to use all CPUs and GPUs. We may override this +behavior by specifying the number of GPUs and the number of CPUs per GPU. 
We may also +override the number of training epochs for fast experiments. For example, the following command line reproduces RotatE model on FB15k dataset, using 4 GPUs and 12 CPUs. @@ -30,12 +31,24 @@ Use ``graphvite list`` to get a list of available baselines. Run configuration files ----------------------- +Custom experiments can be easily carried out in GraphVite through a yaml configuration. +This is especially convenient if we want to use GraphVite as an off-the-shelf tool +for pretraining embeddings. + .. code-block:: bash - graphvite run [config] [--no-eval] [--gpu n] [--cpu m] + graphvite new [application ...] [--file f] + +The above command creates a configuration scaffold for our application, where most +settings are ready. We just need to fill a minimal number of settings following the +instructions. For a more detailed introduction on configuration files, see +:ref:`experiment configuration`. + +Once we complete the configuration file, we can run it by + +.. code-block:: bash -Experiments can be easily conducted in GraphVite by specifying a yaml configuration. -For how to write an experiment configuration, see :doc:`configuration`. + graphvite run [config] [--no-eval] [--gpu n] [--cpu m] [--epoch e] Visualize high-dimensional vectors ---------------------------------- @@ -44,9 +57,9 @@ Visualize high-dimensional vectors graphvite visualize [file] [--label label_file] [--save save_file] [--perplexity n] [--3d] -You can visualize your high-dimensional vectors with a simple command line in +We can visualize our high-dimensional vectors with a simple command line in GraphVite. -The file can be either in numpy dump or text format. You can also provide a label -file indicating the category of each data point. For the save file, we recommend to -use a ``png`` format, while ``pdf`` is also supported. \ No newline at end of file +The file can be either a numpy dump ``*.npy`` or a text matrix ``*.txt``. We can +also provide a label file indicating the category of each data point. For the save +file, we recommend to use ``png`` format, while ``pdf`` is also supported. \ No newline at end of file diff --git a/doc/source/user/configuration.rst b/doc/source/user/configuration.rst index 5cdb5da..305ae56 100644 --- a/doc/source/user/configuration.rst +++ b/doc/source/user/configuration.rst @@ -3,14 +3,17 @@ Configuration Files .. include:: ../link.rst +.. _experiment configuration: + Experiment configuration ------------------------ An experiment configuration starts with an ``application type``, and contains settings -for ``resource``, ``graph``, ``build``, ``train``, ``evaluate`` and ``save`` stages. +for ``resource``, ``format``, ``graph``, ``build``, ``train``, ``evaluate`` and +``save`` stages. Here is the configuration used in :doc:`../quick_start`. -:download:`quick_start.yaml <../../../config/quick_start.yaml>` +:download:`quick_start.yaml <../../../config/demo/quick_start.yaml>` The stages are configured as follows. @@ -18,43 +21,52 @@ The stages are configured as follows. application: [type] -The application type can be ``graph``, ``knowledge_graph`` or ``visualization``. +The application type can be ``graph``, ``word graph``, ``knowledge graph`` or +``visualization``. .. code-block:: yaml resource: - gpus: [list of GPU ids] - gpu_memory_limit: [limit for each GPU in bytes] - cpu_per_gpu: [CPU thread per GPU] - dim: [dim] + gpus: [list of GPU ids] + gpu_memory_limit: [limit for each GPU in bytes] + cpu_per_gpu: [CPU thread per GPU] + dim: [dim] .. 
note:: For optimal performance, modules are compiled with pre-defined dimensions in C++. As a drawback, only dimensions that are powers of 2 are supported in the library. +.. code-block:: yaml + + format: + delimiters: [string of delimiter characters] + comment: [prefix of comment strings] + +Format section is optional. By default, delimiters are any blank character and comment +is "#", following the Python style. + .. code-block:: yaml graph: - file_name: [file name] - as_undirected: [symmetrize the graph or not] - delimiters: [string of delimiter characters] - comment: [prefix of comment strings] + file_name: [file name] + as_undirected: [symmetrize the graph or not] -For standard datasets, you can specify its file name by ``<[dataset].[split]>``. +For standard datasets, we can specify its file name by ``<[dataset].[split]>``. This would make the configuration file independent of the path. .. code-block:: yaml build: - optimizer: - type: [type] - lr: [learning rate] - weight_decay: [weight decay] - # and other optimizer-specific configuration - num_partition: [number of partitions] - num_negative: [number of negative samples] - batch_size: [batch size] - episode_size: [episode size] + optimizer: + type: [type] + lr: [learning rate] + weight_decay: [weight decay] + schedule: [learning rate schedule] + # and other optimizer-specific configuration + num_partition: [number of partitions] + num_negative: [number of negative samples] + batch_size: [batch size] + episode_size: [episode size] The number of partitions determines how to deal with multi-GPU or large graph cases. The more partitions, the less GPU memory consumption and speed. The episode size @@ -65,19 +77,21 @@ See section 3.2 in `GraphVite paper `_ for a detailed illustration. .. code-block:: yaml train: - model: [model] - num_epoch: [number of epochs] - negative_weight: [weight for negative sample] - log_frequency: 1000 - # and other application-specific configuration + model: [model] + num_epoch: [number of epochs] + negative_weight: [weight for negative sample] + log_frequency: 1000 + # and other application-specific configuration .. code-block:: yaml evaluate: - task: [task] + - task: [task] # and other task-specific configuration + - task: [task] + ... -Evaluation is optional. +Evaluation is optional. There may be multiple evaluation tasks. .. code-block:: yaml @@ -86,20 +100,21 @@ Evaluation is optional. Saving embeddings is optional. -For more detailed settings, we recommend you to read the baseline configurations +For more detailed settings, we recommend to read the baseline configurations for concrete examples. They can be found under ``config/`` in the Python package, or in the `GitHub repository `_. Global configuration -------------------- -You can overwrite the global settings of GraphVite in ``~/.graphvite/config.yaml``. +We can overwrite the global settings of GraphVite in ``~/.graphvite/config.yaml``. .. code-block:: yaml - dataset_path: [path to store datasets] + backend: [graphvite or torch] + dataset_path: [path to store downloaded datasets] float_type: [default float type] index_type: [default index type] -By default, the datasets are stored in ``~/.graphvite/dataset``. -The data types are ``float32`` and ``uint32``. \ No newline at end of file +By default, the evaluation backend is ``graphvite``. The datasets are stored in +``~/.graphvite/dataset``. The data types are ``float32`` and ``uint32`` respectively. 
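+For instance, a ``~/.graphvite/config.yaml`` that spells out these defaults (the dataset
+path shown is just the default location and may be changed) looks like
+
+.. code-block:: yaml
+
+    backend: graphvite
+    dataset_path: ~/.graphvite/dataset
+    float_type: float32
+    index_type: uint32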
\ No newline at end of file diff --git a/doc/source/user/format.rst b/doc/source/user/format.rst new file mode 100644 index 0000000..26e210e --- /dev/null +++ b/doc/source/user/format.rst @@ -0,0 +1,60 @@ +Data Format +=========== + +GraphVite is designed to support a wide range of formats for graphs. Generally, it +does not enforce any type restriction on input elements. We can either use integers +or strings as our input. Each line in the file is parsed as + +.. code-block:: + + [token] [delimiter] [token] [delimiter]... [comment]... + +By default, GraphVite treats any blank character as delimiter, and string after ``#`` +as comment. You can change these settings in the +:ref:`format section ` of configuration files, or using +``app.set_format()`` in Python code. + +Node Embedding +-------------- + +The input graph for node embedding follows the edge list format. Each line should be + +.. code-block:: + + [head] [tail] + +For link prediction task, the evaluation file consists of edges and labels. + +.. code-block:: + + [head] [tail] [label] + +where label ``1`` is positive and ``0`` is negative. The filter file takes the same +format as the input graph. + +For node classification task, each line is a node and a label. If a node has more +than one label, it should take multiple lines. + +.. code-block:: + + [node] [label] + +Knowledge Graph Embedding +------------------------- + +Each line in a knowledge graph is a triplet. + +.. code-block:: + + [head] [relation] [tail] + +All the files in knowledge graph evaluation tasks take the same triplet format. + +Graph & High-dimensional Data Visualization +------------------------------------------- + +For graph visualization, the input format is same as the graph in node embedding. + +For high-dimensional data visualization, the input format can either be a 2D numpy +array or a text matrix. Each row in the matrix is parsed as a point in the +high-dimensional space. \ No newline at end of file diff --git a/doc/source/user/python.rst b/doc/source/user/python.rst index d9ce291..7c3cb4b 100644 --- a/doc/source/user/python.rst +++ b/doc/source/user/python.rst @@ -2,7 +2,7 @@ Python Interface ================ GraphVite provides Python interface for convenient integration with other software. -To use GraphVite in Python, import these two modules in your script. +To use GraphVite in Python, import these two modules in our script. >>> import graphvite as gv >>> import graphvite.application as gap @@ -14,13 +14,13 @@ wrappers of applications, along with their evaluation routines. Applications ------------ -You can invoke a node embedding application with the following lines. +We can invoke a node embedding application with the following lines. >>> app = gap.GraphApplication(dim=128) >>> app.load(file_name=gv.dataset.blogcatalog.train) >>> app.build() >>> app.train() - >>> app.evaluate("node_classification", file_name=gv.dataset.blogcatalog.label) + >>> app.evaluate("node classification", file_name=gv.dataset.blogcatalog.label) where the arguments of each member function are identical to those in the :doc:`configuration files `. @@ -31,8 +31,8 @@ where the arguments of each member function are identical to those in the Basic classes ------------- -The basic classes are very helpful if you need fine-grained manipulation of the -pipeline. For example, you may train an ensemble of node embedding models on the +The basic classes are very helpful if we need fine-grained manipulation of the +pipeline. 
For example, we may train an ensemble of node embedding models on the same graph. First, create a graph and two node embedding solvers. >>> graph = gv.graph.Graph() @@ -45,7 +45,7 @@ Then, build the solvers on that graph. This step determines all memory allocatio >>> for solver in solvers: >>> solver.build(graph) -Now you can train the solver. The training stage of solvers can be fully paralleled +Now we can train the solver. The training stage of solvers can be fully paralleled with multiple threads, since GraphVite never holds Python GIL inside basic classes. >>> from multiprocessing.pool import ThreadPool @@ -72,7 +72,7 @@ prints the vertex embedding of node "1024". Logging settings ---------------- -GraphVite outputs a bunch of messages during stages like training. You can set the +GraphVite outputs a bunch of messages during stages like training. We can set the logging level to dismiss unimportant logs. The following lines suppress most logs except hyperparameters and evaluation results. diff --git a/include/base/alias_table.cuh b/include/base/alias_table.cuh index 823d1d0..7299d60 100644 --- a/include/base/alias_table.cuh +++ b/include/base/alias_table.cuh @@ -51,6 +51,8 @@ public: typedef _Float Float; typedef _Index Index; + static const int kThreadPerBlock = 512; + int device_id; Index count; cudaStream_t stream; @@ -72,6 +74,12 @@ public: AliasTable &operator=(const AliasTable &) = delete; + /** Reallocate the memory space */ + void reallocate(Index capacity) { + prob_table.reallocate(capacity); + alias_table.reallocate(capacity); + } + /** Initialize the table with a distribution */ void build(const std::vector &_prob_table) { count = _prob_table.size(); @@ -133,8 +141,7 @@ public: /** Free GPU memory */ void clear() { - prob_table.resize(0); - alias_table.resize(0); + reallocate(0); } /** Generate a sample on CPU / GPU */ @@ -146,8 +153,8 @@ public: /** Generate a batch of samples on GPU */ void device_sample(const Memory &rand, Memory *result) { - int grid_dim = (result->count + 512 - 1) / 512; - gpu::Sample<<>>(*this, rand, *result); + int block_per_grid = (result->count + kThreadPerBlock - 1) / kThreadPerBlock; + gpu::Sample<<>>(*this, rand, *result); } /** diff --git a/include/base/memory.h b/include/base/memory.h index 7e02a38..3c7524e 100644 --- a/include/base/memory.h +++ b/include/base/memory.h @@ -37,7 +37,7 @@ class Memory { typedef _Index Index; int device_id; - Index count = 0; + Index count = 0, capacity = 0; cudaStream_t stream; int *refer_count = nullptr; Data *host_ptr = nullptr, *device_ptr = nullptr; @@ -55,20 +55,21 @@ class Memory { /** Shallow copy constructor */ Memory(const Memory &m) : - device_id(m.device_id), count(m.count), stream(m.stream), refer_count(m.refer_count), + device_id(m.device_id), count(m.count), capacity(m.capacity), stream(m.stream), refer_count(m.refer_count), host_ptr(m.host_ptr), device_ptr(m.device_ptr) { - if (count) + if (capacity) (*refer_count)++; } Memory &operator=(const Memory &) = delete; - ~Memory() { resize(0); } + ~Memory() { reallocate(0); } /** Swap two memory spaces */ void swap(Memory &m) { std::swap(device_id, m.device_id); std::swap(count, m.count); + std::swap(capacity, m.capacity); std::swap(stream, m.stream); std::swap(refer_count, m.refer_count); std::swap(host_ptr, m.host_ptr); @@ -91,11 +92,21 @@ class Memory { #endif } - /** Resize the memory space */ - void resize(Index _count) { - if (count == _count) - return; - if (count && !--(*refer_count)) { + /** Copy data from another memory */ + void copy(const 
Memory &m) { + resize(m.count); + memcpy(host_ptr, m.host_ptr, count * sizeof(Data)); + } + + /** Copy data from a pointer */ + void copy(void *ptr, Index _count) { + resize(_count); + memcpy(host_ptr, ptr, count * sizeof(Data)); + } + + /** Reallocate the memory space */ + void reallocate(Index _capacity) { + if (capacity && !--(*refer_count)) { delete refer_count; #ifdef PINNED_MEMORY CUDA_CHECK(cudaFreeHost(host_ptr)); @@ -107,21 +118,28 @@ class Memory { CUDA_CHECK(cudaFree(device_ptr)); } } - count = _count; - if (count) { + capacity = _capacity; + if (capacity) { refer_count = new int(1); #ifdef PINNED_MEMORY - CUDA_CHECK(cudaMallocHost(&host_ptr, count * sizeof(Data))); + CUDA_CHECK(cudaMallocHost(&host_ptr, capacity * sizeof(Data))); #else - host_ptr = new Data[count]; + host_ptr = new Data[capacity]; #endif if (device_id != -1) { CUDA_CHECK(cudaSetDevice(device_id)); - CUDA_CHECK(cudaMalloc(&device_ptr, count * sizeof(Data))); + CUDA_CHECK(cudaMalloc(&device_ptr, capacity * sizeof(Data))); } } } + /** Resize the memory space. Reallocate only if the capacity is not enough. */ + void resize(Index _count) { + if (_count > capacity || (capacity && *refer_count > 1)) + reallocate(_count); + count = _count; + } + /** Copy the memory space to GPU */ void to_device(Index copy_count = 0) { if (count && device_id != -1) { @@ -164,9 +182,10 @@ class Memory { } } - /** Fill the memory space with data. Automatically resize the memory swhen necessary. */ - void fill(const Data &data, Index _count) { - resize(_count); + /** Fill the memory space with data. Automatically resize the memory when necessary. */ + void fill(const Data &data, Index _count = 0) { + if (_count) + resize(_count); for (Index i = 0; i < count; i++) host_ptr[i] = data; } @@ -222,11 +241,11 @@ class Memory { } /** - * @param count number of data + * @param capacity number of data * @return GPU memory cost */ - static size_t gpu_memory_demand(int count) { - return count * sizeof(Data); + static size_t gpu_memory_demand(int capacity) { + return capacity * sizeof(Data); } }; diff --git a/include/base/vector.h b/include/base/vector.h index c87dc49..dcb47ea 100644 --- a/include/base/vector.h +++ b/include/base/vector.h @@ -30,7 +30,7 @@ namespace graphvite { */ template class Vector { - // static_assert(std::is_floating_point<_Float>::value, "Vector can only be instantiated with floating point types"); + static_assert(std::is_floating_point<_Float>::value, "Vector can only be instantiated with floating point types"); // static_assert(_dim % gpu::kWarpSize == 0, "`dim` should be divided by 32"); public: static const size_t dim = _dim; @@ -40,8 +40,10 @@ class Vector { /** Default constructor */ Vector() = default; + /** Construct a vector of repeat scalar */ Vector(Float f) { +#pragma unroll for (Index i = 0; i < dim; i++) data[i] = f; } @@ -54,6 +56,18 @@ class Vector { return data[index]; } + __host__ __device__ Vector &operator=(const Vector &v) { +#if __CUDA_ARCH__ + using namespace gpu; + const int lane_id = threadIdx.x % kWarpSize; + for (Index i = lane_id; i < dim; i += kWarpSize) +#else + for (Index i = 0; i < dim; i++) +#endif + data[i] = v[i]; + return *this; + } + Vector &operator =(Float f) { #pragma unroll for (Index i = 0; i < dim; i++) diff --git a/include/bind.h b/include/bind.h index a1f92cf..23b1029 100644 --- a/include/bind.h +++ b/include/bind.h @@ -362,7 +362,7 @@ class pyKNNGraph : public py::class_, graphvite::Grap comment (str, optional): prefix of comment strings )"); - def("load", &KNNGraph::load, 
py::no_gil(), + def("load", &KNNGraph::load_numpy, py::no_gil(), py::arg("vectors"), py::arg("num_neighbor") = 200, py::arg("perplexity") = 30, py::arg("vector_normalization") = true); @@ -404,16 +404,17 @@ class pyGraphSolver : public py::class_ optimizer_h return [module](const std::string &type, py::args args, py::kwargs kwargs) { return module.attr(py::str(type))(*args, **kwargs); }; -} +} \ No newline at end of file diff --git a/include/core/graph.h b/include/core/graph.h index def1dea..26069e1 100644 --- a/include/core/graph.h +++ b/include/core/graph.h @@ -89,12 +89,13 @@ class GraphMixin { return; size_t offset = 0; + flat_offsets.resize(num_vertex); for (Index u = 0; u < num_vertex; u++) { for (auto &&vertex_edge : vertex_edges[u]) { edges.push_back(std::tuple_cat(std::tie(u), vertex_edge)); edge_weights.push_back(std::get<1>(vertex_edge)); } - flat_offsets.push_back(offset); + flat_offsets[u] = offset; offset += vertex_edges[u].size(); } } diff --git a/include/core/optimizer.h b/include/core/optimizer.h index af072f9..b43f1b7 100644 --- a/include/core/optimizer.h +++ b/include/core/optimizer.h @@ -96,7 +96,7 @@ class LRSchedule { * - add a value in OptimizerType * - implement an update function in Optimizer * - implement a helper class for that optimizer - * - instantiate kernels with the optimizer in Worker::kernel_dispatch() + * - instantiate kernels with the optimizer in Worker::train_dispatch() and Worker::predict_dispatch() * - add python binding of the helper class in bind.h & graphvite.cu */ class Optimizer { @@ -219,12 +219,12 @@ class Optimizer { /** * @brief Compile-time binding of 0-moment optimizers * @tparam Float floating type of parameters - * @tparam type type of optimizer + * @tparam optimizer_type type of optimizer * @return the update function of the optimizer */ -template -__device__ decltype(&Optimizer::sgd_update) get_update_function() { - switch (type) { +template +__host__ __device__ decltype(&Optimizer::sgd_update) get_update_function() { + switch (optimizer_type) { case kSGD: return &Optimizer::sgd_update; default: @@ -235,12 +235,12 @@ __device__ decltype(&Optimizer::sgd_update) get_update_function() { /** * @brief Compile-time binding of 1-moment optimizers * @tparam Float floating type of parameters - * @tparam type type of optimizer + * @tparam optimizer_type type of optimizer * @return the update function of the optimizer */ -template -__device__ decltype(&Optimizer::momentum_update) get_update_function_1_moment() { - switch (type) { +template +__host__ __device__ decltype(&Optimizer::momentum_update) get_update_function_1_moment() { + switch (optimizer_type) { case kMomentum: return &Optimizer::momentum_update; case kAdaGrad: @@ -255,12 +255,12 @@ __device__ decltype(&Optimizer::momentum_update) get_update_function_1_mo /** * @brief Compile-time binding of 2-moment optimizers * @tparam Float floating type of parameters - * @tparam type type of optimizer + * @tparam optimizer_type type of optimizer * @return the update function of the optimizer */ -template -__device__ decltype(&Optimizer::adam_update) get_update_function_2_moment() { - switch (type) { +template +__host__ __device__ decltype(&Optimizer::adam_update) get_update_function_2_moment() { + switch (optimizer_type) { case kAdam: return &Optimizer::adam_update; default: diff --git a/include/core/solver.h b/include/core/solver.h index 5936ec0..7746513 100644 --- a/include/core/solver.h +++ b/include/core/solver.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "base/memory.h" 
#include "base/vector.h" @@ -42,6 +43,8 @@ #include "util/debug.h" #include "util/time.h" +namespace py = pybind11; + namespace graphvite { std::mt19937 seed; @@ -70,11 +73,17 @@ const Protocol kSharedWithPredecessor = 0x10; * @tparam _Sampler type of edge sampler * @tparam _Worker type of training worker * - * @note To add a new solver, you need to + * The solver class is a high-level abstract of all procedures on a family of graph embeddings. + * Most interface of the solver class is exposed to Python through pybind11. + * + * A solver works like a master thread over a bunch of samplers and workers. + * It cooperates multiple samplers (CPU) and workers (GPU) for training and prediction over graph embeddings. + * + * @note To add a new solver class, you need to * - derive a template solver class from SolverMixin - * - implement all virtual functions for that class - * - add python binding of instantiations of that class in extension.h & extension.cu - */ + * - implement all virtual functions for that class in instance/*.cuh + * - add python binding of instantiations of that class in bind.h & graphvite.cu +*/ template class _Graph, template class _Sampler, template class _Worker> class SolverMixin { @@ -91,6 +100,8 @@ class SolverMixin { typedef typename Sampler::EdgeSample EdgeSample; typedef std::function SampleFunction; + static const int kSampleSize = Sampler::kSampleSize; + Graph *graph = nullptr; Index num_vertex; size_t num_edge; @@ -107,9 +118,19 @@ class SolverMixin { Protocol sampler_protocol; bool tied_weights; std::vector> head_partitions, tail_partitions; + Index head_partition_size, tail_partition_size; std::vector> head_locations, tail_locations; AliasTable edge_table; std::vector>>> sample_pools; + // <<<<<<<<<<<<<<< predict <<<<<<<<<<<<<<< + const std::vector *samples; + const py::array_t *array; + std::vector results; + std::vector>> predict_pool; + std::vector>> sample_indexes; + std::vector>> pool_offsets; + // >>>>>>>>>>>>>>> predict >>>>>>>>>>>>>>> + bool is_train; int pool_id = 0; std::vector samplers; std::vector workers; @@ -118,8 +139,8 @@ class SolverMixin { Optimizer optimizer; int num_worker, num_sampler, num_thread; size_t gpu_memory_limit, gpu_memory_cost; - volatile std::atomic batch_id; - int num_batch; + volatile std::atomic batch_id, predict_batch_id; + int num_batch, num_predict_batch; #define USING_SOLVER_MIXIN(type) \ using type::dim; \ @@ -357,16 +378,6 @@ class SolverMixin { protocols, shapes, sampler_protocol, num_moment, num_partition, num_negative, batch_size); CHECK(gpu_memory_cost < gpu_memory_limit) << "Can't satisfy the specified GPU memory limit"; - if (episode_size == kAuto) { - if (all & kGlobal) - episode_size = float(num_vertex * kSamplePerVertexWithGlobal) / num_partition / batch_size; - else - episode_size = float(num_vertex * kSamplePerVertex) / num_partition / batch_size; - episode_size = std::max(episode_size, 1); - if (num_partition == 1) - // for single partition, we don't need to use very small episode size - episode_size = std::max(episode_size, kMinEpisodeSample / batch_size); - } // use naive data parallel if there is no partition naive_parallel = !(all & (kHeadPartition | kTailPartition)); @@ -377,6 +388,14 @@ class SolverMixin { if (!naive_parallel) { head_partitions = partition(graph->vertex_weights, num_partition); tail_partitions = partition(graph->vertex_weights, num_partition); + head_partition_size = 0; + tail_partition_size = 0; + for (int i = 0; i < num_partition; i++) { + if (head_partition_size < 
head_partitions[i].size()) + head_partition_size = head_partitions[i].size(); + if (tail_partition_size < tail_partitions[i].size()) + tail_partition_size = tail_partitions[i].size(); + } head_locations.resize(num_vertex); tail_locations.resize(num_vertex); for (int i = 0; i < num_partition; i++) @@ -391,23 +410,59 @@ class SolverMixin { } } - // allocating sample pool is slow + for (auto &&worker : workers) + worker->build(); + + // leave allocation of sample pools last, since it is the most elastic part sample_pools.resize(2); for (auto &&sample_pool : sample_pools) { sample_pool.resize(num_partition); - for (auto &&partition_pool : sample_pool) { + for (auto &&partition_pool : sample_pool) if (naive_parallel) partition_pool.resize(1); else partition_pool.resize(num_partition); - for (auto &&pool : partition_pool) - pool.resize(episode_size * batch_size); + } + int expected_size = episode_size; + if (episode_size == kAuto) { + if (all & kGlobal) + expected_size = float(num_vertex * kSamplePerVertexWithGlobal) / num_partition / batch_size; + else + expected_size = float(num_vertex * kSamplePerVertex) / num_partition / batch_size; + expected_size = std::max(expected_size, 1); + if (num_partition == 1) + // for single partition, we don't need to use very small episode size + expected_size = std::max(expected_size, kMinEpisodeSample / batch_size); + } + while (expected_size > 0) { + try { + for (auto &&sample_pool : sample_pools) { + for (auto &&partition_pool : sample_pool) + for (auto &&pool : partition_pool) + pool.resize(expected_size * batch_size); + } + } catch (const std::bad_alloc &) { + expected_size /= 2; + // free memory + for (auto &&sample_pool : sample_pools) { + for (auto &&partition_pool : sample_pool) + for (auto &&pool : partition_pool) + std::vector().swap(pool); + } + continue; } + break; } + if (expected_size == 0) + LOG(FATAL) << "Out of memory. Try to reduce the size of your graph or the dimension of your embeddings."; + if (episode_size != kAuto) + LOG_IF(WARNING, expected_size < episode_size) + << "Fail to allocate memory for episode size of " << episode_size + << ". Use the maximal possible size instead."; + episode_size = expected_size; + for (auto &&sampler : samplers) sampler->build(); - for (auto &&worker : workers) - worker->build(); } virtual inline std::string name() const { @@ -520,7 +575,7 @@ class SolverMixin { } /** - * @brief Train graph embeddings + * @brief Train embeddings * @param _model model * @param _num_epoch number of epochs, i.e. 
#positive edges / |E| * @param _resume resume training from learned embeddings or not @@ -554,6 +609,7 @@ class SolverMixin { batch_id = 0; } num_batch = batch_id + num_epoch * num_edge / batch_size; + is_train = true; std::vector sample_threads(num_sampler); std::vector worker_threads(num_worker); @@ -597,12 +653,174 @@ class SolverMixin { thread.join(); } + /** + * @brief Predict logits for samples + * @param _samples edge samples + */ + std::vector predict(const std::vector &_samples) { + samples = &_samples; + is_train = false; + + pool_offsets.resize(num_sampler + 1); + for (auto &&sampler_offsets : pool_offsets) { + sampler_offsets.resize(num_partition); + for (auto &&partition_offsets : sampler_offsets) + partition_offsets.resize(num_partition); + } + predict_pool.resize(num_partition); + for (auto &&partition_pool : predict_pool) + partition_pool.resize(num_partition); + sample_indexes.resize(num_partition); + for (auto &&partition_indexes : sample_indexes) + partition_indexes.resize(num_partition); + + std::vector sample_threads(num_sampler + num_worker); + size_t num_sample = samples->size(); + size_t work_load = (num_sample + num_sampler - 1) / num_sampler; + for (int i = 0; i < num_sampler + num_worker; i++) + sample_threads[i] = std::thread(&Sampler::count, samplers[0], work_load * i, + std::min(work_load * (i + 1), num_sample), i); + for (auto &&thread : sample_threads) + thread.join(); + + for (int i = 0; i < num_sampler; i++) + for (int j = 0; j < num_partition; j++) + for (int k = 0; k < num_partition; k++) + pool_offsets[i + 1][j][k] += pool_offsets[i][j][k]; + predict_batch_id = 0; + num_predict_batch = 0; + size_t all_pool = 0; + for (int i = 0; i < num_partition; i++) + for (int j = 0; j < num_partition; j++) { + size_t this_pool_size = pool_offsets[num_sampler][i][j]; + all_pool += this_pool_size; + predict_pool[i][j].resize(this_pool_size); + sample_indexes[i][j].resize(this_pool_size); + num_predict_batch += (this_pool_size + batch_size - 1) / batch_size; + } +// LOG(INFO) << "all pool size = " << all_pool; + +// LOG(INFO) << "start distribute"; + for (int i = 0; i < num_sampler + num_worker; i++) + sample_threads[i] = std::thread(&Sampler::distribute, samplers[0], work_load * i, + std::min(work_load * (i + 1), num_sample), i); + for (auto &&thread : sample_threads) + thread.join(); +// LOG(INFO) << "end distribute"; + +// LOG(INFO) << "start predict"; + results.resize(num_sample); + std::vector worker_threads(num_worker); + auto schedule = get_schedule(); + for (auto &&assignment : schedule) { + for (int i = 0; i < assignment.size(); i++) + worker_threads[i] = std::thread(&Worker::predict, workers[i], assignment[i].first, assignment[i].second); + for (int i = 0; i < assignment.size(); i++) + worker_threads[i].join(); + } +// LOG(INFO) << "end predict"; + + decltype(predict_pool)().swap(predict_pool); + decltype(sample_indexes)().swap(sample_indexes); + decltype(pool_offsets)().swap(pool_offsets); + + return results; + } + + /** + * @brief Predict logits for samples + * @param _samples ndarray of edge samples, with shape (?, kSampleSize) + */ + py::array_t predict_numpy(const py::array_t &_array) { + if (_array.ndim() != 2 || _array.shape(1) != kSampleSize) { + std::stringstream ss; + ss << _array.shape(0); + for (int i = 1; i < _array.ndim(); i++) + ss << ", " << _array.shape(i); + LOG(FATAL) << "Expect an array with shape (?, " << kSampleSize + << "), but shape (" << ss.str() << ") is found"; + } +// LOG(INFO) << "begin predict numpy"; + array = &_array; + 
is_train = false; + + pool_offsets.resize(num_sampler + 1); + for (auto &&sampler_offsets : pool_offsets) { + sampler_offsets.resize(num_partition); + for (auto &&partition_offsets : sampler_offsets) + partition_offsets.resize(num_partition); + } + predict_pool.resize(num_partition); + for (auto &&partition_pool : predict_pool) + partition_pool.resize(num_partition); + sample_indexes.resize(num_partition); + for (auto &&partition_indexes : sample_indexes) + partition_indexes.resize(num_partition); + + std::vector sample_threads(num_sampler + num_worker); + size_t num_sample = array->shape(0); + size_t work_load = (num_sample + num_sampler - 1) / num_sampler; +// LOG(INFO) << "begin count"; + for (int i = 0; i < num_sampler + num_worker; i++) + sample_threads[i] = std::thread(&Sampler::count_numpy, samplers[0], work_load * i, + std::min(work_load * (i + 1), num_sample), i); + for (auto &&thread : sample_threads) + thread.join(); +// LOG(INFO) << "end count"; + + for (int i = 0; i < num_sampler; i++) + for (int j = 0; j < num_partition; j++) + for (int k = 0; k < num_partition; k++) + pool_offsets[i + 1][j][k] += pool_offsets[i][j][k]; + predict_batch_id = 0; + num_predict_batch = 0; + size_t all_pool = 0; + for (int i = 0; i < num_partition; i++) + for (int j = 0; j < num_partition; j++) { + size_t this_pool_size = pool_offsets[num_sampler][i][j]; + all_pool += this_pool_size; + predict_pool[i][j].resize(this_pool_size); + sample_indexes[i][j].resize(this_pool_size); + num_predict_batch += (this_pool_size + batch_size - 1) / batch_size; + } + +// LOG(INFO) << "begin distribute"; + for (int i = 0; i < num_sampler + num_worker; i++) + sample_threads[i] = std::thread(&Sampler::distribute_numpy, samplers[0], work_load * i, + std::min(work_load * (i + 1), num_sample), i); + for (auto &&thread : sample_threads) + thread.join(); +// LOG(INFO) << "end distribute"; + + results.resize(num_sample); + std::vector worker_threads(num_worker); + auto schedule = get_schedule(); +// LOG(INFO) << "begin predict kernel"; + for (auto &&assignment : schedule) { + for (int i = 0; i < assignment.size(); i++) + worker_threads[i] = std::thread(&Worker::predict, workers[i], assignment[i].first, assignment[i].second); + for (int i = 0; i < assignment.size(); i++) + worker_threads[i].join(); + } +// LOG(INFO) << "end predict kernel"; + + decltype(predict_pool)().swap(predict_pool); + decltype(sample_indexes)().swap(sample_indexes); + decltype(pool_offsets)().swap(pool_offsets); +// LOG(INFO) << "end predict numpy"; + + py::array_t _results(num_sample); + memcpy(_results.mutable_data(), results.data(), num_sample * sizeof(Float)); + return _results; + } + /** Free CPU and GPU memory, except the embeddings on CPU */ - void clear() { + virtual void clear() { decltype(moments)().swap(moments); decltype(head_partitions)().swap(head_partitions); decltype(tail_partitions)().swap(tail_partitions); decltype(sample_pools)().swap(sample_pools); + edge_table.clear(); for (auto &&sampler : samplers) sampler->clear(); for (auto &&worker : workers) @@ -621,7 +839,7 @@ class SolverMixin { */ static size_t gpu_memory_demand(const std::vector &protocols, const std::vector &shapes, Protocol sampler_protocol = kTailPartition, int num_moment = 0, - int num_partition = 4, int num_negative = 1, int batch_size = 100000) { + int num_partition = 4, int num_negative = 1, int batch_size = 10000) { auto partition_shapes = shapes; int num_embedding = protocols.size(); Index num_vertex; @@ -654,6 +872,10 @@ class SolverMixin { return demand; } + 
static size_t cpu_memory_demand() { + size_t demand = 0; + } + private: /** * @brief Generate partition for nodes s.t. each partition has similar sum of weights. @@ -683,9 +905,12 @@ class SolverMixin { * @tparam _Solver type of graph embedding solver * @tparam _Attributes types of additional edge attributes * - * @note To add a new sampler, you need to + * The sampler class is an abstract interface of CPU routines on a family of graph embeddings. + * Multiple samplers generate and prepare edge samples in parallel, under the schedule of the solver. + * + * @note To add a new sampler class, you need to * - derive a template sampler class from SamplerMixin - * - implement all virtual functions for that class + * - implement all virtual functions for that class in instance/*.cuh * - bind that class to a solver as a template parameter */ template @@ -699,12 +924,15 @@ class SamplerMixin { typedef std::tuple EdgeSample; typedef typename Solver::Graph::Edge Edge; + static const int kSampleSize = sizeof(EdgeSample) / sizeof(Index); + static_assert(sizeof(EdgeSample) % sizeof(Index) == 0, "sizeof(EdgeSample) must be a multiplier of sizeof(Index)"); + Solver *solver; int device_id; cudaStream_t stream; Memory random; curandGenerator_t generator; - int pool_size; + int num_partition, pool_size; #define USING_SAMPLER_MIXIN(type) \ using typename type::Solver; \ @@ -717,6 +945,7 @@ class SamplerMixin { using type::stream; \ using type::random; \ using type::generator; \ + using type::num_partition; \ using type::pool_size /** @@ -744,14 +973,15 @@ class SamplerMixin { void build() { CUDA_CHECK(cudaSetDevice(device_id)); - pool_size = solver->sample_pools[0][0][0].size(); + num_partition = solver->num_partition; + pool_size = solver->episode_size * solver->batch_size; random.resize(kRandBatchSize); CURAND_CHECK(curandGenerateUniformDouble(generator, random.device_ptr, kRandBatchSize)); } /** Free GPU memory */ void clear() { - random.resize(0); + random.reallocate(0); } /** Sample edges for naive parallel. This function can be parallelized. 
*/ @@ -766,7 +996,7 @@ class SamplerMixin { std::vector heads(solver->sample_batch_size); std::vector tails(solver->sample_batch_size); std::vector attributes(solver->sample_batch_size); - while (partition_id < solver->num_partition) { + while (partition_id < num_partition) { for (int i = 0; i < solver->sample_batch_size; i++) { if (rand_id > kRandBatchSize - 2) { random.to_host(); @@ -783,7 +1013,7 @@ class SamplerMixin { auto &pool = sample_pool[partition_id][0]; pool[offset] = std::tuple_cat(std::tie(heads[i], tails[i]), attributes[i]); if (++offset == end) { - if (++partition_id == solver->num_partition) + if (++partition_id == num_partition) return; offset = start; } @@ -799,14 +1029,14 @@ class SamplerMixin { CURAND_CHECK(curandGenerateUniformDouble(generator, random.device_ptr, kRandBatchSize)); auto &sample_pool = solver->sample_pools[solver->pool_id ^ 1]; - std::vector> offsets(solver->num_partition); + std::vector> offsets(num_partition); for (auto &&partition_offsets : offsets) - partition_offsets.resize(solver->num_partition, start); + partition_offsets.resize(num_partition, start); int num_complete = 0, rand_id = 0; std::vector> heads(solver->sample_batch_size); std::vector> tails(solver->sample_batch_size); std::vector attributes(solver->sample_batch_size); - while (num_complete < solver->num_partition * solver->num_partition) { + while (num_complete < num_partition * num_partition) { for (int i = 0; i < solver->sample_batch_size; i++) { if (rand_id > kRandBatchSize - 2) { random.to_host(); @@ -837,6 +1067,89 @@ class SamplerMixin { } } + /** Count edges for each sample block. This function can be parallelized. */ + void count(size_t start, size_t end, int id) { + auto &offsets = solver->pool_offsets[id + 1]; + for (size_t i = start; i < end; i++) { + Index head_global_id = std::get<0>((*solver->samples)[i]); + Index tail_global_id = std::get<1>((*solver->samples)[i]); + int head_partition_id = solver->head_locations[head_global_id].first; + int tail_partition_id = solver->tail_locations[tail_global_id].first; + offsets[head_partition_id][tail_partition_id]++; + } + } + + /** Count edges for each sample block. This function can be parallelized. */ + void count_numpy(size_t start, size_t end, int id) { + auto &offsets = solver->pool_offsets[id + 1]; + auto array = solver->array->unchecked(); + for (size_t i = start; i < end; i++) { + Index head_global_id = array(i, 0); + Index tail_global_id = array(i, 1); + int head_partition_id = solver->head_locations[head_global_id].first; + int tail_partition_id = solver->tail_locations[tail_global_id].first; + offsets[head_partition_id][tail_partition_id]++; + } + } + + /** Distribute edges to the sample pool. This function can be parallelized. 
*/ + void distribute(size_t start, size_t end, int id) { + auto &offsets = solver->pool_offsets[id]; + + for (size_t i = start; i < end; i++) { + Index head_global_id = std::get<0>((*solver->samples)[i]); + Index tail_global_id = std::get<1>((*solver->samples)[i]); + std::pair head = solver->head_locations[head_global_id]; + std::pair tail = solver->tail_locations[tail_global_id]; + int head_partition_id = head.first; + int tail_partition_id = tail.first; + Index head_local_id = head.second; + Index tail_local_id = tail.second; + + auto &pool = solver->predict_pool[head_partition_id][tail_partition_id]; + auto &indexes = solver->sample_indexes[head_partition_id][tail_partition_id]; + size_t &offset = offsets[head_partition_id][tail_partition_id]; + EdgeSample sample = (*solver->samples)[i]; + std::get<0>(sample) = head_local_id; + std::get<1>(sample) = tail_local_id; + + pool[offset] = sample; + indexes[offset] = i; + offset++; + } + } + + /** Distribute edges to the sample pool. This function can be parallelized. */ + void distribute_numpy(size_t start, size_t end, int id) { + auto &offsets = solver->pool_offsets[id]; + auto array = solver->array->unchecked(); + + for (size_t i = start; i < end; i++) { + Index head_global_id = array(i, 0); + Index tail_global_id = array(i, 1); + std::pair head = solver->head_locations[head_global_id]; + std::pair tail = solver->tail_locations[tail_global_id]; + int head_partition_id = head.first; + int tail_partition_id = tail.first; + Index head_local_id = head.second; + Index tail_local_id = tail.second; + + auto &pool = solver->predict_pool[head_partition_id][tail_partition_id]; + auto &indexes = solver->sample_indexes[head_partition_id][tail_partition_id]; + size_t &offset = offsets[head_partition_id][tail_partition_id]; + EdgeSample sample; + Index *_sample = reinterpret_cast(&sample); + for (int j = 0; j < kSampleSize; j++) + _sample[j] = array(i, kSampleSize - j - 1); + std::get<0>(sample) = head_local_id; + std::get<1>(sample) = tail_local_id; + + pool[offset] = sample; + indexes[offset] = i; + offset++; + } + } + /** @return GPU memory cost */ static size_t gpu_memory_demand() { size_t demand = 0; @@ -846,13 +1159,26 @@ class SamplerMixin { }; /** - * General interface for training workers + * General interface for workers * @tparam _Solver type of graph embedding solver * - * @note To add a new worker, you need to + * The worker class is an abstract interface of GPU routines on a family of graph embeddings. + * Multiple workers compute and update embeddings in parallel, under the schedule of the solver. + * + * The computation routine and the model are implemented separately to facilitate development and maintenance. + * They are binded at compile time to ensure minimal run-time cost. + * + * Generally, the routine and the model should contain model-agnostic and model-specific computation respectively. 
+ * + * @note To add a new worker class, you need to * - derive a template worker class from WorkerMixin - * - implement all virtual functions for that class + * - implement all virtual functions for that class in instance/*.cuh * - bind that class to a solver as a template parameter + * + * @note To add a new routine or model, you may need to + * - implement a GPU kernel for the routine in instance/gpu/*.cuh + * - implement a template model class in instance/model/*.cuh + * - bind the GPU kernel and the model class in train_dispatch() or predict_dispatch() */ template class WorkerMixin { @@ -863,8 +1189,7 @@ class WorkerMixin { typedef typename Solver::Vector Vector; typedef typename Solver::EdgeSample EdgeSample; - static const int sample_size = sizeof(EdgeSample) / sizeof(Index); - static_assert(sizeof(EdgeSample) % sizeof(Index) == 0, "sizeof(EdgeSample) must be a multiplier of sizeof(Index)"); + static const int kSampleSize = Solver::kSampleSize; Solver *solver; int device_id; @@ -879,9 +1204,7 @@ class WorkerMixin { Protocol sampler_protocol; AliasTable negative_sampler; Memory batch, negative_batch; -#ifdef USE_LOSS - Memory loss; -#endif + Memory logits, loss; Memory random; curandGenerator_t generator; int num_moment; @@ -904,6 +1227,8 @@ class WorkerMixin { using type::negative_sampler; \ using type::batch; \ using type::negative_batch; \ + using type::logits; \ + using type::loss; \ using type::batch_size; \ using type::embeddings; \ using type::moments; \ @@ -918,11 +1243,7 @@ class WorkerMixin { */ WorkerMixin(Solver *_solver, int _device_id) : solver(_solver), device_id(_device_id), negative_sampler(device_id), - batch(device_id), negative_batch(device_id), -#ifdef USE_LOSS - loss(device_id), -#endif - random(device_id) { + batch(device_id), negative_batch(device_id), logits(device_id), loss(device_id), random(device_id) { CUDA_CHECK(cudaSetDevice(device_id)); CUDA_CHECK(cudaStreamCreate(&work_stream)); @@ -931,9 +1252,8 @@ class WorkerMixin { // work stream batch.stream = work_stream; negative_batch.stream = work_stream; -#ifdef USE_LOSS + logits.stream = work_stream; loss.stream = work_stream; -#endif // sample stream negative_sampler.stream = sample_stream; random.stream = sample_stream; @@ -947,8 +1267,11 @@ class WorkerMixin { WorkerMixin &operator=(const WorkerMixin &) = delete; - /** Should call the corresponding GPU kernel */ - virtual bool kernel_dispatch() = 0; + /** Should call the corresponding training kernel */ + virtual bool train_dispatch() = 0; + + /** Should call the corresponding prediction kernel */ + virtual bool predict_dispatch() = 0; /** Build the alias table for negative sampling */ virtual void build_negative_sampler() { @@ -981,18 +1304,37 @@ class WorkerMixin { gradients.resize(num_embedding); moments.resize(num_embedding); for (int i = 0; i < num_embedding; i++) { + Protocol protocol = protocols[i]; + Index size = solver->embeddings[i]->size(); + if (protocol & kHeadPartition) + size = solver->head_partition_size; + if (protocol & kTailPartition) + size = solver->tail_partition_size; + if (protocol & kSharedWithPredecessor && solver->num_partition == 1) + size = 0; embeddings[i] = std::make_shared>(device_id, 0, work_stream); - gradients[i] = std::make_shared>(device_id, 0, work_stream); + embeddings[i]->reallocate(size); + gradients[i] = std::make_shared>(-1, 0, work_stream); + if (!(protocol & kInPlace)) + gradients[i]->reallocate(size); moments[i] = std::make_shared>>(); - for (int j = 0; j < num_moment; j++) + for (int j = 0; j < num_moment; 
j++) { moments[i]->push_back(Memory(device_id, 0, work_stream)); + (*moments[i])[j].reallocate(size); + } } + Index size = 0; + if (sampler_protocol & kHeadPartition) + size += solver->head_partition_size; + if (sampler_protocol & kTailPartition) + size += solver->tail_partition_size; + if (sampler_protocol & kGlobal) + size += solver->num_vertex; + negative_sampler.reallocate(size); - batch.resize(batch_size * sample_size); + batch.resize(batch_size * kSampleSize); negative_batch.resize(batch_size * num_negative); -#ifdef USE_LOSS loss.resize(batch_size); -#endif random.resize(batch_size * num_negative * 2); head_partition_id = -1; @@ -1004,21 +1346,23 @@ class WorkerMixin { embeddings.clear(); gradients.clear(); moments.clear(); - batch.resize(0); - negative_batch.resize(0); -#ifdef USE_LOSS - loss.resize(0); -#endif - random.resize(0); + batch.reallocate(0); + negative_batch.reallocate(0); + loss.reallocate(0); + random.reallocate(0); negative_sampler.clear(); } - /** Load an embedding matrix to GPU, along with its gradients and moment statistics */ - void load_partition_one(int id) { + /** + * @brief Load an embedding matrix and its moment matrices to GPU cache + * @param id id of embedding matrix + * + * The actual operation depends on the protocol. It is safe to call this function multiple times. + */ + void load_embedding(int id) { Protocol protocol = protocols[id]; if ((protocol & kSharedWithPredecessor) && ((protocol & kGlobal) || head_partition_id == tail_partition_id)) { embeddings[id] = embeddings[id - 1]; - gradients[id] = gradients[id - 1]; moments[id] = moments[id - 1]; return; } @@ -1043,20 +1387,24 @@ class WorkerMixin { embedding.gather(global_embedding, *mapping); embedding.to_device_async(); + // only load partitioned moments, or global moments if uninitialized - if (!(protocol & kGlobal) || (num_moment && moment[0].count == 0)) - for (int i = 0; i < num_moment; i++) { - moment[i].gather(global_moment[i], *mapping); - moment[i].to_device_async(); + if (solver->is_train) + if (!(protocol & kGlobal) || (num_moment && moment[0].count == 0)) { + for (int i = 0; i < num_moment; i++) { + moment[i].gather(global_moment[i], *mapping); + moment[i].to_device_async(); + } } - if (!(protocol & kInPlace)) { - gradient.fill(0, embedding.count); - gradient.to_device_async(); - } } - /** Write back an embedding matrix from GPU, along with its gradients and moment statistics */ - void write_back_one(int id) { + /** + * @brief Write back an embedding matrix and its moment matrices from GPU cache + * @param id id of embedding matrix + * + * The actual operation depends on the protocol. It is safe to call this function multiple times. + */ + void write_embedding(int id) { Protocol protocol = protocols[id]; if (id > 0 && embeddings[id] == embeddings[id - 1]) return; @@ -1077,13 +1425,15 @@ class WorkerMixin { embedding.scatter(global_embedding, *mapping); } else { - gradient.to_host(); - for (Index i = 0; i < gradient.count; i++) - gradient[i] /= solver->num_worker; + gradient.copy(embedding); + embedding.to_host(); + for (Index i = 0; i < embedding.count; i++) + gradient[i] -= embedding[i]; gradient.scatter_sub(global_embedding, *mapping); } + // only write back partitioned moments - if (!(protocol & kGlobal)) + if (solver->is_train && !(protocol & kGlobal)) for (int i = 0; i < num_moment; i++) { moment[i].to_host(); moment[i].scatter(global_moment[i], *mapping); @@ -1091,7 +1441,7 @@ class WorkerMixin { } /** - * @brief Load a partition of the sample pool. 
Update the cache automatically + * @brief Load a partition of the sample pool. Update the cache automatically. * @param _head_partition_id id of head partition * @param _tail_partition_id id of tail partition */ @@ -1106,7 +1456,6 @@ class WorkerMixin { // check swap hit if (head_partition_id == _tail_partition_id && tail_partition_id == _head_partition_id) { embeddings[i].swap(embeddings[i - 1]); - gradients[i].swap(gradients[i - 1]); moments[i].swap(moments[i - 1]); hit[i] = true; hit[i - 1] = true; @@ -1119,9 +1468,11 @@ class WorkerMixin { hit[i] = hit[i] || ((protocol & kTailPartition) && tail_partition_id == _tail_partition_id); } } - for (int i = 0; i < num_embedding; i++) - if (!hit[i]) - write_back_one(i); + // we don't need to write back during prediction + if (solver->is_train) + for (int i = 0; i < num_embedding; i++) + if (!hit[i]) + write_embedding(i); } bool sampler_hit = (sampler_protocol & kGlobal) && !cold_cache; sampler_hit = sampler_hit || ((sampler_protocol & kHeadPartition) && (sampler_protocol & kTailPartition) @@ -1132,7 +1483,7 @@ class WorkerMixin { sampler_hit = sampler_hit || ((sampler_protocol & (kHeadPartition | kTailPartition)) == kTailPartition && tail_partition_id == _tail_partition_id); - // load cache + // load partition mappings if (head_partition_id != _head_partition_id) { head_partition_id = _head_partition_id; if (!solver->naive_parallel) { @@ -1153,20 +1504,20 @@ class WorkerMixin { } for (int i = 0; i < num_embedding; i++) if (!hit[i]) - load_partition_one(i); + load_embedding(i); } - /** Write back all cache */ + /** Write back all embeddings and their moment matrices from GPU cache */ void write_back() { bool cold_cache = head_partition_id == -1 || tail_partition_id == -1; if (cold_cache) return; for (int i = 0; i < num_embedding; i++) - write_back_one(i); + write_embedding(i); } /** - * @brief Train embeddings with samples in the partition + * @brief Train embeddings with samples in the sample block * @param _head_partition_id id of head partition * @param _tail_partition_id id of tail partition */ @@ -1178,14 +1529,14 @@ class WorkerMixin { log_frequency = solver->log_frequency; for (int i = 0; i < solver->positive_reuse; i++) for (int j = 0; j < solver->episode_size; j++) { - memcpy(batch.host_ptr, &samples[j * batch_size], batch_size * sample_size * sizeof(Index)); + batch.copy(&samples[j * batch_size], batch_size * kSampleSize); train_batch(solver->batch_id++); } } /** Train a single batch */ virtual void train_batch(int batch_id) { - Timer batch_timer("Batch", log_frequency); + Timer batch_timer("Train Batch", log_frequency); if (batch_id % log_frequency == 0) LOG(INFO) << "Batch id: " << batch_id << " / " << solver->num_batch; batch.to_device_async(); @@ -1200,7 +1551,6 @@ class WorkerMixin { } CUDA_CHECK(cudaStreamSynchronize(sample_stream)); } -#ifdef USE_LOSS // Loss (last batch) if (batch_id % log_frequency == 0){ Timer timer("Loss", log_frequency); @@ -1210,13 +1560,51 @@ class WorkerMixin { batch_loss += loss[i]; LOG(INFO) << "loss = " << batch_loss / batch_size; } -#endif // Train { - Timer timer("Train", log_frequency); + Timer timer("Train Kernel", log_frequency); optimizer.apply_schedule(batch_id, solver->num_batch); - CHECK(kernel_dispatch()) - << "Can't find a kernel implementation of `" << solver->model << " with " << optimizer.type; + CHECK(train_dispatch()) + << "Can't find a training kernel for `" << solver->model << "` with " << optimizer.type; + } + } + + /** + * @brief Predict on samples in the sample block + * @param 
_head_partition_id id of head partition + * @param _tail_partition_id id of tail partition + */ + virtual void predict(int _head_partition_id, int _tail_partition_id) { + CUDA_CHECK(cudaSetDevice(device_id)); + load_partition(_head_partition_id, _tail_partition_id); + + auto &samples = solver->predict_pool[head_partition_id][tail_partition_id]; + auto &indexes = solver->sample_indexes[head_partition_id][tail_partition_id]; + log_frequency = solver->log_frequency; + size_t num_sample = samples.size(); + int num_batch = (num_sample + batch_size - 1) / batch_size; + for (size_t i = 0; i < num_batch; i++) { + int actual_size = std::min(size_t(batch_size), num_sample - i * batch_size); + batch.copy(&samples[i * batch_size], actual_size * kSampleSize); + logits.resize(actual_size); + predict_batch(solver->predict_batch_id++); + for (int j = 0; j < actual_size; j++) { + size_t index = indexes[i * batch_size + j]; + solver->results[index] = logits[j]; + } + } + logits.reallocate(0); + } + + /** Predict a single batch */ + virtual void predict_batch(int batch_id) { + Timer batch_timer("Predict Batch", log_frequency); + batch.to_device_async(); + // Predict + { + Timer timer("Predict Kernel", log_frequency); + CHECK(predict_dispatch()) << "Can't find a prediction kernel for `" << solver->model << "`"; + logits.to_host(); } } @@ -1237,14 +1625,10 @@ class WorkerMixin { for (int i = 0; i < num_embedding; i++) { Protocol protocol = protocols[i]; demand += Memory::gpu_memory_demand(shapes[i]) * (num_moment + 1); - if (!(protocol & kInPlace)) - demand += Memory::gpu_memory_demand(shapes[i]); } - demand += decltype(batch)::gpu_memory_demand(batch_size * sample_size); + demand += decltype(batch)::gpu_memory_demand(batch_size * kSampleSize); demand += decltype(negative_batch)::gpu_memory_demand(batch_size * num_negative); -#ifdef USE_LOSS demand += decltype(loss)::gpu_memory_demand(batch_size); -#endif demand += decltype(random)::gpu_memory_demand(batch_size * num_negative * 2); demand += decltype(negative_sampler)::gpu_memory_demand(sampler_size); return demand; diff --git a/include/gpu/knowledge_graph.cuh b/include/gpu/knowledge_graph.cuh deleted file mode 100644 index 3d28b2d..0000000 --- a/include/gpu/knowledge_graph.cuh +++ /dev/null @@ -1,2266 +0,0 @@ -/** - * Copyright 2019 MilaGraph. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * @author Zhaocheng Zhu - */ - -#pragma once - -#include "base/memory.h" -#include "core/optimizer.h" -#include "util/gpu.cuh" - -namespace graphvite { -namespace gpu { - -namespace transe { - -/** - * @brief Train TransE with 0-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory batch, Memory negative_batch, - Optimizer optimizer, float margin, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function(); - - __shared__ graphvite::Vector buffer[kThreadPerBlock / kWarpSize]; - - graphvite::Vector &sign = buffer[threadIdx.x / kWarpSize]; - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += abs(head[i] + relation[i] - tail[i]); - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float y = head[i] + relation[i] - tail[i]; - sign[i] = y > 0; - x += abs(y); - } - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - Float prob = x > 0 ? 
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float s = sign[i] ? 1 : -1; - head[i] -= (optimizer.*update)(head[i], -gradient * s, weight); - tail[i] -= (optimizer.*update)(tail[i], gradient * s, weight); - Float relation_update = (optimizer.*update)(relation[i], -gradient * s, weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train TransE with 1-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_1_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float margin, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function_1_moment(); - - __shared__ graphvite::Vector buffer[kThreadPerBlock / kWarpSize]; - - graphvite::Vector &sign = buffer[threadIdx.x / kWarpSize]; - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += abs(head[i] + relation[i] - tail[i]); - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index 
tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float y = head[i] + relation[i] - tail[i]; - sign[i] = y > 0; - x += abs(y); - } - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - Float prob = x > 0 ? 1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float s = sign[i] ? 1 : -1; - head[i] -= (optimizer.*update)(head[i], -gradient * s, head_moment1[i], weight); - tail[i] -= (optimizer.*update)(tail[i], gradient * s, tail_moment1[i], weight); - Float relation_update = (optimizer.*update)(relation[i], -gradient * s, - relation_moment1[i], weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train TransE with 2-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_2_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, Memory head_moment2s, - Memory tail_moment2s, Memory relation_moment2s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float margin, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function_2_moment(); - - __shared__ graphvite::Vector buffer[kThreadPerBlock / kWarpSize]; - - graphvite::Vector &sign = buffer[threadIdx.x / kWarpSize]; - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_moment2 = relation_moment2s[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; 
- if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += abs(head[i] + relation[i] - tail[i]); - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &head_moment2 = head_moment2s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - Vector &tail_moment2 = tail_moment2s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float y = head[i] + relation[i] - tail[i]; - sign[i] = y > 0; - x += abs(y); - } - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - Float prob = x > 0 ? 1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float s = sign[i] ? 
1 : -1; - head[i] -= (optimizer.*update)(head[i], -gradient * s, head_moment1[i], head_moment2[i], weight); - tail[i] -= (optimizer.*update)(tail[i], gradient * s, tail_moment1[i], tail_moment2[i], weight); - Float relation_update = (optimizer.*update)(relation[i], -gradient * s, - relation_moment1[i], relation_moment2[i], weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} -} // namespace transe - -namespace distmult { - -/** - * @brief Train DistMult with 0-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory batch, Memory negative_batch, - Optimizer optimizer, float l3_regularization, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - l3_regularization *= 3; - - auto update = get_update_function(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += head[i] * relation[i] * tail[i]; - x = WarpBroadcast(WarpReduce(x), 0); - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += head[i] * relation[i] * tail[i]; - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float h = head[i]; - Float r = relation[i]; - Float t = tail[i]; - head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, weight); - tail[i] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, weight); - Float relation_update = (optimizer.*update) - (r, gradient * h * t + l3_regularization * abs(r) * r, weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train DistMult with 1-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_1_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float l3_regularization, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - l3_regularization *= 3; - - auto update = get_update_function_1_moment(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += head[i] * relation[i] * tail[i]; - x = WarpBroadcast(WarpReduce(x), 0); - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - 
Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += head[i] * relation[i] * tail[i]; - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float h = head[i]; - Float r = relation[i]; - Float t = tail[i]; - head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, - head_moment1[i], weight); - tail[i] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, - tail_moment1[i], weight); - Float relation_update = (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, - relation_moment1[i], weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train DistMult with 2-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_2_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, Memory head_moment2s, - Memory tail_moment2s, Memory relation_moment2s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float l3_regularization, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - l3_regularization *= 3; - - auto update = get_update_function_2_moment(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_moment2 = relation_moment2s[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if 
(adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += head[i] * relation[i] * tail[i]; - x = WarpBroadcast(WarpReduce(x), 0); - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &head_moment2 = head_moment2s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - Vector &tail_moment2 = tail_moment2s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += head[i] * relation[i] * tail[i]; - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float h = head[i]; - Float r = relation[i]; - Float t = tail[i]; - head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, - head_moment1[i], head_moment2[i], weight); - tail[i] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, - tail_moment1[i], tail_moment2[i], weight); - Float relation_update = (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, - relation_moment1[i], relation_moment2[i], weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} -} // namespace distmult - -namespace complex { - -/** - * @brief Train ComplEx with 0-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory batch, Memory negative_batch, - Optimizer optimizer, float l3_regularization, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim / 2; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * 
blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - l3_regularization *= 3; - - auto update = get_update_function(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_re = relation[i * 2]; - Float relation_im = relation[i * 2 + 1]; - Float product_re = head_re * relation_re - head_im * relation_im; - Float product_im = head_re * relation_im + head_im * relation_re; - x += product_re * tail_re + product_im * tail_im; - } - x = WarpBroadcast(WarpReduce(x), 0); - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_re = relation[i * 2]; - Float relation_im = relation[i * 2 + 1]; - Float product_re = head_re * relation_re - head_im * relation_im; - Float product_im = head_re * relation_im + head_im * relation_re; - x += product_re * tail_re + product_im * tail_im; - } - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_re = relation[i * 2]; - Float relation_im = relation[i * 2 + 1]; - // head - Float head_re_grad = gradient * (relation_re * tail_re + relation_im * tail_im); - Float head_im_grad = gradient * (-relation_im * tail_re + relation_re * tail_im); - head[i * 2] -= (optimizer.*update) - (head_re, head_re_grad + l3_regularization * abs(head_re) * head_re, weight); - head[i * 2 + 1] -= (optimizer.*update) - (head_im, head_im_grad + l3_regularization * abs(head_im) * head_im, weight); - // tail - Float tail_re_grad = gradient * (head_re * relation_re - head_im * relation_im); - Float tail_im_grad = gradient * (head_re * relation_im + head_im * relation_re); - tail[i * 2] -= (optimizer.*update) - (tail_re, tail_re_grad + l3_regularization * abs(tail_re) * tail_re, weight); - tail[i * 2 + 1] -= (optimizer.*update) - (tail_im, tail_im_grad + l3_regularization * abs(tail_im) * tail_im, weight); - // relation - Float relation_re_grad = gradient * (head_re * tail_re + head_im * tail_im); - Float relation_im_grad = gradient * (-head_im * tail_re + head_re * tail_im); - Float relation_re_update = (optimizer.*update) - (relation_re, relation_re_grad + l3_regularization * abs(relation_re) * relation_re, weight); - Float relation_im_update = (optimizer.*update) - (relation_im, relation_im_grad + l3_regularization * abs(relation_im) * relation_im, weight); - relation[i * 2] -= relation_re_update; - relation[i * 2 + 1] -= relation_im_update; - relation_gradient[i * 2] += relation_re_update; - relation_gradient[i * 2 + 1] += relation_im_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train ComplEx with 1-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_1_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float l3_regularization, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim / 2; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - l3_regularization *= 3; - - auto update = get_update_function_1_moment(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += 
num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_re = relation[i * 2]; - Float relation_im = relation[i * 2 + 1]; - Float product_re = head_re * relation_re - head_im * relation_im; - Float product_im = head_re * relation_im + head_im * relation_re; - x += product_re * tail_re + product_im * tail_im; - } - x = WarpBroadcast(WarpReduce(x), 0); - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_re = relation[i * 2]; - Float relation_im = relation[i * 2 + 1]; - Float product_re = head_re * relation_re - head_im * relation_im; - Float product_im = head_re * relation_im + head_im * relation_re; - x += product_re * tail_re + product_im * tail_im; - } - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_re = relation[i * 2]; - Float relation_im = relation[i * 2 + 1]; - // head - Float head_re_grad = gradient * (relation_re * tail_re + relation_im * tail_im); - Float head_im_grad = gradient * (-relation_im * tail_re + relation_re * tail_im); - head[i * 2] -= (optimizer.*update)(head_re, head_re_grad + l3_regularization * abs(head_re) * head_re, - head_moment1[i * 2], weight); - head[i * 2 + 1] -= (optimizer.*update) - (head_im, head_im_grad + l3_regularization * abs(head_im) * head_im, - head_moment1[i * 2 + 1], weight); - // tail - Float tail_re_grad = gradient * (head_re * relation_re - head_im * relation_im); - Float tail_im_grad = gradient * (head_re * relation_im + head_im * relation_re); - tail[i * 2] -= (optimizer.*update)(tail_re, tail_re_grad + l3_regularization * abs(tail_re) * tail_re, - tail_moment1[i * 2], weight); - tail[i * 2 + 1] -= (optimizer.*update) - (tail_im, tail_im_grad + l3_regularization * abs(tail_im) * tail_im, - tail_moment1[i * 2 + 1], weight); - // relation - Float relation_re_grad = gradient * (head_re * tail_re + head_im * tail_im); - Float relation_im_grad = gradient * (-head_im * tail_re + head_re * tail_im); - Float relation_re_update = (optimizer.*update) - (relation_re, relation_re_grad + l3_regularization * abs(relation_re) * relation_re, - relation_moment1[i], weight); - Float relation_im_update = (optimizer.*update) - (relation_im, relation_im_grad + l3_regularization * abs(relation_im) * relation_im, - relation_moment1[i], weight); - relation[i * 2] -= relation_re_update; - relation[i * 2 + 1] -= relation_im_update; - relation_gradient[i * 2] += relation_re_update; - relation_gradient[i * 2 + 1] += relation_im_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train ComplEx with 2-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_2_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, Memory head_moment2s, - Memory tail_moment2s, Memory relation_moment2s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float l3_regularization, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim / 2; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int 
num_negative = negative_batch.count / batch_size; - l3_regularization *= 3; - - auto update = get_update_function_2_moment(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_moment2 = relation_moment2s[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_re = relation[i * 2]; - Float relation_im = relation[i * 2 + 1]; - Float product_re = head_re * relation_re - head_im * relation_im; - Float product_im = head_re * relation_im + head_im * relation_re; - x += product_re * tail_re + product_im * tail_im; - } - x = WarpBroadcast(WarpReduce(x), 0); - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &head_moment2 = head_moment2s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - Vector &tail_moment2 = tail_moment2s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_re = relation[i * 2]; - Float relation_im = relation[i * 2 + 1]; - Float product_re = head_re * relation_re - head_im * relation_im; - Float product_im = head_re * relation_im + head_im * relation_re; - x += product_re * tail_re + product_im * tail_im; - } - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_re = relation[i * 2]; - Float relation_im = relation[i * 2 + 1]; - // head - Float head_re_grad = gradient * (relation_re * tail_re + relation_im * tail_im); - Float head_im_grad = gradient * (-relation_im * tail_re + relation_re * tail_im); - head[i * 2] -= (optimizer.*update)(head_re, head_re_grad + l3_regularization * abs(head_re) * head_re, - head_moment1[i * 2], head_moment2[i * 2], weight); - head[i * 2 + 1] -= (optimizer.*update) - (head_im, head_im_grad + l3_regularization * abs(head_im) * head_im, - head_moment1[i * 2 + 1], head_moment2[i * 2 + 1], weight); - // tail - Float tail_re_grad = gradient * (head_re * relation_re - head_im * relation_im); - Float tail_im_grad = gradient * (head_re * relation_im + head_im * relation_re); - tail[i * 2] -= (optimizer.*update)(tail_re, tail_re_grad + l3_regularization * abs(tail_re) * tail_re, - tail_moment1[i * 2], tail_moment2[i * 2], weight); - tail[i * 2 + 1] -= (optimizer.*update) - (tail_im, tail_im_grad + l3_regularization * abs(tail_im) * tail_im, - tail_moment1[i * 2 + 1], tail_moment2[i * 2 + 1], weight); - // relation - Float relation_re_grad = gradient * (head_re * tail_re + head_im * tail_im); - Float relation_im_grad = gradient * (-head_im * tail_re + head_re * tail_im); - Float relation_re_update = (optimizer.*update) - (relation_re, relation_re_grad + l3_regularization * abs(relation_re) * relation_re, - relation_moment1[i], relation_moment2[i], weight); - Float relation_im_update = (optimizer.*update) - (relation_im, relation_im_grad + l3_regularization * abs(relation_im) * relation_im, - relation_moment1[i], relation_moment2[i], weight); - relation[i * 2] -= relation_re_update; - relation[i * 2 + 1] -= relation_im_update; - relation_gradient[i * 2] += relation_re_update; - relation_gradient[i * 2 + 1] += relation_im_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} -} // namespace complex - -namespace simple { - -/** - * @brief Train SimplE with 0-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory batch, Memory negative_batch, - Optimizer optimizer, float l3_regularization, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = 
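/* SimplE score: the pairing j = i ^ 1 couples each even dimension with its odd neighbour,
   so head[i] * relation[i] * tail[i ^ 1] in effect interleaves the two halves of SimplE
   (the entity embeddings used in head/tail roles and the relation/inverse-relation
   parameters) inside a single vector. */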
batch.count / 3; - int num_negative = negative_batch.count / batch_size; - l3_regularization *= 3; - - auto update = get_update_function(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Index j = i ^ 1; - x += head[i] * relation[i] * tail[j]; - } - x = WarpBroadcast(WarpReduce(x), 0); - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Index j = i ^ 1; - x += head[i] * relation[i] * tail[j]; - } - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Index j = i ^ 1; - Float h = head[i]; - Float r = relation[i]; - Float t = tail[j]; - head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, weight); - tail[j] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, weight); - Float relation_update = (optimizer.*update) - (r, gradient * h * t + l3_regularization * abs(r) * r, weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train SimplE with 1-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_1_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float l3_regularization, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - l3_regularization *= 3; - - auto update = get_update_function_1_moment(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Index j = i ^ 1; - x += head[i] * relation[i] * tail[j]; - } - x = WarpBroadcast(WarpReduce(x), 0); - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - 
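/* Iterations with s < num_negative score a corrupted triplet: a sampled id below num_head
   replaces the head, otherwise it replaces the tail (after subtracting num_head). The final
   iteration s == num_negative scores the original positive triplet (label = 1). */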
Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Index j = i ^ 1; - x += head[i] * relation[i] * tail[j]; - } - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Index j = i ^ 1; - Float h = head[i]; - Float r = relation[i]; - Float t = tail[j]; - head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, - head_moment1[i], weight); - tail[j] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, - tail_moment1[j], weight); - Float relation_update = (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, - relation_moment1[i], weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train SimplE with 2-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_2_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, Memory head_moment2s, - Memory tail_moment2s, Memory relation_moment2s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float l3_regularization, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - l3_regularization *= 3; - - auto update = get_update_function_2_moment(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_moment2 = relation_moment2s[relation_id]; - Vector &relation_gradient = 
relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Index j = i ^ 1; - x += head[i] * relation[i] * tail[j]; - } - x = WarpBroadcast(WarpReduce(x), 0); - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &head_moment2 = head_moment2s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - Vector &tail_moment2 = tail_moment2s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Index j = i ^ 1; - x += head[i] * relation[i] * tail[j]; - } - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Index j = i ^ 1; - Float h = head[i]; - Float r = relation[i]; - Float t = tail[j]; - head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, - head_moment1[i], head_moment2[i], weight); - tail[j] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, - tail_moment1[j], tail_moment2[j], weight); - Float relation_update = (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, - relation_moment1[i], relation_moment2[i], weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} -} // namespace simple - -namespace rotate { - -/** - * @brief Train RotatE with 0-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory batch, Memory negative_batch, - Optimizer optimizer, float margin, float adversarial_temperature -#ifdef USE_LOSS - , 
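/* RotatE: relation[i] stores a per-dimension phase, applied as the rotation
   (cos(relation[i]), sin(relation[i])) to the complex pair (head[2i], head[2i+1]).
   The logit is margin minus the sum over complex dimensions of the per-dimension
   modulus |h_i * r_i - t_i|. */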
Memory loss -#endif -) { - static const size_t dim = Vector::dim / 2; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_phase = relation[i]; - Float relation_re = cos(relation_phase); - Float relation_im = sin(relation_phase); - Float distance_re = head_re * relation_re - head_im * relation_im - tail_re; - Float distance_im = head_re * relation_im + head_im * relation_re - tail_im; - x += sqrt(distance_re * distance_re + distance_im * distance_im); - } - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_phase = relation[i]; - Float relation_re = cos(relation_phase); - Float relation_im = sin(relation_phase); - Float distance_re = head_re * relation_re - head_im * relation_im - tail_re; - Float distance_im = head_re * relation_im + head_im * relation_re - tail_im; - x += sqrt(distance_re * distance_re + distance_im * distance_im); - } - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - Float prob = x > 0 ? 
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float relation_phase = relation[i]; - Float relation_re = cos(relation_phase); - Float relation_im = sin(relation_phase); - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float distance_re = head_re * relation_re - head_im * relation_im - tail_re; - Float distance_im = head_re * relation_im + head_im * relation_re - tail_im; - Float grad_this_dim = gradient / - (sqrt(distance_re * distance_re + distance_im * distance_im) + kEpsilon); - // head - Float head_re_grad = -grad_this_dim * (distance_re * relation_re + distance_im * relation_im); - Float head_im_grad = -grad_this_dim * (-distance_re * relation_im + distance_im * relation_re); - head[i * 2] -= (optimizer.*update)(head_re, head_re_grad, weight); - head[i * 2 + 1] -= (optimizer.*update)(head_im, head_im_grad, weight); - // tail - tail[i * 2] -= (optimizer.*update)(tail_re, grad_this_dim * distance_re, weight); - tail[i * 2 + 1] -= (optimizer.*update)(tail_im, grad_this_dim * distance_im, weight); - // relation - Float relation_grad = -grad_this_dim * - (distance_re * (head_re * -relation_im + head_im * -relation_re) + - distance_im * (head_re * relation_re + head_im * -relation_im)); - Float relation_update = (optimizer.*update)(relation_phase, relation_grad, weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train RotatE with 1-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_1_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float margin, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim / 2; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function_1_moment(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_gradient = 
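/* Per the "store gradients" protocol in the kernel doc, each relation update is applied
   in place and also accumulated into relation_gradients, presumably so the solver can
   synchronize the shared relation parameters across partitions afterwards. */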
relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_phase = relation[i]; - Float relation_re = cos(relation_phase); - Float relation_im = sin(relation_phase); - Float distance_re = head_re * relation_re - head_im * relation_im - tail_re; - Float distance_im = head_re * relation_im + head_im * relation_re - tail_im; - x += sqrt(distance_re * distance_re + distance_im * distance_im); - } - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_phase = relation[i]; - Float relation_re = cos(relation_phase); - Float relation_im = sin(relation_phase); - Float distance_re = head_re * relation_re - head_im * relation_im - tail_re; - Float distance_im = head_re * relation_im + head_im * relation_re - tail_im; - x += sqrt(distance_re * distance_re + distance_im * distance_im); - } - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - Float prob = x > 0 ? 
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float relation_phase = relation[i]; - Float relation_re = cos(relation_phase); - Float relation_im = sin(relation_phase); - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float distance_re = head_re * relation_re - head_im * relation_im - tail_re; - Float distance_im = head_re * relation_im + head_im * relation_re - tail_im; - Float grad_this_dim = gradient / - (sqrt(distance_re * distance_re + distance_im * distance_im) + kEpsilon); - // head - Float head_re_grad = -grad_this_dim * (distance_re * relation_re + distance_im * relation_im); - Float head_im_grad = -grad_this_dim * (-distance_re * relation_im + distance_im * relation_re); - head[i * 2] -= (optimizer.*update)(head_re, head_re_grad, - head_moment1[i * 2], weight); - head[i * 2 + 1] -= (optimizer.*update)(head_im, head_im_grad, - head_moment1[i * 2 + 1], weight); - // tail - tail[i * 2] -= (optimizer.*update)(tail_re, grad_this_dim * distance_re, - tail_moment1[i * 2], weight); - tail[i * 2 + 1] -= (optimizer.*update)(tail_im, grad_this_dim * distance_im, - tail_moment1[i * 2 + 1], weight); - // relation - Float relation_grad = -grad_this_dim * - (distance_re * (head_re * -relation_im + head_im * -relation_re) + - distance_im * (head_re * relation_re + head_im * -relation_im)); - Float relation_update = (optimizer.*update)(relation_phase, relation_grad, - relation_moment1[i], weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} - -/** - * @brief Train RotatE with 2-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - relation: store gradients - * - tail: in place - * - * @tparam Vector type of embedding vectors - * @tparam Index integral type of indexes - * @tparam type type of optimizer - */ -template -__global__ void train_2_moment(Memory head_embeddings, Memory tail_embeddings, - Memory relation_embeddings, Memory relation_gradients, - Memory head_moment1s, Memory tail_moment1s, - Memory relation_moment1s, Memory head_moment2s, - Memory tail_moment2s, Memory relation_moment2s, - Memory batch, Memory negative_batch, - Optimizer optimizer, float margin, float adversarial_temperature -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim / 2; - typedef typename Vector::Float Float; - - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int num_head = head_embeddings.count; - int batch_size = batch.count / 3; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function_2_moment(); - - for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { - // elements in std::tuple are stored in reverse order - // each positive sample is {relation, tail, head} - 
Index relation_id = batch[sample_id * 3]; - Vector &relation = relation_embeddings[relation_id]; - Vector &relation_moment1 = relation_moment1s[relation_id]; - Vector &relation_moment2 = relation_moment2s[relation_id]; - Vector &relation_gradient = relation_gradients[relation_id]; - - // compute normalizer - Float x0, normalizer = 0; - if (adversarial_temperature > kEpsilon) - for (int s = 0; s < num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - Vector &head = head_embeddings[head_id]; - Vector &tail = tail_embeddings[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_phase = relation[i]; - Float relation_re = cos(relation_phase); - Float relation_im = sin(relation_phase); - Float distance_re = head_re * relation_re - head_im * relation_im - tail_re; - Float distance_im = head_re * relation_im + head_im * relation_re - tail_im; - x += sqrt(distance_re * distance_re + distance_im * distance_im); - } - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - if (s == 0) - x0 = x; - normalizer += exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)); - } - -#ifdef USE_LOSS - Float sample_loss = 0; -#endif - for (int s = 0; s <= num_negative; s++) { - Index head_id = batch[sample_id * 3 + 2]; - Index tail_id = batch[sample_id * 3 + 1]; - int label = 1; - if (s < num_negative) { - Index negative_id = negative_batch[sample_id * num_negative + s]; - if (negative_id < num_head) - head_id = negative_id; - else - tail_id = negative_id - num_head; - label = 0; - } - Vector &head = head_embeddings[head_id]; - Vector &head_moment1 = head_moment1s[head_id]; - Vector &head_moment2 = head_moment2s[head_id]; - Vector &tail = tail_embeddings[tail_id]; - Vector &tail_moment1 = tail_moment1s[tail_id]; - Vector &tail_moment2 = tail_moment2s[tail_id]; - // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) { - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float relation_phase = relation[i]; - Float relation_re = cos(relation_phase); - Float relation_im = sin(relation_phase); - Float distance_re = head_re * relation_re - head_im * relation_im - tail_re; - Float distance_im = head_re * relation_im + head_im * relation_re - tail_im; - x += sqrt(distance_re * distance_re + distance_im * distance_im); - } - x = WarpBroadcast(WarpReduce(x), 0); - x = margin - x; - Float prob = x > 0 ? 
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); - // Backward - Float gradient, weight; - if (label) { - gradient = prob - 1; - weight = 1; -#ifdef USE_LOSS - sample_loss += weight * -log(prob + kEpsilon); -#endif - } else { - gradient = prob; - if (adversarial_temperature > kEpsilon) - weight = exp(min(max((x - x0) / adversarial_temperature, -kLogitClip), kLogitClip)) / normalizer; - else - weight = 1.0 / num_negative; -#ifdef USE_LOSS - sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float relation_phase = relation[i]; - Float relation_re = cos(relation_phase); - Float relation_im = sin(relation_phase); - Float head_re = head[i * 2]; - Float head_im = head[i * 2 + 1]; - Float tail_re = tail[i * 2]; - Float tail_im = tail[i * 2 + 1]; - Float distance_re = head_re * relation_re - head_im * relation_im - tail_re; - Float distance_im = head_re * relation_im + head_im * relation_re - tail_im; - Float grad_this_dim = gradient / - (sqrt(distance_re * distance_re + distance_im * distance_im) + kEpsilon); - // head - Float head_re_grad = -grad_this_dim * (distance_re * relation_re + distance_im * relation_im); - Float head_im_grad = -grad_this_dim * (-distance_re * relation_im + distance_im * relation_re); - head[i * 2] -= (optimizer.*update)(head_re, head_re_grad, - head_moment1[i * 2], head_moment2[i * 2], weight); - head[i * 2 + 1] -= (optimizer.*update)(head_im, head_im_grad, - head_moment1[i * 2 + 1], head_moment2[i * 2 + 1], weight); - // tail - tail[i * 2] -= (optimizer.*update)(tail_re, grad_this_dim * distance_re, - tail_moment1[i * 2], tail_moment2[i * 2], weight); - tail[i * 2 + 1] -= (optimizer.*update)(tail_im, grad_this_dim * distance_im, - tail_moment1[i * 2 + 1], tail_moment2[i * 2 + 1], weight); - // relation - Float relation_grad = -grad_this_dim * - (distance_re * (head_re * -relation_im + head_im * -relation_re) + - distance_im * (head_re * relation_re + head_im * -relation_im)); - Float relation_update = (optimizer.*update)(relation_phase, relation_grad, - relation_moment1[i], relation_moment2[i], weight); - relation[i] -= relation_update; - relation_gradient[i] += relation_update; - } - } -#ifdef USE_LOSS - if (lane_id == 0) - loss[sample_id] = sample_loss / 2; -#endif - } -} -} // namespace rotate - -} -} \ No newline at end of file diff --git a/include/gpu/graph.cuh b/include/instance/gpu/graph.cuh similarity index 55% rename from include/gpu/graph.cuh rename to include/instance/gpu/graph.cuh index 4aff7be..f9cbb99 100644 --- a/include/gpu/graph.cuh +++ b/include/instance/gpu/graph.cuh @@ -24,38 +24,29 @@ namespace graphvite { namespace gpu { - -namespace line { +namespace graph { /** - * @brief Train LINE with 0-moment optimizers - * - * Update protocols of embeddings - * - vertex: in place - * - context: in place - * - * @tparam Vector type of embedding vectors + * @brief Train node embedding with 0-moment optimizers + * @tparam Vector vector type of embeddings * @tparam Index integral type of indexes - * @tparam type type of optimizer + * @tparam Model embedding model + * @tparam optimizer_type type of optimizer */ -template -__global__ void train(Memory vertex_embeddings, Memory context_embeddings, - Memory batch, Memory negative_batch, Optimizer optimizer, - float negative_weight -#ifdef USE_LOSS - , Memory loss -#endif -) { +template class Model, OptimizerType optimizer_type> +__global__ void train(Memory vertex_embeddings, Memory context_embeddings, + Memory batch, Memory negative_batch, + Memory loss, + 
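/* The per-model LINE / DeepWalk / node2vec kernels are unified here into a single generic
   kernel parameterized by a Model functor: model.forward() produces the logit of a
   (vertex, context) pair and model.backward() applies the optimizer update; the loss
   buffer is now always written, so the USE_LOSS conditional compilation disappears.
   A rough sketch of the interface this kernel appears to assume (names are illustrative;
   the actual definition presumably lives in the new instance/model headers):

       template<class Vector>
       struct DotProductModel {
           typedef typename Vector::Float Float;
           // warp-cooperative dot product, result broadcast to every lane
           __device__ void forward(const Vector &vertex, const Vector &context, Float &logit);
           // SGD-style update of both embeddings, scaled by the sample weight
           __device__ void backward(Vector &vertex, Vector &context, Float gradient,
                                    const Optimizer &optimizer, Float weight);
       };
*/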
Optimizer optimizer, float negative_weight) { static const size_t dim = Vector::dim; typedef typename Vector::Float Float; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int batch_size = batch.count / 2; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function(); + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int batch_size = batch.count / 2; + const int num_negative = negative_batch.count / batch_size; + Model model; __shared__ Vector buffer[kThreadPerBlock / kWarpSize]; Vector &vertex_buffer = buffer[threadIdx.x / kWarpSize]; @@ -65,11 +56,9 @@ __global__ void train(Memory vertex_embeddings, Memory vertex_embeddings, Memory 0 ? 1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); + Float logit; + model.forward(vertex_buffer, context, logit); + Float prob = sigmoid(logit); // Backward Float gradient, weight; if (label) { gradient = prob - 1; weight = 1; -#ifdef USE_LOSS sample_loss += weight * -log(prob + kEpsilon); -#endif } else { gradient = prob; weight = negative_weight; -#ifdef USE_LOSS sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float v = vertex_buffer[i]; - Float c = context[i]; - vertex_buffer[i] -= (optimizer.*update)(v, gradient * c, weight); - context[i] -= (optimizer.*update)(c, gradient * v, weight); } + model.backward(vertex_buffer, context, gradient, optimizer, weight); } -#ifdef USE_LOSS + if (lane_id == 0) loss[sample_id] = sample_loss / (1 + num_negative * negative_weight); -#endif - for (int i = lane_id; i < dim; i += kWarpSize) - vertex[i] = vertex_buffer[i]; + vertex = vertex_buffer; } } /** - * @brief Train LINE with 1-moment optimizers - * - * Update protocols of embeddings - * - vertex: in place - * - context: in place - * - * @tparam Vector type of embedding vectors + * @brief Train node embedding with 1-moment optimizers + * @tparam Vector vector type of embeddings * @tparam Index integral type of indexes - * @tparam type type of optimizer + * @tparam Model embedding model + * @tparam optimizer_type type of optimizer */ -template +template class Model, OptimizerType optimizer_type> __global__ void train_1_moment(Memory vertex_embeddings, Memory context_embeddings, Memory vertex_moment1s, Memory context_moment1s, - Memory batch, Memory negative_batch, Optimizer optimizer, - float negative_weight -#ifdef USE_LOSS - , Memory loss -#endif -) { + Memory batch, Memory negative_batch, + Memory loss, + Optimizer optimizer, float negative_weight) { static const size_t dim = Vector::dim; typedef typename Vector::Float Float; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int batch_size = batch.count / 2; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function_1_moment(); + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int batch_size = batch.count / 2; + const int num_negative = negative_batch.count / batch_size; + Model model; __shared__ Vector buffer[kThreadPerBlock / kWarpSize]; Vector &vertex_buffer = buffer[threadIdx.x / kWarpSize]; @@ -158,11 +126,9 @@ __global__ void train_1_moment(Memory vertex_embeddings, Memory 
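/* The explicit per-lane copy loops between global memory and the shared-memory buffer are
   replaced by plain assignments (vertex_buffer = vertex; ... vertex = vertex_buffer;),
   which presumably relies on Vector's assignment operator performing the same
   warp-cooperative strided copy as the removed loops. */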
Index head_id = batch[sample_id * 2 + 1]; Vector &vertex = vertex_embeddings[head_id]; Vector &vertex_moment1 = vertex_moment1s[head_id]; - for (int i = lane_id; i < dim; i += kWarpSize) - vertex_buffer[i] = vertex[i]; -#ifdef USE_LOSS + vertex_buffer = vertex; Float sample_loss = 0; -#endif + for (int s = 0; s <= num_negative; s++) { Index tail_id; int label; @@ -176,73 +142,53 @@ __global__ void train_1_moment(Memory vertex_embeddings, Memory Vector &context = context_embeddings[tail_id]; Vector &context_moment1 = context_moment1s[tail_id]; // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += vertex_buffer[i] * context[i]; - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); + Float logit; + model.forward(vertex_buffer, context, logit); + Float prob = sigmoid(logit); // Backward Float gradient, weight; if (label) { gradient = prob - 1; weight = 1; -#ifdef USE_LOSS sample_loss += weight * -log(prob + kEpsilon); -#endif } else { gradient = prob; weight = negative_weight; -#ifdef USE_LOSS sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float v = vertex_buffer[i]; - Float c = context[i]; - vertex_buffer[i] -= (optimizer.*update)(v, gradient * c, vertex_moment1[i], weight); - context[i] -= (optimizer.*update)(c, gradient * v, context_moment1[i], weight); } + model.backward(vertex_buffer, context, vertex_moment1, context_moment1, + gradient, optimizer, weight); } -#ifdef USE_LOSS + if (lane_id == 0) loss[sample_id] = sample_loss / (1 + num_negative * negative_weight); -#endif - for (int i = lane_id; i < dim; i += kWarpSize) - vertex[i] = vertex_buffer[i]; + vertex = vertex_buffer; } } /** - * @brief Train LINE with 2-moment optimizers - * - * Update protocols of embeddings - * - vertex: in place - * - context: in place - * - * @tparam Vector type of embedding vectors + * @brief Train node embedding with 2-moment optimizers + * @tparam Vector vector type of embeddings * @tparam Index integral type of indexes - * @tparam type type of optimizer + * @tparam Model embedding model + * @tparam optimizer_type type of optimizer */ -template -__global__ void train_2_moment(Memory vertex_embeddings, Memory context_embeddings, +template class Model, OptimizerType optimizer_type> +__global__ void train_2_moment(Memory vertex_embeddings, Memory context_embeddings, Memory vertex_moment1s, Memory context_moment1s, Memory vertex_moment2s, Memory context_moment2s, - Memory batch, Memory negative_batch, Optimizer optimizer, - float negative_weight -#ifdef USE_LOSS - , Memory loss -#endif -) { + Memory batch, Memory negative_batch, + Memory loss, + Optimizer optimizer, float negative_weight) { static const size_t dim = Vector::dim; typedef typename Vector::Float Float; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int batch_size = batch.count / 2; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function_2_moment(); + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int batch_size = batch.count / 2; + const int num_negative = negative_batch.count / batch_size; + Model model; __shared__ Vector buffer[kThreadPerBlock / kWarpSize]; Vector &vertex_buffer = buffer[threadIdx.x / kWarpSize]; @@ -254,11 +200,9 @@ __global__ void 
train_2_moment(Memory vertex_embeddings, Memory Vector &vertex = vertex_embeddings[head_id]; Vector &vertex_moment1 = vertex_moment1s[head_id]; Vector &vertex_moment2 = vertex_moment2s[head_id]; - for (int i = lane_id; i < dim; i += kWarpSize) - vertex_buffer[i] = vertex[i]; -#ifdef USE_LOSS + vertex_buffer = vertex; Float sample_loss = 0; -#endif + for (int s = 0; s <= num_negative; s++) { Index tail_id; int label; @@ -273,45 +217,67 @@ __global__ void train_2_moment(Memory vertex_embeddings, Memory Vector &context_moment1 = context_moment1s[tail_id]; Vector &context_moment2 = context_moment2s[tail_id]; // Forward - Float x = 0; - for (int i = lane_id; i < dim; i += kWarpSize) - x += vertex_buffer[i] * context[i]; - x = WarpBroadcast(WarpReduce(x), 0); - Float prob = x > 0 ? 1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); + Float logit; + model.forward(vertex_buffer, context, logit); + Float prob = sigmoid(logit); // Backward Float gradient, weight; if (label) { gradient = prob - 1; weight = 1; -#ifdef USE_LOSS sample_loss += weight * -log(prob + kEpsilon); -#endif } else { gradient = prob; weight = negative_weight; -#ifdef USE_LOSS sample_loss += weight * -log(1 - prob + kEpsilon); -#endif - } - for (int i = lane_id; i < dim; i += kWarpSize) { - Float v = vertex_buffer[i]; - Float c = context[i]; - vertex_buffer[i] -= (optimizer.*update)(v, gradient * c, vertex_moment1[i], vertex_moment2[i], weight); - context[i] -= (optimizer.*update)(c, gradient * v, context_moment1[i], context_moment2[i], weight); } + model.backward(vertex_buffer, context, vertex_moment1, context_moment1, + vertex_moment2, context_moment2, gradient, optimizer, weight); } -#ifdef USE_LOSS + if (lane_id == 0) loss[sample_id] = sample_loss / (1 + num_negative * negative_weight); -#endif - for (int i = lane_id; i < dim; i += kWarpSize) - vertex[i] = vertex_buffer[i]; + vertex = vertex_buffer; } } -} // namespace line -namespace deepwalk = line; -namespace node2vec = line; +/** + * @brief Predict logits for batch samples + * @tparam Vector vector type of embeddings + * @tparam Index integral type of indexes + * @tparam Model embedding model + */ +template class Model> +__global__ void predict(Memory vertex_embeddings, Memory context_embeddings, + Memory batch, Memory logits) { + static const size_t dim = Vector::dim; + typedef typename Vector::Float Float; + + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int batch_size = batch.count / 2; + Model model; + + __shared__ Vector buffer[kThreadPerBlock / kWarpSize]; + Vector &vertex_buffer = buffer[threadIdx.x / kWarpSize]; + + for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { + // elements in std::tuple are stored in reverse order + // each positive sample is {tail, head} + Index head_id = batch[sample_id * 2 + 1]; + Index tail_id = batch[sample_id * 2]; + Vector &vertex = vertex_embeddings[head_id]; + Vector &context = context_embeddings[tail_id]; + + Float logit; + model.forward(vertex, context, logit); + + if (lane_id == 0) + logits[sample_id] = logit; + } +} +} // namespace graph } // namespace gpu } // namespace graphvite \ No newline at end of file diff --git a/include/instance/gpu/knowledge_graph.cuh b/include/instance/gpu/knowledge_graph.cuh new file mode 100644 index 0000000..428c17e --- /dev/null +++ b/include/instance/gpu/knowledge_graph.cuh @@ -0,0 +1,355 @@ +/** + * Copyright 2019 MilaGraph. 
All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @author Zhaocheng Zhu + */ + +#pragma once + +#include "base/memory.h" +#include "core/optimizer.h" +#include "util/gpu.cuh" +#include "util/math.h" + +namespace graphvite { +namespace gpu { +namespace knowledge_graph { + +/** + * @brief Train knowledge graph embedding with 0-moment optimizers + * @tparam Vector vector type of embeddings + * @tparam Index integral type of indexes + * @tparam Model embedding model + * @tparam optimizer_type type of optimizer + */ +template class Model, OptimizerType optimizer_type> +__global__ void train(Memory head_embeddings, Memory tail_embeddings, + Memory relation_embeddings, Memory batch, + Memory negative_batch, Memory loss, + Optimizer optimizer, float margin_or_l3, float adversarial_temperature) { + typedef typename Vector::Float Float; + + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int num_head = head_embeddings.count; + const int batch_size = batch.count / 3; + const int num_negative = negative_batch.count / batch_size; + Model model; + + for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { + // elements in std::tuple are stored in reverse order + // each positive sample is {relation, tail, head} + Index relation_id = batch[sample_id * 3]; + Vector &relation = relation_embeddings[relation_id]; + + // compute normalizer + Float bias, normalizer = 0; + if (adversarial_temperature > kEpsilon) + for (int s = 0; s < num_negative; s++) { + Index head_id = batch[sample_id * 3 + 2]; + Index tail_id = batch[sample_id * 3 + 1]; + Index negative_id = negative_batch[sample_id * num_negative + s]; + if (negative_id < num_head) + head_id = negative_id; + else + tail_id = negative_id - num_head; + Vector &head = head_embeddings[head_id]; + Vector &tail = tail_embeddings[tail_id]; + // Forward + Float logit; + model.forward(head, tail, relation, logit, margin_or_l3); + if (s == 0) + bias = logit; + normalizer += safe_exp((logit - bias) / adversarial_temperature); + } + + Float sample_loss = 0; + for (int s = 0; s <= num_negative; s++) { + Index head_id = batch[sample_id * 3 + 2]; + Index tail_id = batch[sample_id * 3 + 1]; + int label = 1; + if (s < num_negative) { + Index negative_id = negative_batch[sample_id * num_negative + s]; + if (negative_id < num_head) + head_id = negative_id; + else + tail_id = negative_id - num_head; + label = 0; + } + Vector &head = head_embeddings[head_id]; + Vector &tail = tail_embeddings[tail_id]; + // Forward + Float logit; + model.forward(head, tail, relation, logit, margin_or_l3); + Float prob = sigmoid(logit); + // Backward + Float gradient, weight; + if (label) { + gradient = prob - 1; + weight = 1; + sample_loss += weight * -log(prob + kEpsilon); + } else { + gradient = prob; + if (adversarial_temperature > kEpsilon) + weight = safe_exp((logit - bias) / 
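/* Self-adversarial negative sampling: each negative sample is reweighted by a softmax of
   its logit divided by adversarial_temperature, taken over the negatives of the same
   positive triplet. `bias` (the logit of the first negative) is subtracted before
   exponentiation for numerical stability, and safe_exp presumably clamps the exponent,
   replacing the explicit kLogitClip clamp of the removed kernels. When
   adversarial_temperature <= kEpsilon the weights fall back to the uniform
   1 / num_negative. */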
adversarial_temperature) / normalizer; + else + weight = 1.0 / num_negative; + sample_loss += weight * -log(1 - prob + kEpsilon); + } + model.backward(head, tail, relation, margin_or_l3, gradient, optimizer, weight); + } + + if (lane_id == 0) + loss[sample_id] = sample_loss / 2; + } +} + +/** + * @brief Train knowledge graph embedding with 1-moment optimizers + * @tparam Vector vector type of embeddings + * @tparam Index integral type of indexes + * @tparam Model embedding model + * @tparam optimizer_type type of optimizer + */ +template class Model, OptimizerType optimizer_type> +__global__ void train_1_moment(Memory head_embeddings, Memory tail_embeddings, + Memory relation_embeddings, Memory head_moment1s, + Memory tail_moment1s, Memory relation_moment1s, + Memory batch, Memory negative_batch, + Memory loss, + Optimizer optimizer, float margin_or_l3, float adversarial_temperature) { + typedef typename Vector::Float Float; + + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int num_head = head_embeddings.count; + const int batch_size = batch.count / 3; + const int num_negative = negative_batch.count / batch_size; + Model model; + + for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { + // elements in std::tuple are stored in reverse order + // each positive sample is {relation, tail, head} + Index relation_id = batch[sample_id * 3]; + Vector &relation = relation_embeddings[relation_id]; + Vector &relation_moment1 = relation_moment1s[relation_id]; + + // compute normalizer + Float bias, normalizer = 0; + if (adversarial_temperature > kEpsilon) + for (int s = 0; s < num_negative; s++) { + Index head_id = batch[sample_id * 3 + 2]; + Index tail_id = batch[sample_id * 3 + 1]; + Index negative_id = negative_batch[sample_id * num_negative + s]; + if (negative_id < num_head) + head_id = negative_id; + else + tail_id = negative_id - num_head; + Vector &head = head_embeddings[head_id]; + Vector &tail = tail_embeddings[tail_id]; + // Forward + Float logit; + model.forward(head, tail, relation, logit, margin_or_l3); + if (s == 0) + bias = logit; + normalizer += safe_exp((logit - bias) / adversarial_temperature); + } + + Float sample_loss = 0; + for (int s = 0; s <= num_negative; s++) { + Index head_id = batch[sample_id * 3 + 2]; + Index tail_id = batch[sample_id * 3 + 1]; + int label = 1; + if (s < num_negative) { + Index negative_id = negative_batch[sample_id * num_negative + s]; + if (negative_id < num_head) + head_id = negative_id; + else + tail_id = negative_id - num_head; + label = 0; + } + Vector &head = head_embeddings[head_id]; + Vector &head_moment1 = head_moment1s[head_id]; + Vector &tail = tail_embeddings[tail_id]; + Vector &tail_moment1 = tail_moment1s[tail_id]; + // Forward + Float logit; + model.forward(head, tail, relation, logit, margin_or_l3); + Float prob = sigmoid(logit); + // Backward + Float gradient, weight; + if (label) { + gradient = prob - 1; + weight = 1; + sample_loss += weight * -log(prob + kEpsilon); + } else { + gradient = prob; + if (adversarial_temperature > kEpsilon) + weight = safe_exp((logit - bias) / adversarial_temperature) / normalizer; + else + weight = 1.0 / num_negative; + sample_loss += weight * -log(1 - prob + kEpsilon); + } + model.backward(head, tail, relation, head_moment1, tail_moment1, relation_moment1, + margin_or_l3, gradient, optimizer, weight); + } + + if (lane_id == 0) + 
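/* Only lane 0 of the warp writes the per-sample loss. The positive term carries weight 1
   and the negative weights sum to 1, so dividing by 2 averages the positive and negative
   halves of the binary cross-entropy. */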
loss[sample_id] = sample_loss / 2; + } +} + +/** + * @brief Train knowledge graph embedding with 2-moment optimizers + * @tparam Vector vector type of embeddings + * @tparam Index integral type of indexes + * @tparam Model embedding model + * @tparam optimizer_type type of optimizer + */ +template class Model, OptimizerType optimizer_type> +__global__ void train_2_moment(Memory head_embeddings, Memory tail_embeddings, + Memory relation_embeddings, Memory head_moment1s, + Memory tail_moment1s, Memory relation_moment1s, + Memory head_moment2s, Memory tail_moment2s, + Memory relation_moment2s, Memory batch, + Memory negative_batch, Memory loss, + Optimizer optimizer, float margin_or_l3, float adversarial_temperature) { + typedef typename Vector::Float Float; + + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int num_head = head_embeddings.count; + const int batch_size = batch.count / 3; + const int num_negative = negative_batch.count / batch_size; + Model model; + + for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { + // elements in std::tuple are stored in reverse order + // each positive sample is {relation, tail, head} + Index relation_id = batch[sample_id * 3]; + Vector &relation = relation_embeddings[relation_id]; + Vector &relation_moment1 = relation_moment1s[relation_id]; + Vector &relation_moment2 = relation_moment2s[relation_id]; + + // compute normalizer + Float bias, normalizer = 0; + if (adversarial_temperature > kEpsilon) + for (int s = 0; s < num_negative; s++) { + Index head_id = batch[sample_id * 3 + 2]; + Index tail_id = batch[sample_id * 3 + 1]; + Index negative_id = negative_batch[sample_id * num_negative + s]; + if (negative_id < num_head) + head_id = negative_id; + else + tail_id = negative_id - num_head; + Vector &head = head_embeddings[head_id]; + Vector &tail = tail_embeddings[tail_id]; + // Forward + Float logit; + model.forward(head, tail, relation, logit, margin_or_l3); + if (s == 0) + bias = logit; + normalizer += safe_exp((logit - bias) / adversarial_temperature); + } + + Float sample_loss = 0; + for (int s = 0; s <= num_negative; s++) { + Index head_id = batch[sample_id * 3 + 2]; + Index tail_id = batch[sample_id * 3 + 1]; + int label = 1; + if (s < num_negative) { + Index negative_id = negative_batch[sample_id * num_negative + s]; + if (negative_id < num_head) + head_id = negative_id; + else + tail_id = negative_id - num_head; + label = 0; + } + Vector &head = head_embeddings[head_id]; + Vector &head_moment1 = head_moment1s[head_id]; + Vector &head_moment2 = head_moment2s[head_id]; + Vector &tail = tail_embeddings[tail_id]; + Vector &tail_moment1 = tail_moment1s[tail_id]; + Vector &tail_moment2 = tail_moment2s[tail_id]; + // Forward + Float logit; + model.forward(head, tail, relation, logit, margin_or_l3); + Float prob = sigmoid(logit); + // Backward + Float gradient, weight; + if (label) { + gradient = prob - 1; + weight = 1; + sample_loss += weight * -log(prob + kEpsilon); + } else { + gradient = prob; + if (adversarial_temperature > kEpsilon) + weight = safe_exp((logit - bias) / adversarial_temperature) / normalizer; + else + weight = 1.0 / num_negative; + sample_loss += weight * -log(1 - prob + kEpsilon); + } + model.backward(head, tail, relation, head_moment1, tail_moment1, relation_moment1, + head_moment2, tail_moment2, relation_moment2, + margin_or_l3, gradient, optimizer, weight); + } 
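/* The train / train_1_moment / train_2_moment variants differ only in how many moment
   buffers are threaded through model.backward: none for plain SGD, one for
   momentum/RMSprop-style optimizers, two for Adam-style optimizers; the choice is
   presumably dispatched on optimizer_type by the solver. */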
+ + if (lane_id == 0) + loss[sample_id] = sample_loss / 2; + } +} + +/** + * @brief Predict logits for batch samples + * @tparam Vector vector type of embeddings + * @tparam Index integral type of indexes + * @tparam Model embedding model + */ +template class Model> +__global__ void predict(Memory head_embeddings, Memory tail_embeddings, + Memory relation_embeddings, Memory batch, + Memory logits, float margin_or_l3) { + typedef typename Vector::Float Float; + + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int batch_size = batch.count / 3; + Model model; + + for (int sample_id = thread_id / kWarpSize; sample_id < batch_size; sample_id += num_thread / kWarpSize) { + // elements in std::tuple are stored in reverse order + // each positive sample is {relation, tail, head} + Index head_id = batch[sample_id * 3 + 2]; + Index tail_id = batch[sample_id * 3 + 1]; + Index relation_id = batch[sample_id * 3]; + Vector &head = head_embeddings[head_id]; + Vector &tail = tail_embeddings[tail_id]; + Vector &relation = relation_embeddings[relation_id]; + + Float logit; + model.forward(head, tail, relation, logit, margin_or_l3); + + if (lane_id == 0) + logits[sample_id] = logit; + } +} + +} // namespace knowledge graph +} // namespace gpu +} // namespace graphvite \ No newline at end of file diff --git a/include/gpu/visualization.cuh b/include/instance/gpu/visualization.cuh similarity index 57% rename from include/gpu/visualization.cuh rename to include/instance/gpu/visualization.cuh index 487e9f9..67cbc90 100644 --- a/include/gpu/visualization.cuh +++ b/include/instance/gpu/visualization.cuh @@ -24,40 +24,30 @@ namespace graphvite { namespace gpu { - -namespace largevis { +namespace visualization { const float kSmoothTerm = 0.1; /** - * @brief Train LargeVis with 0-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - tail: in place - * - * @tparam Vector type of embedding vectors + * @brief Train visualization with 0-moment optimizers + * @tparam Vector vector type of embeddings * @tparam Index integral type of indexes - * @tparam type type of optimizer + * @tparam Model embedding model + * @tparam optimizer_type type of optimizer */ -template +template class Model, OptimizerType optimizer_type> __global__ void train(Memory head_embeddings, Memory tail_embeddings, - Memory batch, Memory negative_batch, Optimizer optimizer, - float negative_weight -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; + Memory batch, Memory negative_batch, + Memory loss, + Optimizer optimizer, float negative_weight) { typedef typename Vector::Float Float; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int batch_size = batch.count / 2; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function(); + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int batch_size = batch.count / 2; + const int num_negative = negative_batch.count / batch_size; + Model model; __shared__ Vector buffer[kThreadPerBlock / kWarpSize]; Vector &head_buffer = buffer[threadIdx.x / kWarpSize]; @@ -67,11 +57,9 @@ __global__ void train(Memory head_embeddings, Memory head_embeddings, Memory(head, tail, gradient, optimizer, weight); } -#ifdef 
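Both the training and prediction kernels use a warp-per-sample layout: the global thread index divided by the warp size selects the sample, all 32 lanes cooperate on the per-dimension work, and only lane 0 writes the scalar result. A stripped-down CUDA sketch of that indexing, with illustrative names and a placeholder for the per-lane work:

    __global__ void warp_per_sample(const float *per_sample_input,
                                    float *per_sample_output, int batch_size) {
        const int warp_size = 32;
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
        int lane_id = thread_id % warp_size;                  // position inside the warp
        int num_warp = gridDim.x * blockDim.x / warp_size;    // warps across the whole grid
        // grid-stride loop over samples, one warp per sample
        for (int sample_id = thread_id / warp_size; sample_id < batch_size;
             sample_id += num_warp) {
            float value = per_sample_input[sample_id];        // stand-in for per-lane work
            // ... a warp reduction of per-lane partials would go here ...
            if (lane_id == 0)                                 // single writer per sample
                per_sample_output[sample_id] = value;
        }
    }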
USE_LOSS + if (lane_id == 0) loss[sample_id] = sample_loss / (1 + num_negative * negative_weight); -#endif - for (int i = lane_id; i < dim; i += kWarpSize) - head[i] = head_buffer[i]; + head = head_buffer; } } /** - * @brief Train LargeVis with 1-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - tail: in place - * - * @tparam Vector type of embedding vectors + * @brief Train visualization with 1-moment optimizers + * @tparam Vector vector type of embeddings * @tparam Index integral type of indexes - * @tparam type type of optimizer + * @tparam Model embedding model + * @tparam optimizer_type type of optimizer */ -template +template class Model, OptimizerType optimizer_type> __global__ void train_1_moment(Memory head_embeddings, Memory tail_embeddings, Memory head_moment1s, Memory tail_moment1s, - Memory batch, Memory negative_batch, Optimizer optimizer, - float negative_weight -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; + Memory batch, Memory negative_batch, + Memory loss, + Optimizer optimizer, float negative_weight) { typedef typename Vector::Float Float; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int batch_size = batch.count / 2; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function_1_moment(); + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int batch_size = batch.count / 2; + const int num_negative = negative_batch.count / batch_size; + Model model; __shared__ Vector buffer[kThreadPerBlock / kWarpSize]; Vector &head_buffer = buffer[threadIdx.x / kWarpSize]; @@ -160,11 +126,9 @@ __global__ void train_1_moment(Memory head_embeddings, Memory head_embeddings, Memory(head, tail, head_moment1, tail_moment1, gradient, optimizer, weight); } -#ifdef USE_LOSS + if (lane_id == 0) loss[sample_id] = sample_loss / (1 + num_negative * negative_weight); -#endif - for (int i = lane_id; i < dim; i += kWarpSize) - head[i] = head_buffer[i]; + head = head_buffer; } } /** - * @brief Train LargeVis with 2-moment optimizers - * - * Update protocols of embeddings - * - head: in place - * - tail: in place - * - * @tparam Vector type of embedding vectors + * @brief Train visualization with 2-moment optimizers + * @tparam Vector vector type of embeddings * @tparam Index integral type of indexes - * @tparam type type of optimizer + * @tparam Model embedding model + * @tparam optimizer_type type of optimizer */ -template +template class Model, OptimizerType optimizer_type> __global__ void train_2_moment(Memory head_embeddings, Memory tail_embeddings, Memory head_moment1s, Memory tail_moment1s, Memory head_moment2s, Memory tail_moment2s, - Memory batch, Memory negative_batch, Optimizer optimizer, - float negative_weight -#ifdef USE_LOSS - , Memory loss -#endif -) { - static const size_t dim = Vector::dim; + Memory batch, Memory negative_batch, + Memory loss, + Optimizer optimizer, float negative_weight) { typedef typename Vector::Float Float; - int thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int lane_id = thread_id % kWarpSize; - int num_thread = gridDim.x * blockDim.x; - int batch_size = batch.count / 2; - int num_negative = negative_batch.count / batch_size; - - auto update = get_update_function_2_moment(); + const int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const 
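The visualization kernels stage the head embedding in a per-warp slot of a shared-memory buffer, apply every update against that copy, and write it back once at the end (the single assignment head = head_buffer replaces the older lane-by-lane copy loop). A simplified CUDA sketch of the buffering pattern, using plain float arrays and a hypothetical fixed dimension instead of the Vector type:

    #define THREAD_PER_BLOCK 512
    #define WARP_SIZE 32
    #define DIM 64                         // hypothetical embedding dimension

    __global__ void buffered_update(float *embeddings, int num_embedding) {
        __shared__ float buffer[THREAD_PER_BLOCK / WARP_SIZE][DIM];   // one slot per warp
        int lane_id = threadIdx.x % WARP_SIZE;
        int warp_in_block = threadIdx.x / WARP_SIZE;
        int warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
        int num_warp = gridDim.x * blockDim.x / WARP_SIZE;
        for (int e = warp_id; e < num_embedding; e += num_warp) {
            float *head = embeddings + e * DIM;
            for (int i = lane_id; i < DIM; i += WARP_SIZE)    // stage in shared memory
                buffer[warp_in_block][i] = head[i];
            for (int i = lane_id; i < DIM; i += WARP_SIZE)    // updates hit the buffer
                buffer[warp_in_block][i] *= 0.99f;            // placeholder update
            for (int i = lane_id; i < DIM; i += WARP_SIZE)    // write back once
                head[i] = buffer[warp_in_block][i];
        }
    }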
int lane_id = thread_id % kWarpSize; + const int num_thread = gridDim.x * blockDim.x; + const int batch_size = batch.count / 2; + const int num_negative = negative_batch.count / batch_size; + Model model; __shared__ Vector buffer[kThreadPerBlock / kWarpSize]; Vector &head_buffer = buffer[threadIdx.x / kWarpSize]; @@ -256,11 +198,9 @@ __global__ void train_2_moment(Memory head_embeddings, Memory head_embeddings, Memory(head, tail, head_moment1, tail_moment1, head_moment2, tail_moment2, + gradient, optimizer, weight); } -#ifdef USE_LOSS + if (lane_id == 0) loss[sample_id] = sample_loss / (1 + num_negative * negative_weight); -#endif - for (int i = lane_id; i < dim; i += kWarpSize) - head[i] = head_buffer[i]; + head = head_buffer; } } -} // namespace largevis -} -} \ No newline at end of file +} // namespace visualization +} // namespace gpu +} // namespace graphvite \ No newline at end of file diff --git a/include/instance/graph.cuh b/include/instance/graph.cuh index d935299..5d921a4 100644 --- a/include/instance/graph.cuh +++ b/include/instance/graph.cuh @@ -23,6 +23,7 @@ #include "core/graph.h" #include "core/solver.h" +#include "model/graph.h" #include "gpu/graph.cuh" /** @@ -280,9 +281,9 @@ public: CURAND_CHECK(curandGenerateUniformDouble(generator, random.device_ptr, kRandBatchSize)); auto &sample_pool = solver->sample_pools[solver->pool_id ^ 1]; - std::vector> offsets(solver->num_partition); + std::vector> offsets(num_partition); for (auto &&partition_offsets : offsets) - partition_offsets.resize(solver->num_partition, start); + partition_offsets.resize(num_partition, start); std::vector>> head_chains(solver->random_walk_batch_size); std::vector>> tail_chains(solver->random_walk_batch_size); for (auto &&head_chain : head_chains) @@ -291,7 +292,7 @@ public: tail_chain.resize(solver->random_walk_length + 1); std::vector sample_lengths(solver->random_walk_batch_size); int num_complete = 0, rand_id = 0; - while (num_complete < solver->num_partition * solver->num_partition) { + while (num_complete < num_partition * num_partition) { for (int i = 0; i < solver->random_walk_batch_size; i++) { if (rand_id > kRandBatchSize - solver->random_walk_length * 2) { random.to_host(); @@ -358,9 +359,9 @@ public: CURAND_CHECK(curandGenerateUniformDouble(generator, random.device_ptr, kRandBatchSize)); auto &sample_pool = solver->sample_pools[solver->pool_id ^ 1]; - std::vector> offsets(solver->num_partition); + std::vector> offsets(num_partition); for (auto &&partition_offsets : offsets) - partition_offsets.resize(solver->num_partition, start); + partition_offsets.resize(num_partition, start); std::vector>> head_chains(solver->random_walk_batch_size); std::vector>> tail_chains(solver->random_walk_batch_size); for (auto &&head_chain : head_chains) @@ -369,7 +370,7 @@ public: tail_chain.resize(solver->random_walk_length + 1); std::vector sample_lengths(solver->random_walk_batch_size); int num_complete = 0, rand_id = 0; - while (num_complete < solver->num_partition * solver->num_partition) { + while (num_complete < num_partition * num_partition) { for (int i = 0; i < solver->random_walk_batch_size; i++) { if (rand_id > kRandBatchSize - solver->random_walk_length * 2) { random.to_host(); @@ -434,99 +435,90 @@ public: typedef GraphSolver GraphSolver; /** - * Call the corresponding GPU kernel + * Call the corresponding GPU kernel for training * (DeepWalk, LINE, node2vec) * (SGD, Momentum, AdaGrad, RMSprop, Adam) */ - bool kernel_dispatch() override { + bool train_dispatch() override { using namespace gpu; 
GraphSolver *solver = reinterpret_cast(this->solver); switch (num_moment) { case 0: { - decltype(&line::train) train = nullptr; + decltype(&graph::train) train = nullptr; if (solver->model == "DeepWalk") { if (optimizer.type == "SGD") - train = &deepwalk::train; + train = &graph::train; } if (solver->model == "LINE") { if (optimizer.type == "SGD") - train = &line::train; + train = &graph::train; } if (solver->model == "node2vec") { if (optimizer.type == "SGD") - train = &node2vec::train; + train = &graph::train; } if (train) { - train<<>> - (*embeddings[0], *embeddings[1], - batch, negative_batch, optimizer, solver->negative_weight -#ifdef USE_LOSS - , this->loss -#endif + train<<>>( + *embeddings[0], *embeddings[1], + batch, negative_batch, loss, optimizer, solver->negative_weight ); return true; } } case 1: { - decltype(&line::train_1_moment) train = nullptr; + decltype(&graph::train_1_moment) train = nullptr; if (solver->model == "DeepWalk") { if (optimizer.type == "Momentum") - train = &deepwalk::train_1_moment; + train = &graph::train_1_moment; if (optimizer.type == "AdaGrad") - train = &deepwalk::train_1_moment; + train = &graph::train_1_moment; if (optimizer.type == "RMSprop") - train = &deepwalk::train_1_moment; + train = &graph::train_1_moment; } if (solver->model == "LINE") { if (optimizer.type == "Momentum") - train = &line::train_1_moment; + train = &graph::train_1_moment; if (optimizer.type == "AdaGrad") - train = &line::train_1_moment; + train = &graph::train_1_moment; if (optimizer.type == "RMSprop") - train = &line::train_1_moment; + train = &graph::train_1_moment; } if (solver->model == "node2vec") { if (optimizer.type == "Momentum") - train = &node2vec::train_1_moment; + train = &graph::train_1_moment; if (optimizer.type == "AdaGrad") - train = &node2vec::train_1_moment; + train = &graph::train_1_moment; if (optimizer.type == "RMSprop") - train = &node2vec::train_1_moment; + train = &graph::train_1_moment; } if (train) { - train<<>> - (*embeddings[0], *embeddings[1], + train<<>>( + *embeddings[0], *embeddings[1], (*moments[0])[0], (*moments[1])[0], - batch, negative_batch, optimizer, solver->negative_weight -#ifdef USE_LOSS - , this->loss -#endif + batch, negative_batch, loss, optimizer, solver->negative_weight ); return true; } } case 2: { - decltype(&line::train_2_moment) train = nullptr; + decltype(&graph::train_2_moment) train = nullptr; if (solver->model == "DeepWalk") { if (optimizer.type == "Adam") - train = &deepwalk::train_2_moment; + train = &graph::train_2_moment; } if (solver->model == "LINE") { if (optimizer.type == "Adam") - train = &line::train_2_moment; + train = &graph::train_2_moment; } if (solver->model == "node2vec") { if (optimizer.type == "Adam") - train = &node2vec::train_2_moment; + train = &graph::train_2_moment; } if (train) { - train<<>> - (*embeddings[0], *embeddings[1], + train<<>>( + *embeddings[0], *embeddings[1], (*moments[0])[0], (*moments[1])[0], (*moments[0])[1], (*moments[1])[1], - batch, negative_batch, optimizer, solver->negative_weight -#ifdef USE_LOSS - , this->loss -#endif + batch, negative_batch, loss, optimizer, solver->negative_weight ); return true; } @@ -534,6 +526,29 @@ public: } return false; } + + /** + * Call the corresponding GPU kernel for prediction + * (DeepWalk, LINE, node2vec) + */ + bool predict_dispatch() override { + using namespace gpu; + GraphSolver *solver = reinterpret_cast(this->solver); + + decltype(&graph::predict) predict = nullptr; + if (solver->model == "DeepWalk") + predict = &graph::predict; + if 
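train_dispatch() resolves the (model name, optimizer type) pair to one concrete kernel instantiation through a function pointer and returns false when the combination is unsupported, so the caller can report an error instead of silently launching nothing. The shape of that idiom, reduced to ordinary host functions (the functions below are placeholders, not the patch's kernels):

    #include <iostream>
    #include <string>

    void train_sgd()  { std::cout << "SGD kernel\n"; }
    void train_adam() { std::cout << "Adam kernel\n"; }

    // Mirrors the boolean contract of train_dispatch(): true when a
    // (model, optimizer) combination is supported and has been run.
    bool train_dispatch(const std::string &model, const std::string &optimizer) {
        void (*train)() = nullptr;
        if (model == "LINE") {
            if (optimizer == "SGD")  train = &train_sgd;
            if (optimizer == "Adam") train = &train_adam;
        }
        if (train) {
            train();    // the real dispatchers launch a CUDA kernel here
            return true;
        }
        return false;   // unsupported combination
    }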
(solver->model == "LINE") + predict = &graph::predict; + if (solver->model == "node2vec") + predict = &graph::predict; + if (predict) { + predict<<>> + (*embeddings[0], *embeddings[1], batch, logits); + return true; + } + return false; + } }; /** @@ -760,6 +775,13 @@ public: } fclose(fout); } + + /** Free CPU and GPU memory, except the embeddings on CPU */ + void clear() override { + Base::clear(); + decltype(vertex_edge_tables)().swap(vertex_edge_tables); + decltype(edge_edge_tables)().swap(edge_edge_tables); + } }; } // namespace graphvite \ No newline at end of file diff --git a/include/instance/knowledge_graph.cuh b/include/instance/knowledge_graph.cuh index c94f325..6d1d0c7 100644 --- a/include/instance/knowledge_graph.cuh +++ b/include/instance/knowledge_graph.cuh @@ -22,6 +22,7 @@ #include "core/graph.h" #include "core/solver.h" +#include "model/knowledge_graph.h" #include "gpu/knowledge_graph.cuh" /** @@ -80,6 +81,7 @@ public: /** Clear the graph and free CPU memory */ void clear() override { Base::clear(); + num_relation = 0; decltype(entity2id)().swap(entity2id); decltype(relation2id)().swap(relation2id); decltype(id2entity)().swap(id2entity); @@ -293,10 +295,10 @@ public: } /** - * Call the corresponding GPU kernel + * Call the corresponding GPU kernel for training * (TransE, DistMult, ComplEx, SimplE, RotatE) * (SGD, Momentum, AdaGrad, RMSprop, Adam) */ - bool kernel_dispatch() override { + bool train_dispatch() override { using namespace gpu; KnowledgeGraphSolver *solver = reinterpret_cast(this->solver); @@ -307,126 +309,120 @@ public: margin_or_l3 = solver->l3_regularization; switch (num_moment) { case 0: { - decltype(&transe::train) train = nullptr; + decltype(&knowledge_graph::train) train = nullptr; if (solver->model == "TransE") { if (optimizer.type == "SGD") - train = &transe::train; + train = &knowledge_graph::train; } if (solver->model == "DistMult") { if (optimizer.type == "SGD") - train = &distmult::train; + train = &knowledge_graph::train; } if (solver->model == "ComplEx") { if (optimizer.type == "SGD") - train = &complex::train; + train = &knowledge_graph::train; } if (solver->model == "SimplE") { if (optimizer.type == "SGD") - train = &simple::train; + train = &knowledge_graph::train; } if (solver->model == "RotatE") { if (optimizer.type == "SGD") - train = &rotate::train; + train = &knowledge_graph::train; } if (train) { train<<>> - (*embeddings[0], *embeddings[1], *embeddings[2], *gradients[2], - batch, negative_batch, optimizer, margin_or_l3, solver->adversarial_temperature -#ifdef USE_LOSS - , this->loss -#endif - ); + (*embeddings[0], *embeddings[1], *embeddings[2], + batch, negative_batch, loss, + optimizer, margin_or_l3, solver->adversarial_temperature + ); return true; } break; } case 1: { - decltype(&transe::train_1_moment) train = nullptr; + decltype(&knowledge_graph::train_1_moment) train = nullptr; if (solver->model == "TransE") { if (optimizer.type == "Momentum") - train = &transe::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == "AdaGrad") - train = &transe::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == "RMSprop") - train = &transe::train_1_moment; + train = &knowledge_graph::train_1_moment; } if (solver->model == "DistMult") { if (optimizer.type == "Momentum") - train = &distmult::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == "AdaGrad") - train = &distmult::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == 
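The clear() override releases the alias tables with the swap-with-a-temporary idiom, decltype(x)().swap(x): std::vector::clear() keeps the allocated capacity, while swapping with an empty temporary hands the old buffer to the temporary, which frees it on destruction. A minimal illustration:

    #include <vector>

    void release(std::vector<int> &v) {
        std::vector<int>().swap(v);   // same effect as decltype(v)().swap(v)
        // v.size() == 0 and v.capacity() == 0; the old heap buffer is freed
    }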
"RMSprop") - train = &distmult::train_1_moment; + train = &knowledge_graph::train_1_moment; } if (solver->model == "ComplEx") { if (optimizer.type == "Momentum") - train = &complex::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == "AdaGrad") - train = &complex::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == "RMSprop") - train = &complex::train_1_moment; + train = &knowledge_graph::train_1_moment; } if (solver->model == "SimplE") { if (optimizer.type == "Momentum") - train = &simple::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == "AdaGrad") - train = &simple::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == "RMSprop") - train = &simple::train_1_moment; + train = &knowledge_graph::train_1_moment; } if (solver->model == "RotatE") { if (optimizer.type == "Momentum") - train = &rotate::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == "AdaGrad") - train = &rotate::train_1_moment; + train = &knowledge_graph::train_1_moment; if (optimizer.type == "RMSprop") - train = &rotate::train_1_moment; + train = &knowledge_graph::train_1_moment; } if (train) { train<<>> - (*embeddings[0], *embeddings[1], *embeddings[2], *gradients[2], + (*embeddings[0], *embeddings[1], *embeddings[2], (*moments[0])[0], (*moments[1])[0], (*moments[2])[0], - batch, negative_batch, optimizer, margin_or_l3, solver->adversarial_temperature -#ifdef USE_LOSS - , this->loss -#endif - ); + batch, negative_batch, loss, + optimizer, margin_or_l3, solver->adversarial_temperature + ); return true; } break; } case 2: { - decltype(&transe::train_2_moment) train = nullptr; + decltype(&knowledge_graph::train_2_moment) train = nullptr; if (solver->model == "TransE") { if (optimizer.type == "Adam") - train = &transe::train_2_moment; + train = &knowledge_graph::train_2_moment; } if (solver->model == "DistMult") { if (optimizer.type == "Adam") - train = &distmult::train_2_moment; + train = &knowledge_graph::train_2_moment; } if (solver->model == "ComplEx") { if (optimizer.type == "Adam") - train = &complex::train_2_moment; + train = &knowledge_graph::train_2_moment; } if (solver->model == "SimplE") { if (optimizer.type == "Adam") - train = &simple::train_2_moment; + train = &knowledge_graph::train_2_moment; } if (solver->model == "RotatE") { if (optimizer.type == "Adam") - train = &rotate::train_2_moment; + train = &knowledge_graph::train_2_moment; } if (train) { train<<>> - (*embeddings[0], *embeddings[1], *embeddings[2], *gradients[2], + (*embeddings[0], *embeddings[1], *embeddings[2], (*moments[0])[0], (*moments[1])[0], (*moments[2])[0], (*moments[0])[1], (*moments[1])[1], (*moments[2])[1], - batch, negative_batch, optimizer, margin_or_l3, solver->adversarial_temperature -#ifdef USE_LOSS - , this->loss -#endif - ); + batch, negative_batch, loss, + optimizer, margin_or_l3, solver->adversarial_temperature + ); return true; } break; @@ -434,6 +430,33 @@ public: } return false; } + + /** + * Call the corresponding GPU kernel for prediction + * (TransE, DistMult, ComplEx, SimplE, RotatE) + */ + bool predict_dispatch() override { + using namespace gpu; + KnowledgeGraphSolver *solver = reinterpret_cast(this->solver); + + decltype(&knowledge_graph::predict) predict = nullptr; + if (solver->model == "TransE") + predict = &knowledge_graph::predict; + if (solver->model == "DistMult") + predict = &knowledge_graph::predict; + if (solver->model == "ComplEx") + predict = 
&knowledge_graph::predict; + if (solver->model == "SimplE") + predict = &knowledge_graph::predict; + if (solver->model == "RotatE") + predict = &knowledge_graph::predict; + if (predict) { + predict<<>> + (*embeddings[0], *embeddings[1], *embeddings[2], batch, logits, solver->margin); + return true; + } + return false; + } }; /** @@ -488,7 +511,7 @@ public: /** Return the default optimizer type and its hyperparameters */ inline Optimizer get_default_optimizer() const override { - return Adam(1e-4, 0); + return Adam(5e-5, 0); } /** Build alias reference for embeddings */ @@ -568,11 +591,11 @@ public: * @param _l3_regularization l3 regularization (for DistMult, ComplEx & SimplE) * @param _sample_batch_size batch size of samples in samplers * @param _positive_reuse times of reusing positive samples - * @param _adversarial_temperature temperature of adversarial negative sampling, + * @param _adversarial_temperature temperature of self-adversarial negative sampling, * disabled when set to non-positive value * @param _log_frequency log every log_frequency batches */ - void train(const std::string &_model = "RotatE", int _num_epoch = 2000, bool _resume = false, float _margin = 24, + void train(const std::string &_model = "RotatE", int _num_epoch = 2000, bool _resume = false, float _margin = 12, float _l3_regularization = 2e-3, int _sample_batch_size = 2000, int _positive_reuse = 1, float _adversarial_temperature = 2, int _log_frequency = 100) { margin = _margin; diff --git a/include/instance/model/graph.h b/include/instance/model/graph.h new file mode 100644 index 0000000..1fe08af --- /dev/null +++ b/include/instance/model/graph.h @@ -0,0 +1,108 @@ +/** + * Copyright 2019 MilaGraph. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * @author Zhaocheng Zhu, Shizhen Xu + */ + +#pragma once + +#include "core/optimizer.h" +#include "util/gpu.cuh" + +namespace graphvite { + +/** + * @brief LINE model + * @tparam _Vector vector type of embeddings + * + * Forward: dot(vertex, context) + * Backward: gradient of forward function + */ +template +class LINE { +public: + static const size_t dim = _Vector::dim; + typedef _Vector Vector; + typedef typename _Vector::Float Float; + + __host__ __device__ static void forward(const Vector &vertex, const Vector &context, Float &output) { + output = 0; + FOR(i, dim) + output += vertex[i] * context[i]; + output = SUM(output); + } + + template + __host__ __device__ + static void backward(Vector &vertex, Vector &context, + Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function < Float, optimizer_type>(); + FOR(i, dim) { + Float v = vertex[i]; + Float c = context[i]; + vertex[i] -= (optimizer.*update)(v, gradient * c, weight); + context[i] -= (optimizer.*update)(c, gradient * v, weight); + } + } + + template + __host__ __device__ + static void backward(Vector &vertex, Vector &context, Vector &vertex_moment1, Vector &context_moment1, + Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_1_moment < Float, optimizer_type>(); + FOR(i, dim) { + Float v = vertex[i]; + Float c = context[i]; + vertex[i] -= (optimizer.*update)(v, gradient * c, vertex_moment1[i], weight); + context[i] -= (optimizer.*update)(c, gradient * v, context_moment1[i], weight); + } + } + + template + __host__ __device__ + static void backward(Vector &vertex, Vector &context, Vector &vertex_moment1, Vector &context_moment1, + Vector &vertex_moment2, Vector &context_moment2, + Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_2_moment < Float, optimizer_type>(); + FOR(i, dim) { + Float v = vertex[i]; + Float c = context[i]; + vertex[i] -= (optimizer.*update)(v, gradient * c, vertex_moment1[i], vertex_moment2[i], weight); + context[i] -= (optimizer.*update)(c, gradient * v, context_moment1[i], context_moment2[i], weight); + } + } +}; + +/** + * @brief DeepWalk model + * @tparam _Vector vector type of embeddings + * + * Forward: dot(vertex, context) + * Backward: gradient of forward function + */ +template +class DeepWalk : public LINE<_Vector> {}; + +/** + * @brief node2vec model + * @tparam _Vector vector type of embeddings + * + * Forward: dot(vertex, context) + * Backward: gradient of forward function + */ +template +class Node2Vec : public LINE<_Vector> {}; + +} \ No newline at end of file diff --git a/include/instance/model/knowledge_graph.h b/include/instance/model/knowledge_graph.h new file mode 100644 index 0000000..c635b01 --- /dev/null +++ b/include/instance/model/knowledge_graph.h @@ -0,0 +1,547 @@ +/** + * Copyright 2019 MilaGraph. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
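LINE scores an edge as the dot product of the vertex and context embeddings, and DeepWalk and node2vec reuse the same forward/backward by inheriting from it; the gradient with respect to one embedding is simply the other embedding scaled by the incoming loss gradient. A plain CPU sketch without the FOR/SUM macros or the optimizer indirection (the explicit learning rate below is illustrative; the real backward defers to the optimizer's update function):

    #include <cstddef>

    float line_forward(const float *vertex, const float *context, size_t dim) {
        float score = 0;
        for (size_t i = 0; i < dim; i++)
            score += vertex[i] * context[i];     // dot(vertex, context)
        return score;
    }

    void line_backward_sgd(float *vertex, float *context, size_t dim,
                           float gradient, float lr) {
        for (size_t i = 0; i < dim; i++) {
            float v = vertex[i], c = context[i];
            vertex[i]  -= lr * gradient * c;     // d(score)/d(vertex[i])  = context[i]
            context[i] -= lr * gradient * v;     // d(score)/d(context[i]) = vertex[i]
        }
    }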
+ * + * @author Zhaocheng Zhu + */ + +#pragma once + +#include "core/optimizer.h" +#include "util/gpu.cuh" +#include "util/math.h" + +namespace graphvite { + +/** + * @brief TransE model + * @tparam _Vector vector type of embeddings + * + * Forward: margin - L1_norm(head + relation - tail) + * Backward: gradient of forward function + */ +template +class TransE { +public: + static const size_t dim = _Vector::dim; + typedef _Vector Vector; + typedef typename _Vector::Float Float; + + __host__ __device__ + static void forward(const Vector &head, const Vector &tail, const Vector &relation, Float &output, float margin) { + output = 0; + FOR(i, dim) + output += abs(head[i] + relation[i] - tail[i]); + output = margin - SUM(output); + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + float margin, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function(); + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + Float r = relation[i]; + Float s = h + r - t > 0 ? 1 : -1; + head[i] -= (optimizer.*update)(h, -gradient * s, weight); + tail[i] -= (optimizer.*update)(t, gradient * s, weight); + relation[i] -= (optimizer.*update)(r, -gradient * s, weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + float margin, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_1_moment(); + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + Float r = relation[i]; + Float s = h + r - t > 0 ? 1 : -1; + head[i] -= (optimizer.*update)(h, -gradient * s, head_moment1[i], weight); + tail[i] -= (optimizer.*update)(t, gradient * s, tail_moment1[i], weight); + relation[i] -= (optimizer.*update)(r, -gradient * s, relation_moment1[i], weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + Vector &head_moment2, Vector &tail_moment2, Vector &relation_moment2, + float margin, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_2_moment(); + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + Float r = relation[i]; + Float s = h + r - t > 0 ? 
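TransE scores a triplet as the margin minus the L1 distance between head + relation and tail, so the per-coordinate subgradient is just a sign, which is the s term in the backward functions. A CPU sketch of the forward score (names are illustrative):

    #include <cmath>
    #include <cstddef>

    float transe_forward(const float *head, const float *tail, const float *relation,
                         size_t dim, float margin) {
        float distance = 0;
        for (size_t i = 0; i < dim; i++)
            distance += std::fabs(head[i] + relation[i] - tail[i]);
        return margin - distance;
    }
    // d(score)/d(head[i]) = -sign(head[i] + relation[i] - tail[i]); tail gets the
    // opposite sign and relation the same sign as head, matching the updates above.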
1 : -1; + head[i] -= (optimizer.*update)(h, -gradient * s, head_moment1[i], head_moment2[i], weight); + tail[i] -= (optimizer.*update)(t, gradient * s, tail_moment1[i], tail_moment2[i], weight); + relation[i] -= (optimizer.*update)(r, -gradient * s, relation_moment1[i], relation_moment2[i], weight); + } + } +}; + +/** + * @brief DistMult model + * @tparam _Vector vector type of embeddings + * + * Forward: sum(head * relation * tail) + * Backward: gradient of forward function, with l3 regularization on each parameter + */ +template +class DistMult { +public: + static const size_t dim = _Vector::dim; + typedef _Vector Vector; + typedef typename _Vector::Float Float; + + __host__ __device__ + static void forward(const Vector &head, const Vector &tail, const Vector &relation, Float &output, + float l3_regularization) { + output = 0; + FOR(i, dim) + output += head[i] * relation[i] * tail[i]; + output = SUM(output); + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + float l3_regularization, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function(); + l3_regularization *= 3; + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + Float r = relation[i]; + head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, weight); + tail[i] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, weight); + relation[i] -= (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + float l3_regularization, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_1_moment(); + l3_regularization *= 3; + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + Float r = relation[i]; + head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, + head_moment1[i], weight); + tail[i] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, + tail_moment1[i], weight); + relation[i] -= (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, + relation_moment1[i], weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + Vector &head_moment2, Vector &tail_moment2, Vector &relation_moment2, + float l3_regularization, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_2_moment(); + l3_regularization *= 3; + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + Float r = relation[i]; + head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, + head_moment1[i], head_moment2[i], weight); + tail[i] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, + tail_moment1[i], tail_moment2[i], weight); + relation[i] -= (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, + relation_moment1[i], relation_moment2[i], weight); + } + } +}; + +/** + * @brief ComplEx model + * @tparam _Vector vector type of embeddings + * + * Forward: real(sum(head * relation * conjugate(tail))) + * Backward: gradient of forward function, with l3 regularization on each parameter + */ +template +class ComplEx { +public: + 
static_assert(_Vector::dim % 2 == 0, "Model `ComplEx` can only be instantiated with even-dimensional vectors"); + static const size_t dim = _Vector::dim; + typedef _Vector Vector; + typedef typename _Vector::Float Float; + + __host__ __device__ + static void forward(const Vector &head, const Vector &tail, const Vector &relation, Float &output, + float l3_regularization) { + output = 0; + FOR(i, dim / 2) { + Float h_re = head[i * 2]; + Float h_im = head[i * 2 + 1]; + Float t_re = tail[i * 2]; + Float t_im = tail[i * 2 + 1]; + Float r_re = relation[i * 2]; + Float r_im = relation[i * 2 + 1]; + Float product_re = h_re * r_re - h_im * r_im; + Float product_im = h_re * r_im + h_im * r_re; + output += product_re * t_re + product_im * t_im; + } + output = SUM(output); + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + float l3_regularization, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function(); + l3_regularization *= 3; + FOR(i, dim / 2) { + Float h_re = head[i * 2]; + Float h_im = head[i * 2 + 1]; + Float t_re = tail[i * 2]; + Float t_im = tail[i * 2 + 1]; + Float r_re = relation[i * 2]; + Float r_im = relation[i * 2 + 1]; + // head + Float h_re_grad = gradient * (r_re * t_re + r_im * t_im); + Float h_im_grad = gradient * (-r_im * t_re + r_re * t_im); + head[i * 2] -= (optimizer.*update)(h_re, h_re_grad + l3_regularization * abs(h_re) * h_re, weight); + head[i * 2 + 1] -= (optimizer.*update)(h_im, h_im_grad + l3_regularization * abs(h_im) * h_im, weight); + // tail + Float t_re_grad = gradient * (h_re * r_re - h_im * r_im); + Float t_im_grad = gradient * (h_re * r_im + h_im * r_re); + tail[i * 2] -= (optimizer.*update)(t_re, t_re_grad + l3_regularization * abs(t_re) * t_re, weight); + tail[i * 2 + 1] -= (optimizer.*update)(t_im, t_im_grad + l3_regularization * abs(t_im) * t_im, weight); + // relation + Float r_re_grad = gradient * (h_re * t_re + h_im * t_im); + Float r_im_grad = gradient * (-h_im * t_re + h_re * t_im); + relation[i * 2] -= (optimizer.*update)(r_re, r_re_grad + l3_regularization * abs(r_re) * r_re, weight); + relation[i * 2 + 1] -= (optimizer.*update)(r_im, r_im_grad + l3_regularization * abs(r_im) * r_im, weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + float l3_regularization, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_1_moment(); + l3_regularization *= 3; + FOR(i, dim / 2) { + Float h_re = head[i * 2]; + Float h_im = head[i * 2 + 1]; + Float t_re = tail[i * 2]; + Float t_im = tail[i * 2 + 1]; + Float r_re = relation[i * 2]; + Float r_im = relation[i * 2 + 1]; + // head + Float h_re_grad = gradient * (r_re * t_re + r_im * t_im); + Float h_im_grad = gradient * (-r_im * t_re + r_re * t_im); + head[i * 2] -= (optimizer.*update)(h_re, h_re_grad + l3_regularization * abs(h_re) * h_re, + head_moment1[i * 2], weight); + head[i * 2 + 1] -= (optimizer.*update)(h_im, h_im_grad + l3_regularization * abs(h_im) * h_im, + head_moment1[i * 2 + 1], weight); + // tail + Float t_re_grad = gradient * (h_re * r_re - h_im * r_im); + Float t_im_grad = gradient * (h_re * r_im + h_im * r_re); + tail[i * 2] -= (optimizer.*update)(t_re, t_re_grad + l3_regularization * abs(t_re) * t_re, + tail_moment1[i * 2], weight); + tail[i * 2 + 1] -= (optimizer.*update)(t_im, t_im_grad + l3_regularization * 
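ComplEx stores each embedding as interleaved (real, imaginary) pairs and scores a triplet as the real part of sum(head * relation * conjugate(tail)); expanding the complex product gives exactly the product_re / product_im terms in the forward above. A CPU sketch of the score:

    #include <cstddef>

    float complex_forward(const float *head, const float *tail, const float *relation,
                          size_t dim /* must be even */) {
        float score = 0;
        for (size_t i = 0; i < dim / 2; i++) {
            float h_re = head[2 * i],     h_im = head[2 * i + 1];
            float t_re = tail[2 * i],     t_im = tail[2 * i + 1];
            float r_re = relation[2 * i], r_im = relation[2 * i + 1];
            float hr_re = h_re * r_re - h_im * r_im;   // real part of h * r
            float hr_im = h_re * r_im + h_im * r_re;   // imaginary part of h * r
            score += hr_re * t_re + hr_im * t_im;      // dot with conj(t): keep the real part
        }
        return score;
    }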
abs(t_im) * t_im, + tail_moment1[i * 2 + 1], weight); + // relation + Float r_re_grad = gradient * (h_re * t_re + h_im * t_im); + Float r_im_grad = gradient * (-h_im * t_re + h_re * t_im); + relation[i * 2] -= (optimizer.*update)(r_re, r_re_grad + l3_regularization * abs(r_re) * r_re, + relation_moment1[i], weight); + relation[i * 2 + 1] -= (optimizer.*update)(r_im, r_im_grad + l3_regularization * abs(r_im) * r_im, + relation_moment1[i], weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + Vector &head_moment2, Vector &tail_moment2, Vector &relation_moment2, + float l3_regularization, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_2_moment(); + l3_regularization *= 3; + FOR(i, dim / 2) { + Float h_re = head[i * 2]; + Float h_im = head[i * 2 + 1]; + Float t_re = tail[i * 2]; + Float t_im = tail[i * 2 + 1]; + Float r_re = relation[i * 2]; + Float r_im = relation[i * 2 + 1]; + // head + Float h_re_grad = gradient * (r_re * t_re + r_im * t_im); + Float h_im_grad = gradient * (-r_im * t_re + r_re * t_im); + head[i * 2] -= (optimizer.*update)(h_re, h_re_grad + l3_regularization * abs(h_re) * h_re, + head_moment1[i * 2], head_moment2[i * 2], weight); + head[i * 2 + 1] -= (optimizer.*update)(h_im, h_im_grad + l3_regularization * abs(h_im) * h_im, + head_moment1[i * 2 + 1], head_moment2[i * 2 + 1], weight); + // tail + Float t_re_grad = gradient * (h_re * r_re - h_im * r_im); + Float t_im_grad = gradient * (h_re * r_im + h_im * r_re); + tail[i * 2] -= (optimizer.*update)(t_re, t_re_grad + l3_regularization * abs(t_re) * t_re, + tail_moment1[i * 2], tail_moment2[i * 2], weight); + tail[i * 2 + 1] -= (optimizer.*update)(t_im, t_im_grad + l3_regularization * abs(t_im) * t_im, + tail_moment1[i * 2 + 1], tail_moment2[i * 2 + 1], weight); + // relation + Float r_re_grad = gradient * (h_re * t_re + h_im * t_im); + Float r_im_grad = gradient * (-h_im * t_re + h_re * t_im); + relation[i * 2] -= (optimizer.*update)(r_re, r_re_grad + l3_regularization * abs(r_re) * r_re, + relation_moment1[i], relation_moment2[i], weight); + relation[i * 2 + 1] -= (optimizer.*update)(r_im, r_im_grad + l3_regularization * abs(r_im) * r_im, + relation_moment1[i], relation_moment2[i], weight); + } + } +}; + +/** + * @brief SimplE model + * @tparam _Vector vector type of embeddings + * + * Forward: sum(head * relation * flip(tail)) + * Backward: gradient of forward function, with l3 regularization on each parameter + */ +template +class SimplE { +public: + static_assert(_Vector::dim % 2 == 0, "Model `SimplE` can only be instantiated with even-dimensional vectors"); + static const size_t dim = _Vector::dim; + typedef _Vector Vector; + typedef typename _Vector::Float Float; + + __host__ __device__ + static void forward(const Vector &head, const Vector &tail, const Vector &relation, Float &output, + float l3_regularization) { + output = 0; + FOR(i, dim) { + int j = i ^ 1; + output += head[i] * relation[i] * tail[j]; + } + output = SUM(output); + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + float l3_regularization, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function(); + l3_regularization *= 3; + FOR(i, dim) { + int j = i ^ 1; + Float h = head[i]; + Float t = tail[j]; + Float r = relation[i]; + head[i] -= (optimizer.*update)(h, 
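SimplE scores sum(head * relation * flip(tail)), where flip swaps every adjacent pair of coordinates; the kernels express the flip as the index trick j = i ^ 1. A CPU sketch of the score:

    #include <cstddef>

    float simple_forward(const float *head, const float *tail, const float *relation,
                         size_t dim /* must be even */) {
        float score = 0;
        for (size_t i = 0; i < dim; i++)
            score += head[i] * relation[i] * tail[i ^ 1];  // i ^ 1 swaps 0<->1, 2<->3, ...
        return score;
    }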
gradient * r * t + l3_regularization * abs(h) * h, weight); + tail[j] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, weight); + relation[i] -= (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + float l3_regularization, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_1_moment(); + l3_regularization *= 3; + FOR(i, dim) { + int j = i ^ 1; + Float h = head[i]; + Float t = tail[j]; + Float r = relation[i]; + head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, + head_moment1[i], weight); + tail[j] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, + tail_moment1[j], weight); + relation[i] -= (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, + relation_moment1[i], weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + Vector &head_moment2, Vector &tail_moment2, Vector &relation_moment2, + float l3_regularization, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_2_moment(); + l3_regularization *= 3; + FOR(i, dim) { + int j = i ^ 1; + Float h = head[i]; + Float t = tail[j]; + Float r = relation[i]; + head[i] -= (optimizer.*update)(h, gradient * r * t + l3_regularization * abs(h) * h, + head_moment1[i], head_moment2[i], weight); + tail[j] -= (optimizer.*update)(t, gradient * h * r + l3_regularization * abs(t) * t, + tail_moment1[j], tail_moment2[j], weight); + relation[i] -= (optimizer.*update)(r, gradient * h * t + l3_regularization * abs(r) * r, + relation_moment1[i], relation_moment2[i], weight); + } + } +}; + +/** + * @brief RotatE model + * @tparam _Vector vector type of embeddings + * + * Forward: margin - L1_norm(head * relation - tail), with constraint L1_norm(relation[*]) = 1 + * Backward: gradient of forward function + * + * In practice, the relation is reparameterized as a phase vector to remove the constraint. 
+ */ +template +class RotatE { +public: + static_assert(_Vector::dim % 2 == 0, "Model `RotatE` can only be instantiated with even-dimensional vectors"); + static const size_t dim = _Vector::dim; + typedef _Vector Vector; + typedef typename _Vector::Float Float; + + __host__ __device__ + static void forward(const Vector &head, const Vector &tail, const Vector &relation, Float &output, float margin) { + output = 0; + FOR(i, dim / 2) { + Float h_re = head[i * 2]; + Float h_im = head[i * 2 + 1]; + Float t_re = tail[i * 2]; + Float t_im = tail[i * 2 + 1]; + Float phase = relation[i]; + Float r_re = cos(phase); + Float r_im = sin(phase); + Float distance_re = h_re * r_re - h_im * r_im - t_re; + Float distance_im = h_re * r_im + h_im * r_re - t_im; + output += sqrt(distance_re * distance_re + distance_im * distance_im); + } + output = margin - SUM(output); + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + float margin, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function(); + FOR(i, dim / 2) { + Float phase = relation[i]; + Float r_re = cos(phase); + Float r_im = sin(phase); + Float h_re = head[i * 2]; + Float h_im = head[i * 2 + 1]; + Float t_re = tail[i * 2]; + Float t_im = tail[i * 2 + 1]; + Float distance_re = h_re * r_re - h_im * r_im - t_re; + Float distance_im = h_re * r_im + h_im * r_re - t_im; + Float grad = gradient / (sqrt(distance_re * distance_re + distance_im * distance_im) + kEpsilon); + // head + Float head_re_grad = -grad * (distance_re * r_re + distance_im * r_im); + Float head_im_grad = -grad * (-distance_re * r_im + distance_im * r_re); + head[i * 2] -= (optimizer.*update)(h_re, head_re_grad, weight); + head[i * 2 + 1] -= (optimizer.*update)(h_im, head_im_grad, weight); + // tail + tail[i * 2] -= (optimizer.*update)(t_re, grad * distance_re, weight); + tail[i * 2 + 1] -= (optimizer.*update)(t_im, grad * distance_im, weight); + // relation + Float relation_grad = + -grad * (distance_re * (h_re * -r_im + h_im * -r_re) + distance_im * (h_re * r_re + h_im * -r_im)); + relation[i] -= (optimizer.*update)(phase, relation_grad, weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + float margin, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_1_moment(); + FOR(i, dim / 2) { + Float phase = relation[i]; + Float r_re = cos(phase); + Float r_im = sin(phase); + Float h_re = head[i * 2]; + Float h_im = head[i * 2 + 1]; + Float t_re = tail[i * 2]; + Float t_im = tail[i * 2 + 1]; + Float distance_re = h_re * r_re - h_im * r_im - t_re; + Float distance_im = h_re * r_im + h_im * r_re - t_im; + Float grad = gradient / (sqrt(distance_re * distance_re + distance_im * distance_im) + kEpsilon); + // head + Float head_re_grad = -grad * (distance_re * r_re + distance_im * r_im); + Float head_im_grad = -grad * (-distance_re * r_im + distance_im * r_re); + head[i * 2] -= (optimizer.*update)(h_re, head_re_grad, head_moment1[i * 2], weight); + head[i * 2 + 1] -= (optimizer.*update)(h_im, head_im_grad, head_moment1[i * 2 + 1], weight); + // tail + tail[i * 2] -= (optimizer.*update)(t_re, grad * distance_re, tail_moment1[i * 2], weight); + tail[i * 2 + 1] -= (optimizer.*update)(t_im, grad * distance_im, tail_moment1[i * 2 + 1], weight); + // relation + Float relation_grad = + -grad * (distance_re * (h_re * -r_im 
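RotatE treats each pair of head/tail coordinates as a complex number and each relation coordinate as a phase: the head is rotated by cos/sin of the phase and the score is the margin minus the summed complex distances to the tail. Storing the phase directly is what removes the unit-modulus constraint mentioned in the class comment. A CPU sketch of the forward score:

    #include <cmath>
    #include <cstddef>

    float rotate_forward(const float *head, const float *tail, const float *relation,
                         size_t dim /* must be even */, float margin) {
        float distance = 0;
        for (size_t i = 0; i < dim / 2; i++) {
            float h_re = head[2 * i], h_im = head[2 * i + 1];
            float t_re = tail[2 * i], t_im = tail[2 * i + 1];
            float phase = relation[i];                       // phase parameterization
            float r_re = std::cos(phase), r_im = std::sin(phase);
            float d_re = h_re * r_re - h_im * r_im - t_re;   // real part of h * r - t
            float d_im = h_re * r_im + h_im * r_re - t_im;   // imaginary part of h * r - t
            distance += std::sqrt(d_re * d_re + d_im * d_im);
        }
        return margin - distance;
    }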
+ h_im * -r_re) + distance_im * (h_re * r_re + h_im * -r_im)); + relation[i] -= (optimizer.*update)(phase, relation_grad, relation_moment1[i], weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &relation, + Vector &head_moment1, Vector &tail_moment1, Vector &relation_moment1, + Vector &head_moment2, Vector &tail_moment2, Vector &relation_moment2, + float margin, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_2_moment(); + FOR(i, dim / 2) { + Float phase = relation[i]; + Float r_re = cos(phase); + Float r_im = sin(phase); + Float h_re = head[i * 2]; + Float h_im = head[i * 2 + 1]; + Float t_re = tail[i * 2]; + Float t_im = tail[i * 2 + 1]; + Float distance_re = h_re * r_re - h_im * r_im - t_re; + Float distance_im = h_re * r_im + h_im * r_re - t_im; + Float grad = gradient / (sqrt(distance_re * distance_re + distance_im * distance_im) + kEpsilon); + // head + Float head_re_grad = -grad * (distance_re * r_re + distance_im * r_im); + Float head_im_grad = -grad * (-distance_re * r_im + distance_im * r_re); + head[i * 2] -= (optimizer.*update)(h_re, head_re_grad, + head_moment1[i * 2], head_moment2[i * 2], weight); + head[i * 2 + 1] -= (optimizer.*update)(h_im, head_im_grad, + head_moment1[i * 2 + 1], head_moment2[i * 2 + 1], weight); + // tail + tail[i * 2] -= (optimizer.*update)(t_re, grad * distance_re, + tail_moment1[i * 2], tail_moment2[i * 2], weight); + tail[i * 2 + 1] -= (optimizer.*update)(t_im, grad * distance_im, + tail_moment1[i * 2 + 1], tail_moment2[i * 2 + 1], weight); + // relation + Float relation_grad = + -grad * (distance_re * (h_re * -r_im + h_im * -r_re) + distance_im * (h_re * r_re + h_im * -r_im)); + relation[i] -= (optimizer.*update)(phase, relation_grad, relation_moment1[i], relation_moment2[i], weight); + } + } +}; + +} \ No newline at end of file diff --git a/include/instance/model/visualization.h b/include/instance/model/visualization.h new file mode 100644 index 0000000..7352b91 --- /dev/null +++ b/include/instance/model/visualization.h @@ -0,0 +1,88 @@ +/** + * Copyright 2019 MilaGraph. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * @author Zhaocheng Zhu + */ + +#pragma once + +#include "core/optimizer.h" +#include "util/gpu.cuh" + +namespace graphvite { + +/** + * @brief LargeVis model + * @tparam _Vector vector type of embeddings + * + * Forward: L2_norm(head - tail) ^ 2 + * Backward: gradient of forward function + */ +template +class LargeVis { +public: + static const size_t dim = _Vector::dim; + typedef _Vector Vector; + typedef typename _Vector::Float Float; + + __host__ __device__ + static void forward(const Vector &head, const Vector &tail, Float &output) { + output = 0; + FOR(i, dim) + output += (head[i] - tail[i]) * (head[i] - tail[i]); + output = SUM(output); + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function(); + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + head[i] -= (optimizer.*update)(h, gradient * (h - t), weight); + tail[i] -= (optimizer.*update)(t, gradient * (t - h), weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &head_moment1, Vector &tail_moment1, + Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_1_moment(); + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + head[i] -= (optimizer.*update)(h, gradient * (h - t), head_moment1[i], weight); + tail[i] -= (optimizer.*update)(t, gradient * (t - h), tail_moment1[i], weight); + } + } + + template + __host__ __device__ + static void backward(Vector &head, Vector &tail, Vector &head_moment1, Vector &tail_moment1, + Vector &head_moment2, Vector &tail_moment2, + Float gradient, const Optimizer &optimizer, Float weight = 1) { + auto update = get_update_function_2_moment(); + FOR(i, dim) { + Float h = head[i]; + Float t = tail[i]; + head[i] -= (optimizer.*update)(h, gradient * (h - t), head_moment1[i], head_moment2[i], weight); + tail[i] -= (optimizer.*update)(t, gradient * (t - h), tail_moment1[i], tail_moment2[i], weight); + } + } +}; + +} \ No newline at end of file diff --git a/include/instance/visualization.cuh b/include/instance/visualization.cuh index 92da778..adb3d99 100644 --- a/include/instance/visualization.cuh +++ b/include/instance/visualization.cuh @@ -21,13 +21,17 @@ #include #include #include +#include #include "faiss/gpu/GpuIndexFlat.h" #include "faiss/gpu/StandardGpuResources.h" #include "graph.cuh" #include "core/solver.h" +#include "model/visualization.h" #include "gpu/visualization.cuh" +namespace py = pybind11; + /** * @page Graph & High-dimensional Data Visualization * @@ -186,23 +190,26 @@ public: float weight = std::get<1>(vertex_edge); norm += weight; } - float low = 0, high = num_neighbor * kLogitClip / norm, beta; + float low = -1, high = -1, beta = 1; for (int j = 0; j < 100; j++) { norm = 0; entropy = 0; - beta = (low + high) / 2; for (auto &&vertex_edge : vertex_edges[i]) { float weight = std::get<1>(vertex_edge); - norm += exp(-beta * weight); - entropy += beta * weight * exp(-beta * weight); + norm += std::exp(-beta * weight); + entropy += beta * weight * std::exp(-beta * weight); } entropy = entropy / norm + log(norm); if (abs(entropy - log(perplexity)) < 1e-5) break; - if (entropy > log(perplexity)) + if (entropy > log(perplexity)) { low = beta; - else + beta = high < 0 ? beta * 2 : (beta + high) / 2; + } + else { high = beta; + beta = low < 0 ? 
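LargeVis measures a pair of points by their squared Euclidean distance in the low-dimensional embedding; the backward functions above apply gradient * (head[i] - tail[i]) per coordinate and leave any constant factor to the caller. A CPU sketch of the forward score:

    #include <cstddef>

    float largevis_forward(const float *head, const float *tail, size_t dim) {
        float distance2 = 0;
        for (size_t i = 0; i < dim; i++) {
            float d = head[i] - tail[i];
            distance2 += d * d;                // squared L2 norm of (head - tail)
        }
        return distance2;
    }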
beta / 2 : (beta + high) / 2; + } } for (auto &&vertex_edge : vertex_edges[i]) { float &weight = std::get<1>(vertex_edge); @@ -312,6 +319,8 @@ public: vectors.push_back(f); current_dim++; } + if (!current_dim) + continue; if (!dim) dim = current_dim; CHECK(current_dim == dim) @@ -352,6 +361,28 @@ public: LOG(WARNING) << pretty::block(info()); } + + void load_numpy(const py::array_t &_array, int _num_neighbor = 200, float _perplexity = 30, + bool _normalized_vector = true) { + CHECK(_array.ndim() == 2) << "Expect a 2d array, but a " << _array.ndim() << "d array is found"; + clear(); + + num_neighbor = _num_neighbor; + perplexity = _perplexity; + CHECK(perplexity <= num_neighbor) << "`perplexity` should be no larger than `#neighbor`"; + vector_normalization = _normalized_vector; + + auto array = _array.unchecked(); + num_vertex = array.shape(0); + dim = array.shape(1); + vectors.resize(array.size()); + for (Index i = 0; i < num_vertex; i++) + for (int j = 0; j < dim; j++) + vectors[i * dim + j] = array(i, j); + build(); + + LOG(WARNING) << pretty::block(info()); + } }; template @@ -381,67 +412,59 @@ class VisualizationWorker : public WorkerMixin<_Solver> { typedef VisualizationSolver VisualizationSolver; /** - * Call the corresponding GPU kernel + * Call the corresponding GPU kernel for training * (LargeVis) * (SGD, Momentum, AdaGrad, RMSprop, Adam) */ - bool kernel_dispatch() override { + bool train_dispatch() override { using namespace gpu; VisualizationSolver *solver = reinterpret_cast(this->solver); switch (num_moment) { case 0: { - decltype(&largevis::train) train = nullptr; + decltype(&visualization::train) train = nullptr; if (solver->model == "LargeVis") { if (optimizer.type == "SGD") - train = &largevis::train; + train = &visualization::train; } if (train) { train<<>> - (*embeddings[0], *embeddings[1], batch, negative_batch, optimizer, solver->negative_weight -#ifdef USE_LOSS - , this->loss -#endif + (*embeddings[0], *embeddings[1], batch, negative_batch, loss, + optimizer, solver->negative_weight ); return true; } break; } case 1: { - decltype(&largevis::train_1_moment) train = nullptr; + decltype(&visualization::train_1_moment) train = nullptr; if (solver->model == "LargeVis") { if (optimizer.type == "Momentum") - train = &largevis::train_1_moment; + train = &visualization::train_1_moment; if (optimizer.type == "AdaGrad") - train = &largevis::train_1_moment; + train = &visualization::train_1_moment; if (optimizer.type == "RMSprop") - train = &largevis::train_1_moment; + train = &visualization::train_1_moment; } if (train) { train<<>> (*embeddings[0], *embeddings[1], (*moments[0])[0], (*moments[1])[0], - batch, negative_batch, optimizer, solver->negative_weight -#ifdef USE_LOSS - , this->loss -#endif + batch, negative_batch, loss, optimizer, solver->negative_weight ); return true; } break; } case 2: { - decltype(&largevis::train_2_moment) train = nullptr; + decltype(&visualization::train_2_moment) train = nullptr; if (solver->model == "LargeVis") { if (optimizer.type == "Adam") - train = &largevis::train_2_moment; + train = &visualization::train_2_moment; } if (train) { train<<>> (*embeddings[0], *embeddings[1], (*moments[0])[0], (*moments[1])[0], (*moments[0])[1], (*moments[1])[1], - batch, negative_batch, optimizer, solver->negative_weight -#ifdef USE_LOSS - , this->loss -#endif + batch, negative_batch, loss, optimizer, solver->negative_weight ); return true; } @@ -450,6 +473,10 @@ class VisualizationWorker : public WorkerMixin<_Solver> { } return false; } + + virtual bool 
predict_dispatch() { + return false; + } }; /** diff --git a/include/util/common.h b/include/util/common.h index 36758b0..c18d552 100644 --- a/include/util/common.h +++ b/include/util/common.h @@ -19,13 +19,13 @@ #pragma once #include "io.h" +#include "math.h" namespace graphvite { #define DEPRECATED(reason) __attribute__ ((deprecated(reason))) const float kEpsilon = 1e-15; -const float kLogitClip = 80; const int kAuto = 0; const size_t kMaxLineLength = 1 << 22; diff --git a/include/util/gpu.cuh b/include/util/gpu.cuh index 1801c92..6ed7862 100644 --- a/include/util/gpu.cuh +++ b/include/util/gpu.cuh @@ -19,14 +19,31 @@ #pragma once namespace graphvite { -namespace gpu{ + +// helper macros for CPU-GPU agnostic code +#if __CUDA_ARCH__ + +#define FOR(i, stop) \ + const int lane_id = threadIdx.x % gpu::kWarpSize; \ + for (int i = lane_id; i < (stop); i += gpu::kWarpSize) +#define SUM(x) gpu::WarpBroadcast(gpu::WarpReduce(x), 0) + +#else + +#define FOR(i, stop) \ + for (int i = 0; i < stop; i++) +#define SUM(x) (x) + +#endif + +namespace gpu { const int kBlockPerGrid = 8192; const int kThreadPerBlock = 512; const int kWarpSize = 32; const unsigned kFullMask = 0xFFFFFFFF; -template +template __device__ T WarpReduce(T value) { #pragma unroll for (int delta = 1; delta < kWarpSize; delta *= 2) @@ -38,7 +55,7 @@ __device__ T WarpReduce(T value) { return value; } -template +template __device__ T WarpBroadcast(T value, int lane_id) { #if __CUDACC_VER_MAJOR__ >= 9 return __shfl_sync(kFullMask, value, lane_id); diff --git a/include/util/io.h b/include/util/io.h index 2d491e1..85cafb0 100644 --- a/include/util/io.h +++ b/include/util/io.h @@ -65,17 +65,17 @@ std::string yes_no(bool x) { return x ? "yes" : "no"; } -std::string size_string(size_t x) { +std::string size_string(size_t size) { std::stringstream ss; ss.precision(3); - if (x >= 1 << 30) - ss << x / float(1 << 30) << " GiB"; - else if (x >= 1 << 20) - ss << x / float(1 << 20) << " MiB"; - else if (x >= 1 << 10) - ss << x / float(1 << 10) << " KiB"; + if (size >= 1 << 30) + ss << size / float(1 << 30) << " GiB"; + else if (size >= 1 << 20) + ss << size / float(1 << 20) << " MiB"; + else if (size >= 1 << 10) + ss << size / float(1 << 10) << " KiB"; else - ss << x << " B"; + ss << size << " B"; return ss.str(); } diff --git a/include/util/math.h b/include/util/math.h new file mode 100644 index 0000000..d59d02c --- /dev/null +++ b/include/util/math.h @@ -0,0 +1,77 @@ +/** + * Copyright 2019 MilaGraph. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @author Zhaocheng Zhu + */ + +#pragma once + +#include +#include + +namespace graphvite { + +#ifndef __CUDA_ARCH__ +using std::abs; // the template version of abs() +#endif + +template +__host__ __device__ Float sigmoid(Float x) { + return x > 0 ? 
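The FOR and SUM macros added to util/gpu.cuh let the same model code compile for host and device: under __CUDA_ARCH__, FOR strides the loop index across the 32 lanes of a warp and SUM performs a warp reduction followed by a broadcast, while on the host they fall back to an ordinary loop and the identity. A CPU-only sketch of how a forward pass written against these macros behaves on the host path (the #ifndef guard just keeps this snippet self-contained):

    // Host-path fallbacks, matching the definitions in the patch.
    #ifndef FOR
    #define FOR(i, stop) for (int i = 0; i < (stop); i++)
    #define SUM(x) (x)
    #endif

    float dot_forward(const float *a, const float *b, int dim) {
        float output = 0;
        FOR(i, dim)                 // plain loop on CPU, warp-strided loop on GPU
            output += a[i] * b[i];
        return SUM(output);         // identity on CPU, warp-wide reduction on GPU
    }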
1 / (1 + exp(-x)) : exp(x) / (exp(x) + 1); +} + +template +__host__ __device__ Float safe_exp(Float x); + +template<> +__host__ __device__ float safe_exp(float x) { + static const float kLogitClip = 80; +#if __CUDA_ARCH__ + return exp(min(max(x, -kLogitClip), kLogitClip)); +#else + return std::exp(std::min(std::max(x, -kLogitClip), kLogitClip)); +#endif +} + +template<> +__host__ __device__ double safe_exp(double x) { + static const double kLogitClip = 700; +#if __CUDA_ARCH__ + return exp(min(max(x, -kLogitClip), kLogitClip)); +#else + return std::exp(std::min(std::max(x, -kLogitClip), kLogitClip)); +#endif +} + +template +__host__ __device__ Integer bit_floor(Integer x) { + static_assert(std::is_integral::value, "bit_floor() can only be invoked with integral types"); +#pragma unroll + for (int i = 1; i < sizeof(Integer) * 8; i *= 2) + x |= x >> i; + return (x + 1) >> 1; +} + +template +__host__ __device__ Integer bit_ceil(Integer x) { + static_assert(std::is_integral::value, "bit_ceil() can only be invoked with integral types"); + x--; +#pragma unroll + for (int i = 1; i < sizeof(Integer) * 8; i *= 2) + x |= x >> i; + return x + 1; +} + +} // namespace graphvie \ No newline at end of file diff --git a/python/graphvite/application/application.py b/python/graphvite/application/application.py index 7770b14..ddc8ca7 100644 --- a/python/graphvite/application/application.py +++ b/python/graphvite/application/application.py @@ -17,18 +17,20 @@ """Implementation of applications""" from __future__ import print_function, absolute_import, unicode_literals, division +import os +import re import pickle import logging import multiprocessing from collections import defaultdict -from future.builtins import str, map +from future.builtins import str, map, range from easydict import EasyDict import numpy as np -from .. import dtype, auto +from .. import lib, cfg, auto from .. import graph, solver -from ..util import monitor +from ..util import assert_in, monitor, SharedNDArray logger = logging.getLogger(__name__) @@ -44,12 +46,13 @@ class ApplicationMixin(object): float_type (dtype, optional): type of parameters index_type (dtype, optional): type of graph indexes """ - def __init__(self, dim, gpus=[], cpu_per_gpu=auto, float_type=dtype.float32, index_type=dtype.uint32): + def __init__(self, dim, gpus=[], cpu_per_gpu=auto, float_type=cfg.float_type, index_type=cfg.index_type): self.dim = dim self.gpus = gpus self.cpu_per_gpu = cpu_per_gpu self.float_type = float_type self.index_type = index_type + self.set_format() def get_graph(self, **kwargs): raise NotImplementedError @@ -57,6 +60,18 @@ def get_graph(self, **kwargs): def get_solver(self, **kwargs): raise NotImplementedError + def set_format(self, delimiters=" \t\r\n", comment="#"): + """ + Set the format for parsing input data. + + Parameters: + delimiters (str, optional): string of delimiter characters + comment (str, optional): prefix of comment strings + """ + self.delimiters = delimiters + self.comment = comment + self.pattern = re.compile("[%s]" % self.delimiters) + @monitor.time def load(self, **kwargs): """load(**kwargs) @@ -64,7 +79,10 @@ def load(self, **kwargs): Arguments depend on the underlying graph type. 
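For reference, the numerics introduced in util/math.h above can be mirrored in a few lines of NumPy. This is only an illustrative sketch (the helper names below are ad hoc, not part of the patch); it shows why the logit is clipped at 80 in single precision and 700 in double precision before exponentiation: exp() overflows float32 near 88 and float64 near 709.

    import numpy as np

    def stable_sigmoid(x, dtype=np.float32):
        # mirror of graphvite::sigmoid: exponentiate only a non-positive
        # argument so exp() can never overflow, then branch on the sign of x
        x = np.asarray(x, dtype=dtype)
        z = np.exp(-np.abs(x))
        return np.where(x > 0, 1 / (1 + z), z / (1 + z))

    def safe_exp(x, dtype=np.float32):
        # mirror of graphvite::safe_exp: clip the logit before exponentiation,
        # following the float / double specializations above
        clip = 80 if dtype == np.float32 else 700
        x = np.asarray(x, dtype=dtype)
        return np.exp(np.clip(x, -clip, clip))

    print(stable_sigmoid(-1000.0), safe_exp(1000.0))  # both finite, no overflow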
""" self.graph = self.get_graph(**kwargs) - self.graph.load(**kwargs) + if "file_name" in kwargs or "vector_file" in "kwargs": + self.graph.load(delimiters=self.delimiters, comment=self.comment, **kwargs) + else: + self.graph.load(**kwargs) @monitor.time def build(self, **kwargs): @@ -91,16 +109,21 @@ def evaluate(self, task, **kwargs): Parameters: task (str): name of task - """ - self.solver.clear() + Returns: + dict: metrics and their values + """ func_name = task.replace(" ", "_") if not hasattr(self, func_name): raise ValueError("Unknown task `%s`" % task) - logger.info("evaluate on %s" % task) + + logger.info(lib.io.header(task)) result = getattr(self, func_name)(**kwargs) - for metric, value in sorted(result.items()): - logger.warning("%s: %g" % (metric, value)) + if isinstance(result, dict): + for metric, value in sorted(result.items()): + logger.warning("%s: %g" % (metric, value)) + + return result @monitor.time def save(self, file_name): @@ -125,6 +148,13 @@ def save(self, file_name): with open(file_name, "wb") as fout: pickle.dump(objects, fout, protocol=pickle.HIGHEST_PROTOCOL) + def tokenize(self, str): + str = str.strip(self.delimiters) + comment_start = str.find(self.comment) + if comment_start != -1: + str = str[:comment_start] + return self.pattern.split(str) + def name_map(self, dicts, names): assert len(dicts) == len(names), "The number of dictionaries and names must be equal" @@ -225,9 +255,9 @@ def node_classification(self, X=None, Y=None, file_name=None, portions=(0.02,), Returns: dict: macro-F1 & micro-F1 averaged over all trials """ - import scipy.sparse as sp - import torch + + self.solver.clear() if file_name: if not (X is None and Y is None): @@ -236,7 +266,10 @@ def node_classification(self, X=None, Y=None, file_name=None, portions=(0.02,), Y = [] with open(file_name, "r") as fin: for line in fin: - x, y = line.split() + tokens = self.tokenize(line) + if len(tokens) == 0: + continue + x, y = tokens X.append(x) Y.append(y) if X is None or Y is None: @@ -249,11 +282,11 @@ def node_classification(self, X=None, Y=None, file_name=None, portions=(0.02,), X = np.asarray(new_X) Y = np.asarray(new_Y) - labels = sp.coo_matrix((np.ones_like(X), (X, Y)), dtype=np.int).todense() + labels = sp.coo_matrix((np.ones_like(X), (X, Y)), dtype=np.int32).todense() indexes, _ = np.where(np.sum(labels, axis=1) > 0) # discard non-labeled nodes labels = labels[indexes] - vertex_embeddings = self.solver.vertex_embeddings[indexes] + vertex_embeddings = SharedNDArray(self.solver.vertex_embeddings[indexes]) settings = [] for portion in portions: @@ -285,6 +318,8 @@ def link_prediction(self, H=None, T=None, Y=None, file_name=None, filter_H=None, from .network import LinkPredictor + self.solver.clear() + if file_name: if not (H is None and T is None and Y is None): raise ValueError("Evaluation data and file should not be provided at the same time") @@ -293,7 +328,10 @@ def link_prediction(self, H=None, T=None, Y=None, file_name=None, filter_H=None, Y = [] with open(file_name, "r") as fin: for line in fin: - h, t, y = line.split() + tokens = self.tokenize(line) + if len(tokens) == 0: + continue + h, t, y = tokens H.append(h) T.append(t) Y.append(y) @@ -307,7 +345,10 @@ def link_prediction(self, H=None, T=None, Y=None, file_name=None, filter_H=None, filter_T = [] with open(filter_file, "r") as fin: for line in fin: - h, t = line.split() + tokens = self.tokenize(line) + if len(tokens) == 0: + continue + h, t = tokens filter_H.append(h) filter_T.append(t) elif filter_H is None: @@ -379,6 +420,7 @@ 
def generate_one_vs_rest(indexes, labels): return torch.as_tensor(new_indexes), torch.as_tensor(new_labels) embeddings, labels, portion, normalization, times, patience, gpu = args + embeddings = np.asarray(embeddings) num_sample, num_class = labels.shape num_train = int(num_sample * portion) @@ -423,7 +465,7 @@ def generate_one_vs_rest(indexes, labels): num_labels = test_labels.sum(dim=1, keepdim=True) sorted, _ = logits.sort(dim=1, descending=True) thresholds = sorted.gather(dim=1, index=num_labels-1) - predictions = (logits >= thresholds).long() + predictions = (logits >= thresholds).int() # compute metric num_TP_per_class = (predictions & test_labels).sum(dim=0).float() num_T_per_class = test_labels.sum(dim=0).float() @@ -480,19 +522,19 @@ class KnowledgeGraphApplication(ApplicationMixin): Given a knowledge graph, it embeds each entity and relation into a continuous vector representation respectively. The learned embeddings can be used for analysis of knowledge graphs. - e.g. **entity clustering**, **link prediction**. - The likelihood of edges can be inferred by the score function over embeddings of triplets. + e.g. **entity prediction**, **link prediction**. + The likelihood of edges can be predicted by computing the score function over embeddings of triplets. Supported Models: - TransE (`Translating Embeddings for Modeling Multi-relational Data`_) - - DistMult (`Embedding Entities and Relations for Learnig and Inference in Knowledge Bases`_) + - DistMult (`Embedding Entities and Relations for Learning and Inference in Knowledge Bases`_) - ComplEx (`Complex Embeddings for Simple Link Prediction`_) - SimplE (`SimplE Embedding for Link Prediction in Knowledge Graphs`_) - RotatE (`RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space`_) .. _Translating Embeddings for Modeling Multi-relational Data: http://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data.pdf - .. _Embedding Entities and Relations for Learnig and Inference in Knowledge Bases: + .. _Embedding Entities and Relations for Learning and Inference in Knowledge Bases: https://arxiv.org/pdf/1412.6575.pdf .. _Complex Embeddings for Simple Link Prediction: http://proceedings.mlr.press/v48/trouillon16.pdf @@ -511,7 +553,7 @@ class KnowledgeGraphApplication(ApplicationMixin): Note: The implementation of TransE, DistMult and ComplEx, SimplE are slightly different from their original papers. The loss function and the regularization term generally follow `this repo`_. - Adversarial negative sampling is also adopted in these models like RotatE. + Self-adversarial negative sampling is also adopted in these models like RotatE. .. _this repo: https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding @@ -520,6 +562,9 @@ class KnowledgeGraphApplication(ApplicationMixin): :class:`KnowledgeGraphSolver ` """ + SAMPLE_PER_DIMENSION = 7 + MEMORY_SCALE_FACTOR = 1.5 + def get_graph(self, **kwargs): return graph.KnowledgeGraph(self.index_type) @@ -530,8 +575,145 @@ def get_solver(self, **kwargs): num_sampler_per_worker = self.cpu_per_gpu - 1 return solver.KnowledgeGraphSolver(self.dim, self.float_type, self.index_type, self.gpus, num_sampler_per_worker) + def entity_prediction(self, H=None, R=None, T=None, file_name=None, save_file=None, target="tail", k=10, + backend=cfg.backend): + """ + Predict the distribution of missing entity or relation for triplets. 
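As a minimal usage sketch (the application object and the file name below are assumptions, not taken from the patch), the new task is reached through the generic evaluate() dispatcher, which maps the task name "entity prediction" to this method and returns the top-k recalls when no save_file is given:

    # `app` is assumed to be an already trained knowledge graph application
    recalls = app.evaluate("entity prediction", file_name="triplets.txt",
                           target="tail", k=10, backend="graphvite")
    for entity, score in recalls[0]:  # top-10 tail candidates of the first triplet
        print("%s: %g" % (entity, score))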
+ + Parameters: + H (list of str, optional): names of head entities + R (list of str, optional): names of relations + T (list of str, optional): names of tail entities + file_name (str, optional): file of triplets (e.g. validation set) + save_file (str, optional): ``txt`` or ``pkl`` file to save predictions + k (int, optional): top-k recalls will be returned + target (str, optional): 'head' or 'tail' + backend (str, optional): 'graphvite' or 'torch' + + Return: + list of list of tuple: top-k recalls for each triplet, if save file is not provided + """ + def torch_predict(): + import torch + + entity_embeddings = SharedNDArray(self.solver.entity_embeddings) + relation_embeddings = SharedNDArray(self.solver.relation_embeddings) + + num_gpu = len(self.gpus) if self.gpus else torch.cuda.device_count() + work_load = (num_sample + num_gpu - 1) // num_gpu + settings = [] + + for i in range(num_gpu): + work_H = H[work_load * i: work_load * (i+1)] + work_R = R[work_load * i: work_load * (i+1)] + work_T = T[work_load * i: work_load * (i+1)] + settings.append((entity_embeddings, relation_embeddings, work_H, work_R, work_T, + None, None, target, k, self.solver.model, self.solver.margin)) + + results = self.gpu_map(triplet_prediction, settings) + return sum(results, []) + + def graphvite_predict(): + num_entity = len(entity2id) + batch_size = self.get_batch_size(num_entity) + recalls = [] + + for i in range(0, num_sample, batch_size): + batch_h = H[i: i + batch_size] + batch_r = R[i: i + batch_size] + batch_t = T[i: i + batch_size] + batch = self.generate_one_vs_rest(batch_h, batch_r, batch_t, num_entity, target) + + scores = self.solver.predict(batch) + scores = scores.reshape(-1, num_entity) + indexes = np.argpartition(scores, num_entity - k, axis=-1) + for index, score in zip(indexes, scores): + index = index[-k:] + score = score[index] + order = np.argsort(score)[::-1] + recall = list(zip(index[order], score[order])) + recalls.append(recall) + + return recalls + + assert_in(["head", "tail"], target=target) + assert_in(["graphvite", "torch"], backend=backend) + + if backend == "torch": + self.solver.clear() + + if file_name: + if not (H is None and R is None and T is None): + raise ValueError("Evaluation data and file should not be provided at the same time") + H = [] + R = [] + T = [] + with open(file_name, "r") as fin: + for line in fin: + tokens = self.tokenize(line) + if len(tokens) == 0: + continue + if len(tokens) == 3: + h, r, t = tokens + elif target == "head": + r, t = tokens + h = None + else: + h, r = tokens + t = None + H.append(h) + R.append(r) + T.append(t) + if (H is None and T is None) or R is None: + raise ValueError("Either evaluation data or file should be provided") + if H is None: + target = "head" + if T is None: + target = "tail" + + entity2id = self.graph.entity2id + relation2id = self.graph.relation2id + num_sample = len(R) + new_H = np.zeros(num_sample, dtype=np.uint32) + new_T = np.zeros(num_sample, dtype=np.uint32) + if target == "head": + new_R, new_T = self.name_map((relation2id, entity2id), (R, T)) + if target == "tail": + new_H, new_R = self.name_map((entity2id, relation2id), (H, R)) + assert len(new_R) == len(R), "Can't recognize some entities or relations" + H = np.asarray(new_H, dtype=np.uint32) + R = np.asarray(new_R, dtype=np.uint32) + T = np.asarray(new_T, dtype=np.uint32) + + if backend == "graphvite": + recalls = graphvite_predict() + else: + recalls = torch_predict() + + id2entity = self.graph.id2entity + new_recalls = [] + for recall in recalls: + new_recall 
= [(id2entity[e], s) for e, s in recall] + new_recalls.append(new_recall) + recalls = new_recalls + + if save_file: + extension = os.path.splitext(save_file)[1] + if extension == ".txt": + with open(save_file, "w") as fout: + for recall in recalls: + tokens = ["%s: %g" % x for x in recall] + fout.write("%s\n" % "\t".join(tokens)) + elif extension == ".pkl": + with open(save_file, "wb") as fout: + pickle.dump(recalls, fout, protocol=pickle.HIGHEST_PROTOCOL) + else: + raise ValueError("Unknown file extension `%s`" % extension) + else: + return recalls + def link_prediction(self, H=None, R=None, T=None, filter_H=None, filter_R=None, filter_T=None, file_name=None, - filter_files=None, fast_mode=None): + filter_files=None, target="both", fast_mode=None, backend=cfg.backend): """ Evaluate knowledge graph embeddings on link prediction task. @@ -544,12 +726,68 @@ def link_prediction(self, H=None, R=None, T=None, filter_H=None, filter_R=None, filter_R (list of str, optional): names of relations to filter out filter_T (list of str, optional): names of tail entities to filter out filter_files (str, optional): files of triplets to filter out (e.g. training / validation / test set) + target (str, optional): 'head', 'tail' or 'both' fast_mode (int, optional): if specified, only that number of samples will be evaluated + backend (str, optional): 'graphvite' or 'torch' Returns: dict: MR, MRR, HITS\@1, HITS\@3 & HITS\@10 of link prediction """ - import torch + def torch_predict(): + import torch + + entity_embeddings = SharedNDArray(self.solver.entity_embeddings) + relation_embeddings = SharedNDArray(self.solver.relation_embeddings) + + num_gpu = len(self.gpus) if self.gpus else torch.cuda.device_count() + work_load = (fast_mode + num_gpu - 1) // num_gpu + settings = [] + + for i in range(num_gpu): + work_H = H[work_load * i: work_load * (i+1)] + work_R = R[work_load * i: work_load * (i+1)] + work_T = T[work_load * i: work_load * (i+1)] + settings.append((entity_embeddings, relation_embeddings, work_H, work_R, work_T, + exclude_H, exclude_T, target, None, self.solver.model, self.solver.margin)) + + results = self.gpu_map(triplet_prediction, settings) + return np.concatenate(results) + + def graphvite_predict(): + num_entity = len(entity2id) + if target == "both": + batch_size = self.get_batch_size(num_entity * 2) + else: + batch_size = self.get_batch_size(num_entity) + rankings = [] + + for i in range(0, fast_mode, batch_size): + batch_h = H[i: i + batch_size] + batch_r = R[i: i + batch_size] + batch_t = T[i: i + batch_size] + batch = self.generate_one_vs_rest(batch_h, batch_r, batch_t, num_entity, target) + masks = self.generate_mask(batch_h, batch_r, batch_t, exclude_H, exclude_T, num_entity, target) + if target == "head": + positives = batch_h + if target == "tail": + positives = batch_t + if target == "both": + positives = np.asarray([batch_h, batch_t]).transpose() + positives = positives.ravel() + + scores = self.solver.predict(batch) + scores = scores.reshape(-1, num_entity) + truths = scores[range(len(positives)), positives] + ranking = np.sum((scores >= truths[:, np.newaxis]) * masks, axis=1) + rankings.append(ranking) + + return np.concatenate(rankings) + + assert_in(["head", "tail", "both"], target=target) + assert_in(["graphvite", "torch"], backend=backend) + + if backend == "torch": + self.solver.clear() if file_name: if not (H is None and R is None and T is None): @@ -559,7 +797,10 @@ def link_prediction(self, H=None, R=None, T=None, filter_H=None, filter_R=None, T = [] with open(file_name, 
"r") as fin: for line in fin: - h, r, t = line.split() + tokens = self.tokenize(line) + if len(tokens) == 0: + continue + h, r, t = tokens H.append(h) R.append(r) T.append(t) @@ -575,7 +816,10 @@ def link_prediction(self, H=None, R=None, T=None, filter_H=None, filter_R=None, for filter_file in filter_files: with open(filter_file, "r") as fin: for line in fin: - h, r, t = line.split() + tokens = self.tokenize(line) + if len(tokens) == 0: + continue + h, r, t = tokens filter_H.append(h) filter_R.append(r) filter_T.append(t) @@ -588,14 +832,14 @@ def link_prediction(self, H=None, R=None, T=None, filter_H=None, filter_R=None, relation2id = self.graph.relation2id new_H, new_R, new_T = self.name_map((entity2id, relation2id, entity2id), (H, R, T)) logger.info("effective triplets: %d / %d" % (len(new_H), len(H))) - H = np.asarray(new_H) - R = np.asarray(new_R) - T = np.asarray(new_T) + H = np.asarray(new_H, dtype=np.uint32) + R = np.asarray(new_R, dtype=np.uint32) + T = np.asarray(new_T, dtype=np.uint32) new_H, new_R, new_T = self.name_map((entity2id, relation2id, entity2id), (filter_H, filter_R, filter_T)) logger.info("effective filter triplets: %d / %d" % (len(new_H), len(filter_H))) - filter_H = np.asarray(new_H) - filter_R = np.asarray(new_R) - filter_T = np.asarray(new_T) + filter_H = np.asarray(new_H, dtype=np.uint32) + filter_R = np.asarray(new_R, dtype=np.uint32) + filter_T = np.asarray(new_T, dtype=np.uint32) exclude_H = defaultdict(set) exclude_T = defaultdict(set) @@ -609,20 +853,11 @@ def link_prediction(self, H=None, R=None, T=None, filter_H=None, filter_R=None, H = H[indexes] R = R[indexes] T = T[indexes] - entity_embeddings = self.solver.entity_embeddings - relation_embeddings = self.solver.relation_embeddings - num_gpu = len(self.gpus) if self.gpus else torch.cuda.device_count() - work_load = (fast_mode + num_gpu - 1) // num_gpu - settings = [] - for i in range(num_gpu): - work_H = H[work_load * i: work_load * (i+1)] - work_R = R[work_load * i: work_load * (i+1)] - work_T = T[work_load * i: work_load * (i+1)] - settings.append((entity_embeddings, relation_embeddings, - work_H, work_R, work_T, exclude_H, exclude_T, self.solver.model)) - results = self.gpu_map(link_prediction, settings) - rankings = np.concatenate(results) + if backend == "graphvite": + rankings = graphvite_predict() + elif backend == "torch": + rankings = torch_predict() return { "MR": np.mean(rankings), @@ -632,46 +867,128 @@ def link_prediction(self, H=None, R=None, T=None, filter_H=None, filter_R=None, "HITS@10": np.mean(rankings <= 10) } - -def link_prediction(args): + def get_batch_size(self, sample_size): + import psutil + memory = psutil.virtual_memory() + + batch_size = int(self.SAMPLE_PER_DIMENSION * self.dim * self.graph.num_vertex + * self.solver.num_partition / self.solver.num_worker / sample_size) + # 2 triplet (Python, C++ sample pool) + 1 sample index + mem_per_sample = sample_size * (2 * 3 * np.uint32().itemsize + 1 * np.uint64().itemsize) + max_batch_size = int(memory.available / mem_per_sample / self.MEMORY_SCALE_FACTOR) + if max_batch_size < batch_size: + logger.info("Memory is not enough for optimal prediction batch size." 
+ "Use the maximal possible size instead.") + batch_size = max_batch_size + return batch_size + + def generate_one_vs_rest(self, H, R, T, num_entity, target="both"): + one = np.ones(num_entity, dtype=np.bool) + all = np.arange(num_entity, dtype=np.uint32) + batches = [] + + for h, r, t in zip(H, R, T): + if target == "head" or target == "both": + batch = np.asarray([all, t * one, r * one]).transpose() + batches.append(batch) + if target == "tail" or target == "both": + batch = np.asarray([h * one, all, r * one]).transpose() + batches.append(batch) + + batches = np.concatenate(batches) + return batches + + def generate_mask(self, H, R, T, exclude_H, exclude_T, num_entity, target="both"): + one = np.ones(num_entity, dtype=np.bool) + masks = [] + + for h, r, t in zip(H, R, T): + if target == "head" or target == "both": + mask = one.copy() + mask[list(exclude_H[(t, r)])] = 0 + mask[h] = 1 + masks.append(mask) + if target == "tail" or target == "both": + mask = one.copy() + mask[list(exclude_T[(h, r)])] = 0 + mask[t] = 1 + masks.append(mask) + + masks = np.asarray(masks) + return masks + + +def triplet_prediction(args): import torch from .network import LinkPredictor + torch.set_grad_enabled(False) - entity_embeddings, relation_embeddings, H, R, T, exclude_H, exclude_T, score_function, gpu = args + entity_embeddings, relation_embeddings, H, R, T, \ + exclude_H, exclude_T, target, k, score_function, margin, device = args + entity_embeddings = np.asarray(entity_embeddings) + relation_embeddings = np.asarray(relation_embeddings) num_entity = len(entity_embeddings) - model = LinkPredictor(score_function, entity_embeddings, relation_embeddings, entity_embeddings) - model = model.cuda(gpu) - - rankings = [] - for h, r, t in zip(H, R, T): - negatives = list(set(range(num_entity)) - exclude_T[(h, r)]) - batch_size = len(negatives) + 1 + score_function = LinkPredictor(score_function, entity_embeddings, relation_embeddings, entity_embeddings, + margin=margin) - batch_h = h * torch.ones(batch_size, dtype=torch.long) - batch_r = r * torch.ones(batch_size, dtype=torch.long) - batch_t = torch.as_tensor([t] + negatives) - batch_h = batch_h.cuda(gpu) - batch_r = batch_r.cuda(gpu) - batch_t = batch_t.cuda(gpu) - - score = model(batch_h, batch_r, batch_t) - rankings.append((score >= score[0]).sum().item()) - - negatives = list(set(range(num_entity)) - exclude_H[(t, r)]) - batch_size = len(negatives) + 1 - - batch_h = torch.as_tensor([h] + negatives) - batch_r = r * torch.ones(batch_size, dtype=torch.long) - batch_t = t * torch.ones(batch_size, dtype=torch.long) - batch_h = batch_h.cuda(gpu) - batch_r = batch_r.cuda(gpu) - batch_t = batch_t.cuda(gpu) - - score = model(batch_h, batch_r, batch_t) - rankings.append((score >= score[0]).sum().item()) + if device != "cpu": + try: + score_function = score_function.to(device) + except RuntimeError: + logger.info("Model is too large for GPU evaluation with PyTorch. 
Switch to CPU evaluation.") + device = "cpu" + if device == "cpu": + del score_function + torch.cuda.empty_cache() + score_function = LinkPredictor(score_function, entity_embeddings, relation_embeddings, entity_embeddings, + margin=margin) + + one = torch.ones(num_entity, dtype=torch.long, device=device) + all = torch.arange(num_entity, dtype=torch.long, device=device) + results = [] # rankings or top-k recalls - rankings = np.asarray(rankings) - return rankings + for h, r, t in zip(H, R, T): + if target == "head" or target == "both": + batch_h = all + batch_r = r * one + batch_t = t * one + score = score_function(batch_h, batch_r, batch_t) + if k: # top-k recalls + score, index = torch.topk(score, k) + score = score.cpu().numpy() + index = index.cpu().numpy() + recall = list(zip(index, score)) + results.append(recall) + else: # ranking + mask = torch.ones(num_entity, dtype=torch.uint8, device=device) + index = torch.tensor(list(exclude_H[(t, r)]), dtype=torch.long, device=device) + mask[index] = 0 + mask[h] = 1 + ranking = torch.sum((score >= score[h]) * mask).item() + results.append(ranking) + + if target == "tail" or target == "both": + batch_h = h * one + batch_r = r * one + batch_t = all + score = score_function(batch_h, batch_r, batch_t) + if k: # top-k recalls + score, index = torch.topk(score, k) + score = score.cpu().numpy() + index = index.cpu().numpy() + recall = list(zip(index, score)) + results.append(recall) + else: # ranking + mask = torch.ones(num_entity, dtype=torch.uint8, device=device) + index = torch.tensor(list(exclude_T[(h, r)]), dtype=torch.long, device=device) + mask[index] = 0 + mask[t] = 1 + ranking = torch.sum((score >= score[t]) * mask).item() + results.append(ranking) + + if not k: # ranking + results = np.asarray(results) + return results class VisualizationApplication(ApplicationMixin): @@ -732,6 +1049,8 @@ def visualization(self, Y=None, file_name=None, save_file=None, figure_size=10, from matplotlib import pyplot as plt plt.switch_backend("agg") # for compatibility + self.solver.clear() + coordinates = self.solver.coordinates dim = coordinates.shape[1] if not (dim == 2 or dim == 3): @@ -740,8 +1059,14 @@ def visualization(self, Y=None, file_name=None, save_file=None, figure_size=10, if file_name: if not (Y is None): raise ValueError("Evaluation data and file should not be provided at the same time") + Y = [] with open(file_name, "r") as fin: - Y = [line.strip() for line in fin] + for line in fin: + tokens = self.tokenize(line) + if len(tokens) == 0: + continue + y, = tokens + Y.append(y) elif Y is None: Y = ["unknown"] * self.graph.num_vertex Y = np.asarray(Y) @@ -795,6 +1120,8 @@ def hierarchy(self, HY=None, file_name=None, target=None, save_file=None, figure from matplotlib import pyplot as plt plt.switch_backend("agg") # for compatibility + self.solver.clear() + coordinates = self.solver.coordinates dim = coordinates.shape[1] if dim != 2: @@ -803,8 +1130,12 @@ def hierarchy(self, HY=None, file_name=None, target=None, save_file=None, figure if file_name: if not (HY is None): raise ValueError("Evaluation data and file should not be provided at the same time") + HY = [] with open(file_name, "r") as fin: - HY = [line.split() for line in fin] + for line in fin: + tokens = self.tokenize(line) + if len(tokens) > 0: + HY.append(tokens) elif HY is None: raise ValueError("No label is provided for hierarchy") HY = np.asarray(HY) @@ -858,6 +1189,8 @@ def animation(self, Y=None, file_name=None, save_file=None, figure_size=5, scale from mpl_toolkits.mplot3d import 
Axes3D plt.switch_backend("agg") # for compatibility + self.solver.clear() + coordinates = self.solver.coordinates dim = coordinates.shape[1] if dim != 3: @@ -866,8 +1199,14 @@ def animation(self, Y=None, file_name=None, save_file=None, figure_size=5, scale if file_name: if not (Y is None): raise ValueError("Evaluation data and file should not be provided at the same time") + Y = [] with open(file_name, "r") as fin: - Y = [line.strip() for line in fin] + for line in fin: + tokens = self.tokenize(line) + if len(tokens) == 0: + continue + y, = tokens + Y.append(y) elif Y is None: Y = ["unknown"] * self.graph.num_vertex Y = np.asarray(Y) @@ -953,13 +1292,13 @@ class Application(object): Parameters: type (str): application type, - can be 'graph', 'word_graph', 'knowledge_graph' or 'visualization' + can be 'graph', 'word graph', 'knowledge graph' or 'visualization' """ application = { "graph": GraphApplication, - "word_graph": WordGraphApplication, - "knowledge_graph": KnowledgeGraphApplication, + "word graph": WordGraphApplication, + "knowledge graph": KnowledgeGraphApplication, "visualization": VisualizationApplication } diff --git a/python/graphvite/application/network.py b/python/graphvite/application/network.py index 983dfb4..fe2dede 100644 --- a/python/graphvite/application/network.py +++ b/python/graphvite/application/network.py @@ -46,12 +46,13 @@ class LinkPredictor(nn.Module): """ Link prediction network for graphs / knowledge graphs """ - def __init__(self, score_function, *embeddings): + def __init__(self, score_function, *embeddings, **kwargs): super(LinkPredictor, self).__init__() if isinstance(score_function, types.FunctionType): self.score_function = score_function else: self.score_function = getattr(LinkPredictor, score_function) + self.kwargs = kwargs self.embeddings = nn.ModuleList() for embedding in embeddings: embedding = torch.as_tensor(embedding) @@ -63,7 +64,7 @@ def forward(self, *indexes): vectors = [] for index, embedding in zip(indexes, self.embeddings): vectors.append(embedding(index)) - return self.score_function(*vectors) + return self.score_function(*vectors, **self.kwargs) @staticmethod def LINE(heads, tails): @@ -74,13 +75,13 @@ def LINE(heads, tails): DeepWalk = LINE @staticmethod - def TransE(heads, relations, tails): + def TransE(heads, relations, tails, margin=12): x = heads + relations - tails - distance = x.norm(p=1, dim=1) - return -distance + score = margin - x.norm(p=1, dim=1) + return score @staticmethod - def RotatE(heads, relations, tails): + def RotatE(heads, relations, tails, margin=12): dim = heads.size(1) // 2 head_re, head_im = heads.view(-1, dim, 2).permute(2, 0, 1) @@ -91,8 +92,8 @@ def RotatE(heads, relations, tails): x_re = head_re * relation_re - head_im * relation_im - tail_re x_im = head_re * relation_im + head_im * relation_re - tail_im x = torch.stack([x_re, x_im], dim=0) - distance = x.norm(p=2, dim=0).sum(dim=1) - return -distance + score = margin - x.norm(p=2, dim=0).sum(dim=1) + return score @staticmethod def DistMult(heads, relations, tails): diff --git a/python/graphvite/base.py b/python/graphvite/base.py index 1ffec39..9ffbcee 100644 --- a/python/graphvite/base.py +++ b/python/graphvite/base.py @@ -23,7 +23,7 @@ from easydict import EasyDict from . 
import lib, dtype -from .util import recursive_default +from .util import recursive_default, assert_in root = os.path.expanduser("~/.graphvite") @@ -32,6 +32,7 @@ # default config default = EasyDict() +default.backend = "graphvite" default.dataset_path = os.path.join(root, "dataset") default.float_type = dtype.float32 default.index_type = dtype.uint32 @@ -45,6 +46,8 @@ def load_global_config(): cfg = recursive_default(cfg, default) else: cfg = default + + assert_in(["graphvite", "torch"], backend=cfg.backend) if not os.path.exists(cfg.dataset_path): os.mkdir(cfg.dataset_path) if isinstance(cfg.float_type, str): diff --git a/python/graphvite/cmd.py b/python/graphvite/cmd.py index 330ad2f..1f21b08 100644 --- a/python/graphvite/cmd.py +++ b/python/graphvite/cmd.py @@ -19,7 +19,9 @@ import os import re +import glob import yaml +import shutil import logging import argparse from easydict import EasyDict @@ -32,13 +34,13 @@ def get_config_path(): candidate_paths = [ - os.path.join(gv.package_path, "config"), - os.path.join(gv.package_path, "../../config") + os.path.realpath(os.path.join(gv.package_path, "config")), + os.path.realpath(os.path.join(gv.package_path, "../../config")) ] for config_path in candidate_paths: if os.path.isdir(config_path): return config_path - raise IOError("Can't find baseline configuration directory. Did you install GraphVite correctly?") + raise IOError("Can't find configuration directory. Did you install GraphVite correctly?") def get_parser(): @@ -46,11 +48,16 @@ def get_parser(): command = parser.add_subparsers(metavar="command", dest="command") command.required = True + new = command.add_parser("new", help="create a new configuration file") + new.add_argument("application", help="name of the application (e.g. graph)", nargs="+") + new.add_argument("--file", help="yaml file to save") + run = command.add_parser("run", help="run from configuration file") run.add_argument("config", help="yaml configuration file") run.add_argument("--no-eval", help="turn off evaluation", dest="eval", action="store_false") run.add_argument("--gpu", help="override the number of GPUs", type=int) run.add_argument("--cpu", help="override the number of CPUs per GPU", type=int) + run.add_argument("--epoch", help="override the number of epochs", type=int) visualize = command.add_parser("visualize", help="visualize high-dimensional vectors") visualize.add_argument("file", help="data file (numpy dump or txt)") @@ -60,10 +67,12 @@ def get_parser(): visualize.add_argument("--3d", help="3d plot", dest="dim", action="store_const", const=3, default=2) baseline = command.add_parser("baseline", help="reproduce baseline benchmarks") - baseline.add_argument("keywords", help="any keyword of the baseline (e.g. model, dataset)", metavar="keyword", nargs="+") + baseline.add_argument("keywords", help="any keyword of the baseline (e.g. 
model, dataset)", metavar="keyword", + nargs="+") baseline.add_argument("--no-eval", help="turn off evaluation", dest="eval", action="store_false") baseline.add_argument("--gpu", help="overwrite the number of GPUs", type=int) baseline.add_argument("--cpu", help="overwrite the number of CPUs per GPU", type=int) + baseline.add_argument("--epoch", help="override the number of epochs", type=int) list = command.add_parser("list", help="list available baselines") @@ -94,19 +103,57 @@ def get_dataset(x): return cfg +def new_main(args): + config_path = get_config_path() + template_path = os.path.join(config_path, "template") + if not os.path.isdir(template_path): + raise IOError("Can't find template configuration directory. Did you install GraphVite correctly?") + + config = "_".join(args.application) + ".yaml" + template = os.path.join(template_path, config) + if args.file: + config = args.file + if os.path.isfile(template): + if os.path.exists(config): + answer = None + while answer not in ["y", "n"]: + answer = input("File `%s` exists. Overwrite? (y/n)" % config) + if answer == "n": + return + shutil.copyfile(template, config) + print("A configuration template has been written into `%s`." % config) + else: + templates = glob.glob(os.path.join(template_path, "*.yaml")) + templates = sorted(templates) + applications = [""] + for template in templates: + application = os.path.splitext(os.path.basename(template))[0] + application = application.replace("_", " ") + applications.append(application) + raise ValueError("Can't find a configuration template for `%s`. Available applications are %s" + % (" ".join(args.application), "\n ".join(applications))) + + def run_main(args): cfg = load_config(args.config) - if args.gpu: + if args.gpu is not None: cfg.resource.gpus = range(args.gpu) - if args.cpu: + if args.cpu is not None: cfg.resource.cpu_per_gpu = args.cpu + if args.epoch is not None: + cfg.train.num_epoch = args.epoch app = gap.Application(cfg.application, **cfg.resource) + if "format" in cfg: + app.set_format(**cfg.format) app.load(**cfg.graph) app.build(**cfg.build) app.train(**cfg.train) if args.eval and "evaluate" in cfg: - app.evaluate(**cfg.evaluate) + if isinstance(cfg.evaluate, dict): + cfg.evaluate = [cfg.evaluate] + for evaluation in cfg.evaluate: + app.evaluate(**evaluation) if "save" in cfg: app.save(**cfg.save) @@ -129,7 +176,8 @@ def load_data(file_name): else: labels = None - gv.init_logging(logging.WARNING) + gv.init_logging(logging.INFO) + # gv.init_logging(logging.WARNING) app = gap.VisualizationApplication(args.dim, [0]) app.load(vectors=vectors, perplexity=args.perplexity) @@ -147,7 +195,7 @@ def baseline_main(args): file = os.path.join(path, file) match = True for keyword in args.keywords: - result = re.search("[/\_.]%s[/\_.]" % keyword, file) + result = re.search(r"[/\\_.]%s[/\\_.]" % keyword, file) if not result: match = False break @@ -163,17 +211,22 @@ def baseline_main(args): config = configs[0] print("running baseline: %s" % os.path.relpath(config, config_path)) cfg = load_config(config) - if args.gpu: + if args.gpu is not None: cfg.resource.gpus = range(args.gpu) - if args.cpu: + if args.cpu is not None: cfg.resource.cpu_per_gpu = args.cpu + if args.epoch is not None: + cfg.train.num_epoch = args.epoch app = gap.Application(cfg.application, **cfg.resource) app.load(**cfg.graph) app.build(**cfg.build) app.train(**cfg.train) if args.eval and "evaluate" in cfg: - app.evaluate(**cfg.evaluate) + if isinstance(cfg.evaluate, dict): + cfg.evaluate = [cfg.evaluate] + for 
evaluation in cfg.evaluate: + app.evaluate(**evaluation) if "save" in cfg: app.save(**cfg.save) @@ -187,6 +240,8 @@ def list_main(args): count = 0 for path, dirs, files in os.walk(config_path): path = os.path.relpath(path, config_path) + if path == "template": + continue depth = path.count("/") if path != ".": depth += 1 @@ -199,6 +254,7 @@ def list_main(args): command = { + "new": new_main, "run": run_main, "visualize": visualize_main, "baseline": baseline_main, diff --git a/python/graphvite/dataset.py b/python/graphvite/dataset.py index 19ee69d..bef3a07 100644 --- a/python/graphvite/dataset.py +++ b/python/graphvite/dataset.py @@ -28,6 +28,7 @@ Knowledge Graph +- :class:`Math` - :class:`FB15k` - :class:`FB15k237` - :class:`WN18` @@ -40,7 +41,7 @@ - :class:`CIFAR10` - :class:`ImageNet` """ -from __future__ import absolute_import +from __future__ import absolute_import, division import os import glob @@ -96,10 +97,10 @@ def train_preprocess(self, input_file, output_file): See also: Pre-defined preprocess functions - :func:`csv2txt` + :func:`csv2txt`, :func:`top_k_label`, - :func:`induced_graph` - :func:`link_prediction_split` + :func:`induced_graph`, + :func:`link_prediction_split`, :func:`image_feature_data` """ def __init__(self, name, urls=None, members=None): @@ -184,7 +185,7 @@ def get_file(self, key): preprocess = getattr(self, preprocess_name, None) if len(urls) > 1 and preprocess is None: raise AttributeError( - "There are non-trivial number of files, but function `%s` is not found" % preprocess_name) + "There are non-trivial number of files,but function `%s` is not found" % preprocess_name) extract_files = [] for url, member in zip(urls, members): @@ -286,52 +287,59 @@ def induced_graph(self, graph_file, label_file, save_file): continue fout.write("%s\t%s\n" % (u, v)) - def link_prediction_split(self, graph_file, train_file, test_file, portion): + def link_prediction_split(self, graph_file, files, portions): """ - Split a graph for link prediction use. The test split will contain half true and half false edges. + Divide a normal graph into a train split and several test splits for link prediction use. + Each test split contains half true and half false edges. 
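A hedged sketch of the intended call pattern (the Dataset instance and file names here are placeholders; the BlogCatalog preprocessors further below issue essentially the same call): one train file of true edges plus validation and test files that also receive an equal number of sampled false edges, with sizes in roughly a 100 : 1 : 1 ratio.

    from graphvite.dataset import Dataset

    dataset = Dataset("example")  # hypothetical dataset directory
    dataset.link_prediction_split("example_graph.txt",
                                  ["train.txt", "valid.txt", "test.txt"],
                                  portions=[100, 1, 1])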
Parameters: graph_file (str): graph file - train_file (str): train file - test_file (str): test file - portion (str): portion of test edges + files (list of str): file names, + the first file is treated as train file + portions (list of float): split portions """ - logger.info("splitting graph %s into %s and %s" % - (self.relpath(graph_file), self.relpath(train_file), self.relpath(test_file))) + assert len(files) == len(portions) + logger.info("splitting graph %s into %s" % + (self.relpath(graph_file), ", ".join([self.relpath(file) for file in files]))) np.random.seed(1024) nodes = set() edges = set() - num_test = 0 - with open(graph_file, "r") as fin, open(train_file, "w") as ftrain, open(test_file, "w") as ftest: + portions = np.cumsum(portions, dtype=np.float32) / np.sum(portions) + files = [open(file, "w") for file in files] + num_edges = [0] * len(files) + with open(graph_file, "r") as fin: for line in fin: u, v = line.split() nodes.update([u, v]) edges.add((u, v)) - if np.random.rand() > portion: - ftrain.write("%s\t%s\n" % (u, v)) + i = np.searchsorted(portions, np.random.rand()) + if i == 0: + files[i].write("%s\t%s\n" % (u, v)) else: - ftest.write("%s\t%s\t1\n" % (u, v)) - num_test += 1 + files[i].write("%s\t%s\t1\n" % (u, v)) + num_edges[i] += 1 nodes = list(nodes) - with open(test_file, "a") as ftest: - for i in range(num_test): + for file, num_edge in zip(files[1:], num_edges[1:]): + for _ in range(num_edge): valid = False while not valid: u = nodes[int(np.random.rand() * len(nodes))] v = nodes[int(np.random.rand() * len(nodes))] valid = u != v and (u, v) not in edges and (v, u) not in edges - ftest.write("%s\t%s\t0\n" % (u, v)) + file.write("%s\t%s\t0\n" % (u, v)) + for file in files: + file.close() def image_feature_data(self, dataset, model="resnet50", batch_size=128): """ - Infer feature vectors on a dataset using a neural network. + Compute feature vectors for an image dataset using a neural network. Parameters: dataset (torch.utils.data.Dataset): dataset model (str or torch.nn.Module, optional): pretrained model. - If it is a str, use the last hidden layer of that model. + If it is a str, use the last hidden model of that model. batch_size (int, optional): batch size """ import torch @@ -366,41 +374,61 @@ class BlogCatalog(Dataset): BlogCatalog social network dataset. Splits: - train, label + graph, label, train, test + + Train and test splits are used for link prediction purpose. 
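A hedged sketch of how such a test split would then be consumed (the trained application object and the file name are assumptions, not part of the patch): each test line carries a node pair plus a 0/1 label, which is exactly what the graph link prediction task tokenizes.

    # `app` is assumed to be a GraphApplication trained on the train split
    result = app.evaluate("link prediction", file_name="test.txt")
    print(result)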
""" def __init__(self): super(BlogCatalog, self).__init__( "blogcatalog", urls={ - "train": "http://socialcomputing.asu.edu/uploads/1283153973/BlogCatalog-dataset.zip", - "label": "http://socialcomputing.asu.edu/uploads/1283153973/BlogCatalog-dataset.zip" + "graph": "http://socialcomputing.asu.edu/uploads/1283153973/BlogCatalog-dataset.zip", + "label": "http://socialcomputing.asu.edu/uploads/1283153973/BlogCatalog-dataset.zip", + "train": [], # depends on `graph` + "valid": [], # depends on `graph` + "test": [] # depends on `graph` }, members={ - "train": "BlogCatalog-dataset/data/edges.csv", + "graph": "BlogCatalog-dataset/data/edges.csv", "label": "BlogCatalog-dataset/data/group-edges.csv" } ) - def train_preprocess(self, raw_file, save_file): + def graph_preprocess(self, raw_file, save_file): self.csv2txt(raw_file, save_file) def label_preprocess(self, raw_file, save_file): self.csv2txt(raw_file, save_file) + def train_preprocess(self, train_file): + valid_file = train_file[:train_file.rfind("train.txt")] + "valid.txt" + test_file = train_file[:train_file.rfind("train.txt")] + "test.txt" + self.link_prediction_split(self.graph, [train_file, valid_file, test_file], portions=[100, 1, 1]) + + def valid_preprocess(self, valid_file): + train_file = valid_file[:valid_file.rfind("valid.txt")] + "train.txt" + test_file = valid_file[:valid_file.rfind("valid.txt")] + "test.txt" + self.link_prediction_split(self.graph, [train_file, valid_file, test_file], portions=[100, 1, 1]) + + def test_preprocess(self, test_file): + train_file = test_file[:test_file.rfind("test.txt")] + "train.txt" + valid_file = test_file[:test_file.rfind("test.txt")] + "valid.txt" + self.link_prediction_split(self.graph, [train_file, valid_file, test_file], portions=[100, 1, 1]) + class Youtube(Dataset): """ Youtube social network dataset. Splits: - train, label + graph, label """ def __init__(self): super(Youtube, self).__init__( "youtube", urls={ - "train": "http://socialnetworks.mpi-sws.mpg.de/data/youtube-links.txt.gz", + "graph": "http://socialnetworks.mpi-sws.mpg.de/data/youtube-links.txt.gz", "label": "http://socialnetworks.mpi-sws.mpg.de/data/youtube-groupmemberships.txt.gz" } ) @@ -414,13 +442,13 @@ class Flickr(Dataset): Flickr social network dataset. 
Splits: - train, label + graph, label """ def __init__(self): super(Flickr, self).__init__( "flickr", urls={ - "train": "http://socialnetworks.mpi-sws.mpg.de/data/flickr-links.txt.gz", + "graph": "http://socialnetworks.mpi-sws.mpg.de/data/flickr-links.txt.gz", "label": "http://socialnetworks.mpi-sws.mpg.de/data/flickr-groupmemberships.txt.gz" } ) @@ -441,17 +469,25 @@ def __init__(self): "hyperlink2012", urls={ "pld_train": "http://data.dws.informatik.uni-mannheim.de/hyperlinkgraph/2012-08/pld-arc.gz", + "pld_valid": "http://data.dws.informatik.uni-mannheim.de/hyperlinkgraph/2012-08/pld-arc.gz", "pld_test": "http://data.dws.informatik.uni-mannheim.de/hyperlinkgraph/2012-08/pld-arc.gz" } ) def pld_train_preprocess(self, graph_file, train_file): + valid_file = train_file[:train_file.rfind("pld_train.txt")] + "pld_valid.txt" test_file = train_file[:train_file.rfind("pld_train.txt")] + "pld_test.txt" - self.link_prediction_split(graph_file, train_file, test_file, portion=1e-4) + self.link_prediction_split(graph_file, [train_file, valid_file, test_file], portions=[10000, 1, 1]) + + def pld_valid_preprocess(self, graph_file, valid_file): + train_file = valid_file[:valid_file.rfind("pld_valid.txt")] + "pld_train.txt" + test_file = valid_file[:valid_file.rfind("pld_valid.txt")] + "pld_test.txt" + self.link_prediction_split(graph_file, [train_file, valid_file, test_file], portions=[10000, 1, 1]) def pld_test_preprocess(self, graph_file, test_file): train_file = test_file[:test_file.rfind("pld_test.txt")] + "pld_train.txt" - self.link_prediction_split(graph_file, train_file, test_file, portion=1e-4) + valid_file = test_file[:test_file.rfind("pld_test.txt")] + "pld_valid.txt" + self.link_prediction_split(graph_file, [train_file, valid_file, test_file], portions=[10000, 1, 1]) class Friendster(Dataset): @@ -459,20 +495,20 @@ class Friendster(Dataset): Friendster social network dataset. Splits: - train, small_train, label + graph, small_graph, label """ def __init__(self): super(Friendster, self).__init__( "friendster", urls={ - "train": "https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz", - "small_train": ["https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz", + "graph": "https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz", + "small_graph": ["https://snap.stanford.edu/data/bigdata/communities/com-friendster.ungraph.txt.gz", "https://snap.stanford.edu/data/bigdata/communities/com-friendster.all.cmty.txt.gz"], "label": "https://snap.stanford.edu/data/bigdata/communities/com-friendster.top5000.cmty.txt.gz" } ) - def small_train_preprocess(self, graph_file, label_file, save_file): + def small_graph_preprocess(self, graph_file, label_file, save_file): self.induced_graph(graph_file, label_file, save_file) def label_preprocess(self, label_file, save_file): @@ -484,17 +520,67 @@ class Wikipedia(Dataset): Wikipedia dump for word embedding. Splits: - train + graph """ def __init__(self): super(Wikipedia, self).__init__( "wikipedia", urls={ - "train": "https://www.dropbox.com/s/mwt4uu1qu9fflfk/enwiki-latest-pages-articles-sentences.txt.gz" + "graph": "https://www.dropbox.com/s/mwt4uu1qu9fflfk/enwiki-latest-pages-articles-sentences.txt.gz" } ) +class Math(Dataset): + """ + Synthetic math knowledge graph dataset. 
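A brief illustration of what the synthetic triplets look like (the attribute access below assumes the usual lazy split resolution of graphvite.dataset, and the sample line is made up): every line is head, relation and tail separated by tabs, and the relation name encodes an operator together with its operand, so a line like "42<TAB>+7<TAB>49" stands for (42 + 7) mod 1000 = 49.

    import graphvite as gv

    with open(gv.dataset.math.train) as fin:  # triggers generation on first use
        print(fin.readline().strip())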
+ + Splits: + train, valid, test + """ + + NUM_ENTITY = 1000 + NUM_RELATION = 30 + OPERATORS = [ + ("+", lambda x, y: (x + y) % Math.NUM_ENTITY), + ("-", lambda x, y: (x - y) % Math.NUM_ENTITY), + ("*", lambda x, y: (x * y) % Math.NUM_ENTITY), + ("/", lambda x, y: x // y), + ("%", lambda x, y: x % y) + ] + + def __init__(self): + super(Math, self).__init__( + "math", + urls={ + "train": [], + "valid": [], + "test": [] + } + ) + + def train_preprocess(self, save_file): + np.random.seed(1023) + self.generate_math(save_file, num_triplet=20000) + + def valid_preprocess(self, save_file): + np.random.seed(1024) + self.generate_math(save_file, num_triplet=1000) + + def test_preprocess(self, save_file): + np.random.seed(1025) + self.generate_math(save_file, num_triplet=1000) + + def generate_math(self, save_file, num_triplet): + with open(save_file, "w") as fout: + for _ in range(num_triplet): + i = int(np.random.rand() * len(self.OPERATORS)) + op, f = self.OPERATORS[i] + x = int(np.random.rand() * self.NUM_ENTITY) + y = int(np.random.rand() * self.NUM_RELATION) + 1 + fout.write("%d\t%s%d\t%d\n" % (x, op, y, f(x, y))) + + class FB15k(Dataset): """ FB15k knowledge graph dataset. @@ -769,11 +855,8 @@ def readable_label(self, labels, save_file, hierarchy=False): for hierarchy in zip(*hierarchies): fout.write("%s\n" % "\t".join(hierarchy)) - def cached_feature_data(self, image_path, save_file): - numpy_file = os.path.splitext(save_file)[0] + ".npy" - if os.path.exists(numpy_file): - return np.load(numpy_file) - + def image_feature_data(self, image_path): + """""" import torchvision from torchvision import transforms @@ -784,8 +867,7 @@ def cached_feature_data(self, image_path, save_file): transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) dataset = torchvision.datasets.ImageFolder(image_path, augmentation) - features = self.image_feature_data(dataset) - np.save(numpy_file, features) + features = super(ImageNet, self).image_feature_data(dataset) return features @@ -801,7 +883,12 @@ def train_image_preprocess(self, image_path, save_file): return image_path def train_feature_data_preprocess(self, save_file): - return self.cached_feature_data(self.train_image, save_file) + numpy_file = os.path.splitext(save_file)[0] + ".npy" + if os.path.exists(numpy_file): + return np.load(numpy_file) + features = self.image_feature_data(self.train_image) + np.save(numpy_file, features) + return features def train_label_preprocess(self, save_file): image_files = glob.glob(os.path.join(self.train_image, "*/*.JPEG")) @@ -842,7 +929,12 @@ def valid_image_preprocess(self, image_path, meta_path, save_file): return image_path def valid_feature_data_preprocess(self, save_file): - return self.cached_feature_data(self.valid_image, save_file) + numpy_file = os.path.splitext(save_file)[0] + ".npy" + if os.path.exists(numpy_file): + return np.load(numpy_file) + features = self.image_feature_data(self.valid_image) + np.save(numpy_file, features) + return features def valid_label_preprocess(self, meta_path, save_file): from scipy.io import loadmat @@ -887,7 +979,6 @@ def hierarchical_label_preprocess(self, save_file): with open(save_file, "w") as fout: with open(self.train_hierarchical_label, "r") as fin: shutil.copyfileobj(fin, fout) - with open(save_file, "a") as fout: with open(self.valid_hierarchical_label, "r") as fin: shutil.copyfileobj(fin, fout) @@ -899,6 +990,7 @@ def hierarchical_label_preprocess(self, save_file): friendster = Friendster() wikipedia = Wikipedia() +math =
Math() fb15k = FB15k() fb15k237 = FB15k237() wn18 = WN18() @@ -912,6 +1004,6 @@ def hierarchical_label_preprocess(self, save_file): __all__ = [ "Dataset", "BlogCatalog", "Youtube", "Flickr", "Hyperlink2012", "Friendster", "Wikipedia", - "FB15k", "FB15k237", "WN18", "WN18RR", "Freebase", + "Math", "FB15k", "FB15k237", "WN18", "WN18RR", "Freebase", "MNIST", "CIFAR10", "ImageNet" ] \ No newline at end of file diff --git a/python/graphvite/util.py b/python/graphvite/util.py index 6f8eca6..5e959cb 100644 --- a/python/graphvite/util.py +++ b/python/graphvite/util.py @@ -19,9 +19,12 @@ import os import logging +import tempfile from time import time from functools import wraps +import numpy as np + logger = logging.getLogger(__name__) @@ -47,6 +50,20 @@ def recursive_map(obj, function): return function(obj) +def assert_in(candidates, **kwargs): + + def readable_list(iterable): + iterable = ["`%s`" % x for x in iterable] + s = ", ".join(iterable[:-1]) + if s: + s += " and " + s += iterable[-1] + return s + + for key, value in kwargs.items(): + assert value in candidates, \ + "Unknown %s `%s`, candidates are %s" % (key, value, readable_list(candidates)) + class chdir(object): """ Context manager for working directory. @@ -65,6 +82,39 @@ def __exit__(self, *args): os.chdir(self.old_dir) +class SharedNDArray(object): + """ + Shared numpy ndarray with serialization interface. + This class can be used as a drop-in replacement for arguments in multiprocessing. + + Parameters: + array (array-like): input data + """ + def __init__(self, array): + array = np.asarray(array) + self.dtype = array.dtype + self.shape = array.shape + self.file = tempfile.NamedTemporaryFile() + self.buffer = np.memmap(self.file, dtype=self.dtype, shape=self.shape) + self.buffer[:] = array + + def __getattr__(self, key): + if key in self.__dict__: + return self.__dict__[key] + else: + return getattr(self.buffer, key) + + def __getstate__(self): + state = self.__dict__.copy() + del state["file"] + state["buffer"] = self.buffer.filename + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.buffer = np.memmap(self.buffer, dtype=self.dtype, shape=self.shape) + + class Monitor(object): """ Function call monitor. @@ -88,6 +138,15 @@ def get_name(self, function, instance): return "%s.%s.%s" % (instance.__module__, instance.__class__.__name__, function.__name__) def time(self, function): + """ + Monitor the run time of a function. + + Parameters: + function (function): function to monitor + + Returns: + function: wrapped function + """ @wraps(function) def wrapper(*args, **kwargs): name = self.get_name(function, args[0]) @@ -100,6 +159,15 @@ def wrapper(*args, **kwargs): return wrapper def call(self, function): + """ + Monitor the arguments of a function. + + Parameters: + function (function): function to monitor + + Returns: + function: wrapped function + """ @wraps(function) def wrapper(*args, **kwargs): name = self.get_name(function, args[0]) @@ -111,6 +179,15 @@ def wrapper(*args, **kwargs): return wrapper def result(self, function): + """ + Monitor the return value of a function. 
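The SharedNDArray defined above is what the evaluation code ships to worker processes instead of raw embedding matrices; a small illustrative sketch (the pool size and array shape are arbitrary), mirroring the np.asarray() re-attachment used by the prediction workers:

    import numpy as np
    from multiprocessing import Pool
    from graphvite.util import SharedNDArray

    def mean_norm(shared):
        # re-attach to the memory-mapped buffer inside the child process
        embeddings = np.asarray(shared)
        return float(np.linalg.norm(embeddings, axis=1).mean())

    shared = SharedNDArray(np.random.rand(10000, 128).astype(np.float32))
    pool = Pool(4)
    # only the temporary file name is pickled, not the array contents
    print(pool.map(mean_norm, [shared] * 4))
    pool.close()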
+ + Parameters: + function (function): function to monitor + + Returns: + function: wrapped function + """ @wraps(function) def wrapper(*args, **kwargs): name = self.get_name(function, args[0]) diff --git a/python/setup.py b/python/setup.py index 616b1cb..4b12f9d 100644 --- a/python/setup.py +++ b/python/setup.py @@ -43,5 +43,5 @@ entry_points={"console_scripts": ["graphvite = graphvite.cmd:main"]}, zip_safe=False, #install_requires=["numpy", "pyyaml", "easydict", "six", "future"], - #extras_requires={"app": ["imageio", "scipy", "matplotlib", "torch", "torchvision", "nltk"]} + #extras_requires={"app": ["imageio", "psutil", "scipy", "matplotlib", "torch", "torchvision", "nltk"]} ) \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 84a6379..3706ec0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,5 @@ if (WIN32) - add_library(graphvite graphvite.cc) + add_library(graphvite graphvite.cu) else () add_library(graphvite SHARED graphvite.cu) set_target_properties(graphvite PROPERTIES diff --git a/src/graphvite.cu b/src/graphvite.cu index 802350f..f89eee3 100644 --- a/src/graphvite.cu +++ b/src/graphvite.cu @@ -19,12 +19,11 @@ #include "util/common.h" //#define USE_TIMER -#define USE_LOSS //#define PINNED_MEMORY #include "bind.h" -const std::string version = "0.1.0"; +const std::string version = "0.2.0"; PYBIND11_MODULE(libgraphvite, module) { py::options options; @@ -91,9 +90,17 @@ PYBIND11_MODULE(libgraphvite, module) { module.attr("ERROR") = google::ERROR; module.attr("FATAL") = google::FATAL; + // io + auto io = module.def_submodule("io"); + io.def("size_string", graphvite::pretty::size_string, py::no_gil(), py::arg("size")); + io.def("yes_no", graphvite::pretty::yes_no, py::no_gil(), py::arg("x")); + io.def("block", graphvite::pretty::block, py::no_gil(), py::arg("content")); + io.def("header", graphvite::pretty::header, py::no_gil(), py::arg("content")); + module.attr("auto") = graphvite::kAuto; module.def("KiB", graphvite::KiB, py::no_gil(), py::arg("size")); module.def("MiB", graphvite::MiB, py::no_gil(), py::arg("size")); module.def("GiB", graphvite::GiB, py::no_gil(), py::arg("size")); + module.attr("__version__") = version; } \ No newline at end of file
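The io submodule bound above is what the Python side now uses for pretty log formatting (for example lib.io.header(task) in ApplicationMixin.evaluate). A small interactive sketch, assuming the compiled extension is importable:

    from graphvite import lib

    print(lib.__version__)              # "0.2.0"
    print(lib.io.size_string(3 << 20))  # "3 MiB"
    print(lib.io.yes_no(True))          # "yes"
    print(lib.io.header("link prediction"))  # decorated header used in the logs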