Adding Pachyderm Example (squashed) (#522)
* Adding Pachyderm Example (squashed)

* Add Dan Sanche to OWNERS (#520)

Fixed tf_operator import for github_issue_summarization example (#527)

* fixed tf_operator import

* updated tf-operator import path

* small change

* updated PYTHONPATH

* fixed syntax error

* formatting issue

Mnist pipelines (#524)

* added mnist pipelines sample

* fixed lint issues
Nick-Harvey authored and k8s-ci-robot committed Mar 20, 2019
1 parent 895e88b commit 52795bc
Showing 18 changed files with 7,156 additions and 0 deletions.
10 changes: 10 additions & 0 deletions README.md
@@ -24,6 +24,16 @@ This example covers the following concepts:
1. Serving with Seldon Core
1. Flask front-end

### [Pachyderm Example - GitHub issue summarization](./github_issue_summarization/Pachyderm_Example)
Authors: [Nick Harvey](https://github.com/Nick-Harvey) & [Daniel Whitenack](https://github.com/dwhitena)

This example covers the following concepts:
1. A production pipeline for pre-processing, training, and model export
1. CI/CD for model binaries, building and deploying a Docker image for serving in Seldon
1. Full tracking of what data produced which model, and what model is being used for inference
1. Automatic updates of models based on changes to training data or code
1. Training with single-node TensorFlow and distributed TFJobs

### [Pytorch MNIST](./pytorch_mnist)
Author: [David Sabater](https://github.com/dsdinter)

420 changes: 420 additions & 0 deletions github_issue_summarization/Pachyderm_Example/README.md

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions github_issue_summarization/Pachyderm_Example/build.json
@@ -0,0 +1,31 @@
{
  "pipeline": {
    "name": "build"
  },
  "transform": {
    "image": "seldonio/core-python-wrapper:0.7",
    "cmd": [ "/bin/bash" ],
    "stdin": [
      "mkdir /my_model",
      "cp /pfs/pre_process/*.dpkl /my_model",
      "cp /pfs/train/* /my_model",
      "python wrap_model.py /my_model IssueSummarization $PACH_JOB_ID pachyderm --out-folder=/pfs/out --base-image=python:3.6"
    ]
  },
  "input": {
    "cross": [
      {
        "atom": {
          "repo": "train",
          "glob": "/"
        }
      },
      {
        "atom": {
          "repo": "pre_process",
          "glob": "/"
        }
      }
    ]
  }
}
24 changes: 24 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/Dockerfile
@@ -0,0 +1,24 @@
FROM python:3.6
RUN apt-get update && apt-get install -y --no-install-recommends \
    python-pandas \
    && pip3 install -U scikit-learn \
    && pip3 install -U ktext \
    && pip3 install -U IPython \
    && pip3 install -U annoy \
    && pip3 install -U tqdm \
    && pip3 install -U nltk \
    && pip3 install -U matplotlib \
    && pip3 install -U tensorflow \
    && pip3 install -U bernoulli \
    && pip3 install -U h5py \
    && git clone https://github.com/google/seq2seq.git \
    && pip3 install -e ./seq2seq/ \
    && apt-get clean \
    && rm -rf \
    /var/lib/apt/lists/* \
    /tmp/* \
    /var/tmp/* \
    /usr/share/man \
    /usr/share/doc \
    /usr/share/doc-base
COPY . /workspace/src/
@@ -0,0 +1,35 @@
"""Generates predictions using a stored model.
Uses trained model files to generate a prediction.
"""

from __future__ import print_function

import os

import numpy as np
import dill as dpickle
from keras.models import load_model
from seq2seq_utils import Seq2Seq_Inference

class IssueSummarization(object):

  def __init__(self):
    body_pp_file = os.getenv('BODY_PP_FILE', 'body_preprocessor.dpkl')
    print('body_pp file {0}'.format(body_pp_file))
    with open(body_pp_file, 'rb') as body_file:
      body_pp = dpickle.load(body_file)

    title_pp_file = os.getenv('TITLE_PP_FILE', 'title_preprocessor.dpkl')
    print('title_pp file {0}'.format(title_pp_file))
    with open(title_pp_file, 'rb') as title_file:
      title_pp = dpickle.load(title_file)

    model_file = os.getenv('MODEL_FILE', 'output_model.h5')
    print('model file {0}'.format(model_file))
    self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                   decoder_preprocessor=title_pp,
                                   seq2seq_model=load_model(model_file))

  def predict(self, input_text, feature_names): # pylint: disable=unused-argument
    return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in input_text])
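
A minimal local smoke test of this Seldon wrapper class might look like the sketch below. It is a hedged example, not part of the commit: it assumes the class is importable as a module named `IssueSummarization` and that `body_preprocessor.dpkl`, `title_preprocessor.dpkl`, and `output_model.h5` sit in the working directory (or are pointed to via the `BODY_PP_FILE`, `TITLE_PP_FILE`, and `MODEL_FILE` environment variables).

```python
# Hypothetical smoke test for the wrapper above (assumed module name: IssueSummarization).
# Requires body_preprocessor.dpkl, title_preprocessor.dpkl and output_model.h5 locally.
import numpy as np

from IssueSummarization import IssueSummarization

model = IssueSummarization()

# Seldon hands predict() a 2-D array of feature rows; each row here is one issue body.
issue_bodies = np.asarray([['app crashes when clicking the save button']])
print(model.predict(issue_bodies, feature_names=['body']))
```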
31 changes: 31 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/prediction.py
@@ -0,0 +1,31 @@
import argparse
import keras
import pandas as pd
from seq2seq_utils import load_text_processor
from seq2seq_utils import Seq2Seq_Inference

# Parsing flags.
parser = argparse.ArgumentParser()
parser.add_argument("--input_model_h5")
parser.add_argument("--input_body_preprocessor_dpkl")
parser.add_argument("--input_title_preprocessor_dpkl")
parser.add_argument("--input_testdf_csv")
parser.add_argument("--input_prediction_count", type=int, default=50)
args = parser.parse_args()
print(args)

# Read data.
testdf = pd.read_csv(args.input_testdf_csv)

# Load model, preprocessors.
seq2seq_Model = keras.models.load_model(args.input_model_h5)
num_encoder_tokens, body_pp = load_text_processor(args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(args.input_title_preprocessor_dpkl)

# Prepare inference.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp,
                                seq2seq_model=seq2seq_Model)

# Output predictions for n random rows in the test set.
seq2seq_inf.demo_model_predictions(n=args.input_prediction_count, issue_df=testdf)
@@ -0,0 +1,50 @@
import argparse
import dill as dpickle
import numpy as np
from ktext.preprocess import processor
import pandas as pd

# Parsing flags.
parser = argparse.ArgumentParser()
parser.add_argument("--input_traindf_csv")
parser.add_argument("--output_body_preprocessor_dpkl")
parser.add_argument("--output_title_preprocessor_dpkl")
parser.add_argument("--output_train_title_vecs_npy")
parser.add_argument("--output_train_body_vecs_npy")
args = parser.parse_args()
print(args)

# Read data.
traindf = pd.read_csv(args.input_traindf_csv)
train_body_raw = traindf.body.tolist()
train_title_raw = traindf.issue_title.tolist()

# Clean, tokenize, and apply padding/truncation so that each document has
# length 70. Also, retain only the top 8,000 words in the vocabulary and set
# the remaining words to 1, which becomes the common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)

print('Example original body:', train_body_raw[0])
print('Example body after pre-processing:', train_body_vecs[0])

# Instantiate a text processor for the titles, with some different parameters.
title_pp = processor(append_indicators=True, keep_n=4500,
                     padding_maxlen=12, padding='post')

# Process the title data.
train_title_vecs = title_pp.fit_transform(train_title_raw)

print('Example original title:', train_title_raw[0])
print('Example title after pre-processing:', train_title_vecs[0])

# Save the preprocessor.
with open(args.output_body_preprocessor_dpkl, 'wb') as f:
dpickle.dump(body_pp, f, protocol=2)

with open(args.output_title_preprocessor_dpkl, 'wb') as f:
dpickle.dump(title_pp, f, protocol=2)

# Save the processed data.
np.save(args.output_train_title_vecs_npy, train_title_vecs)
np.save(args.output_train_body_vecs_npy, train_body_vecs)
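
As a quick sanity check, the saved artifacts can be reloaded and reused downstream. The sketch below is illustrative only: the file names are assumptions, and the real paths are whatever this script's `--output_*` flags pointed at.

```python
# Hypothetical check that the saved artifacts round-trip (file names are illustrative).
import dill as dpickle
import numpy as np

with open('body_preprocessor.dpkl', 'rb') as f:
  body_pp = dpickle.load(f)

train_body_vecs = np.load('train_body_vecs.npy')
print('Vectorized bodies:', train_body_vecs.shape)  # (num_issues, 70)

# The reloaded processor vectorizes unseen text the same way the training data was.
new_vecs = body_pp.transform_parallel(['app crashes when clicking the save button'])
print(new_vecs.shape)
```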
26 changes: 26 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/process_data.py
@@ -0,0 +1,26 @@
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split

# Parsing flags.
parser = argparse.ArgumentParser()
parser.add_argument("--input_csv")
parser.add_argument("--sample_size", type=int, default=2000000)
parser.add_argument("--output_traindf_csv")
parser.add_argument("--output_testdf_csv")
args = parser.parse_args()
print(args)

pd.set_option('display.max_colwidth', 500)

# Read in the data and sample rows (2M by default, for tutorial speed).
traindf, testdf = train_test_split(pd.read_csv(args.input_csv).sample(n=args.sample_size),
                                   test_size=.10)

# Print stats about the shape of the data.
print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))

# Store output as CSV.
traindf.to_csv(args.output_traindf_csv)
testdf.to_csv(args.output_testdf_csv)
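
A hedged follow-up sketch, assuming illustrative output file names: it rereads the two CSVs written above and confirms the roughly 90/10 split produced by `train_test_split(test_size=.10)`.

```python
# Hypothetical check on the CSVs written above (file names are illustrative;
# the real ones are whatever --output_traindf_csv / --output_testdf_csv pointed at).
import pandas as pd

traindf = pd.read_csv('train_df.csv')
testdf = pd.read_csv('test_df.csv')

# test_size=.10 sends roughly 10% of the sampled rows to the test set.
total = len(traindf) + len(testdf)
print('Train fraction: {:.2f}'.format(len(traindf) / float(total)))
print('Test fraction: {:.2f}'.format(len(testdf) / float(total)))
```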
36 changes: 36 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/recommend.py
@@ -0,0 +1,36 @@
import argparse
import keras
import pandas as pd
from seq2seq_utils import load_text_processor
from seq2seq_utils import Seq2Seq_Inference

# Parsing flags.
parser = argparse.ArgumentParser()
parser.add_argument("--input_csv")
parser.add_argument("--input_model_h5")
parser.add_argument("--input_body_preprocessor_dpkl")
parser.add_argument("--input_title_preprocessor_dpkl")
parser.add_argument("--input_testdf_csv")
parser.add_argument("--input_topic_number", type=int, default=1)
args = parser.parse_args()
print(args)

# Read data.
all_data_df = pd.read_csv(args.input_csv)
testdf = pd.read_csv(args.input_testdf_csv)

# Load model, preprocessors.
num_encoder_tokens, body_pp = load_text_processor(args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(args.input_title_preprocessor_dpkl)
seq2seq_Model = keras.models.load_model(args.input_model_h5)

# Prepare the recommender.
all_data_bodies = all_data_df['body'].tolist()
all_data_vectorized = body_pp.transform_parallel(all_data_bodies)
seq2seq_inf_rec = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                    decoder_preprocessor=title_pp,
                                    seq2seq_model=seq2seq_Model)
recsys_annoyobj = seq2seq_inf_rec.prepare_recommender(all_data_vectorized, all_data_df)

# Output recommendations for n topics.
seq2seq_inf_rec.demo_model_predictions(n=args.input_topic_number, issue_df=testdf, threshold=1)
11 changes: 11 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/requirements.txt
@@ -0,0 +1,11 @@
numpy
keras
dill
matplotlib
tensorflow
annoy
tqdm
nltk
IPython
ktext
h5py
