Adding Pachyderm Example (squashed) (#522)
* Adding Pachyderm Example (squashed)

* Add Dan Sanche to OWNERS (#520)

Fixed tf_operator import for github_issue_summarization example (#527)

* fixed tf_operator import

* updated tf-operator import path

* small change

* updated PYTHONPATH

* fixed syntax error

* formatting issue

Mnist pipelines (#524)

* added mnist pipelines sample

* fixed lint issues
Nick-Harvey authored and k8s-ci-robot committed Mar 20, 2019
1 parent 895e88b commit 52795bc
Showing 18 changed files with 7,156 additions and 0 deletions.
10 changes: 10 additions & 0 deletions README.md
@@ -24,6 +24,16 @@ This example covers the following concepts:
1. Serving with Seldon Core
1. Flask front-end

### [Pachyderm Example - GitHub issue summarization](./github_issue_summarization/Pachyderm_Example)
Authors: [Nick Harvey](https://github.com/Nick-Harvey) & [Daniel Whitenack](https://github.com/dwhitena)

This example covers the following concepts:
1. A production pipeline for pre-processing, training, and model export
1. CI/CD for model binaries, building and deploying a Docker image for serving in Seldon
1. Full tracking of what data produced which model, and what model is being used for inference
1. Automatic updates of models based on changes to training data or code
1. Training with single-node TensorFlow and distributed TFJobs

### [Pytorch MNIST](./pytorch_mnist)
Author: [David Sabater](https://github.com/dsdinter)

420 changes: 420 additions & 0 deletions github_issue_summarization/Pachyderm_Example/README.md

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions github_issue_summarization/Pachyderm_Example/build.json
@@ -0,0 +1,31 @@
{
  "pipeline": {
    "name": "build"
  },
  "transform": {
    "image": "seldonio/core-python-wrapper:0.7",
    "cmd": [ "/bin/bash" ],
    "stdin": [
      "mkdir /my_model",
      "cp /pfs/pre_process/*.dpkl /my_model",
      "cp /pfs/train/* /my_model",
      "python wrap_model.py /my_model IssueSummarization $PACH_JOB_ID pachyderm --out-folder=/pfs/out --base-image=python:3.6"
    ]
  },
  "input": {
    "cross": [
      {
        "atom": {
          "repo": "train",
          "glob": "/"
        }
      },
      {
        "atom": {
          "repo": "pre_process",
          "glob": "/"
        }
      }
    ]
  }
}
24 changes: 24 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/Dockerfile
@@ -0,0 +1,24 @@
FROM python:3.6
RUN apt-get update && apt-get install -y --no-install-recommends \
    python-pandas \
    && pip3 install -U scikit-learn \
    && pip3 install -U ktext \
    && pip3 install -U IPython \
    && pip3 install -U annoy \
    && pip3 install -U tqdm \
    && pip3 install -U nltk \
    && pip3 install -U matplotlib \
    && pip3 install -U tensorflow \
    && pip3 install -U bernoulli \
    && pip3 install -U h5py \
    && git clone https://github.com/google/seq2seq.git \
    && pip3 install -e ./seq2seq/ \
    && apt-get clean \
    && rm -rf \
    /var/lib/apt/lists/* \
    /tmp/* \
    /var/tmp/* \
    /usr/share/man \
    /usr/share/doc \
    /usr/share/doc-base
COPY . /workspace/src/
@@ -0,0 +1,35 @@
"""Generates predictions using a stored model.
Uses trained model files to generate a prediction.
"""

from __future__ import print_function

import os

import numpy as np
import dill as dpickle
from keras.models import load_model
from seq2seq_utils import Seq2Seq_Inference

class IssueSummarization(object):

  def __init__(self):
    body_pp_file = os.getenv('BODY_PP_FILE', 'body_preprocessor.dpkl')
    print('body_pp file {0}'.format(body_pp_file))
    with open(body_pp_file, 'rb') as body_file:
      body_pp = dpickle.load(body_file)

    title_pp_file = os.getenv('TITLE_PP_FILE', 'title_preprocessor.dpkl')
    print('title_pp file {0}'.format(title_pp_file))
    with open(title_pp_file, 'rb') as title_file:
      title_pp = dpickle.load(title_file)

    model_file = os.getenv('MODEL_FILE', 'output_model.h5')
    print('model file {0}'.format(model_file))
    self.model = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                   decoder_preprocessor=title_pp,
                                   seq2seq_model=load_model(model_file))

  def predict(self, input_text, feature_names): # pylint: disable=unused-argument
    return np.asarray([[self.model.generate_issue_title(body[0])[1]] for body in input_text])
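
A minimal local smoke test of this Seldon wrapper class might look like the sketch below. It is a hedged example, not part of the commit: it assumes the class is importable as a module named `IssueSummarization` and that `body_preprocessor.dpkl`, `title_preprocessor.dpkl`, and `output_model.h5` sit in the working directory (or are pointed to via the `BODY_PP_FILE`, `TITLE_PP_FILE`, and `MODEL_FILE` environment variables).

```python
# Hypothetical smoke test for the wrapper above (assumed module name: IssueSummarization).
# Requires body_preprocessor.dpkl, title_preprocessor.dpkl and output_model.h5 locally.
import numpy as np

from IssueSummarization import IssueSummarization

model = IssueSummarization()

# Seldon hands predict() a 2-D array of feature rows; each row here is one issue body.
issue_bodies = np.asarray([['app crashes when clicking the save button']])
print(model.predict(issue_bodies, feature_names=['body']))
```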
31 changes: 31 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/prediction.py
@@ -0,0 +1,31 @@
import argparse
import keras
import pandas as pd
from seq2seq_utils import load_text_processor
from seq2seq_utils import Seq2Seq_Inference

# Parsing flags.
parser = argparse.ArgumentParser()
parser.add_argument("--input_model_h5")
parser.add_argument("--input_body_preprocessor_dpkl")
parser.add_argument("--input_title_preprocessor_dpkl")
parser.add_argument("--input_testdf_csv")
parser.add_argument("--input_prediction_count", type=int, default=50)
args = parser.parse_args()
print(args)

# Read data.
testdf = pd.read_csv(args.input_testdf_csv)

# Load model, preprocessors.
seq2seq_Model = keras.models.load_model(args.input_model_h5)
num_encoder_tokens, body_pp = load_text_processor(args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(args.input_title_preprocessor_dpkl)

# Prepare inference.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp,
                                seq2seq_model=seq2seq_Model)

# Output predictions for n random rows in the test set.
seq2seq_inf.demo_model_predictions(n=args.input_prediction_count, issue_df=testdf)
@@ -0,0 +1,50 @@
import argparse
import dill as dpickle
import numpy as np
from ktext.preprocess import processor
import pandas as pd

# Parsing flags.
parser = argparse.ArgumentParser()
parser.add_argument("--input_traindf_csv")
parser.add_argument("--output_body_preprocessor_dpkl")
parser.add_argument("--output_title_preprocessor_dpkl")
parser.add_argument("--output_train_title_vecs_npy")
parser.add_argument("--output_train_body_vecs_npy")
args = parser.parse_args()
print(args)

# Read data.
traindf = pd.read_csv(args.input_traindf_csv)
train_body_raw = traindf.body.tolist()
train_title_raw = traindf.issue_title.tolist()

# Clean, tokenize, and apply padding/truncation so that each document has
# length 70. Also, retain only the top 8,000 words in the vocabulary and set
# the remaining words to 1, which becomes the common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)

print('Example original body:', train_body_raw[0])
print('Example body after pre-processing:', train_body_vecs[0])

# Instantiate a text processor for the titles, with some different parameters.
title_pp = processor(append_indicators=True, keep_n=4500,
                     padding_maxlen=12, padding='post')

# Process the title data.
train_title_vecs = title_pp.fit_transform(train_title_raw)

print('Example original title:', train_title_raw[0])
print('Example title after pre-processing:', train_title_vecs[0])

# Save the preprocessor.
with open(args.output_body_preprocessor_dpkl, 'wb') as f:
dpickle.dump(body_pp, f, protocol=2)

with open(args.output_title_preprocessor_dpkl, 'wb') as f:
dpickle.dump(title_pp, f, protocol=2)

# Save the processed data.
np.save(args.output_train_title_vecs_npy, train_title_vecs)
np.save(args.output_train_body_vecs_npy, train_body_vecs)
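
As a quick sanity check, the saved artifacts can be reloaded and reused downstream. The sketch below is illustrative only: the file names are assumptions, and the real paths are whatever this script's `--output_*` flags pointed at.

```python
# Hypothetical check that the saved artifacts round-trip (file names are illustrative).
import dill as dpickle
import numpy as np

with open('body_preprocessor.dpkl', 'rb') as f:
  body_pp = dpickle.load(f)

train_body_vecs = np.load('train_body_vecs.npy')
print('Vectorized bodies:', train_body_vecs.shape)  # (num_issues, 70)

# The reloaded processor vectorizes unseen text the same way the training data was.
new_vecs = body_pp.transform_parallel(['app crashes when clicking the save button'])
print(new_vecs.shape)
```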
26 changes: 26 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/process_data.py
@@ -0,0 +1,26 @@
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split

# Parsing flags.
parser = argparse.ArgumentParser()
parser.add_argument("--input_csv")
parser.add_argument("--sample_size", type=int, default=2000000)
parser.add_argument("--output_traindf_csv")
parser.add_argument("--output_testdf_csv")
args = parser.parse_args()
print(args)

pd.set_option('display.max_colwidth', 500)

# Read in the data and sample rows (2M by default, for tutorial speed).
traindf, testdf = train_test_split(pd.read_csv(args.input_csv).sample(n=args.sample_size),
                                   test_size=.10)

# Print stats about the shape of the data.
print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))

# Store output as CSV.
traindf.to_csv(args.output_traindf_csv)
testdf.to_csv(args.output_testdf_csv)
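
A hedged follow-up sketch, assuming illustrative output file names: it rereads the two CSVs written above and confirms the roughly 90/10 split produced by `train_test_split(test_size=.10)`.

```python
# Hypothetical check on the CSVs written above (file names are illustrative;
# the real ones are whatever --output_traindf_csv / --output_testdf_csv pointed at).
import pandas as pd

traindf = pd.read_csv('train_df.csv')
testdf = pd.read_csv('test_df.csv')

# test_size=.10 sends roughly 10% of the sampled rows to the test set.
total = len(traindf) + len(testdf)
print('Train fraction: {:.2f}'.format(len(traindf) / float(total)))
print('Test fraction: {:.2f}'.format(len(testdf) / float(total)))
```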
36 changes: 36 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/recommend.py
@@ -0,0 +1,36 @@
import argparse
import keras
import pandas as pd
from seq2seq_utils import load_text_processor
from seq2seq_utils import Seq2Seq_Inference

# Parsing flags.
parser = argparse.ArgumentParser()
parser.add_argument("--input_csv")
parser.add_argument("--input_model_h5")
parser.add_argument("--input_body_preprocessor_dpkl")
parser.add_argument("--input_title_preprocessor_dpkl")
parser.add_argument("--input_testdf_csv")
parser.add_argument("--input_topic_number", type=int, default=1)
args = parser.parse_args()
print(args)

# Read data.
all_data_df = pd.read_csv(args.input_csv)
testdf = pd.read_csv(args.input_testdf_csv)

# Load model, preprocessors.
num_encoder_tokens, body_pp = load_text_processor(args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(args.input_title_preprocessor_dpkl)
seq2seq_Model = keras.models.load_model(args.input_model_h5)

# Prepare the recommender.
all_data_bodies = all_data_df['body'].tolist()
all_data_vectorized = body_pp.transform_parallel(all_data_bodies)
seq2seq_inf_rec = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                    decoder_preprocessor=title_pp,
                                    seq2seq_model=seq2seq_Model)
recsys_annoyobj = seq2seq_inf_rec.prepare_recommender(all_data_vectorized, all_data_df)

# Output recommendations for n topics.
seq2seq_inf_rec.demo_model_predictions(n=args.input_topic_number, issue_df=testdf, threshold=1)
11 changes: 11 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/requirements.txt
@@ -0,0 +1,11 @@
numpy
keras
dill
matplotlib
tensorflow
annoy
tqdm
nltk
IPython
ktext
h5py
