Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Pachyderm Example (squashed) #522

Merged
merged 2 commits into from
Mar 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions OWNERS
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# TODO(jlewi): We should probably have OWNERs files in subdirectories that
# list approvers for individual components (e.g. Seldon folks for Seldon component)
approvers:
- dansanche
- gaocegege
- jlewi
- lluunn
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@ This example covers the following concepts:
1. Serving with Seldon Core
1. Flask front-end

### [Pachyderm Example - GitHub issue summarization](./github_issue_summarization/Pachyderm_Example)
Author: [Nick Harvey](https://github.com/Nick-Harvey) & [Daniel Whitenack](https://github.com/dwhitena)

This example covers the following concepts:
1. A production pipeline for pre-processing, training, and model export
1. CI/CD for model binaries: building and deploying a Docker image for serving in Seldon
1. Full tracking of what data produced which model, and what model is being used for inference
1. Automatic updates of models based on changes to training data or code
1. Training with single-node TensorFlow and distributed TFJobs

### [Pytorch MNIST](./pytorch_mnist)
Author: [David Sabater](https://github.com/dsdinter)

Expand Down
420 changes: 420 additions & 0 deletions github_issue_summarization/Pachyderm_Example/README.md

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions github_issue_summarization/Pachyderm_Example/build.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"pipeline": {
"name": "build"
},
"transform": {
"image": "seldonio/core-python-wrapper:0.7",
"cmd": [ "/bin/bash" ],
"stdin": [
"mkdir /my_model",
"cp /pfs/pre_process/*.dpkl /my_model",
"cp /pfs/train/* /my_model",
"python wrap_model.py /my_model IssueSummarization $PACH_JOB_ID pachyderm --out-folder=/pfs/out --base-image=python:3.6"
]
},
"input": {
"cross": [
{
"atom": {
"repo": "train",
"glob": "/"
}
},
{
"atom": {
"repo": "pre_process",
"glob": "/"
}
}
]
}
}
24 changes: 24 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Image used to run the example's Python scripts.
# Built on the official python:3.6 base image.
FROM python:3.6
# A single RUN layer that:
#   1. installs the python-pandas package through apt,
#   2. installs / upgrades the listed Python packages through pip3,
#   3. clones google/seq2seq and installs it in editable mode,
#   4. removes apt lists, temp files, man pages and docs in the same layer.
# NOTE(review): python-pandas comes from apt, not pip3; presumably it targets
# the distribution's own Python rather than the interpreter pip3 installs
# into -- confirm that pandas is importable by the scripts in this image.
RUN apt-get update && apt-get install -y --no-install-recommends \
python-pandas \
&& pip3 install -U scikit-learn \
&& pip3 install -U ktext \
&& pip3 install -U IPython \
&& pip3 install -U annoy \
&& pip3 install -U tqdm \
&& pip3 install -U nltk \
&& pip3 install -U matplotlib \
&& pip3 install -U tensorflow \
&& pip3 install -U bernoulli \
&& pip3 install -U h5py \
&& git clone https://github.com/google/seq2seq.git \
&& pip3 install -e ./seq2seq/ \
&& apt-get clean \
&& rm -rf \
/var/lib/apt/lists/* \
/tmp/* \
/var/tmp/* \
/usr/share/man \
/usr/share/doc \
/usr/share/doc-base
# Copy the whole build context into the image under /workspace/src/.
COPY . /workspace/src/
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Generates predictions using a stored model.

Uses trained model files to generate a prediction.
"""

from __future__ import print_function

import os

import numpy as np
import dill as dpickle
from keras.models import load_model
from seq2seq_utils import Seq2Seq_Inference

class IssueSummarization(object):
    """Model wrapper that generates issue titles from issue bodies.

    The constructor loads two dill-pickled text preprocessors and a Keras
    model from disk and combines them into a ``Seq2Seq_Inference`` object,
    which ``predict`` then queries once per input row.
    """

    def __init__(self):
        """Load the preprocessors and the model.

        File locations come from the environment variables ``BODY_PP_FILE``,
        ``TITLE_PP_FILE`` and ``MODEL_FILE``; each falls back to a file name
        relative to the current working directory when unset.
        """
        # Encoder-side (issue body) preprocessor.
        # NOTE(review): dill.load executes arbitrary code from the file, so
        # these paths must only ever point at trusted artifacts.
        body_pp_path = os.getenv('BODY_PP_FILE', 'body_preprocessor.dpkl')
        print('body_pp file {0}'.format(body_pp_path))
        with open(body_pp_path, 'rb') as handle:
            encoder_pp = dpickle.load(handle)

        # Decoder-side (issue title) preprocessor.
        title_pp_path = os.getenv('TITLE_PP_FILE', 'title_preprocessor.dpkl')
        print('title_pp file {0}'.format(title_pp_path))
        with open(title_pp_path, 'rb') as handle:
            decoder_pp = dpickle.load(handle)

        # Trained seq2seq model.
        model_path = os.getenv('MODEL_FILE', 'output_model.h5')
        print('model file {0}'.format(model_path))
        trained_model = load_model(model_path)

        self.model = Seq2Seq_Inference(encoder_preprocessor=encoder_pp,
                                       decoder_preprocessor=decoder_pp,
                                       seq2seq_model=trained_model)

    def predict(self, input_text, feature_names):  # pylint: disable=unused-argument
        """Generate one output row per input row.

        Args:
            input_text: iterable of rows; only the first element of each row
                is passed to the model.
            feature_names: accepted for interface compatibility, not used.

        Returns:
            A numpy array built from one single-element list per input row,
            holding element ``[1]`` of the value returned by
            ``generate_issue_title`` (presumably the generated title).
        """
        rows = []
        for row in input_text:
            generated = self.model.generate_issue_title(row[0])
            rows.append([generated[1]])
        return np.asarray(rows)
31 changes: 31 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import argparse
import keras
import pandas as pd
from seq2seq_utils import load_text_processor
from seq2seq_utils import Seq2Seq_Inference

# Parsing flags.
# Every path flag below is consumed unconditionally further down, so each is
# marked required: a missing flag now yields an argparse usage error instead
# of a failure inside pandas / keras on a ``None`` path.
parser = argparse.ArgumentParser()
parser.add_argument("--input_model_h5", required=True,
                    help="model file passed to keras.models.load_model")
parser.add_argument("--input_body_preprocessor_dpkl", required=True,
                    help="body preprocessor file passed to load_text_processor")
parser.add_argument("--input_title_preprocessor_dpkl", required=True,
                    help="title preprocessor file passed to load_text_processor")
parser.add_argument("--input_testdf_csv", required=True,
                    help="CSV file holding the test data frame")
parser.add_argument("--input_prediction_count", type=int, default=50,
                    help="number of test rows to show predictions for")
args = parser.parse_args()
print(args)

# Read data.
testdf = pd.read_csv(args.input_testdf_csv)

# Load model, preprocessors.
# load_text_processor returns a pair; only the second element (the
# preprocessor object) is used below.
seq2seq_Model = keras.models.load_model(args.input_model_h5)
num_encoder_tokens, body_pp = load_text_processor(args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(args.input_title_preprocessor_dpkl)

# Prepare inference.
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp,
                                seq2seq_model=seq2seq_Model)

# Output predictions for n random rows in the test set.
seq2seq_inf.demo_model_predictions(n=args.input_prediction_count, issue_df=testdf)
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import argparse
import dill as dpickle
import numpy as np
from ktext.preprocess import processor
import pandas as pd

# Parsing flags.
# Every flag is a path that is read from or written to below, so each is
# marked required: a missing flag now yields an argparse usage error instead
# of a failure on a ``None`` path after the expensive fitting has already run.
parser = argparse.ArgumentParser()
parser.add_argument("--input_traindf_csv", required=True,
                    help="CSV file holding the training data frame")
parser.add_argument("--output_body_preprocessor_dpkl", required=True,
                    help="where to write the fitted body preprocessor")
parser.add_argument("--output_title_preprocessor_dpkl", required=True,
                    help="where to write the fitted title preprocessor")
parser.add_argument("--output_train_title_vecs_npy", required=True,
                    help="where to write the vectorized training titles")
parser.add_argument("--output_train_body_vecs_npy", required=True,
                    help="where to write the vectorized training bodies")
args = parser.parse_args()
print(args)

# Read data. The CSV must provide ``body`` and ``issue_title`` columns.
traindf = pd.read_csv(args.input_traindf_csv)
train_body_raw = traindf.body.tolist()
train_title_raw = traindf.issue_title.tolist()

# Clean, tokenize, and apply padding / truncating such that each document
# length = 70. Also, retain only the top 8,000 words in the vocabulary and set
# the remaining words to 1, which becomes the common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)

print('Example original body:', train_body_raw[0])
print('Example body after pre-processing:', train_body_vecs[0])

# Instantiate a text processor for the titles, with some different parameters:
# a smaller vocabulary (4,500), shorter documents (12) and padding at the end.
title_pp = processor(append_indicators=True, keep_n=4500,
                     padding_maxlen=12, padding='post')

# Process the title data.
train_title_vecs = title_pp.fit_transform(train_title_raw)

print('Example original title:', train_title_raw[0])
print('Example title after pre-processing:', train_title_vecs[0])

# Save the fitted preprocessors (pickle protocol 2).
with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f, protocol=2)

with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f, protocol=2)

# Save the processed data.
np.save(args.output_train_title_vecs_npy, train_title_vecs)
np.save(args.output_train_body_vecs_npy, train_body_vecs)
26 changes: 26 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/process_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split

def sample_rows(frame, sample_size):
    """Return a random sample of at most ``sample_size`` rows of ``frame``.

    ``DataFrame.sample`` raises ``ValueError`` when asked for more rows than
    the frame holds, so the requested size is clamped to the frame length.
    This keeps the default ``--sample_size`` usable on small input files.

    Args:
        frame: the pandas DataFrame to sample from.
        sample_size: the requested number of rows.

    Returns:
        A DataFrame with ``min(sample_size, len(frame))`` randomly chosen rows.
    """
    return frame.sample(n=min(sample_size, len(frame)))


def main(argv=None):
    """Sample the input CSV, split it 90/10 and write train / test CSVs.

    Args:
        argv: optional argument list; ``None`` means ``sys.argv[1:]``.
    """
    # Parsing flags. The three paths are consumed unconditionally below, so
    # they are required; a missing one yields an argparse usage error.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_csv", required=True,
                        help="CSV file to sample and split")
    parser.add_argument("--sample_size", type=int, default=2000000,
                        help="maximum number of rows to sample from the input")
    parser.add_argument("--output_traindf_csv", required=True,
                        help="where to write the training split")
    parser.add_argument("--output_testdf_csv", required=True,
                        help="where to write the test split")
    args = parser.parse_args(argv)
    print(args)

    pd.set_option('display.max_colwidth', 500)

    # Read in a data sample of up to 2M rows (for speed of tutorial) and hold
    # out 10% of it as the test set.
    sampled = sample_rows(pd.read_csv(args.input_csv), args.sample_size)
    traindf, testdf = train_test_split(sampled, test_size=.10)

    # Print stats about the shape of the data.
    print('Train: {:,} rows {:,} columns'.format(traindf.shape[0], traindf.shape[1]))
    print('Test: {:,} rows {:,} columns'.format(testdf.shape[0], testdf.shape[1]))

    # Store output as CSV.
    traindf.to_csv(args.output_traindf_csv)
    testdf.to_csv(args.output_testdf_csv)


if __name__ == "__main__":
    main()
36 changes: 36 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/recommend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import argparse
import keras
import pandas as pd
from seq2seq_utils import load_text_processor
from seq2seq_utils import Seq2Seq_Inference

# Parsing flags.
# Every path flag below is consumed unconditionally further down, so each is
# marked required: a missing flag now yields an argparse usage error instead
# of a failure inside pandas / keras on a ``None`` path.
parser = argparse.ArgumentParser()
parser.add_argument("--input_csv", required=True,
                    help="CSV file with the full data set (needs a 'body' column)")
parser.add_argument("--input_model_h5", required=True,
                    help="model file passed to keras.models.load_model")
parser.add_argument("--input_body_preprocessor_dpkl", required=True,
                    help="body preprocessor file passed to load_text_processor")
parser.add_argument("--input_title_preprocessor_dpkl", required=True,
                    help="title preprocessor file passed to load_text_processor")
parser.add_argument("--input_testdf_csv", required=True,
                    help="CSV file holding the test data frame")
parser.add_argument("--input_topic_number", type=int, default=1,
                    help="number of test rows to show recommendations for")
args = parser.parse_args()
print(args)

# Read data.
all_data_df = pd.read_csv(args.input_csv)
testdf = pd.read_csv(args.input_testdf_csv)

# Load model, preprocessors.
# load_text_processor returns a pair; only the second element (the
# preprocessor object) is used below.
num_encoder_tokens, body_pp = load_text_processor(args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(args.input_title_preprocessor_dpkl)
seq2seq_Model = keras.models.load_model(args.input_model_h5)

# Prepare the recommender: vectorize every issue body with the body
# preprocessor and hand the vectors plus the source frame to the inference
# object.
all_data_bodies = all_data_df['body'].tolist()
all_data_vectorized = body_pp.transform_parallel(all_data_bodies)
seq2seq_inf_rec = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                    decoder_preprocessor=title_pp,
                                    seq2seq_model=seq2seq_Model)
# The return value is not used again in this script.
recsys_annoyobj = seq2seq_inf_rec.prepare_recommender(all_data_vectorized, all_data_df)

# Output recommendations for n topics.
seq2seq_inf_rec.demo_model_predictions(n=args.input_topic_number, issue_df=testdf, threshold=1)
11 changes: 11 additions & 0 deletions github_issue_summarization/Pachyderm_Example/code/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
numpy
keras
dill
matplotlib
tensorflow
annoy
tqdm
nltk
IPython
ktext
h5py
Loading