Skip to content

Commit

Permalink
...
Browse files Browse the repository at this point in the history
  • Loading branch information
bkj committed Jun 10, 2018
1 parent 93df389 commit 0ba10a0
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions examples/nbsgd/prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,25 +62,25 @@ def parse_args():

# Read the train and test splits from the aclImdb folders, passing the
# 'neg' and 'pos' class names to the helper in that order.
# NOTE(review): texts_from_folders is defined outside this view; presumably it
# returns (list of raw review strings, label array) -- confirm against the helper.
text_train, y_train = texts_from_folders('data/aclImdb/train', ['neg', 'pos'])
text_test, y_test = texts_from_folders('data/aclImdb/test', ['neg', 'pos'])

# --
# Preprocess
print("prep.py: preprocessing", file=sys.stderr)

# Matches any single ASCII punctuation character or one of the extra
# typographic symbols listed in the pattern. string.punctuation goes through
# re.escape because it contains the sequence `\]`: inside a character class
# that would be read as an escaped `]`, silently leaving the backslash out of
# the set (and `,-.` would be parsed as a range rather than three literals).
re_tok = re.compile('([%s“”¨«»®´·º½¾¿¡§£₤‘’])' % re.escape(string.punctuation))

def tokenizer(x):
    """Split *x* on whitespace, emitting every punctuation mark as its own token."""
    # Pad each matched symbol with spaces so that str.split() isolates it.
    return re_tok.sub(r' \1 ', x).split()

# Bag-of-ngrams featurizer. The "lo,hi" string from the command line is
# parsed into an integer tuple; tokenization is delegated to the
# punctuation-splitting tokenizer defined above.
vectorizer = CountVectorizer(
    tokenizer=tokenizer,
    max_features=args.max_features,
    ngram_range=tuple(int(bound) for bound in args.ngram_range.split(',')),
)

# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Convert both count matrices with bow2adjlist, passing args.max_words as
# `maxcols`. Only the first element of each returned pair is kept; the second
# is discarded.
# NOTE(review): bow2adjlist is defined outside this view -- presumably it yields
# per-document word-index rows capped/padded to `maxcols` columns; confirm.
X_train_words, _ = bow2adjlist(X_train, maxcols=args.max_words)
X_test_words, _ = bow2adjlist(X_test, maxcols=args.max_words)

# --
# Save
# Progress banner goes to stderr rather than stdout.
print("prep.py: saving", file=sys.stderr)
Expand Down

0 comments on commit 0ba10a0

Please sign in to comment.