This repository has been archived by the owner on Jan 17, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
executable file
·87 lines (70 loc) · 2.73 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
# NOTE: interpreter names are case-sensitive on most systems, so the shebang
# has to name "python3" rather than "Python" for this executable file to run.
import os
import re
import string

import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Fetch the corpora used below. These calls sit at module level, so they run
# every time this module is imported.
nltk.download('twitter_samples')
nltk.download('stopwords')

# Shared tokenizer: lower-cases the text (preserve_case=False), strips
# @handles (strip_handles=True) and shortens elongated character runs
# (reduce_len=True).
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# Stop words are messy and not that compelling:
# "very" and "not" are considered stop words, yet they obviously express
# sentiment. I'm not sure we want to get into stop words.
stopwords_english = stopwords.words('english')

# I also have my doubts about stemming...
# The Porter stemmer reduces "was" to "wa". Seriously???
stemmer = PorterStemmer()
def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
            (stemmed tokens with stop words and punctuation removed)
    '''
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # \S* stops at the first whitespace, so words that follow a link on the
    # same line are kept (the previous '.*' pattern discarded them as well)
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize with the shared module-level tokenizer instead of building an
    # identically configured TweetTokenizer on every call
    tweet_tokens = tweet_tokenizer.tokenize(tweet)

    # NOTE: string.punctuation is a str, so the second test is a substring
    # test: it drops every single punctuation character (and any token that
    # happens to be a contiguous slice of string.punctuation) but keeps
    # emoticons such as ':)'.
    tweets_clean = [
        stemmer.stem(word)  # stemming word
        for word in tweet_tokens
        if word not in stopwords_english  # remove stopwords
        and word not in string.punctuation  # remove punctuation
    ]
    return tweets_clean
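# Let's not reuse variable names: the corpora are no longer loaded into
# module-level globals here; call load_tweets() below instead.
#all_positive_tweets = twitter_samples.strings('positive_tweets.json')
#all_negative_tweets = twitter_samples.strings('negative_tweets.json')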
def load_tweets():
    '''
    Read both tweet files of the NLTK twitter_samples corpus.

    Output:
        a tuple (positive, negative) holding what twitter_samples.strings
        returns for 'positive_tweets.json' and 'negative_tweets.json',
        in that order
    '''
    positive, negative = (
        twitter_samples.strings(file_name)
        for file_name in ('positive_tweets.json', 'negative_tweets.json')
    )
    return positive, negative
# Layers have weights and a forward function.
# They create their weights when layer.init is called and then use them.
# TODO: remove this class or make it optional.
class Layer(object):
    """Minimal layer interface: a weights slot plus a forward computation.

    Subclasses are expected to override forward() and, when they need
    weights, init_weights_and_state().
    """

    def __init__(self):
        # Stays None unless init_weights_and_state() is overridden to set it.
        self.weights = None

    def __call__(self, x):
        """Make the layer callable: layer(x) is the same as layer.forward(x)."""
        return self.forward(x)

    def forward(self, x):
        """Compute the layer output for x; the base class has no implementation."""
        raise NotImplementedError()

    def init_weights_and_state(self, input_signature, random_key):
        """Hook for creating weights; the base class does nothing."""
        return None

    def init(self, input_signature, random_key):
        """Run the weight-creation hook, then return self.weights."""
        self.init_weights_and_state(input_signature, random_key)
        return self.weights