-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
114 lines (93 loc) · 4.39 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import cross_validation
from sklearn.preprocessing import MinMaxScaler
import preproc
import features
def test_gender_identification():
    """Cross-validate four classifiers on the gender label and print their scores.

    Loads the English PAN15 author-profiling training data through ``preproc``,
    adds feature columns through ``features``, min-max scales five of those
    columns, and runs 10-fold cross-validation for each classifier.  One line
    of the form ``<classifier name> : <mean score>`` is printed per classifier.

    Takes no arguments and returns nothing; all results go to stdout.
    """
    # NOTE: the dataset and resource paths below use Windows-style separators.
    path = 'pan15-author-profiling-training-dataset-2015-03-02\\pan15-author-profiling-training-dataset-english-2015-03-02\\'
    users = preproc.load_users(path)
    users_dict = preproc.load_users_dict(path)
    truth = preproc.get_users_truth(path)

    # add features
    # print is called with a single parenthesised argument so the statement
    # produces the same output under Python 2 and Python 3.
    print('Creating features')
    df = preproc.create_users_dataframe(path)
    # Target: 0 when the first truth field of the user is 'M', 1 otherwise.
    df['label'] = df['user_id'].map(
        lambda user_id: 0 if truth[user_id][0] == 'M' else 1)
    df = features.add_self_references_count(df, users)
    df = features.add_positive_words(
        df, users_dict, preproc.load_words('resources\\positive-words.txt'))
    df = features.add_negative_words(
        df, users_dict, preproc.load_words('resources\\negative-words.txt'))
    df = features.add_articles(df, users_dict)
    df = features.add_url_count(df, users_dict)
    df = features.add_long_words(df, users)
    # NOTE(review): this helper receives the ground truth; if it derives the
    # similarity columns from the labels of all users, the cross-validation
    # scores below would be optimistic -- confirm against features.py.
    df = features.add_word_similarity(df, users, truth)

    # normalize features
    print('Normalizing features')
    scaled_columns = ['pos_words', 'neg_words', 'self_ref_count',
                      'url_count', 'articles']
    # Assign the scaled values back explicitly instead of relying on
    # MinMaxScaler(copy=False) mutating a 1-D Series in place: that side
    # effect is lost whenever the column has to be copied or converted
    # (e.g. an integer column).  Each column is still scaled with its own
    # min/max, exactly as with one scaler call per column.
    df[scaled_columns] = MinMaxScaler().fit_transform(
        df[scaled_columns].astype(float))

    # Presumably these columns are created by features.add_long_words -- verify.
    long_words = ['username', 'people', 'nowplaying', 'really', 'should', 'others', 'thanks', 'twitter',
                  'always', 'google', 'things', 'better', 'tumblr', 'school', 'because', 'someone', 'facebook',
                  'frzhtmoge7', 'please', 'something']
    feature_names = ['self_ref_count', 'articles', 'pos_words', 'neg_words',
                     'url_count', 'male_similarity', 'female_similarity']
    all_features = feature_names + long_words

    # initialize classifiers
    # NOTE(review): the 'linear SVM' label is attached to svm.SVC() built with
    # default arguments; scikit-learn's default SVC kernel is RBF, not linear.
    # The label is left unchanged so the printed output stays the same.
    clfs = {
        'logistic regression': LogReg(),
        'linear SVM': svm.SVC(),
        'GaussianNB': GaussianNB(),
        'random forest': RandomForestClassifier(),
    }
    for name, clf in clfs.items():
        scores = cross_validation.cross_val_score(
            clf, df[all_features], df['label'], cv=10)
        print(name + ' : ' + str(scores.mean()))
def test_age_identification():
    """Cross-validate four classifiers on the age label and print their scores.

    Loads the English PAN15 author-profiling training data through ``preproc``,
    adds feature columns through ``features``, min-max scales five of those
    columns, and runs 10-fold cross-validation for each classifier against the
    integer-encoded age label.  One line of the form
    ``<classifier name> : <mean score>`` is printed per classifier.

    Takes no arguments and returns nothing; all results go to stdout.
    """
    # NOTE: the dataset and resource paths below use Windows-style separators.
    path = 'pan15-author-profiling-training-dataset-2015-03-02\\pan15-author-profiling-training-dataset-english-2015-03-02\\'
    users = preproc.load_users(path)
    users_dict = preproc.load_users_dict(path)
    truth = preproc.get_users_truth(path)

    # add features
    # print is called with a single parenthesised argument so the statement
    # produces the same output under Python 2 and Python 3.
    print('Creating features')
    df = preproc.create_users_dataframe(path)
    # Gender column (0 for 'M', 1 otherwise).  It is neither a feature nor the
    # target in this function; kept in case the feature helpers read it --
    # TODO confirm and drop if unused.
    df['label'] = df['user_id'].map(
        lambda user_id: 0 if truth[user_id][0] == 'M' else 1)
    # Target: the second truth field of each user, encoded as integers.
    df['age_label'] = df['user_id'].map(lambda user_id: truth[user_id][1])
    le = preprocessing.LabelEncoder()
    df['age_label'] = le.fit_transform(df['age_label'])
    df = features.add_self_references_count(df, users)
    df = features.add_positive_words(
        df, users_dict, preproc.load_words('resources\\positive-words.txt'))
    df = features.add_negative_words(
        df, users_dict, preproc.load_words('resources\\negative-words.txt'))
    df = features.add_articles(df, users_dict)
    df = features.add_url_count(df, users_dict)
    df = features.add_long_words(df, users)

    # normalize features
    print('Normalizing features')
    scaled_columns = ['pos_words', 'neg_words', 'self_ref_count',
                      'url_count', 'articles']
    # Assign the scaled values back explicitly instead of relying on
    # MinMaxScaler(copy=False) mutating a 1-D Series in place: that side
    # effect is lost whenever the column has to be copied or converted
    # (e.g. an integer column).  Each column is still scaled with its own
    # min/max, exactly as with one scaler call per column.
    df[scaled_columns] = MinMaxScaler().fit_transform(
        df[scaled_columns].astype(float))

    # Presumably these columns are created by features.add_long_words -- verify.
    long_words = ['username', 'people', 'nowplaying', 'really', 'should', 'others', 'thanks', 'twitter',
                  'always', 'google', 'things', 'better', 'tumblr', 'school', 'because', 'someone', 'facebook',
                  'frzhtmoge7', 'please', 'something']
    feature_names = ['self_ref_count', 'articles', 'pos_words', 'neg_words',
                     'url_count']
    all_features = feature_names + long_words

    # initialize classifiers
    # NOTE(review): the 'linear SVM' label is attached to svm.SVC() built with
    # default arguments; scikit-learn's default SVC kernel is RBF, not linear.
    # The label is left unchanged so the printed output stays the same.
    clfs = {
        'logistic regression': LogReg(),
        'linear SVM': svm.SVC(),
        'GaussianNB': GaussianNB(),
        'random forest': RandomForestClassifier(),
    }
    for name, clf in clfs.items():
        scores = cross_validation.cross_val_score(
            clf, df[all_features], df['age_label'], cv=10)
        print(name + ' : ' + str(scores.mean()))
def main():
    """Entry point: run the gender-identification experiment.

    The age-identification experiment (test_age_identification) is defined in
    this file but is currently disabled here.
    """
    test_gender_identification()
    # test_age_identification()
# Run the experiment only when this file is executed as a script, so that
# importing the module does not load the dataset and train the classifiers.
if __name__ == '__main__':
    main()