fix 16.RecommenderSystems
wnma3mz committed Jun 2, 2018
1 parent 240eb53 commit cf7b3c7
Showing 12 changed files with 152 additions and 120 deletions.
33 changes: 19 additions & 14 deletions src/py3.x/16.RecommenderSystems/RS-itemcf.py
@@ -1,6 +1,5 @@
#!/usr/bin/python
# coding:utf8

'''
Created on 2015-06-22
Update on 2017-05-16
@@ -13,13 +12,14 @@
import math
import random
from operator import itemgetter
print(__doc__)

# Purpose: make the random data predictable (reproducible runs)
random.seed(0)


class ItemBasedCF():
''' TopN recommendation - ItemBasedCF '''

def __init__(self):
self.trainset = {}
self.testset = {}
@@ -90,8 +90,8 @@ def calc_movie_sim(self):

print('counting movies number and popularity...', file=sys.stderr)

# Count, across all users, how many times each movie appears
for user, movies in self.trainset.iteritems():
# Count, across all users, how many times each movie appears; user, movies
for _, movies in self.trainset.items():
for movie in movies:
# count item popularity
if movie not in self.movie_popular:
@@ -107,8 +107,8 @@ def calc_movie_sim(self):
# Count how often each pair of movies is rated by the same user
itemsim_mat = self.movie_sim_mat
print('building co-rated users matrix...', file=sys.stderr)

for user, movies in self.trainset.iteritems():
# user, movies
for _, movies in self.trainset.items():
for m1 in movies:
for m2 in movies:
if m1 == m2:
@@ -122,10 +122,11 @@ def calc_movie_sim(self):
print('calculating movie similarity matrix...', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for m1, related_movies in itemsim_mat.iteritems():
for m1, related_movies in itemsim_mat.items():
for m2, count in related_movies.items():
# cosine similarity
itemsim_mat[m1][m2] = count / math.sqrt(self.movie_popular[m1] * self.movie_popular[m2])
itemsim_mat[m1][m2] = count / math.sqrt(
self.movie_popular[m1] * self.movie_popular[m2])
simfactor_count += 1
# print progress
if simfactor_count % PRINT_STEP == 0:
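As a concrete companion to this hunk, here is a minimal, self-contained sketch of the same co-occurrence cosine similarity on toy data (the user names, movie names, and ratings are illustrative, not from the repo):

import math

# toy train set: user -> {movie: rating}
trainset = {'u1': {'A': 5, 'B': 3}, 'u2': {'A': 4, 'C': 2}, 'u3': {'A': 1, 'B': 4}}

popularity = {}  # |U(m)|: how many users rated movie m
co_count = {}    # |U(m1) & U(m2)|: how many users rated both
for movies in trainset.values():
    for m1 in movies:
        popularity[m1] = popularity.get(m1, 0) + 1
        for m2 in movies:
            if m1 != m2:
                co_count.setdefault(m1, {})
                co_count[m1][m2] = co_count[m1].get(m2, 0) + 1

# sim(m1, m2) = |U(m1) & U(m2)| / sqrt(|U(m1)| * |U(m2)|)
sim = {m1: {m2: c / math.sqrt(popularity[m1] * popularity[m2])
            for m2, c in row.items()}
       for m1, row in co_count.items()}
print(round(sim['A']['B'], 4))  # 2 / sqrt(3 * 2) = 0.8165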
@@ -153,7 +154,10 @@ def recommend(self, user):
# rating = the movie's rating, w = the co-occurrence count of the two movies
# Profiling: 98.2% of the runtime is spent on line 154
for movie, rating in watched_movies.items():
for related_movie, w in sorted(self.movie_sim_mat[movie].items(), key=itemgetter(1), reverse=True)[0:K]:
for related_movie, w in sorted(
self.movie_sim_mat[movie].items(),
key=itemgetter(1),
reverse=True)[0:K]:
if related_movie in watched_movies:
continue
rank.setdefault(related_movie, 0)
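The accumulation line is cut off by the diff view here; presumably it is rank[related_movie] += w * rating, matching the comment above. A self-contained sketch of this top-K weighted scoring, with toy similarity values (all names and numbers illustrative):

from operator import itemgetter

K = 2
movie_sim_mat = {'A': {'B': 0.8, 'C': 0.5, 'D': 0.1}}
watched_movies = {'A': 4}  # movie -> the user's rating

rank = {}
for movie, rating in watched_movies.items():
    # take the K movies most similar to each watched movie
    for related_movie, w in sorted(movie_sim_mat[movie].items(),
                                   key=itemgetter(1), reverse=True)[0:K]:
        if related_movie in watched_movies:
            continue
        rank.setdefault(related_movie, 0)
        rank[related_movie] += w * rating  # similarity-weighted rating

print(sorted(rank.items(), key=itemgetter(1), reverse=True))
# [('B', 3.2), ('C', 2.0)]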
@@ -185,8 +189,8 @@ def evaluate(self):
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)

# Compare the test set against the recommended set
for movie, w in rec_movies:
# Compare the test set against the recommended set; movie, w
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
@@ -200,7 +204,8 @@ def evaluate(self):
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)

print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity)
print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
precision, recall, coverage, popularity), file=sys.stderr)
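For reference, the four metrics printed above are plain ratios; a quick recap with made-up counts (the numbers below are illustrative, not measured results):

# hit        = recommended movies that also appear in the user's test set
# rec_count  = N recommendations per user x number of users
# test_count = total number of movies in the test set
hit, rec_count, test_count = 120, 1000, 2500
all_rec_movies, movie_count = set(range(300)), 1682

precision = hit / (1.0 * rec_count)                   # 0.12
recall = hit / (1.0 * test_count)                     # 0.048
coverage = len(all_rec_movies) / (1.0 * movie_count)  # ~0.1784
# popularity averages log(1 + movie_popular[m]) over everything recommended
print(precision, recall, coverage)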


if __name__ == '__main__':
@@ -217,5 +222,5 @@ def evaluate(self):
# itemcf.evaluate()
# inspect the recommendation results for one user
user = "2"
print "Recommendations:", itemcf.recommend(user)
print "---", itemcf.testset.get(user, {})
print("Recommendations:", itemcf.recommend(user))
print("---", itemcf.testset.get(user, {}))
72 changes: 43 additions & 29 deletions src/py3.x/16.RecommenderSystems/RS-sklearn-rating.py
@@ -21,24 +21,25 @@ def splitData(dataFile, test_size):
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]

print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
print('Number of users = ' + str(n_users) + ' | Number of movies = ' +
str(n_items))
train_data, test_data = cv.train_test_split(df, test_size=test_size)
print "数据量:", len(train_data), len(test_data)
print("数据量:", len(train_data), len(test_data))
return df, n_users, n_items, train_data, test_data


def calc_similarity(n_users, n_items, train_data, test_data):
# Build the user-item matrices: one for the training data and one for the test data:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
train_data_matrix[line[1]-1, line[2]-1] = line[3]
train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
test_data_matrix[line[1]-1, line[2]-1] = line[3]
test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

# Use sklearn's pairwise_distances function to compute cosine similarity.
print "1:", np.shape(train_data_matrix) # rows: users, columns: movies
print "2:", np.shape(train_data_matrix.T) # rows: movies, columns: users
print("1:", np.shape(train_data_matrix))  # rows: users, columns: movies
print("2:", np.shape(train_data_matrix.T))  # rows: movies, columns: users

user_similarity = pairwise_distances(train_data_matrix, metric="cosine")
item_similarity = pairwise_distances(train_data_matrix.T, metric="cosine")
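One caveat worth noting: pairwise_distances with metric="cosine" returns cosine distance, i.e. 1 - cosine similarity, not the similarity itself. A quick check, assuming numpy and scikit-learn are installed:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

X = np.array([[1.0, 0.0], [1.0, 1.0]])
d = pairwise_distances(X, metric="cosine")
print(round(d[0, 1], 4))  # 1 - cos(45 degrees) = 0.2929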
@@ -48,7 +49,7 @@ def calc_similarity(n_users, n_items, train_data, test_data):
# Count, across all users, how many times each movie appears
for i_index in range(n_items):
if np.sum(train_data_matrix[:, i_index]) != 0:
item_popular[i_index] = np.sum(train_data_matrix[:, i_index]!=0)
item_popular[i_index] = np.sum(train_data_matrix[:, i_index] != 0)
# print "pop=", i_index, self.item_popular[i_index]

# save the total number of items
@@ -59,9 +60,9 @@ def calc_similarity(n_users, n_items, train_data, test_data):


def predict(rating, similarity, type='user'):
print type
print "rating=", np.shape(rating)
print "similarity=", np.shape(similarity)
print(type)
print("rating=", np.shape(rating))
print("similarity=", np.shape(similarity))
if type == 'user':
# For each user, derive an aggregate score over all movies (axis=0 operates on columns, axis=1 on rows)
# print "rating=", np.shape(rating)
@@ -75,10 +76,12 @@ def predict(rating, similarity, type='user'):
# print "rating_diff=", rating_diff[:3, :3]

# mean rating + user-user distances (943, 943) x user-movie rating diffs (943, 1682) = aggregate user-movie scores (each user's score for each movie) (943, 1682), then divide by the user's total distance to everyone else = final user-movie score
pred = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
pred = mean_user_rating[:, np.newaxis] + similarity.dot(
rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
elif type == 'item':
# Aggregate score: user-movie ratings (943, 1682) x movie-movie distances (1682, 1682) = aggregate user-movie scores (each movie's contribution to a given movie) (943, 1682), then divide by the movie's total distance to all other movies = final user-movie score
pred = rating.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
pred = rating.dot(similarity) / np.array(
[np.abs(similarity).sum(axis=1)])
return pred
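A worked sketch of the user-based branch on a 2x2 example, to make the broadcasting explicit (all values illustrative):

import numpy as np

rating = np.array([[4.0, 0.0], [3.0, 5.0]])      # users x movies
similarity = np.array([[1.0, 0.5], [0.5, 1.0]])  # users x users

mean_user_rating = rating.mean(axis=1)           # shape (2,)
rating_diff = rating - mean_user_rating[:, np.newaxis]
pred = mean_user_rating[:, np.newaxis] + similarity.dot(
    rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
print(pred)  # each user's mean, shifted by neighbours' weighted deviations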


@@ -96,11 +99,14 @@ def evaluate(prediction, item_popular, name):
all_rec_items = set()
for u_index in range(n_users):
items = np.where(train_data_matrix[u_index, :] == 0)[0]
pre_items = sorted(dict(zip(items, prediction[u_index, items])).items(), key=itemgetter(1), reverse=True)[: 20]
pre_items = sorted(
dict(zip(items, prediction[u_index, items])).items(),
key=itemgetter(1),
reverse=True)[:20]
test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

# Compare the test set against the recommended set
for item, w in pre_items:
# Compare the test set against the recommended set; item, w
for item, _ in pre_items:
if item in test_items:
hit += 1
all_rec_items.add(item)
@@ -116,50 +122,58 @@ def evaluate(prediction, item_popular, name):
recall = hit / (1.0 * test_count)
coverage = len(all_rec_items) / (1.0 * len(item_popular))
popularity = popular_sum / (1.0 * rec_count)
print >> sys.stderr, '%s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (name, precision, recall, coverage, popularity)
print('%s: precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
name, precision, recall, coverage, popularity), file=sys.stderr)


def recommend(u_index, prediction):
items = np.where(train_data_matrix[u_index, :] == 0)[0]
pre_items = sorted(dict(zip(items, prediction[u_index, items])).items(), key=itemgetter(1), reverse=True)[: 10]
pre_items = sorted(
dict(zip(items, prediction[u_index, items])).items(),
key=itemgetter(1),
reverse=True)[:10]
test_items = np.where(test_data_matrix[u_index, :] != 0)[0]

print 'Original results:', test_items
print 'Recommended results:', [key for key, value in pre_items]
print('Original results:', test_items)
print('Recommended results:', [key for key, value in pre_items])


if __name__ == "__main__":

# Memory-based collaborative filtering
# ...
# Split the dataset
# http://files.grouplens.org/datasets/movielens/ml-100k.zip
dataFile = 'input/16.RecommenderSystems/ml-100k/u.data'
df, n_users, n_items, train_data, test_data = splitData(dataFile, test_size=0.25)
df, n_users, n_items, train_data, test_data = splitData(
dataFile, test_size=0.25)

# Compute the similarity matrices
train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(n_users, n_items, train_data, test_data)
train_data_matrix, test_data_matrix, user_similarity, item_similarity, item_popular = calc_similarity(
n_users, n_items, train_data, test_data)

item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

# Evaluation: root-mean-square error (RMSE)
print 'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))
print 'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print(
'Item based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))
print(
'User based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
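rmse() itself is not shown in this diff; a plausible implementation, assuming (as is standard for this MovieLens tutorial) that the error is measured only on entries actually present in the test matrix:

import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(prediction, ground_truth):
    # score only the positions that are rated in the test set
    mask = ground_truth.nonzero()
    return np.sqrt(mean_squared_error(prediction[mask], ground_truth[mask]))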

# Model-based collaborative filtering
# ...
# Compute the sparsity of the MovieLens dataset (n_users and n_items are constants, so the less user-behaviour data there is, the less information the matrix carries; the sparser it is, the more room there is for optimization)
sparsity = round(1.0 - len(df)/float(n_users*n_items), 3)
print 'The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%'
sparsity = round(1.0 - len(df) / float(n_users * n_items), 3)
print('The sparsity level of MovieLen100K is ' + str(sparsity * 100) + '%')
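The arithmetic behind that figure, given ml-100k's 943 users, 1682 items, and 100,000 ratings:

n_users, n_items, n_ratings = 943, 1682, 100000
sparsity = round(1.0 - n_ratings / float(n_users * n_items), 3)
print(sparsity)  # 0.937, reported as 93.7%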

# Compute the k largest singular values/vectors of the sparse matrix
u, s, vt = svds(train_data_matrix, k=15)
s_diag_matrix = np.diag(s)
svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)
print "svd-shape:", np.shape(svd_prediction)
print 'Model based CF RMSE: ' + str(rmse(svd_prediction, test_data_matrix))

print("svd-shape:", np.shape(svd_prediction))
print(
'Model based CF RMSE: ' + str(rmse(svd_prediction, test_data_matrix)))
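svds keeps only the k = 15 largest singular values, so u . diag(s) . vt is the best rank-15 approximation of the training matrix (Eckart-Young). A minimal sketch with random data, shapes illustrative:

import numpy as np
from scipy.sparse.linalg import svds

mat = np.random.rand(20, 30)
u, s, vt = svds(mat, k=15)  # k largest singular values/vectors
approx = np.dot(np.dot(u, np.diag(s)), vt)
print(np.shape(approx))     # (20, 30), but now at most rank 15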
"""
With the same total amount of information, a smaller matrix carries more reliable information per entry.
Hence user-cf performs better than item-cf here; and after the SVD, 15 dimensions already reach over 90% of the effect, so the information is more reliable and the results are better.
35 changes: 19 additions & 16 deletions src/py3.x/16.RecommenderSystems/RS-usercf.py
@@ -1,6 +1,5 @@
#!/usr/bin/python
# coding:utf8

'''
Created on 2015-06-22
Update on 2017-05-16
@@ -20,6 +19,7 @@

class UserBasedCF():
''' TopN recommendation - UserBasedCF '''

def __init__(self):
self.trainset = {}
self.testset = {}
@@ -64,9 +64,9 @@ def generate_dataset(self, filename, pivot=0.7):
testset_len = 0

for line in self.loadfile(filename):
# user ID, movie title, rating, timestamp
# user ID, movie title, rating, timestamp (timestamp is discarded)
# user, movie, rating, timestamp = line.split('::')
user, movie, rating, timestamp = line.split('\t')
user, movie, rating, _ = line.split('\t')
# Compare a random draw against pivot, then initialize the user's entry accordingly
if (random.random() < pivot):

@@ -95,7 +95,7 @@ def calc_user_sim(self):

# For each movie, collect the set of users who rated it
# Count, across all users, how many times each movie appears
for user, movies in self.trainset.iteritems():
for user, movies in self.trainset.items():
for movie in movies:
# inverse table for item-users
if movie not in movie2users:
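A minimal sketch of the movie-to-users inverse table built in this hunk (toy data, illustrative names):

trainset = {'u1': {'A': 5, 'B': 3}, 'u2': {'A': 4}}

movie2users = {}
for user, movies in trainset.items():
    for movie in movies:
        # inverse table for item-users: movie -> set of users who rated it
        movie2users.setdefault(movie, set()).add(user)

print(movie2users)  # {'A': {'u1', 'u2'}, 'B': {'u1'}}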
@@ -116,7 +116,7 @@ def calc_user_sim(self):
# Count how often pairs of users co-occur on the same movie
print('building user co-rated movies matrix...', file=sys.stderr)

for movie, users in movie2users.iteritems():
for movie, users in movie2users.items():
for u in users:
for v in users:
if u == v:
@@ -130,10 +130,11 @@ def calc_user_sim(self):
print('calculating user similarity matrix...', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for u, related_users in usersim_mat.iteritems():
for u, related_users in usersim_mat.items():
for v, count in related_users.items():
# cosine similarity
usersim_mat[u][v] = count / math.sqrt(len(self.trainset[u]) * len(self.trainset[v]))
usersim_mat[u][v] = count / math.sqrt(
len(self.trainset[u]) * len(self.trainset[v]))
simfactor_count += 1
# print progress
if simfactor_count % PRINT_STEP == 0:
@@ -160,15 +161,16 @@ def recommend(self, user):
# Take the top K most similar users
# v = a similar user, wuv = how often the two users co-occur; sort by wuv in descending order and keep K users
# Profiling: 50.4% of the runtime is spent on line 160
for v, wuv in sorted(self.user_sim_mat[user].items(), key=itemgetter(1), reverse=True)[0:K]:
for v, wuv in sorted(
self.user_sim_mat[user].items(), key=itemgetter(1),
reverse=True)[0:K]:
for movie, rating in self.trainset[v].items():
if movie in watched_movies:
continue
# predict the user's "interest" for each movie
rank.setdefault(movie, 0)
rank[movie] += wuv * rating
# return the N best movies

"""
wuv
precision=0.3766 recall=0.0759 coverage=0.3183 popularity=6.9194
@@ -202,8 +204,8 @@ def evaluate(self):
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)

# Compare the test set against the recommended set
for movie, w in rec_movies:
# Compare the test set against the recommended set; movie, w
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
@@ -212,12 +214,13 @@ def evaluate(self):
rec_count += N
test_count += len(test_movies)

precision = hit / (1.0*rec_count)
recall = hit / (1.0*test_count)
coverage = len(all_rec_movies) / (1.0*self.movie_count)
popularity = popular_sum / (1.0*rec_count)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)

print >> sys.stderr, 'precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (precision, recall, coverage, popularity)
print('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
precision, recall, coverage, popularity), file=sys.stderr)


if __name__ == '__main__':
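The main block of RS-usercf.py is truncated by the diff view; assuming it mirrors the RS-itemcf.py driver above, a typical run would look like the sketch below (the dataset path is borrowed from RS-sklearn-rating.py, not confirmed by this diff):

usercf = UserBasedCF()
# ml-100k: http://files.grouplens.org/datasets/movielens/ml-100k.zip
usercf.generate_dataset('input/16.RecommenderSystems/ml-100k/u.data', pivot=0.7)
usercf.calc_user_sim()
user = "2"
print("Recommendations:", usercf.recommend(user))
print("---", usercf.testset.get(user, {}))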