More hybrid mixing with specialized hybrids, Kaggle MAP 0.09674, threshold 2.1, with IALS.
Lodz97 committed Jan 12, 2021
1 parent 8a6f20d commit 15f3f91
Showing 6 changed files with 1,036 additions and 19 deletions.
107 changes: 100 additions & 7 deletions Base/PredefinedListRecommender.py
@@ -8,6 +8,7 @@
import numpy as np
from Base.BaseRecommender import BaseRecommender
from Base.Recommender_utils import check_matrix
+import implicit

import scipy.sparse as sps

@@ -16,21 +17,24 @@ class PredefinedListRecommender(BaseRecommender):

    RECOMMENDER_NAME = "PredefinedListRecommenderRecommender"

-   def __init__(self, URM_recommendations_items):
-       super(PredefinedListRecommender, self).__init__()
+   def __init__(self, URM_train):
+       super(PredefinedListRecommender, self).__init__(URM_train)

-       # convert to csc matrix for faster column-wise sum
-       self.URM_recommendations = check_matrix(URM_recommendations_items, 'csr', dtype=np.int)
-
-       self.URM_train = sps.csr_matrix((self.URM_recommendations.shape))
+       self.URM_train = URM_train

    def fit(self):
-       pass
+       self.model = implicit.als.AlternatingLeastSquares(factors=50)
+
+       # train the model on a sparse matrix of item/user/confidence weights
+       self.model.fit(self.URM_train.T)

-   def recommend(self, user_id, cutoff = None, remove_seen_flag=True, remove_top_pop_flag = False, remove_custom_items_flag = False):
+   '''def recommend(self, user_id, cutoff = None, remove_seen_flag=True, remove_top_pop_flag = False, remove_custom_items_flag = False,
+                return_scores=False):
        if cutoff is None:
            cutoff = self.URM_train.shape[1] - 1

@@ -43,7 +47,96 @@ def recommend(self, user_id, cutoff = None, remove_seen_flag=True, remove_top_po
        if len(recommendation_list[:cutoff]) == 0:
            pass
-       return recommendation_list[:cutoff]
+       return recommendation_list[:cutoff]'''

    def _compute_item_score(self, user_id_array, items_to_compute = None):

        # recommend items for a user
        rec_list = []
        for el in user_id_array:
            recommendations = self.model.recommend(el, self.URM_train, N=10000)
            rec_list.append([x[1] for x in recommendations])
        return np.array(rec_list)

    def recommend(self, user_id_array, cutoff = None, remove_seen_flag=True, items_to_compute = None,
                  remove_top_pop_flag = False, remove_custom_items_flag = False, return_scores = False):

        # If it is a scalar, transform it into a 1-cell array
        if np.isscalar(user_id_array):
            user_id_array = np.atleast_1d(user_id_array)
            single_user = True
        else:
            single_user = False

        if cutoff is None:
            cutoff = self.URM_train.shape[1] - 1

        # Compute the scores using the model-specific function
        # Vectorize over all users in user_id_array
        scores_batch = self._compute_item_score(user_id_array, items_to_compute=items_to_compute)

        for user_index in range(len(user_id_array)):

            user_id = user_id_array[user_index]
            #print(scores_batch[user_index])

            #if remove_seen_flag:
            #    scores_batch[user_index] = self._remove_seen_on_scores(user_id, scores_batch[user_index])

            # Sorting is done in three steps. Faster than plain np.argsort for higher number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index
            # relevant_items_partition = (-scores_user).argpartition(cutoff)[0:cutoff]
            # relevant_items_partition_sorting = np.argsort(-scores_user[relevant_items_partition])
            # ranking = relevant_items_partition[relevant_items_partition_sorting]
            #
            # ranking_list.append(ranking)

        if remove_top_pop_flag:
            scores_batch = self._remove_TopPop_on_scores(scores_batch)

        if remove_custom_items_flag:
            scores_batch = self._remove_custom_items_on_scores(scores_batch)

        # relevant_items_partition is block_size x cutoff
        relevant_items_partition = (-scores_batch).argpartition(cutoff, axis=1)[:, 0:cutoff]

        # Get original value and sort it
        # [:, None] adds 1 dimension to the array, from (block_size,) to (block_size, 1)
        # This is done to correctly get scores_batch value as [row, relevant_items_partition[row, :]]
        relevant_items_partition_original_value = scores_batch[np.arange(scores_batch.shape[0])[:, None], relevant_items_partition]
        relevant_items_partition_sorting = np.argsort(-relevant_items_partition_original_value, axis=1)
        ranking = relevant_items_partition[np.arange(relevant_items_partition.shape[0])[:, None], relevant_items_partition_sorting]

        ranking_list = [None] * ranking.shape[0]

        # Remove from the recommendation list any item that has a -inf score
        # Since -inf is a flag to indicate an item to remove
        for user_index in range(len(user_id_array)):
            user_recommendation_list = ranking[user_index]
            user_item_scores = scores_batch[user_index, user_recommendation_list]

            not_inf_scores_mask = np.logical_not(np.isinf(user_item_scores))

            user_recommendation_list = user_recommendation_list[not_inf_scores_mask]
            ranking_list[user_index] = user_recommendation_list.tolist()

        # Return single list for one user, instead of list of lists
        if single_user:
            ranking_list = ranking_list[0]

        if return_scores:
            return ranking_list, scores_batch
        else:
            return ranking_list
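For reference, a minimal stand-alone sketch of the implicit-library workflow that the new fit() and _compute_item_score() rely on. The API version is an assumption based on the commit date (implicit 0.4.x, where fit() takes an item x user matrix and recommend() returns ranked (item_id, score) pairs, so x[1] in _compute_item_score collects the scores of the ranked items); the toy matrix is illustrative:

import implicit
import numpy as np
import scipy.sparse as sps

# Toy implicit-feedback URM: 3 users x 4 items
URM = sps.csr_matrix(np.array([[1, 0, 1, 0],
                               [0, 1, 0, 0],
                               [1, 1, 0, 1]], dtype=np.float32))

model = implicit.als.AlternatingLeastSquares(factors=8, iterations=15)
model.fit(URM.T)                                 # 0.4.x expects item x user

recommendations = model.recommend(0, URM, N=2)   # [(item_id, score), ...]
item_ids = [item for item, score in recommendations]
scores = [score for item, score in recommendations]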
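The batched top-K ranking added to recommend() uses numpy's partition-then-sort idiom: argpartition isolates an unordered top-cutoff set in linear time, and only those cutoff candidates are fully sorted. A self-contained illustration with toy scores:

import numpy as np

scores = np.array([[0.1, 0.9, 0.3, 0.7, 0.5],
                   [0.8, 0.2, 0.6, 0.4, 0.0]])
cutoff = 3

# Unordered indices of the top-cutoff scores per row, O(n) per row
top_unordered = (-scores).argpartition(cutoff, axis=1)[:, :cutoff]

# Sort only those cutoff candidates by their actual score
row_idx = np.arange(scores.shape[0])[:, None]
order = np.argsort(-scores[row_idx, top_unordered], axis=1)
ranking = top_unordered[row_idx, order]
print(ranking)   # [[1 3 4] [0 2 3]]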
4 changes: 2 additions & 2 deletions MatrixFactorization/IALSRecommender.py
@@ -38,11 +38,11 @@ class IALSRecommender(BaseMatrixFactorizationRecommender, Incremental_Training_E

    def fit(self, epochs = 300,
-           num_factors = 20,
+           num_factors = 50,
            confidence_scaling = "linear",
            alpha = 1.0,
            epsilon = 1.0,
-           reg = 1e-3,
+           reg = 1e-2,
            init_mean = 0.0,
            init_std = 0.1,
            **earlystopping_kwargs):
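For context, the confidence_scaling, alpha and epsilon arguments in this signature match the standard implicit-ALS formulation (Hu, Koren, Volinsky 2008), where raw interaction counts r_ui become confidence weights. A hedged sketch of the two usual transforms; build_confidence is illustrative, not the recommender's actual internals:

import numpy as np
import scipy.sparse as sps

def build_confidence(URM, confidence_scaling="linear", alpha=1.0, epsilon=1.0):
    # linear: C_ui = 1 + alpha * r_ui
    # log:    C_ui = 1 + alpha * log(1 + r_ui / epsilon)
    C = sps.csr_matrix(URM, dtype=np.float64, copy=True)
    if confidence_scaling == "linear":
        C.data = 1.0 + alpha * C.data
    else:
        C.data = 1.0 + alpha * np.log(1.0 + C.data / epsilon)
    return C

# With alpha=50 (as in TryIALS.py below), a single interaction gets
# confidence 51 while unobserved entries keep the implicit weight 1.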
20 changes: 10 additions & 10 deletions ParameterTuning/run_parameter_search_hyb.py
@@ -81,14 +81,14 @@ def runParameterSearch_Hybrid(recommender_class, URM_train, ICM_train, W_sparse_
                              ScoresHybridSpecializedV3Cold, ScoresHybridSpecializedV3Warm]:

        hyperparameters_range_dictionary = {}
-       hyperparameters_range_dictionary["topK_P"] = Integer(5, 1500)
+       hyperparameters_range_dictionary["topK_P"] = Integer(5, 3000)
        hyperparameters_range_dictionary["alpha_P"] = Real(low = 0, high = 2, prior = 'uniform')
        hyperparameters_range_dictionary["normalize_similarity_P"] = Categorical([False])
-       hyperparameters_range_dictionary["topK"] = Integer(5, 1500)
-       hyperparameters_range_dictionary["shrink"] = Integer(0, 1500)
+       hyperparameters_range_dictionary["topK"] = Integer(5, 3000)
+       hyperparameters_range_dictionary["shrink"] = Integer(0, 5000)
        hyperparameters_range_dictionary["similarity"] = Categorical(["tversky", "tanimoto", 'cosine', 'asymmetric'])
        hyperparameters_range_dictionary["normalize"] = Categorical([True, False])
-       hyperparameters_range_dictionary["alpha"] = Real(low = 0, high = 1, prior = 'uniform')
+       hyperparameters_range_dictionary["alpha"] = Real(low = 0, high = 2, prior = 'uniform')
        if recommender_class is ScoresHybridRP3betaKNNCBF:
            hyperparameters_range_dictionary["beta_P"] = Real(low = 0, high = 2, prior = 'uniform')
@@ -272,7 +272,7 @@ def read_data_split_and_search():
    URM_ICM_train = URM_ICM_train.tocsr()

-   output_folder_path = "ParamResultsExperiments/SKOPT_ScoresHybridP3alphaKNNCBF_warm_12_"
+   output_folder_path = "ParamResultsExperiments/SKOPT_ScoresHybridP3alphaKNNCBF_specialized_extend_param"
    output_folder_path += datetime.now().strftime('%b%d_%H-%M-%S/')

@@ -283,7 +283,7 @@

    hybrid_algorithm_list = [
        #ScoresHybridP3alphaKNNCBF,
-       #ScoresHybridRP3betaKNNCBF,
+       ScoresHybridRP3betaKNNCBF,
        #ScoresHybridP3alphaPureSVD,
        #ScoresHybridSpecialized,
        #ScoresHybridSpecializedCold,
@@ -293,7 +293,7 @@
        #ScoresHybridSpecializedV2Warm,
        #ScoresHybridSpecializedV3Warm,
        #ScoresHybridSpecializedV2Mid12,
-       ScoresHybridSpecializedV2Warm12,
+       #ScoresHybridSpecializedV2Warm12,
        #ScoresHybridSpecializedAdaptive,
        #ScoresHybridKNNCFKNNCBF,
        #ScoresHybridUserKNNCFKNNCBF,
@@ -302,7 +302,7 @@
    ]

    from Base.Evaluation.Evaluator import EvaluatorHoldout

-   evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[15])
+   evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
    evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5, 10])

    #cf = ItemKNNCFRecommender(URM_ICM_train)
@@ -315,7 +315,7 @@
                                                   #W_sparse_CF = W_sparse_CF,
                                                   metric_to_optimize = "MAP",
                                                   n_cases = 100,
-                                                  n_random_starts=30,
+                                                  n_random_starts=20,
                                                   evaluator_validation_earlystopping = evaluator_validation,
                                                   evaluator_validation = evaluator_validation,
                                                   evaluator_test = evaluator_test,
@@ -327,7 +327,7 @@
    from Utils.PoolWithSubprocess import PoolWithSubprocess

-   pool = PoolWithSubprocess(processes=int(multiprocessing.cpu_count()), maxtasksperchild=1)
+   pool = PoolWithSubprocess(processes=int(multiprocessing.cpu_count()-1), maxtasksperchild=1)
    resultList = pool.map_async(runParameterSearch_Hybrid_partial, hybrid_algorithm_list)
    pool.close()
    pool.join()
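These Integer/Real/Categorical ranges feed the framework's Bayesian hyperparameter search. A hedged, self-contained sketch of the underlying scikit-optimize pattern; evaluate_map and validation_map are illustrative stand-ins, not the project's actual functions:

from skopt import gp_minimize
from skopt.space import Integer, Real, Categorical
from skopt.utils import use_named_args

search_space = [Integer(5, 3000, name="topK"),
                Integer(0, 5000, name="shrink"),
                Real(0, 2, prior="uniform", name="alpha"),
                Categorical(["tversky", "tanimoto", "cosine", "asymmetric"], name="similarity")]

def validation_map(params):
    # Hypothetical helper: would fit the hybrid with these params
    # and return its MAP on the validation split.
    return 0.0

@use_named_args(search_space)
def evaluate_map(**params):
    # gp_minimize minimizes its objective, so negate MAP
    return -validation_map(params)

result = gp_minimize(evaluate_map, search_space,
                     n_calls=100,          # mirrors n_cases above
                     n_random_starts=20)   # mirrors n_random_starts above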
136 changes: 136 additions & 0 deletions TryIALS.py
@@ -0,0 +1,136 @@
from Data_manager.RecSys2020 import RecSys2020Reader
from Notebooks_utils.data_splitter import train_test_holdout
import matplotlib.pyplot as pyplot
import numpy as np
from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython
from GraphBased import P3alphaRecommender, RP3betaRecommender
from SLIM_ElasticNet import SLIMElasticNetRecommender
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Base import PredefinedListRecommender
from MatrixFactorization.Cython import MatrixFactorization_Cython
from MatrixFactorization.PyTorch import MF_MSE_PyTorch
from MatrixFactorization import IALSRecommender, NMFRecommender, PureSVDRecommender
from KNN import ItemKNNCBFRecommender, ItemKNNCFRecommender, ItemKNNCustomSimilarityRecommender,\
ItemKNNSimilarityHybridRecommender, UserKNNCFRecommender
from EASE_R import EASE_R_Recommender
from FeatureWeighting import CFW_D_Similarity_Linalg
import ItemKNNScoresHybridRecommender
import ScoresHybridP3alphaKNNCBF, ScoresHybridSpecializedV2Mid
import CreateCSV
from scipy import sparse as sps
import implicit

# https://github.com/MaurizioFD/RecSys_Course_AT_PoliMi/blob/master/Practice%2009%20-%20SLIM%20BPR.ipynb
# https://github.com/nicolo-felicioni/recsys-polimi-2019/tree/master/Hybrid


if __name__ == '__main__':
    URM_all, user_id_unique, item_id_unique = RecSys2020Reader.load_urm()
    ICM_all = RecSys2020Reader.load_icm_asset()
    target_ids = RecSys2020Reader.load_target()

    URM_ICM_all = RecSys2020Reader.load_urm_icm()

    '''item_popularity = np.ediff1d(URM_all.tocsc().indptr)
    print(item_popularity)
    item_popularity = np.sort(item_popularity)
    pyplot.plot(item_popularity, 'ro')
    pyplot.ylabel('Num Interactions ')
    pyplot.xlabel('Sorted Item')
    pyplot.show()

    user_activity = np.ediff1d(URM_all.indptr)
    user_activity = np.sort(user_activity)
    pyplot.plot(user_activity, 'ro')
    pyplot.ylabel('Num Interactions ')
    pyplot.xlabel('Sorted User')
    pyplot.show()'''

    #np.random.seed(1234)
    URM_train, URM_test = train_test_holdout(URM_all, train_perc=0.90)
    ICM_train, ICM_test = train_test_holdout(ICM_all, train_perc=0.9)
    evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[10], exclude_seen=True)

    URM_ICM_train = sps.vstack([URM_train, ICM_all.T])
    URM_ICM_train = URM_ICM_train.tocsr()
    URM_ICM_train2 = sps.hstack([ICM_all, URM_train.T])
    URM_ICM_train2 = URM_ICM_train2.tocsr()

    earlystopping_keywargs = {"validation_every_n": 1,
                              "stop_on_validation": True,
                              "evaluator_object": evaluator_validation,
                              "lower_validations_allowed": 3,
                              "validation_metric": "MAP",
                              }

    '''model = implicit.als.AlternatingLeastSquares(factors=50)

    # train the model on a sparse matrix of item/user/confidence weights
    model.fit(URM_train.T)

    # recommend items for a user
    rec_list = []
    for el in target_ids:
        recommendations = model.recommend(el, URM_train, N=100)
        rec_list.append([x[1] for x in recommendations])
    print(rec_list)
    #rows, cols, vals = zip(*rec_list)
    csr_rec = sps.csr_matrix(rec_list)

    pred = PredefinedListRecommender.PredefinedListRecommender(URM_train)
    pred.fit()'''

    ials = IALSRecommender.IALSRecommender(URM_ICM_train)
    alpha = 50
    print(alpha)
    ials.fit(**earlystopping_keywargs, num_factors=300, alpha=alpha)

    '''itemKNNCBF = ItemKNNCBFRecommender.ItemKNNCBFRecommender(URM_train, URM_train.T)
    itemKNNCBF.fit(topK=700, shrink=200, similarity='jaccard', normalize=True, feature_weighting="TF-IDF")
    itemKNNCBF2 = ItemKNNCBFRecommender.ItemKNNCBFRecommender(URM_train, URM_ICM_train.T)
    itemKNNCBF2.fit(topK=700, shrink=200, similarity='jaccard', normalize=True, feature_weighting="TF-IDF")
    itemKNNCBF3 = ItemKNNCBFRecommender.ItemKNNCBFRecommender(URM_train, URM_ICM_train2)
    itemKNNCBF3.fit(topK=700, shrink=200, similarity='jaccard', normalize=True, feature_weighting="TF-IDF")

    #cfw = CFW_D_Similarity_Linalg.CFW_D_Similarity_Linalg(URM_train, ICM_train, itemKNNCF.W_sparse)
    #cfw.fit(show_max_performance=False, logFile=None, loss_tolerance=1e-6,
    #        iteration_limit=500000, damp_coeff=0.5, topK=900, add_zeros_quota=0.5, normalize_similarity=True)

    hyb5 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_all)
    # Kaggle MAP 0.08856
    args = {"topK_P": 903, "alpha_P": 0.4108657561671193, "normalize_similarity_P": False, "topK": 448, "shrink": 20,
            "similarity": "tversky", "normalize": True, "alpha": 0.6290871066510789, "feature_weighting": "TF-IDF"}
    hyb5.fit(**args)

    hyb6 = ScoresHybridP3alphaKNNCBF.ScoresHybridP3alphaKNNCBF(URM_train, ICM_all)
    #args = {"topK_P": 1303, "alpha_P": 0.4808657561671193, "normalize_similarity_P": False, "topK": 848, "shrink": 1,
    #        "similarity": "tversky", "normalize": False, "alpha": 0.5790871066510789, "feature_weighting": "TF-IDF"}
    args = {"topK_P": 756, "alpha_P": 0.5292654015790155, "normalize_similarity_P": False, "topK": 1000, "shrink": 47,
            "similarity": "tversky", "normalize": False, "alpha": 0.5207647439152092, "feature_weighting": "none"}
    hyb6.fit(**args)

    #cf = ItemKNNCFRecommender.ItemKNNCFRecommender(URM_ICM_train)
    #cf.fit(**{"topK": 259, "shrink": 24, "similarity": "cosine", "normalize": True})
    #W_sparse_CF = cf.W_sparse
    #hyb7 = CFW_D_Similarity_Linalg.CFW_D_Similarity_Linalg(URM_train, ICM_all, W_sparse_CF)
    #hyb7.fit(**{"topK": 575, "add_zeros_quota": 0.6070346405411541, "normalize_similarity": False})
    hyb7 = ScoresHybridSpecializedV2Mid.ScoresHybridSpecializedV2Mid(URM_ICM_train, URM_ICM_train.T)
    hyb7.fit(**{"topK_P": 516, "alpha_P": 0.4753488773601332, "normalize_similarity_P": False, "topK": 258, "shrink": 136,
                "similarity": "asymmetric", "normalize": False, "alpha": 0.48907705969537585, "feature_weighting": "BM25"})

    print(evaluator_validation.evaluateRecommender(itemKNNCBF))
    print(evaluator_validation.evaluateRecommender(itemKNNCBF2))
    print(evaluator_validation.evaluateRecommender(itemKNNCBF3))
    print(evaluator_validation.evaluateRecommender(hyb7))
    print(evaluator_validation.evaluateRecommender(hyb5))
    print(evaluator_validation.evaluateRecommender(hyb6))'''

    print(evaluator_validation.evaluateRecommender(ials))

    #item_list = recommender.recommend(target_ids, cutoff=10)
    #CreateCSV.create_csv(target_ids, item_list, 'MyRec')
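A note on the URM_ICM_train construction in this script: vstack-ing the transposed ICM under the URM keeps items on the columns and treats each content feature as a pseudo-user, so a purely collaborative model such as IALS also absorbs content signals (URM_ICM_train2 applies the dual trick on the item side). A toy shape check, with illustrative dimensions:

import scipy.sparse as sps

n_users, n_items, n_features = 100, 50, 20
URM = sps.random(n_users, n_items, density=0.05, format='csr')
ICM = sps.random(n_items, n_features, density=0.10, format='csr')

# Items stay on the columns; feature rows act as pseudo-users
URM_ICM = sps.vstack([URM, ICM.T]).tocsr()
print(URM_ICM.shape)   # (120, 50) = (n_users + n_features, n_items)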