From f6193200255f6b1c30848dbe2133a55a5bfda123 Mon Sep 17 00:00:00 2001
From: hasteck <55817328+hasteck@users.noreply.github.com>
Date: Mon, 19 Oct 2020 14:36:24 +0200
Subject: [PATCH] Add files via upload

---
 EDLAE_NeurIPS2020.ipynb | 1217 +++++++++++++++++++++++++++++++++++++++
 README.md               |    8 +
 2 files changed, 1225 insertions(+)
 create mode 100644 EDLAE_NeurIPS2020.ipynb
 create mode 100644 README.md

diff --git a/EDLAE_NeurIPS2020.ipynb b/EDLAE_NeurIPS2020.ipynb
new file mode 100644
index 0000000..6060091
--- /dev/null
+++ b/EDLAE_NeurIPS2020.ipynb
@@ -0,0 +1,1217 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Autoencoders that don't overfit towards the Identity\n",
+    "\n",
+    "This notebook provides an implementation in Python 3.7.7 (and TensorFlow 1.15.0) of the algorithms outlined in the paper \n",
+    "\"Autoencoders that don't overfit towards the Identity\", \n",
+    "presented at the 34th Conference on Neural Information Processing Systems (NeurIPS 2020).\n",
+    "\n",
+    "For reproducibility, the experiments utilize publicly available [code](https://github.com/dawenl/vae_cf) for pre-processing three popular data-sets and for evaluating the learned models. That code accompanies the paper \"[Variational autoencoders for collaborative filtering](https://arxiv.org/abs/1802.05814)\" by Dawen Liang et al. at The Web Conference 2018. While the code for the MovieLens-20M data-set was made publicly available, the code for pre-processing the other two data-sets can easily be obtained by modifying their code as described in their paper.\n",
+    "The experiments were run on an AWS instance with 128 GB RAM and 16 vCPUs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import shutil\n",
+    "import sys\n",
+    "import time\n",
+    "from copy import deepcopy\n",
+    "\n",
+    "import numpy as np\n",
+    "from scipy import sparse\n",
+    "import pandas as pd\n",
+    "import bottleneck as bn\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "from tensorflow.contrib.layers import apply_regularization, l2_regularizer\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# change to the location of the data\n",
+    "DATA_DIR = '/my/data/directory/'\n",
+    "\n",
+    "itemId='songId' # for MSD data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_data = pd.read_csv(os.path.join(DATA_DIR, 'train_triplets.txt'), sep='\\t', header=None, names=['userId', 'songId', 'playCount'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pre-processing of the Data \n",
+    "\n",
+    "Utilizing the publicly available [code](https://github.com/dawenl/vae_cf), which is copied below (with kind permission of Dawen Liang). Note that the following code is modified so as to pre-process the [MSD data-set](https://labrosa.ee.columbia.edu/millionsong/tasteprofile). 
For pre-processing the [MovieLens-20M data-set](https://grouplens.org/datasets/movielens/20m/), see their original publicly available [code](https://github.com/dawenl/vae_cf).\n",
+    "\n",
+    "### Data splitting procedure\n",
+    "- Select 50K users as heldout users, 50K users as validation users, and the rest of the users for training\n",
+    "- Use all the items from the training users as item set\n",
+    "- For each validation and test user, subsample 80% of their interactions as fold-in data and use the rest for prediction "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_count(tp, id):\n",
+    "    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)\n",
+    "    count = playcount_groupbyid.size()\n",
+    "    return count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def filter_triplets(tp, min_uc=5, min_sc=0):\n",
+    "    # Only keep the triplets for items which were clicked on by at least min_sc users. \n",
+    "    if min_sc > 0:\n",
+    "        itemcount = get_count(tp, itemId)\n",
+    "        tp = tp[tp[itemId].isin(itemcount.index[itemcount >= min_sc])]\n",
+    "    \n",
+    "    # Only keep the triplets for users who clicked on at least min_uc items\n",
+    "    # After doing this, some of the items will have less than min_uc users, but should only be a small proportion\n",
+    "    if min_uc > 0:\n",
+    "        usercount = get_count(tp, 'userId')\n",
+    "        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]\n",
+    "    \n",
+    "    # Update both usercount and itemcount after filtering\n",
+    "    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, itemId) \n",
+    "    return tp, usercount, itemcount"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=20, min_sc=200) # for MSD data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After filtering, there are 33633450 watching events from 571355 users and 41140 movies (sparsity: 0.143%)\n"
+     ]
+    }
+   ],
+   "source": [
+    "sparsity = 1. 
* raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])\n", + "\n", + "print(\"After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)\" % \n", + " (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "unique_uid = user_activity.index\n", + "\n", + "np.random.seed(98765)\n", + "idx_perm = np.random.permutation(unique_uid.size)\n", + "unique_uid = unique_uid[idx_perm]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# create train/validation/test users\n", + "n_users = unique_uid.size\n", + "n_heldout_users = 50000 # for MSD data\n", + "\n", + "tr_users = unique_uid[:(n_users - n_heldout_users * 2)]\n", + "vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]\n", + "te_users = unique_uid[(n_users - n_heldout_users):]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "unique_sid = pd.unique(train_plays[itemId])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))\n", + "profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "pro_dir = os.path.join(DATA_DIR, 'pro_sg')\n", + "\n", + "if not os.path.exists(pro_dir):\n", + " os.makedirs(pro_dir)\n", + "\n", + "with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:\n", + " for sid in unique_sid:\n", + " f.write('%s\\n' % sid)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def split_train_test_proportion(data, test_prop=0.2):\n", + " data_grouped_by_user = data.groupby('userId')\n", + " tr_list, te_list = list(), list()\n", + "\n", + " np.random.seed(98765)\n", + "\n", + " for i, (_, group) in enumerate(data_grouped_by_user):\n", + " n_items_u = len(group)\n", + "\n", + " if n_items_u >= 5:\n", + " idx = np.zeros(n_items_u, dtype='bool')\n", + " idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True\n", + "\n", + " tr_list.append(group[np.logical_not(idx)])\n", + " te_list.append(group[idx])\n", + " else:\n", + " tr_list.append(group)\n", + "\n", + " if i % 5000 == 0:\n", + " print(\"%d users sampled\" % i)\n", + " sys.stdout.flush()\n", + "\n", + " data_tr = pd.concat(tr_list)\n", + " data_te = pd.concat(te_list)\n", + " \n", + " return data_tr, data_te" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]\n", + "vad_plays = vad_plays.loc[vad_plays[itemId].isin(unique_sid)]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 users sampled\n", + "5000 users sampled\n", + "10000 users sampled\n", + "15000 users sampled\n", + "20000 users sampled\n", + "25000 users sampled\n", + "30000 users sampled\n", + "35000 users sampled\n", + 
"40000 users sampled\n", + "45000 users sampled\n" + ] + } + ], + "source": [ + "vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]\n", + "test_plays = test_plays.loc[test_plays[itemId].isin(unique_sid)]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 users sampled\n", + "5000 users sampled\n", + "10000 users sampled\n", + "15000 users sampled\n", + "20000 users sampled\n", + "25000 users sampled\n", + "30000 users sampled\n", + "35000 users sampled\n", + "40000 users sampled\n", + "45000 users sampled\n" + ] + } + ], + "source": [ + "test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Save the data into (user_index, item_index) format" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "def numerize(tp):\n", + " uid = map(lambda x: profile2id[x], tp['userId'])\n", + " sid = map(lambda x: show2id[x], tp[itemId])\n", + " return pd.DataFrame(data={'uid': list(uid), 'sid': list(sid)}, columns=['uid', 'sid'])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = numerize(train_plays)\n", + "train_data.to_csv(os.path.join(pro_dir, 'train.csv'), index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "vad_data_tr = numerize(vad_plays_tr)\n", + "vad_data_tr.to_csv(os.path.join(pro_dir, 'validation_tr.csv'), index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "vad_data_te = numerize(vad_plays_te)\n", + "vad_data_te.to_csv(os.path.join(pro_dir, 'validation_te.csv'), index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "test_data_tr = numerize(test_plays_tr)\n", + "test_data_tr.to_csv(os.path.join(pro_dir, 'test_tr.csv'), index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "test_data_te = numerize(test_plays_te)\n", + "test_data_te.to_csv(os.path.join(pro_dir, 'test_te.csv'), index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the pre-processed training and test data" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "unique_sid = list()\n", + "with open(os.path.join(pro_dir, 'unique_sid.txt'), 'r') as f:\n", + " for line in f:\n", + " unique_sid.append(line.strip())\n", + "\n", + "n_items = len(unique_sid)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def load_train_data(csv_file):\n", + " tp = pd.read_csv(csv_file)\n", + " n_users = tp['uid'].max() + 1\n", + "\n", + " rows, cols = tp['uid'], tp['sid']\n", + " data = sparse.csr_matrix((np.ones_like(rows),\n", + " (rows, cols)), dtype='float64',\n", + " shape=(n_users, n_items))\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# load training data\n", + "train_data = 
load_train_data(os.path.join(pro_dir, 'train.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "#Pre-computation of the item-item training-data (used by analytic solutions)\n", + "XtX= np.asarray(train_data.T.dot(train_data).todense(), dtype = np.float32) \n", + "XtXdiag = deepcopy(np.diag(XtX)) \n", + "ii_diag = np.diag_indices(XtX.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def load_tr_te_data(csv_file_tr, csv_file_te):\n", + " tp_tr = pd.read_csv(csv_file_tr)\n", + " tp_te = pd.read_csv(csv_file_te)\n", + "\n", + " start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())\n", + " end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())\n", + "\n", + " rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']\n", + " rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']\n", + "\n", + " data_tr = sparse.csr_matrix((np.ones_like(rows_tr),\n", + " (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))\n", + " data_te = sparse.csr_matrix((np.ones_like(rows_te),\n", + " (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))\n", + " return data_tr, data_te" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "test_data_tr, test_data_te = load_tr_te_data(\n", + " os.path.join(pro_dir, 'test_tr.csv'),\n", + " os.path.join(pro_dir, 'test_te.csv'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Evaluation \n", + "\n", + "Utilizing the publicly available [code](https://github.com/dawenl/vae_cf), which is copied below (with kind permission of Dawen Liang).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):\n", + " '''\n", + " normalized discounted cumulative gain@k for binary relevance\n", + " ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance\n", + " '''\n", + " batch_users = X_pred.shape[0]\n", + " idx_topk_part = bn.argpartition(-X_pred, k, axis=1)\n", + " topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],\n", + " idx_topk_part[:, :k]]\n", + " idx_part = np.argsort(-topk_part, axis=1)\n", + " # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted\n", + " # topk predicted score\n", + " idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]\n", + " # build the discount template\n", + " tp = 1. 
/ np.log2(np.arange(2, k + 2))\n", + "\n", + " DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],\n", + " idx_topk].toarray() * tp).sum(axis=1)\n", + " IDCG = np.array([(tp[:min(n, k)]).sum()\n", + " for n in heldout_batch.getnnz(axis=1)])\n", + " return DCG / IDCG" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def Recall_at_k_batch(X_pred, heldout_batch, k=100):\n", + " batch_users = X_pred.shape[0]\n", + "\n", + " idx = bn.argpartition(-X_pred, k, axis=1)\n", + " X_pred_binary = np.zeros_like(X_pred, dtype=bool)\n", + " X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True\n", + "\n", + " X_true_binary = (heldout_batch > 0).toarray()\n", + " tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(\n", + " np.float32)\n", + " recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))\n", + " return recall" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate(BB, test_data_tr = test_data_tr, test_data_te = test_data_te):\n", + " print(\"evaluating ...\")\n", + " N_test = test_data_tr.shape[0]\n", + " idxlist_test = range(N_test)\n", + "\n", + " batch_size_test = 5000\n", + " n100_list, r20_list, r50_list = [], [], []\n", + " for bnum, st_idx in enumerate(range(0, N_test, batch_size_test)):\n", + " end_idx = min(st_idx + batch_size_test, N_test)\n", + " X = test_data_tr[idxlist_test[st_idx:end_idx]]\n", + "\n", + " if sparse.isspmatrix(X):\n", + " X = X.toarray()\n", + " X = X.astype('float32')\n", + "\n", + " pred_val = X.dot(BB)\n", + " # exclude examples from training and validation (if any)\n", + " pred_val[X.nonzero()] = -np.inf\n", + " n100_list.append(NDCG_binary_at_k_batch(pred_val, test_data_te[idxlist_test[st_idx:end_idx]], k=100))\n", + " r20_list.append(Recall_at_k_batch(pred_val, test_data_te[idxlist_test[st_idx:end_idx]], k=20))\n", + " r50_list.append(Recall_at_k_batch(pred_val, test_data_te[idxlist_test[st_idx:end_idx]], k=50))\n", + "\n", + " n100_list = np.concatenate(n100_list)\n", + " r20_list = np.concatenate(r20_list)\n", + " r50_list = np.concatenate(r50_list)\n", + " print(\"Test NDCG@100=%.5f (%.5f)\" % (np.mean(n100_list), np.std(n100_list) / np.sqrt(len(n100_list))))\n", + " print(\"Test Recall@20=%.5f (%.5f)\" % (np.mean(r20_list), np.std(r20_list) / np.sqrt(len(r20_list))))\n", + " print(\"Test Recall@50=%.5f (%.5f)\" % (np.mean(r50_list), np.std(r50_list) / np.sqrt(len(r50_list))))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "class MyClock:\n", + " startTime = time.time()\n", + " def tic(self):\n", + " self.startTime = time.time()\n", + " def toc(self):\n", + " secs = time.time() - self.startTime \n", + " print(\"... elapsed time: {} min {} sec\".format(int(secs//60), secs%60) )\n", + "\n", + "myClock = MyClock()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Methods for Learning the various models in Table 1 in the paper " + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "def learn_EDLAE_fullrank(pdrop, L2const, XtX = XtX, XtXdiag = XtXdiag, iidiag = ii_diag):\n", + " # full-rank EDLAE, implements Eqs. 
8 and 9 in the Paper\n",
+    "    XtX[ii_diag]= XtXdiag + L2const + XtXdiag * pdrop / (1.0-pdrop) \n",
+    "    BB=np.linalg.inv(XtX)\n",
+    "    BB/=-np.diag(BB)\n",
+    "    BB[ii_diag]=0.0\n",
+    "    return BB"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def learn_EDLAE_analytic(pdrop, L2const, omega, hidden_dim, train_epochs, init_scale = 0.0001, XtX = XtX, XtXdiag = XtXdiag, iidiag = ii_diag):\n",
+    "    # low-rank EDLAE, implements ADMM approach derived in the Supplement to the Paper \n",
+    "    # precompute\n",
+    "    OmegaDiag = L2const + XtXdiag * pdrop / (1.0-pdrop) + omega # penalty-vector in ADMM\n",
+    "    XtX[ii_diag]= XtXdiag + L2const + XtXdiag * pdrop / (1.0-pdrop)\n",
+    "    PP=np.linalg.inv(XtX) \n",
+    "    # initialization\n",
+    "    betaVec= np.zeros(XtX.shape[0])\n",
+    "    gammaVec= np.zeros(XtX.shape[0])\n",
+    "    UU=np.random.randn(XtX.shape[0],hidden_dim) * init_scale\n",
+    "    # ADMM iterations \n",
+    "    for itercnt in range(train_epochs):\n",
+    "        print(\" iteration step: {}\".format(itercnt))\n",
+    "        #### update VVt\n",
+    "        XtX[ii_diag]= XtXdiag + L2const + XtXdiag * pdrop / (1.0-pdrop) + OmegaDiag\n",
+    "        HH=UU.T.dot(XtX).dot(UU)\n",
+    "        HH= np.linalg.inv(HH).dot(UU.T)\n",
+    "        XtX[ii_diag]= XtXdiag\n",
+    "        GG= XtX *(1.0+betaVec)\n",
+    "        GG[ii_diag]+= OmegaDiag* (betaVec-gammaVec)\n",
+    "        VVt= HH.dot(GG)\n",
+    "        # update UU\n",
+    "        HH= VVt.dot(VVt.T)\n",
+    "        HH=np.linalg.inv(HH)\n",
+    "        HH=VVt.T.dot(HH)\n",
+    "        XtX[ii_diag]= XtXdiag\n",
+    "        UU= XtX *(1.0+betaVec)\n",
+    "        UU[ii_diag]+= OmegaDiag * (betaVec-gammaVec)\n",
+    "        UU= UU.dot(HH)\n",
+    "        UU=PP.dot(UU)\n",
+    "        # update betaVec\n",
+    "        UUVVt = UU.dot(VVt)\n",
+    "        UUVVdiag = np.diag(UUVVt)\n",
+    "        XtX[ii_diag]= XtXdiag\n",
+    "        HH = np.diag(XtX.dot(UUVVt)) - XtXdiag + OmegaDiag*(UUVVdiag+gammaVec)\n",
+    "        GG = XtXdiag + OmegaDiag - XtXdiag * pdrop / (1.0-pdrop) - L2const \n",
+    "        betaVec = HH / GG\n",
+    "        betaVec=np.maximum(betaVec, 0.0) # self-similarity has to be non-negative (for stability)\n",
+    "        # update gammaVec\n",
+    "        gammaVec+= UUVVdiag-betaVec \n",
+    "    return [UU,VVt.T]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# low-rank with (0-diagonal or not)-constraint, with product-L2, \n",
+    "# for lines 2 and 3 without constraint\n",
+    "# for figure 1, with and without constraint\n",
+    "# if 0-diag: approximation to EDLAE for large matrix-ranks\n",
+    "def learn_EDLAE_approx(zeroDiagConstraint, pdrop, L2const, hidden_dim, train_epochs, init_scale = 0.0001, XtX = XtX, XtXdiag = XtXdiag, iidiag = ii_diag):\n",
+    "    # approximates low-rank EDLAE for: \n",
+    "    # 1. small matrix rank (-> unconstrained diagonal), and \n",
+    "    # 2. 
large matrix rank (-> zero-diagonal enforced)\n",
+    "    if zeroDiagConstraint:\n",
+    "        print(\"zero diagonal enforced during training ...\")\n",
+    "    else:\n",
+    "        print(\"unconstrained diagonal during training ...\")\n",
+    "    # precompute\n",
+    "    XtX[ii_diag]= XtXdiag + L2const + XtXdiag * pdrop / (1.0-pdrop)\n",
+    "    CC=np.linalg.inv(XtX)\n",
+    "    # random initialization\n",
+    "    VVt=np.random.randn(hidden_dim, XtX.shape[0]) * init_scale\n",
+    "    # iterative optimization\n",
+    "    for itercnt in range(train_epochs):\n",
+    "        print(\" iteration step: {}\".format(itercnt))\n",
+    "        # update UU\n",
+    "        GG= VVt.dot(VVt.T)\n",
+    "        GG=np.linalg.inv(GG)\n",
+    "        # note: gammaVec equals eta + diag(Lambda) in the Supplement\n",
+    "        gammaVec= L2const + XtXdiag * pdrop / (1.0-pdrop) # without zero-diag constraint \n",
+    "        if zeroDiagConstraint:\n",
+    "            KK = VVt.T.dot(GG).dot(VVt)\n",
+    "            gammaVec= np.linalg.solve( CC * KK , np.diag(KK))\n",
+    "        HH=CC *(-gammaVec) \n",
+    "        HH[ii_diag]+=1.0\n",
+    "        UU=HH.dot( VVt.T.dot(GG) )\n",
+    "        # update VV\n",
+    "        XtX[ii_diag]= XtXdiag + L2const + XtXdiag * pdrop / (1.0-pdrop)\n",
+    "        HH=UU.T.dot(XtX).dot(UU)\n",
+    "        HH= np.linalg.inv(HH).dot(UU.T)\n",
+    "        XtX[ii_diag]= XtXdiag\n",
+    "        VVt= HH.dot(XtX)\n",
+    "        if zeroDiagConstraint:\n",
+    "            diag_up = np.diag(UU.dot(VVt))\n",
+    "            diag_down = np.diag(UU.dot(HH))\n",
+    "            etaVec= diag_up / diag_down \n",
+    "            VVt-= HH * etaVec \n",
+    "    return [UU,VVt.T]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def learn_DAE_stochastic(pdrop, L2constAdd, L2constProd, hidden_dim, train_epochs, bsize =4096, X = train_data ):\n",
+    "    # sampled denoising autoencoder, implemented using tensorflow \n",
+    "    features_dim=X.shape[1]\n",
+    "    tf.reset_default_graph() # reset graph if run many times\n",
+    "    # training data\n",
+    "    training_data = tf.placeholder(tf.float32, shape=(None, features_dim), name='training_data')\n",
+    "    # L2-regularization lambda\n",
+    "    lamAdd = tf.placeholder(tf.float32, shape=(), name='lambdaAdd')\n",
+    "    lamProd = tf.placeholder(tf.float32, shape=(), name='lambdaProd')\n",
+    "    # weight matrices\n",
+    "    UU = tf.get_variable(name=\"UU\", shape=[features_dim, hidden_dim], initializer=tf.contrib.layers.xavier_initializer())\n",
+    "    VVt = tf.get_variable(name=\"VVt\", shape=[hidden_dim, features_dim], initializer=tf.contrib.layers.xavier_initializer())\n",
+    "    WeightMatrices=[UU,VVt]\n",
+    "    # network architecture\n",
+    "    h_in = tf.nn.dropout(training_data, rate= pdrop) # stochastic denoising applied to input\n",
+    "    h_hidden = tf.matmul(h_in, UU) # linear model\n",
+    "    h_out = tf.matmul(h_hidden, VVt)\n",
+    "    # L2 regularization as in Line 1 in Table 1 in the paper\n",
+    "    regAdd = l2_regularizer(lamAdd) \n",
+    "    L2regAdd = apply_regularization(regAdd, WeightMatrices)\n",
+    "    regProd = l2_regularizer(lamProd) \n",
+    "    L2regProd = apply_regularization(regProd, [ tf.matmul(UU,VVt) ]) \n",
+    "    # squared error + stochastic denoising + L2 regularization\n",
+    "    mse = tf.reduce_mean( tf.square(tf.subtract(training_data, h_out)))\n",
+    "    loss = mse + L2regAdd + L2regProd\n",
+    "    # optimizer\n",
+    "    optimizer = tf.train.AdamOptimizer()\n",
+    "    train_op = optimizer.minimize(loss)\n",
+    "    # training\n",
+    "    N = X.shape[0]\n",
+    "    idxlist = np.arange(N)\n",
+    "    with tf.Session() as sess:\n",
+    "        init = tf.global_variables_initializer()\n",
+    "        sess.run(init)\n",
+    "        loss_list = list()\n",
+    "        mse_list = list()\n",
+    "        for epoch in range(train_epochs):\n",
+    "            loss_epoch = 
list()\n", + " mse_epoch = list()\n", + " np.random.shuffle(idxlist)\n", + " for bnum, st_idx in enumerate(range(0, N, bsize)):\n", + " end_idx = min(st_idx + bsize, N)\n", + " inp = X[idxlist[st_idx:end_idx]]\n", + " if sparse.isspmatrix(inp):\n", + " trainBatch = inp.toarray().astype('float32') \n", + " feed_dict = {training_data:trainBatch, lamAdd:L2constAdd, lamProd:L2constProd}\n", + " l, m, _ = sess.run([loss, mse, train_op], feed_dict=feed_dict)\n", + " loss_epoch.append(l)\n", + " mse_epoch.append(m)\n", + " loss_list.append(np.mean(loss_epoch))\n", + " mse_list.append(np.mean(mse_epoch))\n", + " print(\"Epoch: %d, training_rmse=%.3f, training_loss=%.3f\" % (epoch, np.sqrt(mse_list[-1]), loss_list[-1]))\n", + " UUfinal = sess.run(UU)\n", + " VVfinal = sess.run(VVt).T \n", + " return [UUfinal,VVfinal]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training of the various models in Table 1" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:\n", + "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", + "For more information, please see:\n", + " * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", + " * https://github.com/tensorflow/addons\n", + " * https://github.com/tensorflow/io (for I/O related ops)\n", + "If you depend on functionality not listed there, please file an issue.\n", + "\n", + "Epoch: 0, training_rmse=0.037, training_loss=0.002\n", + "Epoch: 1, training_rmse=0.034, training_loss=0.001\n", + "Epoch: 2, training_rmse=0.032, training_loss=0.001\n", + "Epoch: 3, training_rmse=0.032, training_loss=0.001\n", + "Epoch: 4, training_rmse=0.032, training_loss=0.001\n", + "... elapsed time: 250 min 5.366348743438721 sec\n", + "evaluating ...\n", + "Test NDCG@100=0.24844 (0.00093)\n", + "Test Recall@20=0.19967 (0.00093)\n", + "Test Recall@50=0.27749 (0.00106)\n" + ] + } + ], + "source": [ + "# line 1 in Table 1 in the Paper\n", + "\n", + "#L2constAdd, train_epochs = [ 8e-6, 5 ] # for ML-20M data\n", + "#L2constAdd, train_epochs = [ 8e-6, 5 ] # for Netflix data\n", + "L2constAdd, train_epochs = [ 2e-7, 5 ] # for MSD data\n", + "\n", + "pdrop = 0.0 \n", + "L2constProd = 0.0\n", + "hidden_dim =1000\n", + "myClock.tic()\n", + "UU,VV=learn_DAE_stochastic(pdrop, L2constAdd, L2constProd, hidden_dim, train_epochs)\n", + "myClock.toc()\n", + "evaluate(UU.dot(VV.T))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unconstrained diagonal during training ...\n", + " iteration step: 0\n", + " iteration step: 1\n", + " iteration step: 2\n", + " iteration step: 3\n", + " iteration step: 4\n", + "... 
elapsed time: 17 min 29.487412214279175 sec\n", + "evaluating ...\n", + "Test NDCG@100=0.26636 (0.00095)\n", + "Test Recall@20=0.21677 (0.00095)\n", + "Test Recall@50=0.29608 (0.00106)\n" + ] + } + ], + "source": [ + "# line 2 in Table 1 in the Paper\n", + "# and also used for the blue dotted line in Figure 1 (left)\n", + "\n", + "#L2const, train_epochs = [ 10000.0, 5 ] # for ML-20M data\n", + "#L2const, train_epochs = [ 100000.0, 5 ] # for Netflix data\n", + "L2const, train_epochs = [ 30000.0, 5 ] # for MSD data\n", + "\n", + "\n", + "zeroDiagConstraint = False\n", + "pdrop = 0.0\n", + "hidden_dim =1000\n", + "myClock.tic()\n", + "UU_freediag,VV_freediag = learn_EDLAE_approx(zeroDiagConstraint, pdrop, L2const, hidden_dim, train_epochs)\n", + "myClock.toc()\n", + "evaluate(UU_freediag.dot(VV_freediag.T))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unconstrained diagonal during training ...\n", + " iteration step: 0\n", + " iteration step: 1\n", + " iteration step: 2\n", + " iteration step: 3\n", + " iteration step: 4\n", + "... elapsed time: 17 min 26.759504556655884 sec\n", + "evaluating ...\n", + "Test NDCG@100=0.31475 (0.00101)\n", + "Test Recall@20=0.25980 (0.00102)\n", + "Test Recall@50=0.35086 (0.00111)\n" + ] + } + ], + "source": [ + "# line 3 in Table 1 in the Paper\n", + "\n", + "#pdrop, L2const, train_epochs = [ 0.67, 200.0, 5 ] # for ML-20M data\n", + "#pdrop, L2const, train_epochs = [ 0.67, 500.0, 5 ] # for Netflix data\n", + "pdrop, L2const, train_epochs = [ 0.75, 20.0, 5 ] # for MSD data\n", + "\n", + "\n", + "zeroDiagConstraint = False\n", + "hidden_dim =1000\n", + "myClock.tic()\n", + "UU,VV = learn_EDLAE_approx(zeroDiagConstraint, pdrop, L2const, hidden_dim, train_epochs)\n", + "myClock.toc()\n", + "evaluate(UU.dot(VV.T))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:Large dropout rate: 0.67 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.\n", + "Epoch: 0, training_rmse=0.039, training_loss=0.001\n", + "Epoch: 1, training_rmse=0.036, training_loss=0.001\n", + "Epoch: 2, training_rmse=0.035, training_loss=0.001\n", + "Epoch: 3, training_rmse=0.034, training_loss=0.001\n", + "Epoch: 4, training_rmse=0.034, training_loss=0.001\n", + "Epoch: 5, training_rmse=0.034, training_loss=0.001\n", + "Epoch: 6, training_rmse=0.034, training_loss=0.001\n", + "Epoch: 7, training_rmse=0.034, training_loss=0.001\n", + "Epoch: 8, training_rmse=0.034, training_loss=0.001\n", + "Epoch: 9, training_rmse=0.034, training_loss=0.001\n", + "... 
elapsed time: 507 min 10.029773712158203 sec\n", + "evaluating ...\n", + "Test NDCG@100=0.30890 (0.00100)\n", + "Test Recall@20=0.25551 (0.00102)\n", + "Test Recall@50=0.34343 (0.00111)\n" + ] + } + ], + "source": [ + "# line 4 in Table 1 in the Paper\n", + "\n", + "#pdrop, L2constProd, train_epochs = [ 0.67, 3e-9, 10 ] # for ML-20M data\n", + "#pdrop, L2constProd, train_epochs = [ 0.67, 2e-9, 10 ] # for Netflix data\n", + "pdrop, L2constProd, train_epochs = [ 0.67, 1e-14, 10 ] # for MSD data\n", + "\n", + "\n", + "L2constAdd = 0.0\n", + "hidden_dim =1000\n", + "myClock.tic()\n", + "UU,VV=learn_DAE_stochastic(pdrop, L2constAdd, L2constProd, hidden_dim, train_epochs)\n", + "myClock.toc()\n", + "evaluate(UU.dot(VV.T))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " iteration step: 0\n", + " iteration step: 1\n", + " iteration step: 2\n", + " iteration step: 3\n", + " iteration step: 4\n", + " iteration step: 5\n", + " iteration step: 6\n", + " iteration step: 7\n", + " iteration step: 8\n", + " iteration step: 9\n", + " iteration step: 10\n", + " iteration step: 11\n", + " iteration step: 12\n", + " iteration step: 13\n", + " iteration step: 14\n", + " iteration step: 15\n", + " iteration step: 16\n", + " iteration step: 17\n", + " iteration step: 18\n", + " iteration step: 19\n", + "... elapsed time: 215 min 18.817319869995117 sec\n", + "evaluating ...\n", + "Test NDCG@100=0.31943 (0.00102)\n", + "Test Recall@20=0.26369 (0.00104)\n", + "Test Recall@50=0.35462 (0.00112)\n" + ] + } + ], + "source": [ + "# line 5 in Table 1 in the Paper\n", + "\n", + "#pdrop, L2const, omega, train_epochs = [ 0.33, 400.0, 500.0, 5 ] # for ML-20M data\n", + "#pdrop, L2const, omega, train_epochs = [ 0.15, 500.0, 500.0, 10 ] # for Netflix data\n", + "pdrop, L2const, omega, train_epochs = [ 0.1, 10.0, 500.0, 20 ] # for MSD data\n", + "\n", + "hidden_dim =1000\n", + "myClock.tic()\n", + "UU,VV = learn_EDLAE_analytic(pdrop, L2const, omega, hidden_dim, train_epochs)\n", + "myClock.toc()\n", + "evaluate(UU.dot(VV.T))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "zero diagonal enforced during training ...\n", + " iteration step: 0\n", + " iteration step: 1\n", + " iteration step: 2\n", + " iteration step: 3\n", + " iteration step: 4\n", + " iteration step: 5\n", + " iteration step: 6\n", + " iteration step: 7\n", + " iteration step: 8\n", + " iteration step: 9\n", + "... 
elapsed time: 63 min 40.53152918815613 sec\n", + "evaluating ...\n", + "Test NDCG@100=0.30179 (0.00100)\n", + "Test Recall@20=0.24939 (0.00101)\n", + "Test Recall@50=0.33314 (0.00110)\n" + ] + } + ], + "source": [ + "# used for green solid line in Figure 1 (left)\n", + "\n", + "#L2const, train_epochs = [ 500.0, 10 ] # for ML-20M data\n", + "#L2const, train_epochs = [ 1000.0, 10 ] # for Netflix data\n", + "L2const, train_epochs = [ 200.0, 10 ] # for MSD data\n", + " \n", + "zeroDiagConstraint = True\n", + "pdrop = 0.0\n", + "hidden_dim =1000\n", + "myClock.tic()\n", + "UU_0diag,VV_0diag = learn_EDLAE_approx(zeroDiagConstraint, pdrop, L2const, hidden_dim, train_epochs)\n", + "myClock.toc()\n", + "evaluate(UU_0diag.dot(VV_0diag.T))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cosine-Similarity Plots, like in Figure 1 " + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating ...\n", + "Test NDCG@100=0.39151 (0.00110)\n", + "Test Recall@20=0.33408 (0.00114)\n", + "Test Recall@50=0.42948 (0.00119)\n" + ] + } + ], + "source": [ + "# compute full-rank EDLAE to determine the most similar items for each item\n", + "# given that it is the most accurate model in this notebook\n", + "\n", + "#pdrop, L2const = [ 0.33 , 300.0 ] # for ML-20M data\n", + "#pdrop, L2const = [ 0.33 , 500.0 ] # for Netflix data\n", + "pdrop, L2const = [ 0.25 , 70.0 ] # for MSD data\n", + "\n", + "BB_fullrank = learn_EDLAE_fullrank(pdrop, L2const)\n", + "evaluate(BB_fullrank)\n", + "\n", + "topSims =10\n", + "BB_fullrank[ii_diag] =1.0 #include item itself into set of similar items\n", + "ixSims = bn.argpartition(-BB_fullrank, topSims, axis=0)[:topSims,:] # top similar items in each column" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_cosineSimilarity(EE, ixSims):\n", + " EEn= EE / np.sqrt(np.sum(EE*EE,1))[:,None]\n", + " EEcosine = EEn.dot(EEn.T)\n", + " cosineSimilarity = np.ones(ixSims.shape[1])\n", + " for ii in range(ixSims.shape[1]):\n", + " ii_sims = ixSims[:,ii]\n", + " simMat=EEcosine[np.ix_(ii_sims,ii_sims)]\n", + " cosineSimilarity[ii] = np.median(simMat[np.triu_indices(topSims, k=1)]) # median of all pairs i <> j\n", + " return cosineSimilarity" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'density')" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXgAAAEGCAYAAABvtY4XAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUy0lEQVR4nO3de5QkZX3G8eeRXeMNY3RHD8hlxYiIRHCnRT0YoxgVECEaVDwsRg5urwYNiZdEEo5ickyiHokxkri9gqtEFDVBCWo0h4ubKLfuRWCBGAFR0TUsIQKKQS6//FE1O71jT3fNdFdX19vfzzl9ti9VXb+ann3m7bfeessRIQBAeh5SdQEAgHIQ8ACQKAIeABJFwANAogh4AEjUiqoL6LZq1apYvXp11WUAQG10Op3bI2Km12sTFfCrV69Wu92uugwAqA3b31vsNbpoACBRBDwAJIqAB4BEEfAAkCgCHgASRcADQKIIeABIFAEPAImaqBOdlsse/j2YFh9AamjBA0CikmjBz1lOK3wUrX8AmES04AEgUQQ8ACSq1C4a27dIulvSA5Luj4hGmdsDAMwbRx/8CyPi9jFsBwDQhS4aAEhU2QEfkr5mu2O72WsB203bbdvt7du3l1wOAEyPsgP+eRGxRtLhkk6y/fyFC0REKyIaEdGYmel51SkAwDKUGvAR8cP839sknSfp4DK3BwCYV1rA236k7V3n7kt6iaStZW0PALCzMkfRPEHSec5OFV0h6ZyI+NcStwcA6FJawEfEzZIOLOv9AQD9MUwSABJFwANAogh4AEhUUtMFD2OYaYO5WAiASUQLHgASNfUt+GFa31wsBMAkowUPAIki4AEgUQQ8ACSKgAeARBHwAJAoAh4AEkXAA0CiCHgASBQBDwCJIuABIFEEPAAkioAHgEQR8ACQKAIeABJFwANAogh4AEgUAQ8AiSLgASBRBDwAJIqAB4BEEfAAkCgCHgASRcADQKIIeABIFAEPAIkqPeBt72L7KtsXlL0tAMC8cbTgT5Z0wxi2AwDoUmrA295D0sskfazM7QAAflnZLfgPSfpjSQ8utoDtpu227fb27dtLLgcApkdpAW/7SEm3RUSn33IR0YqIRkQ0ZmZmyioHAKZOmS34QyQdZfsWSZ+RdKjtfyxxewCALqUFfEScEhF7RMRqScdKuigi1pa1PQDAzhgHDwCJWjGOjUTEJZIuGce2AAAZWvAAkCgCHgASNZYumtTZy183YnR1AEA3WvAAkCha8EMYpvU9TKsfAIqgBQ8AiSLgASBRBDwAJIqAB4BEEfAAkCgCHgASRcADQKIIeABIFAEPAIki4AEgUQQ8ACSKgAeARBHwAJAoAh4AEkXAA0CiCHgASBQBDwCJIuABIFEEPAAkioAHgEQR8ACQKAIeABJFwANAogh4AEhUoYC3/XLb/DEAgBopGtqvkfQd2++3vV+ZBQEARqNQwEfEWknPlHSTpE22L7XdtL3rYuvYfpjtK2xfbfs62+8ZUc0AgAIKd7tExF2SPi/pM5J2k/QKSVtsv2WRVe6VdGhEHCjpIEmH2X7OcOUCAIoq2gd/tO3zJF0iaaWkgyPicEkHSnpbr3Ui89P84cr8FkNXDAAoZEXB5V4p6W8iYnP3kxFxj+0TF1vJ9i6SOpJ+XdIZEXF5j2WakpqStNdeexWtGwAwQNEumh8vDHfb75OkiLhwsZUi4oGIOEjSHpIOtn1Aj2VaEdGIiMbMzEzxyhNhD3cDgMUUDfgX93ju8KIbiYifSLpY0mFF1wEADKdvwNt+k+1rJe1n+5qu23clXTNg3Rnbj8nvP1zZH4n/HFHdtRcx3A0ABhnUB3+OpK9I+itJ7+x6/u6IuGPAurtJ+kTeD/8QSZ+NiAuWXSkAYEkGBXxExC22T1r4gu3H9gv5iLhG2dh5AEAFirTgj1Q2EiYkdR/WC0n7lFQXAGBIfQM+Io7M/33SeMoBAIxK0ROdDrH9yPz+Wtun22bQOgBMsKLDJP9B0j22585cvUnS2aVVBQAYWtGAvz8iQtLRkj4SEWdIWnSiMQBA9YpOVXC37VMkrZX0/Hxu+JXllQUAGNZS5oO/V9KJEfFjZVMPfKC0qgAAQyvUgs9D/fSux9+X9MmyigIADK/oKJpX2v6O7Ttt32X7btt3lV0cgCH0m6Wu05lfrtlkNrtEFe2Df7+kl0fEDWUWAwAYnaIB/9+EO1Az7Xb27+xs/+VarezWbdA6qIWiAd+2fa6kLyg72CpJioh/LqMoACMwTEh3d+GgtooG/KMl3SPpJV3PhSQCHgAmVNFRNCeUXQiAEWs2s38Xdr9gahQdRbOv7Qttb80fP8P2qeWWBmAoGzdmt+VgFE0Sip7otFHSKZLuk3bM9X5sWUUByLVaxS/KOzvLEEfspGjAPyIirljw3P2jLgYAMDpFA/52209WdmBVto+RtK20qgBkms3iF+ftdLiAL3ZSdBTNSZJayi6+/UNJ35V0XGlVAZg/ODp3sBRYIkefv/C237rgqYcra/X/TJIi4vRfWmkIjUYj2nMnZyzBXHfjNDVWpnGfp06VHzK/YLVhuxMRjV6vDWrBz835/lRJz5L0RWXXZT1e0sI+eQDABBl0Tdb3SJLtzZLWRMTd+ePTJH2p9OoAVGPDhqorwAgU7YN/gqRfdD3+Rf4cgBTR75+EogH/SUlX2D4vf/w7kjaVURAAYDSKTlXwXttfkfSb+VMnRMRV5ZUFoFKM4ElC31E048YomuKmcZ+nDqNoUMAwo2gAVIVwxZCKnskKAKgZAh4AEkXAA5NqdpZL52Eo9MEDk2rLlqorQM2V1oK3vafti21fb/s62yeXtS0AwC8rswV/v6S3RcQW27tK6tj+t4i4vsRtAhgFRvAkobQWfERsi4gt+f27Jd0g6YllbQ8AsLOxHGS1vVrSMyVd3uO1pu227fb27dvHUQ5QjU6n/+X3Op35ZTmDFCNQesDbfpSkf5L0hxFx18LXI6IVEY2IaMzMzJRdDjBezWY9w5oRPEkodaoC2yslXSDpq0UuDsJUBcVN4z7XUl0/qLrWPYX6TVVQ5igaSzpT0g2jvvITAGCwMrtoDlF25adDbX8rvx1R4vYAAF1KGyYZEf+h7PJ+AIAKMFUBACSKgAcw2FKHeJov75OAuWiAMq1ZU3UFy7du3fLW27hxtHVg2Qj4mhumocQIuDHobtnWycJfjtlZfmFqiC4aAEgULfiaGqYxRfcoMB1owQNlmjsICVSAgAeARNFFA2C06jxyKDEEPIDRquvIoQTRRQMAiSLgASBRBDyA0WLk0MSgDx4o04YNVVeAKUbAA2Wq4+X6kAy6aAAgUQQ8MEiv6XEXXpB6sWl0d99darWqqRtTj4AHyrRtm7R+fdVVYErRBw8U1W+GN6bSxQQi4IFBlnvhi2nFyKGJQcADg9CHvjSMHJoY9MEDQKII+MVGP3Sbne1/wWGkrdNhAq2laLX41jMh6KIZFv2z6Ws0sn85kFrM3KghumoqR8DP6fefl9YbgBoi4DniDyBRBPwwXyPnWvYLz2oEgAnAQdZhNBrz/bOoj3
4Hzbv/4Hc6HERHrRHwHPEHkCjHBI0MaDQa0W63l7zeXCNrWbsyzMpDbbg6NS17OFO50xXhZz1WtjsR0bMrgT54AKNFsE+M0rpobJ9l+zbbW8vaBgBgcWX2wW+SdFiJ7w8A6KO0gI+IzZLuKOv9AUyo2VmGDk+IykfR2G7abttub9++vepylqbdzm6oRr/5gbpHRjFKary2bMluqFzlB1kjoiWpJWWjaCouZ2lopYzX3M+bqSOAQioP+MpN8RH/Yc7hqeTHtrBVWLSIZpOJrzCVKu+iqbVmc/EugqIXZQaAkpQ5TPLTki6V9FTbt9o+saxtVWbjxqorWJaI5d8A1EeZo2heGxG7RcTKiNgjIs4sbVuyQj1ax919tUtpbRfecJ8kXNhPTFICGDP64KVqjvgzTTFS1X0RnE6n/4R87fZ8A6vZnP9WTCNoJNIK+H6/FJM2qRgH/ZCqSfp/NuWSCPhZZWPRGTyXOC6PWD+zs8Vb461WbY9rTaokAn6Lajgefa6VM00t+YWjhtatm/85FPkqT8sQWBKGSVZl/frsNmjo5FIvTrHYrdcB52G0Wv2312sfAIxVEi34DZoLOlp4hVT9NbjXV/alfJUHUEgSF/zgAgNL1Ks1vWbNzq38RVrcP9JuOk2nqRVT1LWE8WE6iiXrd8EPumiwJLtrm1paX3UZSFWnQ7iPUBJdNFiiIt90Fllmx5elEZYDoBy04AEgUQQ8gMnBJHwjRcADQKKS6IPvaI0k1fF0JwAoTRIB38gnKeDAHwDMo4sGABKVRAse41e7y/0BUyiJgA95xz0AQCaJgMf4DNP6ZvQbBuJCOCNFwAOYHNM0ffYYcJAVABJFwAOYHJN2ac2ao4sGwORYn89USlfNSNCCB4BEJdGCbyo78s4XOwCYl0QLfqOa2ii+0gHJGOb6xNghiYAHAElcEWqBJLpo1u3onOGveB0Me8ITUx0krN+HWyS4G43B7zNFkgj4+WuEEvAAMCeJgEc9DNuoYqoDYGnogweARBHwAJCoUgPe9mG2v237RtvvLHNbAICdlRbwtneRdIakwyXtL+m1tvcva3uYHosNhS77hpo48sj5+83m4h/o7IKrOCf4QZfZgj9Y0o0RcXNE/ELSZyQdXeL2gFJV9YeF29L+CM9ecNqOx62Ni3+enS2L/AFf8KZNt3Y8bLrVt4Duhx0vfmJWy80dD2dd3rh9R0njRW0fI+mwiHhD/vh4Sc+OiDcvWK6p+fGNT5X07WVucpWk25e5bl2xz+mbtv2V2Oel2jsiZnq9UPkwyYhoaQTTyNhuR0RjBCXVBvucvmnbX4l9HqUyu2h+KGnPrsd75M8BAMagzIC/UtJTbD/J9kMlHSvp/BK3BwDoUloXTUTcb/vNkr4qaRdJZ0XEdWVtT9M5WzD7nL5p21+JfR6Z0g6yAgCqxZmsAJAoAh4AElW7gB80/YHtX7F9bv765bZXV1DmyBTY37favt72NbYvtL13FXWOUtEpLmz/ru2wXfshdUX22far88/6OtvnjLvGUSvwu72X7YttX5X/fh9RRZ2jYvss27fZ3rrI67b94fzncY3tNUNvNCJqc1N2sPYmSftIeqikqyXtv2CZ35f00fz+sZLOrbrukvf3hZIekd9/U533t+g+58vtKmmzpMskNaquewyf81MkXSXp1/LHj6+67jHsc0vSm/L7+0u6peq6h9zn50taI2nrIq8fIekrkizpOZIuH3abdWvBF5n+4GhJn8jvf17Si+ydTkSuk4H7GxEXR8Q9+cPLlJ1vUGdFp7j4C0nvk/R/4yyuJEX2eZ2kMyLifyUpIm4bc42jVmSfQ9Kj8/u/KulHY6xv5CJis6Q7+ixytKRPRuYySY+xvdsw26xbwD9R0g+6Ht+aP9dzmYi4X9Kdkh43lupGr8j+djtRWQugzgbuc/7Vdc+I+NI4CytRkc95X0n72v6G7ctsHza26spRZJ9Pk7TW9q2SvizpLeMprTJL/f8+UOVTFWA0bK+V1JD0W1XXUibbD5F0uqTXV1zKuK1Q1k3zAmXf0jbb/o2I+EmVRZXstZI2RcQHbT9X0tm2D4iIB6surC7q1oIvMv3BjmVsr1D21e5/xlLd6BWa7sH2b0v6M0lHRcS9Y6qtLIP2eVdJB0i6xPYtyvoqz6/5gdYin/Otks6PiPsi4ruS/ktZ4NdVkX0+UdJnJSkiLpX0MGWTcqVq5NO71C3gi0x/cL6k38vvHyPposiPYNTQwP21/UxJG5SFe937ZaUB+xwRd0bEqohYHRGrlR13OCoi2tWUOxJFfq+/oKz1LturlHXZ3DzGGketyD5/X9KLJMn205QF/PaxVjle50t6XT6a5jmS7oyIbcO8Ya26aGKR6Q9s/7mkdkScL+lMZV/lblR2QOPY6ioeTsH9/YCkR0n6XH4s+fsRcVRlRQ+p4D4npeA+f1XSS2xfL+kBSe+IiLp+My26z2+TtNH2Hyk74Pr6GjfWZPvTyv5Ir8qPK7xb0kpJioiPKjvOcISkGyXdI+mEobdZ458XAKCPunXRAAAKIuABIFEEPAAkioAHgEQR8ACQKAIetWe7YfvDJW/jm0tc/o22X5ff32T7mCHWf73t3ZeyPiAxTBIone1Nki6IiM8XXH5FPo/S3ONLJL295idzoQK04DERbL8unwP7attn215t+6Kuee73ypd7le2t+XKb8+deYPuC/P5p+bzbl9i+2fYfdG1jre0rbH/L9gbbu/So4+ldy1xj+yn58z/t2tbXbX8xf/+/tn1cvs61tp/cVcfbe7z/u2xfme9Da26m07zeD9luSzp5bv285d+Q9Km8ppfZ/kLX+73Y9nmj+hyQFgIelbP9dEmnSjo0Ig6UdLKkv5P0iYh4hqRPSZrrgnmXpJfmyy12xu5+kl6qbErad9temZ/q/hpJh0TEQcrOBj2ux7pvlPS3+TINZXPALHRgvtzTJB0vad+IOFjSxzR4xsOPRMSzIuIASQ+XdGTXaw+NiEZEfHDuibzV35Z0XF7TlyXtZ3smX+QESWcN2CamFAGPSXCopM9FxO2SFBF3SHqupLmrFp0t6Xn5/W9I2mR7nbJT3Hv5UkTcm7/fbZKeoGxOk1lJV9r+Vv54nx7rXirpT23/iaS9I+LnPZa5MiK25RO73STpa/nz10paPWBfX+jsSmPX5vv99K7Xzh2wrvJT9c9WNo3uY5T9nOo+RTRKUqu5aICIeKPtZ0t6maSO7dkei3XPqPmAst9zK/tGcEr3grZfoWxOEEl6Q0ScY/vy/P2/bHt9RFzU5/0f7Hr8oPr8n7L9MEl/r+wKVD+wfZqyCbTm/GyxdRf4uKR/UXaxk89199cD3WjBYxJcJOlVth8nSbYfK+mbmp8o7jhJ/56/9uSIuDwi3qVsZsE9e7xfLxdKOsb24+e2YXvviDgvIg7Kb23b+0i6OSI+LOmLkp4xqp3UfJjfbvtRymY7LeJuZdMkS5Ii4kfKrm50qrKwB3qiBY/K5bMIvlfS120/oOzao2+R9HHb71AW5HMz630gP/BpZaF9tQpc5CQirrd9qqSvObtoyH2ST
pL0vQWLvlrS8bbvk/RjSX859A7O1/AT2xslbc3f+8qCq26S9FHbP5f03Lzb6FOSZiLihlHVh/QwTBKoIdsfkXRVRJxZdS2YXAQ8UDO2O8r661+cwBW8UCICHgASxUFWAEgUAQ8AiSLgASBRBDwAJIqAB4BE/T+6G0eKzn1AIQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# The diagonal is constrained to zero, i.e, (near-)orthogonality constraint.\n", + "# This is an approximation to fully emphasized denoising.\n", + "# As an (indirect) consequence, the learned embeddings of similar items are NOT similar, \n", + "# i.e., their cosine-similarities are close to 0, especially in the learned encoder-matrix UU.\n", + "\n", + "UU_0diag_cosine = calc_cosineSimilarity(UU_0diag, ixSims) # embeddings UU_0diag, VV_0diag computed above\n", + "VV_0diag_cosine = calc_cosineSimilarity(VV_0diag, ixSims)\n", + "\n", + "plt.hist(UU_0diag_cosine, bins=20, range=(0.0,1.0), density=True,histtype='step', color='blue', linewidth=2.0, linestyle='-')\n", + "plt.hist(VV_0diag_cosine, bins=20, range=(0.0,1.0), density=True,histtype='step', color='red', linewidth=2.0, linestyle='--')\n", + "plt.xlabel('cosine-similarity')\n", + "plt.ylabel('density')" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'density')" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAEICAYAAABVv+9nAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXOUlEQVR4nO3de5AsZXnH8d9PwHjDGD0HiijHI5SoSER3RpTCGMWgqAjRoGKBRorsHC8xJl4SMZRiUrkYS6JGE3eO4FEUxUtQwiVqgUhUbjOHq6ApQFQuhkMMCmKQy5M/umd3dtnd6bM97/RMz/dTNXW6Z3q63z67+8w7T7/v044IAQDq50FVNwAAkAYBHgBqigAPADVFgAeAmiLAA0BNEeABoKZ2TLlz2zdIukPSfZLujYhmyuMBABYkDfC550fEbUU2XLduXWzcuDFxcwCgPrrd7m0RsX6510YR4AvbuHGjOp1O1c0AgIlh+0crvZY6Bx+Svm67a7uV+FgAgD6pe/DPiYibbO8i6Ru2vx8R5/dvkAf+liRt2LAhcXMAYHok7cFHxE35v7dKOk3Sfsts046IZkQ0169fNo0EAFiDZAHe9sNt79xblvRCSVelOh4AYLGUKZpdJZ1mu3ecUyLiPxIeDwDQJ1mAj4jrJe2bav8AgNUxkxUAaooADwA1RYAHgJLshceilSWPltvzqy23l7xx+AjwAFBTY1WqAAAmUUcNSVIjusom8C+vnT8y+RzPRL13iQAPAKU1tLXqJiyLFA0A1BQBHgBqihQNAFTIec5+5cz92tGDB4CaIsADQE2RogGAktqalZTf2GI79YZYSt2htaeHAA8AJW3KR7evJcCnHGJJigYAaooePACUNDOfXmmsut2oEeABoKSumvlSisGOa0eKBgBqigAPADVFigYAKlRmiOUgBHgAqFCZIZaDkKIBgJqiBw8AFUo5xJIADwAlNdSRtLZiAymHWBLgAaCkrWM2wamHHDwA1BQ9eAAoaW5+DEx71e1GjQAPACW1tDlfGq8AT4oGAGqKAA8ANUWKBgAqVGaI5SAEeACoUMohlqRoAKCm6MEDQEldzUhaW7GBlEMsCfAAUFIzz6CvpdhAyiGWyVM0tnewfantM1IfCwCwYBQ5+LdKumYExwEA9Eka4G0/TtJLJX0i5XEAoEohK+Sqm/EAqXvwH5L0F5LuT3wcAMASyQK87UMk3RoRq47ft92y3bHd2bZtW6rmAMDUSdmDP0DSobZvkPR5SQfa/szSjSKiHRHNiGiuX78+YXMAYPx0NTM/zHLYkg2TjIhjJR0rSbafJ+kdEXFUquMBwCQqM8RyEGayAkBNjWSiU0ScJ+m8URwLAJBhJisAlNTSnKS1zUVdGF7JTbcBYOxszuvJjNf9nMjBA0Bt0YMHgJJm5/vurVW3GzUCPACU1NamfGm8AjwpGgCoKQI8ANQUKRoAqFCZIZaDEOABoEIph1iSogGAmqIHDwAVSjnEkgAPACU5LzOwlmIDKYdYkqIBgJoiwANATZGiAYCSOmrkS6veoXTkCPAAUFJDW6tuwrJI0QBATRHgAaCmSNEAQIXKDLEchB48ANQUAR4AaooUDQCU1NaspLXNRU05xJIADwAlbcrryawlwKccYkmKBgBqih48AJQ0M59eaay63agR4AGgpK6a+VKKwY5rR4oGAGqKAA8ANUWKBgAqVGaI5SAEeACoUJkhloOQogGAmqIHDwAVSjnEkgAPACU11JG0tmIDKYdYEuABoKStYzbBqSdZDt72Q2xfbPty29+z/b5UxwIAPFDKHvzdkg6MiDtt7yTp27bPjogLEx4TAEZubn4MTLvSdiyVLMBHREi6M1/dKX+M1zxeABiCljbnS+MV4JMOk7S9g+3LJN0q6RsRcVHK4wEAFiQN8BFxX0Q8XdLjJO1ne5+l29hu2e7Y7mzbti1lcwBgqoxkolNE3C7pm5IOXua1dkQ0I6K5fv36UTQHAMZGQ535YZbDliwHb3u9pHsi4nbbD5V0kKT3pzoeAEyilEMsU46i2U3Sp2zvoOybwhci4oyExwMA9CkU4G2/TNKZEXF/0R1HxBWSnrHWhgHApOhqRtLaig2kHGLpbDTjgI3sz0jaX9KXJZ0UEd8fekskNZvN6HTS5KIAIBU7+7dAOB3ymyXb3YhoLvdaoYusEXGUst74dZK22L4gH/2y85paBABIrvAomoj4haQvSfq8svz6yyVttf2WRG0DAJRQKMDbPsz2aZLOUzYjdb+IeLGkfSW9PV3zAGD8hayQq27GAxQdRfM
KSf8UEef3PxkRd9k+ZvjNAgCUVTRF89Olwd32+yUpIs4ZeqsAAKUVDfAHLfPci4fZEACYRl3NzA+zHLZVUzS23yjpTZL2tH1F30s7S/pOkhYBwBRp5veBSlFqd1AO/hRJZ0v6e0nv6nv+joj4WYL2AACGZFCAj4i4wfabl75g+9EEeQAYX0V68Icou5dsSIvGAYWkPRK1CwAmRktzktZWbGBheOWIb7odEYfk/z5h6EcGgJrYnNeTGa/7ORWf6HSA7Yfny0fZPsH2hrRNAwCUUXSY5L9Kust2b+bqdZJOTtYqAJggs2prduz678Vnst4bEWH7MEkfjYgTmcEKAJm2NuVLrVW3G7WiAf4O28dKOkrSc20/SFlNGgDAmCqaonm1pLslHRMRP1V2E+0PJGsVAKC0Qj34PKif0Lf+Y0mfTtUoAJgWZYZYDlL0ln2vUHbD7F2UjYW3sklQj0zQJgCYGimHWBbNwf+jpJdFxDUJ2gAASKBogP9vgjsADN/C8Mrhj8ApGuA7tk+V9BVlF1slSRHxb0NvEQBMGOdlBtZSbCDlEMuiAf6Rku6S9MK+50ISAR4AxlTRUTRHp24IAGC4itai2cv2ObavytefZvu4tE0DgMnQUUMdNapuxgMUnei0WdKxku6RpIi4QtIRqRoFAJOkoa1qaGvVzXiAogH+YRFx8ZLn7h12YwAAw1M0wN9me0/lF4ltHy7plmStAoARs6WGu9nCCo+Gu/OrbbcWXhtTRUfRvFnZRKsn275J0g8lHZmsVQAwJcoMsRy474iVd2v7bUueeqiyXv8vJSkiTnjAm0poNpvR6XSGuUsAKKTXEV8lJI7lcW13I6K53GuDevA75/8+SdIzJX1VWR2a10pampMHgIk1Nz/RaPxu3LFWg+7J+j5Jsn2+pJmIuCNfP17SmclbBwAj0tLmfGm0AX5heGV36PsumoPfVdKv+9Z/nT8HACgh5fDKogH+05Iutn1avv4HkrakaBAAYDiKlir4W9tnS/rd/KmjI+LS1d5je3dlHwy7KrtA3I6ID5dpLACguKI9eEXEVmm7vkvcK+ntEbHV9s6Sura/ERFXb28jAQDbr+hEp+0WEbfkHwrKL85eI+mxqY4HAFiscA++DNsbJT1D0kWjOB4AbK+uZiRpDEuGrV3yAG/7EZK+LOnPIuIXy7zeUl7pfsOGDambAwDLaubDFEc8z0ltzUpKcbuPATNZS+/c3knSGZK+VmTWKzNZAVSljjNZk+XgbVvSiZKuGXZJAwDAYClTNAcoK2lwpe3L8ufeHRFnJTwmAKxJyPNLozQzP4N1+Nn/ZAE+Ir4taXzraALAGOiql10Z/gdLshQNAKBaBHgAqCkCPADUFAEeAGqKAA8ANTWSUgUAMO5ampNUp/s5EeABQJK0OS8WMOoA31A2e3/493MiwANApbYmLG9GgAcASbPzffcUZb+qQYAHAEltbcqXRhvg5+aPN/zkEAEeACrU0uZ8afgBnmGSAFBTBHgAqCkCPADUFDl4ALXhEgXKR32rvlGgBw8ANUUPHkDtrO3+ptX04buakZTifk704AHUSMjZrfecP7p9BQBarYXnlz4a6WaTDtJUV80khQoI8ABQW6RoANTPcjmadjt7TBECPABUKOT5pWEjRQMANUUPHkBtpKytPokI8ABqI2Vt9UlEigYAaooePIDaSFlbfRIR4AHURsra6pOIAA8AFWppTlKajyQCPABUaHOeVkoR4LnICmBs9JeHabm9cu0Ye9Fq141ytYJrih48gLExm/djN4/4xtdVmp3vuw//nB1rq6uZRLPZjE6nU3UzAFSl1wsfo7iUXMlztt2NiOZyr5GiAYCaIsADQE0lC/C2T7J9q+2rUh0DALCylD34LZIOTrh/AMAqko2iiYjzbW9MtX8A46nMaMUpurQ6EuTgAaCmKg/wtlu2O7Y727Ztq7o5AErq3fg6ZEWnq4hsBGDMtha/1v+YaWSjBHsbT5He/0IKlQf4iGhHRDMimuvXr6+6OQBQG8xkBZDG0p74FN70umoph0l+TtIFkp5k+0bbx6Q6Vimr1LpY9MvYXqUuBgCsUUcNdRLdiSrlKJrXpNo3ANRFQ1uT7ZsUTdELOq1W9uhH7x3AGCPAlzFlV/uBIhrKCgZ2K24HCPCVd8L5jMBYWvqH0f+L2mhIW1dIK8zOaiu3yxsblQ+TrFrZCxwdNVYc2zvXV995Rt1ltwEmwaKba6ySMm5vXvk1jN7U9+B7FzjW3JNuSCtdI2nNSq1eZ6YradmKzcD4Wm4CTpPky8SY+gBfWrfgL3ujsfhTpOrcEFAAKcT02pqVlOJ+TqRogPqy1XVj8ZSNVeZ9tNyeX71Zu1Xd+qmxSW1tSnTdggBfkUae/V9tnlWRB7CatY6xPl7HD7chqAQBviJb1dBWNTSnlQswLb34y0VarMV8sa9YsrLk0Y7W/Go7WuRnRmRGXc0kuq5BDr4i8387LUkrjDxozEjR/3Mnnk+Vst/QCM+ToTs/+mL4P7Gp78G3NTt/kaOaBrRX7lUtvYC7qCuGabLSMNveo78H2PtWCEx9gE95gSO1m7XbfC6+5VWKodmLxzG7QQJ/O5S6RrI9P48lj95cigip21m9jd3Owmd/q8L+CsbL1Af4SbbWC2G9C29lL/BO64Xh1XrSs32dhdkhdBxavfxdb5jtSo9G3/Wapd8KMbWmPsCnvMCR1EoXxVZ4LPf3vr2zbod1cbeKD4gyx+vNdi4SL9tzC//P7bm+F1b5eTSiu3rwBtZo6gN8V82+ixzTqzXbl/pfLR1wyCFSp1P4Fmzzj+3o9Q76YBmU2pCttlvzqw1vf/66/9HIxzzNWy0Y91ccbbUI0qgUo2im0aCAs3TWbWLtOandi4ttSZtGdmhJ+QdaL8OxyqgmYNI4xqh30Ww2o9MZcDVp2Hrf+cfo/wHA9Gg4+wbZjbUVPbTdjYhl0xD04AGgQlsT3a5PqmMOvrFyXnZRfrTbrceQDgBYAT14AKjQwgi24c/HqUcOvkQenRQ8gEqVDEKr5eDrl6IBAEgiwANAbRHgAaCmCPAAUFMEeACoKYZJAkCFupqRpCTTneoR4OfmBm8DAGOomRe7SzFSuxYB3pvyiQIjLlIFAOOMHDwA1FQtevC9euLtaA3YEgDGy8INdIafpKlFgG/P52YI8ADQQ4oGAGqKAA8ANZU0wNs+2PYPbF9r+10pjwUAWCxZgLe9g6SPSXqxpL0lvcb23qmOBwBYLGUPfj9J10bE9RHxa0mfl3RYwuMBAPqkHEXzWEk/6Vu/UdKzEh4PACZOS9lM/OHfz2kMhknabmlhfOOdtn+wxl2tk33bkJo1KdZJ4pzrbdrOV5rSc97sTWs958ev9ELKAH+TpN371h+XP7dIRLQ1hA8v252VbltVV5xz/U3b+Uqc8zClzMFfIumJtp9g+8GSjpB0esLjAQD6JOvBR8S9tv9E0tck7SDppIj4XqrjAQAWS5qDj4izJJ2V8hh9UlyjGHecc/1N2/lKnPPQOCJFFWIAQNUoVQAANTVxAX5Q+QPbv2H71Pz1i2xvrK
CZQ1PgfN9m+2rbV9g+x/aKQ6YmRdESF7b/0HbYnvgRF0XO2far8p/192yfMuo2DluB3+0Ntr9p+9L89/slVbRzWGyfZPtW21et8LptfyT//7jC9kzpg0bExDyUXay9TtIekh4s6XJJey/Z5k2SPp4vHyHp1Krbnfh8ny/pYfnyGyf5fIuec77dzpLOl3ShpGbV7R7Bz/mJki6V9Fv5+i5Vt3sE59yW9MZ8eW9JN1Td7pLn/FxJM5KuWuH1l0g6W5IlPVvSRWWPOWk9+CLlDw6T9Kl8+UuSXmDbmkwDzzcivhkRd+WrFyqbbzDJipa4+BtJ75f0f6NsXCJFznlW0sci4n8lKSJuHXEbh63IOYekR+bLvynp5hG2b+gi4nxJP1tlk8MkfToyF0p6lO3dyhxz0gL8cuUPHrvSNhFxr6SfS3rMSFo3fEXOt98xynoAk2zgOedfXXePiDNH2bCEivyc95K0l+3v2L7Q9sEja10aRc75eElH2b5R2Wi8t4ymaZXZ3r/3gSovVYDhsH2UpKak36u6LSnZfpCkEyS9vuKmjNqOytI0z1P2Le18278TEbdX2ajEXiNpS0R80Pb+kk62vU9E3F91wybFpPXgi5Q/mN/G9o7Kvtr9z0haN3yFyj3Y/n1JfyXp0Ii4e0RtS2XQOe8saR9J59m+QVmu8vQJv9Ba5Od8o6TTI+KeiPihpP9SFvAnVZFzPkbSFyQpIi6Q9BBldWrqqtDf+/aYtABfpPzB6ZL+KF8+XNK5kV/BmEADz9f2MyTNKQvuk56XlQacc0T8PCLWRcTGiNio7LrDoRHRqaa5Q1Hk9/orynrvsr1OWcrm+hG2cdiKnPOPJb1Akmw/RVmA3zbSVo7W6ZJel4+mebakn0fELWV2OFEpmlih/IHtv5bUiYjTJZ2o7KvctcouaBxRXYvLKXi+H5D0CElfzK8l/zgiDq2s0SUVPOdaKXjOX5P0QttXS7pP0jsjYlK/mRY957dL2mz7z5VdcH39BHfWZPtzyj6k1+XXFd4raSdJioiPK7vO8BJJ10q6S9LRpY85wf9fAIBVTFqKBgBQEAEeAGqKAA8ANUWAB4CaIsADQE0R4DHxbDdtfyTxMb67ndu/wfbr8uUttg8v8f7X2/7t7Xk/IDFMEkjO9hZJZ0TElwpuv2NeR6m3fp6kd0z4ZC5UgB48xoLt1+U1sC+3fbLtjbbP7atzvyHf7pW2r8q3Oz9/7nm2z8iXj8/rbp9n+3rbf9p3jKNsX2z7MttztndYph1P7dvmCttPzJ+/s+9Y37L91Xz//2D7yPw9V9res68d71hm/++xfUl+Du1epdO8vR+y3ZH01t77855/U9Jn8za91PZX+vZ3kO3ThvVzQL0Q4FE520+VdJykAyNiX0lvlfTPkj4VEU+T9FlJvRTMeyS9KN9upRm7T5b0ImUlad9re6d8qvurJR0QEU9XNhv0yGXe+wZJH863aSqrAbPUvvl2T5H0Wkl7RcR+kj6hwRUPPxoRz4yIfSQ9VNIhfa89OCKaEfHB3hN5r78j6ci8TWdJerLt9fkmR0s6acAxMaUI8BgHB0r6YkTcJkkR8TNJ+0vq3bXoZEnPyZe/I2mL7VllU9yXc2ZE3J3v71ZJuyqradKQdInty/L1PZZ57wWS3m37LyU9PiJ+tcw2l0TELXlht+skfT1//kpJGwec6/Od3Wnsyvy8n9r32qkD3qt8qv7JysroPkrZ/9Okl4hGIhNViwaIiDfYfpakl0rq2m4ss1l/Rc37lP2eW9k3gmP7N7T9cmU1QSTpjyPiFNsX5fs/y/amiDh3lf3f37d+v1b5m7L9EEn/ouwOVD+xfbyyAlo9v1zpvUt8UtK/K7vZyRf78/VAP3rwGAfnSnql7cdIku1HS/quFgrFHSnpP/PX9oyIiyLiPcoqC+6+zP6Wc46kw23v0juG7cdHxGkR8fT80bG9h6TrI+Ijkr4q6WnDOkktBPPbbD9CWbXTIu5QViZZkhQRNyu7u9FxyoI9sCx68KhcXkXwbyV9y/Z9yu49+hZJn7T9TmWBvFdZ7wP5hU8rC9qXq8BNTiLiatvHSfq6s5uG3CPpzZJ+tGTTV0l6re17JP1U0t+VPsGFNtxue7Okq/J9X1LwrVskfdz2ryTtn6eNPitpfURcM6z2oX4YJglMINsflXRpRJxYdVswvgjwwISx3VWWrz+oBnfwQkIEeACoKS6yAkBNEeABoKYI8ABQUwR4AKgpAjwA1BQBHgBq6v8Bu4ZHvMvT4Z8AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Unconstrained diagonal, i.e., only L2 norm regularization, like in denoising autoencoder. \n", + "# As a consequence, similar items have similar embeddings, as indicated by cosine-similarities close to 1.\n", + "\n", + "UU_freediag_cosine = calc_cosineSimilarity(UU_freediag, ixSims) # embeddings UU_freediag, VV_freediag computed above\n", + "VV_freediag_cosine = calc_cosineSimilarity(VV_freediag, ixSims)\n", + "\n", + "plt.hist(UU_freediag_cosine, bins=20, range=(0.0,1.0), density=True,histtype='step', color='blue', linewidth=2.0, linestyle='-')\n", + "plt.hist(VV_freediag_cosine, bins=20, range=(0.0,1.0), density=True,histtype='step', color='red', linewidth=2.0, linestyle='--')\n", + "plt.xlabel('cosine-similarity')\n", + "plt.ylabel('density')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..54443df --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +# Autoencoders that don't overfit towards the Identity + +This notebook provides an implementation in Python 3.7.7 (and Tensorflow 1.15.0) of the algorithms outlined in the paper +"Autoencoders that don't overfit towards the Identity" +at the 34th Conference on Neural Information Processing Systems (NeurIPS 2020). + +For reproducibility, the experiments utilize publicly available [code](https://github.com/dawenl/vae_cf) for pre-processing three popular data-sets and for evaluating the learned models. That code accompanies the paper "[Variational autoencoders for collaborative filtering](https://arxiv.org/abs/1802.05814)" by Dawen Liang et al. at The Web Conference 2018. While the code for the Movielens-20M data-set was made publicly available, the code for pre-processing the other two data-sets can easily be obtained by modifying their code as described in their paper. +The experiments were run on an AWS instance with 128 GB RAM and 16 vCPUs.