[ADD] data processing code

yehjin-shin committed Sep 14, 2024
1 parent 1dc3d69 commit f018249
Showing 6 changed files with 374 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -5,6 +5,11 @@ This is the official source code for our AAAI 2024 Paper ["An Attentive Inductiv
Beyond Self-Attention for Sequential Recommendation (BSARec) leverages the Fourier transform to strike a balance between our inductive bias and self-attention.
![BSARec](fig/model_architecture.png)

## Updates
- (Sep 14, 2024) add data processing code
- (Apr 20, 2024) rename variable 'beta' to 'sqrt_beta'
- (Apr 16, 2024) add visualization code for figure 3

## Dataset
In our experiments, we utilize six datasets, all stored in the `src/data` folder.
- For the Beauty, Sports, Toys, and Yelp datasets, we employed the datasets downloaded from [this repository](https://github.com/Woeee/FMLP-Rec).
26 changes: 26 additions & 0 deletions src/data/process/README.md
@@ -0,0 +1,26 @@
# Processing Dataset

This directory contains the code for downloading the raw data and transforming it into the processed datasets. Follow the instructions below to run the processing script and enter the name of the dataset you want to process; to process all datasets, enter `all`.

## Instructions

1. Run the following command to start the data processing:

```sh
sh process.sh
```

2. You will see the following message with a list of available datasets:

```sh
Available datasets: ['Beauty', 'Sports_and_Outdoors', 'Toys_and_Games', 'LastFM', 'ML-1M']
Enter the name of the dataset (or type 'all' to process all datasets):
```

3. Enter the name of the dataset you want to process, or type `all` to process all datasets.
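
`process.sh` simply runs the download and transform stages in order, so a single dataset can also be processed without the interactive prompt. A minimal sketch (run from `src/data/process`; `Beauty` is just an example name):

```sh
# download the raw reviews into ./raw and write the processed file to ../Beauty.txt
sh _download.sh Beauty
python _transform.py Beauty
```

The processed file is written to `src/data/<dataset>.txt`, with one line per user of the form `user item1 item2 ...`, where user and item IDs have been remapped to consecutive integers starting at 1.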

## Acknowledgement

The code base used for processing datasets is available at: [CIKM2020-S3Rec](https://github.com/RUCAIBox/CIKM2020-S3Rec/tree/master).

Please note that we could not reproduce the Yelp results because the raw Yelp data used in that code base could no longer be located. For our experiments, we therefore used the pre-processed Yelp data provided in the code base. We will update this document if we succeed in reproducing the Yelp processing.
45 changes: 45 additions & 0 deletions src/data/process/_download.sh
@@ -0,0 +1,45 @@
#!/bin/bash

DATASET_NAME="$1"
mkdir -p raw
cd raw

if [ "$DATASET_NAME" = "all" ]; then
cd ..
sh _download.sh Beauty
sh _download.sh Sports_and_Outdoors
sh _download.sh Toys_and_Games
sh _download.sh LastFM
sh _download.sh ML-1M
# sh _download.sh Yelp

elif [ "$DATASET_NAME" = "Beauty" ]; then
wget https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Beauty_5.json.gz -O Beauty.json.gz
gzip -d Beauty.json.gz

elif [ "$DATASET_NAME" = "Sports_and_Outdoors" ]; then
wget https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz -O Sports_and_Outdoors.json.gz
gzip -d Sports_and_Outdoors.json.gz

elif [ "$DATASET_NAME" = "Toys_and_Games" ]; then
wget https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Toys_and_Games_5.json.gz -O Toys_and_Games.json.gz
gzip -d Toys_and_Games.json.gz

elif [ "$DATASET_NAME" = "LastFM" ]; then
wget https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip -O ./LastFM.zip
mkdir -p LastFM
unzip LastFM.zip -d ./LastFM
rm LastFM.zip

elif [ "$DATASET_NAME" = "ML-1M" ]; then
wget https://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ML-1M.zip
unzip ./ML-1M.zip
rm ./ML-1M.zip

else
echo "Invalid dataset name"
echo "Available datasets: ['Beauty', 'Sports_and_Outdoors', 'Toys_and_Games', 'LastFM', 'ML-1M']" # TODO: Yelp
echo "To download all datasets at once, enter 'all' as the dataset name."
echo ""
fi

151 changes: 151 additions & 0 deletions src/data/process/_transform.py
@@ -0,0 +1,151 @@
# -*- coding: utf-8 -*-
# @Time : 2020/4/4 8:18
# @Author : Hui Wang

from collections import defaultdict
import random
import numpy as np
import pandas as pd
import json
import pickle
import gzip
import sys
import tqdm
import _utils as dutils
import datetime
import matplotlib.pyplot as plt

# return (user item timestamp) sort in get_interaction
def Amazon(dataset_name, rating_score):
'''
reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
asin - ID of the product, e.g. 0000013714
reviewerName - name of the reviewer
helpful - helpfulness rating of the review, e.g. 2/3
--"helpful": [2, 3],
reviewText - text of the review
--"reviewText": "I bought this for my husband who plays the piano. ..."
overall - rating of the product
--"overall": 5.0,
summary - summary of the review
--"summary": "Heavenly Highway Hymns",
unixReviewTime - time of the review (unix time)
--"unixReviewTime": 1252800000,
reviewTime - time of the review (raw)
--"reviewTime": "09 13, 2009"
'''
datas = []
data_file = './raw/{}.json'.format(dataset_name)
lines = open(data_file).readlines()
for line in tqdm.tqdm(lines):
inter = json.loads(line.strip())
if float(inter['overall']) <= rating_score: # skip interactions with a rating at or below the threshold
continue
user = inter['reviewerID']
item = inter['asin']
time = inter['unixReviewTime']
# "reviewTime": "09 13, 2009"
datas.append((user, item, int(time)))

return datas

def ML1M():
datas = []
filepath = './raw/ml-1m/ratings.dat'
# import pdb; pdb.set_trace()
df = pd.read_csv(filepath, delimiter='::', header=None, engine='python', names=['user', 'item', 'rating', 'timestamp'])
df = df[['user', 'item', 'timestamp']]

for i in tqdm.tqdm(range(len(df))):
datas.append(tuple(df.iloc[i]))

return datas

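# Yelp reviews: keep only reviews within [date_min, date_max] whose star rating
# exceeds rating_score, and convert the review date to a unix timestamp.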
def Yelp(date_min, date_max, rating_score):
datas = []
data_file = './raw/yelp_academic_dataset_review.json'
years = []
lines = open(data_file).readlines()
for line in tqdm.tqdm(lines):
review = json.loads(line.strip())
user = review['user_id']
item = review['business_id']
rating = review['stars']
date = review['date']
# keep only reviews within [date_min, date_max] and with rating above rating_score
years.append(int(date.split('-')[0]))
if date < date_min or date > date_max or float(rating) <= rating_score:
continue
time = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
datas.append((user, item, int(time.timestamp())))

return datas

def LastFM():
datas = []
data_file = './raw/LastFM/user_taggedartists-timestamps.dat'
lines = open(data_file).readlines()

for line in tqdm.tqdm(lines[1:]):
user, item, _, timestamp = line.strip().split('\t')
datas.append((user, item, int(timestamp)))

return datas

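# End-to-end processing for one dataset: load the raw interactions, build
# per-user time-sorted sequences and time gaps, apply iterative 5-core
# filtering, remap user/item IDs, print dataset statistics, and write
# ../<data_name>.txt with one "user item1 item2 ..." line per user.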
def process(data_name='Beauty'):
np.random.seed(12345)
rating_score = 0.0 # interactions with a rating at or below this score are discarded
# user 5-core item 5-core
user_core = 5
item_core = 5

if data_name in ["Sports_and_Outdoors", "Toys_and_Games", "Beauty"]:
datas = Amazon(data_name, rating_score)
elif data_name == 'Yelp':
date_max = '2019-12-31 00:00:00'
date_min = '2019-01-01 00:00:00'
datas = Yelp(date_min, date_max, rating_score)
elif data_name == 'ML-1M':
datas = ML1M()
elif data_name == 'LastFM':
datas = LastFM()

user_items, time_interval = dutils.get_interaction(datas, data_name)
print(f'{data_name} Raw data has been processed!')
# raw_id user: [item1, item2, item3...]
user_items, time_interval = dutils.filter_Kcore(user_items, time_interval, user_core=user_core, item_core=item_core)
print(f'User {user_core}-core complete! Item {item_core}-core complete!')
user_items, time_interval, user_num, item_num, data_maps = dutils.id_map(user_items, time_interval) # new_num_id

avg_seqlen = np.mean([len(seq) for seq in user_items.values()])
user_count, item_count, _ = dutils.check_Kcore(user_items, user_core=user_core, item_core=item_core)
user_count_list = list(user_count.values())

user_avg, user_min, user_max = np.mean(user_count_list), np.min(user_count_list), np.max(user_count_list)
item_count_list = list(item_count.values())
item_avg, item_min, item_max = np.mean(item_count_list), np.min(item_count_list), np.max(item_count_list)
interact_num = np.sum([x for x in user_count_list])
sparsity = (1 - interact_num / (user_num * item_num)) * 100
show_info = f'\n====={data_name}=====\n' + \
f'Total User: {user_num}, Avg User: {user_avg:.2f}, Min Len: {user_min}, Max Len: {user_max}\n' + \
f'Total Item: {item_num}, Avg Item: {item_avg:.2f}, Min Inter: {item_min}, Max Inter: {item_max}\n' + \
f'Interaction Num: {interact_num}, Avg Sequence Length: {avg_seqlen:.1f}, Sparsity: {sparsity:.2f}%'
print(show_info)

item_file = '../'+ data_name + '.txt'
with open(item_file, 'w') as out:
for user, items in user_items.items():
out.write(user + ' ' + ' '.join(items) + '\n')

if __name__ == "__main__":
dataname = sys.argv[1]
available_datasets = ['Beauty', 'Sports_and_Outdoors', 'Toys_and_Games', 'LastFM', 'ML-1M'] # TODO: Yelp
if dataname == 'all':
for name in available_datasets:
process(name)
elif dataname in available_datasets:
process(dataname)
else:
print('Invalid dataset name')
print(f"Available datasets: {available_datasets}")
print("To transform all datasets at once, enter 'all' as the dataset name.")
137 changes: 137 additions & 0 deletions src/data/process/_utils.py
@@ -0,0 +1,137 @@
from collections import defaultdict
import random
import numpy as np
import pandas as pd
import json
import pickle
import gzip
import tqdm

def add_comma(num):
# 1000000 -> 1,000,000
str_num = str(num)
res_num = ''
for i in range(len(str_num)):
res_num += str_num[i]
if (len(str_num)-i-1) % 3 == 0:
res_num += ','
return res_num[:-1]

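# Group (user, item, timestamp) triples into per-user item sequences sorted by
# timestamp. Also returns, per user, the gap between consecutive interaction
# timestamps (the first gap is 0). For LastFM, duplicate (user, item) pairs are
# dropped, keeping only the first occurrence.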
def get_interaction(datas, data_name):
if data_name == 'LastFM':
# LastFM has repeated (user, item) interactions; keep only each user's first occurrence of an item
user_seq = {}
user_seq_notime = {}
for data in datas:
user, item, time = data
if user in user_seq:
if item not in user_seq_notime[user]:
user_seq[user].append((item, time))
user_seq_notime[user].append(item)
else:
continue
else:
user_seq[user] = []
user_seq_notime[user] = []

user_seq[user].append((item, time))
user_seq_notime[user].append(item)
else:
user_seq = {}
for data in datas:
user, item, time = data
if user in user_seq:
user_seq[user].append((item, time))
else:
user_seq[user] = []
user_seq[user].append((item, time))

time_interval = {}
for user, item_time in user_seq.items():
item_time.sort(key=lambda x: x[1]) # sort each user's interactions by timestamp
items = []
times = []
for i in range(len(item_time)):
item = item_time[i][0]
curr_time = item_time[i][1]
prev_time = item_time[i-1][1] if i > 0 else item_time[i][1]
items.append(item)
times.append(curr_time-prev_time)

user_seq[user] = items
time_interval[user] = times

return user_seq, time_interval

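# Remap raw user/item IDs to consecutive string IDs starting from "1"; returns
# the remapped sequences, the remapped time gaps, the user and item counts, and
# the forward/backward lookup tables.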
def id_map(user_items, time_interval): # user_items dict

user2id = {} # raw 2 uid
item2id = {} # raw 2 iid
id2user = {} # uid 2 raw
id2item = {} # iid 2 raw
user_id = 1
item_id = 1
final_data = {}
for user, items in user_items.items():
if user not in user2id:
user2id[user] = str(user_id)
id2user[str(user_id)] = user
user_id += 1
iids = [] # item id lists
for item in items:
if item not in item2id:
item2id[item] = str(item_id)
id2item[str(item_id)] = item
item_id += 1
iids.append(item2id[item])
uid = user2id[user]
final_data[uid] = iids

final_delta = {}
for uid in id2user.keys():
final_delta[uid] = time_interval[id2user[uid]]

data_maps = {
'user2id': user2id,
'item2id': item2id,
'id2user': id2user,
'id2item': id2item
}
return final_data, final_delta, user_id-1, item_id-1, data_maps

# Iteratively filter users and items until both satisfy the K-core condition;
# when an item is removed from a sequence, its time gap is merged into the next interaction's gap.
def filter_Kcore(user_items, time_interval, user_core, item_core): # User to all items
user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
while not isKcore:
for user, num in user_count.items():
if user_count[user] < user_core: # Remove the user directly.
user_items.pop(user)
time_interval.pop(user)
else:
for item in user_items[user]:
if item_count[item] < item_core:
idx = user_items[user].index(item)
user_items[user].remove(item)
if idx != len(user_items[user]):
time_interval[user][idx+1] += time_interval[user][idx]
time_interval[user].pop(idx)
time_interval[user][0] = 0
user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
return user_items, time_interval

# Count interactions per user and per item and check whether the user_core / item_core conditions hold.
def check_Kcore(user_items, user_core, item_core):
user_count = defaultdict(int)
item_count = defaultdict(int)
for user, items in user_items.items():
for item in items:
user_count[user] += 1
item_count[item] += 1

for user, num in user_count.items():
if num < user_core:
return user_count, item_count, False
for item, num in item_count.items():
if num < item_core:
return user_count, item_count, False
return user_count, item_count, True # Kcore is guaranteed.
10 changes: 10 additions & 0 deletions src/data/process/process.sh
@@ -0,0 +1,10 @@
#!/bin/bash
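# The prompt below reads from stdin, so the script can also be run
# non-interactively, e.g.: echo Beauty | sh process.sh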

available_datasets="Beauty Sports_and_Outdoors Toys_and_Games Yelp LastFM ML-1M"

echo "Available datasets: ['Beauty', 'Sports_and_Outdoors', 'Toys_and_Games', 'LastFM', 'ML-1M']" # TODO: Yelp
echo "Enter the name of the dataset (or type 'all' to process all datasets):"
read dataname

sh _download.sh "$dataname"
python _transform.py "$dataname"
