From 0f191de6c1e8ada93c6ae95c46f8a4034f9509b8 Mon Sep 17 00:00:00 2001 From: yujunhao <772986150@qq.com> Date: Fri, 9 Aug 2024 09:29:42 +0800 Subject: [PATCH] Update data preprocess example --- ...{assistment.ipynb => assistment2009.ipynb} | 72 ++++++++++--------- 1 file changed, 39 insertions(+), 33 deletions(-) rename scripts/dataset/{assistment.ipynb => assistment2009.ipynb} (91%) diff --git a/scripts/dataset/assistment.ipynb b/scripts/dataset/assistment2009.ipynb similarity index 91% rename from scripts/dataset/assistment.ipynb rename to scripts/dataset/assistment2009.ipynb index 030683d..06ce919 100644 --- a/scripts/dataset/assistment.ipynb +++ b/scripts/dataset/assistment2009.ipynb @@ -26,8 +26,10 @@ " print('Total length: {}'.format(len(data)))\n", " elif isinstance(key, str):\n", " print('Number of unique {}: {}'.format(key, len(data[key].unique())))\n", + " return len(data[key].unique())\n", " elif isinstance(key, list):\n", - " print('Number of unique [{}]: {}'.format(','.join(key), len(data.drop_duplicates(key, keep='first'))))" + " print('Number of unique [{}]: {}'.format(','.join(key), len(data.drop_duplicates(key, keep='first'))))\n", + " return len(data.drop_duplicates(key, keep='first'))" ] }, { @@ -251,8 +253,7 @@ } ], "source": [ - "data_path ='../../data/assistment/'\n", - "raw_data = pd.read_csv('../../data/assistment/assistment.csv', encoding = 'utf-8', dtype={'skill_id': str})\n", + "raw_data = pd.read_csv('../../data/assistment/assistment2009.csv', encoding = 'utf-8', dtype={'skill_id': str})\n", "raw_data.head()" ] }, @@ -289,12 +290,10 @@ ], "source": [ "stat_unique(all_data, None)\n", - "stat_unique(all_data, ['student_id', 'question_id'])\n", - "stat_unique(all_data, 'student_id')\n", - "stat_unique(all_data, 'question_id')\n", - "stat_unique(all_data, 'knowledge_id')\n", - "ques_num = len(all_data['question_id'].unique())\n", - "know_num = len(all_data['knowledge_id'].unique())" + "a=stat_unique(all_data, ['student_id', 'question_id'])\n", + "b=stat_unique(all_data, 'student_id')\n", + "c=stat_unique(all_data, 'question_id')\n", + "d=stat_unique(all_data, 'knowledge_id')" ] }, { @@ -343,14 +342,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "filter 1471 students\n" + "filter 2120 students\n" ] } ], "source": [ "# filter students\n", "n_questions = selected_data.groupby('student_id')['question_id'].count()\n", - "student_filter = n_questions[n_questions < 10].index.tolist()\n", + "student_filter = n_questions[n_questions < 20].index.tolist()\n", "print(f'filter {len(student_filter)} students')\n", "selected_data = selected_data[~selected_data['student_id'].isin(student_filter)]" ] @@ -366,7 +365,7 @@ "table = selected_data.loc[:, ['question_id', 'knowledge_id']].drop_duplicates()\n", "for i, row in table.iterrows():\n", " q = row['question_id']\n", - " q2k[q] = set(map(int, str(row['knowledge_id']).split('_')))\n", + " q2k[q] = set(map(int, str(int(float(row['knowledge_id']))).split('_')))\n", " \n", "# get knowledge to question map\n", "k2q = {}\n", @@ -391,6 +390,7 @@ ], "source": [ "# filter knowledges\n", + "#selected_knowledges = { k for k, q in k2q.items()}\n", "selected_knowledges = { k for k, q in k2q.items() if len(q) >= 10}\n", "print(f'filter {len(k2q) - len(selected_knowledges)} knowledges')" ] @@ -463,10 +463,10 @@ "cnt = 0\n", "for i, row in selected_data.iterrows():\n", " for k in str(row.knowledge_id).split('_'):\n", - " if int(k) not in k2n:\n", - " k2n[int(k)] = cnt\n", + " if int(float(k)) not in k2n:\n", + " k2n[int(float(k))] = cnt\n", " cnt += 1\n", - "selected_data.loc[:, 'knowledge_id'] = selected_data.loc[:, 'knowledge_id'].apply(lambda x: '_'.join(map(lambda y: str(k2n[int(y)]), str(x).split('_'))))\n" + "selected_data.loc[:, 'knowledge_id'] = selected_data.loc[:, 'knowledge_id'].apply(lambda x: '_'.join(map(lambda y: str(k2n[int(float(y))]), str(x).split('_'))))\n" ] }, { @@ -478,9 +478,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Total length: 110398\n", - "Number of unique [student_id,question_id]: 78747\n", - "Number of unique student_id: 1940\n", + "Total length: 101470\n", + "Number of unique [student_id,question_id]: 71515\n", + "Number of unique student_id: 1291\n", "Number of unique question_id: 1485\n", "Number of unique knowledge_id: 35\n", "Average #questions per knowledge: 59.4\n" @@ -489,10 +489,10 @@ ], "source": [ "stat_unique(selected_data, None)\n", - "stat_unique(selected_data, ['student_id', 'question_id'])\n", - "stat_unique(selected_data, 'student_id')\n", - "stat_unique(selected_data, 'question_id')\n", - "stat_unique(selected_data, 'knowledge_id')\n", + "a=stat_unique(selected_data, ['student_id', 'question_id'])\n", + "b=stat_unique(selected_data, 'student_id')\n", + "c=stat_unique(selected_data, 'question_id')\n", + "d=stat_unique(selected_data, 'knowledge_id')\n", "print('Average #questions per knowledge: {}'.format((len(q2k) / len(k2q))))" ] }, @@ -502,8 +502,7 @@ "metadata": {}, "outputs": [], "source": [ - "# save selected data\n", - "selected_data.to_csv(data_path+'selected_data.csv', index=False)" + "selected_data.to_csv('selected_data.csv', index=False)" ] }, { @@ -653,9 +652,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "train records length: 60393\n", + "train records length: 51465\n", "test records length: 50005\n", - "all records length: 110398\n" + "all records length: 101470\n" ] } ], @@ -694,9 +693,9 @@ "metadata": {}, "outputs": [], "source": [ - "save_to_csv(train_data, data_path+'train_triples.csv')\n", - "save_to_csv(test_data, data_path+'test_triples.csv')\n", - "save_to_csv(all_data, data_path+'triples.csv')" + "save_to_csv(train_data, 'train_triples.csv')\n", + "save_to_csv(test_data, 'test_triples.csv')\n", + "save_to_csv(all_data, 'triples.csv')" ] }, { @@ -706,8 +705,8 @@ "outputs": [], "source": [ "metadata = {\"num_students\": n_students, \n", - " \"num_questions\": ques_num,\n", - " \"num_concepts\": know_num, \n", + " \"num_questions\": c,\n", + " \"num_concepts\": d, \n", " \"num_records\": len(all_data), \n", " \"num_train_students\": n_students - len(test_students), \n", " \"num_test_students\": len(test_students)}" @@ -719,14 +718,21 @@ "metadata": {}, "outputs": [], "source": [ - "with open(data_path+'metadata.json', 'w') as f:\n", + "with open('metadata.json', 'w') as f:\n", " json.dump(metadata, f)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Basenv", "language": "python", "name": "python3" },