Skip to content

Commit

Permalink
Update data preprocess example
Browse files Browse the repository at this point in the history
  • Loading branch information
Hhhhhhand committed Aug 9, 2024
1 parent f551bb1 commit 0f191de
Showing 1 changed file with 39 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@
" print('Total length: {}'.format(len(data)))\n",
" elif isinstance(key, str):\n",
" print('Number of unique {}: {}'.format(key, len(data[key].unique())))\n",
" return len(data[key].unique())\n",
" elif isinstance(key, list):\n",
" print('Number of unique [{}]: {}'.format(','.join(key), len(data.drop_duplicates(key, keep='first'))))"
" print('Number of unique [{}]: {}'.format(','.join(key), len(data.drop_duplicates(key, keep='first'))))\n",
" return len(data.drop_duplicates(key, keep='first'))"
]
},
{
Expand Down Expand Up @@ -251,8 +253,7 @@
}
],
"source": [
"data_path ='../../data/assistment/'\n",
"raw_data = pd.read_csv('../../data/assistment/assistment.csv', encoding = 'utf-8', dtype={'skill_id': str})\n",
"raw_data = pd.read_csv('../../data/assistment/assistment2009.csv', encoding = 'utf-8', dtype={'skill_id': str})\n",
"raw_data.head()"
]
},
Expand Down Expand Up @@ -289,12 +290,10 @@
],
"source": [
"stat_unique(all_data, None)\n",
"stat_unique(all_data, ['student_id', 'question_id'])\n",
"stat_unique(all_data, 'student_id')\n",
"stat_unique(all_data, 'question_id')\n",
"stat_unique(all_data, 'knowledge_id')\n",
"ques_num = len(all_data['question_id'].unique())\n",
"know_num = len(all_data['knowledge_id'].unique())"
"a=stat_unique(all_data, ['student_id', 'question_id'])\n",
"b=stat_unique(all_data, 'student_id')\n",
"c=stat_unique(all_data, 'question_id')\n",
"d=stat_unique(all_data, 'knowledge_id')"
]
},
{
Expand Down Expand Up @@ -343,14 +342,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"filter 1471 students\n"
"filter 2120 students\n"
]
}
],
"source": [
"# filter students\n",
"n_questions = selected_data.groupby('student_id')['question_id'].count()\n",
"student_filter = n_questions[n_questions < 10].index.tolist()\n",
"student_filter = n_questions[n_questions < 20].index.tolist()\n",
"print(f'filter {len(student_filter)} students')\n",
"selected_data = selected_data[~selected_data['student_id'].isin(student_filter)]"
]
Expand All @@ -366,7 +365,7 @@
"table = selected_data.loc[:, ['question_id', 'knowledge_id']].drop_duplicates()\n",
"for i, row in table.iterrows():\n",
" q = row['question_id']\n",
" q2k[q] = set(map(int, str(row['knowledge_id']).split('_')))\n",
" q2k[q] = set(map(int, str(int(float(row['knowledge_id']))).split('_')))\n",
" \n",
"# get knowledge to question map\n",
"k2q = {}\n",
Expand All @@ -391,6 +390,7 @@
],
"source": [
"# filter knowledges\n",
"#selected_knowledges = { k for k, q in k2q.items()}\n",
"selected_knowledges = { k for k, q in k2q.items() if len(q) >= 10}\n",
"print(f'filter {len(k2q) - len(selected_knowledges)} knowledges')"
]
Expand Down Expand Up @@ -463,10 +463,10 @@
"cnt = 0\n",
"for i, row in selected_data.iterrows():\n",
" for k in str(row.knowledge_id).split('_'):\n",
" if int(k) not in k2n:\n",
" k2n[int(k)] = cnt\n",
" if int(float(k)) not in k2n:\n",
" k2n[int(float(k))] = cnt\n",
" cnt += 1\n",
"selected_data.loc[:, 'knowledge_id'] = selected_data.loc[:, 'knowledge_id'].apply(lambda x: '_'.join(map(lambda y: str(k2n[int(y)]), str(x).split('_'))))\n"
"selected_data.loc[:, 'knowledge_id'] = selected_data.loc[:, 'knowledge_id'].apply(lambda x: '_'.join(map(lambda y: str(k2n[int(float(y))]), str(x).split('_'))))\n"
]
},
{
Expand All @@ -478,9 +478,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Total length: 110398\n",
"Number of unique [student_id,question_id]: 78747\n",
"Number of unique student_id: 1940\n",
"Total length: 101470\n",
"Number of unique [student_id,question_id]: 71515\n",
"Number of unique student_id: 1291\n",
"Number of unique question_id: 1485\n",
"Number of unique knowledge_id: 35\n",
"Average #questions per knowledge: 59.4\n"
Expand All @@ -489,10 +489,10 @@
],
"source": [
"stat_unique(selected_data, None)\n",
"stat_unique(selected_data, ['student_id', 'question_id'])\n",
"stat_unique(selected_data, 'student_id')\n",
"stat_unique(selected_data, 'question_id')\n",
"stat_unique(selected_data, 'knowledge_id')\n",
"a=stat_unique(selected_data, ['student_id', 'question_id'])\n",
"b=stat_unique(selected_data, 'student_id')\n",
"c=stat_unique(selected_data, 'question_id')\n",
"d=stat_unique(selected_data, 'knowledge_id')\n",
"print('Average #questions per knowledge: {}'.format((len(q2k) / len(k2q))))"
]
},
Expand All @@ -502,8 +502,7 @@
"metadata": {},
"outputs": [],
"source": [
"# save selected data\n",
"selected_data.to_csv(data_path+'selected_data.csv', index=False)"
"selected_data.to_csv('selected_data.csv', index=False)"
]
},
{
Expand Down Expand Up @@ -653,9 +652,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
"train records length: 60393\n",
"train records length: 51465\n",
"test records length: 50005\n",
"all records length: 110398\n"
"all records length: 101470\n"
]
}
],
Expand Down Expand Up @@ -694,9 +693,9 @@
"metadata": {},
"outputs": [],
"source": [
"save_to_csv(train_data, data_path+'train_triples.csv')\n",
"save_to_csv(test_data, data_path+'test_triples.csv')\n",
"save_to_csv(all_data, data_path+'triples.csv')"
"save_to_csv(train_data, 'train_triples.csv')\n",
"save_to_csv(test_data, 'test_triples.csv')\n",
"save_to_csv(all_data, 'triples.csv')"
]
},
{
Expand All @@ -706,8 +705,8 @@
"outputs": [],
"source": [
"metadata = {\"num_students\": n_students, \n",
" \"num_questions\": ques_num,\n",
" \"num_concepts\": know_num, \n",
" \"num_questions\": c,\n",
" \"num_concepts\": d, \n",
" \"num_records\": len(all_data), \n",
" \"num_train_students\": n_students - len(test_students), \n",
" \"num_test_students\": len(test_students)}"
Expand All @@ -719,14 +718,21 @@
"metadata": {},
"outputs": [],
"source": [
"with open(data_path+'metadata.json', 'w') as f:\n",
"with open('metadata.json', 'w') as f:\n",
" json.dump(metadata, f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Basenv",
"language": "python",
"name": "python3"
},
Expand Down

0 comments on commit 0f191de

Please sign in to comment.