\u001b[0;34m()\u001b[0m\n\u001b[1;32m 11\u001b[0m recom_dict[i] \u001b[38;5;241m=\u001b[39m li\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m recom_dict\n\u001b[0;32m---> 14\u001b[0m \u001b[43mcust_feat\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n",
- "Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36mcust_feat\u001b[0;34m(i)\u001b[0m\n\u001b[1;32m 3\u001b[0m recom_dict \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 4\u001b[0m li \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 5\u001b[0m reco_group\u001b[38;5;241m=\u001b[39m\u001b[43mreco_cust_pv\u001b[49m[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mê³ ê°ë²ˆí˜¸\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m중분류명\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgradient\u001b[39m\u001b[38;5;124m'\u001b[39m]]\u001b[38;5;241m.\u001b[39mgroupby([\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mê³ ê°ë²ˆí˜¸\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 6\u001b[0m 중분류 \u001b[38;5;241m=\u001b[39m reco_group\u001b[38;5;241m.\u001b[39mget_group((i))\u001b[38;5;241m.\u001b[39msort_values(by\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgradient\u001b[39m\u001b[38;5;124m'\u001b[39m)[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m중분류명\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39miloc[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 7\u001b[0m li\u001b[38;5;241m.\u001b[39mappend(중분류)\n",
- "\u001b[0;31mNameError\u001b[0m: name 'reco_cust_pv' is not defined"
- ]
- }
- ],
- "source": [
- "# ê³ ê°ì˜ 가장 ë§¤ì¶œì´ ê°ì†Œí•œ 중분류 와 ê³ ê° íŠ¹ì„± 반환 함수 \n",
- "def cust_feat(i):\n",
- " recom_dict = {}\n",
- " li = []\n",
- " reco_group=reco_cust_pv[['ê³ ê°ë²ˆí˜¸','중분류명','gradient']].groupby(['ê³ ê°ë²ˆí˜¸'])\n",
- " 중분류 = reco_group.get_group((i)).sort_values(by='gradient')['중분류명'].iloc[0]\n",
- " li.append(중분류)\n",
- " cust = pp_demo[pp_demo['ê³ ê°ë²ˆí˜¸']==i].values\n",
- " cust_info = (' ').join(cust[:,1:][0])\n",
- " li.append(중분류+' '+cust_info)\n",
- " recom_dict[i] = li\n",
- " return recom_dict\n",
- "\n",
- "cust_feat(1)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "46a76a9a",
+ "id": "a09ca62b",
"metadata": {},
"source": [
- "## ì¶”ì²œì„ ìœ„í•œ 소분류명별 종합 특성 컬럼 ìƒì„±"
+ "## 추천을 위한 소분류명별 종합 특성 DataFrame 생성"
]
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "c72e0f3f",
+ "execution_count": 29,
+ "id": "fc51698a",
"metadata": {
"ExecuteTime": {
- "end_time": "2022-10-27T07:04:30.733037Z",
- "start_time": "2022-10-27T07:03:49.209165Z"
+ "end_time": "2022-10-27T10:43:46.914340Z",
+ "start_time": "2022-10-27T10:43:21.543655Z"
},
"scrolled": false
},
@@ -3470,7 +3318,7 @@
"[28437054 rows x 21 columns]"
]
},
- "execution_count": 15,
+ "execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@@ -3482,14 +3330,14 @@
},
{
"cell_type": "code",
- "execution_count": 42,
- "id": "63fe3ed0",
+ "execution_count": 76,
+ "id": "7940a745",
"metadata": {
"ExecuteTime": {
- "end_time": "2022-10-27T07:45:17.232544Z",
- "start_time": "2022-10-27T07:44:59.002954Z"
+ "end_time": "2022-10-27T11:44:49.886400Z",
+ "start_time": "2022-10-27T11:44:38.227980Z"
},
- "scrolled": true
+ "scrolled": false
},
"outputs": [
{
@@ -3517,7 +3365,6 @@
" 성별\n",
" ì—°ë ¹ëŒ€\n",
" 거주지ì—\n",
- " ì œíœ´ì‚¬\n",
" 중분류명\n",
" \n",
" \n",
@@ -3526,45 +3373,40 @@
" 0\n",
" ìœ ì œí’ˆ\n",
" M\n",
- " 60세ì´ìƒ\n",
+ " 60\n",
" 서울특별시 강남구\n",
- " A\n",
" 축산가공\n",
" \n",
" \n",
" 1\n",
" ìœ ì œí’ˆ\n",
" M\n",
- " 60세ì´ìƒ\n",
+ " 60\n",
" 경기ë„\n",
- " A\n",
" 축산가공\n",
" \n",
" \n",
" 2\n",
" ìœ ì œí’ˆ\n",
" F\n",
- " 60세ì´ìƒ\n",
+ " 60\n",
" 서울특별시 ë…¸ì›êµ¬\n",
- " A\n",
" 축산가공\n",
" \n",
" \n",
" 3\n",
" ìœ ì œí’ˆ\n",
" F\n",
- " 60세ì´ìƒ\n",
+ " 60\n",
" ê°•ì›ë„\n",
- " A\n",
" 축산가공\n",
" \n",
" \n",
" 4\n",
" ìœ ì œí’ˆ\n",
" F\n",
- " 60세ì´ìƒ\n",
+ " 60\n",
" 서울특별시 서대문구\n",
- " A\n",
" 축산가공\n",
" \n",
" \n",
@@ -3574,99 +3416,150 @@
" ...\n",
" ...\n",
" ...\n",
- " ...\n",
" \n",
" \n",
- " 845166\n",
+ " 833889\n",
" 기타íƒêµ¬ìš©í’ˆ\n",
" F\n",
- " 40세~44세\n",
+ " 40\n",
" ì „ë¼ë‚¨ë„\n",
- " C\n",
" ë ˆì €ì·¨ë¯¸\n",
" \n",
" \n",
- " 845167\n",
+ " 833890\n",
" 여성발가ë½\n",
" F\n",
- " 50세~54세\n",
+ " 50\n",
" 경기ë„\n",
- " B\n",
" 여성양ë§\n",
" \n",
" \n",
- " 845168\n",
+ " 833891\n",
" 페스츄리류\n",
" M\n",
- " 50세~54세\n",
+ " 50\n",
" 경기ë„\n",
- " C\n",
" ë² ì´ì»¤ë¦¬\n",
" \n",
" \n",
- " 845169\n",
+ " 833892\n",
" 기타한방약재\n",
" F\n",
- " 50세~54세\n",
+ " 50\n",
" ì „ë¼ë¶ë„\n",
- " C\n",
" 근채류\n",
" \n",
" \n",
- " 845170\n",
+ " 833893\n",
" 컵아ì´ìŠ¤í¬ë¦¼\n",
" F\n",
- " 25세~29세\n",
+ " 25\n",
" ì¸ì²œê´‘ì—ì‹œ\n",
- " D\n",
" ê³¼ìž\n",
" \n",
" \n",
"\n",
- " 845171 rows × 6 columns \n",
+ "833894 rows × 5 columns \n",
""
],
"text/plain": [
- " 소분류명 성별 ì—°ë ¹ëŒ€ ê±°ì£¼ì§€ì— ì œíœ´ì‚¬ 중분류명\n",
- "0 ìœ ì œí’ˆ M 60세ì´ìƒ 서울특별시 강남구 A 축산가공\n",
- "1 ìœ ì œí’ˆ M 60세ì´ìƒ ê²½ê¸°ë„ A 축산가공\n",
- "2 ìœ ì œí’ˆ F 60세ì´ìƒ 서울특별시 ë…¸ì›êµ¬ A 축산가공\n",
- "3 ìœ ì œí’ˆ F 60세ì´ìƒ ê°•ì›ë„ A 축산가공\n",
- "4 ìœ ì œí’ˆ F 60세ì´ìƒ 서울특별시 서대문구 A 축산가공\n",
- "... ... .. ... ... .. ...\n",
- "845166 기타íƒêµ¬ìš©í’ˆ F 40세~44세 ì „ë¼ë‚¨ë„ C ë ˆì €ì·¨ë¯¸\n",
- "845167 ì—¬ì„±ë°œê°€ë½ F 50세~54세 ê²½ê¸°ë„ B 여성양ë§\n",
- "845168 페스츄리류 M 50세~54세 ê²½ê¸°ë„ C ë² ì´ì»¤ë¦¬\n",
- "845169 기타한방약재 F 50세~54세 ì „ë¼ë¶ë„ C 근채류\n",
- "845170 컵아ì´ìŠ¤í¬ë¦¼ F 25세~29세 ì¸ì²œê´‘ì—ì‹œ D ê³¼ìž\n",
- "\n",
- "[845171 rows x 6 columns]"
+ " 소분류명 성별 ì—°ë ¹ëŒ€ ê±°ì£¼ì§€ì— ì¤‘ë¶„ë¥˜ëª…\n",
+ "0 ìœ ì œí’ˆ M 60 서울특별시 강남구 축산가공\n",
+ "1 ìœ ì œí’ˆ M 60 ê²½ê¸°ë„ ì¶•ì‚°ê°€ê³µ\n",
+ "2 ìœ ì œí’ˆ F 60 서울특별시 ë…¸ì›êµ¬ 축산가공\n",
+ "3 ìœ ì œí’ˆ F 60 ê°•ì›ë„ 축산가공\n",
+ "4 ìœ ì œí’ˆ F 60 서울특별시 서대문구 축산가공\n",
+ "... ... .. .. ... ...\n",
+ "833889 기타íƒêµ¬ìš©í’ˆ F 40 ì „ë¼ë‚¨ë„ ë ˆì €ì·¨ë¯¸\n",
+ "833890 ì—¬ì„±ë°œê°€ë½ F 50 ê²½ê¸°ë„ ì—¬ì„±ì–‘ë§\n",
+ "833891 페스츄리류 M 50 ê²½ê¸°ë„ ë² ì´ì»¤ë¦¬\n",
+ "833892 기타한방약재 F 50 ì „ë¼ë¶ë„ 근채류\n",
+ "833893 컵아ì´ìŠ¤í¬ë¦¼ F 25 ì¸ì²œê´‘ì—ì‹œ ê³¼ìž\n",
+ "\n",
+ "[833894 rows x 5 columns]"
]
},
- "execution_count": 42,
+ "execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pp_reco_df = pp_reco[pp_reco['분기']!='15_4']\n",
- "pp_reco_df = pp_reco_df[['소분류명', 'ê³ ê°ë²ˆí˜¸', '성별', 'ì—°ë ¹ëŒ€', '거주지ì—','ì œíœ´ì‚¬', '중분류명']] # 25006807\n",
+ "pp_reco_df = pp_reco_df[['소분류명', 'ê³ ê°ë²ˆí˜¸', '성별', 'ì—°ë ¹ëŒ€', '거주지ì—','중분류명']] # 25006807\n",
"pp_reco_df = pp_reco_df.drop_duplicates(['소분류명', 'ê³ ê°ë²ˆí˜¸'], keep='first').reset_index(drop=True)\n",
"pp_reco_df = pp_reco_df.drop(columns=['ê³ ê°ë²ˆí˜¸'])\n",
- "pp_reco_df = pp_reco_df.drop_duplicates(['소분류명', '성별', 'ì—°ë ¹ëŒ€', '거주지ì—', 'ì œíœ´ì‚¬', '중분류명'], keep='first').reset_index(drop=True)\n",
+ "pp_reco_df = pp_reco_df.drop_duplicates(['소분류명', '성별', 'ì—°ë ¹ëŒ€', '거주지ì—', '중분류명'], keep='first').reset_index(drop=True)\n",
+ "pp_reco_df['ì—°ë ¹ëŒ€'] = pp_reco_df['ì—°ë ¹ëŒ€'].apply(lambda x:x[:2])\n",
"# pp_reco_df = pp_reco_df[(pp_reco_df['소분류명']=='ìœ ì œí’ˆ') & (pp_reco_df['성별']=='M')]\n",
"pp_reco_df"
]
},
{
"cell_type": "code",
- "execution_count": 43,
- "id": "53979309",
+ "execution_count": 77,
+ "id": "c73d40d8",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:44:55.704008Z",
+ "start_time": "2022-10-27T11:44:55.693838Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# pp_reco_df = pp_reco_df.drop(columns='reco_feature')\n",
+ "# pp_reco_df['reco_feature'] = pp_reco_df.iloc[:,1:].apply(lambda row : (' ').join(row.values), axis=1)\n",
+ "# pp_reco_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b8242b3",
+ "metadata": {},
+ "source": [
+ "## 고객별 소분류별 구매 횟수"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "de520817",
"metadata": {
"ExecuteTime": {
- "end_time": "2022-10-27T07:45:21.061346Z",
- "start_time": "2022-10-27T07:45:17.235142Z"
+ "end_time": "2022-10-27T09:54:01.717148Z",
+ "start_time": "2022-10-27T09:54:01.701756Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ 1, 2, 3, ..., 19346, 19360, 19364])"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "# 추천 시스템용 ë°ì´í„°(recommend)\n",
+ "reco = pd.merge(pp_demo, pp_datasets, on='ê³ ê°ë²ˆí˜¸', how='left')\n",
+ "reco_cust = reco[pred_lr==1]\n",
+ "reco_cust_num = reco_cust['ê³ ê°ë²ˆí˜¸'].values\n",
+ "reco_cust_num"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "9c74c2b5",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T10:31:24.242593Z",
+ "start_time": "2022-10-27T10:31:19.606221Z"
+ },
+ "scrolled": true
},
"outputs": [
{
@@ -3690,183 +3583,774 @@
" \n",
" \n",
" \n",
+ " ê³ ê°ë²ˆí˜¸\n",
" 소분류명\n",
- " 성별\n",
- " ì—°ë ¹ëŒ€\n",
- " 거주지ì—\n",
- " ì œíœ´ì‚¬\n",
- " 중분류명\n",
- " reco_feature\n",
+ " count\n",
" \n",
" \n",
" \n",
" \n",
" 0\n",
- " ìœ ì œí’ˆ\n",
- " M\n",
- " 60세ì´ìƒ\n",
- " 서울특별시 강남구\n",
- " A\n",
- " 축산가공\n",
- " M 60세ì´ìƒ 서울특별시 강남구 A 축산가공\n",
+ " 1\n",
+ " Bag&Bag\n",
+ " 1.0\n",
" \n",
" \n",
" 1\n",
- " ìœ ì œí’ˆ\n",
- " M\n",
- " 60세ì´ìƒ\n",
- " 경기ë„\n",
- " A\n",
- " 축산가공\n",
- " M 60세ì´ìƒ ê²½ê¸°ë„ A 축산가공\n",
+ " 1\n",
+ " L.B\n",
+ " 3.0\n",
" \n",
" \n",
" 2\n",
- " ìœ ì œí’ˆ\n",
- " F\n",
- " 60세ì´ìƒ\n",
- " 서울특별시 ë…¸ì›êµ¬\n",
- " A\n",
- " 축산가공\n",
- " F 60세ì´ìƒ 서울특별시 ë…¸ì›êµ¬ A 축산가공\n",
+ " 1\n",
+ " L/C 아웃ë„ì–´\n",
+ " 1.0\n",
" \n",
" \n",
" 3\n",
- " ìœ ì œí’ˆ\n",
- " F\n",
- " 60세ì´ìƒ\n",
- " ê°•ì›ë„\n",
- " A\n",
- " 축산가공\n",
- " F 60세ì´ìƒ ê°•ì›ë„ A 축산가공\n",
+ " 1\n",
+ " MP3 外\n",
+ " 2.0\n",
" \n",
" \n",
" 4\n",
- " ìœ ì œí’ˆ\n",
- " F\n",
- " 60세ì´ìƒ\n",
- " 서울특별시 서대문구\n",
- " A\n",
- " 축산가공\n",
- " F 60세ì´ìƒ 서울특별시 서대문구 A 축산가공\n",
+ " 1\n",
+ " N.B\n",
+ " 3.0\n",
" \n",
" \n",
" ...\n",
" ...\n",
" ...\n",
" ...\n",
- " ...\n",
- " ...\n",
- " ...\n",
- " ...\n",
" \n",
" \n",
- " 845166\n",
- " 기타íƒêµ¬ìš©í’ˆ\n",
- " F\n",
- " 40세~44세\n",
- " ì „ë¼ë‚¨ë„\n",
- " C\n",
- " ë ˆì €ì·¨ë¯¸\n",
- " F 40세~44세 ì „ë¼ë‚¨ë„ C ë ˆì €ì·¨ë¯¸\n",
+ " 6117702\n",
+ " 19383\n",
+ " 하드캔디\n",
+ " 10.0\n",
" \n",
" \n",
- " 845167\n",
- " 여성발가ë½\n",
- " F\n",
- " 50세~54세\n",
- " 경기ë„\n",
- " B\n",
+ " 6117703\n",
+ " 19383\n",
+ " 핸드로션/í¬ë¦¼\n",
+ " 2.0\n",
+ " \n",
+ " \n",
+ " 6117704\n",
+ " 19383\n",
+ " 핸드워시/ì†ì„¸ì •ì œ\n",
+ " 2.0\n",
+ " \n",
+ " \n",
+ " 6117705\n",
+ " 19383\n",
+ " 헤어ì—센스\n",
+ " 3.0\n",
+ " \n",
+ " \n",
+ " 6117706\n",
+ " 19383\n",
+ " 혼합탄산\n",
+ " 3.0\n",
+ " \n",
+ " \n",
+ "\n",
+ "6117707 rows × 3 columns \n",
+ ""
+ ],
+ "text/plain": [
+ " ê³ ê°ë²ˆí˜¸ 소분류명 count\n",
+ "0 1 Bag&Bag 1.0\n",
+ "1 1 L.B 3.0\n",
+ "2 1 L/C 아웃ë„ì–´ 1.0\n",
+ "3 1 MP3 外 2.0\n",
+ "4 1 N.B 3.0\n",
+ "... ... ... ...\n",
+ "6117702 19383 하드캔디 10.0\n",
+ "6117703 19383 핸드로션/í¬ë¦¼ 2.0\n",
+ "6117704 19383 핸드워시/ì†ì„¸ì •ì œ 2.0\n",
+ "6117705 19383 헤어ì—센스 3.0\n",
+ "6117706 19383 혼합탄산 3.0\n",
+ "\n",
+ "[6117707 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "reco_cust_pv = pp_purprd_prodcl.pivot_table(index=['소분류명'],columns='ê³ ê°ë²ˆí˜¸',values='구매금액',aggfunc='count')\n",
+ "reco_cust_pv = reco_cust_pv.unstack().dropna().reset_index()\n",
+ "reco_cust_pv = reco_cust_pv.rename(columns={0:'count'})\n",
+ "reco_cust_pv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "99b518ac",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:31:21.545514Z",
+ "start_time": "2022-10-27T11:31:21.202680Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " | ê³ ê°ë²ˆí˜¸ | count |
---|
count | 6.117707e+06 | 6.117707e+06 |
---|
mean | 9.717193e+03 | 4.648319e+00 |
---|
std | 5.464717e+03 | 1.018067e+01 |
---|
min | 1.000000e+00 | 1.000000e+00 |
---|
25% | 5.023000e+03 | 1.000000e+00 |
---|
50% | 9.781000e+03 | 2.000000e+00 |
---|
75% | 1.438400e+04 | 4.000000e+00 |
---|
max | 1.938300e+04 | 7.060000e+02 |
---|
\n",
+ " "
+ ],
+ "text/plain": [
+ " ê³ ê°ë²ˆí˜¸ count\n",
+ "count 6.117707e+06 6.117707e+06\n",
+ "mean 9.717193e+03 4.648319e+00\n",
+ "std 5.464717e+03 1.018067e+01\n",
+ "min 1.000000e+00 1.000000e+00\n",
+ "25% 5.023000e+03 1.000000e+00\n",
+ "50% 9.781000e+03 2.000000e+00\n",
+ "75% 1.438400e+04 4.000000e+00\n",
+ "max 1.938300e+04 7.060000e+02"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "reco_cust_pv.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "7e370cf9",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:31:44.149035Z",
+ "start_time": "2022-10-27T11:31:43.370043Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " | ê³ ê°ë²ˆí˜¸ | 소분류명 | count |
---|
0 | 1 | Bag&Bag | 0.007082 |
---|
1 | 1 | L.B | 0.021246 |
---|
2 | 1 | L/C 아웃ë„ì–´ | 0.007082 |
---|
3 | 1 | MP3 外 | 0.014164 |
---|
4 | 1 | N.B | 0.021246 |
---|
... | ... | ... | ... |
---|
6117702 | 19383 | 하드캔디 | 0.070822 |
---|
6117703 | 19383 | 핸드로션/í¬ë¦¼ | 0.014164 |
---|
6117704 | 19383 | 핸드워시/ì†ì„¸ì •ì œ | 0.014164 |
---|
6117705 | 19383 | 헤어ì—센스 | 0.021246 |
---|
6117706 | 19383 | 혼합탄산 | 0.021246 |
---|
\n",
+ " 6117707 rows × 3 columns \n",
+ " "
+ ],
+ "text/plain": [
+ " ê³ ê°ë²ˆí˜¸ 소분류명 count\n",
+ "0 1 Bag&Bag 0.007082\n",
+ "1 1 L.B 0.021246\n",
+ "2 1 L/C 아웃ë„ì–´ 0.007082\n",
+ "3 1 MP3 外 0.014164\n",
+ "4 1 N.B 0.021246\n",
+ "... ... ... ...\n",
+ "6117702 19383 하드캔디 0.070822\n",
+ "6117703 19383 핸드로션/í¬ë¦¼ 0.014164\n",
+ "6117704 19383 핸드워시/ì†ì„¸ì •ì œ 0.014164\n",
+ "6117705 19383 헤어ì—센스 0.021246\n",
+ "6117706 19383 혼합탄산 0.021246\n",
+ "\n",
+ "[6117707 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Rescale the per-customer purchase counts onto the 0-5 rating scale used by the Surprise Reader.\n",
+ "# The divisor is the column's own maximum rather than a hand-copied constant (706 was the\n",
+ "# max reported by describe()), so the cell stays correct if the data changes, and re-running\n",
+ "# it leaves already-scaled values essentially unchanged (max is then 5).\n",
+ "count_max = reco_cust_pv['count'].max()\n",
+ "reco_cust_pv['count'] = 5 * (reco_cust_pv['count'] / count_max)\n",
+ "reco_cust_pv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "9ec28428",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:31:56.781796Z",
+ "start_time": "2022-10-27T11:31:56.498635Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " | ê³ ê°ë²ˆí˜¸ | count |
---|
count | 6.117707e+06 | 6.117707e+06 |
---|
mean | 9.717193e+03 | 3.292011e-02 |
---|
std | 5.464717e+03 | 7.210105e-02 |
---|
min | 1.000000e+00 | 7.082153e-03 |
---|
25% | 5.023000e+03 | 7.082153e-03 |
---|
50% | 9.781000e+03 | 1.416431e-02 |
---|
75% | 1.438400e+04 | 2.832861e-02 |
---|
max | 1.938300e+04 | 5.000000e+00 |
---|
\n",
+ " "
+ ],
+ "text/plain": [
+ " ê³ ê°ë²ˆí˜¸ count\n",
+ "count 6.117707e+06 6.117707e+06\n",
+ "mean 9.717193e+03 3.292011e-02\n",
+ "std 5.464717e+03 7.210105e-02\n",
+ "min 1.000000e+00 7.082153e-03\n",
+ "25% 5.023000e+03 7.082153e-03\n",
+ "50% 9.781000e+03 1.416431e-02\n",
+ "75% 1.438400e+04 2.832861e-02\n",
+ "max 1.938300e+04 5.000000e+00"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "reco_cust_pv.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "id": "4dc04104",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:48:50.683544Z",
+ "start_time": "2022-10-27T11:48:48.873756Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
" \n",
- " \n",
+ " \n",
" \n",
" \n",
- " \n",
+ " \n",
" \n",
" \n",
- " \n",
+ " \n",
" \n",
- " \n",
" \n",
- " \n",
+ " \n",
" \n",
" \n",
- " \n",
+ " \n",
" \n",
" \n",
- " \n",
+ " \n",
" \n",
- " \n",
" \n",
- " \n",
+ " \n",
" \n",
" \n",
- " \n",
+ " \n",
" \n",
" \n",
- " \n",
+ " \n",
" \n",
- " \n",
" \n",
- " \n",
+ " \n",
" \n",
" \n",
" | 소분류명 | 성별 | ì—°ë ¹ëŒ€ | ê±°ì£¼ì§€ì— | 중분류명 | reco_feature |
---|
0 | ìœ ì œí’ˆ | M | 60 | 서울특별시 강남구 | 축산가공 | M 60 서울특별시 강남구 축산가공 |
---|
1 | ìœ ì œí’ˆ | M | 60 | ê²½ê¸°ë„ | 축산가공 | M 60 ê²½ê¸°ë„ ì¶•ì‚°ê°€ê³µ |
---|
2 | ìœ ì œí’ˆ | F | 60 | 서울특별시 ë…¸ì›êµ¬ | 축산가공 | F 60 서울특별시 ë…¸ì›êµ¬ 축산가공 |
---|
3 | ìœ ì œí’ˆ | F | 60 | ê°•ì›ë„ | 축산가공 | F 60 ê°•ì›ë„ 축산가공 |
---|
4 | ìœ ì œí’ˆ | F | 60 | 서울특별시 서대문구 | 축산가공 | F 60 서울특별시 서대문구 축산가공 |
---|
... | ... | ... | ... | ... | ... | ... |
---|
833889 | 기타íƒêµ¬ìš©í’ˆ | F | 40 | ì „ë¼ë‚¨ë„ | ë ˆì €ì·¨ë¯¸ | F 40 ì „ë¼ë‚¨ë„ ë ˆì €ì·¨ë¯¸ |
---|
833890 | ì—¬ì„±ë°œê°€ë½ | F | 50 | ê²½ê¸°ë„ | ì—¬ì„±ì–‘ë§ | F 50세~54세 ê²½ê¸°ë„ B ì—¬ì„±ì–‘ë§ | F 50 ê²½ê¸°ë„ ì—¬ì„±ì–‘ë§ |
---|
845168 | 833891 | 페스츄리류 | M | 50세~54세 | 50 | ê²½ê¸°ë„ | C | ë² ì´ì»¤ë¦¬ | M 50세~54세 ê²½ê¸°ë„ C ë² ì´ì»¤ë¦¬ | M 50 ê²½ê¸°ë„ ë² ì´ì»¤ë¦¬ |
---|
845169 | 833892 | 기타한방약재 | F | 50세~54세 | 50 | ì „ë¼ë¶ë„ | C | 근채류 | F 50세~54세 ì „ë¼ë¶ë„ C 근채류 | F 50 ì „ë¼ë¶ë„ 근채류 |
---|
845170 | 833893 | 컵아ì´ìŠ¤í¬ë¦¼ | F | 25세~29세 | 25 | ì¸ì²œê´‘ì—ì‹œ | D | ê³¼ìž | F 25세~29세 ì¸ì²œê´‘ì—ì‹œ D ê³¼ìž | F 25 ì¸ì²œê´‘ì—ì‹œ ê³¼ìž |
---|
\n",
- " 845171 rows × 7 columns \n",
+ " 833894 rows × 6 columns \n",
" "
],
"text/plain": [
- " 소분류명 성별 ì—°ë ¹ëŒ€ ê±°ì£¼ì§€ì— ì œíœ´ì‚¬ 중분류명 reco_feature\n",
- "0 ìœ ì œí’ˆ M 60세ì´ìƒ 서울특별시 강남구 A 축산가공 M 60세ì´ìƒ 서울특별시 강남구 A 축산가공\n",
- "1 ìœ ì œí’ˆ M 60세ì´ìƒ ê²½ê¸°ë„ A 축산가공 M 60세ì´ìƒ ê²½ê¸°ë„ A 축산가공\n",
- "2 ìœ ì œí’ˆ F 60세ì´ìƒ 서울특별시 ë…¸ì›êµ¬ A 축산가공 F 60세ì´ìƒ 서울특별시 ë…¸ì›êµ¬ A 축산가공\n",
- "3 ìœ ì œí’ˆ F 60세ì´ìƒ ê°•ì›ë„ A 축산가공 F 60세ì´ìƒ ê°•ì›ë„ A 축산가공\n",
- "4 ìœ ì œí’ˆ F 60세ì´ìƒ 서울특별시 서대문구 A 축산가공 F 60세ì´ìƒ 서울특별시 서대문구 A 축산가공\n",
- "... ... .. ... ... .. ... ...\n",
- "845166 기타íƒêµ¬ìš©í’ˆ F 40세~44세 ì „ë¼ë‚¨ë„ C ë ˆì €ì·¨ë¯¸ F 40세~44세 ì „ë¼ë‚¨ë„ C ë ˆì €ì·¨ë¯¸\n",
- "845167 ì—¬ì„±ë°œê°€ë½ F 50세~54세 ê²½ê¸°ë„ B ì—¬ì„±ì–‘ë§ F 50세~54세 ê²½ê¸°ë„ B 여성양ë§\n",
- "845168 페스츄리류 M 50세~54세 ê²½ê¸°ë„ C ë² ì´ì»¤ë¦¬ M 50세~54세 ê²½ê¸°ë„ C ë² ì´ì»¤ë¦¬\n",
- "845169 기타한방약재 F 50세~54세 ì „ë¼ë¶ë„ C 근채류 F 50세~54세 ì „ë¼ë¶ë„ C 근채류\n",
- "845170 컵아ì´ìŠ¤í¬ë¦¼ F 25세~29세 ì¸ì²œê´‘ì—ì‹œ D ê³¼ìž F 25세~29세 ì¸ì²œê´‘ì—ì‹œ D ê³¼ìž\n",
- "\n",
- "[845171 rows x 7 columns]"
+ " 소분류명 성별 ì—°ë ¹ëŒ€ ê±°ì£¼ì§€ì— ì¤‘ë¶„ë¥˜ëª… reco_feature\n",
+ "0 ìœ ì œí’ˆ M 60 서울특별시 강남구 축산가공 M 60 서울특별시 강남구 축산가공\n",
+ "1 ìœ ì œí’ˆ M 60 ê²½ê¸°ë„ ì¶•ì‚°ê°€ê³µ M 60 ê²½ê¸°ë„ ì¶•ì‚°ê°€ê³µ\n",
+ "2 ìœ ì œí’ˆ F 60 서울특별시 ë…¸ì›êµ¬ 축산가공 F 60 서울특별시 ë…¸ì›êµ¬ 축산가공\n",
+ "3 ìœ ì œí’ˆ F 60 ê°•ì›ë„ 축산가공 F 60 ê°•ì›ë„ 축산가공\n",
+ "4 ìœ ì œí’ˆ F 60 서울특별시 서대문구 축산가공 F 60 서울특별시 서대문구 축산가공\n",
+ "... ... .. .. ... ... ...\n",
+ "833889 기타íƒêµ¬ìš©í’ˆ F 40 ì „ë¼ë‚¨ë„ ë ˆì €ì·¨ë¯¸ F 40 ì „ë¼ë‚¨ë„ ë ˆì €ì·¨ë¯¸\n",
+ "833890 ì—¬ì„±ë°œê°€ë½ F 50 ê²½ê¸°ë„ ì—¬ì„±ì–‘ë§ F 50 ê²½ê¸°ë„ ì—¬ì„±ì–‘ë§\n",
+ "833891 페스츄리류 M 50 ê²½ê¸°ë„ ë² ì´ì»¤ë¦¬ M 50 ê²½ê¸°ë„ ë² ì´ì»¤ë¦¬\n",
+ "833892 기타한방약재 F 50 ì „ë¼ë¶ë„ 근채류 F 50 ì „ë¼ë¶ë„ 근채류\n",
+ "833893 컵아ì´ìŠ¤í¬ë¦¼ F 25 ì¸ì²œê´‘ì—ì‹œ ê³¼ìž F 25 ì¸ì²œê´‘ì—ì‹œ ê³¼ìž\n",
+ "\n",
+ "[833894 rows x 6 columns]"
]
},
- "execution_count": 43,
+ "execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# pp_reco_df = pp_reco_df.drop(columns='reco_feature')\n",
- "pp_reco_df['reco_feature'] = pp_reco_df.iloc[:,1:].apply(lambda row : (' ').join(row.values), axis=1)\n",
- "pp_reco_df"
+ "# Build one space-joined feature string per row from every column after the first one.\n",
+ "# A previously created 'reco_feature' column is dropped first (errors='ignore' makes this a\n",
+ "# no-op on the first run) so that re-running the cell does not fold the old feature string\n",
+ "# back into the new one.\n",
+ "feature_cols = pp_reco_df.drop(columns='reco_feature', errors='ignore').iloc[:,1:]\n",
+ "pp_reco_df['reco_feature'] = feature_cols.apply(lambda row : (' ').join(row.values), axis=1)\n",
+ "pp_reco_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "92f160d8",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:39:24.972715Z",
+ "start_time": "2022-10-27T11:36:54.545128Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Evaluating RMSE, MAE of algorithm SVD on 2 split(s).\n",
+ "\n",
+ " Fold 1 Fold 2 Mean Std \n",
+ "RMSE (testset) 0.0667 0.0666 0.0666 0.0001 \n",
+ "MAE (testset) 0.0283 0.0284 0.0283 0.0000 \n",
+ "Fit time 22.38 23.84 23.11 0.73 \n",
+ "Test time 40.04 45.88 42.96 2.92 \n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'test_rmse': array([0.06670089, 0.06657903]),\n",
+ " 'test_mae': array([0.02831861, 0.02836099]),\n",
+ " 'fit_time': (22.381165981292725, 23.844670057296753),\n",
+ " 'test_time': (40.044742822647095, 45.880324840545654)}"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from surprise.model_selection import cross_validate\n",
+ "from surprise import Reader, Dataset\n",
+ "from surprise import SVD\n",
+ "from surprise import accuracy\n",
+ "from surprise.model_selection import train_test_split\n",
+ "\n",
+ "# Load the pandas DataFrame into a Surprise dataset.\n",
+ "# load_from_df reads the columns positionally as (user, item, rating); reco_cust_pv is\n",
+ "# (customer id, sub-category name, scaled count), and the scaled count lies in 0-5.\n",
+ "reader = Reader(rating_scale=(0, 5.0))\n",
+ "data = Dataset.load_from_df(reco_cust_pv, reader)\n",
+ "\n",
+ "# 2-fold cross-validation of a default SVD, reporting RMSE and MAE.\n",
+ "algo = SVD(random_state=0)\n",
+ "cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=2, verbose=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "id": "44d768b1",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:41:32.974835Z",
+ "start_time": "2022-10-27T11:39:24.974366Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.06699629358085352\n",
+ "{'n_epochs': 10, 'n_factors': 50}\n"
+ ]
+ }
+ ],
+ "source": [
+ "from surprise.model_selection import GridSearchCV\n",
+ "\n",
+ "# 최ì í™”í• íŒŒë¼ë¯¸í„°ë¥¼ 딕셔너리 형태로 ì§€ì •\n",
+ "param_grid = {'n_epochs':[10], 'n_factors':[50]}# n_epochs : ì 진ì 하강 ë°©ì‹ì˜ 반복 횟수, n_factors : ìž ìž¬ìš”ì¸ í¬ê¸° K mxn = mxp + nxp K 는 pì˜ í¬ê¸° \n",
+ "\n",
+ "# Configure GridSearchCV with 2-fold CV (cv=2), scoring each candidate on RMSE and MAE\n",
+ "gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2)\n",
+ "gs.fit(data)\n",
+ "\n",
+ "# ìµœê³ RMSE Evaluation ì 수와 ê·¸ë•Œì˜ í•˜ì´í¼ 파ë¼ë¯¸í„°\n",
+ "print(gs.best_score['rmse'])\n",
+ "print(gs.best_params['rmse'])"
]
},
{
- "cell_type": "markdown",
- "id": "f3ef7e54",
- "metadata": {},
+ "cell_type": "code",
+ "execution_count": 72,
+ "id": "bd0d7ae1",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:41:48.736188Z",
+ "start_time": "2022-10-27T11:41:32.976115Z"
+ }
+ },
+ "outputs": [],
"source": [
- "## 추천 ì•Œê³ ë¦¬ì¦˜"
+ "from surprise.dataset import DatasetAutoFolds\n",
+ "\n",
+ "reco_cust_pv.to_csv('./reco_cust_pv_noh.csv', index=False, header=False)\n",
+ "\n",
+ "reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 5.0))\n",
+ "# Create a DatasetAutoFolds from the header-less reco_cust_pv_noh.csv file written above\n",
+ "data_folds = DatasetAutoFolds(ratings_file='./reco_cust_pv_noh.csv', reader=reader)\n",
+ "\n",
+ "# ì „ì²´ ë°ì´í„°ë¥¼ 학습 ë°ì´í„°ë¡œ ìƒì„±í•¨.\n",
+ "trainset = data_folds.build_full_trainset()"
]
},
{
- "cell_type": "markdown",
- "id": "6304e18c",
- "metadata": {},
+ "cell_type": "code",
+ "execution_count": 73,
+ "id": "6fe5ccd2",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:42:04.776038Z",
+ "start_time": "2022-10-27T11:41:48.737714Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "### CountVectorizer : 추천 target 벡터화"
+ "algo = SVD(n_epochs=20, n_factors=50, random_state=0)\n",
+ "algo.fit(trainset)"
]
},
{
"cell_type": "code",
- "execution_count": 89,
- "id": "7983eeb0",
+ "execution_count": 95,
+ "id": "9dd55a11",
"metadata": {
"ExecuteTime": {
- "end_time": "2022-10-27T08:17:31.241224Z",
- "start_time": "2022-10-27T08:17:25.458062Z"
+ "end_time": "2022-10-26T02:48:29.085581Z",
+ "start_time": "2022-10-26T02:48:29.073980Z"
}
},
"outputs": [
@@ -3874,182 +4358,322 @@
"name": "stdout",
"output_type": "stream",
"text": [
- " (0, 162)\t1\n",
- " (0, 13865)\t1\n",
- " (0, 233)\t1\n",
- " (0, 24114)\t1\n",
- " (0, 171)\t1\n",
- " (0, 13866)\t1\n",
- " (0, 757)\t1\n",
- " (1, 162)\t1\n",
- " (1, 24114)\t1\n",
- " (1, 3263)\t1\n",
- " (1, 164)\t1\n",
- " (1, 3839)\t1\n",
- " (2, 162)\t1\n",
- " (2, 13865)\t1\n",
- " (2, 24114)\t1\n",
- " (2, 171)\t1\n",
- " (2, 8251)\t1\n",
- " (2, 13874)\t1\n",
- " (2, 8792)\t1\n",
- "(845171, 25478)\n"
+ "ì‚¬ìš©ìž ì•„ì´ë”” 9 는 ì˜í™” ì•„ì´ë”” 42ì˜ í‰ì ì—†ìŒ\n",
+ " movieId title genres\n",
+ "38 42 Dead Presidents (1995) Action|Crime|Drama\n"
]
}
],
"source": [
- "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "# Check whether customer 9 already has a recorded (scaled) purchase count for the target\n",
+ "# sub-category, then show that sub-category's rows in the item attribute frame.\n",
+ "# reco_cust_pv has no 'movieId' column (its columns are customer id, sub-category name,\n",
+ "# count), so the lookup uses the sub-category column, and the target is a sub-category\n",
+ "# name (string) rather than the integer 42.\n",
+ "# NOTE(review): the target below mirrors the item queried by the following predict cell;\n",
+ "# confirm it is the intended sub-category.\n",
+ "target_item = 'ìš°ìœ '\n",
+ "\n",
+ "custIds = reco_cust_pv[reco_cust_pv['ê³ ê°ë²ˆí˜¸']==9]['소분류명']\n",
+ "if custIds[custIds==target_item].count() == 0:\n",
+ " print('ê³ ê°ë²ˆí˜¸ê°€9 ì´ê³ ì˜ í‰ì ì—†ìŒ')\n",
+ " \n",
+ "print(pp_reco_df[pp_reco_df['소분류명']==target_item])"
]
},
{
- "cell_type": "markdown",
- "id": "f00c0180",
- "metadata": {},
+ "cell_type": "code",
+ "execution_count": 75,
+ "id": "5e984a1d",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:42:15.506940Z",
+ "start_time": "2022-10-27T11:42:15.501978Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "user: 9 item: ìš°ìœ r_ui = None est = 0.01 {'was_impossible': False}\n"
+ ]
+ }
+ ],
"source": [
- "### ì½”ì‚¬ì¸ ìœ ì‚¬ë„ ì¸¡ì •"
+ "uid = str(9)\n",
+ "iid = str('ìš°ìœ ')\n",
+ "\n",
+ "pred = algo.predict(uid, iid, verbose=True)"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "b202f53c",
+ "execution_count": 79,
+ "id": "f3c9e198",
"metadata": {
"ExecuteTime": {
- "start_time": "2022-10-27T08:17:27.584Z"
+ "end_time": "2022-10-27T11:45:32.152803Z",
+ "start_time": "2022-10-27T11:45:32.103128Z"
}
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['ìœ ì œí’ˆ', 'ì²ê³¼', '멸치류', ..., '기타íƒêµ¬ìš©í’ˆ', '여성발가ë½', '기타한방약재'], dtype=object)"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "from sklearn.metrics.pairwise import cosine_similarity\n",
- "\n",
- "item_sim = cosine_similarity(item_mat, item_mat)\n",
- "print(item_sim.shape)\n",
- "print(item_sim[:10])\n",
- "\n",
- "\n",
- "\n",
- "item_sim_sorted_ind = item_sim.argsort()[:, ::-1]\n",
- "print(item_sim_sorted_ind[:5])\n",
- "\n",
- "\n",
- "\n",
- "print(pp_reco_df[pp_reco_df['reco_feature']==1])\n",
- "print(pp_reco_df[pp_reco_df['reco_feature']==262])"
+ "pp_reco_df['소분류명'].unique()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "be8a7f70",
- "metadata": {},
- "outputs": [],
+ "execution_count": 81,
+ "id": "45b1b79a",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:49:03.378424Z",
+ "start_time": "2022-10-27T11:49:03.357226Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " | 소분류명 | 성별 | ì—°ë ¹ëŒ€ | ê±°ì£¼ì§€ì— | 중분류명 | reco_feature |
---|
0 | ìœ ì œí’ˆ | M | 60 | 서울특별시 강남구 | 축산가공 | M 60 서울특별시 강남구 축산가공 |
---|
1 | ìœ ì œí’ˆ | M | 60 | ê²½ê¸°ë„ | 축산가공 | M 60 ê²½ê¸°ë„ ì¶•ì‚°ê°€ê³µ |
---|
2 | ìœ ì œí’ˆ | F | 60 | 서울특별시 ë…¸ì›êµ¬ | 축산가공 | F 60 서울특별시 ë…¸ì›êµ¬ 축산가공 |
---|
3 | ìœ ì œí’ˆ | F | 60 | ê°•ì›ë„ | 축산가공 | F 60 ê°•ì›ë„ 축산가공 |
---|
4 | ìœ ì œí’ˆ | F | 60 | 서울특별시 서대문구 | 축산가공 | F 60 서울특별시 서대문구 축산가공 |
---|
... | ... | ... | ... | ... | ... | ... |
---|
833889 | 기타íƒêµ¬ìš©í’ˆ | F | 40 | ì „ë¼ë‚¨ë„ | ë ˆì €ì·¨ë¯¸ | F 40 ì „ë¼ë‚¨ë„ ë ˆì €ì·¨ë¯¸ |
---|
833890 | ì—¬ì„±ë°œê°€ë½ | F | 50 | ê²½ê¸°ë„ | ì—¬ì„±ì–‘ë§ | F 50 ê²½ê¸°ë„ ì—¬ì„±ì–‘ë§ |
---|
833891 | 페스츄리류 | M | 50 | ê²½ê¸°ë„ | ë² ì´ì»¤ë¦¬ | M 50 ê²½ê¸°ë„ ë² ì´ì»¤ë¦¬ |
---|
833892 | 기타한방약재 | F | 50 | ì „ë¼ë¶ë„ | 근채류 | F 50 ì „ë¼ë¶ë„ 근채류 |
---|
833893 | 컵아ì´ìŠ¤í¬ë¦¼ | F | 25 | ì¸ì²œê´‘ì—ì‹œ | ê³¼ìž | F 25 ì¸ì²œê´‘ì—ì‹œ ê³¼ìž |
---|
\n",
+ " 833894 rows × 6 columns \n",
+ " "
+ ],
+ "text/plain": [
+ " 소분류명 성별 ì—°ë ¹ëŒ€ ê±°ì£¼ì§€ì— ì¤‘ë¶„ë¥˜ëª… reco_feature\n",
+ "0 ìœ ì œí’ˆ M 60 서울특별시 강남구 축산가공 M 60 서울특별시 강남구 축산가공\n",
+ "1 ìœ ì œí’ˆ M 60 ê²½ê¸°ë„ ì¶•ì‚°ê°€ê³µ M 60 ê²½ê¸°ë„ ì¶•ì‚°ê°€ê³µ\n",
+ "2 ìœ ì œí’ˆ F 60 서울특별시 ë…¸ì›êµ¬ 축산가공 F 60 서울특별시 ë…¸ì›êµ¬ 축산가공\n",
+ "3 ìœ ì œí’ˆ F 60 ê°•ì›ë„ 축산가공 F 60 ê°•ì›ë„ 축산가공\n",
+ "4 ìœ ì œí’ˆ F 60 서울특별시 서대문구 축산가공 F 60 서울특별시 서대문구 축산가공\n",
+ "... ... .. .. ... ... ...\n",
+ "833889 기타íƒêµ¬ìš©í’ˆ F 40 ì „ë¼ë‚¨ë„ ë ˆì €ì·¨ë¯¸ F 40 ì „ë¼ë‚¨ë„ ë ˆì €ì·¨ë¯¸\n",
+ "833890 ì—¬ì„±ë°œê°€ë½ F 50 ê²½ê¸°ë„ ì—¬ì„±ì–‘ë§ F 50 ê²½ê¸°ë„ ì—¬ì„±ì–‘ë§\n",
+ "833891 페스츄리류 M 50 ê²½ê¸°ë„ ë² ì´ì»¤ë¦¬ M 50 ê²½ê¸°ë„ ë² ì´ì»¤ë¦¬\n",
+ "833892 기타한방약재 F 50 ì „ë¼ë¶ë„ 근채류 F 50 ì „ë¼ë¶ë„ 근채류\n",
+ "833893 컵아ì´ìŠ¤í¬ë¦¼ F 25 ì¸ì²œê´‘ì—ì‹œ ê³¼ìž F 25 ì¸ì²œê´‘ì—ì‹œ ê³¼ìž\n",
+ "\n",
+ "[833894 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "def find_sim_movie(df, sorted_ind, title_name, top_n=10):\n",
- " \n",
- " # ì¸ìžë¡œ ìž…ë ¥ëœ movies_df DataFrameì—ì„œ 'title' ì»¬ëŸ¼ì´ ìž…ë ¥ëœ title_name ê°’ì¸ DataFrame추출\n",
- " title_movie = df[df['title'] == title_name]\n",
- " \n",
- " # title_namedì„ ê°€ì§„ DataFrameì˜ index ê°ì²´ë¥¼ ndarrayë¡œ ë°˜í™˜í•˜ê³ \n",
- " # sorted_ind ì¸ìžë¡œ ìž…ë ¥ëœ genre_sim_sorted_ind ê°ì²´ì—ì„œ ìœ ì‚¬ë„ ìˆœìœ¼ë¡œ top_n ê°œì˜ index 추출\n",
- " title_index = title_movie.index.values\n",
- " similar_indexes = sorted_ind[title_index, :(top_n)]\n",
- " \n",
- " # ì¶”ì¶œëœ top_n index들 ì¶œë ¥. top_n index는 2ì°¨ì› ë°ì´í„° ìž„. \n",
- " #dataframeì—ì„œ indexë¡œ 사용하기 위해서 1ì°¨ì› arrayë¡œ 변경\n",
- " print(similar_indexes)\n",
- " similar_indexes = similar_indexes.reshape(-1)\n",
- " print(similar_indexes)\n",
- " \n",
- " return df.iloc[similar_indexes]\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "similar_movies = find_sim_movie(pp_reco_df, item_sim_sorted_ind, 1 ,10)\n",
- "similar_movies[['title', 'vote_average']]"
+ "pp_reco_df"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "516874af",
- "metadata": {},
- "outputs": [],
+ "execution_count": 95,
+ "id": "a45a9852",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-10-27T11:59:47.826597Z",
+ "start_time": "2022-10-27T11:59:47.744785Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Top 10 추천 ì•„ì´í…œ 리스트\n",
+ "ì¢…ëŸ‰ì œë´‰íˆ¬ : 0.21711009996111366\n",
+ "ë‘부류 : 0.20415478729479797\n",
+ "ì¼ë°˜ìš°ìœ : 0.2015140544313135\n",
+ "ì¼ë°˜ì†Œì£¼ : 0.2002306990452087\n",
+ "공병/공박스 : 0.19890586511587782\n",
+ "ì¼ë°˜ìŠ¤ë‚µ : 0.17264869170813266\n",
+ "ì¼ë°˜í°ìš°ìœ : 0.1701977942065114\n",
+ "ì£¼ìœ ì†Œ : 0.16094399710554957\n",
+ "êµì‚°ë§¥ì£¼ : 0.16034588071534042\n",
+ "수입담배 : 0.15426302258444086\n"
+ ]
+ }
+ ],
"source": [
- "# í‰ê°€ 횟수가 ì ì€ ì˜í™”ë¡œ ì¸í•˜ì—¬ ì˜ë¯¸ 없는 목ë¡ì´ 나올 수 있다.\n",
- "movies_df[['title','vote_average','vote_count']].sort_values('vote_average', ascending=False)[:10]\n",
- "\n",
- "\n",
- "\n",
- "# ëª¨ë“ ì˜í™”ì˜ í‰ì í‰ê· = C\n",
- "C = movies_df['vote_average'].mean()\n",
- "# ê³ ê° í‰ê°€ 수가 ìƒìœ„ 60%ì¸ ì˜í™”ì˜ ê³ ê° í‰ê°€ 수 = m\n",
- "m = movies_df['vote_count'].quantile(0.6) # 분위수\n",
- "# DBì— ìžˆëŠ” ì˜í™” ì´ íŽ¸ 수 : total_movie\n",
- "total_movie = len(movies_df)\n",
- "print('C:',round(C,3), 'm:',round(m,3), 'total_movie:',total_movie)\n",
- "\n",
- "\n",
- "\n",
- "percentile = 0.6\n",
- "m = movies_df['vote_count'].quantile(percentile) # ê³ ê° í‰ê°€ 수가 ìƒìœ„ 60%ì¸ ì˜í™”ì˜ ê³ ê° í‰ê°€ 수\n",
- "C = movies_df['vote_average'].mean() # ëª¨ë“ ì˜í™”ì˜ í‰ì í‰ê· \n",
- "\n",
- "def weighted_vote_average(record):\n",
- " v = record['vote_count'] # 해당 ì˜í™”ì˜ ê³ ê° í‰ê°€ 수 \n",
- " R = record['vote_average'] # 해당 ì˜í™”ì˜ í‰ê· í‰ì \n",
+ "def recomm_item_by_surprise(algo, userId, top_n=10):\n",
" \n",
- " # 해당 ì˜í™”ì— ê³ ê° í‰ê°€ 수가 많ì€ë§Œí¼ 해당 ì˜í™” í‰ì ì— ê°€ì‚°ì ì„ ì£¼ê³ ,\n",
- " # 해당 ì˜í™”ì˜ ê³ ê° í‰ê°€ 수가 ì 으면 ì „ì²´ ì˜í™” í‰ì ì— ê°ì ì„ ì¤€ë‹¤.\n",
- " return ( (v/(v+m)) * R ) + ( (m/(m+v)) * C )\n",
+ "    \"\"\"Return the top_n sub-categories with the highest predicted rating for one user.\n",
+ "\n",
+ "    Every distinct value of pp_reco_df['소분류명'] is scored with\n",
+ "    algo.predict(str(userId), str(item)) and ranked by the estimate (.est).\n",
+ "\n",
+ "    Args:\n",
+ "        algo: object exposing predict(uid, iid) whose results carry .iid and .est\n",
+ "            (presumably a fitted Surprise algorithm -- TODO confirm against the training cell).\n",
+ "        userId: user identifier; converted with str() before prediction.\n",
+ "        top_n: maximum number of recommendations returned (default 10).\n",
+ "\n",
+ "    Returns:\n",
+ "        list of (item_id, item_title, estimated_rating) tuples, best estimate first.\n",
+ "    \"\"\"\n",
" \n",
- "# 가충치를 부여한 ì 수 컬럼 추가\n",
- "movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "movies_df[['title','vote_average','weighted_vote','vote_count']].sort_values('weighted_vote',\n",
- " ascending=False)[:10]\n",
- "\n",
- "\n",
- "\n",
- "def find_sim_movie(df, sorted_ind, title_name, top_n=10):\n",
- " title_movie = df[df['title'] == title_name]\n",
- " title_index = title_movie.index.values\n",
- " print(title_movie.index)\n",
- " print(title_index)\n",
+ "    # Score every distinct sub-category for this user.\n",
+ "    predictions = [algo.predict(str(userId), str(s_cat)) for s_cat in pp_reco_df['소분류명'].unique()]\n",
" \n",
- " # top_nì˜ 2ë°°ì— í•´ë‹¹í•˜ëŠ” ìŸë¥´ ìœ ì‚¬ì„±ì´ ë†’ì€ index 추출 \n",
- " similar_indexes = sorted_ind[title_index, :(top_n*2)]\n",
- " print(similar_indexes)\n",
- " similar_indexes = similar_indexes.reshape(-1)\n",
- " print(similar_indexes)\n",
- "# 기준 ì˜í™” index는 ì œì™¸\n",
- " similar_indexes = similar_indexes[similar_indexes != title_index]\n",
+ "    # Stable descending sort on the estimate, then keep the best top_n.\n",
+ "    top_predictions = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:top_n]\n",
" \n",
- " # top_nì˜ 2ë°°ì— í•´ë‹¹í•˜ëŠ” 후보군ì—ì„œ weighted_vote ë†’ì€ ìˆœìœ¼ë¡œ top_n ë§Œí¼ ì¶”ì¶œ \n",
- " return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]\n",
+ "    # The item id is the sub-category name itself (it is what predict() was called with),\n",
+ "    # so it doubles as the title. Building each tuple from ONE prediction keeps id, title\n",
+ "    # and rating aligned; looking titles up in pp_reco_df returned them in frame order,\n",
+ "    # not rank order, so zip() paired titles with other items' ratings.\n",
+ "    return [(str(pred.iid), str(pred.iid), pred.est) for pred in top_predictions]\n",
"\n",
- "similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather',10)\n",
- "similar_movies[['title', 'vote_average', 'weighted_vote']]"
+ "# Demo: request the ten best-scoring sub-categories for user 1200 and list them.\n",
+ "top_item_preds = recomm_item_by_surprise(algo, 1200, top_n=10)\n",
+ "\n",
+ "print(' Top 10 추천 ì•„ì´í…œ 리스트')\n",
+ "for _item_id, item_title, est_rating in top_item_preds:\n",
+ "    print(item_title, ':', est_rating)"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "ae750ca0",
+ "cell_type": "markdown",
+ "id": "f3ef7e54",
"metadata": {},
- "outputs": [],
- "source": []
+ "source": [
+ "## 추천 알고리즘"
+ ]
}
],
"metadata": {
@@ -4083,7 +4707,7 @@
"height": "772px",
"left": "56px",
"top": "91.328125px",
- "width": "259px"
+ "width": "258.970581px"
},
"toc_section_display": true,
"toc_window_display": true
|