From 3376901afcba776f11478590dbcb2c597dd9740b Mon Sep 17 00:00:00 2001 From: Max Date: Tue, 14 Jan 2020 09:38:06 +0100 Subject: [PATCH] avoid using rigid DataFrames; use lists for evaluation results instead --- notebooks/example_evaluation.ipynb | 332 ++++++++++++++--------------- src/justcause/evaluation.py | 73 ++----- tests/test_evaluation.py | 52 ++--- 3 files changed, 196 insertions(+), 261 deletions(-) diff --git a/notebooks/example_evaluation.ipynb b/notebooks/example_evaluation.ipynb index 08ee03e..6a2db56 100644 --- a/notebooks/example_evaluation.ipynb +++ b/notebooks/example_evaluation.ipynb @@ -4,9 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Evaluation with `justcause`\n", + "# Evaluation with JustCause\n", "\n", - "In this notebook, we examplify how to use `justcause` in order to evaluate methods using reference datasets. For simplicity, we only use one dataset, but show how evaluation works with multiple methods. Both standard causal methods implemented in the framework as well as custom methods. \n" + "In this notebook, we examplify how to use JustCause in order to evaluate methods using reference datasets. For simplicity, we only use one dataset, but show how evaluation works with multiple methods. Both standard causal methods implemented in the framework as well as custom methods. \n" ] }, { @@ -14,12 +14,12 @@ "metadata": {}, "source": [ "## Custom First\n", - "The goal of the `justcause` framework is to be a modular and flexible facilitator of causal evaluation." + "The goal of the JustCause framework is to be a modular and flexible facilitator of causal evaluation." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -39,12 +39,12 @@ "# Loading all required packages \n", "import itertools\n", "import numpy as np\n", + "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from justcause.data import Col\n", "from justcause.data.sets import load_ihdp\n", "from justcause.metrics import pehe_score, mean_absolute\n", - "from justcause.evaluation import setup_result_df, setup_scores_df, calc_scores, \\\n", - " summarize_scores\n", + "from justcause.evaluation import calc_scores, summarize_scores\n", "\n", "from sklearn.linear_model import LinearRegression" ] @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -111,14 +111,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "results_df = setup_result_df(metrics)\n", + "results_df = list()\n", " \n", - "test_scores = setup_scores_df(metrics)\n", - "train_scores = setup_scores_df(metrics)\n", + "test_scores = list()\n", + "train_scores = list()\n", "\n", "for rep in replications:\n", "\n", @@ -130,26 +130,18 @@ " train_ite, test_ite = weighted_slearner(train, test)\n", "\n", " # Calculate the scores and append them to a dataframe\n", - " test_scores = test_scores.append(calc_scores(\n", - " test[Col.ite], test_ite, metrics\n", - " ), ignore_index=True)\n", - "\n", - " train_scores = train_scores.append(calc_scores(\n", - " train[Col.ite], train_ite, metrics\n", - " ), ignore_index=True)\n", + " train_scores.append(calc_scores(train[Col.ite], train_ite, metrics))\n", + " test_scores.append(calc_scores(test[Col.ite], test_ite, metrics))\n", "\n", "# Summarize the scores and save them in a dataframe\n", "train_result, test_result = summarize_scores(train_scores), 
summarize_scores(test_scores)\n", "train_result.update({'method': 'weighted_slearner', 'train': True})\n", - "test_result.update({'method': 'weighted_slearner', 'train': False})\n", - "\n", - "results_df = results_df.append(train_result, ignore_index=True)\n", - "results_df = results_df.append(test_result, ignore_index=True)" + "test_result.update({'method': 'weighted_slearner', 'train': False})\n" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -173,60 +165,60 @@ " \n", " \n", " \n", - " method\n", - " train\n", " pehe_score-mean\n", " pehe_score-median\n", " pehe_score-std\n", " mean_absolute-mean\n", " mean_absolute-median\n", " mean_absolute-std\n", + " method\n", + " train\n", " \n", " \n", " \n", " \n", " 0\n", - " weighted_slearner\n", - " True\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " True\n", " \n", " \n", " 1\n", - " weighted_slearner\n", - " False\n", " 5.493401\n", " 2.589651\n", " 7.903174\n", " 0.655602\n", " 0.287201\n", " 0.941941\n", + " weighted_slearner\n", + " False\n", " \n", " \n", "\n", "" ], "text/plain": [ - " method train pehe_score-mean pehe_score-median \\\n", - "0 weighted_slearner True 5.592356 2.569472 \n", - "1 weighted_slearner False 5.493401 2.589651 \n", + " pehe_score-mean pehe_score-median pehe_score-std mean_absolute-mean \\\n", + "0 5.592356 2.569472 8.248291 0.369939 \n", + "1 5.493401 2.589651 7.903174 0.655602 \n", "\n", - " pehe_score-std mean_absolute-mean mean_absolute-median mean_absolute-std \n", - "0 8.248291 0.369939 0.212427 0.524395 \n", - "1 7.903174 0.655602 0.287201 0.941941 " + " mean_absolute-median mean_absolute-std method train \n", + "0 0.212427 0.524395 weighted_slearner True \n", + "1 0.287201 0.941941 weighted_slearner False " ] }, - "execution_count": 14, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "results_df" + "pd.DataFrame([train_result, test_result])" ] }, { @@ -238,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -256,12 +248,12 @@ "\n", "methods = [basic_slearner, weighted_slearner]\n", "\n", - "results_df = setup_result_df(metrics)\n", + "results = list()\n", "\n", "for method in methods:\n", " \n", - " test_scores = setup_scores_df(metrics)\n", - " train_scores = setup_scores_df(metrics)\n", + " test_scores = list()\n", + " train_scores = list()\n", "\n", " for rep in replications:\n", "\n", @@ -273,26 +265,21 @@ " train_ite, test_ite = method(train, test)\n", "\n", " # Calculate the scores and append them to a dataframe\n", - " test_scores = test_scores.append(calc_scores(\n", - " test[Col.ite], test_ite, metrics\n", - " ), ignore_index=True)\n", - "\n", - " train_scores = train_scores.append(calc_scores(\n", - " train[Col.ite], train_ite, metrics\n", - " ), ignore_index=True)\n", + " test_scores.append(calc_scores(test[Col.ite], test_ite, metrics))\n", + " train_scores.append(calc_scores(train[Col.ite], train_ite, metrics))\n", "\n", " # Summarize the scores and save them in a dataframe\n", " train_result, test_result = summarize_scores(train_scores), summarize_scores(test_scores)\n", " train_result.update({'method': method.__name__, 'train': True})\n", " test_result.update({'method': method.__name__, 'train': False})\n", "\n", - " results_df = results_df.append(train_result, ignore_index=True)\n", - " results_df = results_df.append(test_result, 
ignore_index=True)" + " results.append(train_result)\n", + " results.append(test_result)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -316,86 +303,87 @@ " \n", " \n", " \n", - " method\n", - " train\n", " pehe_score-mean\n", " pehe_score-median\n", " pehe_score-std\n", " mean_absolute-mean\n", " mean_absolute-median\n", " mean_absolute-std\n", + " method\n", + " train\n", " \n", " \n", " \n", " \n", " 0\n", - " weighted_slearner\n", - " True\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " basic_slearner\n", + " True\n", " \n", " \n", " 1\n", - " weighted_slearner\n", - " False\n", " 5.625971\n", " 2.635993\n", " 8.213626\n", " 1.292668\n", " 0.396246\n", " 2.474603\n", + " basic_slearner\n", + " False\n", " \n", " \n", " 2\n", - " weighted_slearner\n", - " True\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " True\n", " \n", " \n", " 3\n", - " weighted_slearner\n", - " False\n", " 5.493401\n", " 2.589651\n", " 7.903174\n", " 0.655602\n", " 0.287201\n", " 0.941941\n", + " weighted_slearner\n", + " False\n", " \n", " \n", "\n", "" ], "text/plain": [ - " method train pehe_score-mean pehe_score-median \\\n", - "0 weighted_slearner True 5.633660 2.623297 \n", - "1 weighted_slearner False 5.625971 2.635993 \n", - "2 weighted_slearner True 5.592356 2.569472 \n", - "3 weighted_slearner False 5.493401 2.589651 \n", + " pehe_score-mean pehe_score-median pehe_score-std mean_absolute-mean \\\n", + "0 5.633660 2.623297 8.362125 0.732443 \n", + "1 5.625971 2.635993 8.213626 1.292668 \n", + "2 5.592356 2.569472 8.248291 0.369939 \n", + "3 5.493401 2.589651 7.903174 0.655602 \n", "\n", - " pehe_score-std mean_absolute-mean mean_absolute-median mean_absolute-std \n", - "0 8.362125 0.732443 0.238185 1.493276 \n", - "1 8.213626 1.292668 0.396246 2.474603 \n", - "2 8.248291 0.369939 0.212427 0.524395 \n", - "3 7.903174 0.655602 0.287201 0.941941 " + " mean_absolute-median mean_absolute-std method train \n", + "0 0.238185 1.493276 basic_slearner True \n", + "1 0.396246 2.474603 basic_slearner False \n", + "2 0.212427 0.524395 weighted_slearner True \n", + "3 0.287201 0.941941 weighted_slearner False " ] }, - "execution_count": 16, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "results_df" + "# For visualization\n", + "pd.DataFrame(results)" ] }, { @@ -421,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -432,7 +420,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -456,86 +444,86 @@ " \n", " \n", " \n", - " method\n", - " train\n", " pehe_score-mean\n", " pehe_score-median\n", " pehe_score-std\n", " mean_absolute-mean\n", " mean_absolute-median\n", " mean_absolute-std\n", + " method\n", + " train\n", " \n", " \n", " \n", " \n", " 0\n", - " basic_slearner\n", - " True\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " basic_slearner\n", + " True\n", " \n", " \n", " 1\n", - " basic_slearner\n", - " False\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " basic_slearner\n", + " False\n", " \n", " \n", " 2\n", - " weighted_slearner\n", - " True\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " 
weighted_slearner\n", + " True\n", " \n", " \n", " 3\n", - " weighted_slearner\n", - " False\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " False\n", " \n", " \n", "\n", "" ], "text/plain": [ - " method train pehe_score-mean pehe_score-median \\\n", - "0 basic_slearner True 5.633660 2.623297 \n", - "1 basic_slearner False 5.633660 2.623297 \n", - "2 weighted_slearner True 5.592356 2.569472 \n", - "3 weighted_slearner False 5.592356 2.569472 \n", + " pehe_score-mean pehe_score-median pehe_score-std mean_absolute-mean \\\n", + "0 5.633660 2.623297 8.362125 0.732443 \n", + "1 5.633660 2.623297 8.362125 0.732443 \n", + "2 5.592356 2.569472 8.248291 0.369939 \n", + "3 5.592356 2.569472 8.248291 0.369939 \n", "\n", - " pehe_score-std mean_absolute-mean mean_absolute-median mean_absolute-std \n", - "0 8.362125 0.732443 0.238185 1.493276 \n", - "1 8.362125 0.732443 0.238185 1.493276 \n", - "2 8.248291 0.369939 0.212427 0.524395 \n", - "3 8.248291 0.369939 0.212427 0.524395 " + " mean_absolute-median mean_absolute-std method train \n", + "0 0.238185 1.493276 basic_slearner True \n", + "1 0.238185 1.493276 basic_slearner False \n", + "2 0.212427 0.524395 weighted_slearner True \n", + "3 0.212427 0.524395 weighted_slearner False " ] }, - "execution_count": 20, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "result" + "pd.DataFrame(result)" ] }, { @@ -550,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -564,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -588,176 +576,176 @@ " \n", " \n", " \n", - " method\n", - " train\n", " pehe_score-mean\n", " pehe_score-median\n", " pehe_score-std\n", " mean_absolute-mean\n", " mean_absolute-median\n", " mean_absolute-std\n", + " method\n", + " train\n", " \n", " \n", " \n", " \n", " 0\n", - " SLearner(learner=LinearRegression)\n", - " True\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " SLearner(learner=LinearRegression)\n", + " True\n", " \n", " \n", " 1\n", - " SLearner(learner=LinearRegression)\n", - " False\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " SLearner(learner=LinearRegression)\n", + " False\n", " \n", " \n", " 2\n", - " weighted_slearner\n", - " True\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " True\n", " \n", " \n", " 3\n", - " weighted_slearner\n", - " False\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " False\n", " \n", " \n", " 4\n", - " TLearner(control=LassoLars, treated=LassoLars)\n", - " True\n", " 5.572626\n", " 2.543798\n", " 8.213573\n", " 0.293187\n", " 0.166370\n", " 0.428028\n", + " TLearner(control=LassoLars, treated=LassoLars)\n", + " True\n", " \n", " \n", " 5\n", - " TLearner(control=LassoLars, treated=LassoLars)\n", - " False\n", " 5.572626\n", " 2.543798\n", " 8.213573\n", " 0.293187\n", " 0.166370\n", " 0.428028\n", + " TLearner(control=LassoLars, treated=LassoLars)\n", + " False\n", " \n", " \n", " 6\n", - " XLearner(outcome_c=LassoLars, outcome_t=LassoL...\n", - " True\n", - " 5.579285\n", + " 5.579297\n", " 2.543798\n", - " 8.240606\n", - " 0.289699\n", + " 8.240655\n", + " 0.289592\n", " 
0.166370\n", - " 0.427008\n", + " 0.427021\n", + " XLearner(outcome_c=LassoLars, outcome_t=LassoL...\n", + " True\n", " \n", " \n", " 7\n", - " XLearner(outcome_c=LassoLars, outcome_t=LassoL...\n", - " False\n", - " 5.579285\n", + " 5.579297\n", " 2.543798\n", - " 8.240606\n", - " 0.289699\n", + " 8.240655\n", + " 0.289592\n", " 0.166370\n", - " 0.427008\n", + " 0.427021\n", + " XLearner(outcome_c=LassoLars, outcome_t=LassoL...\n", + " False\n", " \n", " \n", " 8\n", + " 2.637110\n", + " 1.277486\n", + " 3.824333\n", + " 0.234029\n", + " 0.196398\n", + " 0.206225\n", " RLearner(outcome=LinearRegression, effect=Line...\n", " True\n", - " 2.560234\n", - " 1.221982\n", - " 3.731162\n", - " 0.253945\n", - " 0.152045\n", - " 0.283504\n", " \n", " \n", " 9\n", + " 2.637110\n", + " 1.277486\n", + " 3.824333\n", + " 0.234029\n", + " 0.196398\n", + " 0.206225\n", " RLearner(outcome=LinearRegression, effect=Line...\n", " False\n", - " 2.560234\n", - " 1.221982\n", - " 3.731162\n", - " 0.253945\n", - " 0.152045\n", - " 0.283504\n", " \n", " \n", "\n", "" ], "text/plain": [ - " method train pehe_score-mean \\\n", - "0 SLearner(learner=LinearRegression) True 5.633660 \n", - "1 SLearner(learner=LinearRegression) False 5.633660 \n", - "2 weighted_slearner True 5.592356 \n", - "3 weighted_slearner False 5.592356 \n", - "4 TLearner(control=LassoLars, treated=LassoLars) True 5.572626 \n", - "5 TLearner(control=LassoLars, treated=LassoLars) False 5.572626 \n", - "6 XLearner(outcome_c=LassoLars, outcome_t=LassoL... True 5.579285 \n", - "7 XLearner(outcome_c=LassoLars, outcome_t=LassoL... False 5.579285 \n", - "8 RLearner(outcome=LinearRegression, effect=Line... True 2.560234 \n", - "9 RLearner(outcome=LinearRegression, effect=Line... False 2.560234 \n", + " pehe_score-mean pehe_score-median pehe_score-std mean_absolute-mean \\\n", + "0 5.633660 2.623297 8.362125 0.732443 \n", + "1 5.633660 2.623297 8.362125 0.732443 \n", + "2 5.592356 2.569472 8.248291 0.369939 \n", + "3 5.592356 2.569472 8.248291 0.369939 \n", + "4 5.572626 2.543798 8.213573 0.293187 \n", + "5 5.572626 2.543798 8.213573 0.293187 \n", + "6 5.579297 2.543798 8.240655 0.289592 \n", + "7 5.579297 2.543798 8.240655 0.289592 \n", + "8 2.637110 1.277486 3.824333 0.234029 \n", + "9 2.637110 1.277486 3.824333 0.234029 \n", "\n", - " pehe_score-median pehe_score-std mean_absolute-mean \\\n", - "0 2.623297 8.362125 0.732443 \n", - "1 2.623297 8.362125 0.732443 \n", - "2 2.569472 8.248291 0.369939 \n", - "3 2.569472 8.248291 0.369939 \n", - "4 2.543798 8.213573 0.293187 \n", - "5 2.543798 8.213573 0.293187 \n", - "6 2.543798 8.240606 0.289699 \n", - "7 2.543798 8.240606 0.289699 \n", - "8 1.221982 3.731162 0.253945 \n", - "9 1.221982 3.731162 0.253945 \n", + " mean_absolute-median mean_absolute-std \\\n", + "0 0.238185 1.493276 \n", + "1 0.238185 1.493276 \n", + "2 0.212427 0.524395 \n", + "3 0.212427 0.524395 \n", + "4 0.166370 0.428028 \n", + "5 0.166370 0.428028 \n", + "6 0.166370 0.427021 \n", + "7 0.166370 0.427021 \n", + "8 0.196398 0.206225 \n", + "9 0.196398 0.206225 \n", "\n", - " mean_absolute-median mean_absolute-std \n", - "0 0.238185 1.493276 \n", - "1 0.238185 1.493276 \n", - "2 0.212427 0.524395 \n", - "3 0.212427 0.524395 \n", - "4 0.166370 0.428028 \n", - "5 0.166370 0.428028 \n", - "6 0.166370 0.427008 \n", - "7 0.166370 0.427008 \n", - "8 0.152045 0.283504 \n", - "9 0.152045 0.283504 " + " method train \n", + "0 SLearner(learner=LinearRegression) True \n", + "1 SLearner(learner=LinearRegression) False \n", + "2 weighted_slearner True 
\n", + "3 weighted_slearner False \n", + "4 TLearner(control=LassoLars, treated=LassoLars) True \n", + "5 TLearner(control=LassoLars, treated=LassoLars) False \n", + "6 XLearner(outcome_c=LassoLars, outcome_t=LassoL... True \n", + "7 XLearner(outcome_c=LassoLars, outcome_t=LassoL... False \n", + "8 RLearner(outcome=LinearRegression, effect=Line... True \n", + "9 RLearner(outcome=LinearRegression, effect=Line... False " ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "result" + "pd.DataFrame(result)" ] }, { diff --git a/src/justcause/evaluation.py b/src/justcause/evaluation.py index 7331239..82f6e82 100644 --- a/src/justcause/evaluation.py +++ b/src/justcause/evaluation.py @@ -18,8 +18,6 @@ Format = Callable[[Union[np.array, List[np.array]]], Union[float, List[float]]] Frame = Union[CausalFrame, pd.DataFrame] -STD_COL = ["method", "train"] - METHOD = "method" TRAIN = "train" @@ -33,35 +31,6 @@ def format_metric(metric, form): return "{}-{}".format(metric_string, form.__name__) -def setup_scores_df(metrics: Union[List[Metric], Metric]): - """Setup DataFrame containing the metric scores for all replications - - Args: - metrics: metrics used for naming the columns - - Returns: DataFrame to store the scores for each replication - """ - cols = [metric.__name__ for metric in metrics] - return pd.DataFrame(columns=cols) - - -def setup_result_df( - metrics: Union[List[Metric], Metric], formats=(np.mean, np.median, np.std) -): - """Setup DataFrame containing the summarized scores for all methods and datasets - - Args: - metrics: metrics used for scoring - formats: formats for summarizing metrics (e.g. mean, std, ...) - - Returns: DataFrame to store the results for each method - """ - cols = STD_COL + [ - format_metric(metric, form) for metric in metrics for form in formats - ] - return pd.DataFrame(columns=cols) - - def evaluate_ite( replications: Union[CausalFrame, List[CausalFrame]], methods, @@ -69,7 +38,7 @@ def evaluate_ite( formats: Union[List[Format], Format] = (np.mean, np.median, np.std), train_size: float = 0.8, random_state: Optional[RandomState] = None, -) -> pd.DataFrame: +) -> List[dict]: """Evaluate methods with multiple metrics on a given set of replications Good for use with standard causal methods and callables on new datasets. @@ -99,7 +68,7 @@ def evaluate_ite( if not isinstance(replications, list): replications = [replications] - results_df = setup_result_df(metrics, formats) + results = list() for method in methods: @@ -116,10 +85,10 @@ def evaluate_ite( train_result.update({METHOD: name, TRAIN: True}) test_result.update({METHOD: name, TRAIN: False}) - results_df = results_df.append(train_result, ignore_index=True) - results_df = results_df.append(test_result, ignore_index=True) + results.append(train_result) + results.append(test_result) - return results_df + return results def _evaluate_single_method( @@ -129,13 +98,17 @@ def _evaluate_single_method( formats=(np.mean, np.median, np.std), train_size=0.8, random_state=None, -): +) -> Tuple[dict, dict]: """Helper to evaluate method with multiple metrics on the given replications. This is the standard variant of an evaluation loop, which the user can implement manually to modify parts of it. Here, only ITE prediction and evaluation is considered. 
+    Returns:
+        a tuple of two dicts mapping score names to scores,
+        summarized over all replications for train and test respectively
+
     """
     if not isinstance(metrics, list):
         metrics = [metrics]
@@ -143,8 +116,8 @@
     if not isinstance(replications, list):
         replications = [replications]
 
-    test_scores = setup_scores_df(metrics)
-    train_scores = setup_scores_df(metrics)
+    train_scores = list()
+    test_scores = list()
 
     for rep in replications:
         train, test = train_test_split(
@@ -156,13 +129,8 @@
         else:
             train_ite, test_ite = default_predictions(method, train, test)
 
-        test_scores = test_scores.append(
-            calc_scores(test[Col.ite], test_ite, metrics), ignore_index=True
-        )
-
-        train_scores = train_scores.append(
-            calc_scores(train[Col.ite], train_ite, metrics), ignore_index=True
-        )
+        train_scores.append(calc_scores(train[Col.ite], train_ite, metrics))
+        test_scores.append(calc_scores(test[Col.ite], test_ite, metrics))
 
     train_results = summarize_scores(train_scores, formats)
     test_results = summarize_scores(train_scores, formats)
@@ -221,7 +189,7 @@ def default_predictions(
 
 
 def summarize_scores(
-    scores_df: pd.DataFrame,
+    scores: Union[pd.DataFrame, List[dict]],
     formats: Union[List[Format], Format] = (np.mean, np.median, np.std),
 ) -> np.array:
     """
@@ -229,15 +197,18 @@
     Call for train and test separately
 
     Args:
-        scores_df: the dataframe containing scores for all replications
+        scores: list of dicts or DataFrame containing the scores for all replications
         formats: Summaries to calculate over the scores of multiple replications
 
-    Returns: The rows to be added to the result dataframe
+    Returns:
+        dict: a dictionary mapping summarized metric names to their values
 
     """
+    # make sure we're dealing with a pd.DataFrame
+    df = pd.DataFrame(scores)
     dict_of_results = {
-        format_metric(metric, form): form(scores_df[metric])
-        for metric in scores_df.columns
+        format_metric(metric, form): form(df[metric])
+        for metric in df.columns
         for form in formats
     }
     return dict_of_results
diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py
index 9030eea..0397e5f 100644
--- a/tests/test_evaluation.py
+++ b/tests/test_evaluation.py
@@ -4,30 +4,29 @@
 import pandas as pd
 from sklearn.linear_model import LinearRegression
 
-from justcause.evaluation import (
-    calc_scores,
-    evaluate_ite,
-    setup_result_df,
-    setup_scores_df,
-    summarize_scores,
-)
+from justcause.evaluation import calc_scores, evaluate_ite, summarize_scores
 from justcause.learners import SLearner
-from justcause.metrics import enormse, pehe_score
+from justcause.metrics import pehe_score
 
 
 def test_single_evaluation(ihdp_data):
     reps = list(islice(ihdp_data, 10))
     learner = SLearner(LinearRegression())
-    df = evaluate_ite(reps, learner, pehe_score, train_size=0.8)
-    assert len(df) == 2
-    assert len(df.columns) == 5  # 2 standard + 3 formats for one metric
-    assert "pehe_score-mean" in df.columns  # three format per metric are reported
+    result = evaluate_ite(reps, learner, pehe_score, train_size=0.8)
+    row = result[0]
+    assert len(result) == 2
+    assert len(row) == 5  # 2 standard + 3 formats for one metric
+    assert "pehe_score-mean" in row.keys()  # three formats per metric are reported
 
 
 def test_summary():
-    metrics = [pehe_score]
-    df = setup_scores_df(metrics)
-    df["pehe_score"] = np.full(10, 1)
+    data = {"pehe_score": np.full(10, 1)}
+    summary = summarize_scores(data)
+    assert len(summary) == 3  # one metric summarized with 3 formats
+    assert summary["pehe_score-mean"] == 1
+
+    # Also works with a pd.DataFrame
+    df = 
pd.DataFrame(data) summary = summarize_scores(df) assert len(summary) == 3 # 5 pseudo-metrics times 3 formats assert summary["pehe_score-mean"] == 1 @@ -44,26 +43,3 @@ def test_calc_scores(): score_dict = calc_scores(true, pred, pehe_score) assert list(score_dict.values())[0] == 1 assert "pehe_score" in score_dict.keys() - - -def test_setup_df(): - - metrics = [pehe_score] - - df = setup_scores_df(metrics) - assert len(df.columns) == 1 - assert "pehe_score" in df.columns - - metrics = [pehe_score, enormse] - df = setup_scores_df(metrics) - assert len(df.columns) == 2 - assert "enormse" in df.columns - - result = setup_result_df(metrics) - assert len(result.columns) == 8 # 2 base + 3 for each metric - assert "pehe_score-mean" in result.columns - - formats = [np.mean, np.std] - result = setup_result_df(metrics, formats) - assert len(result.columns) == 6 # 2 base + 2 for each metric - assert "enormse-std" in result.columns
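
Usage sketch of the list-based API introduced by this patch, for reference. It assumes synthetic true/predicted ITE arrays in place of the IHDP replications used in the notebook, and `my_method` is only a placeholder label; `calc_scores`, `summarize_scores`, `pehe_score`, and the final `pd.DataFrame` call are the pieces touched by this change.

```python
import numpy as np
import pandas as pd

from justcause.evaluation import calc_scores, summarize_scores
from justcause.metrics import pehe_score

rng = np.random.RandomState(0)

scores = list()
for _ in range(5):  # stands in for the per-replication loop of the notebook
    true_ite = rng.normal(size=100)
    pred_ite = true_ite + rng.normal(scale=0.1, size=100)
    # calc_scores returns a plain dict of {metric_name: value}
    scores.append(calc_scores(true_ite, pred_ite, [pehe_score]))

# summarize_scores accepts the list of dicts directly (or a DataFrame)
result = summarize_scores(scores)
result.update({"method": "my_method", "train": False})

# a DataFrame is only materialized at the very end, for display
print(pd.DataFrame([result]))
```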