From 3376901afcba776f11478590dbcb2c597dd9740b Mon Sep 17 00:00:00 2001 From: Max Date: Tue, 14 Jan 2020 09:38:06 +0100 Subject: [PATCH] avoid using rigid DataFrames; use lists for evaluation results instead --- notebooks/example_evaluation.ipynb | 332 ++++++++++++++--------------- src/justcause/evaluation.py | 73 ++----- tests/test_evaluation.py | 52 ++--- 3 files changed, 196 insertions(+), 261 deletions(-) diff --git a/notebooks/example_evaluation.ipynb b/notebooks/example_evaluation.ipynb index 08ee03e..6a2db56 100644 --- a/notebooks/example_evaluation.ipynb +++ b/notebooks/example_evaluation.ipynb @@ -4,9 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Evaluation with `justcause`\n", + "# Evaluation with JustCause\n", "\n", - "In this notebook, we examplify how to use `justcause` in order to evaluate methods using reference datasets. For simplicity, we only use one dataset, but show how evaluation works with multiple methods. Both standard causal methods implemented in the framework as well as custom methods. \n" + "In this notebook, we examplify how to use JustCause in order to evaluate methods using reference datasets. For simplicity, we only use one dataset, but show how evaluation works with multiple methods. Both standard causal methods implemented in the framework as well as custom methods. \n" ] }, { @@ -14,12 +14,12 @@ "metadata": {}, "source": [ "## Custom First\n", - "The goal of the `justcause` framework is to be a modular and flexible facilitator of causal evaluation." + "The goal of the JustCause framework is to be a modular and flexible facilitator of causal evaluation." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -39,12 +39,12 @@ "# Loading all required packages \n", "import itertools\n", "import numpy as np\n", + "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from justcause.data import Col\n", "from justcause.data.sets import load_ihdp\n", "from justcause.metrics import pehe_score, mean_absolute\n", - "from justcause.evaluation import setup_result_df, setup_scores_df, calc_scores, \\\n", - " summarize_scores\n", + "from justcause.evaluation import calc_scores, summarize_scores\n", "\n", "from sklearn.linear_model import LinearRegression" ] @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -111,14 +111,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "results_df = setup_result_df(metrics)\n", + "results_df = list()\n", " \n", - "test_scores = setup_scores_df(metrics)\n", - "train_scores = setup_scores_df(metrics)\n", + "test_scores = list()\n", + "train_scores = list()\n", "\n", "for rep in replications:\n", "\n", @@ -130,26 +130,18 @@ " train_ite, test_ite = weighted_slearner(train, test)\n", "\n", " # Calculate the scores and append them to a dataframe\n", - " test_scores = test_scores.append(calc_scores(\n", - " test[Col.ite], test_ite, metrics\n", - " ), ignore_index=True)\n", - "\n", - " train_scores = train_scores.append(calc_scores(\n", - " train[Col.ite], train_ite, metrics\n", - " ), ignore_index=True)\n", + " train_scores.append(calc_scores(train[Col.ite], train_ite, metrics))\n", + " test_scores.append(calc_scores(test[Col.ite], test_ite, metrics))\n", "\n", "# Summarize the scores and save them in a dataframe\n", "train_result, test_result = summarize_scores(train_scores), 
summarize_scores(test_scores)\n", "train_result.update({'method': 'weighted_slearner', 'train': True})\n", - "test_result.update({'method': 'weighted_slearner', 'train': False})\n", - "\n", - "results_df = results_df.append(train_result, ignore_index=True)\n", - "results_df = results_df.append(test_result, ignore_index=True)" + "test_result.update({'method': 'weighted_slearner', 'train': False})\n" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -173,60 +165,60 @@ " \n", " \n", " \n", - " method\n", - " train\n", " pehe_score-mean\n", " pehe_score-median\n", " pehe_score-std\n", " mean_absolute-mean\n", " mean_absolute-median\n", " mean_absolute-std\n", + " method\n", + " train\n", " \n", " \n", " \n", " \n", " 0\n", - " weighted_slearner\n", - " True\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " True\n", " \n", " \n", " 1\n", - " weighted_slearner\n", - " False\n", " 5.493401\n", " 2.589651\n", " 7.903174\n", " 0.655602\n", " 0.287201\n", " 0.941941\n", + " weighted_slearner\n", + " False\n", " \n", " \n", "\n", "" ], "text/plain": [ - " method train pehe_score-mean pehe_score-median \\\n", - "0 weighted_slearner True 5.592356 2.569472 \n", - "1 weighted_slearner False 5.493401 2.589651 \n", + " pehe_score-mean pehe_score-median pehe_score-std mean_absolute-mean \\\n", + "0 5.592356 2.569472 8.248291 0.369939 \n", + "1 5.493401 2.589651 7.903174 0.655602 \n", "\n", - " pehe_score-std mean_absolute-mean mean_absolute-median mean_absolute-std \n", - "0 8.248291 0.369939 0.212427 0.524395 \n", - "1 7.903174 0.655602 0.287201 0.941941 " + " mean_absolute-median mean_absolute-std method train \n", + "0 0.212427 0.524395 weighted_slearner True \n", + "1 0.287201 0.941941 weighted_slearner False " ] }, - "execution_count": 14, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "results_df" + "pd.DataFrame([train_result, test_result])" ] }, { @@ -238,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -256,12 +248,12 @@ "\n", "methods = [basic_slearner, weighted_slearner]\n", "\n", - "results_df = setup_result_df(metrics)\n", + "results = list()\n", "\n", "for method in methods:\n", " \n", - " test_scores = setup_scores_df(metrics)\n", - " train_scores = setup_scores_df(metrics)\n", + " test_scores = list()\n", + " train_scores = list()\n", "\n", " for rep in replications:\n", "\n", @@ -273,26 +265,21 @@ " train_ite, test_ite = method(train, test)\n", "\n", " # Calculate the scores and append them to a dataframe\n", - " test_scores = test_scores.append(calc_scores(\n", - " test[Col.ite], test_ite, metrics\n", - " ), ignore_index=True)\n", - "\n", - " train_scores = train_scores.append(calc_scores(\n", - " train[Col.ite], train_ite, metrics\n", - " ), ignore_index=True)\n", + " test_scores.append(calc_scores(test[Col.ite], test_ite, metrics))\n", + " train_scores.append(calc_scores(train[Col.ite], train_ite, metrics))\n", "\n", " # Summarize the scores and save them in a dataframe\n", " train_result, test_result = summarize_scores(train_scores), summarize_scores(test_scores)\n", " train_result.update({'method': method.__name__, 'train': True})\n", " test_result.update({'method': method.__name__, 'train': False})\n", "\n", - " results_df = results_df.append(train_result, ignore_index=True)\n", - " results_df = results_df.append(test_result, 
ignore_index=True)" + " results.append(train_result)\n", + " results.append(test_result)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -316,86 +303,87 @@ " \n", " \n", " \n", - " method\n", - " train\n", " pehe_score-mean\n", " pehe_score-median\n", " pehe_score-std\n", " mean_absolute-mean\n", " mean_absolute-median\n", " mean_absolute-std\n", + " method\n", + " train\n", " \n", " \n", " \n", " \n", " 0\n", - " weighted_slearner\n", - " True\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " basic_slearner\n", + " True\n", " \n", " \n", " 1\n", - " weighted_slearner\n", - " False\n", " 5.625971\n", " 2.635993\n", " 8.213626\n", " 1.292668\n", " 0.396246\n", " 2.474603\n", + " basic_slearner\n", + " False\n", " \n", " \n", " 2\n", - " weighted_slearner\n", - " True\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " True\n", " \n", " \n", " 3\n", - " weighted_slearner\n", - " False\n", " 5.493401\n", " 2.589651\n", " 7.903174\n", " 0.655602\n", " 0.287201\n", " 0.941941\n", + " weighted_slearner\n", + " False\n", " \n", " \n", "\n", "" ], "text/plain": [ - " method train pehe_score-mean pehe_score-median \\\n", - "0 weighted_slearner True 5.633660 2.623297 \n", - "1 weighted_slearner False 5.625971 2.635993 \n", - "2 weighted_slearner True 5.592356 2.569472 \n", - "3 weighted_slearner False 5.493401 2.589651 \n", + " pehe_score-mean pehe_score-median pehe_score-std mean_absolute-mean \\\n", + "0 5.633660 2.623297 8.362125 0.732443 \n", + "1 5.625971 2.635993 8.213626 1.292668 \n", + "2 5.592356 2.569472 8.248291 0.369939 \n", + "3 5.493401 2.589651 7.903174 0.655602 \n", "\n", - " pehe_score-std mean_absolute-mean mean_absolute-median mean_absolute-std \n", - "0 8.362125 0.732443 0.238185 1.493276 \n", - "1 8.213626 1.292668 0.396246 2.474603 \n", - "2 8.248291 0.369939 0.212427 0.524395 \n", - "3 7.903174 0.655602 0.287201 0.941941 " + " mean_absolute-median mean_absolute-std method train \n", + "0 0.238185 1.493276 basic_slearner True \n", + "1 0.396246 2.474603 basic_slearner False \n", + "2 0.212427 0.524395 weighted_slearner True \n", + "3 0.287201 0.941941 weighted_slearner False " ] }, - "execution_count": 16, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "results_df" + "# For visualization\n", + "pd.DataFrame(results)" ] }, { @@ -421,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -432,7 +420,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -456,86 +444,86 @@ " \n", " \n", " \n", - " method\n", - " train\n", " pehe_score-mean\n", " pehe_score-median\n", " pehe_score-std\n", " mean_absolute-mean\n", " mean_absolute-median\n", " mean_absolute-std\n", + " method\n", + " train\n", " \n", " \n", " \n", " \n", " 0\n", - " basic_slearner\n", - " True\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " basic_slearner\n", + " True\n", " \n", " \n", " 1\n", - " basic_slearner\n", - " False\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " basic_slearner\n", + " False\n", " \n", " \n", " 2\n", - " weighted_slearner\n", - " True\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " 
weighted_slearner\n", + " True\n", " \n", " \n", " 3\n", - " weighted_slearner\n", - " False\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " False\n", " \n", " \n", "\n", "" ], "text/plain": [ - " method train pehe_score-mean pehe_score-median \\\n", - "0 basic_slearner True 5.633660 2.623297 \n", - "1 basic_slearner False 5.633660 2.623297 \n", - "2 weighted_slearner True 5.592356 2.569472 \n", - "3 weighted_slearner False 5.592356 2.569472 \n", + " pehe_score-mean pehe_score-median pehe_score-std mean_absolute-mean \\\n", + "0 5.633660 2.623297 8.362125 0.732443 \n", + "1 5.633660 2.623297 8.362125 0.732443 \n", + "2 5.592356 2.569472 8.248291 0.369939 \n", + "3 5.592356 2.569472 8.248291 0.369939 \n", "\n", - " pehe_score-std mean_absolute-mean mean_absolute-median mean_absolute-std \n", - "0 8.362125 0.732443 0.238185 1.493276 \n", - "1 8.362125 0.732443 0.238185 1.493276 \n", - "2 8.248291 0.369939 0.212427 0.524395 \n", - "3 8.248291 0.369939 0.212427 0.524395 " + " mean_absolute-median mean_absolute-std method train \n", + "0 0.238185 1.493276 basic_slearner True \n", + "1 0.238185 1.493276 basic_slearner False \n", + "2 0.212427 0.524395 weighted_slearner True \n", + "3 0.212427 0.524395 weighted_slearner False " ] }, - "execution_count": 20, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "result" + "pd.DataFrame(result)" ] }, { @@ -550,7 +538,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -564,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -588,176 +576,176 @@ " \n", " \n", " \n", - " method\n", - " train\n", " pehe_score-mean\n", " pehe_score-median\n", " pehe_score-std\n", " mean_absolute-mean\n", " mean_absolute-median\n", " mean_absolute-std\n", + " method\n", + " train\n", " \n", " \n", " \n", " \n", " 0\n", - " SLearner(learner=LinearRegression)\n", - " True\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " SLearner(learner=LinearRegression)\n", + " True\n", " \n", " \n", " 1\n", - " SLearner(learner=LinearRegression)\n", - " False\n", " 5.633660\n", " 2.623297\n", " 8.362125\n", " 0.732443\n", " 0.238185\n", " 1.493276\n", + " SLearner(learner=LinearRegression)\n", + " False\n", " \n", " \n", " 2\n", - " weighted_slearner\n", - " True\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " True\n", " \n", " \n", " 3\n", - " weighted_slearner\n", - " False\n", " 5.592356\n", " 2.569472\n", " 8.248291\n", " 0.369939\n", " 0.212427\n", " 0.524395\n", + " weighted_slearner\n", + " False\n", " \n", " \n", " 4\n", - " TLearner(control=LassoLars, treated=LassoLars)\n", - " True\n", " 5.572626\n", " 2.543798\n", " 8.213573\n", " 0.293187\n", " 0.166370\n", " 0.428028\n", + " TLearner(control=LassoLars, treated=LassoLars)\n", + " True\n", " \n", " \n", " 5\n", - " TLearner(control=LassoLars, treated=LassoLars)\n", - " False\n", " 5.572626\n", " 2.543798\n", " 8.213573\n", " 0.293187\n", " 0.166370\n", " 0.428028\n", + " TLearner(control=LassoLars, treated=LassoLars)\n", + " False\n", " \n", " \n", " 6\n", - " XLearner(outcome_c=LassoLars, outcome_t=LassoL...\n", - " True\n", - " 5.579285\n", + " 5.579297\n", " 2.543798\n", - " 8.240606\n", - " 0.289699\n", + " 8.240655\n", + " 0.289592\n", " 
0.166370\n", - " 0.427008\n", + " 0.427021\n", + " XLearner(outcome_c=LassoLars, outcome_t=LassoL...\n", + " True\n", " \n", " \n", " 7\n", - " XLearner(outcome_c=LassoLars, outcome_t=LassoL...\n", - " False\n", - " 5.579285\n", + " 5.579297\n", " 2.543798\n", - " 8.240606\n", - " 0.289699\n", + " 8.240655\n", + " 0.289592\n", " 0.166370\n", - " 0.427008\n", + " 0.427021\n", + " XLearner(outcome_c=LassoLars, outcome_t=LassoL...\n", + " False\n", " \n", " \n", " 8\n", + " 2.637110\n", + " 1.277486\n", + " 3.824333\n", + " 0.234029\n", + " 0.196398\n", + " 0.206225\n", " RLearner(outcome=LinearRegression, effect=Line...\n", " True\n", - " 2.560234\n", - " 1.221982\n", - " 3.731162\n", - " 0.253945\n", - " 0.152045\n", - " 0.283504\n", " \n", " \n", " 9\n", + " 2.637110\n", + " 1.277486\n", + " 3.824333\n", + " 0.234029\n", + " 0.196398\n", + " 0.206225\n", " RLearner(outcome=LinearRegression, effect=Line...\n", " False\n", - " 2.560234\n", - " 1.221982\n", - " 3.731162\n", - " 0.253945\n", - " 0.152045\n", - " 0.283504\n", " \n", " \n", "\n", "" ], "text/plain": [ - " method train pehe_score-mean \\\n", - "0 SLearner(learner=LinearRegression) True 5.633660 \n", - "1 SLearner(learner=LinearRegression) False 5.633660 \n", - "2 weighted_slearner True 5.592356 \n", - "3 weighted_slearner False 5.592356 \n", - "4 TLearner(control=LassoLars, treated=LassoLars) True 5.572626 \n", - "5 TLearner(control=LassoLars, treated=LassoLars) False 5.572626 \n", - "6 XLearner(outcome_c=LassoLars, outcome_t=LassoL... True 5.579285 \n", - "7 XLearner(outcome_c=LassoLars, outcome_t=LassoL... False 5.579285 \n", - "8 RLearner(outcome=LinearRegression, effect=Line... True 2.560234 \n", - "9 RLearner(outcome=LinearRegression, effect=Line... False 2.560234 \n", + " pehe_score-mean pehe_score-median pehe_score-std mean_absolute-mean \\\n", + "0 5.633660 2.623297 8.362125 0.732443 \n", + "1 5.633660 2.623297 8.362125 0.732443 \n", + "2 5.592356 2.569472 8.248291 0.369939 \n", + "3 5.592356 2.569472 8.248291 0.369939 \n", + "4 5.572626 2.543798 8.213573 0.293187 \n", + "5 5.572626 2.543798 8.213573 0.293187 \n", + "6 5.579297 2.543798 8.240655 0.289592 \n", + "7 5.579297 2.543798 8.240655 0.289592 \n", + "8 2.637110 1.277486 3.824333 0.234029 \n", + "9 2.637110 1.277486 3.824333 0.234029 \n", "\n", - " pehe_score-median pehe_score-std mean_absolute-mean \\\n", - "0 2.623297 8.362125 0.732443 \n", - "1 2.623297 8.362125 0.732443 \n", - "2 2.569472 8.248291 0.369939 \n", - "3 2.569472 8.248291 0.369939 \n", - "4 2.543798 8.213573 0.293187 \n", - "5 2.543798 8.213573 0.293187 \n", - "6 2.543798 8.240606 0.289699 \n", - "7 2.543798 8.240606 0.289699 \n", - "8 1.221982 3.731162 0.253945 \n", - "9 1.221982 3.731162 0.253945 \n", + " mean_absolute-median mean_absolute-std \\\n", + "0 0.238185 1.493276 \n", + "1 0.238185 1.493276 \n", + "2 0.212427 0.524395 \n", + "3 0.212427 0.524395 \n", + "4 0.166370 0.428028 \n", + "5 0.166370 0.428028 \n", + "6 0.166370 0.427021 \n", + "7 0.166370 0.427021 \n", + "8 0.196398 0.206225 \n", + "9 0.196398 0.206225 \n", "\n", - " mean_absolute-median mean_absolute-std \n", - "0 0.238185 1.493276 \n", - "1 0.238185 1.493276 \n", - "2 0.212427 0.524395 \n", - "3 0.212427 0.524395 \n", - "4 0.166370 0.428028 \n", - "5 0.166370 0.428028 \n", - "6 0.166370 0.427008 \n", - "7 0.166370 0.427008 \n", - "8 0.152045 0.283504 \n", - "9 0.152045 0.283504 " + " method train \n", + "0 SLearner(learner=LinearRegression) True \n", + "1 SLearner(learner=LinearRegression) False \n", + "2 weighted_slearner True 
\n", + "3 weighted_slearner False \n", + "4 TLearner(control=LassoLars, treated=LassoLars) True \n", + "5 TLearner(control=LassoLars, treated=LassoLars) False \n", + "6 XLearner(outcome_c=LassoLars, outcome_t=LassoL... True \n", + "7 XLearner(outcome_c=LassoLars, outcome_t=LassoL... False \n", + "8 RLearner(outcome=LinearRegression, effect=Line... True \n", + "9 RLearner(outcome=LinearRegression, effect=Line... False " ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "result" + "pd.DataFrame(result)" ] }, { diff --git a/src/justcause/evaluation.py b/src/justcause/evaluation.py index 7331239..82f6e82 100644 --- a/src/justcause/evaluation.py +++ b/src/justcause/evaluation.py @@ -18,8 +18,6 @@ Format = Callable[[Union[np.array, List[np.array]]], Union[float, List[float]]] Frame = Union[CausalFrame, pd.DataFrame] -STD_COL = ["method", "train"] - METHOD = "method" TRAIN = "train" @@ -33,35 +31,6 @@ def format_metric(metric, form): return "{}-{}".format(metric_string, form.__name__) -def setup_scores_df(metrics: Union[List[Metric], Metric]): - """Setup DataFrame containing the metric scores for all replications - - Args: - metrics: metrics used for naming the columns - - Returns: DataFrame to store the scores for each replication - """ - cols = [metric.__name__ for metric in metrics] - return pd.DataFrame(columns=cols) - - -def setup_result_df( - metrics: Union[List[Metric], Metric], formats=(np.mean, np.median, np.std) -): - """Setup DataFrame containing the summarized scores for all methods and datasets - - Args: - metrics: metrics used for scoring - formats: formats for summarizing metrics (e.g. mean, std, ...) - - Returns: DataFrame to store the results for each method - """ - cols = STD_COL + [ - format_metric(metric, form) for metric in metrics for form in formats - ] - return pd.DataFrame(columns=cols) - - def evaluate_ite( replications: Union[CausalFrame, List[CausalFrame]], methods, @@ -69,7 +38,7 @@ def evaluate_ite( formats: Union[List[Format], Format] = (np.mean, np.median, np.std), train_size: float = 0.8, random_state: Optional[RandomState] = None, -) -> pd.DataFrame: +) -> List[dict]: """Evaluate methods with multiple metrics on a given set of replications Good for use with standard causal methods and callables on new datasets. @@ -99,7 +68,7 @@ def evaluate_ite( if not isinstance(replications, list): replications = [replications] - results_df = setup_result_df(metrics, formats) + results = list() for method in methods: @@ -116,10 +85,10 @@ def evaluate_ite( train_result.update({METHOD: name, TRAIN: True}) test_result.update({METHOD: name, TRAIN: False}) - results_df = results_df.append(train_result, ignore_index=True) - results_df = results_df.append(test_result, ignore_index=True) + results.append(train_result) + results.append(test_result) - return results_df + return results def _evaluate_single_method( @@ -129,13 +98,17 @@ def _evaluate_single_method( formats=(np.mean, np.median, np.std), train_size=0.8, random_state=None, -): +) -> Tuple[dict, dict]: """Helper to evaluate method with multiple metrics on the given replications. This is the standard variant of an evaluation loop, which the user can implement manually to modify parts of it. Here, only ITE prediction and evaluation is considered. 
+    Returns:
+        a tuple of two dicts mapping score names to scores,
+        summarized over all replications for train and test respectively
+
     """
     if not isinstance(metrics, list):
         metrics = [metrics]
@@ -143,8 +116,8 @@
     if not isinstance(replications, list):
         replications = [replications]
 
-    test_scores = setup_scores_df(metrics)
-    train_scores = setup_scores_df(metrics)
+    train_scores = list()
+    test_scores = list()
 
     for rep in replications:
         train, test = train_test_split(
@@ -156,13 +129,8 @@
         else:
             train_ite, test_ite = default_predictions(method, train, test)
 
-        test_scores = test_scores.append(
-            calc_scores(test[Col.ite], test_ite, metrics), ignore_index=True
-        )
-
-        train_scores = train_scores.append(
-            calc_scores(train[Col.ite], train_ite, metrics), ignore_index=True
-        )
+        train_scores.append(calc_scores(train[Col.ite], train_ite, metrics))
+        test_scores.append(calc_scores(test[Col.ite], test_ite, metrics))
 
     train_results = summarize_scores(train_scores, formats)
     test_results = summarize_scores(train_scores, formats)
@@ -221,7 +189,7 @@ def default_predictions(
 
 
 def summarize_scores(
-    scores_df: pd.DataFrame,
+    scores: Union[pd.DataFrame, List[dict]],
     formats: Union[List[Format], Format] = (np.mean, np.median, np.std),
 ) -> np.array:
     """
@@ -229,15 +197,18 @@
     Call for train and test separately
 
     Args:
-        scores_df: the dataframe containing scores for all replications
+        scores: list of dicts or DataFrame containing the scores for all replications
         formats: Summaries to calculate over the scores of multiple replications
 
-    Returns: The rows to be added to the result dataframe
+    Returns:
+        dict: a dictionary mapping summarized metric names to their values
 
     """
+    # make sure we're dealing with a pd.DataFrame
+    df = pd.DataFrame(scores)
     dict_of_results = {
-        format_metric(metric, form): form(scores_df[metric])
-        for metric in scores_df.columns
+        format_metric(metric, form): form(df[metric])
+        for metric in df.columns
         for form in formats
     }
     return dict_of_results
diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py
index 9030eea..0397e5f 100644
--- a/tests/test_evaluation.py
+++ b/tests/test_evaluation.py
@@ -4,30 +4,29 @@
 import pandas as pd
 from sklearn.linear_model import LinearRegression
 
-from justcause.evaluation import (
-    calc_scores,
-    evaluate_ite,
-    setup_result_df,
-    setup_scores_df,
-    summarize_scores,
-)
+from justcause.evaluation import calc_scores, evaluate_ite, summarize_scores
 from justcause.learners import SLearner
-from justcause.metrics import enormse, pehe_score
+from justcause.metrics import pehe_score
 
 
 def test_single_evaluation(ihdp_data):
     reps = list(islice(ihdp_data, 10))
     learner = SLearner(LinearRegression())
-    df = evaluate_ite(reps, learner, pehe_score, train_size=0.8)
-    assert len(df) == 2
-    assert len(df.columns) == 5  # 2 standard + 3 formats for one metric
-    assert "pehe_score-mean" in df.columns  # three format per metric are reported
+    result = evaluate_ite(reps, learner, pehe_score, train_size=0.8)
+    row = result[0]
+    assert len(result) == 2
+    assert len(row) == 5  # 2 standard + 3 formats for one metric
+    assert "pehe_score-mean" in row.keys()  # three formats per metric are reported
 
 
 def test_summary():
-    metrics = [pehe_score]
-    df = setup_scores_df(metrics)
-    df["pehe_score"] = np.full(10, 1)
+    data = {"pehe_score": np.full(10, 1)}
+    summary = summarize_scores(data)
+    assert len(summary) == 3  # one metric summarized with 3 formats
+    assert summary["pehe_score-mean"] == 1
+
+    # Also works with a pd.DataFrame
+    df = 
pd.DataFrame(data) summary = summarize_scores(df) assert len(summary) == 3 # 5 pseudo-metrics times 3 formats assert summary["pehe_score-mean"] == 1 @@ -44,26 +43,3 @@ def test_calc_scores(): score_dict = calc_scores(true, pred, pehe_score) assert list(score_dict.values())[0] == 1 assert "pehe_score" in score_dict.keys() - - -def test_setup_df(): - - metrics = [pehe_score] - - df = setup_scores_df(metrics) - assert len(df.columns) == 1 - assert "pehe_score" in df.columns - - metrics = [pehe_score, enormse] - df = setup_scores_df(metrics) - assert len(df.columns) == 2 - assert "enormse" in df.columns - - result = setup_result_df(metrics) - assert len(result.columns) == 8 # 2 base + 3 for each metric - assert "pehe_score-mean" in result.columns - - formats = [np.mean, np.std] - result = setup_result_df(metrics, formats) - assert len(result.columns) == 6 # 2 base + 2 for each metric - assert "enormse-std" in result.columns
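
Usage sketch of the list-based API introduced by this patch, for reference. It assumes synthetic true/predicted ITE arrays in place of the IHDP replications used in the notebook, and `my_method` is only a placeholder label; `calc_scores`, `summarize_scores`, `pehe_score`, and the final `pd.DataFrame` call are the pieces touched by this change.

```python
import numpy as np
import pandas as pd

from justcause.evaluation import calc_scores, summarize_scores
from justcause.metrics import pehe_score

rng = np.random.RandomState(0)

scores = list()
for _ in range(5):  # stands in for the per-replication loop of the notebook
    true_ite = rng.normal(size=100)
    pred_ite = true_ite + rng.normal(scale=0.1, size=100)
    # calc_scores returns a plain dict of {metric_name: value}
    scores.append(calc_scores(true_ite, pred_ite, [pehe_score]))

# summarize_scores accepts the list of dicts directly (or a DataFrame)
result = summarize_scores(scores)
result.update({"method": "my_method", "train": False})

# a DataFrame is only materialized at the very end, for display
print(pd.DataFrame([result]))
```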