Added more functions to analyze data with pandas

xianglunkai · Feb 24, 2017 · 738e6d2 · 738e6d2
1 parent 2044a78
commit 738e6d2
Show file tree

Hide file tree

Showing 3 changed files with 134 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -193,3 +193,4 @@ out/
 # Data Files
 # -------------------------------------------------------------------
 interfaces/python/tests/qp_problems/results/
+interfaces/python/tests/qp_problems/figures/
diff --git a/interfaces/python/tests/qp_problems/fit_results.py b/interfaces/python/tests/qp_problems/fit_results.py
@@ -1,31 +1,145 @@
 from __future__ import print_function
 import matplotlib as mpl
-mpl.use('Agg')  # For plotting on remote server
+# mpl.use('Agg')  # For plotting on remote server
 import matplotlib.pyplot as plt
 from mpl_toolkits.mplot3d import Axes3D
 import numpy as np
 import pandas as pd
-pd.set_option('display.width', 1000)
 
 
-# import sklearn tools
-from sklearn.model_selection import train_test_split
-from sklearn.pipeline import make_pipeline   # Make pipeline for estimators
-from sklearn.preprocessing import PolynomialFeatures  # Construct polynomials
-from sklearn.linear_model import (LinearRegression, HuberRegressor, Ridge)
-from sklearn.metrics import mean_squared_error
-
-# # Define candidate function to be fit
-# def func_iter(x, c0, c1, c2, c3, c4):
+# # import sklearn tools
+# from sklearn.model_selection import train_test_split
+# from sklearn.pipeline import make_pipeline   # Make pipeline for estimators
+# from sklearn.preprocessing import PolynomialFeatures  # Construct polynomials
+# from sklearn.linear_model import (LinearRegression, HuberRegressor, Ridge)
+# from sklearn.metrics import mean_squared_error
 #
-#     return c0*np.power(x[0], c1*x[3] + c2*x[4])*np.power(x[1], c3)*np.power(x[2], c4)
+# # # Define candidate function to be fit
+# # def func_iter(x, c0, c1, c2, c3, c4):
+# #
+# #     return c0*np.power(x[0], c1*x[3] + c2*x[4])*np.power(x[1], c3)*np.power(x[2], c4)
+
+
+def get_best_params(df):
+    """
+    Transform weighted frame into another frame with best parameters
+
+    Keeps the rows whose weight 'w' equals 1, then among those the rows
+    with the smallest 'sigma'; if more than one row is left, keeps the
+    rows with the smallest 'alpha'.
+
+    Expects the columns 'w', 'sigma' and 'alpha' in df. Returns a frame
+    (not a single row): it can still hold several rows when they tie on
+    both 'sigma' and 'alpha', and it is empty when no row has w == 1.
+    """
+    # Get best parameters
+    df_best = df.loc[df['w'] == 1.]
+
+    # Get lowest sigma among the best rows
+    min_sigma = df_best['sigma'].min()
+
+    # Get best row
+    df_best = df_best.loc[(df_best['sigma'] == min_sigma)]
+
+    if len(df_best) > 1:  # If multiple values choose one with min alpha
+        min_alpha = df_best['alpha'].min()
+        df_best = df_best.loc[(df_best['alpha'] == min_alpha)]
+
+    return df_best
+
+
+
+
+def weight_by_iter(df):
+    """
+    Weight each sample by its number of iterations relative to the minimum
+
+    Adds (or overwrites) the column 'w' = min(iter) / iter, so the
+    sample(s) with the fewest iterations get weight 1. The frame is
+    modified in place and the same object is returned.
+    """
+    df['w'] = df['iter'].min() / df['iter']
+    return df
+
+
+def save_plot(df, name):
+    """
+    Plot behavior of 'name' in selected dataframe
+    """
+
+    # Dummy value always true
+    location = (df['alpha'] > 0 )
+
+    # Get best iteration values (there are many) and pick first pair sigma and alpha
+    if name is not 'sigma':
+        test_sigma = df.loc[(df['w'] == 1.)].sigma.values[-1]
+        location &= (df['sigma'] == test_sigma)
+    if name is not 'alpha':
+        test_alpha = df.loc[(df['w'] == 1.)].alpha.values[-1]
+        location &= (df['alpha'] == test_alpha)
+    if name is not 'rho':
+        test_rho = df.loc[(df['w'] == 1.)].rho.values[-1]
+        location &= (df['rho'] == test_rho)
+
+    # Get test case in specified location
+    test_case = df.loc[location]
+
+
+    # Plot behavior
+    plt.figure(figsize=(12,6))
+    plt.subplot(1, 2, 1)
+    ax = plt.gca()
+    if name is 'rho':
+        ax.set_xscale('log')
+    plt.scatter(test_case[name], test_case['iter'])
+    ax.set_ylabel('iter')
+    ax.set_xlabel(name)
+    plt.grid()
+    plt.show(block=False)
+
+    plt.subplot(1, 2, 2)
+    ax = plt.gca()
+    if name is 'rho':
+        ax.set_xscale('log')
+    plt.scatter(test_case[name], test_case['w'])
+    ax.set_ylabel('weight')
+    ax.set_xlabel(name)
+    plt.grid()
+    plt.show(block=False)
+
+    plt.tight_layout()
+    plt.savefig('figures/%s.pdf' % name)
+
 
 # Main function
 if __name__ == '__main__':
 
     # Read results (only the ones less than max_iter)
     res = pd.read_csv('results/results_full.csv')
-    res = res.loc[(res['iter'] < 2400)]
+    res = res.loc[(res['iter'] < 2499)]  # Select problems not saturated at max number of iterations
+
+    # Problem headings
+    headings = ['n', 'm', 'name', 'seed']
+
+    # Group problems
+    problems = res.groupby(headings)
+    # n_problems = len(problems.groups)
+
+    # Assign weights to samples
+    res_w = problems.apply(weight_by_iter)
+    problems_w = res_w.groupby(headings)
+
+
+    # Plot behavior for fixed sigma and alpha and changing rho
+    # test_name = (50.0, 60.0, 'svm', 3076953921.0)
+    # test_name = (50.0, 60.0, 'svm', 107769053.0)
+    test_name = (40.0, 40.0, 'lasso', 685148778.0)
+    # test_name = (40.0, 40.0, 'lasso', 4089288235.0)
+
+    test_instance = problems_w.get_group(test_name)
+
+    # Save plots for rho, sigma and alpha
+    # save_plot(test_instance, 'rho')
+    # save_plot(test_instance, 'sigma')
+    # save_plot(test_instance, 'alpha')
+
+
+
+    # Get optimal parameters for lasso problem
+    same_type_probs = res_w.groupby(['name'])
+    lasso_probs = same_type_probs.get_group(('lasso'))
+    best_lasso = lasso_probs.groupby(['seed']).apply(get_best_params)
+    pd.tools.plotting.scatter_matrix(best_lasso)
+
+
 
     # Select smaller dataset and consider only n, m, trP
     # res = res.loc[(res['m'] < 100) & (res['n'] < 100)]

diff --git a/interfaces/python/tests/qp_problems/qp_examples/qp_example.py b/interfaces/python/tests/qp_problems/qp_examples/qp_example.py
@@ -3,10 +3,14 @@
 from builtins import range
 from builtins import object
 import numpy as np
-from utils.data_struct import data_struct, full_data_struct
+import sys   # To get maxsize
+
+# Metadata class
 import abc
 from future.utils import with_metaclass
 
+# Data structures
+from utils.data_struct import data_struct, full_data_struct
 
 class QPExample(with_metaclass(abc.ABCMeta, object)):
 
@@ -53,7 +57,7 @@ def perform_tests(self, **kwargs):
             for _ in range(self.nm_num_prob):  # Generate some random problems
 
                 # Get current seed
-                current_seed = np.random.randint(0, 4294967295)
+                current_seed = np.random.randint(0, sys.maxsize)
                 np.random.seed(current_seed)
 
                 # generate problem and store statistics