speed improvement to FSSD and bug fixes as suggested by Jonathan Huggins
wittawatj committed Oct 15, 2017
1 parent 6120bfa commit 60c3f2a
Showing 8 changed files with 154 additions and 23 deletions.
2 changes: 1 addition & 1 deletion ipynb/demo_kgof.ipynb
@@ -624,7 +624,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.13"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion ipynb/ex2_results.ipynb
@@ -351,7 +351,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.13"
}
},
"nbformat": 4,
33 changes: 23 additions & 10 deletions ipynb/fssd_locs_surface.ipynb
@@ -132,9 +132,9 @@
" # plot the data\n",
" plt.plot(X[:, 0], X[:, 1], '.m', markeredgecolor='m', markersize=3, alpha=0.7)\n",
" ax = plt.gca()\n",
" ax.set_aspect('auto')\n",
"# plt.xlim([-2.6, 2.6]);\n",
"# plt.ylim([-2.6, 2.6]);\n",
"# ax.set_aspect('auto')\n",
" plt.xlim([-4, 4]);\n",
" plt.ylim([-4, 4]);\n",
" \n",
" # return the locations V\n",
" \n",
@@ -181,7 +181,7 @@
"# sample\n",
"n = 2000\n",
"# true p\n",
"seed = 19\n",
"seed = 20\n",
"d = 2\n",
"mean = np.zeros(d)\n",
"variance = 1\n",
@@ -197,11 +197,11 @@
"\n",
"\n",
"# Scaling of 1/sqrt(2) will make the variance 1.\n",
"# ds_laplace = data.DSLaplace(d=d, loc=0, scale=1.0/np.sqrt(2))\n",
"# dat = ds_laplace.sample(n, seed=seed)\n",
"ds_laplace = data.DSLaplace(d=d, loc=0, scale=1.0/np.sqrt(2))\n",
"dat = ds_laplace.sample(n, seed=seed)\n",
"\n",
"problem_name = 'pgauss_qgauss'\n",
"# problem_name = 'pgauss_qlaplace'"
"# problem_name = 'pgauss_qgauss'\n",
"problem_name = 'pgauss_qlaplace'"
]
},
{
@@ -292,7 +292,8 @@
"generic_contourf(p, dat, k, func_fssd_ustat_std)\n",
"plt.title('U-statistic standard deviation')\n",
"plt.colorbar()\n",
"plt.grid()"
"plt.grid()\n",
"plt.savefig('{}_h1sd.pdf'.format(problem_name), bbox_inches='tight')"
]
},
{
@@ -671,7 +672,19 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.13"
},
"widgets": {
"state": {
"dc1948047f324bdf86d97592d6c7e77e": {
"views": [
{
"cell_index": 20
}
]
}
},
"version": "1.2.0"
}
},
"nbformat": 4,
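A side note on the Laplace data source enabled above (not part of the commit): a Laplace distribution with scale b has variance 2*b**2, so scale=1.0/np.sqrt(2) gives unit variance and puts q on the same scale as the unit-variance Gaussian model p. A minimal check, a sketch using only numpy:

    import numpy as np

    rng = np.random.RandomState(0)
    b = 1.0 / np.sqrt(2)                      # Var[Laplace(0, b)] = 2*b**2 = 1
    x = rng.laplace(loc=0.0, scale=b, size=1000000)
    print(x.var())                            # roughly 1.0, matching the Gaussian model p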
43 changes: 41 additions & 2 deletions ipynb/preliminary.ipynb
@@ -75,7 +75,7 @@
"source": [
"# true p\n",
"seed = 21\n",
"d = 3\n",
"d = 5\n",
"mean = np.zeros(d)\n",
"variance = 1\n"
]
@@ -89,7 +89,7 @@
"outputs": [],
"source": [
"# sample\n",
"n = 500\n",
"n = 5000\n",
"\n",
"# only one dimension of the mean is shifted\n",
"#draw_mean = mean + np.hstack((1, np.zeros(d-1)))\n",
@@ -212,6 +212,7 @@
" 'tol_fun':1e-3, \n",
" 'disp':True\n",
"}\n",
"\n",
"V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_locs_widths(p, tr, gwidth0, V0, **opts)"
]
},
@@ -351,6 +352,44 @@
"plt.plot(X_per[:, 0], X_per[:, 1], 'rx')"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Visually compare IMQ and Gaussian kernels"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"b = -0.5\n",
"k_imq = kernel.KIMQ(b=b, c=1)\n",
"k_g = kernel.KGauss(sigma2=1.0)\n",
"\n",
"dom = np.linspace(-8, 8, 100)[:, np.newaxis]\n",
"v = 0\n",
"plt.plot(dom, k_imq.eval(dom, np.array([[v]])), 'b-', label='IMQ kernel')\n",
"plt.plot(dom, k_g.eval(dom, np.array([[v]])), 'r-', label='Gaussian kernel')\n",
"\n",
"plt.legend()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
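The new cells above plot kgof's KIMQ kernel against KGauss. For readers without the package, a self-contained sketch of the same one-dimensional comparison; it assumes the usual inverse-multiquadric form k(x, v) = (c**2 + (x - v)**2)**b with b < 0, and its constants may differ slightly from the kgof classes:

    import numpy as np
    import matplotlib.pyplot as plt

    def k_imq(x, v, b=-0.5, c=1.0):
        # inverse multiquadric: polynomial decay, hence heavy tails
        return (c**2 + (x - v)**2)**b

    def k_gauss(x, v, sigma2=1.0):
        # Gaussian kernel: decays exponentially fast away from v
        return np.exp(-(x - v)**2 / (2.0 * sigma2))

    dom = np.linspace(-8, 8, 200)
    plt.plot(dom, k_imq(dom, 0.0), 'b-', label='IMQ kernel')
    plt.plot(dom, k_gauss(dom, 0.0), 'r-', label='Gaussian kernel')
    plt.legend()
    plt.show()

The qualitative point of the notebook cell is visible here too: the IMQ kernel decays polynomially while the Gaussian decays exponentially, so the IMQ keeps far more mass in the tails.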
2 changes: 1 addition & 1 deletion kgof/ex/run_ex2.sh
@@ -13,7 +13,7 @@ screen -AdmS ex2_kgof -t tab0 bash


#screen -S ex2_kgof -X screen -t tab6 bash -lic "python ex2_prob_params.py gbrbm_dx50_dh10"
-screen -S ex2_kgof -X screen -t tab6 bash -lic "python ex2_prob_params.py gbrbm_dx50_dh40"
+#screen -S ex2_kgof -X screen -t tab6 bash -lic "python ex2_prob_params.py gbrbm_dx50_dh40"
screen -S ex2_kgof -X screen -t tab7 bash -lic "python ex2_prob_params.py glaplace"


5 changes: 3 additions & 2 deletions kgof/goftest.py
@@ -435,8 +435,8 @@ def fssd_grid_search_kernel(p, dat, test_locs, list_kernel):
objs = np.zeros(n_cand)
for i in xrange(n_cand):
ki = list_kernel[i]
-        obj = FSSD.power_criterion(p, dat, ki, test_locs)
-        logging.info('(%d), obj: %5.4g, k: %s' %(i, obj, str(ki)))
+        objs[i] = FSSD.power_criterion(p, dat, ki, test_locs)
+        logging.info('(%d), obj: %5.4g, k: %s' %(i, objs[i], str(ki)))

#Widths that come early in the list
# are preferred if test powers are equal.
@@ -446,6 +446,7 @@ def fssd_grid_search_kernel(p, dat, test_locs, list_kernel):

# end of FSSD
# --------------------------------------

class GaussFSSD(FSSD):
"""
FSSD using an isotropic Gaussian kernel.
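The change to fssd_grid_search_kernel above is one of the reported bug fixes: the loop computed the power criterion into a throwaway local obj and never wrote it into objs, so the scores used to select a kernel stayed all zeros. A minimal sketch of the corrected selection logic, with criterion as a hypothetical stand-in for FSSD.power_criterion:

    import numpy as np

    def pick_best_kernel(list_kernel, criterion):
        # criterion(k): estimated power proxy of the FSSD test with kernel k
        objs = np.zeros(len(list_kernel))
        for i, ki in enumerate(list_kernel):
            objs[i] = criterion(ki)  # the fix: store every score in the array
        # np.argmax returns the first maximizer, so candidates that come early
        # in the list are preferred when scores tie, as the original comment says
        besti = np.argmax(objs)
        return besti, objs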
74 changes: 74 additions & 0 deletions kgof/test/test_goftest.py
@@ -57,6 +57,80 @@ def test_basic(self):
self.assertGreaterEqual(tresult['pvalue'], 0)
self.assertLessEqual(tresult['pvalue'], 1)

def test_optimized_fssd(self):
"""
Test FSSD test with parameter optimization.
"""
seed = 4
# sample size
n = 179
alpha = 0.01
for d in [1, 3]:
mean = np.zeros(d)
variance = 1.0
p = density.IsotropicNormal(mean, variance)
# Mean difference. obvious reject
ds = data.DSIsotropicNormal(mean+4, variance+0)
dat = ds.sample(n, seed=seed)
# test
for J in [1, 4]:
opts = {
'reg': 1e-2,
'max_iter': 10,
'tol_fun':1e-3,
'disp':False
}
tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed+1)

Xtr = tr.X
gwidth0 = util.meddistance(Xtr, subsample=1000)**2
# random test locations
V0 = util.fit_gaussian_draw(Xtr, J, seed=seed+1)
V_opt, gw_opt, opt_result = \
gof.GaussFSSD.optimize_locs_widths(p, tr, gwidth0, V0, **opts)

# construct a test
k_opt = kernel.KGauss(gw_opt)
null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
fssd_opt_result = fssd_opt.perform_test(te, return_simulated_stats=True)
assert fssd_opt_result['h0_rejected']

def test_auto_init_opt_fssd(self):
"""
Test FSSD-opt test with automatic parameter initialization.
"""
seed = 5
# sample size
n = 191
alpha = 0.01
for d in [1, 4]:
mean = np.zeros(d)
variance = 1.0
p = density.IsotropicNormal(mean, variance)
# Mean difference. obvious reject
ds = data.DSIsotropicNormal(mean+4, variance+0)
dat = ds.sample(n, seed=seed)
# test
for J in [1, 3]:
opts = {
'reg': 1e-2,
'max_iter': 10,
'tol_fun': 1e-3,
'disp':False
}
tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed+1)

V_opt, gw_opt, opt_result = \
gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)

# construct a test
k_opt = kernel.KGauss(gw_opt)
null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
fssd_opt_result = fssd_opt.perform_test(te, return_simulated_stats=True)
assert fssd_opt_result['h0_rejected']

def test_ustat_h1_mean_variance(self):
seed = 20
# sample
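Condensed from test_auto_init_opt_fssd above, a sketch of the end-to-end FSSD-opt workflow as a user would run it. The kgof.* import paths and the parameter values are assumptions; only the calls themselves are taken from the test code:

    import numpy as np
    import kgof.data as data          # assumed module paths
    import kgof.density as density
    import kgof.goftest as gof
    import kgof.kernel as kernel

    seed, n, d, J, alpha = 5, 500, 2, 3, 0.01
    p = density.IsotropicNormal(np.zeros(d), 1.0)      # model to test against
    ds = data.DSIsotropicNormal(np.zeros(d) + 1, 1.0)  # data from a mean-shifted q
    dat = ds.sample(n, seed=seed)

    # optimize the test locations V and the Gaussian width on a training split
    tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)
    opts = {'reg': 1e-2, 'max_iter': 10, 'tol_fun': 1e-3, 'disp': False}
    V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)

    # run the test on the held-out split
    null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
    fssd_opt = gof.FSSD(p, kernel.KGauss(gw_opt), V_opt, null_sim=null_sim, alpha=alpha)
    print(fssd_opt.perform_test(te))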
16 changes: 10 additions & 6 deletions kgof/util.py
@@ -251,12 +251,16 @@ def outer_rows(X, Y):
Return an n x dx x dy numpy array.
"""
-    n, dx = X.shape
-    dy = Y.shape[1]
-    X_col_rep = X[:, np.tile(range(dx), (dy, 1)).T.reshape(-1) ]
-    Y_tile = np.tile(Y, (1, dx))
-    Z = X_col_rep*Y_tile
-    return np.reshape(Z, (n, dx, dy))
+
+    # Matlab way to do this. According to Jonathan Huggins, this is not
+    # efficient. Use einsum instead. See below.
+    #n, dx = X.shape
+    #dy = Y.shape[1]
+    #X_col_rep = X[:, np.tile(range(dx), (dy, 1)).T.reshape(-1) ]
+    #Y_tile = np.tile(Y, (1, dx))
+    #Z = X_col_rep*Y_tile
+    #return np.reshape(Z, (n, dx, dy))
+    return np.einsum('ij,ik->ijk', X, Y)

def randn(m, n, seed=3):
with NumpySeedContext(seed=seed):
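For reference, a quick sanity check (not part of the commit) that the einsum formulation in outer_rows returns the same n x dx x dy array of row-wise outer products as the tile-based code it replaces:

    import numpy as np

    def outer_rows_tile(X, Y):
        # the old tile-based construction, reproduced only for comparison
        n, dx = X.shape
        dy = Y.shape[1]
        X_col_rep = X[:, np.tile(range(dx), (dy, 1)).T.reshape(-1)]
        Y_tile = np.tile(Y, (1, dx))
        return np.reshape(X_col_rep * Y_tile, (n, dx, dy))

    def outer_rows_einsum(X, Y):
        # Z[i, j, k] = X[i, j] * Y[i, k]: one outer product per paired row
        return np.einsum('ij,ik->ijk', X, Y)

    rng = np.random.RandomState(0)
    X = rng.randn(100, 3)
    Y = rng.randn(100, 4)
    assert np.allclose(outer_rows_tile(X, Y), outer_rows_einsum(X, Y))

The einsum version also skips the two (n, dx*dy) intermediate arrays that the tiling approach materializes, which is presumably where the speed-up reported in this commit comes from.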
