speed improvement to FSSD and bug fixes as suggested by Jonathan Huggins
wittawatj committed Oct 15, 2017
1 parent 6120bfa commit 60c3f2a
Showing 8 changed files with 154 additions and 23 deletions.
2 changes: 1 addition & 1 deletion ipynb/demo_kgof.ipynb
@@ -624,7 +624,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.13"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion ipynb/ex2_results.ipynb
@@ -351,7 +351,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.13"
}
},
"nbformat": 4,
33 changes: 23 additions & 10 deletions ipynb/fssd_locs_surface.ipynb
@@ -132,9 +132,9 @@
" # plot the data\n",
" plt.plot(X[:, 0], X[:, 1], '.m', markeredgecolor='m', markersize=3, alpha=0.7)\n",
" ax = plt.gca()\n",
" ax.set_aspect('auto')\n",
"# plt.xlim([-2.6, 2.6]);\n",
"# plt.ylim([-2.6, 2.6]);\n",
"# ax.set_aspect('auto')\n",
" plt.xlim([-4, 4]);\n",
" plt.ylim([-4, 4]);\n",
" \n",
" # return the locations V\n",
" \n",
@@ -181,7 +181,7 @@
"# sample\n",
"n = 2000\n",
"# true p\n",
"seed = 19\n",
"seed = 20\n",
"d = 2\n",
"mean = np.zeros(d)\n",
"variance = 1\n",
@@ -197,11 +197,11 @@
"\n",
"\n",
"# Scaling of 1/sqrt(2) will make the variance 1.\n",
"# ds_laplace = data.DSLaplace(d=d, loc=0, scale=1.0/np.sqrt(2))\n",
"# dat = ds_laplace.sample(n, seed=seed)\n",
"ds_laplace = data.DSLaplace(d=d, loc=0, scale=1.0/np.sqrt(2))\n",
"dat = ds_laplace.sample(n, seed=seed)\n",
"\n",
"problem_name = 'pgauss_qgauss'\n",
"# problem_name = 'pgauss_qlaplace'"
"# problem_name = 'pgauss_qgauss'\n",
"problem_name = 'pgauss_qlaplace'"
]
},
{
@@ -292,7 +292,8 @@
"generic_contourf(p, dat, k, func_fssd_ustat_std)\n",
"plt.title('U-statistic standard deviation')\n",
"plt.colorbar()\n",
"plt.grid()"
"plt.grid()\n",
"plt.savefig('{}_h1sd.pdf'.format(problem_name), bbox_inches='tight')"
]
},
{
@@ -671,7 +672,19 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
"version": "2.7.13"
},
"widgets": {
"state": {
"dc1948047f324bdf86d97592d6c7e77e": {
"views": [
{
"cell_index": 20
}
]
}
},
"version": "1.2.0"
}
},
"nbformat": 4,
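A side note on the Laplace data source enabled above (not part of the commit): a Laplace distribution with scale b has variance 2*b**2, so scale=1.0/np.sqrt(2) gives unit variance and puts q on the same scale as the unit-variance Gaussian model p. A minimal check, a sketch using only numpy:

    import numpy as np

    rng = np.random.RandomState(0)
    b = 1.0 / np.sqrt(2)                      # Var[Laplace(0, b)] = 2*b**2 = 1
    x = rng.laplace(loc=0.0, scale=b, size=1000000)
    print(x.var())                            # roughly 1.0, matching the Gaussian model p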
43 changes: 41 additions & 2 deletions ipynb/preliminary.ipynb
@@ -75,7 +75,7 @@
"source": [
"# true p\n",
"seed = 21\n",
"d = 3\n",
"d = 5\n",
"mean = np.zeros(d)\n",
"variance = 1\n"
]
@@ -89,7 +89,7 @@
"outputs": [],
"source": [
"# sample\n",
"n = 500\n",
"n = 5000\n",
"\n",
"# only one dimension of the mean is shifted\n",
"#draw_mean = mean + np.hstack((1, np.zeros(d-1)))\n",
@@ -212,6 +212,7 @@
" 'tol_fun':1e-3, \n",
" 'disp':True\n",
"}\n",
"\n",
"V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_locs_widths(p, tr, gwidth0, V0, **opts)"
]
},
@@ -351,6 +352,44 @@
"plt.plot(X_per[:, 0], X_per[:, 1], 'rx')"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Visually compare IMQ and Gaussian kernels"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"b = -0.5\n",
"k_imq = kernel.KIMQ(b=b, c=1)\n",
"k_g = kernel.KGauss(sigma2=1.0)\n",
"\n",
"dom = np.linspace(-8, 8, 100)[:, np.newaxis]\n",
"v = 0\n",
"plt.plot(dom, k_imq.eval(dom, np.array([[v]])), 'b-', label='IMQ kernel')\n",
"plt.plot(dom, k_g.eval(dom, np.array([[v]])), 'r-', label='Gaussian kernel')\n",
"\n",
"plt.legend()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
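The new cells above plot kgof's KIMQ kernel against KGauss. For readers without the package, a self-contained sketch of the same one-dimensional comparison; it assumes the usual inverse-multiquadric form k(x, v) = (c**2 + (x - v)**2)**b with b < 0, and its constants may differ slightly from the kgof classes:

    import numpy as np
    import matplotlib.pyplot as plt

    def k_imq(x, v, b=-0.5, c=1.0):
        # inverse multiquadric: polynomial decay, hence heavy tails
        return (c**2 + (x - v)**2)**b

    def k_gauss(x, v, sigma2=1.0):
        # Gaussian kernel: decays exponentially fast away from v
        return np.exp(-(x - v)**2 / (2.0 * sigma2))

    dom = np.linspace(-8, 8, 200)
    plt.plot(dom, k_imq(dom, 0.0), 'b-', label='IMQ kernel')
    plt.plot(dom, k_gauss(dom, 0.0), 'r-', label='Gaussian kernel')
    plt.legend()
    plt.show()

The qualitative point of the notebook cell is visible here too: the IMQ kernel decays polynomially while the Gaussian decays exponentially, so the IMQ keeps far more mass in the tails.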
2 changes: 1 addition & 1 deletion kgof/ex/run_ex2.sh
@@ -13,7 +13,7 @@ screen -AdmS ex2_kgof -t tab0 bash


#screen -S ex2_kgof -X screen -t tab6 bash -lic "python ex2_prob_params.py gbrbm_dx50_dh10"
-screen -S ex2_kgof -X screen -t tab6 bash -lic "python ex2_prob_params.py gbrbm_dx50_dh40"
+#screen -S ex2_kgof -X screen -t tab6 bash -lic "python ex2_prob_params.py gbrbm_dx50_dh40"
screen -S ex2_kgof -X screen -t tab7 bash -lic "python ex2_prob_params.py glaplace"


5 changes: 3 additions & 2 deletions kgof/goftest.py
@@ -435,8 +435,8 @@ def fssd_grid_search_kernel(p, dat, test_locs, list_kernel):
objs = np.zeros(n_cand)
for i in xrange(n_cand):
ki = list_kernel[i]
-        obj = FSSD.power_criterion(p, dat, ki, test_locs)
-        logging.info('(%d), obj: %5.4g, k: %s' %(i, obj, str(ki)))
+        objs[i] = FSSD.power_criterion(p, dat, ki, test_locs)
+        logging.info('(%d), obj: %5.4g, k: %s' %(i, objs[i], str(ki)))

#Widths that come early in the list
# are preferred if test powers are equal.
@@ -446,6 +446,7 @@ def fssd_grid_search_kernel(p, dat, test_locs, list_kernel):

# end of FSSD
# --------------------------------------

class GaussFSSD(FSSD):
"""
FSSD using an isotropic Gaussian kernel.
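The change to fssd_grid_search_kernel above is one of the reported bug fixes: the loop computed the power criterion into a throwaway local obj and never wrote it into objs, so the scores used to select a kernel stayed all zeros. A minimal sketch of the corrected selection logic, with criterion as a hypothetical stand-in for FSSD.power_criterion:

    import numpy as np

    def pick_best_kernel(list_kernel, criterion):
        # criterion(k): estimated power proxy of the FSSD test with kernel k
        objs = np.zeros(len(list_kernel))
        for i, ki in enumerate(list_kernel):
            objs[i] = criterion(ki)  # the fix: store every score in the array
        # np.argmax returns the first maximizer, so candidates that come early
        # in the list are preferred when scores tie, as the original comment says
        besti = np.argmax(objs)
        return besti, objs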
74 changes: 74 additions & 0 deletions kgof/test/test_goftest.py
@@ -57,6 +57,80 @@ def test_basic(self):
self.assertGreaterEqual(tresult['pvalue'], 0)
self.assertLessEqual(tresult['pvalue'], 1)

def test_optimized_fssd(self):
"""
Test FSSD test with parameter optimization.
"""
seed = 4
# sample size
n = 179
alpha = 0.01
for d in [1, 3]:
mean = np.zeros(d)
variance = 1.0
p = density.IsotropicNormal(mean, variance)
# Mean difference. obvious reject
ds = data.DSIsotropicNormal(mean+4, variance+0)
dat = ds.sample(n, seed=seed)
# test
for J in [1, 4]:
opts = {
'reg': 1e-2,
'max_iter': 10,
'tol_fun':1e-3,
'disp':False
}
tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed+1)

Xtr = tr.X
gwidth0 = util.meddistance(Xtr, subsample=1000)**2
# random test locations
V0 = util.fit_gaussian_draw(Xtr, J, seed=seed+1)
V_opt, gw_opt, opt_result = \
gof.GaussFSSD.optimize_locs_widths(p, tr, gwidth0, V0, **opts)

# construct a test
k_opt = kernel.KGauss(gw_opt)
null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
fssd_opt_result = fssd_opt.perform_test(te, return_simulated_stats=True)
assert fssd_opt_result['h0_rejected']

def test_auto_init_opt_fssd(self):
"""
Test FSSD-opt test with automatic parameter initialization.
"""
seed = 5
# sample size
n = 191
alpha = 0.01
for d in [1, 4]:
mean = np.zeros(d)
variance = 1.0
p = density.IsotropicNormal(mean, variance)
# Mean difference. obvious reject
ds = data.DSIsotropicNormal(mean+4, variance+0)
dat = ds.sample(n, seed=seed)
# test
for J in [1, 3]:
opts = {
'reg': 1e-2,
'max_iter': 10,
'tol_fun': 1e-3,
'disp':False
}
tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed+1)

V_opt, gw_opt, opt_result = \
gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)

# construct a test
k_opt = kernel.KGauss(gw_opt)
null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
fssd_opt_result = fssd_opt.perform_test(te, return_simulated_stats=True)
assert fssd_opt_result['h0_rejected']

def test_ustat_h1_mean_variance(self):
seed = 20
# sample
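Condensed from test_auto_init_opt_fssd above, a sketch of the end-to-end FSSD-opt workflow as a user would run it. The kgof.* import paths and the parameter values are assumptions; only the calls themselves are taken from the test code:

    import numpy as np
    import kgof.data as data          # assumed module paths
    import kgof.density as density
    import kgof.goftest as gof
    import kgof.kernel as kernel

    seed, n, d, J, alpha = 5, 500, 2, 3, 0.01
    p = density.IsotropicNormal(np.zeros(d), 1.0)      # model to test against
    ds = data.DSIsotropicNormal(np.zeros(d) + 1, 1.0)  # data from a mean-shifted q
    dat = ds.sample(n, seed=seed)

    # optimize the test locations V and the Gaussian width on a training split
    tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed + 1)
    opts = {'reg': 1e-2, 'max_iter': 10, 'tol_fun': 1e-3, 'disp': False}
    V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)

    # run the test on the held-out split
    null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
    fssd_opt = gof.FSSD(p, kernel.KGauss(gw_opt), V_opt, null_sim=null_sim, alpha=alpha)
    print(fssd_opt.perform_test(te))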
16 changes: 10 additions & 6 deletions kgof/util.py
@@ -251,12 +251,16 @@ def outer_rows(X, Y):
Return an n x dx x dy numpy array.
"""
-    n, dx = X.shape
-    dy = Y.shape[1]
-    X_col_rep = X[:, np.tile(range(dx), (dy, 1)).T.reshape(-1) ]
-    Y_tile = np.tile(Y, (1, dx))
-    Z = X_col_rep*Y_tile
-    return np.reshape(Z, (n, dx, dy))
+
+    # Matlab way to do this. According to Jonathan Huggins, this is not
+    # efficient. Use einsum instead. See below.
+    #n, dx = X.shape
+    #dy = Y.shape[1]
+    #X_col_rep = X[:, np.tile(range(dx), (dy, 1)).T.reshape(-1) ]
+    #Y_tile = np.tile(Y, (1, dx))
+    #Z = X_col_rep*Y_tile
+    #return np.reshape(Z, (n, dx, dy))
+    return np.einsum('ij,ik->ijk', X, Y)

def randn(m, n, seed=3):
with NumpySeedContext(seed=seed):
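For reference, a quick sanity check (not part of the commit) that the einsum formulation in outer_rows returns the same n x dx x dy array of row-wise outer products as the tile-based code it replaces:

    import numpy as np

    def outer_rows_tile(X, Y):
        # the old tile-based construction, reproduced only for comparison
        n, dx = X.shape
        dy = Y.shape[1]
        X_col_rep = X[:, np.tile(range(dx), (dy, 1)).T.reshape(-1)]
        Y_tile = np.tile(Y, (1, dx))
        return np.reshape(X_col_rep * Y_tile, (n, dx, dy))

    def outer_rows_einsum(X, Y):
        # Z[i, j, k] = X[i, j] * Y[i, k]: one outer product per paired row
        return np.einsum('ij,ik->ijk', X, Y)

    rng = np.random.RandomState(0)
    X = rng.randn(100, 3)
    Y = rng.randn(100, 4)
    assert np.allclose(outer_rows_tile(X, Y), outer_rows_einsum(X, Y))

The einsum version also skips the two (n, dx*dy) intermediate arrays that the tiling approach materializes, which is presumably where the speed-up reported in this commit comes from.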
