Add Bayesian Optimisation to AB testing example #1250

Merged: 19 commits, Jul 25, 2018

Changes from 1 commit
Tidy up BO for OED
Adam Foster committed Jul 25, 2018
commit 523362b9ad43e5f68b20c6ba802fe199bc74e73c
131 changes: 90 additions & 41 deletions examples/contrib/oed/gp_bayes_opt.py
@@ -12,19 +12,37 @@


class GPBayesOptimizer:
Member:
We should be able to refactor this a bit into a pyro.optim.multi.MultiOptimizer so that it looks more like other Pyro optimizers. That way, when we experiment with other optimizers (e.g. gradient-based ones), it'll be easy to swap them out.

Contributor Author:
I think we're best to talk this over together; we need to implement a step function which would replace my run function. Would we also need to refactor the example to use pyro.param? Right now, it's not really true that after a step, the params are updated to new near-optimal values. But obviously I could add self.opt_differentiable(lambda x: self.gpmodel(x)[0]) into each step (minimize the current GP mean function).
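(Editor's sketch, to make the shape of that refactor concrete: a hypothetical step() method for this class, reusing acquire, update_posterior and opt_differentiable. Illustrative only, not the committed design.)

    def step(self, num_acquisitions=1):
        """One round of acquisition and posterior update; returns the current
        minimiser of the GP mean and its value, so each step ends with a
        usable near-optimal estimate, as discussed above."""
        X = self.acquire(num_acquisitions=num_acquisitions)
        y = self.f(X)
        self.update_posterior(X, y)
        # Minimise the current GP posterior mean to report a running best guess
        return self.opt_differentiable(lambda x: self.gpmodel(x)[0])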

"""Performs Bayesian Optimization using a Gaussian Process as an
emulator for the unknown function.
Member:
Once the code is a little more settled down, you should expand this docstring and add a couple usage examples
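(Editor's sketch of the kind of usage example that could go in the docstring, assuming pyro.contrib.gp's RBF kernel and GPRegression and an interval constraint; the toy objective and data below are made up for illustration.)

    import torch
    from torch.distributions import constraints
    import pyro.contrib.gp as gp

    def f(x):
        # Toy objective with a minimum at x = 30
        return (x - 30.)**2 / 100.

    X = torch.tensor([5., 50., 95.])   # initial design points
    y = f(X)
    gpmodel = gp.models.GPRegression(X, y, gp.kernels.RBF(input_dim=1),
                                     noise=torch.tensor(0.1))
    bo = GPBayesOptimizer(f, constraints.interval(0., 100.), gpmodel)
    xmin, fmin = bo.run(num_steps=10, num_acquisitions=2)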

"""

def __init__(self, f, constraints, gpmodel):
"""
:param function f: the objective function which should accept `torch.Tensor`
inputs
:param torch.constraint constraints: constraints defining the domain of `f`
:param gp.models.GPRegression gpmodel: a (possibly initialized) GP
regression model. The kernel, etc is specified via `gpmodel`.
"""
self.f = f
self.constraints = constraints
self.gpmodel = gpmodel

def update_posterior(self, gpmodel, X, y):
X = torch.cat([gpmodel.X, X])
y = torch.cat([gpmodel.y, y])
gpmodel.set_data(X, y)
gpmodel.optimize()
def update_posterior(self, X, y):
X = torch.cat([self.gpmodel.X, X])
y = torch.cat([self.gpmodel.y, y])
self.gpmodel.set_data(X, y)
self.gpmodel.optimize()
Contributor Author:
Not very pyronic

Member:
Can we do some GP magic to make these updates cheaper than recomputing the full posterior?

Contributor Author:
Yes, it depends how much magic you want. Here https://github.com/uber/pyro/blob/dev/pyro/contrib/gp/models/gpr.py#L80 we could have cached Kff and just added the kernel computations we need. One step further is to use the magic of Schur complements https://en.wikipedia.org/wiki/Schur_complement to avoid having to invert the whole kernel matrix. I actually started implementing a Schur complement method for iter_sample, but in the end the existing solution works and reuses more existing code. In either case, we would have to refactor the internals of GPRegression.
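(Editor's sketch of the block-update idea, written against the modern torch.linalg API rather than the potrf/trtrs calls contrib.gp used at the time: given the Cholesky factor of the old Kff, the factor of the enlarged matrix can be grown from the Schur complement of the new block instead of refactorising from scratch.)

    import torch

    def extend_cholesky(L, B, D):
        """Given L with L @ L.T == K (N x N), cross-covariance B (N x m) and new
        block D (m x m), return the Cholesky factor of [[K, B], [B.T, D]]
        without refactorising K. Illustrative sketch only."""
        # C = B.T @ inv(L.T), via a triangular solve rather than an explicit inverse
        C = torch.linalg.solve_triangular(L, B, upper=False).T
        # Cholesky factor of the (small) Schur complement D - B.T @ inv(K) @ B
        S = torch.linalg.cholesky(D - C @ C.T)
        top = torch.cat([L, L.new_zeros(L.shape[0], D.shape[0])], dim=1)
        bottom = torch.cat([C, S], dim=1)
        return torch.cat([top, bottom], dim=0)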

Member:
Let's open a separate issue about this; it seems like generally useful functionality for contrib.gp.

Member (@fehiepsi, Jul 21, 2018):
I guess it is not expensive for Bayesian Optimization because num_data in BO is small. When num_data is large, I guess the most expensive operation is Cholesky(Kff).

Contributor Author:
I think the other important factor is how you plan to add points to the GP: all at once or drip by drip. In the all-at-once setting, the current code is optimal: you want to avoid explicitly inverting Kff, so using Cholesky plus trtrs is best. On the other hand, if you are going drip by drip, you can invert the small kernel matrices formed from the new points, and then get the overall precision matrix using Schur complements. But yeah, I'll open an issue about this.
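(Editor's sketch of the all-at-once path for reference, again using current torch.linalg names instead of potrf/trtrs: solve against the Cholesky factor rather than forming inv(Kff) explicitly.)

    import torch

    def gp_weights(Kff, y, jitter=1e-6):
        """Return alpha = inv(Kff) @ y using a Cholesky factorisation and
        triangular solves, never forming the inverse explicitly. Sketch only."""
        L = torch.linalg.cholesky(Kff + jitter * torch.eye(Kff.shape[0], dtype=Kff.dtype))
        # cholesky_solve performs two triangular solves against L
        return torch.cholesky_solve(y.unsqueeze(-1), L).squeeze(-1)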

def find_a_candidate(self, sampler, x_init):
def find_a_candidate(self, differentiable, x_init):
"""Given a starting point, `x_init`, takes one LBFGS step
to optimize the differentiable function.

:param function differentiable: a function amenable to torch
autograd
:param torch.Tensor x_init: the initial point

"""
# transform x to an unconstrained domain
unconstrained_x_init = transform_to(self.constraints).inv(x_init)
unconstrained_x = torch.tensor(unconstrained_x_init, requires_grad=True)
Member:
nit: use new_tensor here
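(Editor's note on the nit, with the variable names from the diff above: new_tensor copies the data while inheriting dtype and device from the source tensor.)

    unconstrained_x_init = transform_to(self.constraints).inv(x_init)
    # Copies unconstrained_x_init into a fresh leaf tensor with the same dtype/device
    unconstrained_x = unconstrained_x_init.new_tensor(unconstrained_x_init,
                                                      requires_grad=True)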

@@ -35,23 +53,29 @@ def closure():
if (torch.log(torch.abs(unconstrained_x)) > 15.).any():
return torch.tensor(float('inf'))
x = transform_to(self.constraints)(unconstrained_x)
y = sampler(x)
autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x, retain_graph=True))
# print(x, y, unconstrained_x, unconstrained_x.grad)
y = differentiable(x)
autograd.backward(unconstrained_x,
autograd.grad(y, unconstrained_x, retain_graph=True))
return y

# print("Starting off minimizer from", unconstrained_x, x_init)
minimizer.step(closure)
# after finding a candidate in the unconstrained domain,
# convert it back to original domain.
x = transform_to(self.constraints)(unconstrained_x)
opt_y = sampler(x)
if torch.isnan(opt_y).any():
print('opt_y', opt_y)
print('x', x)
opt_y = differentiable(x)
return x.detach(), opt_y.detach()

def opt_differentiable(self, differentiable, num_candidates=5):
"""Optimizes a differentiable function by choosing `num_candidates`
initial points at random and calling :func:`find_a_candidate` on
each. The best candidate is returned with its function value.

:param function differentiable: a function amenable to torch autograd
:param int num_candidates: the number of random starting points to
use
:return: the minimiser and its function value
:rtype: tuple
"""

x_init = self.gpmodel.X[-1:].new_empty(1).uniform_(self.constraints.lower_bound,
self.constraints.upper_bound)
@@ -67,7 +91,18 @@ def opt_differentiable(self, differentiable, num_candidates=5):
mvalue, argmin = torch.min(torch.cat(values), dim=0)
return candidates[argmin.item()], mvalue

def acquire(self, method="Thompson", num_candidates=1, num_acquisitions=1):
def acquire(self, method="Thompson", num_acquisitions=1, **opt_params):
"""Selects `num_acquisitions` query points at which to query the
original function.

:param str method: the method to use for acquisition. Choose from:
Thompson
:param int num_acquisitions: the number of points to generate
:param dict opt_params: additional parameters for optimization
routines
:return: a tensor of points to evaluate `self.f` at
:rtype: torch.Tensor
"""

if method == "Thompson":
Member:
Let's split this out into an acquisition function parameter instead of implementing acquisition functions inside acquire, so it's easier to experiment with different Monte Carlo acquisition functions.
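(Editor's sketch of roughly what that might look like, with the acquisition strategy passed as a callable; the thompson_sample helper and the acquire signature shown here are hypothetical, not the committed code.)

    def thompson_sample(gpmodel):
        # One lazily-constructed posterior sample path, as the current code uses
        return gpmodel.iter_sample(noiseless=False)

    # (method of GPBayesOptimizer)
    def acquire(self, acquisition_fn=thompson_sample, num_acquisitions=1, **opt_params):
        X = self.gpmodel.X.new_empty(num_acquisitions, *self.gpmodel.X.shape[1:])
        for i in range(num_acquisitions):
            x, _ = self.opt_differentiable(acquisition_fn(self.gpmodel), **opt_params)
            X[i, ...] = x
        return X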


@@ -76,7 +111,7 @@ def acquire(self, method="Thompson", num_candidates=1, num_acquisitions=1):

for i in range(num_acquisitions):
sampler = self.gpmodel.iter_sample(noiseless=False)
x, _ = self.opt_differentiable(sampler, num_candidates=5)
x, _ = self.opt_differentiable(sampler, **opt_params)
X[i, ...] = x

return X
@@ -85,38 +120,52 @@ def acquire(self, method="Thompson", num_candidates=1, num_acquisitions=1):
raise NotImplementedError("Only method Thompson implemented for acquisition")

def run(self, num_steps, num_acquisitions):
plt.figure(figsize=(12, 30))
outer_gs = gridspec.GridSpec(num_steps, 1)
"""
Optimizes `self.f` in `num_steps` steps, acquiring `num_acquisitions`
new function evaluations at each step.

:param int num_steps: the number of optimization steps to run
:param int num_acquisitions: the number of points to acquire at each step
:return: the minimiser and the minimum value
:rtype: tuple
"""

for i in range(num_steps):
X = self.acquire(num_acquisitions=num_acquisitions)
y = self.f(X)
gs = gridspec.GridSpecFromSubplotSpec(1, 1, subplot_spec=outer_gs[i])
self.plot(gs, xlabel=i+1, with_title=(i % 2 == 0))
self.update_posterior(self.gpmodel, X, y)

plt.show()
self.update_posterior(X, y)

return self.opt_differentiable(lambda x: self.gpmodel(x)[0])

# def run(self, num_steps, num_acquisitions):
# plt.figure(figsize=(12, 30))
# outer_gs = gridspec.GridSpec(num_steps, 1)

def plot(self, gs, xlabel=None, with_title=True):
xlabel = "xmin" if xlabel is None else "x{}".format(xlabel)
Xnew = torch.linspace(-1., 101.)
ax1 = plt.subplot(gs[0])
ax1.plot(self.gpmodel.X.detach().numpy(), self.gpmodel.y.detach().numpy(), "kx") # plot all observed data
with torch.no_grad():
loc, var = self.gpmodel(Xnew, full_cov=False, noiseless=False)
sd = var.sqrt()
ax1.plot(Xnew.numpy(), loc.numpy(), "r", lw=2) # plot predictive mean
ax1.fill_between(Xnew.numpy(), loc.numpy() - 2*sd.numpy(), loc.numpy() + 2*sd.numpy(),
color="C0", alpha=0.3) # plot uncertainty intervals
ax1.set_xlim(-1, 101)
ax1.set_title("Find {}".format(xlabel))
if with_title:
ax1.set_ylabel("Gaussian Process Regression")



# for i in range(num_steps):
# X = self.acquire(num_acquisitions=num_acquisitions)
# y = self.f(X)
# gs = gridspec.GridSpecFromSubplotSpec(1, 1, subplot_spec=outer_gs[i])
# self.plot(gs, xlabel=i+1, with_title=(i % 2 == 0))
# self.update_posterior(self.gpmodel, X, y)

# plt.show()

# return self.opt_differentiable(lambda x: self.gpmodel(x)[0])


# def plot(self, gs, xlabel=None, with_title=True):
# xlabel = "xmin" if xlabel is None else "x{}".format(xlabel)
# Xnew = torch.linspace(-1., 101.)
# ax1 = plt.subplot(gs[0])
# ax1.plot(self.gpmodel.X.detach().numpy(), self.gpmodel.y.detach().numpy(), "kx") # plot all observed data
# with torch.no_grad():
# loc, var = self.gpmodel(Xnew, full_cov=False, noiseless=False)
# sd = var.sqrt()
# ax1.plot(Xnew.numpy(), loc.numpy(), "r", lw=2) # plot predictive mean
# ax1.fill_between(Xnew.numpy(), loc.numpy() - 2*sd.numpy(), loc.numpy() + 2*sd.numpy(),
# color="C0", alpha=0.3) # plot uncertainty intervals
# ax1.set_xlim(-1, 101)
# ax1.set_title("Find {}".format(xlabel))
# if with_title:
# ax1.set_ylabel("Gaussian Process Regression")
Contributor Author:
Commented code is unsightly but useful for debugging. test_examples will not admit matplotlib in example code.

Member:
Feel free to use local imports for plotting dependencies:

def plot(self, gs, ...):
    from matplotlib import pyplot as plt
    ...

That's not perfect, but it's better than commented-out code 😉

Member:
You should probably just add a separate test for this and remove the test code here

Contributor Author:
I'll remove the code, keep it on a local branch


38 changes: 19 additions & 19 deletions pyro/contrib/gp/models/gpr.py
@@ -144,64 +144,64 @@ def forward(self, Xnew, full_cov=False, noiseless=True):

return loc + self.mean_function(Xnew), cov

def iter_sample(self, full_cov=False, noiseless=True):
def iter_sample(self, noiseless=True):
r"""
Iteratively constructs a sample from the Gaussian Process posterior.

Recall that at test input points :math:`X_{new}`, the posterior is
multivariate Gaussian distributed with mean and covariance matrix
given by :func:`forward`.

This method samples lazily from multivariate Gaussian. The advantage
This method samples lazily from this multivariate Gaussian. The advantage
of this approach is that later query points can depend upon earlier ones.
Particularly useful when the querying is to be done by an optimisation
routine.

.. note:: The noise parameter ``noise`` (:math:`\epsilon`) together with
kernel's parameters have been learned from a training procedure (MCMC or
SVI).

:param bool noiseless: A flag to decide if we want to include noise in the
sample or not.
:param bool noiseless: A flag to decide if we want to add sampling noise
to the samples beyond the noise inherent in the GP posterior.
:returns: sampler
:rtype: function
"""
noise = self.guide().detach()
# Make these visible in the inner function
global X, y, Kff, N
Member:
Instead of using global here and below, how about making sample_next take these variables as arguments and returning a curried function with functools.partial?

def sample_next(X, y, Kff, N, xnew):
    ...

return functools.partial(sample_next, X, y, Kff, N)

Contributor Author:
The reason I didn't do that was that we change X inside the inner function (by writing X = Xnew). I thought scoping rules would mean that that change wasn't saved, but honestly I didn't actually try it

Member:
Will LBFGS add many samples to the globals "X", "Y" during its optimization for sampler?

Contributor Author:
Yes it will, but I passed max_eval=20 to cap this.
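(For reference: max_eval is a constructor argument of torch.optim.LBFGS, so the cap sits where the minimizer is built in find_a_candidate, roughly as below.)

    # Cap function evaluations per .step(closure) call, which also caps how many
    # points the lazily-sampled GP path accumulates during one candidate search
    minimizer = torch.optim.LBFGS([unconstrained_x], max_eval=20)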

Member:
You can also put these variables into a dictionary and mutate that instead of using global
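(Editor's note: a toy illustration of both alternatives to global for state that the inner function rebinds between calls: nonlocal, or a mutated dict as suggested here. The GP version would hold X, y, Kff and N instead of a counter.)

    # Option 1: nonlocal rebinds the enclosing function's variable directly
    def make_counter_nonlocal():
        n = 0
        def step():
            nonlocal n
            n += 1
            return n
        return step

    # Option 2: mutate a dict captured by the closure (no declaration needed)
    def make_counter_dict():
        state = {"n": 0}
        def step():
            state["n"] += 1
            return state["n"]
        return step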


noise = self.guide().detach()
X = self.X.clone().detach()
y = self.y.clone().detach()

N = X.shape[0]
Kff = self.kernel(X).contiguous()
Kff.view(-1)[::N + 1] += noise # add noise to the diagonal

def sample_next(xnew):
"""Repeatedly samples from the Gaussian process posterior,
conditioning on previously sampled values.
"""
if torch.isnan(xnew).any():
raise ValueError("Cannot evaluate GP at value: {}".format(xnew))
Member:
nit: you might want to use warn_if_nan https://github.com/uber/pyro/blob/dev/pyro/util.py#L49
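(Editor's sketch of that nit, assuming pyro.util.warn_if_nan keeps its value-and-message signature: it warns rather than raising when the check fires.)

    from pyro.util import warn_if_nan

    # Emits a UserWarning when xnew contains NaNs, instead of raising a ValueError
    warn_if_nan(xnew, "xnew passed to sample_next")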


# Variables from outer scope
global X, y, Kff, N
Lff = Kff.potrf(upper=False)

# Compute Cholesky decomposition of kernel matrix
Lff = Kff.potrf(upper=False)
y_residual = y - self.mean_function(X)

# Compute conditional mean and variance
loc, cov = conditional(xnew, X, self.kernel, y_residual, None, Lff,
False, jitter=self.jitter)

if not noiseless:
cov = cov + noise

# Todo use pyro.sample
d = normal.Normal(torch.tensor(0.), torch.tensor(1.))
# cov = torch.max(cov, torch.tensor(0.))
if torch.isnan(loc) or torch.isnan(cov) or torch.isnan(cov.sqrt()):
print('loc', loc)
print('cov', cov)
print('X', X)
print('xnew', xnew)
print('Kff', Kff)
print('LL^T', Lff.mm(Lff.t()))
print('logdet', Kff.logdet())
print('N', N)
raise
# Reparametrize explicitly - aids autograd
ynew = (loc + self.mean_function(xnew)) + d.sample()*cov.sqrt()
Contributor Author:
It would be nice to make this a true pyronic sampler if possible

Member:
Why can't you write torch.distributions.Normal(loc + self.mean_function(xnew), cov.sqrt()).rsample()?

Contributor Author:
Yes, I had it in my mind that that didn't work; it does.
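(Editor's note: the two forms give the same gradient path, since rsample() applies the loc + eps * scale reparameterisation that the current code writes out by hand. A minimal illustration:)

    import torch
    from torch.distributions import Normal

    loc = torch.tensor(1.0, requires_grad=True)
    scale = torch.tensor(2.0)

    # Manual reparameterisation, as in the current code
    eps = Normal(torch.tensor(0.), torch.tensor(1.)).sample()
    y_manual = loc + eps * scale

    # Same thing via rsample(): gradients still flow back to loc
    y_rsample = Normal(loc, scale).rsample()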


# Update kernel matrix
Kffnew = Kff.new_empty(N+1,N+1)
Kffnew[:N, :N] = Kff
cross = self.kernel(X, xnew).squeeze()