Optimizer decomposing Neural Net matrices as sums of structured matrices #64
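This PR develops chop.stochastic.SplittingProxFW, a stochastic optimizer that splits each parameter as p = x + y and handles one block with a proximal step and the other with a Frank-Wolfe (LMO) step. Below is a minimal usage sketch adapted from the Robust PCA example added in this PR; the matrix size and ball radii are placeholders chosen only for illustration.

import torch
import chop

# Observed matrix to decompose into sparse + low-rank components.
M = torch.randn(1000, 1000)
Z = torch.zeros_like(M, requires_grad=True)

# One constraint is handled via its LMO (Frank-Wolfe step),
# the other via its prox (proximal step).
lmo = chop.constraints.NuclearNormBall(100.).lmo   # radius is a placeholder
prox = chop.constraints.L1Ball(1000.).prox         # radius is a placeholder

optimizer = chop.stochastic.SplittingProxFW(
    [Z], lmo=[lmo], prox=[prox],
    lr_lmo='sublinear', lr_prox='sublinear',
    normalization='none')

for _ in range(100):
    optimizer.zero_grad()
    loss = .5 / M.numel() * torch.linalg.norm(Z - M, ord='fro') ** 2
    loss.backward()
    optimizer.step()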

Draft · wants to merge 56 commits into master from geoff/stochastic_hybrid_prox_lmo

Commits (56)
ef6e2e6
First draft Stochastic Hybrid Prox LMO
GeoffNN Mar 31, 2021
6afc510
Added stochastic Robust PCA example first draft
GeoffNN Mar 31, 2021
1d7b945
Stochastic Robust PCA example fixed
GeoffNN Apr 1, 2021
fa60642
Added layer nuclear norm constraints + tests
GeoffNN Apr 2, 2021
7d0005e
Bug fixes; slowly migrating to param_groups API for Optimizer
GeoffNN Apr 6, 2021
aa447e0
MNIST example LR+Sparse
GeoffNN Apr 6, 2021
6b53c43
Merge branch 'master' into geoff/stochastic_hybrid_prox_lmo
GeoffNN Apr 7, 2021
4f52df2
Merge branch 'master' into geoff/stochastic_hybrid_prox_lmo
GeoffNN Apr 7, 2021
578e8b7
Bug fix NuclearNorm prox
GeoffNN Apr 13, 2021
c921483
slowly migrating optimizers to correct **params API
GeoffNN Apr 13, 2021
a672e64
Stochastic Robust PCA example
GeoffNN Apr 13, 2021
3061108
Deleted obsolete LR + sparse example
GeoffNN Apr 13, 2021
2182446
Merge branch 'master' into geoff/stochastic_hybrid_prox_lmo
GeoffNN Apr 20, 2021
bbe9470
fixes to splitting method -- added gradient normalization
GeoffNN Apr 24, 2021
7671cf7
removed redundant code
GeoffNN Apr 28, 2021
b41a17b
Enforcing relationship between lr, lr_prox and lipschitz
GeoffNN May 1, 2021
227b627
Removed lr_prox parameter; using lipschitz estimate, consistently wit…
GeoffNN May 3, 2021
ccc4059
state is initialized at every step in hybrid optimizer
GeoffNN May 4, 2021
6bf2b13
Prox/LMO are now Modules, to ensure pickle-ability
GeoffNN May 7, 2021
346dd32
Fixed initialization bug: making y feasible w/ correct projection
GeoffNN May 7, 2021
4200710
Fixed stochastic robust PCA example + code snippet for extracting com…
GeoffNN May 8, 2021
22a6c19
Minor fixes
GeoffNN May 8, 2021
6cb9a32
Fix when prox2 is None
GeoffNN May 9, 2021
fb6ec7d
Fixed view/reshape issue
GeoffNN May 10, 2021
28eff81
Allowing 0 constraints
GeoffNN May 10, 2021
34b99a7
Changed dataloader + loss for efficiency
GeoffNN May 14, 2021
9afbd5d
Removed debug statements
GeoffNN May 14, 2021
e95dbed
L1 penalty prox w/ stochastic hybrid splitting
GeoffNN May 14, 2021
601b738
Penalized version of RobustPCA w/ generalized LMO
GeoffNN May 14, 2021
633eee6
Hybrid Prox method now works for penalty LMO
GeoffNN May 15, 2021
10b7c4e
Slight change
GeoffNN May 15, 2021
3c99631
Slight algo modification
GeoffNN May 18, 2021
5106cf3
Fixed penalty to use torch.svd instead of torch.linalg.svd + step siz…
GeoffNN May 18, 2021
2b2f669
Fixed svd calls
GeoffNN May 18, 2021
4291e1f
SVD call updated
GeoffNN May 18, 2021
e5671d0
Fixed penalized training + changes to svd calls
GeoffNN May 18, 2021
b75436b
took out print statement
utrerf May 18, 2021
5460b80
Merge pull request #67 from utrerf/patch-2
GeoffNN May 19, 2021
69bec20
Added penalty initialization
GeoffNN May 20, 2021
c0be0ff
removed todo
GeoffNN May 21, 2021
7028526
removed print, redundant computation
GeoffNN May 21, 2021
34409b1
removed comment
GeoffNN May 21, 2021
88d72b1
Updated CIFAR
utrerf May 23, 2021
24f2faa
Merge pull request #68 from utrerf/patch-3
GeoffNN May 23, 2021
9393c27
Minor rewrites
GeoffNN May 26, 2021
65da92f
Merge branch 'geoff/stochastic_hybrid_prox_lmo' of https://github.com…
GeoffNN May 26, 2021
14378dc
vectorized L1/Simplex projections
GeoffNN May 26, 2021
cd283ca
special case when nuclear norm is of diameter 0
GeoffNN May 27, 2021
429129b
Made Frank-Wolfe savable
GeoffNN May 27, 2021
d575a09
Updated ImageNet
utrerf Jun 1, 2021
8634690
Merge pull request #69 from utrerf/patch-4
GeoffNN Jun 3, 2021
68129af
took out the ch
utrerf Aug 6, 2021
1b9fd33
Merge pull request #70 from utrerf/patch-5
GeoffNN Aug 6, 2021
86ba4bd
Added tqdm for iters in optim
GeoffNN Aug 6, 2021
47aa745
Bug fix for penalized stochastic FW
GeoffNN Sep 17, 2021
ca1bb2b
Merge branch 'geoff/stochastic_hybrid_prox_lmo' of github.com:openopt…
GeoffNN Sep 17, 2021
Added stochastic Robust PCA example first draft
GeoffNN committed Mar 31, 2021
commit 6afc5104a15217ff6fb818d96f6f8f929f65d26a
64 changes: 44 additions & 20 deletions chop/stochastic.py
@@ -569,35 +569,50 @@ def __init__(self, params, lmo, prox=None,
momentum=0., weight_decay=0.,
normalization='none'):

self.lmo = []
# initialize proxes
if prox is None:
prox = [None] * len(list(params))

prox_candidates = []
for prox_el in prox:
if prox_el is not None:
prox_candidates.append(lambda x, s=None: prox_el(x.unsqueeze(0), s).squeeze(0))
else:
prox_candidates.append(lambda x, s=None: x)
# initialize lmos
lmo_candidates = []
for oracle in lmo:
if oracle is None:
# Then FW will not be used on this parameter
raise ValueError("LMOs cannot be None for this optimizer.")
_lmo = None
else:
def _lmo(u, x):
update_direction, max_step_size = oracle(u.unsqueeze(0), x.unsqueeze(0))
return update_direction.squeeze(dim=0), max_step_size
self.lmo.append(_lmo)

if prox is None:
prox = [None] * len(list(params))
lmo_candidates.append(_lmo)

self.lmo = []
self.prox = []
for prox_el in prox:
if prox_el is not None:
self.prox.append(lambda x, s=None: prox_el(x.unsqueeze(0), s).squeeze(0))
useable_params = []
for param, lmo_oracle, prox_oracle in zip(params, lmo_candidates, prox_candidates):
if lmo_oracle:
useable_params.append(param)
self.lmo.append(lmo_oracle)
self.prox.append(prox_oracle)
else:
self.prox.append(lambda x, s=None: x)
msg = (f"No LMO was provided for parameter {param}. "
f"This optimizer will not optimize this parameter. "
f"Please pass this parameter to another optimizer.")
warnings.warn(msg)

for name, lr in (('lr_lmo', lr_lmo),
('lr_prox', lr_prox)):
if not type(lr) == float:
msg = f"{name} should be a float, got {lr}."
if not ((type(lr) == float) or lr == 'sublinear'):
msg = f"{name} should be a float or 'sublinear', got {lr}."
raise ValueError(msg)

if not(0. <= momentum <= 1.):
raise ValueError("omentum must be in [0., 1.].")
raise ValueError("momentum must be in [0., 1.].")

if not (weight_decay >= 0):
raise ValueError("weight_decay must be nonnegative.")
@@ -631,25 +646,34 @@ def step(self, closure=None):
for p in group['params']:
if p.grad is None:
continue
grad = p.grad + self.weight_decay * p

grad = p.grad
state = self.state[p]
if grad.is_sparse:
raise RuntimeError("We do not yet support sparse gradients.")
# Keep track of the step
state = self.state[p]

grad += group['weight_decay'] * p
# Initialization
if len(state) == 0:
state['step'] = 0.
# split variable: p = x + y
state['x'] = .5 * p.detach().clone()
state['y'] = .5 * p.detach().clone()
# initialize grad estimate
state['grad_est'] = grad
# initialize learning rates
state['lr_prox'] = group['lr_prox'] if type(group['lr_prox']) == float else 0.
state['lr_lmo'] = group['lr_lmo'] if type(group['lr_lmo']) == float else 0.
state['step'] += 1.
state['grad_est'].add_(grad, alpha=1. - group['momentum'])

for lr in ('lr_prox', 'lr_lmo'):
if group[lr] == 'sublinear':
state[lr] = 2. / (state['step'] + 2)

y_update, max_step_size = self.lmo[idx](-grad, state['y'])
state['lr_lmo'] = torch.minimum(state['lr_lmo'], max_step_size)
y_update, max_step_size = group['lmo'][idx](-state['grad_est'], state['y'])
state['lr_lmo'] = min(state['lr_lmo'], max_step_size)
w = y_update + state['y']
v = self.prox[idx](state['x'] + state['y'] - w - grad / state['lr_prox'], state['lr_prox'])
v = group['prox'][idx](state['x'] + state['y'] - w - state['grad_est'] / state['lr_prox'], group['lr_prox'])

state['y'].add_(y_update, alpha=state['lr_lmo'])
x_update = v - state['x']
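For readability, here is a condensed sketch of the per-parameter update implemented by the step() hunk above, written against hypothetical standalone lmo(u, x) and prox(x, s) callables. Momentum, weight decay, the 'sublinear' schedules and state bookkeeping are elided, and how x_update is applied to x lies outside the hunk shown.

def splitting_update(x, y, grad_est, lmo, prox, lr_lmo, lr_prox):
    # Frank-Wolfe / LMO step on the y block of the split p = x + y
    y_update, max_step_size = lmo(-grad_est, y)
    lr_lmo = min(lr_lmo, max_step_size)
    w = y + y_update
    # proximal step on the x block
    v = prox(x + y - w - grad_est / lr_prox, lr_prox)
    y = y + lr_lmo * y_update
    x_update = v - x
    return x_update, y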
9 changes: 6 additions & 3 deletions examples/plot_robust_PCA.py
@@ -25,8 +25,10 @@
m = 1000
n = 1000

r_p = [(5, 1e-3), (5, 3e-3), (25, 1e-3), (25, 3e-3),
(25, 3e-2), (130, 1e-2)]
r_p = [(5, 1e-3),
# (5, 3e-3), (25, 1e-3), (25, 3e-3),
# (25, 3e-2), (130, 1e-2)
]

for r, p in r_p:
print(f'r={r} and p={p}')
@@ -49,7 +51,7 @@

@utils.closure
def sqloss(Z):
return .5 * torch.linalg.norm((Z - M).squeeze(), ord='fro') ** 2
return .5 / M.numel() * torch.linalg.norm((Z - M).squeeze(), ord='fro') ** 2

rnuc = torch.linalg.norm(L.squeeze(), ord='nuc')
sL1 = abs(S).sum()
@@ -102,6 +104,7 @@ def line_search(kwargs):
fig.suptitle(f'r={r} and p={p}')

axes[0].plot(f_vals)
axes[0].set_ylim(0, 250)
axes[0].set_title("Function values")

axes[1].plot(sparse_comp)
102 changes: 102 additions & 0 deletions examples/plot_stochastic_robust_PCA.py
@@ -0,0 +1,102 @@

"""
Stochastic Robust PCA
===========

This example fits a Robust PCA model to data.
It uses a stochastic hybrid Frank-Wolfe and proximal method.
See description in :func:`chop.stochastic.SplittingProxFW`.


We reproduce the synthetic experimental setting from `[Garber et al. 2018] <https://arxiv.org/pdf/1802.05581.pdf>`_.
We aim to recover :math:`M = L + S + N`, where :math:`L` is rank :math:`p`,
:math:`S` is :math:`p` sparse, and :math:`N` is standard Gaussian elementwise.
"""


import matplotlib.pyplot as plt
import torch
import chop
from chop import utils
from chop.utils.logging import Trace


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

m = 1000
n = 1000

r_p = [(5, 1e-3),
# (5, 3e-3), (25, 1e-3), (25, 3e-3),
# (25, 3e-2), (130, 1e-2)
]

n_epochs = 100

for r, p in r_p:
print(f'r={r} and p={p}')
U = torch.normal(torch.zeros(m, r))
V = torch.normal(torch.zeros(r, n))

# Low rank component
L = 10 * utils.bmm(U, V)

# Sparse component
S = 100 * torch.normal(torch.zeros(m, n))

S *= (torch.rand_like(S) <= p)

# Add noise
N = torch.normal(torch.zeros(m, n))

M = L + S + N
M = M.to(device)

def sqloss(Z, M):
return .5 / M.numel() * torch.linalg.norm((Z - M).squeeze(), ord='fro') ** 2

rnuc = torch.linalg.norm(L.squeeze(), ord='nuc')
sL1 = abs(S).sum()

print(f"Initial L1 norm: {sL1}")
print(f"Initial Nuclear norm: {rnuc}")

rank_constraint = chop.constraints.NuclearNormBall(rnuc)
sparsity_constraint = chop.constraints.L1Ball(sL1)

lmo = rank_constraint.lmo
prox = sparsity_constraint.prox

Z = torch.zeros_like(M, device=device)
Z.requires_grad_(True)

sampler = torch.utils.data.BatchSampler(torch.utils.data.RandomSampler(range(M.size(0))),
batch_size=100,
drop_last=False)

optimizer = chop.stochastic.SplittingProxFW([Z], lmo=[lmo], prox=[prox],
lr_lmo='sublinear',
lr_prox='sublinear',
normalization='none')

train_losses = []
losses = []

for it in range(n_epochs):
for idx in sampler:
optimizer.zero_grad()
loss = sqloss(Z[idx], M[idx])
# for logging
with torch.no_grad():
full_loss = sqloss(Z, M)
losses.append(full_loss.item())
train_losses.append(loss.item())
loss.backward()
optimizer.step()


plt.plot(train_losses, label='training_losses')
plt.plot(losses, label='loss')
plt.ylim(0, 250)
plt.legend()
print("Done.")