add first sketch of the factorization module
ddbourgin committed May 9, 2020
1 parent be8ae23 commit 5ffd87b
Showing 3 changed files with 383 additions and 0 deletions.
6 changes: 6 additions & 0 deletions numpy_ml/factorization/README.md
@@ -0,0 +1,6 @@
# Factors
The `factors.py` module implements common approximate matrix-factorization
algorithms (a short usage sketch follows the list), including:

- Regularized alternating least squares (ALS)
- Non-negative matrix factorization via fast hierarchical alternating least squares (HALS) ([Cichocki & Phan, 2009](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.214.6398&rep=rep1&type=pdf))
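
Both factorizers expose the same `fit` / `parameters` interface. Below is a
minimal usage sketch; the data and hyperparameters are arbitrary and chosen
only for illustration:

```python
import numpy as np

from numpy_ml.factorization import NMF, VanillaALS

X = np.random.rand(100, 50)  # (N, M) data matrix to factor

# regularized ALS with K = 10 latent factors
als = VanillaALS(K=10, alpha=0.5, max_iter=200)
als.fit(X, verbose=False)
W, H = als.parameters["W"], als.parameters["H"]  # (N, K) and (K, M)

# nonnegative factorization via fast HALS
nmf = NMF(K=10, max_iter=200)
nmf.fit(X, verbose=False)
X_hat = nmf.parameters["W"] @ nmf.parameters["H"]  # rank-K reconstruction
```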
3 changes: 3 additions & 0 deletions numpy_ml/factorization/__init__.py
@@ -0,0 +1,3 @@
"""Algorithms for approximate matrix factorization"""

from .factors import *
374 changes: 374 additions & 0 deletions numpy_ml/factorization/factors.py
@@ -0,0 +1,374 @@
"""Algorithms for approximate matrix factorization"""

from copy import deepcopy

import numpy as np


class VanillaALS:
def __init__(self, K, alpha=1, max_iter=200, tol=1e-4):
r"""
        Approximately factor a real-valued matrix using regularized alternating
        least-squares (ALS).

        Notes
        -----
        The regularized ALS minimization problem is

        .. math::

            \min_{\mathbf{W}, \mathbf{H}} ||\mathbf{X} - \mathbf{WH}||^2 +
                \alpha \left( ||\mathbf{W}||^2 + ||\mathbf{H}||^2 \right)

        where :math:`||\cdot||` denotes the Frobenius norm, **X** is the
        :math:`N \times M` data matrix, and :math:`\mathbf{W}` and
        :math:`\mathbf{H}` are learned factor matrices with dimensions :math:`N
        \times K` and :math:`K \times M`, respectively. :math:`\alpha` is a
        user-defined regularization weight.

        ALS proceeds by alternating between fixing **W** and optimizing for
        **H**, and fixing **H** and optimizing for **W**. Vanilla ALS has no
        convergence guarantees, and the objective function is prone to
        oscillation across updates, particularly for dense input matrices [1]_.

        References
        ----------
        .. [1] Gillis, N. (2014). The why and how of nonnegative matrix
           factorization. *Regularization, optimization, kernels, and support
           vector machines*, 12(257), 257-291.

        Parameters
        ----------
        K : int
            The number of latent factors to include in the factor matrices
            **W** and **H**.
        alpha : float
            The L2 regularization weight on the factor matrices. Larger
            values result in more aggressive regularization. Default is 1.
        max_iter : int
            The maximum number of iterations to run before stopping. Default
            is 200.
        tol : float
            The tolerance for the stopping condition. Default is 1e-4.
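
        Examples
        --------
        A minimal usage sketch (data and hyperparameters are arbitrary):

        >>> import numpy as np
        >>> X = np.random.rand(100, 50)
        >>> als = VanillaALS(K=10, alpha=0.5)
        >>> als.fit(X, verbose=False)
        >>> W, H = als.parameters["W"], als.parameters["H"]
        >>> X_hat = W @ H  # rank-K reconstruction of X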
"""
self.K = K
self.W = None
self.H = None
self.tol = tol
self.alpha = alpha
self.max_iter = max_iter

@property
def parameters(self):
"""Return a dictionary of the current model parameters"""
return {"W": self.W, "H": self.H}

@property
def hyperparameters(self):
"""Return a dictionary of the model hyperparameters"""
return {
"id": "ALSFactor",
"K": self.K,
"tol": self.tol,
"alpha": self.alpha,
"max_iter": self.max_iter,
}

def _init_factor_matrices(self, X, W=None, H=None):
"""Randomly initialize the factor matrices"""
N, M = X.shape
scale = np.sqrt(X.mean() / self.K)
self.W = np.random.rand(N, self.K) * scale if W is None else W
self.H = np.random.rand(self.K, M) * scale if H is None else H

assert self.W.shape == (N, self.K)
assert self.H.shape == (self.K, M)

def _loss(self, X, Xhat):
"""Regularized Frobenius loss"""
alpha, W, H = self.alpha, self.W, self.H
sq_fnorm = lambda x: np.sum(x ** 2) # noqa: E731
return sq_fnorm(X - Xhat) + alpha * (sq_fnorm(W) + sq_fnorm(H))

def _update_factor(self, X, A):
"""Perform the ALS update"""
T1 = np.linalg.inv(A.T @ A + self.alpha * np.eye(self.K))
return X @ A @ T1

def fit(self, X, W=None, H=None, n_initializations=10, verbose=False):
"""
        Factor a data matrix into two low-rank factors via ALS.

        Parameters
----------
X : numpy array of shape `(N, M)`
The data matrix to factor.
W : numpy array of shape `(N, K)` or None
An initial value for the `W` factor matrix. If None, initialize `W`
randomly. Default is None.
H : numpy array of shape `(K, M)` or None
An initial value for the `H` factor matrix. If None, initialize `H`
randomly. Default is None.
n_initializations : int
Number of re-initializations of the algorithm to perform before
taking the answer with the lowest reconstruction error. This value
is ignored and set to 1 if both `W` and `H` are not None. Default
is 10.
verbose : bool
Whether to print the loss at each iteration. Default is False.
"""
if W is not None and H is not None:
n_initializations = 1

best_loss = np.inf
for f in range(n_initializations):
if verbose:
print("\nINITIALIZATION {}".format(f + 1))

new_W, new_H, loss = self._fit(X, W, H, verbose)

if loss <= best_loss:
best_loss = loss
best_W, best_H = deepcopy(new_W), deepcopy(new_H)

self.W, self.H = best_W, best_H

if verbose:
print("\nFINAL LOSS: {}".format(best_loss))

def _fit(self, X, W, H, verbose):
self._init_factor_matrices(X, W, H)
prev_loss = loss = np.inf
W, H = self.W, self.H

for i in range(self.max_iter):
prev_loss = loss
W = self._update_factor(X, H.T)
H = self._update_factor(X.T, W).T

loss = self._loss(X, W @ H)

if verbose:
print("[Iter {}] Loss: {:.6f}".format(i + 1, loss))

if (prev_loss - loss) <= self.tol:
break

return W, H, loss


class NMF:
def __init__(self, K, max_iter=200, tol=1e-4):
r"""
        Nonnegative matrix factorization (NMF) performed using fast
        hierarchical alternating least squares (HALS) [*]_.

        Notes
        -----
        The NMF minimization problem is

        .. math::

            \min_{\mathbf{W}, \mathbf{H}} ||\mathbf{X} - \mathbf{WH}||^2
                \ \ \ \ \text{subject to } \mathbf{W}, \mathbf{H} \geq 0

        where :math:`||\cdot||` denotes the Frobenius norm, and the notation
        :math:`\mathbf{A} \geq 0` indicates that each element of **A** is
        greater than or equal to 0. In the above equation, **X** is the
        :math:`N \times M` data matrix, and :math:`\mathbf{W}` and
        :math:`\mathbf{H}` are learned factor matrices with dimensions :math:`N
        \times K` and :math:`K \times M`, respectively.

        As with other ALS-based approaches, there is no guarantee that NMF
        will converge to a stationary point, let alone a global minimum. As a
        result, it is generally good practice to run the algorithm multiple
        times with different initializations, taking the outcome that achieves
        the lowest reconstruction error.

        References
        ----------
        .. [*] Cichocki, A., & Phan, A. (2009). Fast local algorithms for
           large scale nonnegative matrix and tensor factorizations. *IEICE
           Transactions on Fundamentals of Electronics, Communications and
           Computer Sciences, 92(3)*, 708-721.

        Parameters
        ----------
        K : int
            The number of latent factors to include in the factor matrices
            **W** and **H**.
        max_iter : int
            The maximum number of iterations to run before stopping. Default
            is 200.
        tol : float
            The tolerance for the stopping condition. Default is 1e-4.
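
        Examples
        --------
        A minimal usage sketch (data and hyperparameters are arbitrary):

        >>> import numpy as np
        >>> X = np.random.rand(100, 50)  # entries are nonnegative
        >>> nmf = NMF(K=10)
        >>> nmf.fit(X, verbose=False)
        >>> X_hat = nmf.parameters["W"] @ nmf.parameters["H"]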
"""
self.K = K
self.W = None
self.H = None
self.tol = tol
self.max_iter = max_iter

@property
def parameters(self):
"""Return a dictionary of the current model parameters"""
return {"W": self.W, "H": self.H}

@property
def hyperparameters(self):
"""Return a dictionary of the model hyperparameters"""
return {
"id": "NMF",
"K": self.K,
"tol": self.tol,
"max_iter": self.max_iter,
}

def _init_factor_matrices(self, X, W, H):
"""Initialize the factor matrices using vanilla ALS"""
ALS = None
N, M = X.shape

# initialize factors using ALS if not already defined
if W is None:
ALS = VanillaALS(self.K, alpha=0, max_iter=200)
ALS.fit(X, verbose=False)
W = ALS.W / np.linalg.norm(ALS.W, axis=0)

if H is None:
H = np.abs(np.random.rand(self.K, M)) if ALS is None else ALS.H

assert W.shape == (N, self.K)
assert H.shape == (self.K, M)

self.H = H
self.W = W

def _loss(self, X, Xhat):
"""Return the least-squares reconstruction loss between X and Xhat"""
return np.sum((X - Xhat) ** 2)

def _update_H(self, X, W, H):
"""Perform the fast HALS update for H"""
        eps = 1e-16  # np.finfo(float).eps
XtW = X.T @ W # dim: (M, K)
WtW = W.T @ W # dim: (K, K)

for k in range(self.K):
H[k, :] += XtW[:, k] - H.T @ WtW[:, k]
H[k, :] = np.clip(H[k, :], eps, np.inf) # enforce nonnegativity
return H

def _update_W(self, X, W, H):
"""Perform the fast HALS update for W"""
eps = 1e-16 # np.finfo(float).eps
XHt = X @ H.T # dim: (N, K)
HHt = H @ H.T # dim: (K, K)

for k in range(self.K):
W[:, k] = W[:, k] * HHt[k, k] + XHt[:, k] - W @ HHt[:, k]
W[:, k] = np.clip(W[:, k], eps, np.inf) # enforce nonnegativity

# renormalize the new column
n = np.linalg.norm(W[:, k])
W[:, k] /= n if n > 0 else 1.0
return W

def fit(self, X, W=None, H=None, n_initializations=10, verbose=False):
r"""
        Factor a data matrix into two nonnegative low-rank factor matrices via
        fast HALS.

        Notes
        -----
        This method implements Algorithm 2 from [*]_. In contrast to vanilla
        ALS, HALS proceeds by minimizing a *set* of local cost functions with
        the same global minima. Each cost function is defined on a "residue" of
        the factor matrices **W** and **H**:

        .. math::

            \mathbf{X}^{(j)} :=
                \mathbf{X} - \mathbf{WH} + \mathbf{w}_j \mathbf{h}_j

        where :math:`\mathbf{X}^{(j)}` is the `j`th residue, **X** is the input
        data matrix, and :math:`\mathbf{w}_j` and :math:`\mathbf{h}_j` are the
        `j`th column of **W** and `j`th row of **H**, respectively. HALS
        proceeds by minimizing the cost for each residue, first with respect to
        :math:`\mathbf{w}_j` holding :math:`\mathbf{h}_j` fixed, and then with
        respect to :math:`\mathbf{h}_j`, holding the newly updated
        :math:`\mathbf{w}_j` fixed. The residue cost :math:`\mathcal{L}^{(j)}`
        for :math:`\mathbf{X}^{(j)}` is simply:

        .. math::

            \mathcal{L}^{(j)} :=
                || \mathbf{X}^{(j)} - \mathbf{w}_j \mathbf{h}_j ||^2

        where :math:`||\cdot||` denotes the Frobenius norm. For NMF, this
        minimization is performed under the constraint that all elements of
        both **W** and **H** are nonnegative.

        References
        ----------
        .. [*] Cichocki, A., & Phan, A. (2009). Fast local algorithms for
           large scale nonnegative matrix and tensor factorizations. *IEICE
           Transactions on Fundamentals of Electronics, Communications and
           Computer Sciences, 92(3)*, 708-721.

        Parameters
        ----------
        X : numpy array of shape `(N, M)`
            The data matrix to factor.
        W : numpy array of shape `(N, K)` or None
            An initial value for the `W` factor matrix. If None, initialize
            **W** using vanilla ALS. Default is None.
        H : numpy array of shape `(K, M)` or None
            An initial value for the `H` factor matrix. If None, initialize
            **H** using vanilla ALS. Default is None.
        n_initializations : int
            Number of re-initializations of the algorithm to perform before
            taking the answer with the lowest reconstruction error. This value
            is ignored and set to 1 if both `W` and `H` are not None. Default
            is 10.
        verbose : bool
            Whether to print the loss at each iteration. Default is False.
"""
if W is not None and H is not None:
n_initializations = 1

best_loss = np.inf
for f in range(n_initializations):
if verbose:
print("\nINITIALIZATION {}".format(f + 1))

new_W, new_H, loss = self._fit(X, W, H, verbose)

if loss <= best_loss:
best_loss = loss
best_W, best_H = deepcopy(new_W), deepcopy(new_H)

self.W, self.H = best_W, best_H
if verbose:
print("\nFINAL LOSS: {}".format(best_loss))

def _fit(self, X, W, H, verbose):
self._init_factor_matrices(X, W, H)

W, H = self.W, self.H
prev_loss = loss = np.inf
for i in range(self.max_iter):
prev_loss = loss
H = self._update_H(X, W, H)
W = self._update_W(X, W, H)
loss = self._loss(X, W @ H)

if verbose:
print("[Iter {}] Loss: {:.4f}".format(i + 1, loss))

if (prev_loss - loss) <= self.tol:
break
return W, H, loss
