Source code for graspologic.embed.svd

# Copyright (c) Microsoft Corporation and contributors.
# Licensed under the MIT License.

import numpy as np
import scipy
import sklearn
from scipy.stats import norm
from scipy.sparse import isspmatrix_csr

def _compute_likelihood(arr):
    Computes the log likelihoods based on normal distribution given
    a 1d-array of sorted values. If the input has no variance,
    the likelihood will be nan.
    n_elements = len(arr)
    likelihoods = np.zeros(n_elements)

    for idx in range(1, n_elements + 1):
        # split into two samples
        s1 = arr[:idx]
        s2 = arr[idx:]

        # deal with when input only has 2 elements
        if (s1.size == 1) & (s2.size == 1):
            likelihoods[idx - 1] = -np.inf

        # compute means
        mu1 = np.mean(s1)
        if s2.size != 0:
            mu2 = np.mean(s2)
            # Prevent numpy warning for taking mean of empty array
            mu2 = -np.inf

        # compute pooled variance
        variance = ((np.sum((s1 - mu1) ** 2) + np.sum((s2 - mu2) ** 2))) / (
            n_elements - 1 - (idx < n_elements)
        std = np.sqrt(variance)

        # compute log likelihoods
        likelihoods[idx - 1] = np.sum(norm.logpdf(s1, loc=mu1, scale=std)) + np.sum(
            norm.logpdf(s2, loc=mu2, scale=std)

    return likelihoods

[docs]def select_dimension( X, n_components=None, n_elbows=2, threshold=None, return_likelihoods=False ): """ Generates profile likelihood from array based on Zhu and Godsie method. Elbows correspond to the optimal embedding dimension. Parameters ---------- X : 1d or 2d array-like Input array generate profile likelihoods for. If 1d-array, it should be sorted in decreasing order. If 2d-array, shape should be (n_samples, n_features). n_components : int, optional, default: None. Number of components to embed. If None, ``n_components = floor(log2(min(n_samples, n_features)))``. Ignored if ``X`` is 1d-array. n_elbows : int, optional, default: 2. Number of likelihood elbows to return. Must be ``> 1``. threshold : float, int, optional, default: None If given, only consider the singular values that are ``> threshold``. Must be ``>= 0``. return_likelihoods : bool, optional, default: False If True, returns the all likelihoods associated with each elbow. Returns ------- elbows : list Elbows indicate subsequent optimal embedding dimensions. Number of elbows may be less than ``n_elbows`` if there are not enough singular values. sing_vals : list The singular values associated with each elbow. likelihoods : list of array-like Array of likelihoods of the corresponding to each elbow. Only returned if ``return_likelihoods`` is True. References ---------- .. [1] Zhu, M. and Ghodsi, A. (2006). Automatic dimensionality selection from the scree plot via the use of profile likelihood. Computational Statistics & Data Analysis, 51(2), pp.918-930. """ # Handle input data if not isinstance(X, np.ndarray) and not isspmatrix_csr(X): msg = "X must be a numpy array or scipy.sparse.csr_matrix, not {}.".format( type(X) ) raise ValueError(msg) if X.ndim > 2: msg = "X must be a 1d or 2d-array, not {}d array.".format(X.ndim) raise ValueError(msg) elif np.min(X.shape) <= 1: msg = "X must have more than 1 samples or 1 features." raise ValueError(msg) # Handle n_elbows if not isinstance(n_elbows, int): msg = "n_elbows must be an integer, not {}.".format(type(n_elbows)) raise ValueError(msg) elif n_elbows < 1: msg = "number of elbows should be an integer > 1, not {}.".format(n_elbows) raise ValueError(msg) # Handle threshold if threshold is not None: if not isinstance(threshold, (int, float)): msg = "threshold must be an integer or a float, not {}.".format( type(threshold) ) raise ValueError(msg) elif threshold < 0: msg = "threshold must be >= 0, not {}.".format(threshold) raise ValueError(msg) # Handle n_components if n_components is None: # per recommendation by Zhu & Godsie k = int(np.ceil(np.log2(np.min(X.shape)))) elif not isinstance(n_components, int): msg = "n_components must be an integer, not {}.".format(type(n_components)) raise ValueError(msg) else: k = n_components # Check to see if svd is needed if X.ndim == 1: D = np.sort(X)[::-1] elif X.ndim == 2: # Singular values in decreasing order D = scipy.sparse.linalg.svds(A=X, k=k, return_singular_vectors=False) D = np.sort(D)[::-1] # U, D, V = sklearn.utils.extmath.randomized_svd() if threshold is not None: D = D[D > threshold] if len(D) == 0: msg = "No values greater than threshold {}." raise IndexError(msg.format(threshold)) idx = 0 elbows = [] values = [] likelihoods = [] for _ in range(n_elbows): arr = D[idx:] if arr.size <= 1: # Cant compute likelihoods with 1 numbers break lq = _compute_likelihood(arr) idx += np.argmax(lq) + 1 elbows.append(idx) values.append(D[idx - 1]) likelihoods.append(lq) if return_likelihoods: return elbows, values, likelihoods else: return elbows, values
[docs]def selectSVD(X, n_components=None, n_elbows=2, algorithm="randomized", n_iter=5): r""" Dimensionality reduction using SVD. Performs linear dimensionality reduction by using either full singular value decomposition (SVD) or truncated SVD. Full SVD is performed using SciPy's wrapper for ARPACK, while truncated SVD is performed using either SciPy's wrapper for LAPACK or Sklearn's implementation of randomized SVD. It also performs optimal dimensionality selection using Zhu & Godsie algorithm if number of target dimension is not specified. Parameters ---------- X : array-like, shape (n_samples, n_features) The data to perform svd on. n_components : int or None, default = None Desired dimensionality of output data. If "full", ``n_components`` must be ``<= min(X.shape)``. Otherwise, ``n_components`` must be ``< min(X.shape)``. If None, then optimal dimensions will be chosen by :func:`~graspologic.embed.select_dimension` using ``n_elbows`` argument. n_elbows : int, optional, default: 2 If ``n_components`` is None, then compute the optimal embedding dimension using :func:`~graspologic.embed.select_dimension`. Otherwise, ignored. algorithm : {'randomized' (default), 'full', 'truncated'}, optional SVD solver to use: - 'randomized' Computes randomized svd using :func:`sklearn.utils.extmath.randomized_svd` - 'full' Computes full svd using :func:`scipy.linalg.svd` Does not support ``graph`` input of type scipy.sparse.csr_matrix - 'truncated' Computes truncated svd using :func:`scipy.sparse.linalg.svds` n_iter : int, optional (default = 5) Number of iterations for randomized SVD solver. Not used by 'full' or 'truncated'. The default is larger than the default in randomized_svd to handle sparse matrices that may have large slowly decaying spectrum. Returns ------- U : array-like, shape (n_samples, n_components) Left singular vectors corresponding to singular values. D : array-like, shape (n_components) Singular values in decreasing order, as a 1d array. V : array-like, shape (n_components, n_samples) Right singular vectors corresponding to singular values. References ---------- .. [1] Zhu, M. and Ghodsi, A. (2006). Automatic dimensionality selection from the scree plot via the use of profile likelihood. Computational Statistics & Data Analysis, 51(2), pp.918-930. """ # Added in order to pass check estimator, must include words "one sample" if X.shape[0] == 1: msg = "Input data has only one sample (node)" raise ValueError(msg) # Deal with algorithms if algorithm not in ["full", "truncated", "randomized"]: msg = "algorithm must be one of {full, truncated, randomized}." raise ValueError(msg) if algorithm == "full" and isspmatrix_csr(X): msg = "'full' agorithm does not support scipy.sparse.csr_matrix inputs." raise TypeError(msg) if n_components is None: elbows, _ = select_dimension(X, n_elbows=n_elbows, threshold=None) n_components = elbows[-1] # Check if (algorithm == "full") & (n_components > min(X.shape)): msg = "n_components must be <= min(X.shape)." raise ValueError(msg) elif algorithm == "full": U, D, V = scipy.linalg.svd(X) U = U[:, :n_components] D = D[:n_components] V = V[:n_components, :] if (algorithm in ["truncated", "randomized"]) & (n_components >= min(X.shape)): msg = "n_components must be strictly < min(X.shape)." raise ValueError(msg) elif algorithm == "truncated": U, D, V = scipy.sparse.linalg.svds(X, k=n_components) idx = np.argsort(D)[::-1] # sort in decreasing order D = D[idx] U = U[:, idx] V = V[idx, :] elif algorithm == "randomized": U, D, V = sklearn.utils.extmath.randomized_svd(X, n_components, n_iter=n_iter) return U, D, V