Source code for graspologic.embed.ase

# Copyright (c) Microsoft Corporation and contributors.
# Licensed under the MIT License.

import warnings
import numpy as np
from sklearn.utils.validation import check_is_fitted
import networkx as nx

from .base import BaseSpectralEmbed
from ..utils import (
    import_graph,
    is_fully_connected,
    augment_diagonal,
    pass_to_ranks,
    is_unweighted,
)


[docs]class AdjacencySpectralEmbed(BaseSpectralEmbed): r""" Class for computing the adjacency spectral embedding of a graph. The adjacency spectral embedding (ASE) is a k-dimensional Euclidean representation of the graph based on its adjacency matrix. It relies on an SVD to reduce the dimensionality to the specified k, or if k is unspecified, can find a number of dimensions automatically (see :class:`~graspologic.embed.selectSVD`). Read more in the :ref:`tutorials <embed_tutorials>` Parameters ---------- n_components : int or None, default = None Desired dimensionality of output data. If "full", ``n_components`` must be ``<= min(X.shape)``. Otherwise, ``n_components`` must be ``< min(X.shape)``. If None, then optimal dimensions will be chosen by :func:`~graspologic.embed.select_dimension` using ``n_elbows`` argument. n_elbows : int, optional, default: 2 If ``n_components`` is None, then compute the optimal embedding dimension using :func:`~graspologic.embed.select_dimension`. Otherwise, ignored. algorithm : {'randomized' (default), 'full', 'truncated'}, optional SVD solver to use: - 'randomized' Computes randomized svd using :func:`sklearn.utils.extmath.randomized_svd` - 'full' Computes full svd using :func:`scipy.linalg.svd` Does not support ``graph`` input of type scipy.sparse.csr_matrix - 'truncated' Computes truncated svd using :func:`scipy.sparse.linalg.svds` n_iter : int, optional (default = 5) Number of iterations for randomized SVD solver. Not used by 'full' or 'truncated'. The default is larger than the default in randomized_svd to handle sparse matrices that may have large slowly decaying spectrum. check_lcc : bool , optional (default = True) Whether to check if input graph is connected. May result in non-optimal results if the graph is unconnected. If True and input is unconnected, a UserWarning is thrown. Not checking for connectedness may result in faster computation. diag_aug : bool, optional (default = True) Whether to replace the main diagonal of the adjacency matrix with a vector corresponding to the degree (or sum of edge weights for a weighted network) before embedding. Empirically, this produces latent position estimates closer to the ground truth. concat : bool, optional (default False) If graph is directed, whether to concatenate left and right (out and in) latent positions along axis 1. Attributes ---------- n_features_in_: int Number of features passed to the :func:`~graspologic.embed.AdjacencySpectralEmbed.fit` method. latent_left_ : array, shape (n_samples, n_components) Estimated left latent positions of the graph. latent_right_ : array, shape (n_samples, n_components), or None Only computed when the graph is directed, or adjacency matrix is assymetric. Estimated right latent positions of the graph. Otherwise, None. singular_values_ : array, shape (n_components) Singular values associated with the latent position matrices. See Also -------- graspologic.embed.selectSVD graspologic.embed.select_dimension Notes ----- The singular value decomposition: .. math:: A = U \Sigma V^T is used to find an orthonormal basis for a matrix, which in our case is the adjacency matrix of the graph. These basis vectors (in the matrices U or V) are ordered according to the amount of variance they explain in the original matrix. By selecting a subset of these basis vectors (through our choice of dimensionality reduction) we can find a lower dimensional space in which to represent the graph. References ---------- .. [1] Sussman, D.L., Tang, M., Fishkind, D.E., Priebe, C.E. "A Consistent Adjacency Spectral Embedding for Stochastic Blockmodel Graphs," Journal of the American Statistical Association, Vol. 107(499), 2012 .. [2] Levin, K., Roosta-Khorasani, F., Mahoney, M. W., & Priebe, C. E. (2018). Out-of-sample extension of graph adjacency spectral embedding. PMLR: Proceedings of Machine Learning Research, 80, 2975-2984. """ def __init__( self, n_components=None, n_elbows=2, algorithm="randomized", n_iter=5, check_lcc=True, diag_aug=True, concat=False, ): super().__init__( n_components=n_components, n_elbows=n_elbows, algorithm=algorithm, n_iter=n_iter, check_lcc=check_lcc, concat=concat, ) if not isinstance(diag_aug, bool): raise TypeError("`diag_aug` must be of type bool") self.diag_aug = diag_aug self.is_fitted_ = False
[docs] def fit(self, graph, y=None): """ Fit ASE model to input graph Parameters ---------- graph : array-like, scipy.sparse.csr_matrix, or networkx.Graph Input graph to embed. y: Ignored Returns ------- self : object Returns an instance of self. """ A = import_graph(graph) if self.check_lcc: if not is_fully_connected(A): msg = ( "Input graph is not fully connected. Results may not" + "be optimal. You can compute the largest connected component by" + "using ``graspologic.utils.get_lcc``." ) warnings.warn(msg, UserWarning) if self.diag_aug: A = augment_diagonal(A) self.n_features_in_ = A.shape[0] self._reduce_dim(A) # for out-of-sample inv_eigs = np.diag(1 / self.singular_values_) self._pinv_left = self.latent_left_ @ inv_eigs if self.latent_right_ is not None: self._pinv_right = self.latent_right_ @ inv_eigs self.is_fitted_ = True return self
[docs] def transform(self, X): """ Obtain latent positions from an adjacency matrix or matrix of out-of-sample vertices. For more details on transforming out-of-sample vertices, see the :ref:`tutorials <embed_tutorials>`. For mathematical background, see [2]. Parameters ---------- X : array-like or tuple, original shape or (n_oos_vertices, n_vertices). The original fitted matrix ("graph" in fit) or new out-of-sample data. If ``X`` is the original fitted matrix, returns a matrix close to ``self.fit_transform(X)``. If ``X`` is an out-of-sample matrix, n_oos_vertices is the number of new vertices, and n_vertices is the number of vertices in the original graph. If tuple, graph is directed and ``X[0]`` contains edges from out-of-sample vertices to in-sample vertices. Returns ------- array_like or tuple, shape (n_oos_vertices, n_components) or (n_vertices, n_components). Array of latent positions. Transforms the fitted matrix if it was passed in. If ``X`` is an array or tuple containing adjacency vectors corresponding to new nodes, returns the estimated latent positions for the new out-of-sample adjacency vectors. If undirected, returns array. If directed, returns ``(X_out, X_in)``, where ``X_out`` contains latent positions corresponding to nodes with edges from out-of-sample vertices to in-sample vertices. Notes ----- If the matrix was diagonally augmented (e.g., ``self.diag_aug`` was True), ``fit`` followed by ``transform`` will produce a slightly different matrix than ``fit_transform``. To get the original embedding, using ``fit_transform`` is recommended. In the directed case, if A is the original in-sample adjacency matrix, the tuple (A.T, A) will need to be passed to ``transform`` if you do not wish to use ``fit_transform``. """ # checks check_is_fitted(self, "is_fitted_") if isinstance(X, nx.classes.graph.Graph): X = import_graph(X) directed = self.latent_right_ is not None # correct types? if directed and not isinstance(X, tuple): if X.shape[0] == X.shape[1]: # in case original matrix was passed msg = """A square matrix A was passed to ``transform`` in the directed case. If this was the original in-sample matrix, either use ``fit_transform`` or pass a tuple (A.T, A). If this was an out-of-sample matrix, directed graphs require a tuple (X_out, X_in).""" raise TypeError(msg) else: msg = "Directed graphs require a tuple (X_out, X_in) for out-of-sample transforms." raise TypeError(msg) if not directed and not isinstance(X, np.ndarray): raise TypeError("Undirected graphs require array input") # correct shape in y? latent_rows = self.latent_left_.shape[0] _X = X[0] if directed else X X_cols = _X.shape[-1] if _X.ndim > 2: raise ValueError("out-of-sample vertex must be 1d or 2d") if latent_rows != X_cols: msg = "out-of-sample vertex must be shape (n_oos_vertices, n_vertices)" raise ValueError(msg) # workhorse code if not directed: return X @ self._pinv_left elif directed: return X[1] @ self._pinv_right, X[0] @ self._pinv_left