# Copyright (c) Microsoft Corporation and contributors.
# Licensed under the MIT License.
from typing import Union, Tuple
from ..embed import BaseSpectralEmbed
from ..embed import AdjacencySpectralEmbed, LaplacianSpectralEmbed
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.base import BaseEstimator
[docs]class SpectralVertexNomination(BaseEstimator):
"""
Class for spectral vertex nomination on a single graph.
Given a graph :math:`G=(V,E)` and a subset of :math:`V` called :math:`S`
(the "seed"), Single Graph Vertex Nomination is the problem of ranking all :math:`V`
in order of relation to members of :math:`S`. Spectral Vertex Nomination solves
this problem by embedding :math:`G` into a low dimensional euclidean space
(:ref:`tutorials <embed_tutorials>`), and then generating a nomination list by some
distance based algorithm. In the simple unattributed case, for each seed vertex
:math:`u`, the other vertices are ranked in order of euclidean distance from
:math:`u`.
Parameters
----------
input_graph: bool, default = True
Flag whether to expect two full graphs, or the embeddings.
- True
.fit and .fit_predict() expect graphs as adjacency matrix, provided as
ndarray of shape (n, n). They will be embedded using the specified
``embedder``.
- False
.fit() and .fit_predict() expect an embedding of the graph, i.e. a ndarray
of size (n, d).
embedder: str or BaseEmbed, default = 'ASE'
May provide either a embed object or a string indicating which embedding method
to use, which may be either:
"ASE" for :py:class:`~graspologic.embed.AdjacencySpectralEmbed` or
"LSE" for :py:class:`~graspologic.embed.LaplacianSpectralEmbed`.
n_neighbors: int, default=None
The number of vertices to nominate for each seed.
metric : str, default = 'euclidean'
Distance metric to use when finding the nearest neighbors, all sklearn metrics
available.
metric_params : dict, default = None
Arguments for the sklearn `DistanceMetric` specified via `metric` parameter.
Attributes
----------
nearest_neighbors_ : sklearn.neighbors.NearestNeighbors
A fit sklearn `NearestNeighbors` classifier used to find closest vertices to
each seed.
References
----------
.. [1] Fishkind, D. E.; Lyzinski, V.; Pao, H.; Chen, L.; Priebe, C. E. Vertex
nomination schemes for membership prediction. Ann. Appl. Stat. 9 2015.
https://projecteuclid.org/euclid.aoas/1446488749
.. [2] Jordan Yoder, Li Chen, Henry Pao, Eric Bridgeford, Keith Levin,
Donniell E. Fishkind, Carey Priebe, Vince Lyzinski, Vertex nomination: The
canonical sampling and the extended spectral nomination schemes, Computational
Statistics & Data Analysis, Volume 145, 2020.
http://www.sciencedirect.com/science/article/pii/S0167947320300074
"""
def __init__(
self,
input_graph: bool = True,
embedder: Union[str, BaseSpectralEmbed] = "ase",
n_neighbors=None,
metric: str = "euclidean",
metric_params: dict = None,
):
super().__init__()
self.embedder = embedder
self.input_graph = input_graph
self.n_neighbors = n_neighbors
self.metric = metric
self.metric_params = metric_params
self._check_params()
def _check_x(self, X: np.ndarray):
# check X
if not isinstance(X, np.ndarray):
raise TypeError("X must be of type np.ndarray.")
elif not np.issubdtype(X.dtype, np.number):
raise TypeError("Adjacency matrix or embedding should have numeric type")
elif np.ndim(X) != 2:
raise IndexError("Adjacency matrix or embedding must have dim 2")
elif not self.input_graph:
# embedding was provided
if X.shape[1] > X.shape[0]:
raise IndexError("Dim 1 of an embedding should be smaller than dim 0.")
if not np.issubdtype(X.dtype, np.float):
raise TypeError("Embedding should have type float")
elif not np.issubdtype(X.dtype, np.int):
raise TypeError("Adjacency matrix should have type int")
elif X.shape[0] != X.shape[1]:
raise IndexError("Adjacency Matrix should be square.")
def _check_y(self, y: np.ndarray):
# check y
if not isinstance(y, np.ndarray):
raise TypeError("y must be of type np.ndarray")
elif not np.issubdtype(y.dtype, np.integer):
raise TypeError("y must have dtype int")
elif np.ndim(y) > 2 or (y.ndim == 2 and y.shape[1] > 1):
raise IndexError("y must have shape (n) or (n, 1).")
elif y.shape[0] > self.embedding_.shape[0]:
raise ValueError(
"the number of seeds must be less than the number of vertices"
)
def _check_params(self):
if self.n_neighbors is not None and type(self.n_neighbors) is not int:
raise TypeError("k must be an integer")
elif self.n_neighbors is not None and self.n_neighbors <= 0:
raise ValueError("k must be greater than 0")
if type(self.metric) is not str:
raise TypeError("metric must be a string")
if self.metric_params is not None and type(self.metric_params) is not dict:
raise TypeError("metric_params must be a dictionary")
if type(self.input_graph) is not bool:
raise TypeError("input_graph_ must be of type bool.")
if self.input_graph:
if not isinstance(self.embedder, BaseSpectralEmbed) and not isinstance(
self.embedder, str
):
raise TypeError(
"embedder must be either of type str or BaseSpectralEmbed"
)
def _embed(self, X: np.ndarray):
# Embed graph if embedding not provided
if self.input_graph:
if isinstance(self.embedder, BaseSpectralEmbed):
embedder = self.embedder
elif not isinstance(self.embedder, str):
raise TypeError("embedder must be type str or BaseSpectralEmbed")
elif self.embedder.lower() == "ase":
embedder = AdjacencySpectralEmbed()
elif self.embedder.lower() == "lse":
embedder = LaplacianSpectralEmbed()
else:
raise ValueError(
"Requested embedding method does not exist, if str is passed must "
"be either 'ASE' or 'LSE'."
)
self.embedding_ = embedder.fit_transform(X)
else:
self.embedding_ = X
[docs] def fit(
self,
X: np.ndarray,
y=None,
):
"""
Constructs the embedding if not provided, then calculates the pairwise distance
from each seed to each vertex in graph.
Parameters
----------
X : np.ndarray
- If `input_graph` is True
Expects a graph as an adjacency matrix, i.e. an ndarray of shape (n, n).
Will be embedded using the specified embedder.
- If `input_graph` is False
Expects an embedding of the graph, i.e. a ndarray of size (n, d).
y : NoneType
Included by sklearn convention.
Returns
-------
self : object
Returns an instance of self.
"""
# ensure y has correct shape. If unattributed (1d)
# add unique attribute to each seed vertex.
self._check_x(X)
self._embed(X)
if self.n_neighbors is None:
self.n_neighbors = X.shape[0]
self.nearest_neighbors_ = NearestNeighbors(
n_neighbors=self.n_neighbors,
metric=self.metric,
metric_params=self.metric_params,
)
self.nearest_neighbors_.fit(self.embedding_)
return self
[docs] def predict(self, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""
Nominates vertices for each seed vertex. Methodology is distance based ranking.
Parameters
----------
y : np.ndarray
The indices of the seed vertices. Should be a dim 1 array with length less
than :math:`|V|`.
Returns
-------
Nomination List : np.ndarray
Shape is ``(number_vertices, number_vertices_in_seed)`` . Each
column is a seed vertex, and the rows of each
column are a list of vertex indexes from the original adjacency
matrix in order degree of match.
Distance Matrix : np.ndarray
The matrix of distances associated with each element of the
nomination list.
"""
self._check_y(y)
y = y.reshape(-1)
y_vec = self.embedding_[y.astype(np.int)]
if not hasattr(self, "nearest_neighbors_"):
raise ValueError("Fit must be called before predict.")
distance_matrix, nomination_list = self.nearest_neighbors_.kneighbors(y_vec)
# transpose for consistency with literature
return nomination_list.T.astype(np.int), distance_matrix.T
[docs] def fit_predict(
self,
X: np.ndarray,
y: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
"""
Calls this class' fit and then predict methods.
Parameters
----------
X : np.ndarray
- If `input_graph` is True
Expects a graph as an adjacency matrix, i.e. an ndarray of shape (n, n).
Will be embedded using the specified embedder.
- If `input_graph` is False
Expects an embedding of the graph, i.e. a ndarray of size (n, d).
y : np.ndarray.
List of unattributed seed vertex indices.
Returns
-------
Nomination List : np.ndarray
Shape is ``(number_vertices, number_vertices_in_seed)`` . Each
column is a seed vertex, and the rows of each
column are a list of vertex indexes from the original adjacency
matrix in order degree of match.
Distance Matrix : np.ndarray
The matrix of distances associated with each element of the
nomination list.
"""
self.fit(X)
return self.predict(y)