# Source code for graspologic.nominate.VNviaSGM

```
import numpy as np
from ..match import GraphMatch as GMP
from sklearn.base import BaseEstimator
import itertools
import warnings
[docs]class VNviaSGM(BaseEstimator):
"""
This class implements Vertex Nomination via Seeded Graph Matching (VNviaSGM) with
the algorithm described in [1].
Rather than providing a 1-1 matching for the vertices of two graphs, as in
:class:`~graspologic.match.GraphMatch`, VNviaSGM ranks the potential matches for a
vertex of interst (VOI) in one to graph to the vertices in another graph, based on
probability of matching.
Parameters
----------
order_voi_subgraph: int, positive (default = 1)
Order used to create induced subgraph on ``A`` about VOI where the max distance
between VOI and other nodes is ``order_voi_subgraph``. This induced subgraph
will be used to determine what seeds are used when the SGM algorithm is called.
If no seeds are in this subgraph about VOI, then a UserWarning is thrown, and
``nomination_list_`` is None.
order_seeds_subgraph: int, positive (default = 1)
Order used to create induced subgraphs on ``A`` and ``B``. These subgraphs
are centered about the seeds that were determined by the subgraph generated
by ``order_voi_subgraph``. These two subgraphs will be passed into the SGM
algorithm.
n_init: int, positive (default = 100)
Number of random initializations of the seeded graph matching algorithm (SGM).
Increasing the number of restarts will make the probabilities returned more
precise.
max_nominations: int (default = None)
Max number of nominations to include in the nomination list. If None is passed,
then all nominations computed will be returned.
graph_match_kws : dict (default = {})
Gives users the option to pass custom arguments to the graph matching
algorithm. Format should be {'arg_name': arg_value, ...}. See
:class:`~graspologic.match.GraphMatch`
Attributes
----------
n_seeds_: int
Number of seeds passed in `seedsA` that occured in the induced subgraph about
VOI
nomination_list_: 2d-array
An array containing vertex nominations in the form nomination
list = [[j, p_val],...] where p_val is the probability that the VOI matches
to node j in graph B (sorted by descending probability)
Notes
-----
VNviaSGM generates an initial induced subgraph about the VOI to determine which
seeds are close enough to be used. If no seeds are close enough, then a warning
is thrown and ``nomination_list_`` is set to None.
All the seeds that are close enough are then used to generate subgraphs in both
``A`` and ``B``. These subgraphs are matched using several random initializations
of the seeded graph matching algorithm (SGM), and a nomination list is returned.
See :class:`~graspologic.match.GraphMatch` for SGM docs
References
----------
.. [1] Patsolic, HG, Park, Y, Lyzinski, V, Priebe, CE. Vertex nomination via seeded
graph matching. Stat Anal Data Min: The ASA Data Sci Journal. 2020; 13: 229–
244. https://doi.org/10.1002/sam.11454
"""
def __init__(
self,
order_voi_subgraph=1,
order_seeds_subgraph=1,
n_init=100,
max_nominations=None,
graph_match_kws={},
):
if isinstance(order_voi_subgraph, int) and order_voi_subgraph > 0:
self.order_voi_subgraph = order_voi_subgraph
else:
msg = '"order_voi_subgraph" must be an integer > 0'
raise ValueError(msg)
if isinstance(order_seeds_subgraph, int) and order_seeds_subgraph > 0:
self.order_seeds_subgraph = order_seeds_subgraph
else:
msg = '"order_seeds_subgraph" must be an integer > 0'
raise ValueError(msg)
if isinstance(n_init, int) and n_init > 0:
self.n_init = n_init
else:
msg = '"n_init" must be an integer > 0'
raise ValueError(msg)
if max_nominations is None:
self.max_nominations = max_nominations
elif isinstance(max_nominations, int) and max_nominations >= 1:
self.max_nominations = max_nominations
else:
msg = '"max_nominations" must be an integer >= 1'
raise ValueError(msg)
# Error checking of these will be handled by GMP
if isinstance(graph_match_kws, dict):
self.graph_match_kws = graph_match_kws
else:
msg = '"graph_match_kws` must be type dict'
raise ValueError(msg)
[docs] def fit(self, A, B, voi, seeds):
"""
Fits the model to two graphs.
Parameters
----------
A: 2d-array, square
Adjacency matrix, the graph where ``voi`` is known
B: 2d-array, square
Adjacency matrix, the graph where ``voi`` is not known
voi: int
Vertex of interest
seeds: list, 2d-array
List of length two, of form `[seedsA, seedsB]`. The elements of `seedsA`
and `seedsB` are vertex indices from ``A`` and ``B``, respectively, which
are known to be matched; that is, vertex `seedsA[i]` is matched to vertex
`seedsB[i]`. Note: `len(seedsA)==len(seedsB)`.
Returns
-------
self: An instance of self
"""
A = np.atleast_2d(A)
B = np.atleast_2d(B)
if not isinstance(A, np.ndarray) or not isinstance(B, np.ndarray):
msg = '"A" and "B" must be type np.ndarray'
raise ValueError(msg)
elif A.ndim != 2 or B.ndim != 2:
msg = '"A" and "B" must be two-dimensional'
raise ValueError(msg)
elif A.shape[0] != A.shape[1] or B.shape[0] != B.shape[1]:
msg = '"A" and "B" must be square'
raise ValueError(msg)
if not isinstance(voi, int):
msg = '"voi" must be an integer'
raise ValueError(msg)
elif voi < 0 or voi >= A.shape[0]:
msg = '"voi" must be in range[0, num_verts_A)'
raise ValueError(msg)
if not (isinstance(seeds, list) or isinstance(seeds, np.ndarray)):
msg = '"seeds" must be a list'
raise ValueError(msg)
if isinstance(seeds, list):
if len(seeds) != 2:
msg = 'seeds must be length two, with first element containing seeds \
of "A" and the second containing seeds of "B"'
raise ValueError(msg)
if not (
isinstance(seeds[0], list) or isinstance(seeds[0], np.ndarray)
) or not (isinstance(seeds[1], list) or isinstance(seeds[1], np.ndarray)):
msg = '"seeds" elements must be lists or arrays'
raise ValueError(msg)
seedsA = np.array(seeds[0])
seedsB = np.array(seeds[1])
else:
seeds = np.atleast_2d(seeds)
if not isinstance(seeds, np.ndarray):
msg = '"seeds" be a list or 2d-array'
raise ValueError(msg)
if seeds.shape[1] != 2:
msg = '"seeds" must have a second dimension of two'
raise ValueError(msg)
seedsA = seeds[:, 0]
seedsB = seeds[:, 1]
if len(seedsA) != len(seedsB):
msg = "Must have the same number of seeds for each adjacency matrix"
raise ValueError(msg)
elif len(seedsA) == 0:
msg = 'len("seeds") must be at least one'
raise ValueError(msg)
elif not len(set(seedsA)) == len(seedsA) or not len(set(seedsB)) == len(seedsB):
msg = '"seeds" column entries must be unique'
raise ValueError(msg)
elif len(seedsA) > A.shape[0]:
msg = '"seeds" cant have more entries than its associated adjacency matrix'
raise ValueError(msg)
elif (seedsA >= A.shape[0]).any() or (seedsB >= B.shape[0]).any():
msg = '"seeds" entries must be less than number of nodes in their graphs'
raise ValueError(msg)
# get vertex reordering for Ax
# in the form (seedsA, voi, rest in order)
nsx1 = np.setdiff1d(np.arange(A.shape[0]), np.append(seedsA, voi))
a_reord = np.append(np.append(seedsA, voi), nsx1)
# get reordering for B in the form (seedsB, rest in numerical order)
nsx2 = np.setdiff1d(np.arange(B.shape[0]), seedsB)
b_reord = np.concatenate((seedsB, nsx2))
# Reorder the two graphs with our new vertices order
A_perm = A[a_reord][:, a_reord]
B_perm = B[b_reord][:, b_reord]
# Record where the new seeds and voi locations are
# in our re-ordered graphs
seeds_reord = np.arange(len(seedsA))
voi_reord = len(seedsA)
# Determine what seeds are within a specified subgraph
# given by `self.order_voi_subgraph`. If there are no
# seeds in this subgraph, print a message and return None
subgraph_A_perm = _get_induced_subgraph_list(
A_perm, self.order_voi_subgraph, voi_reord, mindist=1
)
close_seeds = np.intersect1d(subgraph_A_perm, seeds_reord)
if len(close_seeds) <= 0:
warnings.warn(
'Voi {} was not a member of the induced subgraph A[{}], \
Try increasing "order_voi_subgraph"'.format(
voi, seedsA
)
)
self.n_seeds_ = None
self.nomination_list_ = None
return self
# Generate the two induced subgraphs that will be used by the matching
# algorithm using the seeds that we identified in the previous step.
verts_A = _get_induced_subgraph_list(
A_perm, self.order_seeds_subgraph, close_seeds, mindist=0
)
verts_B = _get_induced_subgraph_list(
B_perm, self.order_seeds_subgraph, close_seeds, mindist=0
)
# Determine the final reordering for the graphs that include only
# the vertices found by the induced subgraphs in the previous step
# For graph A, its of the form (close_seeds, voi, rest in verts_A
# in num order). For graph B its of the form (close_seeds, rest in
# verts_B in num order)
permed_verts = np.append(close_seeds, voi_reord)
ind1 = np.append(
np.append(close_seeds, voi_reord), np.setdiff1d(verts_A, permed_verts)
)
ind2 = np.concatenate((close_seeds, np.setdiff1d(verts_B, close_seeds)))
# Generate adjacency matrices for the ordering found in the prev step
SG_1 = A_perm[ind1][:, ind1]
SG_2 = B_perm[ind2][:, ind2]
# Record the number of seeds used because this may differ from the number
# of seeds passed. See the step where close_seeds was computed for an
# explanation
self.n_seeds_ = len(close_seeds)
seeds_fin = np.arange(self.n_seeds_)
# Call the SGM algorithm using user set parameters and generate a prob
# vector for the voi.
sgm = GMP(
**self.graph_match_kws,
)
prob_vector = np.zeros((max(SG_1.shape[0], SG_2.shape[0]) - self.n_seeds_))
for ii in range(self.n_init):
sgm.fit(SG_1, SG_2, seeds_A=seeds_fin, seeds_B=seeds_fin)
prob_vector[sgm.perm_inds_[self.n_seeds_] - self.n_seeds_] += 1.0
prob_vector /= self.n_init
# Get the original vertices names in the B graph to make the nom list
b_inds = b_reord[ind2]
# Generate the nomination list. Note, the probability matrix does not
# include the seeds, so we must remove them from b_inds. Return a list
# sorted so it returns the vertex with the highest probability first.
nomination_list_ = np.dstack((b_inds[self.n_seeds_ :], prob_vector))[0]
nomination_list_ = nomination_list_[nomination_list_[:, 1].argsort()][::-1]
if self.max_nominations is not None and self.max_nominations < len(
nomination_list_
):
nomination_list_ = nomination_list_[0 : self.max_nominations]
self.nomination_list_ = nomination_list_
return self
[docs] def fit_predict(self, A, B, voi, seeds):
"""
Fits model to two adjacency matrices and returns nomination list
Parameters
----------
A: 2d-array, square
Adjacency matrix, the graph where ``voi`` is known
B: 2d-array, square
Adjacency matrix, the graph where ``voi`` is not known
voi: int
Vertex of interest
seeds: list, 2d-array
List of length two, of form `[seedsA, seedsB]`. The elements of `seedsA`
and `seedsB` are vertex indices from ``A`` and ``B``, respectively, which
are known to be matched; that is, vertex `seedsA[i]` is matched to
vertex `seedsB[i]`. Note: `len(seedsA)==len(seedsB)`.
Returns
-------
nomination_list_ : 2d-array
The nomination list.
"""
self.fit(A, B, voi, seeds)
return self.nomination_list_
def _get_induced_subgraph(graph_adj_matrix, order, node, mindist=1):
"""
Generates a vertex list for the induced subgraph about a node with
max and min distance parameters.
Parameters
----------
graph_adj_matrix: 2-d array
Adjacency matrix of interest.
order: int
Distance to create the induced subgraph with. Max distance away from the node
to include in subgraph.
node: int
The vertex to center the induced subgraph about.
mindist: int (default = 1)
The minimum distance away from the node to include in the subgraph.
Returns
-------
induced_subgraph : list
The list containing all the vertices in the induced subgraph.
"""
# Note all nodes are zero based in this implementation, i.e the first node is 0
dists = [[node]]
dists_conglom = [node]
for ii in range(1, order + 1):
clst = []
for nn in dists[-1]:
clst.extend(list(np.where(graph_adj_matrix[nn] >= 1)[0]))
clst = np.unique(clst)
cn_proc = np.setdiff1d(clst, dists_conglom)
dists.append(cn_proc)
dists_conglom = np.unique(np.append(dists_conglom, cn_proc))
ress = list(itertools.chain(*dists[mindist : order + 1]))
return np.unique(ress)
def _get_induced_subgraph_list(graph_adj_matrix, order, node, mindist=1):
"""
Generates a vertex list for the induced subgraph about a node with
max and min distance parameters.
Parameters
----------
graph_adj_matrix: 2-d array
Adjacency matrix of interest.
order: int
Distance to create the induce subgraph with. Max distance away from the node
to include in subgraph.
node: int or list
The list of vertices to center the induced subgraph about.
mindist: int (default = 1)
The minimum distance away from the node to include in the subgraph.
Returns
-------
induced_subgraph : list
The list containing all the vertices in the induced subgraph.
"""
if isinstance(node, list) or isinstance(node, np.ndarray):
total_res = []
for nn in node:
ego_res = _get_induced_subgraph(
graph_adj_matrix, order, nn, mindist=mindist
)
total_res.extend(ego_res)
return np.unique(total_res)
else:
return _get_induced_subgraph(graph_adj_matrix, order, node)
```