# Source code for graspologic.preprocessing.graph_cuts

```
# Copyright (c) Microsoft Corporation and contributors.
# Licensed under the MIT License.
import logging
import random
from typing import Any, Callable, List, NamedTuple, Optional, Tuple, Union
import networkx as nx
import numpy as np
LARGER_THAN_INCLUSIVE = "larger_than_inclusive"
LARGER_THAN_EXCLUSIVE = "larger_than_exclusive"
SMALLER_THAN_INCLUSIVE = "smaller_than_inclusive"
SMALLER_THAN_EXCLUSIVE = "smaller_than_exclusive"
def _filter_function_for_make_cuts(
cut_threshold: Union[int, float],
cut_process: str,
) -> Callable[[Tuple[Any, Union[int, float]]], bool]:
filter_functions = {
LARGER_THAN_EXCLUSIVE: lambda x: x[1] > cut_threshold,
LARGER_THAN_INCLUSIVE: lambda x: x[1] >= cut_threshold,
SMALLER_THAN_EXCLUSIVE: lambda x: x[1] < cut_threshold,
SMALLER_THAN_INCLUSIVE: lambda x: x[1] <= cut_threshold,
}
if cut_process not in filter_functions:
raise ValueError(f"Provided cut_process '{cut_process}' is not a valid value")
return filter_functions[cut_process]
[docs]class DefinedHistogram(NamedTuple):
"""
Contains the histogram and the edges of the bins in the histogram.
The bin_edges will have a length 1 greater than the histogram, as it defines the minimal and maximal edges as well
as each edge in between.
"""
histogram: np.ndarray
bin_edges: np.ndarray
[docs]def histogram_edge_weight(
graph: Union[nx.Graph, nx.DiGraph],
bin_directive: Union[int, List[Union[float, int]], np.ndarray, str] = 10,
weight_attribute: str = "weight",
) -> DefinedHistogram:
"""
Generates a histogram of the edge weights of the provided graph. Histogram function is fundamentally proxied
through to numpy's `histogram` function, and bin selection follows :func:`numpy.histogram` processes.
Parameters
----------
graph : nx.Graph
The graph. No changes will be made to it.
bin_directive : Union[int, List[Union[float, int]], numpy.ndarray, str]
Is passed directly through to numpy's "histogram" (and thus, "histogram_bin_edges") functions.
See: :func:`numpy.histogram_bin_edges`
In short: if an int is provided, we use ``bin_directive`` number of equal range bins.
If a sequence is provided, these bin edges will be used and can be sized to whatever size you prefer
Note that the :class:`numpy.ndarray` should be ndim=1 and the values should be float or int.
weight_attribute : str
The weight attribute name in the data dictionary. Default is `weight`.
Returns
-------
DefinedHistogram
A named tuple that contains the histogram and the bin_edges used in the histogram
Notes
-----
Edges without a `weight_attribute` field will be excluded from this histogram. Enable logging to view any
messages about edges without weights.
"""
logger = logging.getLogger(__name__)
edge_weights: List[Union[int, float, None]] = [
weight for _, _, weight in graph.edges(data=weight_attribute)
]
none_weights: List[None] = [weight for weight in edge_weights if weight is None]
actual_weights: List[Union[int, float]] = [
weight for weight in edge_weights if weight is not None
]
if len(none_weights) != 0:
logger.warning(
f"Graph contains {len(none_weights)} edges with no {weight_attribute}."
+ f" Histogram excludes these values."
)
histogram, bin_edges = np.histogram(actual_weights, bin_directive)
return DefinedHistogram(histogram=histogram, bin_edges=bin_edges)
[docs]def cut_edges_by_weight(
graph: Union[nx.Graph, nx.DiGraph],
cut_threshold: Union[int, float],
cut_process: str,
weight_attribute: str = "weight",
prune_isolates: bool = False,
) -> Union[nx.Graph, nx.DiGraph]:
"""
Thresholds edges (removing them from the graph and returning a copy) by weight.
Parameters
----------
graph : Union[nx.Graph, nx.DiGraph]
The graph that will be copied and pruned.
cut_threshold : Union[int, float]
The threshold for making cuts based on weight.
cut_process : str
Describes how we should make the cut; cut all edges larger or smaller than the cut_threshold, and whether
exclusive or inclusive. Allowed values are
- ``larger_than_inclusive``
- ``larger_than_exclusive``
- ``smaller_than_inclusive``
- ``smaller_than_exclusive``
weight_attribute : str
The weight attribute name in the edge's data dictionary. Default is `weight`.
prune_isolates : bool
If true, remove any vertex that no longer has an edge. Note that this only prunes vertices which have edges to
be pruned; any isolate vertex prior to any edge cut will be retained.
Returns
-------
Union[nx.Graph, nx.DiGraph]
Pruned copy of the same type of graph provided
Notes
-----
Edges without a `weight_attribute` field will be excluded from these cuts. Enable logging to view any messages
about edges without weights.
"""
filter_by = _filter_function_for_make_cuts(cut_threshold, cut_process)
if not isinstance(cut_threshold, int) and not isinstance(cut_threshold, float):
raise TypeError(
f"cut_threshold must be of type int or float; you provided: {type(cut_threshold)}"
)
logger = logging.getLogger(__name__)
graph_copy = graph.copy()
edge_weights: List[Tuple[Tuple[Any, Any], Union[int, float, None]]] = [
((source, target), weight)
for source, target, weight in graph.edges(data=weight_attribute)
]
none_weights: List[Tuple[Tuple[Any, Any], None]] = [
(edge, weight) for edge, weight in edge_weights if weight is None
]
actual_weights: List[Tuple[Tuple[Any, Any], Union[int, float]]] = [
(edge, weight) for edge, weight in edge_weights if weight is not None
]
if len(none_weights) != 0:
logger.warning(
f"Graph contains {len(none_weights)} edges with no {weight_attribute}."
+ f"Ignoring these when cutting by weight"
)
edges_to_cut = [x for x in actual_weights if filter_by(x)]
for edge, weight in edges_to_cut:
source, target = edge
if (
source in graph_copy
and target in graph_copy
and target in graph_copy[source]
):
graph_copy.remove_edge(source, target)
if prune_isolates:
if len(graph_copy[source]) == 0:
graph_copy.remove_node(source)
if len(graph_copy[target]) == 0:
graph_copy.remove_node(target)
return graph_copy
[docs]def histogram_degree_centrality(
graph: Union[nx.Graph, nx.DiGraph],
bin_directive: Union[int, List[Union[float, int]], np.ndarray, str] = 10,
) -> DefinedHistogram:
"""
Generates a histogram of the vertex degree centrality of the provided graph. Histogram function is fundamentally
proxied through to numpy's `histogram` function, and bin selection follows :func:`numpy.histogram` processes.
Parameters
----------
graph : Union[nx.Graph, nx.DiGraph]
The graph. No changes will be made to it.
bin_directive : Union[int, List[Union[float, int]], numpy.ndarray, str]
Is passed directly through to numpy's "histogram" (and thus, "histogram_bin_edges") functions.
See: :func:`numpy.histogram_bin_edges`
In short: if an int is provided, we use ``bin_directive`` number of equal range bins.
If a sequence is provided, these bin edges will be used and can be sized to whatever size you prefer
Note that the :class:`numpy.ndarray` should be ndim=1 and the values should be float or int.
Returns
-------
DefinedHistogram
A named tuple that contains the histogram and the bin_edges used in the histogram
"""
degree_centrality_dict = nx.degree_centrality(graph)
histogram, bin_edges = np.histogram(
list(degree_centrality_dict.values()), bin_directive
)
return DefinedHistogram(histogram=histogram, bin_edges=bin_edges)
[docs]def cut_vertices_by_degree_centrality(
graph: Union[nx.Graph, nx.DiGraph],
cut_threshold: Union[int, float],
cut_process: str,
) -> Union[nx.Graph, nx.DiGraph]:
"""
Given a graph and a cut_threshold and a cut_process, return a copy of the graph with the vertices outside of the
cut_threshold.
Parameters
----------
graph : Union[nx.Graph, nx.DiGraph]
The graph that will be copied and pruned.
cut_threshold : Union[int, float]
The threshold for making cuts based on weight.
cut_process : str
Describes how we should make the cut; cut all edges larger or smaller than the cut_threshold, and whether
exclusive or inclusive. Allowed values are
- ``larger_than_inclusive``
- ``larger_than_exclusive``
- ``smaller_than_inclusive``
- ``smaller_than_exclusive``
Returns
-------
Union[nx.Graph, nx.DiGraph]
Pruned copy of the same type of graph provided
"""
graph_copy = graph.copy()
degree_centrality_dict = nx.degree_centrality(graph_copy)
filter_by = _filter_function_for_make_cuts(cut_threshold, cut_process)
vertices_to_cut = list(filter(filter_by, degree_centrality_dict.items()))
for vertex, degree_centrality in vertices_to_cut:
graph_copy.remove_node(vertex)
return graph_copy
[docs]def histogram_betweenness_centrality(
graph: Union[nx.Graph, nx.DiGraph],
bin_directive: Union[int, List[Union[float, int]], np.ndarray, str] = 10,
num_random_samples: Optional[int] = None,
normalized: bool = True,
weight_attribute: Optional[str] = "weight",
include_endpoints: bool = False,
random_seed: Optional[Union[int, random.Random, np.random.RandomState]] = None,
) -> DefinedHistogram:
"""
Generates a histogram of the vertex betweenness centrality of the provided graph. Histogram function is
fundamentally proxied through to numpy's `histogram` function, and bin selection follows :func:`numpy.histogram`
processes.
The betweenness centrality calculation can take advantage of networkx' implementation of randomized sampling by
providing num_random_samples (or ``k``, in networkx betweenness_centrality nomenclature).
Parameters
----------
graph : Union[nx.Graph, nx.DiGraph]
The graph. No changes will be made to it.
bin_directive : Union[int, List[Union[float, int]], numpy.ndarray, str]
Is passed directly through to numpy's "histogram" (and thus, "histogram_bin_edges") functions.
See: :func:`numpy.histogram_bin_edges`
In short: if an int is provided, we use ``bin_directive`` number of equal range bins.
If a sequence is provided, these bin edges will be used and can be sized to whatever size you prefer
Note that the :class:`numpy.ndarray` should be ndim=1 and the values should be float or int.
num_random_samples : Optional[int]
Use num_random_samples for vertex samples to *estimate* betweeness. num_random_samples should be <=
len(graph.nodes). The larger num_random_samples is, the better the approximation. Default is ``None``.
normalized : bool
If True the betweenness values are normalized by :math:`2/((n-1)(n-2))` for undirected graphs, and
:math:`1/((n-1)(n-2))` for directed graphs where n is the number of vertices in the graph. Default is ``True``
weight_attribute : Optional[str]
If None, all edge weights are considered equal. Otherwise holds the name of the edge attribute used as weight.
Default is ``weight``
include_endpoints : bool
If True include the endpoints in the shortest path counts. Default is ``False``
random_seed : Optional[Union[int, random.Random, np.random.RandomState]]
Random seed or preconfigured random instance to be used for selecting random samples. Only used if
num_random_samples is set. None will generate a new random state. Specifying a random state will provide
consistent results between runs.
Returns
-------
DefinedHistogram
A named tuple that contains the histogram and the bin_edges used in the histogram
See Also
--------
networkx.algorithms.centrality.betweenness_centrality
"""
betweenness_centrality_dict = nx.betweenness_centrality(
G=graph,
k=num_random_samples,
normalized=normalized,
weight=weight_attribute,
endpoints=include_endpoints,
seed=random_seed,
)
histogram, bin_edges = np.histogram(
list(betweenness_centrality_dict.values()), bin_directive
)
return DefinedHistogram(histogram=histogram, bin_edges=bin_edges)
[docs]def cut_vertices_by_betweenness_centrality(
graph: Union[nx.Graph, nx.DiGraph],
cut_threshold: Union[int, float],
cut_process: str,
num_random_samples: Optional[int] = None,
normalized: bool = True,
weight_attribute: Optional[str] = "weight",
include_endpoints: bool = False,
random_seed: Optional[Union[int, random.Random, np.random.RandomState]] = None,
) -> Union[nx.Graph, nx.DiGraph]:
"""
Given a graph and a cut_threshold and a cut_process, return a copy of the graph with the vertices outside of the
cut_threshold.
The betweenness centrality calculation can take advantage of networkx' implementation of randomized sampling
by providing num_random_samples (or k, in networkx betweenness_centrality nomenclature).
Parameters
----------
graph : Union[nx.Graph, nx.DiGraph]
The graph that will be copied and pruned.
cut_threshold : Union[int, float]
The threshold for making cuts based on weight.
cut_process : str
Describes how we should make the cut; cut all edges larger or smaller than the cut_threshold, and whether
exclusive or inclusive. Allowed values are
- ``larger_than_inclusive``
- ``larger_than_exclusive``
- ``smaller_than_inclusive``
- ``smaller_than_exclusive``
num_random_samples : Optional[int]
Use num_random_samples for vertex samples to *estimate* betweenness. num_random_samples should be <=
len(graph.nodes). The larger num_random_samples is, the better the approximation. Default is ``None``.
normalized : bool
If True the betweenness values are normalized by :math:`2/((n-1)(n-2))` for undirected graphs, and
:math:`1/((n-1)(n-2))` for directed graphs where n is the number of vertices in the graph. Default is ``True``
weight_attribute : Optional[str]
If None, all edge weights are considered equal. Otherwise holds the name of the edge attribute used as weight.
Default is ``weight``
include_endpoints : bool
If True include the endpoints in the shortest path counts. Default is ``False``
random_seed : Optional[Union[int, random.Random, np.random.RandomState]]
Random seed or preconfigured random instance to be used for selecting random samples. Only used if
num_random_samples is set. None will generate a new random state. Specifying a random state will provide
consistent results between runs.
Returns
-------
Union[nx.Graph, nx.DiGraph]
Pruned copy of the same type of graph provided
See Also
--------
networkx.algorithms.centrality.betweenness_centrality
"""
graph_copy = graph.copy()
betweenness_centrality_dict = nx.betweenness_centrality(
G=graph,
k=num_random_samples,
normalized=normalized,
weight=weight_attribute,
endpoints=include_endpoints,
seed=random_seed,
)
filter_by = _filter_function_for_make_cuts(cut_threshold, cut_process)
vertices_to_cut = list(filter(filter_by, betweenness_centrality_dict.items()))
for vertex, degree_centrality in vertices_to_cut:
graph_copy.remove_node(vertex)
return graph_copy
```