Source code for graspologic.layouts.auto

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
import math
import time
from typing import Any, Dict, List, Optional, Tuple

import networkx as nx
import numpy as np
import umap
from sklearn.manifold import TSNE

from ..embed import node2vec_embed
from ..partition import leiden
from ..preprocessing import cut_edges_by_weight, histogram_edge_weight
from .classes import NodePosition
from .nooverlap import remove_overlaps

logger = logging.getLogger(__name__)


# automatically generates a layout by running node2vec over the graph and down
# projecting the embedding into 2d space via umap or tsne
def layout_tsne(
    graph: nx.Graph,
    perplexity: int,
    n_iter: int,
    max_edges: int = 10000000,
    random_seed: Optional[int] = None,
) -> Tuple[nx.Graph, List[NodePosition]]:
    """
    Automatic graph layout generation by creating a generalized node2vec embedding,
    then using t-SNE for dimensionality reduction to 2d space.

    By default, this function automatically attempts to prune each graph to a
    maximum of 10,000,000 edges by removing the lowest weight edges. This pruning is
    approximate and will leave your graph with at most ``max_edges``, but is not
    guaranteed to be precisely ``max_edges``.

    In addition to pruning edges by weight, this function also only operates over
    the largest connected component in the graph.

    After dimensionality reduction, sizes are generated for each node based upon
    their degree centrality, and these sizes and positions are further refined by an
    overlap removal phase. Lastly, a global partitioning algorithm
    (:func:`graspologic.partition.leiden`) is executed for the largest connected
    component and the partition ID is included with each node position.

    Parameters
    ----------
    graph : :class:`networkx.Graph`
        The graph to generate a layout for. This graph may have edges pruned if the
        count is too high and only the largest connected component will be used to
        automatically generate a layout.
    perplexity : int
        The perplexity is related to the number of nearest neighbors that is used in
        other manifold learning algorithms. Larger datasets usually require a larger
        perplexity. Consider selecting a value between 4 and 100. Different values
        can result in significantly different results.
    n_iter : int
        Maximum number of iterations for the optimization. We have found in practice
        that larger graphs require more iterations. We hope to eventually have more
        guidance on the number of iterations based on the size of the graph and the
        density of the edge connections.
    max_edges : int
        The maximum number of edges to use when generating the embedding. Default is
        ``10000000``. The edges with the lowest weights will be pruned until at most
        ``max_edges`` exist. Warning: this pruning is approximate and more edges
        than are necessary may be pruned. When running in a 32-bit environment you
        will most likely need to reduce this number or you will run out of memory.
    random_seed : int
        Seed to be used for reproducible results. Default is None and will produce a
        new random state. Specifying a random state will provide consistent results
        between runs. In addition the environment variable ``PYTHONHASHSEED`` must
        be set to control hash randomization.

    Returns
    -------
    Tuple[nx.Graph, List[NodePosition]]
        The largest connected component and a list of NodePositions for each node in
        the largest connected component. The NodePosition object contains:

        - node_id
        - x coordinate
        - y coordinate
        - size
        - community

    References
    ----------
    .. [1] van der Maaten, L.J.P.; Hinton, G.E. Visualizing High-Dimensional Data
        Using t-SNE. Journal of Machine Learning Research 9:2579-2605, 2008.
    """
    lcc_graph, tensors, labels = _node2vec_for_layout(graph, max_edges, random_seed)
    points = TSNE(
        perplexity=perplexity, n_iter=n_iter, random_state=random_seed
    ).fit_transform(tensors)
    positions = _node_positions_from(
        lcc_graph, labels, points, random_seed=random_seed
    )
    return lcc_graph, positions
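
# Illustrative usage sketch (not part of the graspologic module): one plausible way
# to call layout_tsne on a small synthetic graph. The graph, the helper name, and
# the perplexity/n_iter values below are arbitrary assumptions, not recommendations
# from graspologic; it relies only on the module-level imports above.
def _example_layout_tsne_usage() -> None:
    # Build a small random graph and give every edge an explicit unit weight.
    example_graph = nx.erdos_renyi_graph(200, 0.05, seed=0)
    nx.set_edge_attributes(example_graph, 1.0, "weight")
    lcc, node_positions = layout_tsne(
        example_graph, perplexity=30, n_iter=1000, random_seed=1
    )
    # Each NodePosition carries the node id, 2d coordinates, size, and community.
    for position in node_positions[:3]:
        print(position.node_id, position.x, position.y, position.size, position.community)
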
def layout_umap(
    graph: nx.Graph,
    min_dist: float = 0.75,
    n_neighbors: int = 25,
    max_edges: int = 10000000,
    random_seed: Optional[int] = None,
) -> Tuple[nx.Graph, List[NodePosition]]:
    """
    Automatic graph layout generation by creating a generalized node2vec embedding,
    then using UMAP for dimensionality reduction to 2d space.

    By default, this function automatically attempts to prune each graph to a
    maximum of 10,000,000 edges by removing the lowest weight edges. This pruning is
    approximate and will leave your graph with at most ``max_edges``, but is not
    guaranteed to be precisely ``max_edges``.

    In addition to pruning edges by weight, this function also only operates over
    the largest connected component in the graph.

    After dimensionality reduction, sizes are generated for each node based upon
    their degree centrality, and these sizes and positions are further refined by an
    overlap removal phase. Lastly, a global partitioning algorithm
    (:func:`graspologic.partition.leiden`) is executed for the largest connected
    component and the partition ID is included with each node position.

    Parameters
    ----------
    graph : :class:`networkx.Graph`
        The graph to generate a layout for. This graph may have edges pruned if the
        count is too high and only the largest connected component will be used to
        automatically generate a layout.
    min_dist : float
        The effective minimum distance between embedded points. Default is ``0.75``.
        Smaller values will result in a more clustered/clumped embedding where
        nearby points on the manifold are drawn closer together, while larger values
        will result in a more even dispersal of points. The value should be set
        relative to the ``spread`` value, which determines the scale at which
        embedded points will be spread out.
    n_neighbors : int
        The size of local neighborhood (in terms of number of neighboring sample
        points) used for manifold approximation. Default is ``25``. Larger values
        result in more global views of the manifold, while smaller values result in
        more local data being preserved.
    max_edges : int
        The maximum number of edges to use when generating the embedding. Default is
        ``10000000``. The edges with the lowest weights will be pruned until at most
        ``max_edges`` exist. Warning: this pruning is approximate and more edges
        than are necessary may be pruned. When running in a 32-bit environment you
        will most likely need to reduce this number or you will run out of memory.
    random_seed : int
        Seed to be used for reproducible results. Default is None and will produce
        random results.

    Returns
    -------
    Tuple[nx.Graph, List[NodePosition]]
        The largest connected component and a list of NodePositions for each node in
        the largest connected component. The NodePosition object contains:

        - node_id
        - x coordinate
        - y coordinate
        - size
        - community

    References
    ----------
    .. [1] McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and Projection
        for Dimension Reduction, ArXiv e-prints 1802.03426, 2018
    .. [2] Böhm, Jan Niklas; Berens, Philipp; Kobak, Dmitry. A Unifying Perspective
        on Neighbor Embeddings along the Attraction-Repulsion Spectrum. ArXiv
        e-prints 2007.08902v1, 17 Jul 2020.
    """
    lcc_graph, tensors, labels = _node2vec_for_layout(graph, max_edges, random_seed)
    points = umap.UMAP(
        min_dist=min_dist, n_neighbors=n_neighbors, random_state=random_seed
    ).fit_transform(tensors)
    positions = _node_positions_from(
        lcc_graph, labels, points, random_seed=random_seed
    )
    return lcc_graph, positions
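
# Illustrative usage sketch (not part of the graspologic module): calling layout_umap
# on a small connected graph. The graph, the helper name, and the parameter values
# are assumptions for demonstration only; layout_umap's defaults would also work.
def _example_layout_umap_usage() -> None:
    # A connected small-world graph with explicit unit edge weights.
    example_graph = nx.connected_watts_strogatz_graph(150, 6, 0.1, seed=0)
    nx.set_edge_attributes(example_graph, 1.0, "weight")
    lcc, node_positions = layout_umap(
        example_graph, min_dist=0.75, n_neighbors=25, random_seed=1
    )
    print(f"positioned {len(node_positions)} nodes in the largest connected component")
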
def _largest_connected_component(graph: nx.Graph) -> nx.Graph:
    largest_component = max(nx.connected_components(graph), key=len)
    return graph.subgraph(largest_component).copy()


def _approximate_prune(graph: nx.Graph, max_edges_to_keep: int = 1000000):
    num_edges = len(graph.edges())
    logger.info(f"num edges: {num_edges}")
    if num_edges > max_edges_to_keep:
        # Walk the edge-weight histogram from the heaviest bin downward until at
        # least max_edges_to_keep edges have been seen, then cut everything at or
        # below that bin edge. This keeps roughly (not exactly) max_edges_to_keep.
        histogram, bins = histogram_edge_weight(graph, bin_directive=100)
        counts = 0
        bin_edge_for_maximum_weight = bins[0]
        for i, count in reversed(list(enumerate(histogram))):
            counts += count
            if counts >= max_edges_to_keep:
                bin_edge_for_maximum_weight = bins[i + 1]
                break
        threshold = bin_edge_for_maximum_weight
        graph = cut_edges_by_weight(
            graph, cut_threshold=threshold, cut_process="smaller_than_inclusive"
        )
        logger.debug(f"after cut num edges: {len(graph.edges())}")
    return graph


def _node2vec_for_layout(
    graph: nx.Graph,
    max_edges: int = 10000000,
    random_seed: Optional[int] = None,
) -> Tuple[nx.Graph, np.ndarray, np.ndarray]:
    graph = _approximate_prune(graph, max_edges)
    graph = _largest_connected_component(graph)
    start = time.time()
    tensors, labels = node2vec_embed(
        graph=graph,
        dimensions=128,
        num_walks=10,
        window_size=2,
        iterations=3,
        random_seed=random_seed,
    )
    embedding_time = time.time() - start
    logger.info(f"embedding completed in {embedding_time} seconds")
    return graph, tensors, labels


def _node_positions_from(
    graph: nx.Graph,
    labels: np.ndarray,
    down_projection_2d: np.ndarray,
    random_seed: Optional[int] = None,
) -> List[NodePosition]:
    degree = graph.degree()
    sizes = _compute_sizes(degree)
    covered_area = _covered_size(sizes)
    scaled_points = _scale_points(down_projection_2d, covered_area)
    partitions = leiden(graph, random_seed=random_seed)
    positions = [
        NodePosition(
            node_id=key,
            x=scaled_points[index][0],
            y=scaled_points[index][1],
            size=sizes[key],
            community=partitions[key],
        )
        for index, key in enumerate(labels)
    ]
    positions = remove_overlaps(positions)
    return positions


def _find_min_max_degree(
    degrees: nx.classes.reportviews.DegreeView,
) -> Tuple[float, float]:
    min_degree = math.inf
    max_degree = -math.inf
    for _, degree in degrees:
        min_degree = min(min_degree, degree)
        max_degree = max(max_degree, degree)
    return min_degree, max_degree


def _compute_sizes(
    degrees: nx.classes.reportviews.DegreeView,
    min_size: float = 5.0,
    max_size: float = 150.0,
) -> Dict[Any, float]:
    # Min-max scale node degree into the [min_size, max_size] radius range.
    min_degree, max_degree = _find_min_max_degree(degrees)
    sizes = {}
    for node_id, degree in degrees:
        if max_degree == min_degree:
            size = min_size
        else:
            normalized = (degree - min_degree) / (max_degree - min_degree)
            size = normalized * (max_size - min_size) + min_size
        sizes[node_id] = size
    return sizes


def _covered_size(sizes: Dict[Any, float]) -> float:
    # Total area of all node circles, treating each size as a radius.
    total = sum(math.pow(value, 2) * math.pi for value in sizes.values())
    return total


def _get_bounds(points: np.ndarray) -> Tuple[float, float, float, float]:
    min_x, min_y = points.min(axis=0)
    max_x, max_y = points.max(axis=0)
    return min_x, min_y, max_x, max_y


def _center_points_on_origin(points: np.ndarray) -> np.ndarray:
    min_x, min_y, max_x, max_y = _get_bounds(points)
    x_range = max_x - min_x
    y_range = max_y - min_y
    center_x = x_range / 2 + min_x
    center_y = y_range / 2 + min_y
    points[:, 0] -= center_x
    points[:, 1] -= center_y
    return points


def _scale_to_unit_square(points: np.ndarray) -> np.ndarray:
    _, _, max_x, max_y = _get_bounds(points)
    points[:, 0] /= max_x
    points[:, 1] /= max_y
    return points


def _scale_to_new_bounds(points: np.ndarray, max_x: float, max_y: float) -> np.ndarray:
    points[:, 0] *= max_x
    points[:, 1] *= max_y
    return points


def _new_bounds(
    min_x: float,
    min_y: float,
    max_x: float,
    max_y: float,
    covered_area: float,
    target_ratio: float,
) -> Tuple[float, float, float, float]:
    range_x = max_x - min_x
    mid_x = min_x + range_x / 2
    range_y = max_y - min_y
    mid_y = min_y + range_y / 2
    range_ratio = range_x / range_y
    new_area = covered_area / target_ratio
    new_range_y = math.sqrt(new_area / range_ratio)
    new_range_x = new_area / new_range_y
    new_min_x, new_min_y, new_max_x, new_max_y = (
        mid_x - new_range_x / 2,
        mid_y - new_range_y / 2,
        mid_x + new_range_x / 2,
        mid_y + new_range_y / 2,
    )
    return new_min_x, new_min_y, new_max_x, new_max_y


def _scale_points(
    points: np.ndarray, covered_area: float, target_ratio: float = 0.14
) -> np.ndarray:
    # through lots of experiments we have found 14% to be a good ratio
    # of color to whitespace
    moved_points = _center_points_on_origin(points)
    min_x, min_y, max_x, max_y = _get_bounds(moved_points)
    _, _, new_max_x, new_max_y = _new_bounds(
        min_x, min_y, max_x, max_y, covered_area, target_ratio
    )
    scaled_points = _scale_to_unit_square(moved_points)
    final_locs = _scale_to_new_bounds(scaled_points, new_max_x, new_max_y)
    return final_locs
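
# Worked sketch (illustrative, not part of the graspologic module): how the sizing
# and scaling helpers above fit together. Node radii are min-max scaled from degree
# into [5.0, 150.0], their total circle area is computed, and the 2d points are
# rescaled so that area covers roughly 14% of the new bounding box. The graph and
# helper name are assumptions for demonstration only.
def _example_scaling_math() -> None:
    example_graph = nx.path_graph(5)  # degrees: 1, 2, 2, 2, 1
    sizes = _compute_sizes(example_graph.degree())  # endpoints -> 5.0, interior -> 150.0
    covered = _covered_size(sizes)  # sum of pi * r**2 over all node radii
    points = np.random.default_rng(0).normal(size=(5, 2))  # stand-in 2d embedding
    scaled = _scale_points(points, covered)  # bounding box area ~= covered / 0.14
    print(sizes, covered, scaled.shape)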