Label spreading and propagation on graphs.
Label propagation is a semi-supervised machine learning algorithm used for classification and community detection tasks on graphs. It works by propagating labels through the graph based on the structure and connectivity of the nodes:
- Initialization: Assign initial labels to a subset of nodes (these are the labeled nodes). The rest of the nodes are unlabeled.
- Propagation: Iteratively update the labels of the unlabeled nodes based on the labels of their neighbors. This is typically done by majority voting or averaging the labels of neighboring nodes.
- Convergence: Repeat the propagation step until the labels stabilize (i.e., no further changes occur) or a maximum number of iterations is reached.
- Prediction: Once the algorithm converges, the labels of the unlabeled nodes are used as the final predictions.
Label propagation is particularly useful in scenarios where labeled data is scarce but the graph structure is informative. It leverages the assumption that connected nodes are likely to share the same label.
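If you just want to use these algorithms on feature-vector data, scikit-learn implements both of them (sklearn.semi_supervised.LabelPropagation and LabelSpreading). Here is a minimal sketch, with made-up toy data, using the convention that -1 marks unlabeled points:

from sklearn.semi_supervised import LabelSpreading
import numpy as np

# Two well-separated 1D clusters; only one point per cluster is labeled
X = np.array([[0.0], [0.1], [0.2], [5.0], [5.1], [5.2]])
y = np.array([0, -1, -1, 1, -1, -1])  # -1 marks unlabeled points

model = LabelSpreading(kernel='knn', n_neighbors=2)
model.fit(X, y)
print(model.transduction_)  # expected: [0 0 0 1 1 1]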
Note that Neo4j ships the label propagation algorithm in its Graph Data Science (GDS) library. Other vendors, like Memgraph, prefer the LabelRankT variation, which has the advantage that it can be executed via a trigger; this triggered detection makes it very useful for streaming data and real-time anomaly detection.
Simplistic
The most basic version of label propagation could be like this:

import networkx as nx
import numpy as np

def label_propagation(G, labels, max_iter=100):
    nodes = list(G.nodes())
    for _ in range(max_iter):
        new_labels = labels.copy()
        for node in nodes:
            if labels[node] is None:
                neighbor_labels = [labels[neighbor] for neighbor in G.neighbors(node) if labels[neighbor] is not None]
                if neighbor_labels:
                    new_labels[node] = max(set(neighbor_labels), key=neighbor_labels.count)
        if new_labels == labels:
            break
        labels = new_labels
    return labels
If we apply this to the karate club graph and assign None to all nodes except nodes 33 and 0:
G = nx.karate_club_graph()
initial_labels = {node: None for node in G.nodes()}
initial_labels[0] = 0   # Label node 0 as class 0
initial_labels[33] = 1  # Label node 33 as class 1

final_labels = label_propagation(G, initial_labels)
print(final_labels)
{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 1, 10: 0, 11: 0, 12: 0, 13: 0, 14: 1, 15: 1, 16: 0, 17: 0, 18: 1, 19: 0, 20: 1, 21: 0, 22: 1, 23: 1, 24: 0, 25: 0, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 0, 32: 1, 33: 1}
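As a quick sanity check, networkx stores the real faction of each member in the club node attribute, so we can count how many propagated labels agree with it (a small sketch, not part of the algorithm itself):

# Node 0 is "Mr. Hi", node 33 is "Officer", matching our seed classes 0 and 1
ground_truth = {n: 0 if G.nodes[n]["club"] == "Mr. Hi" else 1 for n in G.nodes()}
matches = sum(final_labels[n] == ground_truth[n] for n in G.nodes())
print(f"{matches}/{G.number_of_nodes()} nodes match the real club split")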
This can be visualized for instance with PyVis:
from pyvis import network as net

g = net.Network(notebook=True, cdn_resources='in_line')
g.from_nx(G)

def set_options(net_g):
    options = """
    var options = {
      "configure": {
        "enabled": true
      },
      "edges": {
        "color": {
          "inherit": false
        },
        "smooth": true
      },
      "physics": {
        "barnesHut": {
          "gravitationalConstant": -3000.0
        },
        "minVelocity": 0.75
      }
    }
    """
    net_g.set_options(options)

set_options(g)
g.show("simple.html")
By setting the size and color of the two seed nodes and coloring every node by its final label we get:
0]["color"]="#00ff00"
g.nodes[0]["size"]=20
g.nodes[33]["color"]="#ff0000"
g.nodes[33]["size"]=20
g.nodes[for i in final_labels:
if final_labels[i]==1:
"color"]="#ff0000"
g.nodes[i][else:
"color"]="#00ff00"
g.nodes[i]["title"]=str(i)
g.nodes[i]["example.html") g.show(
You can see that this simplistic propagation algorithm isn't very good; there are some nodes which seem to be isolated from their label's community.
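Part of the problem is tie-breaking: max(set(...), key=neighbor_labels.count) resolves ties by arbitrary set ordering, and the nodes are always visited in the same order. A common tweak, sketched below rather than a canonical fix, shuffles the visiting order and breaks ties randomly:

import random

def label_propagation_shuffled(G, labels, max_iter=100, seed=42):
    rng = random.Random(seed)
    nodes = list(G.nodes())
    for _ in range(max_iter):
        new_labels = labels.copy()
        rng.shuffle(nodes)  # visit nodes in a random order each sweep
        for node in nodes:
            if labels[node] is None:
                neighbor_labels = [labels[n] for n in G.neighbors(node) if labels[n] is not None]
                if neighbor_labels:
                    best = max(neighbor_labels.count(l) for l in set(neighbor_labels))
                    ties = [l for l in set(neighbor_labels) if neighbor_labels.count(l) == best]
                    new_labels[node] = rng.choice(ties)  # break ties randomly
        if new_labels == labels:
            break
        labels = new_labels
    return labels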
Label propagation
A more rigorous approach can be found in Xiaojin Zhu and Zoubin Ghahramani, Learning from labeled and unlabeled data with label propagation, Technical Report CMU-CALD-02-107, Carnegie Mellon University, 2002. The algorithm there can be summed up as:
\[
\begin{array}{l}
\mathbf{W} \text{: adjacency matrix of the graph} \\
\text{Compute the diagonal degree matrix } \mathbf{D} \text{ by } \mathbf{D}_{ii} \leftarrow \sum_{j} W_{ij} \\
\text{Initialize } \hat{Y}^{(0)} \leftarrow \left(y_{1}, \ldots, y_{l}, 0, 0, \ldots, 0\right) \\
\text{Iterate} \\
\quad \text{1. } \hat{Y}^{(t+1)} \leftarrow \mathbf{D}^{-1} \mathbf{W} \hat{Y}^{(t)} \\
\quad \text{2. } \hat{Y}_{l}^{(t+1)} \leftarrow Y_{l} \\
\text{until convergence to } \hat{Y}^{(\infty)} \\
\text{Label point } x_{i} \text{ by the sign of } \hat{y}_{i}^{(\infty)}
\end{array}
\]
with a graph where initially some nodes are labelled and the propagation assigns labels via diffusion.
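To make the two-step iteration concrete, here is a minimal numpy sketch on a made-up four-node path graph (an illustration only, not part of the implementation below). With the endpoints clamped to +1 and -1, the interior nodes converge to the harmonic values 1/3 and -1/3:

import numpy as np

# Path graph 0-1-2-3; node 0 is clamped to +1 and node 3 to -1
W = np.array([[0., 1., 0., 0.],
              [1., 0., 1., 0.],
              [0., 1., 0., 1.],
              [0., 0., 1., 0.]])
D_inv = np.diag(1.0 / W.sum(axis=1))
Y = np.array([1.0, 0.0, 0.0, -1.0])

for _ in range(50):
    Y = D_inv @ W @ Y         # step 1: diffuse the labels
    Y[0], Y[3] = 1.0, -1.0    # step 2: clamp the known labels
print(Y.round(3))             # [ 1.  0.333 -0.333 -1. ]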
Below you can also find a label spreading algorithm, and we'll base both algorithms on the same base class:
from abc import abstractmethod
import torch

class BaseLabelPropagation:
    def __init__(self, adj_matrix):
        self.norm_adj_matrix = self._normalize(adj_matrix)
        self.n_nodes = adj_matrix.size(0)
        self.one_hot_labels = None
        self.n_classes = None
        self.labeled_mask = None
        self.predictions = None

    @staticmethod
    @abstractmethod
    def _normalize(adj_matrix):
        raise NotImplementedError("_normalize must be implemented")

    @abstractmethod
    def _propagate(self):
        raise NotImplementedError("_propagate must be implemented")

    def _one_hot_encode(self, labels):
        # Get the number of classes
        classes = torch.unique(labels)
        classes = classes[classes != -1]
        self.n_classes = classes.size(0)
        # One-hot encode labeled data instances and zero rows corresponding to unlabeled instances
        unlabeled_mask = (labels == -1)
        labels = labels.clone()  # defensive copying
        labels[unlabeled_mask] = 0
        self.one_hot_labels = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)
        self.one_hot_labels = self.one_hot_labels.scatter(1, labels.unsqueeze(1), 1)
        self.one_hot_labels[unlabeled_mask, 0] = 0
        self.labeled_mask = ~unlabeled_mask

    def fit(self, labels, max_iter, tol):
        """Fits a semi-supervised learning label propagation model.

        labels: torch.LongTensor
            Tensor of size n_nodes indicating the class number of each node.
            Unlabeled nodes are denoted with -1.
        max_iter: int
            Maximum number of iterations allowed.
        tol: float
            Convergence tolerance: threshold to consider the system at steady state.
        """
        self._one_hot_encode(labels)
        self.predictions = self.one_hot_labels.clone()
        prev_predictions = torch.zeros((self.n_nodes, self.n_classes), dtype=torch.float)

        for i in range(max_iter):
            # Stop iterations if the system is considered at a steady state
            variation = torch.abs(self.predictions - prev_predictions).sum().item()
            if variation < tol:
                print(f"The method stopped after {i} iterations, variation={variation:.4f}.")
                break
            prev_predictions = self.predictions
            self._propagate()

    def predict(self):
        return self.predictions

    def predict_classes(self):
        return self.predictions.max(dim=1).indices
The Zhu and Ghahramani algorithm is now:
class LabelPropagation(BaseLabelPropagation):
    def __init__(self, adj_matrix):
        super().__init__(adj_matrix)

    @staticmethod
    def _normalize(adj_matrix):
        """Computes D^-1 * W"""
        degs = adj_matrix.sum(dim=1)
        degs[degs == 0] = 1  # avoid division by zero for isolated nodes
        return adj_matrix / degs[:, None]

    def _propagate(self):
        self.predictions = torch.matmul(self.norm_adj_matrix, self.predictions)
        # Put back already known labels
        self.predictions[self.labeled_mask] = self.one_hot_labels[self.labeled_mask]

    def fit(self, labels, max_iter=1000, tol=1e-3):
        super().fit(labels, max_iter, tol)
Label spreading
The Label Spreading algorithm from Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston, Bernhard Schoelkopf. Learning with local and global consistency (2004) is as follows:
\[
\begin{array}{l}
\mathbf{W} \text{: adjacency matrix of the graph} \\
\text{Compute the diagonal degree matrix } \mathbf{D} \text{ by } \mathbf{D}_{ii} \leftarrow \sum_{j} W_{ij} \\
\text{Compute the normalized graph Laplacian } \mathcal{L} \leftarrow \mathbf{D}^{-1/2} \mathbf{W} \mathbf{D}^{-1/2} \\
\text{Initialize } \hat{Y}^{(0)} \leftarrow \left(y_{1}, \ldots, y_{l}, 0, 0, \ldots, 0\right) \\
\text{Choose a parameter } \alpha \in [0, 1) \\
\text{Iterate } \hat{Y}^{(t+1)} \leftarrow \alpha \mathcal{L} \hat{Y}^{(t)} + (1-\alpha) \hat{Y}^{(0)} \text{ until convergence to } \hat{Y}^{(\infty)} \\
\text{Label point } x_{i} \text{ by the sign of } \hat{y}_{i}^{(\infty)}
\end{array}
\]
Note that the normalized Laplacian-like matrix \(\mathbf{D}^{-1/2} \mathbf{W} \mathbf{D}^{-1/2}\) is used here rather than the row-normalized \(\mathbf{D}^{-1} \mathbf{W}\) above; it mimics the continuous diffusion equation more closely.
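Zhou et al. also derive the closed-form limit \(\hat{Y}^{(\infty)} = (1-\alpha)(\mathbf{I} - \alpha \mathcal{L})^{-1} \hat{Y}^{(0)}\) of this iteration. Here is a minimal sketch (assuming the normalized matrix and one-hot labels as computed by the class below), which should agree with the iterative result up to the tolerance:

import torch

def label_spreading_closed_form(norm_adj_matrix, one_hot_labels, alpha=0.8):
    # Y_inf = (1 - alpha) * (I - alpha * L)^-1 * Y0
    n = norm_adj_matrix.size(0)
    identity = torch.eye(n)
    return (1 - alpha) * torch.linalg.solve(identity - alpha * norm_adj_matrix, one_hot_labels)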
class LabelSpreading(BaseLabelPropagation):
    def __init__(self, adj_matrix):
        super().__init__(adj_matrix)
        self.alpha = None

    @staticmethod
    def _normalize(adj_matrix):
        """Computes D^-1/2 * W * D^-1/2"""
        degs = adj_matrix.sum(dim=1)
        norm = torch.pow(degs, -0.5)
        norm[torch.isinf(norm)] = 1
        return adj_matrix * norm[:, None] * norm[None, :]

    def _propagate(self):
        self.predictions = (
            self.alpha * torch.matmul(self.norm_adj_matrix, self.predictions)
            + (1 - self.alpha) * self.one_hot_labels
        )

    def fit(self, labels, max_iter=3000, tol=1e-5, alpha=0.5):
        """
        Parameters
        ----------
        alpha: float
            Clamping factor.
        """
        self.alpha = alpha
        super().fit(labels, max_iter, tol)
Testing models on synthetic data
Let’s apply these two propagations to our karate graph:
G = nx.karate_club_graph()
adj_matrix = nx.adjacency_matrix(G).toarray()

# Create labels
labels = np.full(len(G.nodes), -1.)
labels[0] = 0
labels[33] = 1

adj_matrix_t = torch.FloatTensor(adj_matrix)
labels_t = torch.LongTensor(labels)

# Learn with Label Propagation
label_propagation = LabelPropagation(adj_matrix_t)
print("Label Propagation: ", end="")
label_propagation.fit(labels_t)
label_propagation_output_labels = label_propagation.predict_classes()

# Learn with Label Spreading
label_spreading = LabelSpreading(adj_matrix_t)
print("Label Spreading: ", end="")
label_spreading.fit(labels_t, alpha=0.8)
label_spreading_output_labels = label_spreading.predict_classes()
Label Propagation: The method stopped after 39 iterations, variation=0.0008.
Label Spreading: The method stopped after 31 iterations, variation=0.0000.
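A quick check (reusing the tensors above) shows on how many nodes the two methods disagree:

diff = (label_propagation_output_labels != label_spreading_output_labels).sum().item()
print(f"The two methods disagree on {diff} of {len(label_propagation_output_labels)} nodes.")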
The label propagation can be seen via this rendering:
prop_labels = label_propagation_output_labels.numpy()
g = net.Network(notebook=True, cdn_resources='in_line')
g.from_nx(G)
set_options(g)
g.nodes[0]["color"] = "#00ff00"
g.nodes[0]["size"] = 20
g.nodes[33]["color"] = "#ff0000"
g.nodes[33]["size"] = 20
for i, v in enumerate(prop_labels):
    if v == 1:
        g.nodes[i]["color"] = "#ff0000"
    else:
        g.nodes[i]["color"] = "#00ff00"
    g.nodes[i]["title"] = str(i)
g.show("propagation.html")
The label spreading gives:
spread_labels = label_spreading_output_labels.numpy()
g = net.Network(notebook=True, cdn_resources='in_line')
g.from_nx(G)
set_options(g)
g.nodes[0]["color"] = "#00ff00"
g.nodes[0]["size"] = 20
g.nodes[33]["color"] = "#ff0000"
g.nodes[33]["size"] = 20
for i, v in enumerate(spread_labels):
    if v == 1:
        g.nodes[i]["color"] = "#ff0000"
    else:
        g.nodes[i]["color"] = "#00ff00"
    g.nodes[i]["title"] = str(i)
g.show("spreading.html")
If you apply these to a caveman graph, the propagation reflects the community topology:
import pandas as pd
import numpy as np
import networkx as nx

# Create caveman graph
n_cliques = 4
size_cliques = 10
caveman_graph = nx.connected_caveman_graph(n_cliques, size_cliques)
adj_matrix = nx.adjacency_matrix(caveman_graph).toarray()

# Create labels
labels = np.full(n_cliques * size_cliques, -1.)

# Only one node per clique is labeled. Each clique belongs to a different class.
labels[0] = 0
labels[size_cliques] = 1
labels[size_cliques * 2] = 2
labels[size_cliques * 3] = 3

# Create input tensors
adj_matrix_t = torch.FloatTensor(adj_matrix)
labels_t = torch.LongTensor(labels)

# Learn with Label Propagation
label_propagation = LabelPropagation(adj_matrix_t)
print("Label Propagation: ", end="")
label_propagation.fit(labels_t)
label_propagation_output_labels = label_propagation.predict_classes()

# Learn with Label Spreading
label_spreading = LabelSpreading(adj_matrix_t)
print("Label Spreading: ", end="")
label_spreading.fit(labels_t, alpha=0.8)
label_spreading_output_labels = label_spreading.predict_classes()
Label Propagation: The method stopped after 73 iterations, variation=0.0010.
Label Spreading: The method stopped after 39 iterations, variation=0.0000.
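Since every block of size_cliques consecutive node ids forms one clique, a quick purity check (a sketch reusing the variables above) tells us whether each clique ended up with a single label:

pred = label_spreading_output_labels.numpy()
for c in range(n_cliques):
    block = pred[c * size_cliques:(c + 1) * size_cliques]
    values, counts = np.unique(block, return_counts=True)
    print(f"clique {c}: labels {values.tolist()} with counts {counts.tolist()}")

The spreading result can be rendered as before: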
spread_labels = label_spreading_output_labels.numpy()
g = net.Network(notebook=True, cdn_resources='in_line')
g.from_nx(caveman_graph)
set_options(g)
color_map = ["#ff0000", "#00ff00", "#0000ff", "#ff00ff"]
# Enlarge the four seed nodes, one per clique
for seed in (0, size_cliques, size_cliques * 2, size_cliques * 3):
    g.nodes[seed]["size"] = 20
for i, v in enumerate(spread_labels):
    g.nodes[i]["color"] = color_map[v]
    g.nodes[i]["title"] = str(i)
g.show("caveman.html")
Aside from three anomalous nodes, this indeed gives a good result.