Graph Embeddings

Bioinformatics reading group, 8 Nov 2019

Yi Liu

Outline

  • DeepWalk: intro to graph embeddings
  • DeepWalk example
  • Node2Vec
  • DeepGL
  • References

Graph and graph embeddings

$$ G = (V, E, X, Y) $$

$$ f(X) \rightarrow Y $$

  • $V$: vertices
  • $E$: edges
  • $X$: vertex attributes
  • $Y$: vertex labels

A graph embedding learns a low-dimensional vector for each vertex that captures its position in the graph; those vectors then serve as features for the downstream task $f$.

Word2Vec (SkipGram)
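SkipGram learns a vector for each word by training it to predict the words around it within a fixed window. A minimal gensim sketch on a toy corpus (illustrative only; the sentences are made up, and the parameters mirror the DeepWalk run below):

from gensim.models import Word2Vec

# toy corpus: each "sentence" is a list of tokens
sentences = [
    ["gene", "expression", "regulates", "protein", "levels"],
    ["protein", "levels", "affect", "gene", "expression"],
]
# sg=1 selects SkipGram (rather than CBOW); hs=1 uses hierarchical softmax
toy_model = Word2Vec(sentences, size=16, window=2, min_count=0, sg=1, hs=1)
toy_model.wv.most_similar("gene")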

DeepWalk

From sentence embeddings to graph embeddings
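DeepWalk's key move: truncated random walks over a graph play the role of sentences, and node IDs play the role of words, so the SkipGram machinery applies unchanged. A minimal sketch of a single walk (an illustration of the idea, not the deepwalk package's implementation; g is assumed to be a networkx graph):

import random

def random_walk(g, start, length, rng=random.Random(0)):
    # one truncated random walk; the node IDs become the "words"
    walk = [start]
    while len(walk) < length:
        neighbors = list(g[walk[-1]])
        if not neighbors:  # dead end: truncate the walk early
            break
        walk.append(rng.choice(neighbors))
    return [str(node) for node in walk]  # gensim expects string tokens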

In [2]:
import numpy as np
import networkx as nx

nx_graph = nx.Graph()
nodes = list(np.arange(1, 14))
# fmt: off
edges = [
    (1, 2), (2, 3), (3, 4), (4, 5),
    (1, 6), (2, 6), (3, 6), (4, 6), (5, 6),
    (6, 7),
    (7, 8),
    (9, 8), (10, 8), (11, 8), (12, 8), (13, 8),
    (9, 10), (10, 11), (11, 12), (12, 13),
]
# fmt: on
nx_graph.add_nodes_from(nodes)
nx_graph.add_edges_from(edges)
nx.draw(nx_graph, with_labels=True, node_color="white")
In [3]:
# https://github.com/phanein/deepwalk/blob/master/deepwalk/__main__.py
import random

import deepwalk
from deepwalk import walks as serialized_walks
from deepwalk.skipgram import Skipgram
from gensim.models import Word2Vec
from six import iterkeys
In [4]:
deepwalk_graph = deepwalk.graph.Graph()

for x in nx_graph.nodes():
    for y in iterkeys(nx_graph[x]):
        deepwalk_graph[x].append(y)
        
deepwalk_graph.make_undirected()
deepwalk_graph
Out[4]:
Graph(list,
      {1: [2, 6],
       2: [1, 3, 6],
       3: [2, 4, 6],
       4: [3, 5, 6],
       5: [4, 6],
       6: [1, 2, 3, 4, 5, 7],
       7: [6, 8],
       8: [7, 9, 10, 11, 12, 13],
       9: [8, 10],
       10: [8, 9, 11],
       11: [8, 10, 12],
       12: [8, 11, 13],
       13: [8, 12]})
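The adjacency list makes the structure explicit: nodes 1–5 cluster around hub 6, nodes 9–13 cluster around hub 8, and node 7 bridges the two communities.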
In [5]:
# hyper-parameters

NUM_WALKS = 80       # random walks started per node
WALK_LENGTH = 40     # length of each walk
WINDOW_SIZE = 10     # SkipGram context window
EMBEDDING_DIM = 128  # embedding dimensionality
In [6]:
walks = deepwalk.graph.build_deepwalk_corpus(
    deepwalk_graph,
    num_paths=NUM_WALKS,
    path_length=WALK_LENGTH,
    alpha=0,  # probability of restarting each walk from its root node
    rand=random.Random(123),
)
print(np.array(walks))
print(np.array(walks).shape)
[['9' '10' '11' ... '12' '13' '8']
 ['6' '5' '6' ... '8' '10' '11']
 ['8' '10' '11' ... '9' '10' '9']
 ...
 ['13' '8' '10' ... '6' '4' '6']
 ['6' '4' '5' ... '7' '8' '12']
 ['8' '12' '13' ... '9' '10' '8']]
(1040, 40)
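Each of the 13 nodes seeds NUM_WALKS = 80 walks (13 × 80 = 1040), each truncated at WALK_LENGTH = 40 steps; note the walks are lists of string tokens, which is what gensim expects.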
In [7]:
model = Word2Vec(
    walks,
    size=EMBEDDING_DIM,
    window=WINDOW_SIZE,
    min_count=0,  # keep every node, however rarely it is visited
    sg=1,         # SkipGram (rather than CBOW)
    hs=1,         # hierarchical softmax
    workers=1,
)
In [8]:
# nodes, or "vocabulary"
model.wv.vocab
Out[8]:
{'9': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6810>,
 '10': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6890>,
 '11': <gensim.models.keyedvectors.Vocab at 0x1a1d7b68d0>,
 '8': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6910>,
 '7': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6990>,
 '6': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6a10>,
 '1': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6a50>,
 '3': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6a90>,
 '2': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6950>,
 '4': <gensim.models.keyedvectors.Vocab at 0x1a1d7b69d0>,
 '5': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6ad0>,
 '13': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6b10>,
 '12': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6b50>}
In [9]:
# "word" embeddings
model.wv["1"]
Out[9]:
array([-0.00095934, -0.14086728,  0.07352262,  0.13970056, -0.0497438 ,
       -0.2926532 , -0.06589775, -0.10685308,  0.18745854, -0.1012044 ,
       -0.03285081,  0.29395282, -0.01746033, -0.04551164, -0.07437321,
        0.00040195,  0.08093731,  0.04620257,  0.05116988,  0.02569078,
       -0.12490681,  0.02504689,  0.1027611 , -0.08071101, -0.24925862,
        0.12331013, -0.13090487, -0.1167615 ,  0.15161267,  0.00957596,
       -0.12357164,  0.08300231,  0.0844809 , -0.12852252, -0.20644854,
        0.11501899,  0.26970205,  0.07551242, -0.11944196, -0.06687455,
       -0.1480233 , -0.18081237,  0.17705128, -0.07423408, -0.08211829,
        0.26821792,  0.0630434 , -0.23834868, -0.00506505,  0.07663959,
       -0.08388641,  0.1582118 , -0.07193353,  0.01382565,  0.00668014,
        0.01537236, -0.07911585,  0.09093268,  0.04782807, -0.04677264,
       -0.23164956, -0.17346524,  0.12032165, -0.0476453 , -0.13943444,
       -0.1260258 ,  0.13450332,  0.10808325, -0.01826159, -0.3575986 ,
        0.15718068, -0.04420409, -0.11785912,  0.10133453, -0.19064085,
       -0.20066042,  0.08204125,  0.06170731,  0.16305035, -0.13007683,
       -0.03484008, -0.14941491, -0.09808226,  0.08392007,  0.046721  ,
        0.01600912, -0.07195069,  0.01543452,  0.13940337, -0.07913083,
       -0.184816  , -0.0933992 , -0.07203937, -0.07271566,  0.11571857,
        0.00552847,  0.00289924, -0.0523177 ,  0.12037521, -0.09778723,
       -0.03153525, -0.05874648,  0.08392701,  0.19099884, -0.00918258,
        0.05699351, -0.04814161, -0.03177148,  0.00090648, -0.06779662,
       -0.19197589,  0.12392153,  0.03059392, -0.21022129, -0.02727187,
        0.13549924,  0.05740344,  0.00623336, -0.09875766,  0.04643035,
       -0.10097225,  0.10052127,  0.06828507,  0.10290339,  0.1309135 ,
        0.01477405,  0.08474117,  0.21653458], dtype=float32)
In [10]:
model.wv.most_similar("1")
Out[10]:
[('3', 0.9698188304901123),
 ('6', 0.9323770999908447),
 ('2', 0.9054238796234131),
 ('4', 0.8182910084724426),
 ('5', 0.77535080909729),
 ('7', 0.6318682432174683),
 ('9', 0.05110911279916763),
 ('11', 0.05021246522665024),
 ('10', 0.03185462951660156),
 ('8', 0.02739347517490387)]
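All of node 1's nearest neighbours in embedding space (nodes 2–7) sit in its own community, and the cosine similarity drops by an order of magnitude for nodes in the other community.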
In [11]:
nx.draw(nx_graph, with_labels=True, node_color="white")
In [12]:
# Visualisation of "word" embeddings using PCA
from sklearn.decomposition import PCA
from matplotlib import pyplot

def plot_pca(model):
    # project every node's embedding down to 2-D with PCA
    X = model.wv[model.wv.vocab]
    pca = PCA(n_components=2)
    result = pca.fit_transform(X)

    pyplot.scatter(result[:, 0], result[:, 1])
    words = list(model.wv.vocab)
    for i, word in enumerate(words):
        pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
    pyplot.show()

plot_pca(model)

Node2Vec

Search strategies

  • BFS: Breadth-first Sampling
  • DFS: Depth-first Sampling

Node similarities in embedding space

  • Homophily:
    • nodes that are highly interconnected or belong to the same community embed close together
  • Structural equivalence:
    • nodes with similar structural roles (e.g. hubs, bridges) embed close together, regardless of whether they are connected

2nd-order random walk:

  • A mixture of BFS and DFS: the walk remembers the previous node $t$ while choosing the next step from the current node $v$
  • Search bias $\alpha$ (see the sketch after this list):
    • return parameter $p$: controls the likelihood of immediately revisiting $t$ (larger $p$, less backtracking)
    • in-out parameter $q$: larger $q$ keeps the walk close to $t$ (BFS-like), smaller $q$ biases it towards unexplored nodes (DFS-like)
  • Semi-supervised learning: $p$ and $q$ can be tuned using a small set of labelled nodes
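For reference, the unnormalised bias $\alpha_{pq}(t, x)$ from the Node2Vec paper, written out as a small function (a sketch of the formula, not the node2vec library's internals; graph is assumed to be a networkx graph, t the previous node, and x a candidate next step from the current node v):

def alpha_pq(graph, t, x, p, q):
    # unnormalised 2nd-order bias for a walk that just moved t -> v
    if x == t:                  # d(t, x) == 0: immediately revisit t
        return 1 / p
    elif graph.has_edge(t, x):  # d(t, x) == 1: stay close to t (BFS-like)
        return 1
    else:                       # d(t, x) == 2: move away from t (DFS-like)
        return 1 / q

The unnormalised probability of stepping from $v$ to $x$ is then $\alpha_{pq}(t, x)$ times the weight of edge $(v, x)$, normalised over $v$'s neighbours.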

In [13]:
from node2vec import Node2Vec
In [14]:
# p and q default to 1 here, i.e. unbiased (DeepWalk-like) walks
node2vec = Node2Vec(
    nx_graph, dimensions=EMBEDDING_DIM, walk_length=WALK_LENGTH, num_walks=NUM_WALKS
)
Computing transition probabilities: 100%|██████████| 13/13 [00:00<00:00, 7282.75it/s]
Generating walks (CPU: 1): 100%|██████████| 80/80 [00:01<00:00, 63.13it/s]
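The first progress bar is node2vec precomputing the 2nd-order transition probabilities for each of the 13 nodes; the second is walk generation, one pass per walk iteration (80 in total).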
In [15]:
print(
    "walks: \n", np.array(node2vec.walks), 
    "\n\n",
    "shape: ", np.array(node2vec.walks).shape 
)
walks: 
 [['9' '10' '11' ... '3' '4' '5']
 ['6' '3' '4' ... '2' '1' '2']
 ['12' '11' '12' ... '1' '2' '6']
 ...
 ['5' '4' '6' ... '8' '11' '8']
 ['10' '11' '12' ... '3' '6' '3']
 ['11' '8' '13' ... '8' '9' '10']] 

 shape:  (1040, 40)
In [16]:
model = node2vec.fit(window=WINDOW_SIZE, min_count=0)
In [17]:
model.wv.vocab
Out[17]:
{'9': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6710>,
 '10': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6750>,
 '11': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6790>,
 '8': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6810>,
 '7': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6890>,
 '6': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6910>,
 '2': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6950>,
 '5': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6990>,
 '4': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6850>,
 '3': <gensim.models.keyedvectors.Vocab at 0x1a1e4f68d0>,
 '1': <gensim.models.keyedvectors.Vocab at 0x1a1e4f69d0>,
 '12': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6a10>,
 '13': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6a50>}
In [18]:
plot_pca(model)

EpiGraphDB

In [19]:
import json

with open("assets/epigraphdb_efo_leukemia.json", "r") as f:
    epigraphdb_data = json.load(f)
In [20]:
from pprint import pprint
pprint(epigraphdb_data["nodes"][:3])
pprint(epigraphdb_data["edges"][:3])
[{'id': 44,
  'label': 'Diagnoses - secondary ICD10: Y91.9 Alcohol involvement, not '
           'otherwise specified'},
 {'id': 37,
  'label': 'Diagnoses - secondary ICD9: 59979 Haematuria (not otherwise '
           'specified)'},
 {'id': 31,
  'label': 'Diagnoses - main ICD10: J40 Bronchitis, not specified as acute or '
           'chronic'}]
[{'from': 57, 'to': 69}, {'from': 57, 'to': 71}, {'from': 57, 'to': 70}]
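The JSON is a plain node list ({id, label}) and edge list ({from, to}), which map directly onto networkx's add_nodes_from and add_edges_from below.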
In [21]:
epigraphdb_graph = nx.Graph()
epigraphdb_graph.add_nodes_from([item["id"] for item in epigraphdb_data["nodes"]])
epigraphdb_graph.add_edges_from([(item["from"], item["to"]) for item in epigraphdb_data["edges"]])
nx.draw(epigraphdb_graph, with_labels=True, node_color="white")
In [22]:
epigraphdb_node2vec = Node2Vec(
    epigraphdb_graph, dimensions=EMBEDDING_DIM, walk_length=WALK_LENGTH, num_walks=NUM_WALKS
)
Computing transition probabilities: 100%|██████████| 91/91 [00:00<00:00, 7093.80it/s]
Generating walks (CPU: 1): 100%|██████████| 80/80 [00:09<00:00,  8.67it/s]
In [23]:
epigraphdb_model = epigraphdb_node2vec.fit(window=WINDOW_SIZE, min_count=0)
In [24]:
plot_pca(epigraphdb_model)
In [25]:
epigraphdb_model.wv.most_similar("24")
Out[25]:
[('45', 0.7994406819343567),
 ('2', 0.7983152866363525),
 ('20', 0.7964375019073486),
 ('47', 0.7630603909492493),
 ('8', 0.7575975656509399),
 ('13', 0.7530874013900757),
 ('50', 0.7519896626472473),
 ('1', 0.746828019618988),
 ('33', 0.7092547416687012),
 ('9', 0.7086656093597412)]
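The node IDs above are opaque; a small helper (not part of the notebook's pipeline, just reusing the epigraphdb_data already loaded) maps them back to their labels:

# map node IDs (strings in the gensim vocab) back to their EpiGraphDB labels
id_to_label = {item["id"]: item["label"] for item in epigraphdb_data["nodes"]}
for node_id, score in epigraphdb_model.wv.most_similar("24"):
    print(f"{score:.3f}  {id_to_label[int(node_id)]}")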
In [26]:
from sklearn.manifold import TSNE

def plot_tsne(model):
    # non-linear 2-D projection of the node embeddings
    X = model.wv[model.wv.vocab]
    result = TSNE(n_components=2).fit_transform(X)

    pyplot.scatter(result[:, 0], result[:, 1])
    words = list(model.wv.vocab)
    for i, word in enumerate(words):
        pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
    pyplot.show()

plot_tsne(epigraphdb_model)
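Unlike PCA, t-SNE is non-linear and prioritises preserving local neighbourhoods, so on a larger graph like this one it tends to separate communities more cleanly, at the cost of distorting global distances.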