Outline



From sentence embeddings to graph embeddings




import numpy as np
import networkx as nx
# Toy graph: nodes 1-5 form a path and all attach to hub 6; nodes 9-13 form a
# path and all attach to hub 8; node 7 bridges the two hubs.
nx_graph = nx.Graph()
# Builtin ints keep the node ids free of NumPy scalar types.
nodes = list(range(1, 14))
# fmt: off
edges = [
    (1, 2), (2, 3), (3, 4), (4, 5),
    (1, 6), (2, 6), (3, 6), (4, 6), (5, 6),
    (6, 7),
    (7, 8),
    (9, 8), (10, 8), (11, 8), (12, 8), (13, 8),
    (9, 10), (10, 11), (11, 12), (12, 13),
]
# fmt: on
nx_graph.add_nodes_from(nodes)
nx_graph.add_edges_from(edges)
# Seed the spring layout so the figure is identical on every run.
nx.draw(
    nx_graph,
    pos=nx.spring_layout(nx_graph, seed=123),
    with_labels=True,
    node_color="white",
)
# https://github.com/phanein/deepwalk/blob/master/deepwalk/__main__.py
import random
import deepwalk
from deepwalk import walks as serialized_walks
from deepwalk.skipgram import Skipgram
from gensim.models import Word2Vec
from six import iterkeys
# Copy the networkx adjacency into DeepWalk's own graph type, whose entries
# are per-node neighbour lists.
deepwalk_graph = deepwalk.graph.Graph()
for node in nx_graph.nodes():
    # Iterating the adjacency view yields the neighbour ids directly.
    for neighbour in nx_graph[node]:
        deepwalk_graph[node].append(neighbour)
# DeepWalk's own normalisation step for undirected graphs.
deepwalk_graph.make_undirected()
deepwalk_graph
Graph(list,
{1: [2, 6],
2: [1, 3, 6],
3: [2, 4, 6],
4: [3, 5, 6],
5: [4, 6],
6: [1, 2, 3, 4, 5, 7],
7: [6, 8],
8: [7, 9, 10, 11, 12, 13],
9: [8, 10],
10: [8, 9, 11],
11: [8, 10, 12],
12: [8, 11, 13],
13: [8, 12]})
# Hyper-parameters shared by the DeepWalk and node2vec runs.
NUM_WALKS = 80  # random walks started from every node
WALK_LENGTH = 40  # number of nodes in a single random walk
WINDOW_SIZE = 10  # skip-gram context window
EMBEDDING_DIM = 128  # dimensionality of the learned node vectors
# Sample the random-walk corpus; the fixed seed makes the walks reproducible.
walk_rng = random.Random(123)
walks = deepwalk.graph.build_deepwalk_corpus(
    deepwalk_graph,
    num_paths=NUM_WALKS,
    path_length=WALK_LENGTH,
    alpha=0,
    rand=walk_rng,
)
# Convert once and reuse the array for both the content and the shape print.
walks_array = np.array(walks)
print(walks_array)
print(walks_array.shape)
[['9' '10' '11' ... '12' '13' '8'] ['6' '5' '6' ... '8' '10' '11'] ['8' '10' '11' ... '9' '10' '9'] ... ['13' '8' '10' ... '6' '4' '6'] ['6' '4' '5' ... '7' '8' '12'] ['8' '12' '13' ... '9' '10' '8']] (1040, 40)
# Train skip-gram (sg=1) with hierarchical softmax (hs=1) on the walks: each
# walk plays the role of a sentence and each node id the role of a word.
word2vec_options = {
    "size": EMBEDDING_DIM,
    "window": WINDOW_SIZE,
    "min_count": 0,  # keep every node, however rarely it was visited
    "sg": 1,
    "hs": 1,
    "workers": 1,
}
model = Word2Vec(walks, **word2vec_options)
# nodes, or "vocabulary"
model.wv.vocab
{'9': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6810>,
'10': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6890>,
'11': <gensim.models.keyedvectors.Vocab at 0x1a1d7b68d0>,
'8': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6910>,
'7': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6990>,
'6': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6a10>,
'1': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6a50>,
'3': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6a90>,
'2': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6950>,
'4': <gensim.models.keyedvectors.Vocab at 0x1a1d7b69d0>,
'5': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6ad0>,
'13': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6b10>,
'12': <gensim.models.keyedvectors.Vocab at 0x1a1d7b6b50>}
# "word" embeddings: the learned vector for node "1" (node ids are string keys
# in the vocabulary)
model.wv["1"]
array([-0.00095934, -0.14086728, 0.07352262, 0.13970056, -0.0497438 ,
-0.2926532 , -0.06589775, -0.10685308, 0.18745854, -0.1012044 ,
-0.03285081, 0.29395282, -0.01746033, -0.04551164, -0.07437321,
0.00040195, 0.08093731, 0.04620257, 0.05116988, 0.02569078,
-0.12490681, 0.02504689, 0.1027611 , -0.08071101, -0.24925862,
0.12331013, -0.13090487, -0.1167615 , 0.15161267, 0.00957596,
-0.12357164, 0.08300231, 0.0844809 , -0.12852252, -0.20644854,
0.11501899, 0.26970205, 0.07551242, -0.11944196, -0.06687455,
-0.1480233 , -0.18081237, 0.17705128, -0.07423408, -0.08211829,
0.26821792, 0.0630434 , -0.23834868, -0.00506505, 0.07663959,
-0.08388641, 0.1582118 , -0.07193353, 0.01382565, 0.00668014,
0.01537236, -0.07911585, 0.09093268, 0.04782807, -0.04677264,
-0.23164956, -0.17346524, 0.12032165, -0.0476453 , -0.13943444,
-0.1260258 , 0.13450332, 0.10808325, -0.01826159, -0.3575986 ,
0.15718068, -0.04420409, -0.11785912, 0.10133453, -0.19064085,
-0.20066042, 0.08204125, 0.06170731, 0.16305035, -0.13007683,
-0.03484008, -0.14941491, -0.09808226, 0.08392007, 0.046721 ,
0.01600912, -0.07195069, 0.01543452, 0.13940337, -0.07913083,
-0.184816 , -0.0933992 , -0.07203937, -0.07271566, 0.11571857,
0.00552847, 0.00289924, -0.0523177 , 0.12037521, -0.09778723,
-0.03153525, -0.05874648, 0.08392701, 0.19099884, -0.00918258,
0.05699351, -0.04814161, -0.03177148, 0.00090648, -0.06779662,
-0.19197589, 0.12392153, 0.03059392, -0.21022129, -0.02727187,
0.13549924, 0.05740344, 0.00623336, -0.09875766, 0.04643035,
-0.10097225, 0.10052127, 0.06828507, 0.10290339, 0.1309135 ,
0.01477405, 0.08474117, 0.21653458], dtype=float32)
# Rank the other nodes by similarity to node "1". Query through `.wv`: calling
# `most_similar` on the model itself is deprecated in gensim.
model.wv.most_similar("1")
[('3', 0.9698188304901123),
('6', 0.9323770999908447),
('2', 0.9054238796234131),
('4', 0.8182910084724426),
('5', 0.77535080909729),
('7', 0.6318682432174683),
('9', 0.05110911279916763),
('11', 0.05021246522665024),
('10', 0.03185462951660156),
('8', 0.02739347517490387)]
# Redraw the graph for comparison with the similarity ranking; the seeded
# spring layout makes the figure reproducible across runs.
nx.draw(
    nx_graph,
    pos=nx.spring_layout(nx_graph, seed=123),
    with_labels=True,
    node_color="white",
)
# Visualisation of "word" embeddings using PCA
from sklearn.decomposition import PCA
from matplotlib import pyplot
def plot_pca(model):
    """Scatter-plot the embeddings projected onto two principal components.

    Args:
        model: Trained gensim ``Word2Vec`` model. Every key in
            ``model.wv.vocab`` is plotted and used as that point's label.
    """
    # Materialise the vocabulary once so vector rows and labels share one order.
    words = list(model.wv.vocab)
    # Look vectors up through `.wv`; indexing the model itself is deprecated.
    vectors = model.wv[words]
    result = PCA(n_components=2).fit_transform(vectors)
    pyplot.scatter(result[:, 0], result[:, 1])
    for (x, y), word in zip(result, words):
        pyplot.annotate(word, xy=(x, y))
    pyplot.show()
# 2-D PCA view of the embeddings learned from the DeepWalk walks.
plot_pca(model)
Search strategies

Node similarities in embedding space

2nd order random walk:


from node2vec import Node2Vec
# Set up node2vec on the toy graph with the same hyper-parameters as the
# DeepWalk run.
node2vec = Node2Vec(
    nx_graph,
    dimensions=EMBEDDING_DIM,
    walk_length=WALK_LENGTH,
    num_walks=NUM_WALKS,
)
Computing transition probabilities: 100%|██████████| 13/13 [00:00<00:00, 7282.75it/s] Generating walks (CPU: 1): 100%|██████████| 80/80 [00:01<00:00, 63.13it/s]
# Show the sampled walks together with the corpus dimensions; the array is
# built once and reused for both.
node2vec_walks = np.array(node2vec.walks)
print("walks: \n", node2vec_walks, "\n\n", "shape: ", node2vec_walks.shape)
walks: [['9' '10' '11' ... '3' '4' '5'] ['6' '3' '4' ... '2' '1' '2'] ['12' '11' '12' ... '1' '2' '6'] ... ['5' '4' '6' ... '8' '11' '8'] ['10' '11' '12' ... '3' '6' '3'] ['11' '8' '13' ... '8' '9' '10']] shape: (1040, 40)
# Fit the embedding model on the node2vec walks.
# NOTE(review): this rebinds `model`, replacing the DeepWalk model trained earlier.
model = node2vec.fit(window=WINDOW_SIZE, min_count=0)
model.wv.vocab
{'9': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6710>,
'10': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6750>,
'11': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6790>,
'8': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6810>,
'7': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6890>,
'6': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6910>,
'2': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6950>,
'5': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6990>,
'4': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6850>,
'3': <gensim.models.keyedvectors.Vocab at 0x1a1e4f68d0>,
'1': <gensim.models.keyedvectors.Vocab at 0x1a1e4f69d0>,
'12': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6a10>,
'13': <gensim.models.keyedvectors.Vocab at 0x1a1e4f6a50>}
# 2-D PCA view of the node2vec embeddings (`model` now refers to the node2vec fit).
plot_pca(model)
import json
from pprint import pprint

# Read the graph export. JSON is UTF-8, so state the encoding explicitly
# instead of relying on the platform default.
with open("assets/epigraphdb_efo_leukemia.json", "r", encoding="utf-8") as f:
    epigraphdb_data = json.load(f)

# Peek at the first few node and edge records.
pprint(epigraphdb_data["nodes"][:3])
pprint(epigraphdb_data["edges"][:3])
[{'id': 44,
'label': 'Diagnoses - secondary ICD10: Y91.9 Alcohol involvement, not '
'otherwise specified'},
{'id': 37,
'label': 'Diagnoses - secondary ICD9: 59979 Haematuria (not otherwise '
'specified)'},
{'id': 31,
'label': 'Diagnoses - main ICD10: J40 Bronchitis, not specified as acute or '
'chronic'}]
[{'from': 57, 'to': 69}, {'from': 57, 'to': 71}, {'from': 57, 'to': 70}]
# Build an undirected graph from the node ids and the from/to edge records.
epigraphdb_graph = nx.Graph()
epigraphdb_graph.add_nodes_from(node["id"] for node in epigraphdb_data["nodes"])
epigraphdb_graph.add_edges_from(
    (edge["from"], edge["to"]) for edge in epigraphdb_data["edges"]
)
nx.draw(epigraphdb_graph, with_labels=True, node_color="white")
# node2vec on the EpiGraphDB graph, reusing the hyper-parameters from the toy example.
epigraphdb_node2vec = Node2Vec(
    epigraphdb_graph,
    dimensions=EMBEDDING_DIM,
    walk_length=WALK_LENGTH,
    num_walks=NUM_WALKS,
)
Computing transition probabilities: 100%|██████████| 91/91 [00:00<00:00, 7093.80it/s] Generating walks (CPU: 1): 100%|██████████| 80/80 [00:09<00:00, 8.67it/s]
# Fit the embedding model on the EpiGraphDB walks and inspect it.
epigraphdb_model = epigraphdb_node2vec.fit(window=WINDOW_SIZE, min_count=0)
plot_pca(epigraphdb_model)
# Query through `.wv`: calling `most_similar` on the model itself is deprecated
# in gensim. Node ids are string keys, hence "24".
epigraphdb_model.wv.most_similar("24")
[('45', 0.7994406819343567),
('2', 0.7983152866363525),
('20', 0.7964375019073486),
('47', 0.7630603909492493),
('8', 0.7575975656509399),
('13', 0.7530874013900757),
('50', 0.7519896626472473),
('1', 0.746828019618988),
('33', 0.7092547416687012),
('9', 0.7086656093597412)]
from sklearn.manifold import TSNE
def plot_tsne(model, random_state=0):
    """Scatter-plot the embeddings after a two-dimensional t-SNE projection.

    Args:
        model: Trained gensim ``Word2Vec`` model. Every key in
            ``model.wv.vocab`` is plotted and used as that point's label.
        random_state: Seed forwarded to ``TSNE`` so repeated calls produce
            the same figure.
    """
    # Materialise the vocabulary once so vector rows and labels share one order.
    words = list(model.wv.vocab)
    # Look vectors up through `.wv`; indexing the model itself is deprecated.
    vectors = model.wv[words]
    result = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)
    pyplot.scatter(result[:, 0], result[:, 1])
    for (x, y), word in zip(result, words):
        pyplot.annotate(word, xy=(x, y))
    pyplot.show()
# t-SNE view of the node2vec embeddings for the EpiGraphDB graph.
plot_tsne(epigraphdb_model)