After struggling with the facebookresearch / PyTorch-BigGraph project and its difficult API, I managed to get a grip on how to run it (thanks to a stand-alone simple example).
My system constraints do not allow me to train the dense (embedding) representation of all edges at once. From time to time I need to load past embeddings and train the model using both new edges and existing nodes; note that the nodes in the past and new edge lists do not necessarily overlap.
I tried to understand how to do this from the documentation (see the context section linked above), so far with no success.
Following is stand-alone PBG code that turns batch_edges into an embedded node list; however, I need it to also use the pre-trained node list past_trained_nodes.
import os
import shutil
from pathlib import Path

from torchbiggraph.config import parse_config
from torchbiggraph.converters.importers import TSVEdgelistReader, convert_input_data
from torchbiggraph.train import train
from torchbiggraph.util import SubprocessInitializer, setup_logging

# Embedding dimension and on-disk layout for the toy graph.
DIMENSION = 4
DATA_DIR = 'data'
GRAPH_PATH = DATA_DIR + '/output1.tsv'
MODEL_DIR = 'model'

# PBG configuration: a single entity type "n", one relation, no partitioning.
raw_config = dict(
    entity_path=DATA_DIR,
    edge_paths=[DATA_DIR + '/edges_partitioned'],
    checkpoint_path=MODEL_DIR,
    entities={"n": {"num_partitions": 1}},
    relations=[{"name": "doesnt_matter", "lhs": "n", "rhs": "n",
                "operator": "complex_diagonal"}],
    dynamic_relations=False,
    dimension=DIMENSION,
    global_emb=False,
    comparator="dot",
    num_epochs=7,
    num_uniform_negs=1000,
    loss_fn="softmax",
    lr=0.1,
    eval_fraction=0.,
)

batch_edges = [["A", "B"], ["B", "C"], ["C", "D"], ["D", "B"], ["B", "D"]]
# I want the model to use these pre-trained nodes. Note that node A exists
# in batch_edges while F does not; not all past nodes are known up front,
# as some are gained from data.
past_trained_nodes = {'A': [0.5, 0.3, 1.5, 8.1], 'F': [3, 0.6, 1.2, 4.3]}

# Start from a clean slate. ignore_errors=True replaces the bare
# `except: pass` (which swallowed *every* exception) while still
# tolerating a missing directory on the first run; DATA_DIR is used
# instead of the hard-coded 'data' literal for consistency.
shutil.rmtree(DATA_DIR, ignore_errors=True)
shutil.rmtree(MODEL_DIR, ignore_errors=True)

# Write the edge list as a TSV file (lhs<TAB>rhs per line).
os.makedirs(DATA_DIR, exist_ok=True)
with open(GRAPH_PATH, 'w') as f:
    f.writelines('\t'.join(edge) + '\n' for edge in batch_edges)

setup_logging()
config = parse_config(raw_config)
subprocess_init = SubprocessInitializer()
input_edge_paths = [Path(GRAPH_PATH)]
# Convert the TSV edge list into PBG's partitioned binary format.
convert_input_data(
    config.entities,
    config.relations,
    config.entity_path,
    config.edge_paths,
    input_edge_paths,
    TSVEdgelistReader(lhs_col=0, rel_col=None, rhs_col=1),
    dynamic_relations=config.dynamic_relations,
)
train(config, subprocess_init=subprocess_init)
How can I use my pre-trained nodes in the current model?
Thanks in advance!
Since torchbiggraph is file-based, you can modify the saved files to load pre-trained embeddings and to add new nodes. I wrote a function to achieve this:
import json

import h5py  # bug fix: the original snippet used h5py without importing it


def pretrained_and_new_nodes(pretrained_nodes, new_nodes, entity_name, data_dir, embeddings_path):
    """Patch a PBG checkpoint: overwrite pre-trained embeddings and append new nodes.

    pretrained_nodes:
        A dictionary mapping node name -> embedding (list of floats). Names
        not present in the graph are ignored unless listed in new_nodes.
    new_nodes:
        A list of new nodes; each new node must have an embedding in
        pretrained_nodes. If there are no new nodes, use [].
    entity_name:
        The entity's name, for example, WHATEVER_0.
    data_dir:
        The path to the files that record graph nodes and edges.
    embeddings_path:
        The path to the .h5 file of embeddings.

    Raises:
        ValueError: if a name in new_nodes has no embedding in
        pretrained_nodes. Validation happens BEFORE any file is touched,
        so a failure cannot leave the checkpoint half-modified (the
        original version rewrote the entity files first).
    """
    # Fail fast before mutating anything on disk.
    missing = [node for node in new_nodes if node not in pretrained_nodes]
    if missing:
        raise ValueError('new nodes missing an embedding in pretrained_nodes: %r' % missing)

    names_path = '%s/entity_names_%s.json' % (data_dir, entity_name)
    with open(names_path, 'r') as source:
        nodes = json.load(source)
    # Row index of each existing node in the embeddings dataset.
    index_of = {item: ind for ind, item in enumerate(nodes)}

    if not new_nodes:
        # No new nodes: patch the matching rows of the original .h5 in place.
        with h5py.File(embeddings_path, 'r+') as source:
            for node, embedding in pretrained_nodes.items():
                if node in index_of:
                    source['embeddings'][index_of[node]] = embedding
        return

    # New nodes: extend both the node-name list and the node count on disk
    # so the next training session sees the appended entities.
    extended = nodes + list(new_nodes)
    with open(names_path, 'w') as sink:
        json.dump(extended, sink)
    with open('%s/entity_count_%s.txt' % (data_dir, entity_name), 'w') as sink:
        sink.write('%i' % len(extended))

    # The embeddings dataset was not created as resizable, so we must read
    # everything out and rewrite the file (see
    # https://stackoverflow.com/a/47074545/8366805).
    with h5py.File(embeddings_path, 'r+') as source:
        embeddings = list(source['embeddings'])
        optimizer = list(source['optimizer'])
    for node, embedding in pretrained_nodes.items():
        if node in index_of:
            embeddings[index_of[node]] = embedding
    # Append new nodes in order, matching the order written to names_path.
    for node in new_nodes:
        embeddings.append(pretrained_nodes[node])

    # NOTE(review): mode 'w' truncates the file, so any file-level HDF5
    # attributes PBG stored in the checkpoint are dropped — confirm none
    # are required by downstream loading.
    with h5py.File(embeddings_path, 'w') as sink:
        sink.create_dataset('embeddings', data=embeddings)
        optimizer = [item.encode('ascii') for item in optimizer]
        sink.create_dataset('optimizer', data=optimizer)
Suppose that after you have trained a model (say, the stand-alone simple example you linked in your post), you want to change the learned embedding of node A
to [0.5, 0.3, 1.5, 8.1]
. Moreover, you also want to add a new node F
to the graph with embedding [3, 0.6, 1.2, 4.3]
(This newly added node F
has no connections with other nodes). You can run my function with
past_trained_nodes = {'A': [0.5, 0.3, 1.5, 8.1], 'F': [3, 0.6, 1.2, 4.3]}
pretrained_and_new_nodes(pretrained_nodes=past_trained_nodes,
new_nodes=['F'],
entity_name='WHATEVER_0',
data_dir='data/example_1',
embeddings_path='model_1/embeddings_WHATEVER_0.v7.h5')
After running this function, you can inspect the modified embeddings file embeddings_WHATEVER_0.v7.h5:
# Load every row of the 'embeddings' dataset to verify the patched values.
checkpoint_file = "model_1/embeddings_WHATEVER_0.v7.h5"
with h5py.File(checkpoint_file, "r") as h5:
    embeddings = list(h5['embeddings'])
embeddings
and you will see, the embedding of A
is changed, and also the embedding of F
is added (the order of the embeddings is consistent with the order of nodes in entity_names_WHATEVER_0.json
).
With the files modified, you can use the pre-trained nodes in a new training session.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With