Source code for gemben.evaluation.evaluate_link_prediction

try: import cPickle as pickle
except: import pickle
from gemben.evaluation import metrics
from gemben.utils import evaluation_util, graph_util
import numpy as np
import networkx as nx
import pdb
from time import time

import sys
# sys.path.insert(0, './')
from gemben.utils import embed_util


[docs]def evaluateStaticLinkPrediction(train_digraph, test_digraph,
                                 graph_embedding, X,
                                 node_l=None,
                                 sample_ratio_e=None,
                                 is_undirected=True,
                                store_predictions=1):
    """This function evaluates the static link prediction accuracy of the embedding algorithms.

        Args:
            train_digraph (Object): directed networkx graph object used for training the algorithm.
            test_digraph (Object): directed networkx graph object to be used for testing the algorithm.
            graph_embedding (object): Object of the embedding algorithm class defined in gemben/embedding.
            X (Vector): Embedding of the the nodes of the graph.
            node_l (Int): Number of nodes in the graph.
            sample_ratio_e (Float): The ratio used to sample the original graph for evaluation purpose.
            is_undirected (bool): Boolean flag to denote whether the graph is directed or not.
            store_prediction (Int): Stores the predicted values.

        Returns:
            Numpy Array: Consiting of Mean average precision and the precision curve values.
    """

    node_num = train_digraph.number_of_nodes()
    # evaluation
    if sample_ratio_e:
        eval_edge_pairs = evaluation_util.getRandomEdgePairs(
            node_num,
            sample_ratio_e,
            is_undirected
        )
    else:
        eval_edge_pairs = None
    if X is None:
        # If not an embedding approach, store the new subgraph
        graph_embedding.learn_embedding(train_digraph)
    estimated_adj = graph_embedding.get_reconstructed_adj(X, node_l)
    predicted_edge_list = evaluation_util.getEdgeListFromAdjMtx(
        estimated_adj,
        is_undirected=is_undirected,
        edge_pairs=eval_edge_pairs
    )
    filtered_edge_list = [e for e in predicted_edge_list if not train_digraph.has_edge(e[0], e[1])]
    if 'partition' in train_digraph.node[0]:
        filtered_edge_list = [e for e in predicted_edge_list if train_digraph.node[e[0]]['partition'] != train_digraph.node[e[1]]['partition']]
    pickle.dump(filtered_edge_list, open('gemben/nodeListMap/preds.pickle', 'wb'))
    pickle.dump(test_digraph, open('gemben/nodeListMap/test_graph.pickle', 'wb'))
    t1 = time()
    MAP = metrics.computeMAP(filtered_edge_list, test_digraph)
    t2 = time()
    prec_curv, _ = metrics.computePrecisionCurve(
        filtered_edge_list,
        test_digraph
    )
    t3 = time()
    print('MAP computation time: %f sec, prec: %f sec' % (t2 - t1, t3 - t2))
    return (MAP, prec_curv)


[docs]def expLPT(digraph, graph_embedding,
           res_pre, m_summ,
           K=100000,
           is_undirected=True):
    """This function is used to experiment graph reconstruction for temporally varying graphs.

        Args:
            digraph (Object): directed networkx graph object.
            graph_embedding (object): Object of the embedding algorithm class defined in gemben/embedding.
            res_pre (Str): Prefix to be used to save the result.
            m_summ (Str): String to denote the name of the summary file. 
            K (Int): The maximum value to be use to get the precision curves.
            is_undirected (bool): Boolean flag to denote whether the graph is directed or not.
    """
    print('\tLink Prediction Temporal')

    t1 = time()
    # learn graph embedding on whole graph
    X, _ = graph_embedding.learn_embedding(graph=digraph)
    t2 = time()
    print('\t\tTime taken to learn the embedding: %f sec' % (t2 - t1))
    estimated_adj = graph_embedding.get_reconstructed_adj(X)
    predicted_edge_list = evaluation_util.getEdgeListFromAdjMtx(
        estimated_adj,
        is_undirected=is_undirected
    )
    filtered_edge_list = [e for e in predicted_edge_list if not digraph.has_edge(e[0], e[1])]
    if 'partition' in digraph.node[0]:
        filtered_edge_list = [e for e in predicted_edge_list if digraph.node[e[0]]['partition'] != digraph.node[e[1]]['partition']]
    sorted_edges = sorted(predicted_edge_list, key=lambda x: x[2], reverse=True)
    print('\t\tPredicted edge list computed in %f sec. Saving edge list.' % (time() - t2))
    pickle.dump(
        sorted_edges[:K],
        open('%s_%s_predEdgeList.pickle' % (res_pre, m_summ), 'wb')
    )
    print('\t\tSaved edge list.')


[docs]def expLP(digraph, graph_embedding,
          n_sample_nodes_l, rounds,
          res_pre, m_summ, train_ratio=0.8,
          no_python=True, K=32768,
          is_undirected=True, sampling_scheme="u_rand"):
    """This function is used to experiment link prediction.

        Args:
            digraph (Object): directed networkx graph object.
            graph_embedding (object): Object of the embedding algorithm class defined in gemben/embedding.
            n_sampled_node_l (Int): Number of nodes in the graph.
            rounds (Int): The number of times the graph reconstruction is performed.
            res_pre (Str): Prefix to be used to save the result.
            train_ratio (Float): The split used for dividing the traing and testing data.
            no_python (Bool): Flag to denote if python is used. 
            m_summ (Str): String to denote the name of the summary file. 
            K (Int): The maximum value to be use to get the precision curves.
            sampling_scheme (Str): Sampling schme used to sample nodes to be reconstructed.
            is_undirected (bool): Boolean flag to denote whether the graph is directed or not.

        Returns:
            Numpy Array: Consisting of Mean average precision.
    """
    print('\tLink Prediction')
    MAP = {}
    prec_curv = {}
    n_sample_nodes_l = [min(int(n), digraph.number_of_nodes()) for n in n_sample_nodes_l]

    # Randomly hide (1-train_ratio)*100% of links
    node_num = digraph.number_of_nodes()
    train_digraph, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        digraph,
        train_ratio=train_ratio,
        is_undirected=is_undirected
    )

    # Ensure the resulting train subgraph is connected
    if not nx.is_connected(train_digraph.to_undirected()):
        train_digraph = max(
            nx.weakly_connected_component_subgraphs(train_digraph),
            key=len
        )
        tdl_nodes = train_digraph.nodes()
        nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
        train_digraph = nx.relabel_nodes(train_digraph, nodeListMap, copy=True)
        test_digraph = test_digraph.subgraph(tdl_nodes)
        ### unfroze the graph
        test_digraph = nx.Graph(test_digraph)
        ####nx.relabel_nodes(test_digraph, nodeListMap, copy=False)
        test_digraph = nx.relabel_nodes(test_digraph, nodeListMap, copy=True)
        
    pickle.dump(nodeListMap, open('gemben/nodeListMap/lp_lcc.pickle', 'wb'))

    t1 = time()
    # learn graph embedding on train subgraph
    print(
        'Link Prediction train graph n_nodes: %d, n_edges: %d' % (
            train_digraph.number_of_nodes(),
            train_digraph.number_of_edges())
    )
    X, _ = graph_embedding.learn_embedding(
        graph=train_digraph,
        no_python=no_python
    )
    if X is not None and X.shape[0] != train_digraph.number_of_nodes():
        pdb.set_trace()
    print('Time taken to learn the embedding: %f sec' % (time() - t1))

    # sample test graph for evaluation and store results
    node_l = None
    if not n_sample_nodes_l:
        n_sample_nodes_l = [node_num]
    summ_file = open('%s_%s_%s.lpsumm' % (res_pre, m_summ, sampling_scheme), 'w')
    summ_file.write('Method\t%s\n' % metrics.getMetricsHeader())
    for n_s in n_sample_nodes_l:
        n_s = int(n_s)
        n_s = min(n_s, train_digraph.number_of_nodes())
        MAP[n_s] = [None] * rounds
        prec_curv[n_s] = [None] * rounds
        for round_id in range(rounds):
            if sampling_scheme == "u_rand":
                train_digraph_s, node_l = graph_util.sample_graph(
                    train_digraph,
                    n_s
                )
            else:
                train_digraph_s, node_l = graph_util.sample_graph_rw(
                    train_digraph,
                    n_s
                )
            if X is not None:
                X_sub = X[node_l]
            else:
                X_sub = None
            test_digraph_s = test_digraph.subgraph(node_l)
            nodeListMap = dict(zip(node_l, range(len(node_l))))
            pickle.dump(nodeListMap, open('gemben/nodeListMap/lp_lcc_samp.pickle', 'wb'))
            test_digraph_s = nx.relabel_nodes(test_digraph_s, nodeListMap, copy=True)
            MAP[n_s][round_id], prec_curv[n_s][round_id] = \
                evaluateStaticLinkPrediction(train_digraph_s, test_digraph_s,
                                             graph_embedding, X_sub,
                                             node_l=node_l,
                                             is_undirected=is_undirected)
            prec_curv[n_s][round_id] = prec_curv[n_s][round_id][:K]
        summ_file.write('\tn_s:%d, %f/%f\t%s\n' % (
            n_s,
            np.mean(MAP[n_s]),
            np.std(MAP[n_s]),
            metrics.getPrecisionReport(
                prec_curv[n_s][0],
                len(prec_curv[n_s][0])
            )
        ))
    summ_file.close()
    #if len(prec_curv[-1][0]) < 100:
        #pdb.set_trace()
    pickle.dump([MAP, prec_curv, n_sample_nodes_l],
                open('%s_%s_%s_%s.lp' % (res_pre, m_summ, sampling_scheme, str(train_ratio)),
                     'wb'))
    print('Link prediction evaluation complete. Time: %f sec' % (time() - t1))
    # prec_curv2 = [p[4096] for p in prec_curv[prec_curv.keys()[0]]]
    return MAP[list(MAP.keys())[0]]  # prec_curv2