Source code for gemben.evaluation.evaluate_node_classification

try: import cPickle as pickle
except: import pickle
from sklearn import model_selection as sk_ms
from sklearn.multiclass import OneVsRestClassifier as oneVr
from sklearn.linear_model import LogisticRegression as lr
# from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import numpy as np
import pdb


class TopKRanker(oneVr):
    """One-vs-rest classifier that assigns a caller-chosen number of labels.

    Rather than thresholding each binary classifier, ``predict`` marks, for
    every sample, the ``k`` classes with the highest predicted probability,
    where ``k`` is given per sample by the caller.
    """

    def predict(self, X, top_k_list):
        """Return a 0/1 indicator matrix holding the top-k labels per sample.

        Args:
            X: Feature matrix with one row per sample; must expose ``shape``
                and be accepted by ``predict_proba``.
            top_k_list: One entry per row of ``X`` giving how many labels to
                assign to that row. Each entry is converted with ``int``;
                entries that are zero or negative assign no label.

        Returns:
            numpy.ndarray: Array of shape
            ``(X.shape[0], self.classes_.shape[0])`` containing 1 in the
            columns of the selected labels and 0 elsewhere. The entries of
            ``self.classes_`` are used directly as column indices.

        Raises:
            ValueError: If ``top_k_list`` does not have exactly one entry per
                row of ``X``.
        """
        n_samples = X.shape[0]
        if n_samples != len(top_k_list):
            # Explicit check instead of ``assert`` so that the validation
            # still runs when Python is started with ``-O``.
            raise ValueError(
                'top_k_list must have one entry per row of X '
                '(got %d entries for %d rows)' % (len(top_k_list), n_samples)
            )

        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        prediction = np.zeros((n_samples, self.classes_.shape[0]))

        for i, k in enumerate(top_k_list):
            k = int(k)
            if k <= 0:
                # ``order[-0:]`` is the *whole* array, so without this guard a
                # sample that should receive no label would receive all of
                # them.
                continue
            # argsort is ascending, so the last k positions are the k most
            # probable classes.
            order = probs[i, :].argsort()
            for label in self.classes_[order[-k:]].tolist():
                prediction[i, label] = 1
        return prediction
def evaluateNodeClassification(X, Y, test_ratio):
    """Evaluate node classification on a random train/test split.

    A ``TopKRanker`` wrapping logistic regression is fitted on the training
    part; for every test node it predicts as many labels as that node truly
    has, and the prediction is scored with micro- and macro-averaged F1.

    Args:
        X: Node embeddings, one row per node.
        Y: Label indicator matrix, one row per node. Either an object with a
            ``toarray()`` method (e.g. a sparse matrix) or one supporting
            ``sum(axis=1)`` directly.
        test_ratio: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        tuple: ``(micro, macro)`` F1 scores on the test split. If the model
        cannot be fitted or applied, an all-zero prediction is scored
        instead of raising.
    """
    X_train, X_test, Y_train, Y_test = sk_ms.train_test_split(
        X, Y, test_size=test_ratio
    )

    # Number of true labels per test node; this is the "k" handed to the
    # ranker. Densify first when the split produced a sparse-like object.
    if hasattr(Y_test, 'toarray'):
        label_counts = Y_test.toarray().sum(axis=1)
    else:
        label_counts = Y_test.sum(axis=1)
    # Flatten so that matrix-typed inputs (whose row sums are 2-D) still
    # yield one scalar per node.
    top_k_list = list(np.asarray(label_counts).ravel())

    classif2 = TopKRanker(lr())
    try:
        classif2.fit(X_train, Y_train)
        prediction = classif2.predict(X_test, top_k_list)
    except Exception as err:
        # Deliberate best-effort: a failed fit is scored as "predict
        # nothing". Only ``Exception`` is caught so that Ctrl-C and
        # interpreter exit are not swallowed, and the cause is reported.
        print('Could not fit node classification model: %s' % (err,))
        prediction = np.zeros(Y_test.shape)

    micro = f1_score(Y_test, prediction, average='micro')
    macro = f1_score(Y_test, prediction, average='macro')
    return (micro, macro)
def expNC(X, Y, test_ratio_arr, rounds, res_pre, m_summ):
    """Run the node-classification experiment and store its results.

    For every round and every test ratio, ``evaluateNodeClassification`` is
    called once. Two files are written:

    * ``<res_pre>_<m_summ>.ncsumm``: tab-separated text summary containing
      the test ratios and the micro/macro F1 scores of the first round only.
    * ``<res_pre>_<m_summ>.nc``: pickle of ``[test_ratio_arr, micro, macro]``
      where ``micro`` and ``macro`` hold one list of scores per round.

    Args:
        X: Node embeddings, one row per node.
        Y: Label indicator matrix, one row per node.
        test_ratio_arr: Sequence of test-split ratios to evaluate.
        rounds (int): Number of times the evaluation is repeated.
        res_pre (str): Path prefix of the result files.
        m_summ (str): Name fragment appended to ``res_pre`` in file names.

    Returns:
        list: One value per round, the mean of micro and macro F1 at the
        middle entry of ``test_ratio_arr``. Empty when there are no rounds
        or no test ratios.
    """
    print('\tNode Classification:')
    micro = [None] * rounds
    macro = [None] * rounds

    # NOTE: nodes without any label are not filtered out before evaluation.
    # The summary file is opened before the rounds run, as in the original
    # ordering, and the context manager closes it even if a round fails.
    with open('%s_%s.ncsumm' % (res_pre, m_summ), 'w') as summ_file:
        summ_file.write(
            'Method\t%s\n' % ('\t'.join(map(str, test_ratio_arr)))
        )
        for round_id in range(rounds):
            scores = [
                evaluateNodeClassification(X, Y, test_ratio)
                for test_ratio in test_ratio_arr
            ]
            micro[round_id] = [pair[0] for pair in scores]
            macro[round_id] = [pair[1] for pair in scores]

        # Only the first round is summarised in the text file; with zero
        # rounds there is nothing to report.
        if micro:
            summ_file.write(
                'Micro-F1 LR\t%s\n' % ('\t'.join(map(str, micro[0])))
            )
            summ_file.write(
                'Macro-F1 LR\t%s\n' % ('\t'.join(map(str, macro[0])))
            )

    with open('%s_%s.nc' % (res_pre, m_summ), 'wb') as res_file:
        pickle.dump([test_ratio_arr, micro, macro], res_file)

    # Shape is (rounds, len(test_ratio_arr)) when both are non-empty.
    m_avg = (np.array(micro) + np.array(macro)) / 2.0
    if m_avg.ndim < 2 or m_avg.shape[1] == 0:
        return []
    # The middle *test ratio* is a column index, so it must be derived from
    # the number of columns, not from the number of rounds (len(m_avg)).
    return list(m_avg[:, m_avg.shape[1] // 2])