Source code for gemben.evaluation.evaluate_node_classification

try: import cPickle as pickle
except: import pickle
from sklearn import model_selection as sk_ms
from sklearn.multiclass import OneVsRestClassifier as oneVr
from sklearn.linear_model import LogisticRegression as lr
# from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import numpy as np
import pdb


class TopKRanker(oneVr):
    """One-vs-rest classifier that predicts the k most probable labels per row.

    Instead of thresholding probabilities, ``predict`` is told how many
    labels each sample should receive and marks the highest-scoring ones.
    """

    def predict(self, X, top_k_list):
        """Return the prediction for the top k node labels of each sample.

        Args:
            X (Vector): Embedding of the nodes, one row per node.
            top_k_list (List): Number of labels to predict for each row of
                ``X``; must have exactly ``X.shape[0]`` entries.

        Returns:
            Numpy Array: Array of shape ``(X.shape[0], len(self.classes_))``
            holding 1 for every predicted label and 0 elsewhere. Rows whose
            ``k`` is zero or negative stay all-zero.

        Raises:
            AssertionError: If ``top_k_list`` does not have one entry per
                row of ``X``.
        """
        assert X.shape[0] == len(top_k_list), \
            'top_k_list must contain one entry per row of X'
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        prediction = np.zeros((X.shape[0], self.classes_.shape[0]))
        for i, k in enumerate(top_k_list):
            k = int(k)
            if k <= 0:
                # ``argsort()[-0:]`` is the same as ``argsort()[0:]`` and
                # would select *every* label, so a sample that should get
                # no labels must be skipped explicitly.
                continue
            probs_ = probs[i, :]
            # argsort is ascending, so the last k entries are the k most
            # probable classes.
            labels = self.classes_[probs_.argsort()[-k:]].tolist()
            for label in labels:
                # The class value itself is used as the column index.
                prediction[i, label] = 1
        return prediction


def evaluateNodeClassification(X, Y, test_ratio):
    """Evaluate node classification on one random train/test split.

    The data is split with ``train_test_split``, a ``TopKRanker`` wrapping
    logistic regression is fitted on the training part, and for every test
    node as many labels are predicted as the node has true labels.

        Args:
            X (Vector) : Embedding values of the nodes.
            Y (Int) : Label indicator matrix of the nodes (dense, or sparse
                with a ``toarray`` method).
            test_ratio (Float): Ratio to split the training and testing nodes.

        Returns:
            Tuple: ``(micro, macro)`` F1 scores on the test split. If the
            model cannot be fitted or used for prediction, the scores are
            computed against an all-zero prediction.
    """
    X_train, X_test, Y_train, Y_test = sk_ms.train_test_split(
        X,
        Y,
        test_size=test_ratio
    )
    # Number of true labels per test node; sparse matrices are densified
    # first, anything without ``toarray`` is summed directly.
    try:
        top_k_list = list(Y_test.toarray().sum(axis=1))
    except AttributeError:
        top_k_list = list(Y_test.sum(axis=1))
    classif2 = TopKRanker(lr())
    try:
        classif2.fit(X_train, Y_train)
        prediction = classif2.predict(X_test, top_k_list)
    except Exception as err:
        # Best effort: a failed fit is reported and scored as "no labels
        # predicted" rather than aborting the whole experiment.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        print('Could not fit node classification model: %s' % (err,))
        prediction = np.zeros(Y_test.shape)
    micro = f1_score(Y_test, prediction, average='micro')
    macro = f1_score(Y_test, prediction, average='macro')
    return (micro, macro)


def expNC(X, Y, test_ratio_arr,
          rounds, res_pre, m_summ):
    """Run the node classification experiment and store its results.

    For every round and every test ratio the embedding is evaluated with
    ``evaluateNodeClassification``. A tab-separated summary of the first
    round is written to ``<res_pre>_<m_summ>.ncsumm`` and the scores of all
    rounds are pickled to ``<res_pre>_<m_summ>.nc``.

        Args:
            X (vector): Embedding values of the nodes.
            Y (Int): Labels of the nodes.
            test_ratio_arr (Float): The splits used for dividing the training and testing data.
            rounds (Int): The number of times the evaluation is repeated.
            res_pre (Str): Prefix to be used to save the result.
            m_summ (Str): String to denote the name of the summary file.

        Returns:
            List: For each round, the mean of the micro and macro F1 scores
            at the middle entry of ``test_ratio_arr``.
    """
    print('\tNode Classification:')
    micro = [None] * rounds
    macro = [None] * rounds

    # ``with`` guarantees the summary file is closed even if an evaluation
    # round raises.
    with open('%s_%s.ncsumm' % (res_pre, m_summ), 'w') as summ_file:
        summ_file.write('Method\t%s\n' % ('\t'.join(map(str, test_ratio_arr))))

        # Remove data points with no class
        # nonZeroIndices = np.where(np.any(Y!=0, axis=1))[0]
        # Y = Y[nonZeroIndices, :]
        # X = X[nonZeroIndices, :]
        for round_id in range(rounds):
            micro_round = [None] * len(test_ratio_arr)
            macro_round = [None] * len(test_ratio_arr)
            for i, test_ratio in enumerate(test_ratio_arr):
                micro_round[i], macro_round[i] = evaluateNodeClassification(
                    X,
                    Y,
                    test_ratio
                )
            micro[round_id] = micro_round
            macro[round_id] = macro_round

        # Only the first round is reported in the human-readable summary;
        # every round is kept in the pickle below.
        summ_file.write('Micro-F1 LR\t%s\n' % ('\t'.join(map(str, micro[0]))))
        summ_file.write('Macro-F1 LR\t%s\n' % ('\t'.join(map(str, macro[0]))))

    with open('%s_%s.nc' % (res_pre, m_summ), 'wb') as nc_file:
        pickle.dump([test_ratio_arr, micro, macro], nc_file)

    # Shape: (rounds, len(test_ratio_arr)).
    m_avg = (np.array(micro) + np.array(macro)) / 2.0

    # Return the average of micro and macro scores at middle value.
    # The middle index is taken along the test-ratio axis (columns), not
    # from the number of rounds (``len(m_avg)``), which could select the
    # wrong ratio or run past the last column.
    return list(m_avg[:, m_avg.shape[1] // 2])