Source code for pgmpy.metrics.correlation_score

from itertools import combinations

import pandas as pd
from sklearn.metrics import f1_score

from pgmpy.base import DAG
from pgmpy.ci_tests import get_ci_test
from pgmpy.metrics import _BaseUnsupervisedMetric


[docs] class CorrelationScore(_BaseUnsupervisedMetric): """ Score to compute how well the model structure represents the correlations in the data. The model doesn't need to be parameterized for this score. A Bayesian Network or DAG has d-connection property which can be used to determine which variables are correlated according to the model. This function uses this d-connection/d-separation property to compare the model with variable correlations in a given dataset. For every pair of variables in the dataset, a correlation test (specified by `test` argument) is done. We say that any two variables are correlated if the test's p-value < significance_level. The same pair of variables are then tested whether they are d-connected in the network structure or not. Finally, a metric specified by `score` is computed by using the correlation test as the true value and d-connections as predicted values. Absence of correlation/d-separation is considered as the positive class for computing the metrics. Parameters ---------- ci_test: str or function The statistical tests to use for determining whether the variables in data are correlated or not. For discrete variables, the options are: 1) chi_square 2) g_sq 3) log_likelihood 4) freeman_tuckey 5) modified_log_likelihood 6) neyman 7) cressie_read. For continuous variables only one test is available: 1) pearsonr. A function with the signature fun(X, Y, Z, data) can also be passed which returns True for uncorrelated and False otherwise. significance_level: float A value between 0 and 1. If p_value < significance_level, the variables are considered uncorrelated. score: fun (default: f1-score) Any classification scoring metric from scikit-learn. https://scikit-learn.org/stable/modules/classes.html#classification-metrics return_summary: boolean (default: False) If True, returns a dataframe with details for each of the conditions checked. Returns ------- The specified metric: float The metric specified by the `score` argument. By defaults returns the f1-score. Examples -------- >>> from pgmpy.example_models import load_model >>> from pgmpy.metrics import CorrelationScore >>> alarm = load_model("bnlearn/alarm") >>> data = alarm.simulate(int(1e4)) >>> scorer = CorrelationScore( ... ci_test="chi_square", significance_level=0.05, return_summary=False ... ) >>> scorer(X=data, causal_graph=alarm) 0.911957950065703 >>> scorer = CorrelationScore( ... ci_test="chi_square", significance_level=0.05, return_summary=True ... ) >>> scorer(X=data, causal_graph=alarm).head() var1 var2 stat_test d_connected 0 HISTORY CVP False False 1 HISTORY PCWP False False 2 HISTORY HYPOVOLEMIA True True 3 HISTORY LVEDVOLUME False False 4 HISTORY LVFAILURE False False """ _tags = { "name": "correlation_score", "requires_true_graph": False, "requires_data": True, "lower_is_better": False, "supported_graph_types": (DAG,), "is_default": True, } def __init__( self, ci_test=None, score=f1_score, significance_level=0.05, return_summary=False, ): self.ci_test = ci_test self.score = score self.significance_level = significance_level self.return_summary = return_summary def _evaluate(self, X, causal_graph): # Step 1: Validate inputs num_nodes = causal_graph.number_of_nodes() if num_nodes < 2: raise ValueError( "The causal graph must have at least 2 nodes to compute the" f" correlation score. Got {num_nodes} node(s)." ) if not callable(self.score): raise ValueError(f"score should be scikit-learn classification metric. Got {self.score}") ci_test = get_ci_test(test=self.ci_test, data=X) # Step 2: Create a dataframe of every 2 combination of variables results = [] for i, j in combinations(causal_graph.nodes(), 2): test_result = ci_test( X=i, Y=j, Z=[], significance_level=self.significance_level, ) d_connected = not causal_graph.is_dconnected(start=i, end=j) results.append( { "var1": i, "var2": j, "stat_test": test_result, "d_connected": d_connected, } ) results = pd.DataFrame(results) # Step 3: Return summary or metric if self.return_summary: return results else: return self.score(y_true=results["stat_test"].values, y_pred=results["d_connected"].values)