Source code for pgmpy.metrics.correlation_score
from itertools import combinations
import pandas as pd
from sklearn.metrics import f1_score
from pgmpy.base import DAG
from pgmpy.ci_tests import get_ci_test
from pgmpy.metrics import _BaseUnsupervisedMetric
[docs]
class CorrelationScore(_BaseUnsupervisedMetric):
    """
    Score how well a model structure represents the correlations in the data.

    The model doesn't need to be parameterized for this score.

    A Bayesian Network or DAG has the d-connection property, which can be used
    to determine which variables are correlated according to the model. This
    metric uses the d-connection/d-separation property to compare the model
    with the variable correlations in a given dataset. For every pair of
    variables in the graph, a correlation test (specified by the `ci_test`
    argument) is done. We say that two variables are correlated if the test's
    p-value < significance_level. The same pair of variables is then checked
    for d-connection in the network structure. Finally, the metric specified
    by `score` is computed using the correlation test results as the true
    values and the graph-based results as the predicted values.

    Absence of correlation/d-separation is considered the positive class for
    computing the metric.

    Parameters
    ----------
    ci_test: str or function
        The statistical test to use for determining whether the variables in
        data are correlated or not. For discrete variables, the options are:
        1) chi_square 2) g_sq 3) log_likelihood 4) freeman_tuckey
        5) modified_log_likelihood 6) neyman 7) cressie_read. For continuous
        variables only one test is available: 1) pearsonr. A function with the
        signature fun(X, Y, Z, data) can also be passed which returns True for
        uncorrelated and False otherwise. The value is handed to
        `get_ci_test` unchanged (including the default, None), which resolves
        it to the test that is actually run.

    significance_level: float
        A value between 0 and 1. If p_value < significance_level, the
        variables are considered correlated; otherwise they are considered
        uncorrelated.

    score: fun (default: f1-score)
        Any classification scoring metric from scikit-learn.
        https://scikit-learn.org/stable/modules/classes.html#classification-metrics
        It is called as ``score(y_true=..., y_pred=...)``.

    return_summary: boolean (default: False)
        If True, returns a dataframe with details for each of the conditions
        checked instead of the metric.

    Returns
    -------
    The specified metric: float
        The metric specified by the `score` argument. By default returns the
        f1-score.

    Summary: pandas.DataFrame
        Only if `return_summary=True`. One row per pair of variables with the
        columns `var1`, `var2`, `stat_test` and `d_connected`. `stat_test` is
        the result of the correlation test. Both boolean columns follow the
        positive-class convention above: note that, despite its name,
        `d_connected` is True when the pair is d-separated in the graph and
        False when it is d-connected.

    Examples
    --------
    >>> from pgmpy.example_models import load_model
    >>> from pgmpy.metrics import CorrelationScore
    >>> alarm = load_model("bnlearn/alarm")
    >>> data = alarm.simulate(int(1e4))
    >>> scorer = CorrelationScore(
    ...     ci_test="chi_square", significance_level=0.05, return_summary=False
    ... )
    >>> scorer(X=data, causal_graph=alarm)
    0.911957950065703
    >>> scorer = CorrelationScore(
    ...     ci_test="chi_square", significance_level=0.05, return_summary=True
    ... )
    >>> scorer(X=data, causal_graph=alarm).head()
          var1         var2  stat_test  d_connected
    0  HISTORY          CVP      False        False
    1  HISTORY         PCWP      False        False
    2  HISTORY  HYPOVOLEMIA       True         True
    3  HISTORY   LVEDVOLUME      False        False
    4  HISTORY    LVFAILURE      False        False
    """

    _tags = {
        "name": "correlation_score",
        "requires_true_graph": False,
        "requires_data": True,
        "lower_is_better": False,
        "supported_graph_types": (DAG,),
        "is_default": True,
    }

    def __init__(
        self,
        ci_test=None,
        score=f1_score,
        significance_level=0.05,
        return_summary=False,
    ):
        # Arguments are stored as-is; `score` is validated later, in `_evaluate`.
        self.ci_test = ci_test
        self.score = score
        self.significance_level = significance_level
        self.return_summary = return_summary

    def _evaluate(self, X, causal_graph):
        """
        Compare pairwise correlation tests on `X` with d-separation in `causal_graph`.

        Parameters
        ----------
        X:
            The dataset the correlation tests are run on. It is passed to
            `get_ci_test` as `data`; presumably a pandas.DataFrame whose
            columns are the graph's nodes -- confirm against the caller.

        causal_graph:
            The graph to evaluate. It must provide `number_of_nodes()`,
            `nodes()` and `is_dconnected(start=..., end=...)`.

        Returns
        -------
        pandas.DataFrame if `self.return_summary` is True, otherwise whatever
        `self.score` returns for the collected results.

        Raises
        ------
        ValueError
            If the graph has fewer than 2 nodes, or if `self.score` is not
            callable.
        """
        # Step 1: Validate inputs
        num_nodes = causal_graph.number_of_nodes()
        if num_nodes < 2:
            raise ValueError(
                "The causal graph must have at least 2 nodes to compute the"
                f" correlation score. Got {num_nodes} node(s)."
            )
        if not callable(self.score):
            raise ValueError(f"score should be scikit-learn classification metric. Got {self.score}")

        ci_test = get_ci_test(test=self.ci_test, data=X)

        # Step 2: Test every unordered pair of variables, both statistically
        # and structurally.
        rows = []
        for var1, var2 in combinations(causal_graph.nodes(), 2):
            # Marginal test (empty conditioning set); per the class docstring
            # the result is True when the pair is found to be uncorrelated.
            test_result = ci_test(
                X=var1,
                Y=var2,
                Z=[],
                significance_level=self.significance_level,
            )
            # Negated so that d-separation is the positive class, matching
            # the polarity of `test_result`.
            d_separated = not causal_graph.is_dconnected(start=var1, end=var2)
            rows.append(
                {
                    "var1": var1,
                    "var2": var2,
                    "stat_test": test_result,
                    # The column name is kept for backward compatibility even
                    # though the stored value is the d-separation flag.
                    "d_connected": d_separated,
                }
            )
        summary = pd.DataFrame(rows)

        # Step 3: Return summary or metric
        if self.return_summary:
            return summary
        return self.score(
            y_true=summary["stat_test"].values,
            y_pred=summary["d_connected"].values,
        )