Source code for pgmpy.ci_tests.pearsonr_equivalence
import numpy as np
import pandas as pd
from scipy import stats
from .pearsonr import Pearsonr
[docs]
class PearsonrEquivalence(Pearsonr):
r"""
Pearson equivalence test [1] for conditional independence on continuous data.
This test first computes the partial correlation coefficient :math:`\hat{\rho}_{XY \mid Z}` using :class:`Pearsonr`.
Let :math:`\delta` denote ``delta_threshold``. The Fisher transform is computed as:
.. math::
z_\rho = \operatorname{arctanh}(\hat{\rho}_{XY \mid Z}),
\qquad
z_\delta = \operatorname{arctanh}(\delta),
and defines
.. math::
c = \sqrt{n - |Z| - 3},
where :math:`n` is the sample size and :math:`|Z|` is the number of conditioning variables.
The test then performs a TOST (two one-sided tests) procedure for the equivalence hypothesis
.. math::
H_0: \rho_{XY \mid Z} \leq -\delta \;\; \text{or} \;\; \rho_{XY \mid Z} \geq \delta
\qquad \text{vs.} \qquad
H_1: -\delta < \rho_{XY \mid Z} < \delta.
The two one-sided test statistics are:
.. math::
T_{\mathrm{lower}} = c (z_\rho + z_\delta),
\qquad
T_{\mathrm{upper}} = c (z_\rho - z_\delta),
with corresponding p-values:
.. math::
p_{\mathrm{lower}} = 1 - \Phi(T_{\mathrm{lower}}),
\qquad
p_{\mathrm{upper}} = \Phi(T_{\mathrm{upper}}),
where :math:`\Phi` is the standard normal CDF. The reported p-value is:
.. math::
p = \max(p_{\mathrm{lower}}, p_{\mathrm{upper}}).
Parameters
----------
data : pandas.DataFrame
The dataset in which to test the independence condition.
delta_threshold : float
The equivalence bound (threshold for practical independence).
Attributes
----------
statistic_ : float
Fisher z-transformed correlation coefficient :math:`z_\rho`. Set after calling the test.
p_value_ : float
The p-value from the TOST procedure. Independence is concluded when
``p_value_ < significance_level`` (opposite of standard CI tests). Set after calling the test.
References
----------
.. [1] Malinsky, Daniel. "A cautious approach to constraint-based causal model selection." arXiv preprint
arXiv:2404.18232 (2024).
"""
_tags = {
"name": "pearsonr_equivalence",
"data_types": ("continuous",),
"default_for": None,
"requires_data": True,
}
def __init__(self, data: pd.DataFrame, delta_threshold: float = 0.1):
self.delta_threshold = delta_threshold
super().__init__(data=data)
[docs]
def is_independent(
self,
X: str,
Y: str,
Z: list | tuple = (),
significance_level: float = 0.05,
) -> bool:
"""
Perform the equivalence CI test.
Note: Independence is concluded when p_value_ < significance_level
(rejecting the null of dependence), which is the OPPOSITE of standard CI tests.
Returns
-------
bool
True if X ⊥⊥ Y | Z (p_value_ < significance_level), else False.
"""
self._validate_inputs(X, Y, Z)
self.run_test(X=X, Y=Y, Z=list(Z))
return self.p_value_ < significance_level
[docs]
def run_test(
self,
X: str,
Y: str,
Z: list,
):
"""
Compute Pearson equivalence statistic and p-value.
Sets ``self.statistic_`` (Fisher z-transformed partial correlation) and ``self.p_value_``.
"""
# Step 2: Compute Partial Pearson Correlation via parent and clip to avoid infinities
super().run_test(X, Y, Z)
rho = np.clip(self.statistic_, -0.999999, 0.999999)
# Step 3: Fisher Z-Transformation
coeff = np.arctanh(rho)
z_delta = np.arctanh(self.delta_threshold)
n = self.data.shape[0]
s = len(Z) # Number of conditioning variables
std_error_factor = np.sqrt(n - s - 3)
# Step 4: TOST (Two One-Sided Tests)
# Step 4.1: H0: rho <= -delta vs H1: rho > -delta
z_score_lower = std_error_factor * (coeff + z_delta)
p_value_lower = 1 - stats.norm.cdf(z_score_lower)
# Step 4.2: H0: rho >= delta vs H1: rho < delta
z_score_upper = std_error_factor * (coeff - z_delta)
p_value_upper = stats.norm.cdf(z_score_upper)
self.statistic_ = coeff
self.p_value_ = max(p_value_lower, p_value_upper)
return self.statistic_, self.p_value_