Source code for pgmpy.ci_tests.pearsonr_equivalence

import numpy as np
import pandas as pd
from scipy import stats

from .pearsonr import Pearsonr



[docs]
class PearsonrEquivalence(Pearsonr):
    r"""
    Pearson equivalence test [1] for conditional independence on continuous data.

    This test first computes the partial correlation coefficient :math:`\hat{\rho}_{XY \mid Z}` using :class:`Pearsonr`.
    Let :math:`\delta` denote ``delta_threshold``. The Fisher transform is computed as:

    .. math::
        z_\rho = \operatorname{arctanh}(\hat{\rho}_{XY \mid Z}),
        \qquad
        z_\delta = \operatorname{arctanh}(\delta),

    and defines

    .. math::
        c = \sqrt{n - |Z| - 3},

    where :math:`n` is the sample size and :math:`|Z|` is the number of conditioning variables.

    The test then performs a TOST (two one-sided tests) procedure for the equivalence hypothesis

    .. math::
        H_0: \rho_{XY \mid Z} \leq -\delta \;\; \text{or} \;\; \rho_{XY \mid Z} \geq \delta
        \qquad \text{vs.} \qquad
        H_1: -\delta < \rho_{XY \mid Z} < \delta.

    The two one-sided test statistics are:

    .. math::
        T_{\mathrm{lower}} = c (z_\rho + z_\delta),
        \qquad
        T_{\mathrm{upper}} = c (z_\rho - z_\delta),

    with corresponding p-values:

    .. math::
        p_{\mathrm{lower}} = 1 - \Phi(T_{\mathrm{lower}}),
        \qquad
        p_{\mathrm{upper}} = \Phi(T_{\mathrm{upper}}),

    where :math:`\Phi` is the standard normal CDF. The reported p-value is:

    .. math::
        p = \max(p_{\mathrm{lower}}, p_{\mathrm{upper}}).


    Parameters
    ----------
    data : pandas.DataFrame
        The dataset in which to test the independence condition.

    delta_threshold : float
        The equivalence bound (threshold for practical independence).

    Attributes
    ----------
    statistic_ : float
        Fisher z-transformed correlation coefficient :math:`z_\rho`. Set after calling the test.
    p_value_ : float
        The p-value from the TOST procedure. Independence is concluded when
        ``p_value_ < significance_level`` (opposite of standard CI tests). Set after calling the test.

    References
    ----------
    .. [1] Malinsky, Daniel. "A cautious approach to constraint-based causal model selection." arXiv preprint
            arXiv:2404.18232 (2024).
    """

    _tags = {
        "name": "pearsonr_equivalence",
        "data_types": ("continuous",),
        "default_for": None,
        "requires_data": True,
    }

    def __init__(self, data: pd.DataFrame, delta_threshold: float = 0.1):
        self.delta_threshold = delta_threshold
        super().__init__(data=data)


[docs]
    def is_independent(
        self,
        X: str,
        Y: str,
        Z: list | tuple = (),
        significance_level: float = 0.05,
    ) -> bool:
        """
        Perform the equivalence CI test.

        Note: Independence is concluded when p_value_ < significance_level
        (rejecting the null of dependence), which is the OPPOSITE of standard CI tests.

        Returns
        -------
        bool
            True if X ⊥⊥ Y | Z (p_value_ < significance_level), else False.
        """
        self._validate_inputs(X, Y, Z)

        self.run_test(X=X, Y=Y, Z=list(Z))

        return self.p_value_ < significance_level



[docs]
    def run_test(
        self,
        X: str,
        Y: str,
        Z: list,
    ):
        """
        Compute Pearson equivalence statistic and p-value.

        Sets ``self.statistic_`` (Fisher z-transformed partial correlation) and ``self.p_value_``.
        """
        # Step 2: Compute Partial Pearson Correlation via parent and clip to avoid infinities
        super().run_test(X, Y, Z)
        rho = np.clip(self.statistic_, -0.999999, 0.999999)

        # Step 3: Fisher Z-Transformation
        coeff = np.arctanh(rho)
        z_delta = np.arctanh(self.delta_threshold)

        n = self.data.shape[0]
        s = len(Z)  # Number of conditioning variables

        std_error_factor = np.sqrt(n - s - 3)

        # Step 4: TOST (Two One-Sided Tests)
        # Step 4.1: H0: rho <= -delta  vs  H1: rho > -delta
        z_score_lower = std_error_factor * (coeff + z_delta)
        p_value_lower = 1 - stats.norm.cdf(z_score_lower)

        # Step 4.2: H0: rho >= delta   vs  H1: rho < delta
        z_score_upper = std_error_factor * (coeff - z_delta)
        p_value_upper = stats.norm.cdf(z_score_upper)

        self.statistic_ = coeff
        self.p_value_ = max(p_value_lower, p_value_upper)

        return self.statistic_, self.p_value_