Source code for pgmpy.ci_tests.pearsonr

import numpy as np
import pandas as pd
from scipy import stats

from ._base import _BaseCITest



[docs]
class Pearsonr(_BaseCITest):
    r"""
    Partial Correlation test for conditional independence.

    If :math:`Z = \emptyset`, compute Pearson's correlation coefficient :math:`r_{XY}` and its two-sided p-value.

    If :math:`Z \neq \emptyset`, regress :math:`X` and :math:`Y` on :math:`[1, Z]` using least squares, compute the
    residuals :math:`r_X` and :math:`r_Y`, and define the partial correlation as the Pearson correlation between those
    residuals. The resulting test statistic is

    .. math::
        t = \rho_{XY \mid Z} \sqrt{\frac{n - |Z| - 2}{1 - \rho_{XY \mid Z}^2}},

    where :math:`n` is the sample size and :math:`|Z|` is the number of conditioning variables. Under the null
    hypothesis :math:`X \perp Y \mid Z`, this statistic is Student's t distribution with :math:`n - |Z| - 2` degrees of
    freedom.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset in which to test the independence condition.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from pgmpy.ci_tests import Pearsonr
    >>> rng = np.random.default_rng(seed=42)
    >>> data = pd.DataFrame(data=rng.standard_normal(size=(1000, 3)), columns=["X", "Y", "Z"])
    >>> test = Pearsonr(data=data)
    >>> test(X="X", Y="Y", Z=["Z"], significance_level=0.05)
    np.True_
    >>> round(test.statistic_, 2)
    np.float64(0.01)
    >>> round(test.p_value_, 2)
    np.float64(0.87)
    >>> test.dof_
    997

    Attributes
    ----------
    statistic_ : float
        Pearson's correlation coefficient (or partial correlation when Z is non-empty),
        ranging from -1 to 1. Set after calling the test.
    p_value_ : float
        The p-value for the test. Set after calling the test.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    .. [2] https://en.wikipedia.org/wiki/Partial_correlation#Using_linear_regression
    """

    _tags = {
        "name": "pearsonr",
        "data_types": ("continuous",),
        "default_for": "continuous",
        "requires_data": True,
    }

    def __init__(self, data: pd.DataFrame):
        self.data = data
        super().__init__()


[docs]
    def run_test(
        self,
        X: str,
        Y: str,
        Z: list,
    ):
        """
        Compute Pearson correlation coefficient and p-value.

        Sets ``self.statistic_`` (Pearson's r) and ``self.p_value_``.
        """
        data = self.data
        n_samples = data.shape[0]

        # Step 1: If Z is empty compute a non-conditional test.
        if len(Z) == 0:
            coef, p_value = stats.pearsonr(data.loc[:, X], data.loc[:, Y])

        # Step 2: If Z is non-empty, use linear regression to compute residuals and test independence on it.
        else:
            design_matrix = np.column_stack([np.ones(n_samples), data.loc[:, Z].to_numpy()])
            X_coef = np.linalg.lstsq(design_matrix, data.loc[:, X], rcond=None)[0]
            Y_coef = np.linalg.lstsq(design_matrix, data.loc[:, Y], rcond=None)[0]

            residual_X = data.loc[:, X] - design_matrix @ X_coef
            residual_Y = data.loc[:, Y] - design_matrix @ Y_coef

            coef = np.corrcoef(residual_X, residual_Y)[0, 1]
            self.dof_ = n_samples - len(Z) - 2
            t_statistic = coef * np.sqrt(self.dof_ / (1 - coef**2))
            p_value = 2 * stats.t.sf(np.abs(t_statistic), df=self.dof_)

        self.statistic_ = coef
        self.p_value_ = p_value

        return self.statistic_, self.p_value_