Source code for pgmpy.ci_tests.power_divergence

import numpy as np
import pandas as pd
from scipy import stats

from pgmpy import logger

from ._base import _BaseCITest


[docs] class PowerDivergence(_BaseCITest): r""" Cressie-Read power divergence test for conditional independence on discrete data [1]. This test evaluates the null hypothesis :math:`X \perp Y \mid Z` using contingency tables. For a contingency table with observed counts :math:`O_{ij}` and expected counts :math:`E_{ij}` under independence, the Cressie-Read power divergence statistic is: .. math:: T_\lambda = \frac{2}{\lambda(\lambda + 1)} \sum_{i, j} O_{ij} \left[\left(\frac{O_{ij}}{E_{ij}}\right)^\lambda - 1\right], for :math:`\lambda \notin \{-1, 0\}`. Different values of :math:`\lambda` recover common special cases such as the Pearson chi-square test and the log-likelihood ratio test. If :math:`Z = \emptyset`, the implementation constructs the contingency table of :math:`X` and :math:`Y` from the full dataset and computes :math:`T_\lambda` with :func:`scipy.stats.chi2_contingency`. If :math:`Z \neq \emptyset`, the data are partitioned by each observed configuration :math:`z` of :math:`Z`. For each stratum, a contingency table for :math:`X` and :math:`Y` is constructed and its power divergence statistic :math:`T_\lambda^{(z)}` and degrees of freedom :math:`\nu^{(z)}` are computed. The overall statistic used in the code is: .. math:: T = \sum_{z} T_\lambda^{(z)}, \qquad \nu = \sum_{z} \nu^{(z)}, where the sum runs over strata whose contingency tables do not contain an all-zero row or all-zero column. Strata with such degenerate tables are skipped. Under the null hypothesis, :math:`T` is treated with the usual chi-square asymptotic approximation, so the p-value is computed as: .. math:: p = 1 - F_{\chi^2_\nu}(T), where :math:`F_{\chi^2_\nu}` is the CDF of the chi-square distribution with :math:`\nu` degrees of freedom. Parameters ---------- data : pandas.DataFrame The dataset on which to test the independence condition. lambda_ : float or string The :math:`\lambda` parameter for the power divergence statistic. Some values of ``lambda_`` recover well-known special cases: * "pearson" 1 "Chi-squared test" * "log-likelihood" 0 "G-test or log-likelihood" * "freeman-tuckey" -1/2 "Freeman-Tuckey Statistic" * "mod-log-likelihood" -1 "Modified Log-likelihood" * "neyman" -2 "Neyman's statistic" * "cressie-read" 2/3 "The value recommended in the paper[1]" Attributes ---------- statistic_ : float The power divergence test statistic :math:`T`. Set after calling the test. p_value_ : float The p-value for the test. Set after calling the test. dof_ : int Degrees of freedom :math:`\nu` for the test. Set after calling the test. References ---------- .. [1] Cressie, Noel, and Timothy RC Read. "Multinomial goodness‐of‐fit tests." Journal of the Royal Statistical Society: Series B (Methodological) 46.3 (1984): 440-464. Examples -------- >>> import pandas as pd >>> import numpy as np >>> np.random.seed(42) >>> data = pd.DataFrame( ... data=np.random.randint(low=0, high=2, size=(50000, 4)), columns=list("ABCD") ... ) >>> data["E"] = data["A"] + data["B"] + data["C"] >>> test = PowerDivergence(data=data) >>> test(X="A", Y="C", Z=[], significance_level=0.05) np.True_ >>> round(test.statistic_, 2) np.float64(0.03) >>> round(test.p_value_, 2) np.float64(0.86) >>> test.dof_ 1 >>> test(X="A", Y="B", Z=["D"], significance_level=0.05) np.True_ >>> test(X="A", Y="B", Z=["D", "E"], significance_level=0.05) np.False_ """ _tags = { "name": "power_divergence", "data_types": ("discrete",), "default_for": None, "requires_data": True, } def __init__(self, data: pd.DataFrame, lambda_: str | float = "cressie-read"): self.data = data self.lambda_ = lambda_ super().__init__()
[docs] def run_test( self, X: str, Y: str, Z: list, ): """ Compute power divergence statistic, p-value, and degrees of freedom. Sets ``self.statistic_`` (chi-squared), ``self.p_value_``, and ``self.dof_``. """ data = self.data # Step 1: Do a simple contingency test if there are no conditional variables. if len(Z) == 0: chi, p_value, dof, expected = stats.chi2_contingency( data.groupby([X, Y], observed=False).size().unstack(Y, fill_value=0), lambda_=self.lambda_, ) # Step 3: If there are conditionals variables, iterate over unique states else: chi = 0 dof = 0 for z_state, df in data.groupby(list(Z), observed=True): # Compute the contingency table unique_x, x_inv = np.unique(df[X], return_inverse=True) unique_y, y_inv = np.unique(df[Y], return_inverse=True) contingency = np.bincount( x_inv * len(unique_y) + y_inv, minlength=len(unique_x) * len(unique_y), ).reshape(len(unique_x), len(unique_y)) # If all values of a column in the contingency table are zeros, skip the test. if any(contingency.sum(axis=0) == 0) or any(contingency.sum(axis=1) == 0): if isinstance(z_state, str): logger.info(f"Skipping the test {X} _|_ {Y} | {Z[0]}={z_state}. Not enough samples") else: z_str = ", ".join([f"{var}={state}" for var, state in zip(Z, z_state)]) logger.info(f"Skipping the test {X} _|_ {Y} | {z_str}. Not enough samples") else: c, _, d, _ = stats.chi2_contingency(contingency, lambda_=self.lambda_) chi += c dof += d p_value = 1 - stats.chi2.cdf(chi, df=dof) self.statistic_ = chi self.p_value_ = p_value self.dof_ = dof return self.statistic_, self.p_value_