Source code for pgmpy.structure_score.bds

from math import lgamma, log

import numpy as np
from scipy.special import gammaln

from pgmpy.structure_score.bdeu import BDeu



[docs]
class BDs(BDeu):
    r"""
    BDs structure score for discrete Bayesian networks.

    BDs is a sparse-data variant of BDeu that reallocates the equivalent sample size over the observed parent
    configurations instead of all possible configurations. This makes it better suited to discrete datasets with many
    unobserved parent configurations. The local score computed as:

    .. math::
        \operatorname{BDs}(X_i, \Pi_i) =
        \left[
            \sum_{j \in \mathcal{O}_i} \sum_{k=1}^{r_i} \log \Gamma(N_{ijk} + \beta)
            + (q_i - \tilde{q}_i) r_i \log \Gamma(\beta)
        \right]
        - \left[
            \sum_{j \in \mathcal{O}_i} \log \Gamma(N_{ij} + \alpha)
            + (q_i - \tilde{q}_i) \log \Gamma(\alpha)
        \right]
        + \tilde{q}_i \log \Gamma(\alpha)
        - q_i r_i \log \Gamma(\beta),

    where :math:`\mathcal{O}_i` is the set of observed parent configurations, :math:`\tilde{q}_i = |\mathcal{O}_i|`,
    :math:`q_i` is the total number of parent configurations, :math:`r_i` is the cardinality of :math:`X_i`,
    :math:`\alpha = \text{equivalent_sample_size} / \tilde{q}_i`, :math:`\beta = \text{equivalent_sample_size} / (r_i
    q_i)`, and :math:`N_{ij} = \sum_{k=1}^{r_i} N_{ijk}`.

    In the implementation, `state_counts(..., reindex=False)` keeps only the observed parent configurations. The
    `gamma_counts_adj` and `gamma_conds_adj` terms restore the missing contributions from the unobserved ones so the
    returned score matches the full BDs formula. This class also uses the marginal uniform graph prior from Scutari
    (2016).

    Parameters
    ----------
    data : pandas.DataFrame
        DataFrame where each column represents a discrete variable. Missing values should be set to `numpy.nan`.
    equivalent_sample_size : int, optional
        Equivalent sample size used to define the Dirichlet hyperparameters.
    state_names : dict, optional
        Dictionary mapping each variable to its discrete states. If not specified, the unique values observed in the
        data are used.

    Examples
    --------
    >>> import pandas as pd
    >>> from pgmpy.models import DiscreteBayesianNetwork
    >>> from pgmpy.structure_score import BDs
    >>> data = pd.DataFrame(
    ...     {"A": [0, 1, 1, 0], "B": [1, 0, 1, 0], "C": [1, 1, 1, 0]}
    ... )
    >>> model = DiscreteBayesianNetwork([("A", "B"), ("A", "C")])
    >>> score = BDs(data, equivalent_sample_size=5)
    >>> round(score.score(model), 3)
    np.float64(-12.857)
    >>> round(score.local_score("B", ("A",)), 3)
    np.float64(-3.446)

    Raises
    ------
    ValueError
        If the data contains non-discrete variables, or if the model variables are not present
        in the data.

    References
    ----------
    .. [1] Scutari, Marco. An Empirical-Bayes Score for Discrete Bayesian Networks. Journal of Machine Learning
        Research, 2016, pp. 438-48.
    """

    _tags = {
        "name": "bds",
        "supported_datatype": "discrete",
        "default_for": None,
        "is_parameteric": True,
    }

    def __init__(self, data, equivalent_sample_size=10, state_names=None):
        super().__init__(data, equivalent_sample_size, state_names=state_names)


[docs]
    def structure_prior_ratio(self, operation) -> float:
        """Compute the prior ratio for a graph edit."""
        if operation == "+":
            return -log(2.0)
        if operation == "-":
            return log(2.0)
        return 0



[docs]
    def structure_prior(self, model) -> float:
        """Compute the marginal uniform prior for a structure."""
        nedges = float(len(model.edges()))
        nnodes = float(len(model.nodes()))
        possible_edges = nnodes * (nnodes - 1) / 2.0
        score = -(nedges + possible_edges) * log(2.0)
        return score


    def _local_score(self, variable: str, parents: tuple[str, ...]) -> float:
        state_counts = self.state_counts(variable, parents, reindex=False)
        num_parents_states = np.prod([len(self.state_names[var]) for var in parents])

        counts = np.asarray(state_counts)
        counts_size = num_parents_states * len(self.state_names[variable])
        log_gamma_counts = np.zeros_like(counts, dtype=float)
        alpha = self.equivalent_sample_size / state_counts.shape[1]
        beta = self.equivalent_sample_size / counts_size
        gammaln(counts + beta, out=log_gamma_counts)

        log_gamma_conds = np.sum(counts, axis=0, dtype=float)
        gammaln(log_gamma_conds + alpha, out=log_gamma_conds)

        gamma_counts_adj = (num_parents_states - counts.shape[1]) * len(self.state_names[variable]) * gammaln(beta)
        gamma_conds_adj = (num_parents_states - counts.shape[1]) * gammaln(alpha)

        score = (
            (np.sum(log_gamma_counts) + gamma_counts_adj)
            - (np.sum(log_gamma_conds) + gamma_conds_adj)
            + state_counts.shape[1] * lgamma(alpha)
            - counts_size * lgamma(beta)
        )
        return score