Source code for pgmpy.metrics.bn_inference

import numpy as np
import pandas as pd

from pgmpy.sampling import BayesianModelInference



[docs]
class BayesianModelProbability(BayesianModelInference):
    """
    Probability (pmf) evaluation for Bayesian Models.

    Extends ``BayesianModelInference`` with per-datapoint log probability
    and whole-dataset log-likelihood calculations.
    """

    def __init__(self, model):
        """
        Create a probability calculator bound to ``model``.

        Parameters
        ----------
        model: Bayesian Model
            model on which inference queries will be computed; it is handed
            unchanged to the ``BayesianModelInference`` initializer.
        """
        super().__init__(model)

    def _log_probability_node(self, data, ordering, node):
        """
        Evaluate the log probability of each datapoint for a specific node.

        Internal function used by log_probability().

        Parameters
        ----------
        data: np.ndarray, shape (n_samples, n_features)
            Data points, one per row. Column ``j`` holds the state names of
            the variable ``ordering[j]``.

        ordering: list
            Labels of the columns of ``data``. Must contain the node's
            variable and every non-latent evidence variable of its CPD.

        node: Bayesian Model Node
            node from the Bayesian network whose CPD is evaluated.

        Returns
        -------
        Log probability of node: np.array (n_samples,)
            For every row, the natural log of the CPD entry selected by the
            row's state of the node and the row's states of the node's
            non-latent evidence variables.

        Raises
        ------
        KeyError
            If a value in ``data`` is not a known state name of its variable.
        ValueError
            If a required variable is missing from ``ordering``.
        """

        def vec_translate(a, my_dict):
            # Element-wise dictionary lookup (state name -> state number).
            return np.vectorize(my_dict.__getitem__)(a)

        cpd = self.model.get_cpds(node)

        # Variable to probe: the first CPD variable is the node itself.
        current = cpd.variables[0]
        current_idx = ordering.index(current)
        current_val = data[:, current_idx]
        current_no = vec_translate(current_val, cpd.name_to_no[current])

        # Conditional dependencies E of the probed variable. Latent variables
        # are skipped because they have no column in the data.
        evidence = [var for var in cpd.variables[1:] if var not in self.model.latents]
        evidence_idx = [ordering.index(ev) for ev in evidence]
        evidence_val = data[:, evidence_idx]
        evidence_no = np.empty_like(evidence_val, dtype=int)
        for i, ev in enumerate(evidence):
            evidence_no[:, i] = vec_translate(evidence_val[:, i], cpd.name_to_no[ev])

        if evidence:
            # There are conditional dependencies E for this node: look up the
            # array p(x[n]|E) once per distinct evidence combination, then fan
            # the result back out to every row.
            unique, inverse = np.unique(evidence_no, axis=0, return_inverse=True)
            # NumPy 2.0.0 returned a non-flat inverse when ``axis`` is given;
            # flattening keeps the row gather below correct on every version.
            inverse = inverse.reshape(-1)
            unique = [tuple(u) for u in unique]
            state_to_index, index_to_weight = self.pre_compute_reduce_maps(
                variable=node, evidence=evidence, state_combinations=unique
            )
            weights = np.array(
                [index_to_weight[state_to_index[u].item()] for u in unique]
            )[inverse]
        else:
            # There are NO conditional dependencies for this node: every row
            # shares the same array p(x[n]).
            weights = np.array([cpd.values] * len(data))

        # Pick the specific node value x[n] from p(x[n]|E) or p(x[n]) for each
        # row: row i contributes weights[i, current_no[i]].
        probability_node = weights[np.arange(len(current_no)), current_no]

        return np.log(probability_node)


[docs]
    def log_probability(self, data, ordering=None):
        """
        Evaluate the logarithmic probability of each point in a data set.

        Parameters
        ----------
        data: pandas dataframe OR array_like, shape (n_samples, n_features)
            List of n_features-dimensional data points.  Each row
            corresponds to a single data point.

        ordering: list
            Labels of the columns of ``data``. Ignored when ``data`` is a
            DataFrame (its own column labels are used instead). For
            array_like input the default is the topological ordering used
            by the model, i.e. column ``j`` is taken to hold the variable
            ``self.topological_order[j]``.

        Returns
        -------
        Log probability of each datapoint: np.array (n_samples,)
            For every row, the sum over all variables in the ordering of the
            per-node log probabilities returned by _log_probability_node().
        """
        if isinstance(data, pd.DataFrame):
            # A DataFrame carries its own column labels; use them and work on
            # the underlying numpy array from now on.
            ordering = data.columns.to_list()
            data = data.values
        else:
            if not isinstance(data, np.ndarray):
                # Accept nested lists/tuples; object dtype keeps mixed state
                # names (e.g. ints and strings) from being coerced.
                data = np.asarray(data, dtype=object)
            if ordering is None:
                # Plain arrays have no labels (and no ``.loc``), so the
                # columns are assumed to already be in topological order.
                ordering = self.topological_order

        logp = np.array(
            [self._log_probability_node(data, ordering, node) for node in ordering]
        )
        return np.sum(logp, axis=0)



[docs]
    def score(self, data, ordering=None):
        """
        Compute the total log probability of a data set under the model.

        Parameters
        ----------
        data: pandas dataframe OR array_like, shape (n_samples, n_features)
            Data points, one per row; forwarded unchanged to
            log_probability().

        ordering: list
            Labels of the columns of ``data``; forwarded unchanged to
            log_probability().

        Returns
        -------
        Log-likelihood of data: float
            Sum over all rows of the per-datapoint values returned by
            log_probability().
        """
        per_sample_logp = self.log_probability(data, ordering)
        return np.sum(per_sample_logp)
Source code for pgmpy.metrics.bn_inference

Navigation

Related Topics