Source code for pgmpy.metrics.bn_inference

import numpy as np
import pandas as pd

from pgmpy.sampling import BayesianModelInference


class BayesianModelProbability(BayesianModelInference):
    """
    Class to calculate probability (pmf) values specific to Bayesian Models
    """

    def __init__(self, model):
        """
        Class to calculate probability (pmf) values specific to Bayesian Models

        Parameters
        ----------
        model: Bayesian Model
            model on which inference queries will be computed
        """
        super(BayesianModelProbability, self).__init__(model)

    def _log_probability_node(self, data, ordering, node):
        """
        Evaluate the log probability of each datapoint for a specific node.
        Internal function used by log_probability().

        Parameters
        ----------
        data: array_like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        ordering: list
            ordering of columns in data, used by the Bayesian model.
            default is topological ordering used by model.

        node: Bayesian Model Node
            node from the Bayesian network.

        Returns
        -------
        Log probability of node: np.array (n_samples,)
            The array of log(density) evaluations. These are normalized to be
            probability densities, so values will be low for high-dimensional
            data.
        """

        def vec_translate(a, my_dict):
            return np.vectorize(my_dict.__getitem__)(a)

        cpd = self.model.get_cpds(node)

        # variable to probe: data[n], where n is the node number
        current = cpd.variables[0]
        current_idx = ordering.index(current)
        current_val = data[:, current_idx]
        current_no = vec_translate(current_val, cpd.name_to_no[current])

        # conditional dependencies E of the probed variable
        evidence = [var for var in cpd.variables[1:] if var not in self.model.latents]
        evidence_idx = [ordering.index(ev) for ev in evidence]
        evidence_val = data[:, evidence_idx]
        evidence_no = np.empty_like(evidence_val, dtype=int)
        for i, ev in enumerate(evidence):
            evidence_no[:, i] = vec_translate(evidence_val[:, i], cpd.name_to_no[ev])

        if evidence:
            # there are conditional dependencies E for data[n] for this node.
            # Here we retrieve the array p(x[n]|E) for each x in data;
            # the specific node value is picked out of these arrays below.
            unique, inverse = np.unique(evidence_no, axis=0, return_inverse=True)
            unique = [tuple(u) for u in unique]
            state_to_index, index_to_weight = self.pre_compute_reduce_maps(
                variable=node, evidence=evidence, state_combinations=unique
            )
            weights = np.array(
                [index_to_weight[state_to_index[tuple(u)]] for u in unique]
            )[inverse]
        else:
            # there are NO conditional dependencies for this node.
            # Retrieve the array p(x[n]) for each x in data;
            # the specific node value is picked out of these arrays below.
            weights = np.array([cpd.values] * len(data))

        # pick the specific node value x[n] from the array p(x[n]|E) or p(x[n]),
        # for each x in data.
        probability_node = np.array([weights[i][cn] for i, cn in enumerate(current_no)])

        return np.log(probability_node)

    def log_probability(self, data, ordering=None):
        """
        Evaluate the logarithmic probability of each point in a data set.

        Parameters
        ----------
        data: pandas dataframe OR array_like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        ordering: list
            ordering of columns in data, used by the Bayesian model.
            default is topological ordering used by model.

        Returns
        -------
        Log probability of each datapoint: np.array (n_samples,)
            The array of log(density) evaluations. These are normalized to be
            probability densities, so values will be low for high-dimensional
            data.
        """
        if isinstance(data, pd.DataFrame):
            # use a numpy array from now on; take the column ordering from the
            # DataFrame unless the caller supplied one explicitly.
            if ordering is None:
                ordering = data.columns.to_list()
            data = data.loc[:, ordering].values
        elif ordering is None:
            # plain arrays are assumed to follow the model's topological order.
            ordering = self.topological_order

        logp = np.array(
            [self._log_probability_node(data, ordering, node) for node in ordering]
        )
        return np.sum(logp, axis=0)

    def score(self, data, ordering=None):
        """
        Compute the total log probability density under the model.

        Parameters
        ----------
        data: pandas dataframe OR array_like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        ordering: list
            ordering of columns in data, used by the Bayesian model.
            default is topological ordering used by model.

        Returns
        -------
        Log-likelihood of data: float
            This is normalized to be a probability density, so the value
            will be low for high-dimensional data.
        """
        return np.sum(self.log_probability(data, ordering))
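

# The snippet below is a minimal usage sketch, not part of the module source.
# It assumes a recent pgmpy version where the discrete model class is called
# BayesianNetwork; the "Rain"/"WetGrass" variables and their CPD values are
# made up purely for illustration.
#
# from pgmpy.models import BayesianNetwork
# from pgmpy.factors.discrete import TabularCPD
# from pgmpy.metrics.bn_inference import BayesianModelProbability
# import pandas as pd
#
# # Two-node network Rain -> WetGrass with hypothetical probabilities.
# model = BayesianNetwork([("Rain", "WetGrass")])
# cpd_rain = TabularCPD("Rain", 2, [[0.8], [0.2]])
# cpd_wet = TabularCPD(
#     "WetGrass", 2,
#     [[0.9, 0.2],   # P(WetGrass=0 | Rain=0), P(WetGrass=0 | Rain=1)
#      [0.1, 0.8]],  # P(WetGrass=1 | Rain=0), P(WetGrass=1 | Rain=1)
#     evidence=["Rain"], evidence_card=[2],
# )
# model.add_cpds(cpd_rain, cpd_wet)
#
# # Observations as a DataFrame; the column ordering is taken from the frame.
# data = pd.DataFrame({"Rain": [0, 1, 1], "WetGrass": [0, 1, 0]})
#
# bmp = BayesianModelProbability(model)
# bmp.log_probability(data)  # per-row log p(x), shape (3,)
# bmp.score(data)            # total log-likelihood, the sum of the values above
#
# score() is simply the sum over log_probability(), so it can be used directly
# as a log-likelihood goodness-of-fit score for held-out data.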