Source code for pgmpy.inference.ApproxInference

import itertools

from pgmpy.factors.discrete import DiscreteFactor
from pgmpy.models import BayesianNetwork, DynamicBayesianNetwork


class ApproxInference(object):
    """
    Initializes the Approximate Inference class.

    Parameters
    ----------
    model: Instance of pgmpy.models.BayesianNetwork or pgmpy.models.DynamicBayesianNetwork

    Examples
    --------
    >>> from pgmpy.utils import get_example_model
    >>> model = get_example_model('alarm')
    >>> infer = ApproxInference(model)
    """

    def __init__(self, model):
        if not isinstance(model, (BayesianNetwork, DynamicBayesianNetwork)):
            raise ValueError(
                f"model should either be a Bayesian Network or Dynamic Bayesian Network. Got {type(model)}."
            )

        model.check_model()
        self.model = model

    @staticmethod
    def _get_factor_from_df(df, state_names):
        """
        Takes a groupby dataframe and converts it into a
        pgmpy.factors.discrete.DiscreteFactor object.
        """
        variables = list(df.index.names)
        if len(variables) == 1:
            df_index = state_names[variables[0]]
        else:
            df_index = itertools.product(*[state_names[var] for var in variables])

        # state_names = {var: list(df.index.unique(var)) for var in variables}
        cardinality = [len(state_names[var]) for var in variables]
        return DiscreteFactor(
            variables=variables,
            cardinality=cardinality,
            values=df.reindex(df_index).fillna(0).values,
            state_names=state_names,
        )
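    # Illustration (editorial sketch, not part of the pgmpy source): the
    # groupby result passed to `_get_factor_from_df` is expected to already be
    # normalised into relative frequencies, e.g. for hypothetical samples over
    # a single variable "A" with states ["a0", "a1"]:
    #
    #     freqs = samples.groupby(["A"]).size() / samples.shape[0]
    #     phi = ApproxInference._get_factor_from_df(freqs, {"A": ["a0", "a1"]})
    #
    # States missing from the samples are reindexed in and assigned
    # probability 0 via `.reindex(...).fillna(0)`.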
    def get_distribution(self, samples, variables, state_names=None, joint=True):
        """
        Computes the distribution of `variables` from the given data `samples`.

        Parameters
        ----------
        samples: pandas.DataFrame
            A dataframe of samples generated from the model.

        variables: list (array-like)
            A list of variables whose distribution needs to be computed.

        state_names: dict (default: None)
            A dict of state names for each variable in `variables` in the form
            {variable_name: list of states}. If None, inferred from the data,
            but it is possible that the final distribution misses some states.

        joint: boolean
            If joint=True, computes the joint distribution over `variables`.
            Else, returns a dict with the marginal distribution of each
            variable in `variables`.
        """
        if joint:
            return self._get_factor_from_df(
                samples.groupby(variables).size() / samples.shape[0], state_names
            )
        else:
            return {
                var: self._get_factor_from_df(
                    samples.groupby([var]).size() / samples.shape[0], state_names
                )
                for var in variables
            }
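    # Usage sketch (editorial note, not part of the pgmpy source):
    # `get_distribution` can be called directly on samples drawn with
    # `model.simulate`, provided a `state_names` dict is supplied; the dict
    # below mirrors what `query` builds in Step 2 when `state_names` is None:
    #
    #     samples = infer.model.simulate(n_samples=10000, show_progress=False)
    #     infer.get_distribution(
    #         samples,
    #         variables=["HISTORY"],
    #         state_names={"HISTORY": list(samples["HISTORY"].unique())},
    #     )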
    def query(
        self,
        variables,
        n_samples=int(1e4),
        samples=None,
        evidence=None,
        virtual_evidence=None,
        joint=True,
        state_names=None,
        show_progress=True,
        seed=None,
    ):
        """
        Method for doing approximate inference based on sampling in Bayesian
        Networks and Dynamic Bayesian Networks.

        Parameters
        ----------
        variables: list
            List of variables for which the probability distribution needs to be calculated.

        n_samples: int
            The number of samples to generate for computing the distributions. Higher
            `n_samples` results in more accurate results at the cost of more computation time.

        samples: pd.DataFrame (default: None)
            If provided, uses these samples to compute the distribution instead
            of generating samples. `samples` **must** conform with the provided
            `evidence` and `virtual_evidence`.

        evidence: dict (default: None)
            The observed values. A dict key, value pair of the form {var: state_name}.

        virtual_evidence: list (default: None)
            A list of pgmpy.factors.discrete.TabularCPD representing the virtual/soft evidence.

        joint: boolean (default: True)
            If True, computes the joint distribution over `variables`.
            Else, returns a dict with the marginal distribution of each
            variable in `variables`.

        state_names: dict (default: None)
            A dict of state names for each variable in `variables` in the form
            {variable_name: list of states}. If None, inferred from the data,
            but it is possible that the final distribution misses some states.

        show_progress: boolean (default: True)
            If True, shows a progress bar when generating samples.

        seed: int (default: None)
            Sets the seed for the random generators.

        Returns
        -------
        Probability distribution: pgmpy.factors.discrete.DiscreteFactor
            The queried probability distribution (a dict of DiscreteFactors if
            joint=False).

        Examples
        --------
        >>> from pgmpy.utils import get_example_model
        >>> from pgmpy.inference import ApproxInference
        >>> model = get_example_model("alarm")
        >>> infer = ApproxInference(model)
        >>> infer.query(variables=["HISTORY"])
        <DiscreteFactor representing phi(HISTORY:2) at 0x7f92d9f5b910>
        >>> infer.query(variables=["HISTORY", "CVP"], joint=True)
        <DiscreteFactor representing phi(HISTORY:2, CVP:3) at 0x7f92d9f77610>
        >>> infer.query(variables=["HISTORY", "CVP"], joint=False)
        {'HISTORY': <DiscreteFactor representing phi(HISTORY:2) at 0x7f92dc61eb50>,
         'CVP': <DiscreteFactor representing phi(CVP:3) at 0x7f92d915ec40>}
        """
        # Step 1: If samples are not provided, generate samples for the query.
        if samples is None:
            if isinstance(self.model, BayesianNetwork):
                samples = self.model.simulate(
                    n_samples=n_samples,
                    evidence=evidence,
                    virtual_evidence=virtual_evidence,
                    seed=seed,
                    show_progress=show_progress,
                )
            elif isinstance(self.model, DynamicBayesianNetwork):
                if evidence is None:
                    evidence = dict()
                if virtual_evidence is None:
                    virtual_evidence = []

                # DBN variables, evidence keys, and virtual evidence CPDs refer
                # to (name, time_slice) tuples; simulate enough time slices to
                # cover the latest slice referenced anywhere in the query.
                max_time_slices = 0
                for var in variables:
                    if var[1] > max_time_slices:
                        max_time_slices = var[1]

                for var, state in evidence.items():
                    if var[1] > max_time_slices:
                        max_time_slices = var[1]

                for cpd in virtual_evidence:
                    if cpd.variable[1] > max_time_slices:
                        max_time_slices = cpd.variable[1]

                samples = self.model.simulate(
                    n_samples=n_samples,
                    n_time_slices=max_time_slices + 1,
                    evidence=evidence,
                    virtual_evidence=virtual_evidence,
                    show_progress=show_progress,
                    seed=seed,
                )

        # Step 2: If state_names is None, infer it from the samples.
        if state_names is None:
            if isinstance(self.model, BayesianNetwork):
                state_names = {
                    var: list(samples.loc[:, var].unique()) for var in variables
                }
            elif isinstance(self.model, DynamicBayesianNetwork):
                state_names = {
                    var: list(samples.loc[:, [var]].iloc[:, 0].unique())
                    for var in variables
                }

        # Step 3: Compute the distributions and return them.
        return self.get_distribution(
            samples, variables=variables, state_names=state_names, joint=joint
        )
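

# Usage sketch (editorial addition, not part of the pgmpy source): a minimal
# end-to-end example of approximate inference on the 'alarm' example model,
# matching the docstrings above. The evidence state "NORMAL" for CVP is an
# assumption and should be replaced by a valid state name of that node.
if __name__ == "__main__":
    from pgmpy.utils import get_example_model

    model = get_example_model("alarm")
    infer = ApproxInference(model)

    # Marginal of HISTORY estimated from 10,000 simulated samples.
    print(infer.query(variables=["HISTORY"], n_samples=int(1e4), show_progress=False))

    # Conditional query: distribution of HISTORY given observed evidence,
    # estimated from samples simulated to respect that evidence.
    print(
        infer.query(
            variables=["HISTORY"],
            evidence={"CVP": "NORMAL"},
            show_progress=False,
        )
    )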