Source code for pgmpy.inference.ApproxInference

import itertools

from pgmpy.factors.discrete import DiscreteFactor
from pgmpy.models import BayesianNetwork, DynamicBayesianNetwork


class ApproxInference(object):
    """
    Initializes the Approximate Inference class.

    Parameters
    ----------
    model: Instance of pgmpy.models.BayesianNetwork or pgmpy.models.DynamicBayesianNetwork

    Examples
    --------
    >>> from pgmpy.utils import get_example_model
    >>> model = get_example_model('alarm')
    >>> infer = ApproxInference(model)
    """

    def __init__(self, model):
        if not isinstance(model, (BayesianNetwork, DynamicBayesianNetwork)):
            raise ValueError(
                f"model should either be a Bayesian Network or Dynamic Bayesian Network. Got {type(model)}."
            )
        model.check_model()
        self.model = model

    @staticmethod
    def _get_factor_from_df(df, state_names):
        """
        Takes a groupby dataframe and converts it into a
        pgmpy.factors.discrete.DiscreteFactor object.
        """
        variables = list(df.index.names)
        if len(variables) == 1:
            df_index = state_names[variables[0]]
        else:
            df_index = itertools.product(*[state_names[var] for var in variables])

        cardinality = [len(state_names[var]) for var in variables]
        return DiscreteFactor(
            variables=variables,
            cardinality=cardinality,
            values=df.reindex(df_index).fillna(0).values,
            state_names=state_names,
        )
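A minimal usage sketch of the static helper above, using a hypothetical two-column dataframe (the column names and states are illustrative, not part of the module): a groupby(...).size() series divided by the sample count gives empirical probabilities, and _get_factor_from_df reindexes it over the full state space so that unobserved state combinations get probability 0.

import pandas as pd
from pgmpy.inference import ApproxInference

# Illustrative samples; any discrete dataframe with named columns works.
samples = pd.DataFrame(
    {"A": ["a0", "a0", "a1", "a1"], "B": ["b0", "b1", "b0", "b0"]}
)

# Empirical joint probabilities as a Series indexed by (A, B).
probs = samples.groupby(["A", "B"]).size() / samples.shape[0]
state_names = {"A": ["a0", "a1"], "B": ["b0", "b1"]}

# The combination ("a1", "b1") never appears in the samples, so the
# resulting factor assigns it probability 0.
phi = ApproxInference._get_factor_from_df(probs, state_names)
print(phi)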
    def get_distribution(self, samples, variables, state_names=None, joint=True):
        """
        Computes the distribution of `variables` from given data `samples`.

        Parameters
        ----------
        samples: pandas.DataFrame
            A dataframe of samples generated from the model.

        variables: list (array-like)
            A list of variables whose distribution needs to be computed.

        state_names: dict (default: None)
            A dict of state names for each variable in `variables` in the form
            {variable_name: list of states}. If None, inferred from the data, but
            it is possible that the final distribution misses some states.

        joint: boolean
            If joint=True, computes the joint distribution over `variables`.
            Else, returns a dict with the marginal distribution of each variable
            in `variables`.
        """
        if joint:
            return self._get_factor_from_df(
                samples.groupby(variables).size() / samples.shape[0], state_names
            )
        else:
            return {
                var: self._get_factor_from_df(
                    samples.groupby([var]).size() / samples.shape[0], state_names
                )
                for var in variables
            }
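A short sketch of calling get_distribution directly on samples simulated from the 'alarm' example model. state_names is supplied explicitly here (query() infers it from the samples before delegating to this method); the variable names "HISTORY" and "CVP" come from the example model.

from pgmpy.utils import get_example_model
from pgmpy.inference import ApproxInference

model = get_example_model("alarm")
infer = ApproxInference(model)
samples = model.simulate(n_samples=1000, show_progress=False)

# State names observed in the samples for the queried variables.
state_names = {var: list(samples[var].unique()) for var in ["HISTORY", "CVP"]}

# Joint distribution over both variables: a single DiscreteFactor.
joint_phi = infer.get_distribution(
    samples, variables=["HISTORY", "CVP"], state_names=state_names, joint=True
)

# Marginal distributions: a dict of DiscreteFactors keyed by variable name.
marginals = infer.get_distribution(
    samples, variables=["HISTORY", "CVP"], state_names=state_names, joint=False
)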
    def query(
        self,
        variables,
        n_samples=int(1e4),
        samples=None,
        evidence=None,
        virtual_evidence=None,
        joint=True,
        state_names=None,
        show_progress=True,
        seed=None,
    ):
        """
        Method for doing approximate inference based on sampling in Bayesian
        Networks and Dynamic Bayesian Networks.

        Parameters
        ----------
        variables: list
            List of variables for which the probability distribution needs to be
            calculated.

        n_samples: int
            The number of samples to generate for computing the distributions.
            Higher `n_samples` results in more accurate results at the cost of
            more computation time.

        samples: pd.DataFrame (default: None)
            If provided, uses these samples to compute the distribution instead
            of generating samples. `samples` **must** conform with the provided
            `evidence` and `virtual_evidence`.

        evidence: dict (default: None)
            The observed values. A dict key, value pair of the form
            {var: state_name}.

        virtual_evidence: list (default: None)
            A list of pgmpy.factors.discrete.TabularCPD representing the
            virtual/soft evidence.

        joint: boolean (default: True)
            If True, computes the joint distribution over `variables`. Else,
            returns a dict with the marginal distribution of each variable in
            `variables`.

        state_names: dict (default: None)
            A dict of state names for each variable in `variables` in the form
            {variable_name: list of states}. If None, inferred from the data, but
            it is possible that the final distribution misses some states.

        show_progress: boolean (default: True)
            If True, shows a progress bar when generating samples.

        seed: int (default: None)
            Sets the seed for the random generators.

        Returns
        -------
        Probability distribution: pgmpy.factors.discrete.DiscreteFactor
            The queried probability distribution.

        Examples
        --------
        >>> from pgmpy.utils import get_example_model
        >>> from pgmpy.inference import ApproxInference
        >>> model = get_example_model("alarm")
        >>> infer = ApproxInference(model)
        >>> infer.query(variables=["HISTORY"])
        <DiscreteFactor representing phi(HISTORY:2) at 0x7f92d9f5b910>
        >>> infer.query(variables=["HISTORY", "CVP"], joint=True)
        <DiscreteFactor representing phi(HISTORY:2, CVP:3) at 0x7f92d9f77610>
        >>> infer.query(variables=["HISTORY", "CVP"], joint=False)
        {'HISTORY': <DiscreteFactor representing phi(HISTORY:2) at 0x7f92dc61eb50>,
         'CVP': <DiscreteFactor representing phi(CVP:3) at 0x7f92d915ec40>}
        """
        # Step 1: If samples are not provided, generate samples for the query.
        if samples is None:
            if isinstance(self.model, BayesianNetwork):
                samples = self.model.simulate(
                    n_samples=n_samples,
                    evidence=evidence,
                    virtual_evidence=virtual_evidence,
                    seed=seed,
                    show_progress=show_progress,
                )
            elif isinstance(self.model, DynamicBayesianNetwork):
                if evidence is None:
                    evidence = dict()
                if virtual_evidence is None:
                    virtual_evidence = []

                # Determine how many time slices are needed to cover all queried
                # variables and all evidence. DBN variables are (name, time_slice)
                # tuples.
                max_time_slices = 0
                for var in variables:
                    if var[1] > max_time_slices:
                        max_time_slices = var[1]
                for var, state in evidence.items():
                    if var[1] > max_time_slices:
                        max_time_slices = var[1]
                for cpd in virtual_evidence:
                    if cpd.variable[1] > max_time_slices:
                        max_time_slices = cpd.variable[1]

                samples = self.model.simulate(
                    n_samples=n_samples,
                    n_time_slices=max_time_slices + 1,
                    evidence=evidence,
                    virtual_evidence=virtual_evidence,
                    show_progress=show_progress,
                    seed=seed,
                )

        # Step 2: If state_names is None, infer it from samples.
        if state_names is None:
            if isinstance(self.model, BayesianNetwork):
                state_names = {
                    var: list(samples.loc[:, var].unique()) for var in variables
                }
            elif isinstance(self.model, DynamicBayesianNetwork):
                state_names = {
                    var: list(samples.loc[:, [var]].iloc[:, 0].unique())
                    for var in variables
                }

        # Step 3: Compute the distributions and return them.
        return self.get_distribution(
            samples, variables=variables, state_names=state_names, joint=joint
        )
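A hedged sketch of a conditional query on the 'alarm' example model: the distribution of HISTORY given observed evidence on CVP, estimated from simulated samples. The evidence state "NORMAL" is assumed from the standard ALARM network and should be replaced with a state that actually exists in the model's CPDs.

from pgmpy.utils import get_example_model
from pgmpy.inference import ApproxInference

model = get_example_model("alarm")
infer = ApproxInference(model)

# Conditional query: P(HISTORY | CVP = "NORMAL"), estimated from 5000 samples
# that are simulated consistently with the evidence.
phi = infer.query(
    variables=["HISTORY"],
    n_samples=5000,
    evidence={"CVP": "NORMAL"},
    show_progress=False,
    seed=42,
)
print(phi)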