Source code for pgmpy.estimators.MmhcEstimator

#!/usr/bin/env python
from pgmpy.utils.mathext import powerset
from pgmpy.base import UndirectedGraph
from pgmpy.models import BayesianModel
from pgmpy.estimators import StructureEstimator, HillClimbSearch, BDeuScore
from pgmpy.independencies import Independencies, IndependenceAssertion
from pgmpy.estimators.CITests import chi_square


class MmhcEstimator(StructureEstimator):
    def __init__(self, data, **kwargs):
        """
        Implements the MMHC hybrid structure estimation procedure for
        learning BayesianModels from discrete data.

        Parameters
        ----------
        data: pandas DataFrame object
            dataframe object where each column represents one variable.
            (If some values in the data are missing the data cells should be set
            to `numpy.NaN`. Note that pandas converts each column containing
            `numpy.NaN`s to dtype `float`.)

        state_names: dict (optional)
            A dict indicating, for each variable, the discrete set of states (or
            values) that the variable can take. If unspecified, the observed
            values in the data set are taken to be the only possible states.

        complete_samples_only: bool (optional, default `True`)
            Specifies how to deal with missing data, if present. If set to
            `True` all rows that contain `np.NaN` somewhere are ignored. If
            `False` then, for each variable, every row where neither the
            variable nor its parents are `np.NaN` is used. This sets the
            behavior of the `state_count`-method.

        References
        ----------
        Tsamardinos et al., The max-min hill-climbing Bayesian network
        structure learning algorithm (2005)
        http://www.dsl-lab.org/supplements/mmhc_paper/paper_online.pdf
        """
        super(MmhcEstimator, self).__init__(data, **kwargs)
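    # Editorial usage note (not from the original module): a minimal,
    # hypothetical construction sketch, assuming the keyword arguments are
    # forwarded to StructureEstimator as documented above; the column names
    # and state values are illustrative.
    #
    #     >>> import numpy as np
    #     >>> import pandas as pd
    #     >>> data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 3)),
    #     ...                     columns=list('ABC'))
    #     >>> est = MmhcEstimator(data,
    #     ...                     state_names={'A': [0, 1], 'B': [0, 1], 'C': [0, 1]},
    #     ...                     complete_samples_only=True)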
    def estimate(self, scoring_method=None, tabu_length=10, significance_level=0.01):
        """
        Estimates a BayesianModel for the data set, using MMHC. First estimates
        a graph skeleton using MMPC and then orients the edges using
        score-based local search (hill climbing).

        Parameters
        ----------
        scoring_method: instance of a scoring method (default: BDeuScore)
            The method to use for scoring during Hill Climb Search. Can be an
            instance of any of the scoring methods implemented in pgmpy.

        tabu_length: int, default: 10
            If provided, the last `tabu_length` graph modifications cannot be
            reversed during the search procedure. This serves to enforce a
            wider exploration of the search space.

        significance_level: float, default: 0.01
            The significance level to use for conditional independence tests
            in the data set. See the `mmpc`-method.

        Returns
        -------
        model: BayesianModel()-instance, not yet parametrized.

        References
        ----------
        Tsamardinos et al., The max-min hill-climbing Bayesian network
        structure learning algorithm (2005), Algorithm 3
        http://www.dsl-lab.org/supplements/mmhc_paper/paper_online.pdf

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import MmhcEstimator
        >>> data = pd.DataFrame(np.random.randint(0, 2, size=(2500, 4)), columns=list('XYZW'))
        >>> data['sum'] = data.sum(axis=1)
        >>> est = MmhcEstimator(data)
        >>> model = est.estimate()
        >>> print(model.edges())
        [('Z', 'sum'), ('X', 'sum'), ('W', 'sum'), ('Y', 'sum')]
        """
        if scoring_method is None:
            scoring_method = BDeuScore(self.data, equivalent_sample_size=10)

        skel = self.mmpc(significance_level)

        hc = HillClimbSearch(self.data, scoring_method=scoring_method)
        model = hc.estimate(
            white_list=skel.to_directed().edges(), tabu_length=tabu_length
        )

        return model
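    # Editorial usage note (not from the original module): a hedged sketch of
    # overriding the default scoring method and search parameters; the values
    # below are illustrative, not recommendations.
    #
    #     >>> from pgmpy.estimators import MmhcEstimator, BDeuScore
    #     >>> est = MmhcEstimator(data)
    #     >>> model = est.estimate(
    #     ...     scoring_method=BDeuScore(data, equivalent_sample_size=5),
    #     ...     tabu_length=50,
    #     ...     significance_level=0.05,
    #     ... )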
    def mmpc(self, significance_level=0.01):
        """
        Estimates a graph skeleton (UndirectedGraph) for the data set, using
        the MMPC (max-min parents-and-children) algorithm.

        Parameters
        ----------
        significance_level: float, default=0.01
            The significance level to use for conditional independence tests
            in the data set. `significance_level` is the desired Type 1 error
            probability of falsely rejecting the null hypothesis that
            variables are independent, given that they are. The lower
            `significance_level`, the less likely we are to accept
            dependencies, resulting in a sparser graph.

        Returns
        -------
        skeleton: UndirectedGraph
            An estimate for the undirected graph skeleton of the BN underlying
            the data.

        References
        ----------
        Tsamardinos et al., The max-min hill-climbing Bayesian network
        structure learning algorithm (2005), Algorithms 1 & 2
        http://www.dsl-lab.org/supplements/mmhc_paper/paper_online.pdf

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import MmhcEstimator
        >>>
        >>> data = pd.DataFrame(np.random.randint(0, 2, size=(5000, 5)), columns=list('ABCDE'))
        >>> data['F'] = data['A'] + data['B'] + data['C']
        >>> est = MmhcEstimator(data)
        >>> skel = est.mmpc()
        >>> skel.edges()
        [('A', 'F'), ('B', 'F'), ('C', 'F')]
        >>>
        >>> data = pd.DataFrame(np.random.randint(0, 2, size=(5000, 3)), columns=list('XYZ'))
        >>> data['X'] += data['Z']
        >>> data['Y'] += data['Z']
        >>> est = MmhcEstimator(data)
        >>> # X, Y dependent, but conditionally independent given Z:
        >>> skel = est.mmpc()
        >>> skel.edges()
        [('X', 'Z'), ('Y', 'Z')]
        """
        nodes = self.state_names.keys()

        def assoc(X, Y, Zs):
            """Measure for (conditional) association between variables. Uses
            one minus the p-value of a chi-square independence test, so larger
            values indicate stronger observed dependence."""
            return 1 - chi_square(X, Y, Zs, self.data)[1]

        def min_assoc(X, Y, Zs):
            "Minimal association of X, Y given any subset of Zs."
            return min(assoc(X, Y, Zs_subset) for Zs_subset in powerset(Zs))

        def max_min_heuristic(X, Zs):
            "Finds the variable Y that maximizes min_assoc(X, Y, Zs)."
            max_min_assoc = 0
            best_Y = None

            for Y in set(nodes) - set(Zs + [X]):
                min_assoc_val = min_assoc(X, Y, Zs)
                if min_assoc_val >= max_min_assoc:
                    best_Y = Y
                    max_min_assoc = min_assoc_val

            return (best_Y, max_min_assoc)

        # Find parents and children for each node
        neighbors = dict()
        for node in nodes:
            neighbors[node] = []

            # Forward Phase: greedily add the candidate with the largest
            # minimal association, until no associated candidate remains.
            while True:
                new_neighbor, new_neighbor_min_assoc = max_min_heuristic(
                    node, neighbors[node]
                )
                if new_neighbor_min_assoc > 0:
                    neighbors[node].append(new_neighbor)
                else:
                    break

            # Backward Phase: drop neighbors that some subset of the other
            # neighbors renders conditionally independent of `node`. Iterate
            # over a copy, since removing from the list being iterated over
            # would skip elements.
            for neigh in list(neighbors[node]):
                other_neighbors = [n for n in neighbors[node] if n != neigh]
                for sep_set in powerset(other_neighbors):
                    if self.test_conditional_independence(node, neigh, sep_set):
                        neighbors[node].remove(neigh)
                        break

        # Correct for false positives: keep an edge only if both endpoints
        # selected each other (again iterating over a copy while removing).
        for node in nodes:
            for neigh in list(neighbors[node]):
                if node not in neighbors[neigh]:
                    neighbors[node].remove(neigh)

        skel = UndirectedGraph()
        skel.add_nodes_from(nodes)
        for node in nodes:
            skel.add_edges_from([(node, neigh) for neigh in neighbors[node]])

        return skel
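
if __name__ == "__main__":
    # Editorial demonstration (not part of the original module): mirrors what
    # estimate() does internally -- learn a skeleton with mmpc(), then orient
    # its edges with a skeleton-restricted hill climb. The data set and the
    # printed edges are illustrative and depend on the random draw.
    import numpy as np
    import pandas as pd

    data = pd.DataFrame(
        np.random.randint(0, 2, size=(2500, 4)), columns=list("XYZW")
    )
    data["sum"] = data.sum(axis=1)

    est = MmhcEstimator(data)
    skeleton = est.mmpc(significance_level=0.01)

    hc = HillClimbSearch(
        data, scoring_method=BDeuScore(data, equivalent_sample_size=10)
    )
    model = hc.estimate(white_list=skeleton.to_directed().edges(), tabu_length=10)
    print(sorted(model.edges()))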