Source code for pgmpy.estimators.MmhcEstimator

#!/usr/bin/env python
from pgmpy.base import UndirectedGraph
from pgmpy.estimators import BDeuScore, HillClimbSearch, StructureEstimator
from pgmpy.estimators.CITests import chi_square
from pgmpy.independencies import IndependenceAssertion, Independencies
from pgmpy.models import BayesianNetwork
from pgmpy.utils.mathext import powerset


class MmhcEstimator(StructureEstimator):
    """
    Implements the MMHC hybrid structure estimation procedure for
    learning BayesianNetworks from discrete data.

    Parameters
    ----------
    data: pandas DataFrame object
        dataframe object where each column represents one variable.
        (If some values in the data are missing the data cells should be set to
        `numpy.NaN`. Note that pandas converts each column containing
        `numpy.NaN`s to dtype `float`.)

    state_names: dict (optional)
        A dict indicating, for each variable, the discrete set of states (or values)
        that the variable can take. If unspecified, the observed values in the data
        set are taken to be the only possible states.

    References
    ----------
    Tsamardinos et al., The max-min hill-climbing Bayesian network structure
    learning algorithm (2005)
    http://www.dsl-lab.org/supplements/mmhc_paper/paper_online.pdf
    """

    def __init__(self, data, **kwargs):
        super(MmhcEstimator, self).__init__(data, **kwargs)
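A quick aside on construction (a minimal sketch, not part of the module source): since keyword arguments are forwarded to `StructureEstimator`, a `state_names` dict can declare states that never occur in the sample. The column names and state sets below are illustrative only.

import pandas as pd

from pgmpy.estimators import MmhcEstimator

# Hypothetical toy data; state 2 of "A" is declared but never observed.
data = pd.DataFrame({"A": [0, 0, 1, 1], "B": [0, 1, 0, 1]})
est = MmhcEstimator(data, state_names={"A": [0, 1, 2], "B": [0, 1]})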
    def estimate(self, scoring_method=None, tabu_length=10, significance_level=0.01):
        """
        Estimates a BayesianNetwork for the data set, using MMHC. First estimates a
        graph skeleton using MMPC and then orients the edges using score-based local
        search (hill climbing).

        Parameters
        ----------
        significance_level: float, default: 0.01
            The significance level to use for conditional independence tests in
            the data set. See the `mmpc` method.

        scoring_method: instance of a Scoring method (default: BDeuScore)
            The method to use for scoring during Hill Climb Search. Can be an
            instance of any of the scoring methods implemented in pgmpy.

        tabu_length: int
            If provided, the last `tabu_length` graph modifications cannot be
            reversed during the search procedure. This serves to enforce a wider
            exploration of the search space. Default value: 10.

        Returns
        -------
        Estimated model: pgmpy.base.DAG
            The estimated model without the parameterization.

        References
        ----------
        Tsamardinos et al., The max-min hill-climbing Bayesian network structure
        learning algorithm (2005), Algorithm 3
        http://www.dsl-lab.org/supplements/mmhc_paper/paper_online.pdf

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import MmhcEstimator
        >>> data = pd.DataFrame(np.random.randint(0, 2, size=(2500, 4)), columns=list('XYZW'))
        >>> data['sum'] = data.sum(axis=1)
        >>> est = MmhcEstimator(data)
        >>> model = est.estimate()
        >>> print(model.edges())
        [('Z', 'sum'), ('X', 'sum'), ('W', 'sum'), ('Y', 'sum')]
        """
        if scoring_method is None:
            scoring_method = BDeuScore(self.data, equivalent_sample_size=10)

        skel = self.mmpc(significance_level)
        hc = HillClimbSearch(self.data)
        model = hc.estimate(
            scoring_method=scoring_method,
            white_list=skel.to_directed().edges(),
            tabu_length=tabu_length,
        )
        return model
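The `scoring_method` argument accepts any pgmpy structure score, not just the BDeu default. As a minimal sketch (not part of the module source), the following reruns the docstring example with `BicScore` swapped in; the recovered edges depend on the random draw.

import numpy as np
import pandas as pd

from pgmpy.estimators import BicScore, MmhcEstimator

data = pd.DataFrame(np.random.randint(0, 2, size=(2500, 4)), columns=list("XYZW"))
data["sum"] = data.sum(axis=1)

est = MmhcEstimator(data)
# Score the hill-climbing phase with BIC instead of BDeu.
model = est.estimate(scoring_method=BicScore(data), tabu_length=10)
print(sorted(model.edges()))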
    def mmpc(self, significance_level=0.01):
        """Estimates a graph skeleton (UndirectedGraph) for the data set, using
        the MMPC (max-min parents-and-children) algorithm.

        Parameters
        ----------
        significance_level: float, default=0.01
            The significance level to use for conditional independence tests in
            the data set. `significance_level` is the desired Type 1 error
            probability of falsely rejecting the null hypothesis that variables
            are independent, given that they are. The lower `significance_level`,
            the less likely we are to accept dependencies, resulting in a
            sparser graph.

        Returns
        -------
        skeleton: pgmpy.base.UndirectedGraph
            An estimate for the undirected graph skeleton of the BN underlying
            the data.

        References
        ----------
        Tsamardinos et al., The max-min hill-climbing Bayesian network structure
        learning algorithm (2005), Algorithm 1 & 2
        http://www.dsl-lab.org/supplements/mmhc_paper/paper_online.pdf

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import MmhcEstimator
        >>> data = pd.DataFrame(np.random.randint(0, 2, size=(5000, 5)), columns=list('ABCDE'))
        >>> data['F'] = data['A'] + data['B'] + data['C']
        >>> est = MmhcEstimator(data)
        >>> skel = est.mmpc()
        >>> skel.edges()
        [('A', 'F'), ('B', 'F'), ('C', 'F')]
        >>> data = pd.DataFrame(np.random.randint(0, 2, size=(5000, 3)), columns=list('XYZ'))
        >>> data['X'] += data['Z']
        >>> data['Y'] += data['Z']
        >>> est = MmhcEstimator(data)
        >>> skel = est.mmpc()
        >>> # X, Y dependent, but conditionally independent given Z:
        >>> skel.edges()
        [('X', 'Z'), ('Y', 'Z')]
        """
        nodes = self.state_names.keys()

        def assoc(X, Y, Zs):
            """Measure for (conditional) association between variables.
            Uses one minus the p-value of a chi-square independence test,
            so larger values indicate stronger association."""
            return 1 - chi_square(X, Y, Zs, self.data, boolean=False)[1]

        def min_assoc(X, Y, Zs):
            "Minimal association of X, Y given any subset of Zs."
            return min(assoc(X, Y, Zs_subset) for Zs_subset in powerset(Zs))

        def max_min_heuristic(X, Zs):
            "Finds the variable that maximizes min_assoc with `X`, given the current neighbor set `Zs`."
            max_min_assoc = 0
            best_Y = None
            for Y in set(nodes) - set(Zs + [X]):
                min_assoc_val = min_assoc(X, Y, Zs)
                if min_assoc_val >= max_min_assoc:
                    best_Y = Y
                    max_min_assoc = min_assoc_val
            return (best_Y, max_min_assoc)

        # Find parents and children for each node
        neighbors = dict()
        for node in nodes:
            neighbors[node] = []

            # Forward Phase: greedily add the candidate with the largest
            # minimal association until no dependent candidate remains.
            while True:
                new_neighbor, new_neighbor_min_assoc = max_min_heuristic(
                    node, neighbors[node]
                )
                if new_neighbor_min_assoc > 0:
                    neighbors[node].append(new_neighbor)
                else:
                    break

            # Backward Phase: remove candidates that are independent of `node`
            # given some subset of the other candidates. Iterate over a copy,
            # since the list is modified during iteration.
            for neigh in neighbors[node][:]:
                other_neighbors = [n for n in neighbors[node] if n != neigh]
                for sep_set in powerset(other_neighbors):
                    if chi_square(
                        X=node,
                        Y=neigh,
                        Z=sep_set,
                        data=self.data,
                        significance_level=significance_level,
                    ):
                        neighbors[node].remove(neigh)
                        break

        # Correct for false positives: keep an edge only if both endpoints
        # list each other as neighbors (again iterating over a copy).
        for node in nodes:
            for neigh in neighbors[node][:]:
                if node not in neighbors[neigh]:
                    neighbors[node].remove(neigh)

        skel = UndirectedGraph()
        skel.add_nodes_from(nodes)
        for node in nodes:
            skel.add_edges_from([(node, neigh) for neigh in neighbors[node]])

        return skel
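For reference, the two stages that `estimate` chains together can be run by hand. The sketch below mirrors the method body above on the toy data from the `mmpc` docstring: MMPC restricts the candidate edges, then `HillClimbSearch` orients them under that `white_list`. Output depends on the random draw.

import numpy as np
import pandas as pd

from pgmpy.estimators import BDeuScore, HillClimbSearch, MmhcEstimator

data = pd.DataFrame(np.random.randint(0, 2, size=(5000, 3)), columns=list("XYZ"))
data["X"] += data["Z"]
data["Y"] += data["Z"]

est = MmhcEstimator(data)
skeleton = est.mmpc(significance_level=0.01)  # constraint-based phase
print(sorted(skeleton.edges()))               # e.g. [('X', 'Z'), ('Y', 'Z')]

# Score-based phase: hill climbing restricted to the skeleton's edges.
hc = HillClimbSearch(data)
dag = hc.estimate(
    scoring_method=BDeuScore(data, equivalent_sample_size=10),
    white_list=skeleton.to_directed().edges(),
)
print(sorted(dag.edges()))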