Source code for pgmpy.estimators.HillClimbSearch

#!/usr/bin/env python
from itertools import permutations

import networkx as nx

from pgmpy.estimators import StructureEstimator, K2Score
from pgmpy.base import DAG

[docs]class HillClimbSearch(StructureEstimator): def __init__(self, data, scoring_method=None, **kwargs): """ Class for heuristic hill climb searches for DAGs, to learn network structure from data. `estimate` attempts to find a model with optimal score. Parameters ---------- data: pandas DataFrame object datafame object where each column represents one variable. (If some values in the data are missing the data cells should be set to `numpy.NaN`. Note that pandas converts each column containing `numpy.NaN`s to dtype `float`.) scoring_method: Instance of a `StructureScore`-subclass (`K2Score` is used as default) An instance of `K2Score`, `BDeuScore`, or `BicScore`. This score is optimized during structure estimation by the `estimate`-method. state_names: dict (optional) A dict indicating, for each variable, the discrete set of states (or values) that the variable can take. If unspecified, the observed values in the data set are taken to be the only possible states. complete_samples_only: bool (optional, default `True`) Specifies how to deal with missing data, if present. If set to `True` all rows that contain `np.Nan` somewhere are ignored. If `False` then, for each variable, every row where neither the variable nor its parents are `np.NaN` is used. This sets the behavior of the `state_count`-method. References ---------- Koller & Friedman, Probabilistic Graphical Models - Principles and Techniques, 2009 Section 18.4.3 (page 811ff) """ if scoring_method is not None: self.scoring_method = scoring_method else: self.scoring_method = K2Score(data, **kwargs) super(HillClimbSearch, self).__init__(data, **kwargs) def _legal_operations( self, model, tabu_list=[], max_indegree=None, black_list=None, white_list=None ): """Generates a list of legal (= not in tabu_list) graph modifications for a given model, together with their score changes. Possible graph modifications: (1) add, (2) remove, or (3) flip a single edge. For details on scoring see Koller & Friedman, Probabilistic Graphical Models, Section (page 818). If a number `max_indegree` is provided, only modifications that keep the number of parents for each node below `max_indegree` are considered. A list of edges can optionally be passed as `black_list` or `white_list` to exclude those edges or to limit the search. """ local_score = self.scoring_method.local_score nodes = self.state_names.keys() potential_new_edges = ( set(permutations(nodes, 2)) - set(model.edges()) - set([(Y, X) for (X, Y) in model.edges()]) ) for (X, Y) in potential_new_edges: # (1) add single edge if nx.is_directed_acyclic_graph(nx.DiGraph(list(model.edges()) + [(X, Y)])): operation = ("+", (X, Y)) if ( operation not in tabu_list and (black_list is None or (X, Y) not in black_list) and (white_list is None or (X, Y) in white_list) ): old_parents = model.get_parents(Y) new_parents = old_parents + [X] if max_indegree is None or len(new_parents) <= max_indegree: score_delta = local_score(Y, new_parents) - local_score( Y, old_parents ) yield (operation, score_delta) for (X, Y) in model.edges(): # (2) remove single edge operation = ("-", (X, Y)) if operation not in tabu_list: old_parents = model.get_parents(Y) new_parents = old_parents[:] new_parents.remove(X) score_delta = local_score(Y, new_parents) - local_score(Y, old_parents) yield (operation, score_delta) for (X, Y) in model.edges(): # (3) flip single edge new_edges = list(model.edges()) + [(Y, X)] new_edges.remove((X, Y)) if nx.is_directed_acyclic_graph(nx.DiGraph(new_edges)): operation = ("flip", (X, Y)) if ( operation not in tabu_list and ("flip", (Y, X)) not in tabu_list and (black_list is None or (X, Y) not in black_list) and (white_list is None or (X, Y) in white_list) ): old_X_parents = model.get_parents(X) old_Y_parents = model.get_parents(Y) new_X_parents = old_X_parents + [Y] new_Y_parents = old_Y_parents[:] new_Y_parents.remove(X) if max_indegree is None or len(new_X_parents) <= max_indegree: score_delta = ( local_score(X, new_X_parents) + local_score(Y, new_Y_parents) - local_score(X, old_X_parents) - local_score(Y, old_Y_parents) ) yield (operation, score_delta)
[docs] def estimate( self, start=None, tabu_length=0, max_indegree=None, black_list=None, white_list=None, epsilon=1e-4, max_iter=1e6, ): """ Performs local hill climb search to estimates the `DAG` structure that has optimal score, according to the scoring method supplied in the constructor. Starts at model `start` and proceeds by step-by-step network modifications until a local maximum is reached. Only estimates network structure, no parametrization. Parameters ---------- start: DAG instance The starting point for the local search. By default a completely disconnected network is used. tabu_length: int If provided, the last `tabu_length` graph modifications cannot be reversed during the search procedure. This serves to enforce a wider exploration of the search space. Default value: 100. max_indegree: int or None If provided and unequal None, the procedure only searches among models where all nodes have at most `max_indegree` parents. Defaults to None. black_list: list or None If a list of edges is provided as `black_list`, they are excluded from the search and the resulting model will not contain any of those edges. Default: None white_list: list or None If a list of edges is provided as `white_list`, the search is limited to those edges. The resulting model will then only contain edges that are in `white_list`. Default: None epsilon: float (default: 1e-4) Defines the exit condition. If the improvement in score is less than `epsilon`, the learned model is returned. max_iter: int (default: 1e6) The maximum number of iterations allowed. Returns the learned model when the number of iterations is greater than `max_iter`. Returns ------- model: `DAG` instance A `DAG` at a (local) score maximum. Examples -------- >>> import pandas as pd >>> import numpy as np >>> from pgmpy.estimators import HillClimbSearch, BicScore >>> # create data sample with 9 random variables: ... data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 9)), columns=list('ABCDEFGHI')) >>> # add 10th dependent variable ... data['J'] = data['A'] * data['B'] >>> est = HillClimbSearch(data, scoring_method=BicScore(data)) >>> best_model = est.estimate() >>> sorted(best_model.nodes()) ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'] >>> best_model.edges() [('B', 'J'), ('A', 'J')] >>> # search a model with restriction on the number of parents: >>> est.estimate(max_indegree=1).edges() [('J', 'A'), ('B', 'J')] """ nodes = self.state_names.keys() if start is None: start = DAG() start.add_nodes_from(nodes) elif not isinstance(start, DAG) or not set(start.nodes()) == set(nodes): raise ValueError( "'start' should be a DAG with the same variables as the data set, or 'None'." ) tabu_list = [] current_model = start iter_no = 0 while iter_no <= max_iter: iter_no += 1 best_score_delta = 0 best_operation = None for operation, score_delta in self._legal_operations( current_model, tabu_list, max_indegree, black_list, white_list ): if score_delta > best_score_delta: best_operation = operation best_score_delta = score_delta if best_operation is None or best_score_delta < epsilon: break elif best_operation[0] == "+": current_model.add_edge(*best_operation[1]) tabu_list = ([("-", best_operation[1])] + tabu_list)[:tabu_length] elif best_operation[0] == "-": current_model.remove_edge(*best_operation[1]) tabu_list = ([("+", best_operation[1])] + tabu_list)[:tabu_length] elif best_operation[0] == "flip": X, Y = best_operation[1] current_model.remove_edge(X, Y) current_model.add_edge(Y, X) tabu_list = ([best_operation] + tabu_list)[:tabu_length] return current_model