Source code for pgmpy.estimators.TreeSearch

#!/usr/bin/env python

from itertools import combinations

import networkx as nx
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.metrics import (
    adjusted_mutual_info_score,
    mutual_info_score,
    normalized_mutual_info_score,
)
from tqdm.auto import tqdm

from pgmpy import config
from pgmpy.base import DAG
from pgmpy.estimators import StructureEstimator


class TreeSearch(StructureEstimator):
    """
    Search class for learning tree-related graph structures. The algorithms
    supported are Chow-Liu and Tree-Augmented Naive Bayes (TAN).

    Chow-Liu constructs the maximum-weight spanning tree with mutual
    information score as edge weights.

    TAN is an extension of the Naive Bayes classifier that allows a tree
    structure over the independent variables to account for interactions
    between them.

    Parameters
    ----------
    data: pandas.DataFrame object
        dataframe object where each column represents one variable.

    root_node: str, int, or any hashable python object, default is None.
        The root node of the tree structure. If None, the root node is
        auto-picked as the node with the highest sum of edge weights.

    n_jobs: int (default: -1)
        Number of jobs to run in parallel. `-1` means use all processors.

    References
    ----------
    [1] Chow, C. K.; Liu, C. N. (1968), "Approximating discrete probability
        distributions with dependence trees", IEEE Transactions on Information
        Theory, IT-14 (3): 462–467
    [2] Friedman, N.; Geiger, D.; Goldszmidt, M. (1997), "Bayesian network
        classifiers", Machine Learning 29: 131–163
    """

    def __init__(self, data, root_node=None, n_jobs=-1, **kwargs):
        if root_node is not None and root_node not in data.columns:
            raise ValueError(f"Root node: {root_node} not found in data columns.")

        self.data = data
        self.root_node = root_node
        self.n_jobs = n_jobs
        super(TreeSearch, self).__init__(data, **kwargs)
    def estimate(
        self,
        estimator_type="chow-liu",
        class_node=None,
        edge_weights_fn="mutual_info",
        show_progress=True,
    ):
        """
        Estimate the `DAG` structure that fits best to the given data set
        without parametrization.

        Parameters
        ----------
        estimator_type: str (chow-liu | tan)
            The algorithm to use for estimating the DAG.

        class_node: str, int or any hashable python object (optional)
            Needed only if estimator_type = 'tan'. In the estimated DAG, there
            would be edges from class_node to each of the feature variables.

        edge_weights_fn: str or function (default: mutual_info)
            Method to use for computing edge weights. By default, the Mutual
            Info Score is used.

        show_progress: boolean
            If True, shows a progress bar for the running algorithm.

        Returns
        -------
        Estimated Model: pgmpy.base.DAG
            The estimated model structure.

        Examples
        --------
        >>> import numpy as np
        >>> import pandas as pd
        >>> import networkx as nx
        >>> import matplotlib.pyplot as plt
        >>> from pgmpy.estimators import TreeSearch
        >>> values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
        ...                       columns=['A', 'B', 'C', 'D', 'E'])
        >>> # Chow-Liu with an explicitly specified root node
        >>> est = TreeSearch(values, root_node='B')
        >>> model = est.estimate(estimator_type='chow-liu')
        >>> nx.draw_circular(model, with_labels=True, arrowsize=20,
        ...                  arrowstyle='fancy', alpha=0.3)
        >>> plt.show()
        >>> # Chow-Liu with an auto-picked root node
        >>> est = TreeSearch(values)
        >>> model = est.estimate(estimator_type='chow-liu')
        >>> nx.draw_circular(model, with_labels=True, arrowsize=20,
        ...                  arrowstyle='fancy', alpha=0.3)
        >>> plt.show()
        >>> # TAN with an explicitly specified root node
        >>> est = TreeSearch(values, root_node='B')
        >>> model = est.estimate(estimator_type='tan', class_node='A')
        >>> nx.draw_circular(model, with_labels=True, arrowsize=20,
        ...                  arrowstyle='fancy', alpha=0.3)
        >>> plt.show()
        >>> # TAN with an auto-picked root node
        >>> est = TreeSearch(values)
        >>> model = est.estimate(estimator_type='tan', class_node='A')
        >>> nx.draw_circular(model, with_labels=True, arrowsize=20,
        ...                  arrowstyle='fancy', alpha=0.3)
        >>> plt.show()
        """
        # Step 1: Argument checks.
        # Step 1.1: Only chow-liu and tan are allowed as estimator_type.
        if estimator_type not in {"chow-liu", "tan"}:
            raise ValueError(
                f"Invalid estimator_type. Expected either chow-liu or tan. Got: {estimator_type}"
            )

        # Step 1.2: If estimator_type='tan', class_node must be specified and
        #           must be one of the data columns.
        if estimator_type == "tan" and class_node is None:
            raise ValueError(
                "class_node argument must be specified for estimator_type='tan'"
            )
        if estimator_type == "tan" and class_node not in self.data.columns:
            raise ValueError(f"Class node: {class_node} not found in data columns")

        # Step 1.3: If root_node isn't specified, pick the node with the highest
        #           sum of edge weights.
        weights_computed = False
        if self.root_node is None:
            weights = TreeSearch._get_weights(
                self.data, edge_weights_fn, self.n_jobs, show_progress
            )
            weights_computed = True
            sum_weights = weights.sum(axis=0)
            maxw_idx = np.argsort(sum_weights)[::-1]
            self.root_node = self.data.columns[maxw_idx[0]]

        # Step 2: Compute all edge weights.
        if estimator_type == "chow-liu":
            if not weights_computed:
                weights = TreeSearch._get_weights(
                    self.data, edge_weights_fn, self.n_jobs, show_progress
                )
        else:
            weights = TreeSearch._get_conditional_weights(
                self.data, class_node, edge_weights_fn, self.n_jobs, show_progress
            )

        # Step 3: If estimator_type = "chow-liu", estimate the DAG and return.
        if estimator_type == "chow-liu":
            return TreeSearch._create_tree_and_dag(
                weights, self.data.columns, self.root_node
            )

        # Step 4: If estimator_type = "tan":
        elif estimator_type == "tan":
            # Step 4.1: Check that root_node != class_node.
            if self.root_node == class_node:
                raise ValueError(
                    f"Root node: {self.root_node} and class node: {class_node} are identical"
                )

            # Step 4.2: Construct a chow-liu DAG over {data.columns - class_node}.
            class_node_idx = np.where(self.data.columns == class_node)[0][0]
            weights = np.delete(weights, class_node_idx, axis=0)
            weights = np.delete(weights, class_node_idx, axis=1)
            reduced_columns = np.delete(self.data.columns, class_node_idx)
            D = TreeSearch._create_tree_and_dag(
                weights, reduced_columns, self.root_node
            )

            # Step 4.3: Add edges from class_node to all other nodes.
            D.add_edges_from([(class_node, node) for node in reduced_columns])
            return D
    @staticmethod
    def _get_weights(
        data, edge_weights_fn="mutual_info", n_jobs=-1, show_progress=True
    ):
        """
        Helper function for the Chow-Liu algorithm to estimate a tree structure
        from the given data. Refer to pgmpy.estimators.TreeSearch for more
        details. This function returns the edge weights matrix.

        Parameters
        ----------
        data: pandas.DataFrame object
            dataframe object where each column represents one variable.

        edge_weights_fn: str or function (default: mutual_info)
            Method to use for computing edge weights. Options are:
                1. 'mutual_info': Mutual Information Score.
                2. 'adjusted_mutual_info': Adjusted Mutual Information Score.
                3. 'normalized_mutual_info': Normalized Mutual Information Score.
                4. function(array[n_samples,], array[n_samples,]): Custom function.

        n_jobs: int (default: -1)
            Number of jobs to run in parallel. `-1` means use all processors.

        show_progress: boolean
            If True, shows a progress bar for the running algorithm.

        Returns
        -------
        weights: numpy 2D array, shape = (n_columns, n_columns)
            Symmetric matrix where each element represents an edge weight.

        Examples
        --------
        >>> import numpy as np
        >>> import pandas as pd
        >>> from pgmpy.estimators import TreeSearch
        >>> values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
        ...                       columns=['A', 'B', 'C', 'D', 'E'])
        >>> est = TreeSearch(values, root_node='B')
        >>> model = est.estimate(estimator_type='chow-liu')
        """
        # Step 0: Resolve the edge weight computation method.
        if edge_weights_fn == "mutual_info":
            edge_weights_fn = mutual_info_score
        elif edge_weights_fn == "adjusted_mutual_info":
            edge_weights_fn = adjusted_mutual_info_score
        elif edge_weights_fn == "normalized_mutual_info":
            edge_weights_fn = normalized_mutual_info_score
        elif not callable(edge_weights_fn):
            raise ValueError(
                f"edge_weights_fn should either be 'mutual_info', 'adjusted_mutual_info', "
                f"'normalized_mutual_info' or a function of the form fun(array, array). Got: {edge_weights_fn}"
            )

        # Step 1: Compute edge weights for a fully connected graph.
        n_vars = len(data.columns)
        pbar = combinations(data.columns, 2)
        if show_progress and config.SHOW_PROGRESS:
            pbar = tqdm(pbar, total=(n_vars * (n_vars - 1) / 2), desc="Building tree")

        vals = Parallel(n_jobs=n_jobs)(
            delayed(edge_weights_fn)(data.loc[:, u], data.loc[:, v]) for u, v in pbar
        )
        weights = np.zeros((n_vars, n_vars))
        indices = np.triu_indices(n_vars, k=1)
        weights[indices] = vals
        weights.T[indices] = vals
        return weights

    @staticmethod
    def _get_conditional_weights(
        data, class_node, edge_weights_fn="mutual_info", n_jobs=-1, show_progress=True
    ):
        """
        Helper function for the TAN (Tree-Augmented Naive Bayes) algorithm to
        estimate a tree structure from the given data. Refer to
        pgmpy.estimators.TreeSearch for more details. This function returns the
        edge weights matrix.

        Parameters
        ----------
        data: pandas.DataFrame object
            dataframe object where each column represents one variable.

        class_node: str
            The class node for TAN. The edge weight is computed as
            I(X, Y | class_node).

        edge_weights_fn: str or function (default: mutual_info)
            Method to use for computing edge weights. Options are:
                1. 'mutual_info': Mutual Information Score.
                2. 'adjusted_mutual_info': Adjusted Mutual Information Score.
                3. 'normalized_mutual_info': Normalized Mutual Information Score.
                4. function(array[n_samples,], array[n_samples,]): Custom function.

        n_jobs: int (default: -1)
            Number of jobs to run in parallel. `-1` means use all processors.

        show_progress: boolean
            If True, shows a progress bar for the running algorithm.

        Returns
        -------
        weights: numpy 2D array, shape = (n_columns, n_columns)
            Symmetric matrix where each element represents an edge weight.

        Examples
        --------
        >>> import numpy as np
        >>> import pandas as pd
        >>> from pgmpy.estimators import TreeSearch
        >>> values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
        ...                       columns=['A', 'B', 'C', 'D', 'E'])
        >>> est = TreeSearch(values, root_node='B')
        >>> model = est.estimate(estimator_type='tan', class_node='A')
        """
        # Step 0: Resolve the edge weight computation method.
        if edge_weights_fn == "mutual_info":
            edge_weights_fn = mutual_info_score
        elif edge_weights_fn == "adjusted_mutual_info":
            edge_weights_fn = adjusted_mutual_info_score
        elif edge_weights_fn == "normalized_mutual_info":
            edge_weights_fn = normalized_mutual_info_score
        elif not callable(edge_weights_fn):
            raise ValueError(
                f"edge_weights_fn should either be 'mutual_info', 'adjusted_mutual_info', "
                f"'normalized_mutual_info' or a function of the form fun(array, array). Got: {edge_weights_fn}"
            )

        # Step 1: Compute conditional edge weights for a fully connected graph.
        n_vars = len(data.columns)
        pbar = combinations(data.columns, 2)
        if show_progress and config.SHOW_PROGRESS:
            pbar = tqdm(pbar, total=(n_vars * (n_vars - 1) / 2), desc="Building tree")

        def _conditional_edge_weights_fn(u, v):
            """
            Computes the conditional edge weight of variables u and v
            conditioned on class_node.
            """
            cond_marginal = data.loc[:, class_node].value_counts() / data.shape[0]
            cond_edge_weight = 0
            for index, marg_prob in cond_marginal.items():
                df_cond_subset = data[data.loc[:, class_node] == index]
                cond_edge_weight += marg_prob * edge_weights_fn(
                    df_cond_subset.loc[:, u], df_cond_subset.loc[:, v]
                )
            return cond_edge_weight

        vals = Parallel(n_jobs=n_jobs)(
            delayed(_conditional_edge_weights_fn)(u, v) for u, v in pbar
        )
        weights = np.zeros((n_vars, n_vars))
        indices = np.triu_indices(n_vars, k=1)
        weights[indices] = vals
        weights.T[indices] = vals
        return weights

    @staticmethod
    def _create_tree_and_dag(weights, columns, root_node):
        """
        Helper function for the Chow-Liu algorithm to estimate a tree structure
        from the given data. Refer to pgmpy.estimators.TreeSearch for more
        details. This function returns the DAG built from the edge weights
        matrix.

        Parameters
        ----------
        weights: numpy 2D array, shape = (n_columns, n_columns)
            Symmetric matrix where each element represents an edge weight.

        columns: list or array
            Names of the columns (& rows) of the weights matrix.

        root_node: str, int, or any hashable python object.
            The root node of the tree structure.

        Returns
        -------
        model: pgmpy.base.DAG
            The estimated model structure.

        Examples
        --------
        >>> import numpy as np
        >>> import pandas as pd
        >>> from pgmpy.estimators import TreeSearch
        >>> values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
        ...                       columns=['A', 'B', 'C', 'D', 'E'])
        >>> est = TreeSearch(values, root_node='B')
        >>> model = est.estimate(estimator_type='chow-liu')
        """
        # Step 1: Compute the maximum spanning tree using the weights.
        T = nx.maximum_spanning_tree(
            nx.from_pandas_adjacency(
                pd.DataFrame(weights, index=columns, columns=columns),
                create_using=nx.Graph,
            )
        )

        # Step 2: Create the DAG by directing edges away from the root node and
        #         return it.
        D = nx.bfs_tree(T, root_node)
        return DAG(D)
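The sketch below is illustrative only and not part of pgmpy. It walks through the same Chow-Liu steps as the methods above, but inline and without the parallelism or progress bar: build the pairwise mutual-information weight matrix (as _get_weights does), auto-pick a root node the way estimate() does when root_node is None, compute the maximum-weight spanning tree, and orient the edges away from the root with a BFS traversal (as _create_tree_and_dag does). The names demo_values and demo_root are assumptions introduced only for this example.

# Illustrative sketch only (not part of pgmpy): the Chow-Liu steps done inline.
import networkx as nx
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.metrics import mutual_info_score

demo_values = pd.DataFrame(
    np.random.randint(low=0, high=2, size=(1000, 5)),
    columns=["A", "B", "C", "D", "E"],
)

# Pairwise mutual-information weights for every unordered pair of columns.
n_vars = len(demo_values.columns)
weights = np.zeros((n_vars, n_vars))
for i, j in combinations(range(n_vars), 2):
    mi = mutual_info_score(demo_values.iloc[:, i], demo_values.iloc[:, j])
    weights[i, j] = weights[j, i] = mi

# Auto-pick the root as the node with the highest total edge weight,
# mirroring what estimate() does when root_node is None.
demo_root = demo_values.columns[np.argmax(weights.sum(axis=0))]

# Maximum-weight spanning tree over the weighted complete graph.
adjacency = pd.DataFrame(
    weights, index=demo_values.columns, columns=demo_values.columns
)
tree = nx.maximum_spanning_tree(nx.from_pandas_adjacency(adjacency))

# Orient every edge away from the root via BFS to obtain the directed tree.
directed_tree = nx.bfs_tree(tree, demo_root)
print(sorted(directed_tree.edges()))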
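For TAN, the structural differences are the edge weights and the class node: _get_conditional_weights approximates I(X, Y | C) as the class-marginal-weighted average of the per-class mutual information scores, and estimate() then adds an edge from the class node to every feature. A minimal sketch of that computation follows; it continues from the previous example (reusing demo_values and its imports), and the choice of 'A' as the class column and of tan_root as the root are assumptions made only for illustration.

# Illustrative sketch only (not part of pgmpy): TAN-style conditional weights.
def conditional_mutual_info(data, u, v, class_node):
    """Class-marginal-weighted average of MI(u, v) within each class stratum."""
    cond_marginal = data[class_node].value_counts() / len(data)
    weight = 0.0
    for label, prob in cond_marginal.items():
        subset = data[data[class_node] == label]
        weight += prob * mutual_info_score(subset[u], subset[v])
    return weight

# Conditional weight matrix over the feature columns (everything except 'A').
feature_cols = [col for col in demo_values.columns if col != "A"]
cond_weights = pd.DataFrame(0.0, index=feature_cols, columns=feature_cols)
for u, v in combinations(feature_cols, 2):
    w = conditional_mutual_info(demo_values, u, v, class_node="A")
    cond_weights.loc[u, v] = cond_weights.loc[v, u] = w

# Build the feature tree exactly as in the Chow-Liu sketch, then point the
# class node at every feature, which is what estimate(estimator_type='tan')
# does in its final step.
tan_root = feature_cols[0]
feature_tree = nx.bfs_tree(
    nx.maximum_spanning_tree(nx.from_pandas_adjacency(cond_weights)), tan_root
)
feature_tree.add_edges_from([("A", node) for node in feature_cols])
print(sorted(feature_tree.edges()))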