from pgmpy.independencies import Independencies
from pgmpy.models import BayesianNetwork
[docs]
class NaiveBayes(BayesianNetwork):
"""
Class to represent Naive Bayes. Naive Bayes is a special case of Bayesian Model
where the only edges in the model are from the feature variables to the dependent variable.
"""
def __init__(self, feature_vars=None, dependent_var=None):
"""
Method to initialize the `NaiveBayes` class.
Parameters
----------
feature_vars: list (array-like)
A list of variable predictor variables (i.e. the features) in the model.
dependent_var: hashable object
The dependent variable (i.e. the variable to be predicted) in the model.
Returns
-------
pgmpy.models.BayesianNetwork instance: An instance of a Bayesian Model with the
initialized model structure.
"""
self.dependent = dependent_var
self.features = set(feature_vars) if feature_vars is not None else set()
if (feature_vars is not None) and (dependent_var is not None):
ebunch = [(self.dependent, feature) for feature in self.features]
else:
ebunch = []
super(NaiveBayes, self).__init__(ebunch=ebunch)
[docs]
def add_edge(self, u, v, *kwargs):
"""
Add an edge between `u` and `v`.
The nodes `u` and `v` will be automatically added if they are
not already in the graph. `u` will be the dependent variable (i.e. variable to be predicted)
and `v` will be one of the features (i.e. predictors) in the model.
Parameters
----------
u, v : nodes
Nodes can be any hashable python object.
Returns
-------
None
Examples
--------
>>> from pgmpy.models import NaiveBayes
>>> G = NaiveBayes()
>>> G.add_nodes_from(['a', 'b', 'c'])
>>> G.add_edge('a', 'b')
>>> G.add_edge('a', 'c')
>>> G.edges()
OutEdgeView([('a', 'b'), ('a', 'c')])
"""
if self.dependent and u != self.dependent:
raise ValueError(
f"Model can only have edges outgoing from: {self.dependent}"
)
self.dependent = u
self.features.add(v)
super(NaiveBayes, self).add_edge(u, v, *kwargs)
[docs]
def add_edges_from(self, ebunch):
"""
Adds edges to the model.
Each tuple of the form (u, v) in ebunch adds a new edge in the model.
Since there can only be one dependent variable in a Naive Bayes model, `u` should
be the same for each tuple in `ebunch`.
Parameters
----------
ebunch: list (array-like)
A list of tuples of the form (u, v) representing an edge from u to v.
Returns
-------
None
Examples
--------
>>> from pgmpy.models import NaiveBayes
>>> G = NaiveBayes()
>>> G.add_nodes_from(['a', 'b', 'c'])
>>> G.add_edges_from([('a', 'b'), ('a', 'c')])
>>> G.edges()
OutEdgeView([('a', 'b'), ('a', 'c')])
"""
for u, v in ebunch:
self.add_edge(u, v)
def _get_ancestors_of(self, obs_nodes_list):
"""
Returns a list of all ancestors of all the observed nodes.
Parameters
----------
obs_nodes_list: string, list-type
name of all the observed nodes
"""
if not obs_nodes_list:
return set()
return set(obs_nodes_list) | set(self.dependent)
[docs]
def active_trail_nodes(self, start, observed=None):
"""
Returns all the nodes reachable from start via an active trail.
Parameters
----------
start: Graph node
observed : List of nodes (optional)
If given the active trail would be computed assuming these nodes to be observed.
Examples
--------
>>> from pgmpy.models import NaiveBayes
>>> model = NaiveBayes()
>>> model.add_nodes_from(['a', 'b', 'c', 'd'])
>>> model.add_edges_from([('a', 'b'), ('a', 'c'), ('a', 'd')])
>>> model.active_trail_nodes('a')
{'a', 'd', 'c', 'b'}
>>> model.active_trail_nodes('a', ['b', 'c'])
{'a', 'd'}
>>> model.active_trail_nodes('b', ['a'])
{'b'}
"""
if observed and self.dependent in observed:
return set(start)
else:
return set(self.nodes()) - set(observed if observed else [])
[docs]
def local_independencies(self, variables):
"""
Returns an instance of Independencies containing the local independencies
of each of the variables.
Parameters
----------
variables: str or array like
variables whose local independencies are to found.
Examples
--------
>>> from pgmpy.models import NaiveBayes
>>> model = NaiveBayes()
>>> model.add_edges_from([('a', 'b'), ('a', 'c'), ('a', 'd')])
>>> ind = model.local_independencies('b')
>>> ind
(b \u27C2 d, c | a)
"""
independencies = Independencies()
for variable in [variables] if isinstance(variables, str) else variables:
if variable != self.dependent:
independencies.add_assertions(
[variable, list(set(self.features) - set(variable)), self.dependent]
)
return independencies
[docs]
def fit(self, data, parent_node=None, estimator=None):
"""
Computes the CPD for each node from a given data in the form of a pandas dataframe.
If a variable from the data is not present in the model, it adds that node into the model.
Parameters
----------
data : pandas DataFrame object
A DataFrame object with column names same as the variable names of network
parent_node: any hashable python object (optional)
Parent node of the model, if not specified it looks for a previously specified
parent node.
estimator: Estimator class
Any pgmpy estimator. If nothing is specified, the default ``MaximumLikelihoodEstimator``
would be used.
Examples
--------
>>> import numpy as np
>>> import pandas as pd
>>> from pgmpy.models import NaiveBayes
>>> model = NaiveBayes()
>>> values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
... columns=['A', 'B', 'C', 'D', 'E'])
>>> model.fit(values, 'A')
>>> model.get_cpds()
[<TabularCPD representing P(D:2 | A:2) at 0x4b72870>,
<TabularCPD representing P(E:2 | A:2) at 0x4bb2150>,
<TabularCPD representing P(A:2) at 0x4bb23d0>,
<TabularCPD representing P(B:2 | A:2) at 0x4bb24b0>,
<TabularCPD representing P(C:2 | A:2) at 0x4bb2750>]
>>> model.edges()
[('A', 'D'), ('A', 'E'), ('A', 'B'), ('A', 'C')]
"""
if not parent_node:
if not self.dependent:
raise ValueError("parent node must be specified for the model")
else:
parent_node = self.dependent
if parent_node not in data.columns:
raise ValueError(
f"Dependent variable: {parent_node} is not present in the data"
)
for child_node in data.columns:
if child_node != parent_node:
self.add_edge(parent_node, child_node)
super(NaiveBayes, self).fit(data, estimator)