[1]:
# Imports
from pgmpy.models import DynamicBayesianNetwork as DBN
from pgmpy.factors.discrete import TabularCPD
[2]:
# Initialize a simple DBN modeling Weather (W), ground Observation (O), Temperature (T), and Humidity (H).
dbn = DBN()
# pgmpy requires only the structure of the first time slice plus the edges
# connecting slice 0 to slice 1. It assumes this structure repeats for every
# later time slice, i.e. the model is a 2-TBN.

# Intra-slice dependencies (hold within each time slice)
intra_slice_edges = [
    (('W', 0), ('O', 0)),  # Weather influences ground observation
    (('T', 0), ('H', 0)),  # Temperature influences humidity
    (('W', 0), ('H', 0)),  # Weather influences humidity
]
# Inter-slice dependencies (carry state from slice t to slice t+1)
inter_slice_edges = [
    (('W', 0), ('W', 1)),  # Weather transition
    (('T', 0), ('T', 1)),  # Temperature transition
    (('W', 0), ('T', 1)),  # Weather influences future temperature
]
dbn.add_edges_from(intra_slice_edges)
dbn.add_edges_from(inter_slice_edges)

# Define the model parameters. pgmpy likewise assumes these CPDs repeat for
# all future time slices.

# Prior over the initial weather: Sunny, Cloudy, Rainy
weather_prior = TabularCPD(
    variable=('W', 0),
    variable_card=3,
    values=[[0.6], [0.3], [0.1]],
)
# Weather transition model P(W_1 | W_0)
weather_transition = TabularCPD(
    variable=('W', 1),
    variable_card=3,
    evidence=[('W', 0)],
    evidence_card=[3],
    values=[
        [0.7, 0.3, 0.2],  # P(Sunny | W_0)
        [0.2, 0.4, 0.3],  # P(Cloudy | W_0)
        [0.1, 0.3, 0.5],  # P(Rainy | W_0)
    ],
)
# Prior over the initial temperature: Hot, Mild, Cold
temperature_prior = TabularCPD(
    variable=('T', 0),
    variable_card=3,
    values=[[0.5], [0.4], [0.1]],
)
# Temperature transition model P(T_1 | T_0, W_0)
temperature_transition = TabularCPD(
    variable=('T', 1),
    variable_card=3,
    evidence=[('T', 0), ('W', 0)],
    evidence_card=[3, 3],
    values=[
        [0.8, 0.6, 0.1, 0.7, 0.4, 0.2, 0.6, 0.3, 0.1],  # P(Hot | T_0, W_0)
        [0.2, 0.3, 0.7, 0.2, 0.5, 0.3, 0.3, 0.4, 0.3],  # P(Mild | T_0, W_0)
        [0.0, 0.1, 0.2, 0.1, 0.1, 0.5, 0.1, 0.3, 0.6],  # P(Cold | T_0, W_0)
    ],
)
# Ground observation model P(O_0 | W_0): Dry, Wet
ground_observation = TabularCPD(
    variable=('O', 0),
    variable_card=2,
    evidence=[('W', 0)],
    evidence_card=[3],
    values=[
        [0.9, 0.6, 0.2],  # P(Dry | Sunny, Cloudy, Rainy)
        [0.1, 0.4, 0.8],  # P(Wet | Sunny, Cloudy, Rainy)
    ],
)
# Humidity observation model P(H_0 | T_0, W_0): Low, Medium, High
humidity_observation = TabularCPD(
    variable=('H', 0),
    variable_card=3,
    evidence=[('T', 0), ('W', 0)],
    evidence_card=[3, 3],
    values=[
        [0.7, 0.4, 0.1, 0.5, 0.3, 0.2, 0.3, 0.2, 0.1],  # P(Low | T_0, W_0)
        [0.2, 0.5, 0.3, 0.4, 0.5, 0.3, 0.4, 0.3, 0.2],  # P(Medium | T_0, W_0)
        [0.1, 0.1, 0.6, 0.1, 0.2, 0.5, 0.3, 0.5, 0.7],  # P(High | T_0, W_0)
    ],
)
# Attach all CPDs to the DBN
dbn.add_cpds(
    weather_prior, weather_transition,
    temperature_prior, temperature_transition,
    ground_observation, humidity_observation,
)
# Build the internal data structures pgmpy needs for further computation.
dbn.initialize_initial_state()
# Simulate data from the defined model.
samples = dbn.simulate(n_samples=1000, n_time_slices=10)
samples.head()
[2]:
| (W, 0) | (O, 0) | (H, 0) | (T, 0) | (W, 1) | (O, 1) | (H, 1) | (T, 1) | (W, 2) | (O, 2) | ... | (H, 7) | (T, 7) | (W, 8) | (O, 8) | (H, 8) | (W, 9) | (T, 9) | (O, 9) | (H, 9) | (T, 8) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 |
| 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 2 | 2 | 1 | ... | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 0 |
| 2 | 2 | 1 | 2 | 1 | 2 | 1 | 2 | 1 | 2 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 2 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 2 | 1 | 0 | 1 | 2 | 1 | 1 | 2 | 1 |
| 4 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | ... | 1 | 2 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
5 rows × 40 columns
[3]:
# Fitting model parameters to a defined network structure.
# Re-declare the network structure whose parameters we want to learn. Here we
# have assumed the same 2-TBN structure that the data was simulated from.
dbn = DBN()
two_tbn_edges = [
    # Intra-slice edges
    (('W', 0), ('O', 0)),  # Weather influences ground observation
    (('T', 0), ('H', 0)),  # Temperature influences humidity
    (('W', 0), ('H', 0)),  # Weather influences humidity
    # Inter-slice edges
    (('W', 0), ('W', 1)),  # Weather transition
    (('T', 0), ('T', 1)),  # Temperature transition
    (('W', 0), ('T', 1)),  # Weather influences future temperature
]
dbn.add_edges_from(two_tbn_edges)
# Estimate all CPDs from the simulated samples
dbn.fit(samples)
dbn.get_cpds()
[3]:
[<TabularCPD representing P((H, 0):3 | (T, 0):3, (W, 0):3) at 0x726d1a2b0cb0>,
<TabularCPD representing P((T, 0):3) at 0x726d1a2b08c0>,
<TabularCPD representing P((W, 0):3) at 0x726d1a2b0dd0>,
<TabularCPD representing P((O, 0):2 | (W, 0):3) at 0x726d1a2b0d40>,
<TabularCPD representing P((H, 1):3 | (T, 1):3, (W, 1):3) at 0x726d1a2b0830>,
<TabularCPD representing P((T, 1):3 | (T, 0):3, (W, 0):3) at 0x726d1a2b03b0>,
<TabularCPD representing P((W, 1):3 | (W, 0):3) at 0x726d1a2b0320>,
<TabularCPD representing P((O, 1):2 | (W, 1):3) at 0x726d1a2b07a0>]
[4]:
print(dbn.get_cpds(('W', 1)))
+-----------+---------------------+---------------------+---------------------+
| (W, 0) | (W, 0)(0) | (W, 0)(1) | (W, 0)(2) |
+-----------+---------------------+---------------------+---------------------+
| (W, 1)(0) | 0.7004921418172009 | 0.31049438383598194 | 0.20921204376971736 |
+-----------+---------------------+---------------------+---------------------+
| (W, 1)(1) | 0.19523741956078264 | 0.4021998199276747 | 0.2642262957939027 |
+-----------+---------------------+---------------------+---------------------+
| (W, 1)(2) | 0.10427043862201643 | 0.2873057962363434 | 0.5265616604363799 |
+-----------+---------------------+---------------------+---------------------+
[5]:
# Learning the model structure from data.
# pgmpy doesn't implement any specific methods for DBN structure learning. This is a hackish method to utilize the
# existing BN learning algorithms to estimate the structure of the DBN. Essentially, we remove the time-information from the
# given data and try to learn the 2-DBN network that remains constant across time-slices.
# First convert the given dataset into long form removing the time information such that it is suitable to learn the 2-DBN network.
import pandas as pd
# Use sorted() to ensure consistent node ordering across all iterations
nodes = sorted(dbn._nodes())
colnames = [node + '0' for node in nodes] + [node + '1' for node in nodes]
# Collect each consecutive-slice pair and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop is quadratic in the
# number of slices, and seeding it from an empty frame coerces the columns
# to object dtype; a single concat over a list avoids both problems.
slice_pairs = []
for t in range(9):  # 10 simulated slices -> 9 consecutive (t, t+1) pairs
    cols = [(node, t) for node in nodes] + [(node, t + 1) for node in nodes]
    samples_t = samples.loc[:, cols]
    samples_t.columns = colnames
    slice_pairs.append(samples_t)
# ignore_index=True already produces a fresh RangeIndex, so no reset needed.
df_long = pd.concat(slice_pairs, ignore_index=True)
df_long.head()
[5]:
| H0 | O0 | T0 | W0 | H1 | O1 | T1 | W1 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 1 | 1 | 0 | 0 | 2 | 0 |
| 2 | 2 | 1 | 1 | 2 | 2 | 1 | 1 | 2 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 1 |
[6]:
# Use this long data frame to learn the first two time slices of the DBN.
# Because generic structure-learning algorithms know nothing about time, we
# add constraints so that no edges are learned from time slice 1 back to 0.
from pgmpy.causal_discovery import HillClimbSearch
from pgmpy.estimators import ExpertKnowledge

# Forbid every backward-in-time edge: any slice-1 variable -> any slice-0 variable.
slice_vars = ['W', 'O', 'T', 'H']
backward_edges = [(u + '1', v + '0') for u in slice_vars for v in slice_vars]
expert = ExpertKnowledge(forbidden_edges=backward_edges)

est = HillClimbSearch(expert_knowledge=expert)
est.fit(df_long)
est.causal_graph_.edges()  # Use this learned DAG to define a DBN.
[6]:
OutEdgeView([('W0', 'T0'), ('W0', 'H0'), ('W0', 'T1'), ('W0', 'O0'), ('W0', 'W1'), ('T0', 'T1'), ('T0', 'H0'), ('T0', 'W0'), ('H0', 'W0'), ('H0', 'T0'), ('T1', 'H1'), ('T1', 'T0'), ('T1', 'W0'), ('W1', 'H1'), ('W1', 'O1'), ('W1', 'W0'), ('O1', 'W1'), ('O0', 'W0')])