[1]:
# Imports

from pgmpy.models import DynamicBayesianNetwork as DBN
from pgmpy.factors.discrete import TabularCPD
[2]:
# Build a simple Dynamic Bayesian Network over Weather (W), Ground observation (O),
# Temperature (T), and Humidity (H).

dbn = DBN()

# pgmpy represents a DBN as a 2-TBN: the user specifies the structure of the first
# time slice plus the edges connecting it to the second slice, and pgmpy assumes this
# structure repeats unchanged for all further time slices.

# Edges within a single time slice.
dbn.add_edges_from([
    (('W', 0), ('O', 0)),  # weather -> ground observation
    (('T', 0), ('H', 0)),  # temperature -> humidity
    (('W', 0), ('H', 0)),  # weather -> humidity
])

# Edges from time slice 0 to time slice 1 (the temporal dynamics).
dbn.add_edges_from([
    (('W', 0), ('W', 1)),  # weather persistence
    (('T', 0), ('T', 1)),  # temperature persistence
    (('W', 0), ('T', 1)),  # weather influences the next temperature
])

# Parameters of the model. pgmpy likewise assumes these CPDs stay the same for
# every subsequent time slice.

# Prior over the initial weather: P(W_0) for (sunny, cloudy, rainy).
weather_prior = TabularCPD(
    variable=('W', 0),
    variable_card=3,
    values=[[0.6], [0.3], [0.1]],
)

# Weather transition model P(W_1 | W_0); one column per W_0 state.
weather_transition = TabularCPD(
    variable=('W', 1),
    variable_card=3,
    values=[
        [0.7, 0.3, 0.2],  # sunny next
        [0.2, 0.4, 0.3],  # cloudy next
        [0.1, 0.3, 0.5],  # rainy next
    ],
    evidence=[('W', 0)],
    evidence_card=[3],
)

# Prior over the initial temperature: P(T_0) for (hot, mild, cold).
temperature_prior = TabularCPD(
    variable=('T', 0),
    variable_card=3,
    values=[[0.5], [0.4], [0.1]],
)

# Temperature transition model P(T_1 | T_0, W_0); the 9 columns enumerate the
# 3 x 3 joint parent states.
temperature_transition = TabularCPD(
    variable=('T', 1),
    variable_card=3,
    values=[
        [0.8, 0.6, 0.1, 0.7, 0.4, 0.2, 0.6, 0.3, 0.1],  # hot next
        [0.2, 0.3, 0.7, 0.2, 0.5, 0.3, 0.3, 0.4, 0.3],  # mild next
        [0.0, 0.1, 0.2, 0.1, 0.1, 0.5, 0.1, 0.3, 0.6],  # cold next
    ],
    evidence=[('T', 0), ('W', 0)],
    evidence_card=[3, 3],
)

# Observation model P(O_0 | W_0) for (dry, wet) given (sunny, cloudy, rainy).
observation_model = TabularCPD(
    variable=('O', 0),
    variable_card=2,
    values=[
        [0.9, 0.6, 0.2],  # dry
        [0.1, 0.4, 0.8],  # wet
    ],
    evidence=[('W', 0)],
    evidence_card=[3],
)

# Humidity model P(H_0 | T_0, W_0) for (low, medium, high).
humidity_model = TabularCPD(
    variable=('H', 0),
    variable_card=3,
    values=[
        [0.7, 0.4, 0.1, 0.5, 0.3, 0.2, 0.3, 0.2, 0.1],  # low
        [0.2, 0.5, 0.3, 0.4, 0.5, 0.3, 0.4, 0.3, 0.2],  # medium
        [0.1, 0.1, 0.6, 0.1, 0.2, 0.5, 0.3, 0.5, 0.7],  # high
    ],
    evidence=[('T', 0), ('W', 0)],
    evidence_card=[3, 3],
)

# Attach all CPDs to the network.
dbn.add_cpds(
    weather_prior,
    weather_transition,
    temperature_prior,
    temperature_transition,
    observation_model,
    humidity_model,
)

# After defining the model, generate the internal data structures pgmpy needs
# for further computation.
dbn.initialize_initial_state()

# Draw synthetic data from the defined model.
samples = dbn.simulate(n_samples=1000, n_time_slices=10)
samples.head()
[2]:
(W, 0) (O, 0) (H, 0) (T, 0) (W, 1) (O, 1) (H, 1) (T, 1) (W, 2) (O, 2) ... (H, 7) (T, 7) (W, 8) (O, 8) (H, 8) (W, 9) (T, 9) (O, 9) (H, 9) (T, 8)
0 1 0 1 0 0 0 0 0 1 0 ... 1 0 1 1 1 1 0 0 1 0
1 1 1 1 1 0 0 0 2 2 1 ... 1 1 1 1 0 1 1 0 1 0
2 2 1 2 1 2 1 2 1 2 0 ... 0 0 1 0 1 2 0 0 2 0
3 0 0 0 0 0 0 0 0 0 0 ... 2 2 1 0 1 2 1 1 2 1
4 1 1 0 0 1 1 1 0 1 0 ... 1 2 1 0 0 1 0 1 0 1

5 rows × 40 columns

[3]:
# Fitting model parameters to a defined network structure.

# Define the network structure for which to learn the model parameters. Here we
# assume the same 2-TBN structure that the data was simulated from.
dbn = DBN()
dbn.add_edges_from([
    # Intra-slice edges (dependencies within one time slice).
    (('W', 0), ('O', 0)),  # weather -> ground observation
    (('T', 0), ('H', 0)),  # temperature -> humidity
    (('W', 0), ('H', 0)),  # weather -> humidity
    # Inter-slice edges (dynamics from slice 0 to slice 1).
    (('W', 0), ('W', 1)),  # weather persistence
    (('T', 0), ('T', 1)),  # temperature persistence
    (('W', 0), ('T', 1)),  # weather influences the next temperature
])

# Estimate the CPDs from the simulated samples and inspect them.
dbn.fit(samples)
dbn.get_cpds()
[3]:
[<TabularCPD representing P((H, 0):3 | (T, 0):3, (W, 0):3) at 0x726d1a2b0cb0>,
 <TabularCPD representing P((T, 0):3) at 0x726d1a2b08c0>,
 <TabularCPD representing P((W, 0):3) at 0x726d1a2b0dd0>,
 <TabularCPD representing P((O, 0):2 | (W, 0):3) at 0x726d1a2b0d40>,
 <TabularCPD representing P((H, 1):3 | (T, 1):3, (W, 1):3) at 0x726d1a2b0830>,
 <TabularCPD representing P((T, 1):3 | (T, 0):3, (W, 0):3) at 0x726d1a2b03b0>,
 <TabularCPD representing P((W, 1):3 | (W, 0):3) at 0x726d1a2b0320>,
 <TabularCPD representing P((O, 1):2 | (W, 1):3) at 0x726d1a2b07a0>]
[4]:
# Pretty-print one learned CPD; with 1000 samples the estimated weather
# transition probabilities should be close to the true table defined earlier.
print(dbn.get_cpds(('W', 1)))
+-----------+---------------------+---------------------+---------------------+
| (W, 0)    | (W, 0)(0)           | (W, 0)(1)           | (W, 0)(2)           |
+-----------+---------------------+---------------------+---------------------+
| (W, 1)(0) | 0.7004921418172009  | 0.31049438383598194 | 0.20921204376971736 |
+-----------+---------------------+---------------------+---------------------+
| (W, 1)(1) | 0.19523741956078264 | 0.4021998199276747  | 0.2642262957939027  |
+-----------+---------------------+---------------------+---------------------+
| (W, 1)(2) | 0.10427043862201643 | 0.2873057962363434  | 0.5265616604363799  |
+-----------+---------------------+---------------------+---------------------+
[5]:
# Learning the model structure from data.

# pgmpy doesn't implement any specific methods for DBN structure learning. This is a hackish method to utilize the
# existing BN learning algorithms to estimate the structure of the DBN. Essentially, we remove the time-information from the
# given data and try to learn the 2-TBN network that remains constant across time-slices.

# First convert the given dataset into long form removing the time information such that it is suitable to learn the 2-TBN network.

import pandas as pd

# Use sorted() to ensure consistent node ordering across all iterations.
nodes = sorted(dbn._nodes())
colnames = [node + '0' for node in nodes] + [node + '1' for node in nodes]

# Derive the number of time slices from the data itself instead of hardcoding it
# (columns of `samples` are (node, t) tuples — see the simulation output above).
n_slices = max(t for _, t in samples.columns) + 1

# Collect one frame per consecutive slice pair (t, t+1) and concatenate once at
# the end. Concatenating inside the loop is quadratic, and seeding it with an
# empty DataFrame upcasts every column to object dtype (and raises a
# FutureWarning on recent pandas versions).
slice_frames = []
for t in range(n_slices - 1):
    cols = [(node, t) for node in nodes] + [(node, t + 1) for node in nodes]
    samples_t = samples.loc[:, cols].copy()
    samples_t.columns = colnames
    slice_frames.append(samples_t)

# ignore_index=True already produces a fresh 0..n-1 index, so no reset needed.
df_long = pd.concat(slice_frames, ignore_index=True)

df_long.head()
[5]:
H0 O0 T0 W0 H1 O1 T1 W1
0 1 0 0 1 0 0 0 0
1 1 1 1 1 0 0 2 0
2 2 1 1 2 2 1 1 2
3 0 0 0 0 0 0 0 0
4 0 1 0 1 1 1 0 1
[6]:
# Use this long data frame to learn the first two time slices of the DBN. Because we
# are using standard structure-learning algorithms, we need to add constraints so
# that the algorithm doesn't learn edges from time slice 1 back to slice 0.
from pgmpy.causal_discovery import HillClimbSearch
from pgmpy.estimators import ExpertKnowledge

# Forbid every edge from a slice-1 variable to a slice-0 variable, i.e. allow
# edges only in the forward-time direction.
backward_edges = [(src + '1', dst + '0')
                  for src in ('W', 'O', 'T', 'H')
                  for dst in ('W', 'O', 'T', 'H')]
expert = ExpertKnowledge(forbidden_edges=backward_edges)

est = HillClimbSearch(expert_knowledge=expert)
est.fit(df_long)
est.causal_graph_.edges()  # Use this learned DAG to define a DBN.
[6]:
OutEdgeView([('W0', 'T0'), ('W0', 'H0'), ('W0', 'T1'), ('W0', 'O0'), ('W0', 'W1'), ('T0', 'T1'), ('T0', 'H0'), ('T0', 'W0'), ('H0', 'W0'), ('H0', 'T0'), ('T1', 'H1'), ('T1', 'T0'), ('T1', 'W0'), ('W1', 'H1'), ('W1', 'O1'), ('W1', 'W0'), ('O1', 'W1'), ('O0', 'W0')])