Source code for pgmpy.factors.FactorDict
#!/usr/bin/env python3
from __future__ import annotations
from numbers import Number
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from pgmpy.factors.base import factor_product
from pgmpy.factors.discrete import DiscreteFactor
[docs]
class FactorDict(dict):
    """Dictionary of factors supporting element-wise arithmetic.

    Keys identify a clique/marginal (``from_dataframe`` uses the tuples of
    column names it is given) and values are the corresponding factors.
    The arithmetic operators build a new ``FactorDict`` by applying the
    operation to each entry; they never modify ``self`` in place.
    """

    @classmethod
    def from_dataframe(cls, df, marginals):
        """Create a `FactorDict` from a given set of marginals.

        Parameters
        ----------
        df: pandas DataFrame object
            Dataset to count joint states from. Must not contain nulls.

        marginals: List[Tuple[str]]
            List of Tuples containing the names of the marginals.

        Returns
        -------
        Factor dictionary: FactorDict
            FactorDict keyed by each marginal tuple. Each value is a
            `DiscreteFactor` whose values are the histogram counts of the
            marginal's joint states in the dataset.

        Raises
        ------
        ValueError
            If `df` contains any null (None / np.nan) values.
        """
        if df.isnull().values.any():
            raise ValueError("df cannot contain None or np.nan values.")

        factor_dict = cls({})
        for marginal in marginals:
            # Subset of columns arranged in a lexicographical ordering.
            _df = df.loc[:, list(marginal)].sort_values(list(marginal))
            cardinality = list(_df.nunique())
            # Since we have sorted the columns, this encoding will
            # also be sorted lexicographically.
            encoded = OrdinalEncoder().fit_transform(_df)
            factor_dict[marginal] = DiscreteFactor(
                variables=marginal,
                cardinality=cardinality,
                # One histogram bin per distinct state along each variable's
                # axis; flattened into the 1-D layout passed as `values`.
                values=np.histogramdd(sample=encoded, bins=cardinality)[0].flatten(),
                state_names={
                    column: sorted(_df[column].unique().tolist())
                    for column in marginal
                },
            )
        return factor_dict

    def get_factors(self):
        """Return the stored factors (the dict's values) as a set."""
        return set(self.values())

    def __mul__(self, const):
        """Return a new FactorDict with every entry multiplied by `const`."""
        return FactorDict({clique: const * self[clique] for clique in self})

    def __rmul__(self, const):
        """Support ``const * factor_dict`` by delegating to ``__mul__``."""
        return self.__mul__(const)

    def __add__(self, other):
        """Return a new FactorDict with `other` added entry-wise.

        If `other` is a ``numbers.Number`` it is added to every entry.
        Otherwise `other` is indexed with each key of ``self``, so it must
        provide an entry for every key in ``self``; keys present only in
        `other` are ignored.
        """
        if isinstance(other, Number):
            summed = {clique: self[clique] + other for clique in self}
        else:
            summed = {clique: self[clique] + other[clique] for clique in self}
        return FactorDict(summed)

    def __sub__(self, other):
        """Return ``self - other``, computed as ``self + (-1 * other)``."""
        return self + -1 * other

    def dot(self, other):
        """Return the sum over all keys of ``(self[k] * other[k]).values.sum()``.

        `other` is indexed with each key of ``self``.
        """
        return sum((self[clique] * other[clique]).values.sum() for clique in self)

    def product(self):
        """Return ``factor_product`` applied to all factors of ``get_factors()``."""
        return factor_product(*self.get_factors())