"""
Naive Adjustment Regressor in sklearn Compatible Design.
"""
import numpy as np
from sklearn.base import BaseEstimator, clone
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import (
check_is_fitted,
validate_data,
)
from pgmpy.prediction._base import _BaseCausalPrediction
class NaiveAdjustmentRegressor(_BaseCausalPrediction):
    """
    Naive adjustment regressor using causal graph roles for feature selection.

    This estimator concatenates exposure, adjustment, and pretreatment variables
    as features to predict the outcome variable using standard ML algorithms.
    It's "naive" because it uses a simple prediction model with the adjustment
    set and doesn't employ sophisticated causal inference methods like double ML,
    inverse propensity weighting, or other advanced causal estimation techniques.

    Parameters
    ----------
    causal_graph : DAG, PDAG, ADMG, MAG, or PAG
        Causal graph with defined variable roles. Must have exactly one exposure
        and one outcome variable. The adjustment role is optional (can be missing,
        empty or contain variables).
    estimator : sklearn estimator, optional (default=LinearRegression())
        Base estimator for prediction. A clone of it is fitted; the object
        passed in is left untouched.

    Attributes
    ----------
    `estimator_` : sklearn estimator
        The fitted base estimator.
    `feature_names_in_` : ndarray of shape (n_features,)
        Names of features seen during fit.
    `n_features_in_` : int
        Number of features seen during fit.
    `exposure_var_` : str
        Name of exposure variable extracted from causal graph.
    `adjustment_vars_` : list
        List of adjustment variable names extracted from causal graph.
    `pretreatment_vars_` : list
        List of pretreatment variable names extracted from causal graph.
    `outcome_var_` : str
        Name of outcome variable extracted from causal graph.
    `feature_columns_fit_` : list
        List of feature column names used (exposure + adjustment + pretreatment).
    `explanation_` : str
        Formatted description of the fitted model.

    Examples
    --------
    Basic usage with a simple causal DAG:

    >>> import numpy as np
    >>> import pandas as pd
    >>> from pgmpy.base import DAG
    >>> from pgmpy.prediction import NaiveAdjustmentRegressor
    >>> from sklearn.linear_model import LinearRegression
    >>>
    >>> # Create a simple causal DAG: Z -> X, Z -> Y, X -> Y
    >>> # where Z is a confounder, X is exposure, Y is outcome
    >>> dag = DAG(
    ...     ebunch=[("Z", "X"), ("Z", "Y"), ("X", "Y")],
    ...     roles={"exposures": "X", "outcomes": "Y", "adjustment": ["Z"]},
    ... )
    >>>
    >>> # Generate some synthetic data
    >>> np.random.seed(42)
    >>> n = 100
    >>> Z = np.random.normal(0, 1, n)
    >>> X = 0.5 * Z + np.random.normal(0, 0.5, n)
    >>> Y = 2.0 * X + 1.5 * Z + np.random.normal(0, 0.3, n)
    >>>
    >>> data = pd.DataFrame({"X": X, "Y": Y, "Z": Z})
    >>>
    >>> # Fit the regressor
    >>> regressor = NaiveAdjustmentRegressor(causal_graph=dag)
    >>> _ = regressor.fit(data[["X", "Z"]], data["Y"])
    >>>
    >>> # Make predictions
    >>> predictions = regressor.predict(data[["X", "Z"]])
    >>> print(f"Predictions shape: {predictions.shape}")
    Predictions shape: (100,)

    Using a custom estimator:

    >>> from sklearn.ensemble import RandomForestRegressor
    >>>
    >>> # Use Random Forest as the base estimator
    >>> rf_regressor = NaiveAdjustmentRegressor(
    ...     causal_graph=dag,
    ...     estimator=RandomForestRegressor(n_estimators=10, random_state=42),
    ... )
    >>> _ = rf_regressor.fit(data[["X", "Z"]], data["Y"])

    Example with pretreatment variables:

    >>> # Create DAG with pretreatment variable P -> Y
    >>> dag_with_pretreatment = DAG(
    ...     ebunch=[("P", "Y"), ("Z", "X"), ("Z", "Y"), ("X", "Y")],
    ...     roles={
    ...         "exposures": "X",
    ...         "outcomes": "Y",
    ...         "adjustment": ["Z"],
    ...         "pretreatment": ["P"],
    ...     },
    ... )
    >>>
    >>> # Generate data with proper relationships using simulate
    >>> lgbn_with_P = DAG.from_dagitty(
    ...     "dag { P -> Y [beta=0.8] Z -> X [beta=0.5] X -> Y [beta=2.0] Z -> Y [beta=1.5] }"
    ... )
    >>> data_with_P = lgbn_with_P.simulate(100, seed=42)
    >>>
    >>> regressor_with_P = NaiveAdjustmentRegressor(causal_graph=dag_with_pretreatment)
    >>> _ = regressor_with_P.fit(data_with_P[["X", "Z", "P"]], data_with_P["Y"])
    """

    def __init__(
        self,
        causal_graph,
        estimator: BaseEstimator | None = None,
    ):
        # Constructor arguments are stored verbatim; all validation of the
        # graph roles and the estimator is deferred to ``fit``.
        self.causal_graph = causal_graph
        self.estimator = estimator

    def fit(
        self,
        X,
        y,
        sample_weight: np.ndarray | None = None,
    ):
        """
        Fit the Naive Adjustment Regressor.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)
            Training data. Column names must exactly match variable names in the causal graph.

            - If DataFrame: Column names must match DAG variable names exactly
            - If numpy array: Will be converted to DataFrame with columns [0, 1, 2, ...],
              so DAG should use integer variable names
        y : array-like of shape (n_samples,)
            Target values (outcome variable).
        sample_weight : array-like of shape (n_samples,), optional
            Sample weights for training. Forwarded to the base estimator's
            ``fit`` only when it is not None, so base estimators whose ``fit``
            has no ``sample_weight`` parameter can still be used unweighted.

        Returns
        -------
        self : object
            Returns self for method chaining.

        Raises
        ------
        ValueError
            If the causal graph does not define exactly one exposure variable
            or exactly one outcome variable.
        """
        # Step 1: Validate input data. The validated arrays returned by
        # validate_data are deliberately discarded: the original X (with its
        # column labels) is what gets passed to the feature-selection helper.
        validate_data(self, X, y, accept_sparse=False, ensure_2d=True, dtype="numeric")

        # Step 2: Extract and validate causal graph roles
        graph = self.causal_graph
        exposure_vars = graph.get_role("exposures")
        outcome_vars = graph.get_role("outcomes")
        # Copy the optional role lists so the fitted attributes never alias
        # whatever object get_role handed back, and so `+` below always
        # operates on real lists.
        adjustment_vars = list(graph.get_role("adjustment"))
        pretreatment_vars = list(graph.get_role("pretreatment"))

        # Validate exactly one exposure and one outcome variable
        if len(exposure_vars) != 1:
            raise ValueError(
                f"Exactly one exposure variable must be defined. Found {len(exposure_vars)}: {exposure_vars}"
            )
        if len(outcome_vars) != 1:
            raise ValueError(f"Exactly one outcome variable must be defined. Found {len(outcome_vars)}: {outcome_vars}")

        # Step 3: Store role variables as instance attributes
        self.exposure_var_ = exposure_vars[0]
        self.outcome_var_ = outcome_vars[0]
        self.adjustment_vars_ = adjustment_vars
        self.pretreatment_vars_ = pretreatment_vars
        # Feature order is fixed here and reused verbatim by predict().
        self.feature_columns_fit_ = [self.exposure_var_] + adjustment_vars + pretreatment_vars

        # Step 4: Prepare feature DataFrame
        # NOTE(review): _prepare_feature_df is not defined in this class;
        # presumably inherited from _BaseCausalPrediction -- confirm.
        X_features = self._prepare_feature_df(X, required_features=self.feature_columns_fit_)

        # Step 5: Initialize base estimator (clone so the user's object stays unfitted)
        self.estimator_ = LinearRegression() if self.estimator is None else clone(self.estimator)

        # Step 6: Fit the estimator. Only forward sample_weight when the caller
        # supplied one; passing `sample_weight=None` unconditionally raises
        # TypeError for estimators whose fit() does not accept that keyword.
        fit_kwargs = {} if sample_weight is None else {"sample_weight": sample_weight}
        self.estimator_.fit(X_features, y, **fit_kwargs)

        # Step 7: Create explanation
        adj_str = ", ".join(map(str, adjustment_vars)) if adjustment_vars else "none"
        pre_str = ", ".join(map(str, pretreatment_vars)) if pretreatment_vars else "none"
        self.explanation_ = (
            f"NaiveAdjustmentRegressor(exposure={self.exposure_var_}, outcome={self.outcome_var_}, "
            f"adjustment=[{adj_str}], pretreatment=[{pre_str}], "
            f"estimator={type(self.estimator_).__name__})"
        )
        return self

    def predict(self, X):
        """Make predictions using the fitted regressor.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)
            Input data. Column names must exactly match variable names in the causal graph.

            - If DataFrame: Column names must match DAG variable names exactly
            - If numpy array: Will be converted to DataFrame with columns [0, 1, 2, ...],
              so DAG should use integer variable names

        Returns
        -------
        predictions : ndarray of shape (n_samples,)
            Predicted values, flattened to one dimension.
        """
        # Step 1: Validate that estimator is fitted
        check_is_fitted(self, "estimator_")

        # reset=False: check X against what was recorded during fit instead of
        # overwriting it. As in fit, the validated array is discarded and the
        # original X is used for column selection.
        validate_data(
            self,
            X,
            accept_sparse=False,
            ensure_2d=True,
            dtype="numeric",
            reset=False,
        )
        X_filtered = self._prepare_feature_df(X, required_features=self.feature_columns_fit_)

        # Step 2: Make predictions and return as 1D array
        predictions = self.estimator_.predict(X_filtered)
        return np.asarray(predictions).ravel()

    def get_feature_names_out(self, input_features=None):
        """Get the names of the feature columns used by the fitted model.

        Parameters
        ----------
        input_features : ignored
            Not used by this implementation; the names always come from the
            columns selected during ``fit``.

        Returns
        -------
        feature_names_out : ndarray of str
            The entries of ``feature_columns_fit_`` (exposure, then adjustment,
            then pretreatment variables), converted to strings.
        """
        check_is_fitted(self, "estimator_")
        return np.array(self.feature_columns_fit_, dtype=str)