Source code for pgmpy.prediction.NaiveAdjustmentRegressor

"""
Naive Adjustment Regressor in sklearn Compatible Design.
"""

import numpy as np
from sklearn.base import BaseEstimator, clone
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import (
    check_is_fitted,
    validate_data,
)

from pgmpy.prediction._base import _BaseCausalPrediction



[docs]
class NaiveAdjustmentRegressor(_BaseCausalPrediction):
    """
    Outcome regressor whose feature set is chosen from the roles of a causal graph.

    The features handed to the base estimator are the exposure variable followed
    by the adjustment variables and then the pretreatment variables, all read
    from the roles of ``causal_graph``. The outcome is then predicted with an
    ordinary ML model. The approach is "naive" in the sense that it is a plain
    prediction model on the adjustment set; it does not use more sophisticated
    causal estimation techniques such as double ML or inverse propensity
    weighting.

    Parameters
    ----------
    causal_graph : DAG, PDAG, ADMG, MAG, or PAG
        Causal graph carrying the variable roles. ``fit`` requires exactly one
        exposure and exactly one outcome variable. The adjustment role is
        optional (it may be missing, empty, or contain variables).
    estimator : sklearn estimator, optional (default=LinearRegression())
        Base estimator used for prediction. When left as ``None`` a
        ``LinearRegression`` is created in ``fit``; otherwise a clone of the
        given estimator is fitted.

    Attributes
    ----------
    `estimator_` : sklearn estimator
        The fitted base estimator.
    `feature_names_in_` : ndarray of shape (n_features,)
        Names of features seen during fit.
    `n_features_in_` : int
        Number of features seen during fit.
    `exposure_var_` : str
        Name of the exposure variable taken from the causal graph.
    `adjustment_vars_` : list
        Adjustment variable names taken from the causal graph.
    `pretreatment_vars_` : list
        Pretreatment variable names taken from the causal graph.
    `outcome_var_` : str
        Name of the outcome variable taken from the causal graph.
    `feature_columns_fit_` : list
        Feature column names used for fitting, in the order
        exposure + adjustment + pretreatment.
    `explanation_` : str
        One-line formatted description of the fitted model.

    Examples
    --------
    Basic usage with a simple causal DAG:

    >>> import numpy as np
    >>> import pandas as pd
    >>> from pgmpy.base import DAG
    >>> from pgmpy.prediction import NaiveAdjustmentRegressor
    >>> from sklearn.linear_model import LinearRegression
    >>>
    >>> # Create a simple causal DAG: Z -> X, Z -> Y, X -> Y
    >>> # where Z is a confounder, X is exposure, Y is outcome
    >>> dag = DAG(
    ...     ebunch=[("Z", "X"), ("Z", "Y"), ("X", "Y")],
    ...     roles={"exposures": "X", "outcomes": "Y", "adjustment": ["Z"]},
    ... )
    >>>
    >>> # Generate some synthetic data
    >>> np.random.seed(42)
    >>> n = 100
    >>> Z = np.random.normal(0, 1, n)
    >>> X = 0.5 * Z + np.random.normal(0, 0.5, n)
    >>> Y = 2.0 * X + 1.5 * Z + np.random.normal(0, 0.3, n)
    >>>
    >>> data = pd.DataFrame({"X": X, "Y": Y, "Z": Z})
    >>>
    >>> # Fit the regressor
    >>> regressor = NaiveAdjustmentRegressor(causal_graph=dag)
    >>> _ = regressor.fit(data[["X", "Z"]], data["Y"])
    >>>
    >>> # Make predictions
    >>> predictions = regressor.predict(data[["X", "Z"]])
    >>> print(f"Predictions shape: {predictions.shape}")
    Predictions shape: (100,)

    Using a custom estimator:

    >>> from sklearn.ensemble import RandomForestRegressor
    >>>
    >>> # Use Random Forest as the base estimator
    >>> rf_regressor = NaiveAdjustmentRegressor(
    ...     causal_graph=dag,
    ...     estimator=RandomForestRegressor(n_estimators=10, random_state=42),
    ... )
    >>> _ = rf_regressor.fit(data[["X", "Z"]], data["Y"])

    Example with pretreatment variables:

    >>> # Create DAG with pretreatment variable P -> Y
    >>> dag_with_pretreatment = DAG(
    ...     ebunch=[("P", "Y"), ("Z", "X"), ("Z", "Y"), ("X", "Y")],
    ...     roles={
    ...         "exposures": "X",
    ...         "outcomes": "Y",
    ...         "adjustment": ["Z"],
    ...         "pretreatment": ["P"],
    ...     },
    ... )
    >>>
    >>> # Generate data with proper relationships using simulate
    >>> lgbn_with_P = DAG.from_dagitty(
    ...     "dag { P -> Y [beta=0.8] Z -> X [beta=0.5] X -> Y [beta=2.0] Z -> Y [beta=1.5] }"
    ... )
    >>> data_with_P = lgbn_with_P.simulate(100, seed=42)
    >>>
    >>> regressor_with_P = NaiveAdjustmentRegressor(causal_graph=dag_with_pretreatment)
    >>> _ = regressor_with_P.fit(data_with_P[["X", "Z", "P"]], data_with_P["Y"])
    """

    def __init__(
        self,
        causal_graph,
        estimator: BaseEstimator | None = None,
    ):
        # Both arguments are stored exactly as given; the ``None`` default for
        # ``estimator`` is only resolved (and the estimator cloned) in ``fit``.
        self.causal_graph, self.estimator = causal_graph, estimator


[docs]
    def fit(
        self,
        X,
        y,
        sample_weight: np.ndarray | None = None,
    ):
        """
        Fit the Naive Adjustment Regressor.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)
            Training data. Column names must exactly match variable names in the causal graph.
            - If DataFrame: Column names must match DAG variable names exactly
            - If numpy array: Will be converted to DataFrame with columns [0, 1, 2, ...],
              so DAG should use integer variable names
        y : array-like of shape (n_samples,)
            Target values (outcome variable).
        sample_weight : array-like of shape (n_samples,), optional
            Sample weights for training. Forwarded to the base estimator's
            ``fit`` only when provided, so base estimators whose ``fit`` does
            not take ``sample_weight`` still work when no weights are given.

        Returns
        -------
        self : object
            Returns self for method chaining.

        Raises
        ------
        ValueError
            If the causal graph does not define exactly one exposure variable
            or exactly one outcome variable.
        """

        # Step 1: Validate input data. The call is made for its checks and for
        # the input metadata sklearn records on ``self``; the returned arrays
        # are not used because feature selection below works on the original X.
        validate_data(self, X, y, accept_sparse=False, ensure_2d=True, dtype="numeric")

        # Step 2: Extract and validate causal graph roles
        exposure_vars = self.causal_graph.get_role("exposures")
        outcome_vars = self.causal_graph.get_role("outcomes")
        # Copy the optional role lists so the fitted attributes never alias
        # whatever object the graph handed back.
        adjustment_vars = list(self.causal_graph.get_role("adjustment"))
        pretreatment_vars = list(self.causal_graph.get_role("pretreatment"))

        # Validate exactly one exposure and one outcome variable
        if len(exposure_vars) != 1:
            raise ValueError(
                f"Exactly one exposure variable must be defined. Found {len(exposure_vars)}: {exposure_vars}"
            )

        if len(outcome_vars) != 1:
            raise ValueError(f"Exactly one outcome variable must be defined. Found {len(outcome_vars)}: {outcome_vars}")

        # Step 3: Store role variables as instance attributes
        self.exposure_var_ = exposure_vars[0]
        self.outcome_var_ = outcome_vars[0]
        self.adjustment_vars_ = adjustment_vars
        self.pretreatment_vars_ = pretreatment_vars
        self.feature_columns_fit_ = [self.exposure_var_] + adjustment_vars + pretreatment_vars

        # Step 4: Prepare feature DataFrame
        X_features = self._prepare_feature_df(X, required_features=self.feature_columns_fit_)

        # Step 5: Initialize base estimator (never fit the user's own instance)
        estimator = LinearRegression() if self.estimator is None else clone(self.estimator)

        # Step 6: Fit the estimator. ``sample_weight`` is passed only when the
        # caller supplied it; unconditionally passing ``sample_weight=None``
        # raises for estimators whose ``fit`` has no such parameter.
        fit_kwargs = {} if sample_weight is None else {"sample_weight": sample_weight}
        estimator.fit(X_features, y, **fit_kwargs)

        # Publish ``estimator_`` only after a successful fit so that a failed
        # fit cannot leave behind an unfitted estimator that ``predict``'s
        # ``check_is_fitted`` would accept.
        self.estimator_ = estimator

        # Step 7: Create explanation
        adj_str = ", ".join(map(str, adjustment_vars)) if adjustment_vars else "none"
        pre_str = ", ".join(map(str, pretreatment_vars)) if pretreatment_vars else "none"
        self.explanation_ = (
            f"NaiveAdjustmentRegressor(exposure={self.exposure_var_}, outcome={self.outcome_var_}, "
            f"adjustment=[{adj_str}], pretreatment=[{pre_str}], "
            f"estimator={type(self.estimator_).__name__})"
        )

        return self



[docs]
    def predict(self, X):
        """Predict the outcome for new samples with the fitted base estimator.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)
            Input data. Column names must exactly match variable names in the causal graph.
            - If DataFrame: Column names must match DAG variable names exactly
            - If numpy array: Will be converted to DataFrame with columns [0, 1, 2, ...],
              so DAG should use integer variable names

        Returns
        -------
        predictions : ndarray of shape (n_samples,)
            Output of the base estimator's ``predict``, flattened to 1D.
        """
        # Refuse to predict until ``fit`` has produced ``estimator_``.
        check_is_fitted(self, "estimator_")

        # Apply the same input checks as ``fit``; ``reset=False`` asks sklearn to
        # check X against what was recorded during ``fit`` instead of
        # re-recording it.
        validation_options = {
            "accept_sparse": False,
            "ensure_2d": True,
            "dtype": "numeric",
            "reset": False,
        }
        validate_data(self, X, **validation_options)

        # Restrict X to the columns the base estimator was trained on.
        feature_frame = self._prepare_feature_df(X, required_features=self.feature_columns_fit_)

        # Whatever shape the base estimator returns, hand back a flat array.
        raw_output = self.estimator_.predict(feature_frame)
        return np.asarray(raw_output).ravel()



[docs]
    def get_feature_names_out(self, input_features=None):
        """Return the names of the feature columns used by the fitted model.

        Parameters
        ----------
        input_features : object, optional
            Accepted as part of the signature but not used by this method.

        Returns
        -------
        feature_names : ndarray of str
            The entries of ``feature_columns_fit_`` converted to a string array.
        """
        # Only meaningful once ``fit`` has run and set ``estimator_``.
        check_is_fitted(self, "estimator_")
        fitted_columns = self.feature_columns_fit_
        return np.array(fitted_columns, dtype=str)