
data_loading module

Data loading utilities for EMIT model training and inference.

This module provides functions for loading and preprocessing hyperspectral remote sensing data from Excel files for training and testing machine learning models.
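
All four loaders read a workbook with two sheets, 'Rrs' and 'parameter'. As a point of reference, here is a minimal sketch of a file the training loaders accept (the file name and values are illustrative; note from the source below that the training loaders key on 'GLORIA_ID' and 'Rrs_<nm>' columns, while the test loaders instead expect numeric wavelength column headers plus 'Site Label' and 'Date' columns):

import pandas as pd

# Sheet "Rrs": one row per sample, columns GLORIA_ID plus Rrs_<wavelength nm>.
rrs = pd.DataFrame({
    "GLORIA_ID": ["GID_0001", "GID_0002"],
    "Rrs_443": [0.0021, 0.0034],
    "Rrs_560": [0.0103, 0.0098],
})

# Sheet "parameter": GLORIA_ID plus the target water quality column.
param = pd.DataFrame({
    "GLORIA_ID": ["GID_0001", "GID_0002"],
    "TSS": [12.5, 48.0],
})

with pd.ExcelWriter("gloria_subset.xlsx") as writer:
    rrs.to_excel(writer, sheet_name="Rrs", index=False)
    param.to_excel(writer, sheet_name="parameter", index=False)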

load_real_data(excel_path, selected_bands, split_ratio=0.7, seed=42, diff_before_norm=False, diff_after_norm=False, target_parameter='TSS', lower_quantile=0.0, upper_quantile=1.0, log_offset=0.01)

Load and preprocess real data using MinMax scaling for training and testing.

This function reads hyperspectral Rrs data and water quality parameters from an Excel file, applies sample-wise MinMax scaling and log transformation, and returns DataLoaders ready for model training.

Parameters:

- excel_path (str, required): Path to Excel file containing 'Rrs' and 'parameter' sheets.
- selected_bands (List[float], required): List of wavelengths (nm) to extract from the data.
- split_ratio (float, default 0.7): Proportion of data to use for training (0-1).
- seed (int, default 42): Random seed for reproducible train/test split.
- diff_before_norm (bool, default False): Whether to apply differencing before normalization.
- diff_after_norm (bool, default False): Whether to apply differencing after normalization.
- target_parameter (str, default 'TSS'): Name of target parameter column (e.g., 'TSS', 'Chla').
- lower_quantile (float, default 0.0): Lower quantile for filtering target parameter outliers.
- upper_quantile (float, default 1.0): Upper quantile for filtering target parameter outliers.
- log_offset (float, default 0.01): Offset added before log transformation to avoid log(0).

Returns:

- train_dl: DataLoader for training data.
- test_dl: DataLoader for testing data.
- input_dim: Number of input features.
- output_dim: Number of output features.
- train_ids: List of sample IDs in training set.
- test_ids: List of sample IDs in testing set.

Source code in hypercoast/emit_utils/data_loading.py
def load_real_data(
    excel_path: str,
    selected_bands: List[float],
    split_ratio: float = 0.7,
    seed: int = 42,
    diff_before_norm: bool = False,
    diff_after_norm: bool = False,
    target_parameter: str = "TSS",
    lower_quantile: float = 0.0,
    upper_quantile: float = 1.0,
    log_offset: float = 0.01,
) -> Tuple[DataLoader, DataLoader, int, int, List[str], List[str]]:
    """Load and preprocess real data using MinMax scaling for training and testing.

    This function reads hyperspectral Rrs data and water quality parameters from
    an Excel file, applies sample-wise MinMax scaling and log transformation,
    and returns DataLoaders ready for model training.

    Args:
        excel_path: Path to Excel file containing 'Rrs' and 'parameter' sheets.
        selected_bands: List of wavelengths (nm) to extract from the data.
        split_ratio: Proportion of data to use for training (0-1).
        seed: Random seed for reproducible train/test split.
        diff_before_norm: Whether to apply differencing before normalization.
        diff_after_norm: Whether to apply differencing after normalization.
        target_parameter: Name of target parameter column (e.g., 'TSS', 'Chla').
        lower_quantile: Lower quantile for filtering target parameter outliers.
        upper_quantile: Upper quantile for filtering target parameter outliers.
        log_offset: Offset added before log transformation to avoid log(0).

    Returns:
        train_dl: DataLoader for training data.
        test_dl: DataLoader for testing data.
        input_dim: Number of input features.
        output_dim: Number of output features.
        train_ids: List of sample IDs in training set.
        test_ids: List of sample IDs in testing set.
    """

    rounded_bands = [int(round(b)) for b in selected_bands]
    band_cols = [f"Rrs_{b}" for b in rounded_bands]
    df_rrs = pd.read_excel(excel_path, sheet_name="Rrs")
    df_param = pd.read_excel(excel_path, sheet_name="parameter")
    df_rrs_selected = df_rrs[["GLORIA_ID"] + band_cols]
    df_param_selected = df_param[["GLORIA_ID", target_parameter]]
    df_merged = pd.merge(
        df_rrs_selected, df_param_selected, on="GLORIA_ID", how="inner"
    )

    # === Filter valid samples ===
    mask_rrs_valid = df_merged[band_cols].notna().all(axis=1)
    mask_target_valid = df_merged[target_parameter].notna()
    df_filtered = df_merged[mask_rrs_valid & mask_target_valid].reset_index(drop=True)
    print(
        f"✅ Number of samples after filtering Rrs and {target_parameter}: {len(df_filtered)}"
    )

    # === Quantile clipping for target parameter ===
    lower = df_filtered[target_parameter].quantile(lower_quantile)
    upper = df_filtered[target_parameter].quantile(upper_quantile)
    df_filtered = df_filtered[
        (df_filtered[target_parameter] >= lower)
        & (df_filtered[target_parameter] <= upper)
    ].reset_index(drop=True)
    print(
        f"✅ Number of samples after removing {target_parameter} quantiles [{lower_quantile}, {upper_quantile}]: {len(df_filtered)}"
    )

    # === Extract sample IDs, Rrs, and target parameter ===
    all_sample_ids = df_filtered["GLORIA_ID"].astype(str).tolist()
    Rrs_array = df_filtered[band_cols].values
    param_array = df_filtered[[target_parameter]].values

    if diff_before_norm:
        Rrs_array = np.diff(Rrs_array, axis=1)

    # === Apply MinMax scaling to [1, 10] for each sample independently ===
    scalers_Rrs_real = [MinMaxScaler((1, 10)) for _ in range(Rrs_array.shape[0])]
    Rrs_normalized = np.array(
        [
            scalers_Rrs_real[i].fit_transform(row.reshape(-1, 1)).flatten()
            for i, row in enumerate(Rrs_array)
        ]
    )

    if diff_after_norm:
        Rrs_normalized = np.diff(Rrs_normalized, axis=1)

    # === Transform target parameter to log10(param + log_offset) ===
    param_transformed = np.log10(param_array + log_offset)

    # === Build Dataset ===
    Rrs_tensor = torch.tensor(Rrs_normalized, dtype=torch.float32)
    param_tensor = torch.tensor(param_transformed, dtype=torch.float32)
    dataset = TensorDataset(Rrs_tensor, param_tensor)

    # === Split into training and testing sets ===
    num_samples = len(dataset)
    indices = np.arange(num_samples)
    np.random.seed(seed)
    np.random.shuffle(indices)
    train_size = int(split_ratio * num_samples)
    train_indices = indices[:train_size]
    test_indices = indices[train_size:]

    train_dataset = Subset(dataset, train_indices)
    test_dataset = Subset(dataset, test_indices)

    train_ids = [all_sample_ids[i] for i in train_indices]
    test_ids = [all_sample_ids[i] for i in test_indices]

    train_dl = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=0)
    test_dl = DataLoader(test_dataset, batch_size=1024, shuffle=False, num_workers=0)

    input_dim = Rrs_tensor.shape[1]
    output_dim = param_tensor.shape[1]

    return (train_dl, test_dl, input_dim, output_dim, train_ids, test_ids)
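
A hedged usage sketch (the path and band list are placeholders). Because targets are stored as log10(value + log_offset), model predictions can be mapped back to original units by inverting that transform:

train_dl, test_dl, input_dim, output_dim, train_ids, test_ids = load_real_data(
    excel_path="gloria_subset.xlsx",              # placeholder path
    selected_bands=[443.0, 490.0, 560.0, 665.0],  # placeholder wavelengths
    target_parameter="TSS",
    upper_quantile=0.99,                          # trim the top 1% of TSS values
)

# Predictions come out in log space; invert the transform applied above:
# tss_pred = 10 ** model_output - log_offset  (log_offset = 0.01 by default)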

load_real_data_Robust(excel_path, selected_bands, target_parameter='TSS', split_ratio=0.7, seed=42, use_diff=False, lower_quantile=0.0, upper_quantile=1.0, Rrs_range=(0, 0.25), target_range=(-0.5, 0.5))

Load and preprocess real data using robust scaling for training and testing.

This function reads hyperspectral remote sensing reflectance (Rrs) data and water quality parameters from an Excel file, applies robust scaling and preprocessing, and returns DataLoaders ready for model training.

Parameters:

- excel_path (str, required): Path to Excel file containing 'Rrs' and 'parameter' sheets.
- selected_bands (List[float], required): List of wavelengths (nm) to extract from the data.
- target_parameter (str, default 'TSS'): Name of target parameter column (e.g., 'TSS', 'Chla').
- split_ratio (float, default 0.7): Proportion of data to use for training (0-1).
- seed (int, default 42): Random seed for reproducible train/test split.
- use_diff (bool, default False): Whether to apply first-order differencing to Rrs spectra.
- lower_quantile (float, default 0.0): Lower quantile for filtering target parameter outliers.
- upper_quantile (float, default 1.0): Upper quantile for filtering target parameter outliers.
- Rrs_range (Tuple[float, float], default (0, 0.25)): Target range for Rrs normalization.
- target_range (Tuple[float, float], default (-0.5, 0.5)): Target range for parameter normalization.

Returns:

- train_dl: DataLoader for training data.
- test_dl: DataLoader for testing data.
- input_dim: Number of input features.
- output_dim: Number of output features.
- train_ids: List of sample IDs in training set.
- test_ids: List of sample IDs in testing set.
- scaler_Rrs: Fitted scaler for Rrs data.
- TSS_scalers_dict: Dictionary containing 'log' and 'robust' scalers for target.

Source code in hypercoast/emit_utils/data_loading.py
def load_real_data_Robust(
    excel_path: str,
    selected_bands: List[float],
    target_parameter: str = "TSS",
    split_ratio: float = 0.7,
    seed: int = 42,
    use_diff: bool = False,
    lower_quantile: float = 0.0,
    upper_quantile: float = 1.0,
    Rrs_range: Tuple[float, float] = (0, 0.25),
    target_range: Tuple[float, float] = (-0.5, 0.5),
) -> Tuple[DataLoader, DataLoader, int, int, List[str], List[str], Any, Dict[str, Any]]:
    """Load and preprocess real data using robust scaling for training and testing.

    This function reads hyperspectral remote sensing reflectance (Rrs) data and
    water quality parameters from an Excel file, applies robust scaling and
    preprocessing, and returns DataLoaders ready for model training.

    Args:
        excel_path: Path to Excel file containing 'Rrs' and 'parameter' sheets.
        selected_bands: List of wavelengths (nm) to extract from the data.
        target_parameter: Name of target parameter column (e.g., 'TSS', 'Chla').
        split_ratio: Proportion of data to use for training (0-1).
        seed: Random seed for reproducible train/test split.
        use_diff: Whether to apply first-order differencing to Rrs spectra.
        lower_quantile: Lower quantile for filtering target parameter outliers.
        upper_quantile: Upper quantile for filtering target parameter outliers.
        Rrs_range: Target range for Rrs normalization.
        target_range: Target range for parameter normalization.

    Returns:
        train_dl: DataLoader for training data.
        test_dl: DataLoader for testing data.
        input_dim: Number of input features.
        output_dim: Number of output features.
        train_ids: List of sample IDs in training set.
        test_ids: List of sample IDs in testing set.
        scaler_Rrs: Fitted scaler for Rrs data.
        TSS_scalers_dict: Dictionary containing 'log' and 'robust' scalers for target.
    """

    rounded_bands = [int(round(b)) for b in selected_bands]
    band_cols = [f"Rrs_{b}" for b in rounded_bands]

    df_rrs = pd.read_excel(excel_path, sheet_name="Rrs")
    df_param = pd.read_excel(excel_path, sheet_name="parameter")

    df_rrs_selected = df_rrs[["GLORIA_ID"] + band_cols]
    df_param_selected = df_param[["GLORIA_ID", target_parameter]]
    df_merged = pd.merge(
        df_rrs_selected, df_param_selected, on="GLORIA_ID", how="inner"
    )

    mask_rrs_valid = df_merged[band_cols].notna().all(axis=1)
    mask_param_valid = df_merged[target_parameter].notna()
    df_filtered = df_merged[mask_rrs_valid & mask_param_valid].reset_index(drop=True)

    print(
        f"Number of samples after filtering Rrs and {target_parameter}: {len(df_filtered)}"
    )

    lower = df_filtered[target_parameter].quantile(lower_quantile)
    top = df_filtered[target_parameter].quantile(upper_quantile)
    df_filtered = df_filtered[
        (df_filtered[target_parameter] >= lower)
        & (df_filtered[target_parameter] <= top)
    ].reset_index(drop=True)

    print(
        f"Number of samples after removing {target_parameter} quantiles [{lower_quantile}, {upper_quantile}]: {len(df_filtered)}"
    )

    all_sample_ids = df_filtered["GLORIA_ID"].astype(str).tolist()
    Rrs_array = df_filtered[band_cols].values
    param_array = df_filtered[[target_parameter]].values

    if use_diff:
        Rrs_array = np.diff(Rrs_array, axis=1)

    scaler_Rrs = RobustMinMaxScaler(feature_range=Rrs_range)
    scaler_Rrs.fit(torch.tensor(Rrs_array, dtype=torch.float32))
    Rrs_normalized = scaler_Rrs.transform(
        torch.tensor(Rrs_array, dtype=torch.float32)
    ).numpy()

    log_scaler = LogScaler(shift_min=False, safety_term=1e-8)
    param_log = log_scaler.fit_transform(torch.tensor(param_array, dtype=torch.float32))
    param_scaler = RobustMinMaxScaler(
        feature_range=target_range, global_scale=True, robust=True
    )
    param_transformed = param_scaler.fit_transform(param_log).numpy()

    Rrs_tensor = torch.tensor(Rrs_normalized, dtype=torch.float32)
    param_tensor = torch.tensor(param_transformed, dtype=torch.float32)
    dataset = TensorDataset(Rrs_tensor, param_tensor)

    num_samples = len(dataset)
    indices = np.arange(num_samples)
    np.random.seed(seed)
    np.random.shuffle(indices)
    train_size = int(split_ratio * num_samples)
    train_indices = indices[:train_size]
    test_indices = indices[train_size:]

    train_dataset = Subset(dataset, train_indices)
    test_dataset = Subset(dataset, test_indices)

    train_ids = [all_sample_ids[i] for i in train_indices]
    test_ids = [all_sample_ids[i] for i in test_indices]

    train_dl = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=0)
    test_dl = DataLoader(test_dataset, batch_size=1024, shuffle=False, num_workers=0)

    input_dim = Rrs_tensor.shape[1]
    output_dim = param_tensor.shape[1]
    TSS_scalers_dict = {"log": log_scaler, "robust": param_scaler}

    return (
        train_dl,
        test_dl,
        input_dim,
        output_dim,
        train_ids,
        test_ids,
        scaler_Rrs,
        TSS_scalers_dict,
    )
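
A hedged usage sketch (placeholder path and bands); the returned scaler_Rrs and TSS_scalers_dict should be kept so that load_real_test_Robust (below) can apply identical preprocessing at inference time:

(
    train_dl,
    test_dl,
    input_dim,
    output_dim,
    train_ids,
    test_ids,
    scaler_Rrs,           # fitted Rrs scaler, reused at test time
    TSS_scalers_dict,     # {'log': ..., 'robust': ...}, reused at test time
) = load_real_data_Robust(
    excel_path="gloria_subset.xlsx",              # placeholder path
    selected_bands=[443.0, 490.0, 560.0, 665.0],  # placeholder wavelengths
    target_parameter="TSS",
)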

load_real_test(excel_path, selected_bands, max_allowed_diff=1.0, diff_before_norm=False, diff_after_norm=False, target_parameter='TSS', log_offset=0.01)

Load and preprocess test data using sample-wise MinMax scaling.

This function loads test data from Excel, performs band matching to the nearest available wavelengths, applies sample-wise MinMax scaling and log transformation, and returns a DataLoader for inference.

Parameters:

- excel_path (str, required): Path to Excel file containing 'Rrs' and 'parameter' sheets.
- selected_bands (List[float], required): List of target wavelengths (nm) to extract.
- max_allowed_diff (float, default 1.0): Maximum wavelength difference (nm) allowed for band matching.
- diff_before_norm (bool, default False): Whether to apply differencing before normalization.
- diff_after_norm (bool, default False): Whether to apply differencing after normalization.
- target_parameter (str, default 'TSS'): Name of target parameter column (e.g., 'TSS', 'SPM').
- log_offset (float, default 0.01): Offset added before log transformation to avoid log(0).

Returns:

- test_dl: DataLoader for test data.
- input_dim: Number of input features.
- output_dim: Number of output features (always 1).
- sample_ids: List of sample IDs in test set.
- sample_dates: List of sample dates in test set.

Exceptions:

- ValueError: If row counts don't match, no valid samples exist, or band matching fails.

Source code in hypercoast/emit_utils/data_loading.py
def load_real_test(
    excel_path: str,
    selected_bands: List[float],
    max_allowed_diff: float = 1.0,
    diff_before_norm: bool = False,
    diff_after_norm: bool = False,
    target_parameter: str = "TSS",
    log_offset: float = 0.01,
) -> Tuple[DataLoader, int, int, List[str], List[str]]:
    """Load and preprocess test data using sample-wise MinMax scaling.

    This function loads test data from Excel, performs band matching to the
    nearest available wavelengths, applies sample-wise MinMax scaling and log
    transformation, and returns a DataLoader for inference.

    Args:
        excel_path: Path to Excel file containing 'Rrs' and 'parameter' sheets.
        selected_bands: List of target wavelengths (nm) to extract.
        max_allowed_diff: Maximum wavelength difference (nm) allowed for band matching.
        diff_before_norm: Whether to apply differencing before normalization.
        diff_after_norm: Whether to apply differencing after normalization.
        target_parameter: Name of target parameter column (e.g., 'TSS', 'SPM').
        log_offset: Offset added before log transformation to avoid log(0).

    Returns:
        test_dl: DataLoader for test data.
        input_dim: Number of input features.
        output_dim: Number of output features (always 1).
        sample_ids: List of sample IDs in test set.
        sample_dates: List of sample dates in test set.

    Raises:
        ValueError: If row counts don't match, no valid samples exist, or band
            matching fails.
    """

    df_rrs = pd.read_excel(excel_path, sheet_name="Rrs")
    df_param = pd.read_excel(excel_path, sheet_name="parameter")

    if df_rrs.shape[0] != df_param.shape[0]:
        raise ValueError(
            f"❌ The number of rows in the Rrs table and parameter table do not match. Rrs: {df_rrs.shape[0]}, parameter: {df_param.shape[0]}"
        )

    # === Extract IDs and dates ===
    sample_ids = df_rrs["Site Label"].astype(str).tolist()
    sample_dates = df_rrs["Date"].astype(str).tolist()

    # === Match target bands ===
    rrs_wavelengths = []
    rrs_cols = []
    for col in df_rrs.columns:
        try:
            wl = float(col)
            rrs_wavelengths.append(wl)
            rrs_cols.append(col)
        except Exception:
            continue

    band_cols = []
    matched_bands = []
    for target_band in selected_bands:
        diffs = [abs(wl - target_band) for wl in rrs_wavelengths]
        min_diff = min(diffs)
        if min_diff > max_allowed_diff:
            raise ValueError(
                f"Target wavelength {target_band} nm cannot be matched, error {min_diff:.2f} nm exceeds the allowed range"
            )
        best_idx = diffs.index(min_diff)
        band_cols.append(rrs_cols[best_idx])
        matched_bands.append(rrs_wavelengths[best_idx])

    print(
        f"\n✅ Band matching successful, {len(selected_bands)} target bands in total, {len(band_cols)} columns actually extracted"
    )
    print(f"Original number of test samples: {df_rrs.shape[0]}\n")

    # === Extract Rrs and target parameter (without differencing for now) ===
    Rrs_array = df_rrs[band_cols].values.astype(float)
    target_array = df_param[[target_parameter]].values.astype(float).flatten()

    # === Key: Remove rows with NaN/Inf before differencing ===
    mask_inputs_ok = np.all(np.isfinite(Rrs_array), axis=1)
    mask_target_ok = np.isfinite(target_array)
    mask_ok = mask_inputs_ok & mask_target_ok
    if not np.any(mask_ok):
        raise ValueError("❌ No valid samples (NaN/Inf found in input or target).")
    dropped = int(len(mask_ok) - mask_ok.sum())
    if dropped > 0:
        print(
            f"⚠️ Dropped {dropped} invalid samples (containing NaN/Inf) before differencing"
        )

    Rrs_array = Rrs_array[mask_ok]
    target_array = target_array[mask_ok]
    sample_ids = [sid for sid, keep in zip(sample_ids, mask_ok) if keep]
    sample_dates = [d for d, keep in zip(sample_dates, mask_ok) if keep]

    # === Preprocessing before differencing (optional) ===
    if diff_before_norm:
        Rrs_array = np.diff(Rrs_array, axis=1)

    # === Apply MinMaxScaler to [1, 10] for each sample ===
    scalers_Rrs_test = [MinMaxScaler((1, 10)) for _ in range(Rrs_array.shape[0])]
    Rrs_normalized = np.array(
        [
            scalers_Rrs_test[i].fit_transform(row.reshape(-1, 1)).flatten()
            for i, row in enumerate(Rrs_array)
        ]
    )

    # === Post-processing after differencing (optional) ===
    if diff_after_norm:
        Rrs_normalized = np.diff(Rrs_normalized, axis=1)

    # === Transform target value to log10(x + log_offset) ===
    target_transformed = np.log10(target_array + log_offset)

    # === Construct DataLoader ===
    Rrs_tensor = torch.tensor(Rrs_normalized, dtype=torch.float32)
    target_tensor = torch.tensor(target_transformed.reshape(-1, 1), dtype=torch.float32)

    dataset = TensorDataset(Rrs_tensor, target_tensor)
    test_dl = DataLoader(dataset, batch_size=len(dataset), shuffle=False, num_workers=0)

    input_dim = Rrs_tensor.shape[1]
    output_dim = target_tensor.shape[1]

    return test_dl, input_dim, output_dim, sample_ids, sample_dates
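
A hedged usage sketch (placeholder path); the diff_* flags should mirror those used with load_real_data at training time, and this loader's 'Rrs' sheet uses numeric wavelength column headers rather than 'Rrs_<nm>' names:

test_dl, input_dim, output_dim, sample_ids, sample_dates = load_real_test(
    excel_path="field_matchups.xlsx",             # placeholder path
    selected_bands=[443.0, 490.0, 560.0, 665.0],  # placeholder wavelengths
    max_allowed_diff=1.0,         # reject matches more than 1 nm off target
    target_parameter="SPM",
)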

load_real_test_Robust(excel_path, selected_bands, max_allowed_diff=1.0, scaler_Rrs=None, scalers_dict=None, use_diff=False, target_parameter='SPM')

Load and preprocess test data using pre-fitted robust scalers.

This function loads test data from Excel and applies the same preprocessing transformations used during training, including band matching, filtering, and scaling using pre-fitted scalers.

Parameters:

- excel_path (str, required): Path to Excel file containing 'Rrs' and 'parameter' sheets.
- selected_bands (List[float], required): List of wavelengths (nm) to extract from the data.
- max_allowed_diff (float, default 1.0): Maximum wavelength difference (nm) allowed for band matching.
- scaler_Rrs (Optional[Any], default None): Pre-fitted scaler for Rrs data from training.
- scalers_dict (Optional[Dict[str, Any]], default None): Dictionary containing 'log' and 'robust' scalers from training.
- use_diff (bool, default False): Whether to apply first-order differencing to Rrs spectra.
- target_parameter (str, default 'SPM'): Name of target parameter column (e.g., 'SPM', 'TSS').

Returns:

- test_dl: DataLoader for test data.
- input_dim: Number of input features.
- output_dim: Number of output features (always 1).
- sample_ids: List of sample IDs in test set.
- sample_dates: List of sample dates in test set.

Exceptions:

- ValueError: If row counts don't match, no valid samples exist, or band matching fails.

Source code in hypercoast/emit_utils/data_loading.py
def load_real_test_Robust(
    excel_path: str,
    selected_bands: List[float],
    max_allowed_diff: float = 1.0,
    scaler_Rrs: Optional[Any] = None,
    scalers_dict: Optional[Dict[str, Any]] = None,
    use_diff: bool = False,
    target_parameter: str = "SPM",
) -> Tuple[DataLoader, int, int, List[str], List[str]]:
    """Load and preprocess test data using pre-fitted robust scalers.

    This function loads test data from Excel and applies the same preprocessing
    transformations used during training, including band matching, filtering,
    and scaling using pre-fitted scalers.

    Args:
        excel_path: Path to Excel file containing 'Rrs' and 'parameter' sheets.
        selected_bands: List of wavelengths (nm) to extract from the data.
        max_allowed_diff: Maximum wavelength difference (nm) allowed for band matching.
        scaler_Rrs: Pre-fitted scaler for Rrs data from training.
        scalers_dict: Dictionary containing 'log' and 'robust' scalers from training.
        use_diff: Whether to apply first-order differencing to Rrs spectra.
        target_parameter: Name of target parameter column (e.g., 'SPM', 'TSS').

    Returns:
        test_dl: DataLoader for test data.
        input_dim: Number of input features.
        output_dim: Number of output features (always 1).
        sample_ids: List of sample IDs in test set.
        sample_dates: List of sample dates in test set.

    Raises:
        ValueError: If row counts don't match, no valid samples exist, or band
            matching fails.
    """

    df_rrs = pd.read_excel(excel_path, sheet_name="Rrs")
    df_param = pd.read_excel(excel_path, sheet_name="parameter")

    if df_rrs.shape[0] != df_param.shape[0]:
        raise ValueError(
            f"❌ The number of rows in the Rrs table and parameter table do not match. Rrs: {df_rrs.shape[0]}, parameter: {df_param.shape[0]}"
        )

    sample_ids = df_rrs["Site Label"].astype(str).tolist()
    sample_dates = df_rrs["Date"].astype(str).tolist()

    # Match target bands
    rrs_wavelengths = []
    rrs_cols = []
    for col in df_rrs.columns:
        try:
            wl = float(col)
            rrs_wavelengths.append(wl)
            rrs_cols.append(col)
        except Exception:
            continue

    band_cols = []
    for target_band in selected_bands:
        diffs = [abs(wl - target_band) for wl in rrs_wavelengths]
        min_diff = min(diffs)
        if min_diff > max_allowed_diff:
            raise ValueError(
                f"Target wavelength {target_band} nm cannot be matched, error {min_diff:.2f} nm exceeds the allowed range"
            )
        best_idx = diffs.index(min_diff)
        band_cols.append(rrs_cols[best_idx])

    print(f"\n✅ Band matching successful, {len(selected_bands)} target bands in total")
    print(f"Final number of valid test samples: {df_rrs.shape[0]}\n")

    Rrs_array = df_rrs[band_cols].values
    param_array = df_param[[target_parameter]].values.flatten()
    # === Key: Remove rows with NaN/Inf before differencing ===
    mask_inputs_ok = np.all(np.isfinite(Rrs_array), axis=1)
    mask_target_ok = np.isfinite(param_array)
    mask_ok = mask_inputs_ok & mask_target_ok
    if not np.any(mask_ok):
        raise ValueError("❌ Valid samples = 0 (NaN/Inf found in input or target).")
    dropped = int(len(mask_ok) - mask_ok.sum())
    if dropped > 0:
        print(
            f"⚠️ Dropped {dropped} invalid samples (containing NaN/Inf) before differencing"
        )

    Rrs_array = Rrs_array[mask_ok]
    param_array = param_array[mask_ok]
    sample_ids = [sid for sid, keep in zip(sample_ids, mask_ok) if keep]
    sample_dates = [d for d, keep in zip(sample_dates, mask_ok) if keep]

    if use_diff:
        Rrs_array = np.diff(Rrs_array, axis=1)

    Rrs_tensor = torch.tensor(Rrs_array, dtype=torch.float32)
    Rrs_normalized = scaler_Rrs.transform(Rrs_tensor).numpy()

    log_scaler = scalers_dict["log"]
    robust_scaler = scalers_dict["robust"]
    param_log = log_scaler.transform(
        torch.tensor(param_array.reshape(-1, 1), dtype=torch.float32)
    )
    param_transformed = robust_scaler.transform(param_log).numpy()

    dataset = TensorDataset(
        torch.tensor(Rrs_normalized, dtype=torch.float32),
        torch.tensor(param_transformed.reshape(-1, 1), dtype=torch.float32),
    )
    test_dl = DataLoader(dataset, batch_size=len(dataset), shuffle=False, num_workers=0)

    input_dim = Rrs_tensor.shape[1]
    output_dim = 1

    return test_dl, input_dim, output_dim, sample_ids, sample_dates
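
A hedged usage sketch, reusing the scalers returned by load_real_data_Robust so the test pipeline matches training. Mapping predictions back to original units would require inverting the robust and log scalers; whether those classes expose an inverse_transform method is not shown in this excerpt, so that step is omitted here:

test_dl, input_dim, output_dim, sample_ids, sample_dates = load_real_test_Robust(
    excel_path="field_matchups.xlsx",             # placeholder path
    selected_bands=[443.0, 490.0, 560.0, 665.0],  # placeholder wavelengths
    scaler_Rrs=scaler_Rrs,              # fitted Rrs scaler from training
    scalers_dict=TSS_scalers_dict,      # {'log': ..., 'robust': ...} from training
    target_parameter="SPM",
)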