preprocess module¶
Preprocessing and scaling utilities for hyperspectral data.
This module provides custom scalers for preprocessing hyperspectral remote sensing data, including robust quantile-based scaling and logarithmic transformations.
LogScaler¶
Logarithmic scaler with optional shifting for non-positive values.
This scaler applies a log10 transformation after optionally shifting data to ensure all values are positive. It is useful for compressing the dynamic range of water quality parameters that span multiple orders of magnitude.
The transformation steps are:

1. Optionally shift values so the minimum becomes 0 (if `shift_min=True`).
2. Add a small safety term to avoid log(0).
3. Apply the log10 transformation.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `shift_min` | `bool` | Whether to shift data so the minimum value becomes 0. | `False` |
| `safety_term` | `float` | Small constant added before log to avoid log(0). | `1e-08` |

Attributes:

| Name | Type | Description |
|---|---|---|
| `global_min` | | Minimum value observed during fitting. |
| `shift_value` | | Amount to shift data (computed from `global_min`). |
| `fitted` | | Whether the scaler has been fitted to data. |
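A minimal usage sketch (not part of the library docs; the values are illustrative and the import path is assumed from the source path shown below):

```python
import torch

from hypercoast.emit_utils.preprocess import LogScaler  # path assumed from above

# Water quality values spanning several orders of magnitude.
y = torch.tensor([0.01, 0.5, 3.0, 120.0, 4500.0])

scaler = LogScaler()
y_log = scaler.fit_transform(y)              # roughly [-2.0, -0.30, 0.48, 2.08, 3.65]
y_back = scaler.inverse_transform(y_log)     # back to the original scale

print(torch.allclose(y_back, y, rtol=1e-4))  # True (up to the safety term)
```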
Source code in hypercoast/emit_utils/preprocess.py
class LogScaler:
"""Logarithmic scaler with optional shifting for non-positive values.
This scaler applies log10 transformation after optionally shifting data to
ensure all values are positive. Useful for compressing the dynamic range of
water quality parameters that span multiple orders of magnitude.
The transformation steps are:
1. Optionally shift values so minimum becomes 0 (if shift_min=True).
2. Add a small safety term to avoid log(0).
3. Apply log10 transformation.
Args:
shift_min: Whether to shift data so the minimum value becomes 0.
safety_term: Small constant added before log to avoid log(0).
Attributes:
global_min: Minimum value observed during fitting.
shift_value: Amount to shift data (computed from global_min).
fitted: Whether the scaler has been fitted to data.
"""
def __init__(self, shift_min: bool = False, safety_term: float = 1e-8):
self.safety_term = safety_term
self.shift_min = shift_min # Whether to shift minimum value to 0
self.global_min = None
self.shift_value = None
self.fitted = False
def fit(self, y: Union[torch.Tensor, "np.ndarray"]) -> "LogScaler":
"""Fit the scaler by computing the global minimum.
Args:
y: Input values (can be tensor or array).
Returns:
self: Fitted scaler instance.
"""
if not isinstance(y, torch.Tensor):
y = torch.tensor(y, dtype=torch.float32)
self.global_min = torch.min(y).item()
        # Shift value that would make the minimum 0 (stored for reference;
        # transform shifts by global_min directly)
        self.shift_value = abs(self.global_min) if self.global_min < 0 else 0
self.fitted = True
return self
def transform(self, y: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Transform data by applying log10 transformation.
Args:
y: Input values (can be tensor or array).
Returns:
Log-transformed values.
Raises:
ValueError: If scaler has not been fitted.
"""
if not self.fitted:
raise ValueError("Scaler must be fitted before transform")
if not isinstance(y, torch.Tensor):
y = torch.tensor(y, dtype=torch.float32)
# Step 1: Shift values so minimum becomes 0
if self.shift_min:
# Shift to make minimum = 0
shifted = y - self.global_min
else:
            # No shift requested; clamp negatives to zero instead
shifted = torch.clamp(y, min=0) # Ensure no negative values
# Step 2: Add safety term to avoid log(0)
safe_values = shifted + self.safety_term
# Step 3: Apply log10
log_values = torch.log10(safe_values)
return log_values
def fit_transform(self, y: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Fit the scaler and transform data in one step.
Args:
y: Input values (can be tensor or array).
Returns:
Log-transformed values.
"""
return self.fit(y).transform(y)
def inverse_transform(
self, y_log: Union[torch.Tensor, "np.ndarray"]
) -> torch.Tensor:
"""Inverse transform log-transformed data back to original scale.
Args:
y_log: Log-transformed values.
Returns:
Values in original scale.
Raises:
ValueError: If scaler has not been fitted.
"""
if not self.fitted:
raise ValueError("Scaler must be fitted before inverse_transform")
if not isinstance(y_log, torch.Tensor):
y_log = torch.tensor(y_log, dtype=torch.float32)
# Step 1: Apply 10^y
exp_values = torch.pow(10, y_log)
# Step 2: Remove safety term
safe_removed = exp_values - self.safety_term
        # Step 3: Undo the shift applied in transform (which computed y - global_min)
        if self.shift_min:
            original_values = safe_removed + self.global_min
else:
original_values = safe_removed
return original_values
fit(self, y)¶
Fit the scaler by computing the global minimum.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `y` | `Union[torch.Tensor, np.ndarray]` | Input values (can be tensor or array). | *required* |

Returns:

| Type | Description |
|---|---|
| `self` | Fitted scaler instance. |
Source code in hypercoast/emit_utils/preprocess.py
def fit(self, y: Union[torch.Tensor, "np.ndarray"]) -> "LogScaler":
"""Fit the scaler by computing the global minimum.
Args:
y: Input values (can be tensor or array).
Returns:
self: Fitted scaler instance.
"""
if not isinstance(y, torch.Tensor):
y = torch.tensor(y, dtype=torch.float32)
self.global_min = torch.min(y).item()
    # Shift value that would make the minimum 0 (stored for reference;
    # transform shifts by global_min directly)
    self.shift_value = abs(self.global_min) if self.global_min < 0 else 0
self.fitted = True
return self
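A quick sketch of what fit records (illustrative values):

```python
import torch

from hypercoast.emit_utils.preprocess import LogScaler  # path assumed from above

scaler = LogScaler(shift_min=True).fit(torch.tensor([-3.0, 1.0, 5.0]))
print(scaler.global_min, scaler.shift_value, scaler.fitted)  # -3.0 3.0 True
```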
fit_transform(self, y)¶
Fit the scaler and transform data in one step.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `y` | `Union[torch.Tensor, np.ndarray]` | Input values (can be tensor or array). | *required* |

Returns:

| Type | Description |
|---|---|
| `Tensor` | Log-transformed values. |
Source code in hypercoast/emit_utils/preprocess.py
def fit_transform(self, y: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Fit the scaler and transform data in one step.
Args:
y: Input values (can be tensor or array).
Returns:
Log-transformed values.
"""
return self.fit(y).transform(y)
inverse_transform(self, y_log)¶
Inverse transform log-transformed data back to original scale.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `y_log` | `Union[torch.Tensor, np.ndarray]` | Log-transformed values. | *required* |

Returns:

| Type | Description |
|---|---|
| `Tensor` | Values in original scale. |

Exceptions:

| Type | Description |
|---|---|
| `ValueError` | If scaler has not been fitted. |
Source code in hypercoast/emit_utils/preprocess.py
def inverse_transform(
self, y_log: Union[torch.Tensor, "np.ndarray"]
) -> torch.Tensor:
"""Inverse transform log-transformed data back to original scale.
Args:
y_log: Log-transformed values.
Returns:
Values in original scale.
Raises:
ValueError: If scaler has not been fitted.
"""
if not self.fitted:
raise ValueError("Scaler must be fitted before inverse_transform")
if not isinstance(y_log, torch.Tensor):
y_log = torch.tensor(y_log, dtype=torch.float32)
# Step 1: Apply 10^y
exp_values = torch.pow(10, y_log)
# Step 2: Remove safety term
safe_removed = exp_values - self.safety_term
    # Step 3: Undo the shift applied in transform (which computed y - global_min)
    if self.shift_min:
        original_values = safe_removed + self.global_min
else:
original_values = safe_removed
return original_values
transform(self, y)¶
Transform data by applying log10 transformation.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `y` | `Union[torch.Tensor, np.ndarray]` | Input values (can be tensor or array). | *required* |

Returns:

| Type | Description |
|---|---|
| `Tensor` | Log-transformed values. |

Exceptions:

| Type | Description |
|---|---|
| `ValueError` | If scaler has not been fitted. |
Source code in hypercoast/emit_utils/preprocess.py
def transform(self, y: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Transform data by applying log10 transformation.
Args:
y: Input values (can be tensor or array).
Returns:
Log-transformed values.
Raises:
ValueError: If scaler has not been fitted.
"""
if not self.fitted:
raise ValueError("Scaler must be fitted before transform")
if not isinstance(y, torch.Tensor):
y = torch.tensor(y, dtype=torch.float32)
# Step 1: Shift values so minimum becomes 0
if self.shift_min:
# Shift to make minimum = 0
shifted = y - self.global_min
else:
        # No shift requested; clamp negatives to zero instead
shifted = torch.clamp(y, min=0) # Ensure no negative values
# Step 2: Add safety term to avoid log(0)
safe_values = shifted + self.safety_term
# Step 3: Apply log10
log_values = torch.log10(safe_values)
return log_values
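A short sketch of shift_min=True on data containing negatives (illustrative values):

```python
import torch

from hypercoast.emit_utils.preprocess import LogScaler  # path assumed from above

y = torch.tensor([-2.0, 0.0, 8.0])

scaler = LogScaler(shift_min=True).fit(y)  # global_min = -2.0
y_log = scaler.transform(y)                # log10 of [0, 2, 10] plus the safety term
print(y_log)                               # approximately [-8.0, 0.301, 1.0]
```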
RobustMinMaxScaler¶
Robust MinMax scaler using quantiles for outlier-resistant normalization.
This scaler provides robust scaling by using quantiles instead of min/max values, making it less sensitive to outliers. It can operate in global or feature-wise mode.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `feature_range` | `Tuple[float, float]` | Target range (min, max) for scaled values. | `(0, 1)` |
| `global_scale` | `bool` | If True, compute quantiles across all features globally. If False, compute quantiles independently for each feature. | `True` |
| `robust` | `bool` | If True, use quantiles for scaling. If False, use traditional min/max. | `True` |
| `quantile_range` | `Tuple[float, float]` | Tuple of (lower, upper) quantiles (e.g., (0.25, 0.75) for IQR). | `(0.25, 0.75)` |
| `clip_outliers` | `bool` | If True, clip values outside quantile range before scaling. | `False` |

Attributes:

| Name | Type | Description |
|---|---|---|
| `min_val` | | Fitted minimum (or lower quantile) value(s). |
| `max_val` | | Fitted maximum (or upper quantile) value(s). |
| `fitted` | | Whether the scaler has been fitted to data. |
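A minimal usage sketch (not part of the library docs; the values are illustrative and the import path is assumed from the source path shown below):

```python
import torch

from hypercoast.emit_utils.preprocess import RobustMinMaxScaler  # path assumed

# Reflectance-like values with one extreme outlier.
X = torch.tensor([[0.1], [0.2], [0.3], [0.4], [50.0]])

scaler = RobustMinMaxScaler(clip_outliers=True)  # IQR quantiles by default
X_scaled = scaler.fit_transform(X)
print(X_scaled.squeeze())  # tensor([0.0, 0.0, 0.5, 1.0, 1.0])
```

With robust=False, the outlier would dominate the fitted range and the first four values would all be squeezed below 0.01.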
Source code in hypercoast/emit_utils/preprocess.py
class RobustMinMaxScaler:
"""Robust MinMax scaler using quantiles for outlier-resistant normalization.
This scaler provides robust scaling by using quantiles instead of min/max values,
making it less sensitive to outliers. It can operate in global or feature-wise mode.
Args:
feature_range: Target range (min, max) for scaled values.
global_scale: If True, compute quantiles across all features globally.
If False, compute quantiles independently for each feature.
robust: If True, use quantiles for scaling. If False, use traditional min/max.
quantile_range: Tuple of (lower, upper) quantiles (e.g., (0.25, 0.75) for IQR).
clip_outliers: If True, clip values outside quantile range before scaling.
Attributes:
min_val: Fitted minimum (or lower quantile) value(s).
max_val: Fitted maximum (or upper quantile) value(s).
fitted: Whether the scaler has been fitted to data.
"""
def __init__(
self,
feature_range: Tuple[float, float] = (0, 1),
global_scale: bool = True,
robust: bool = True,
quantile_range: Tuple[float, float] = (0.25, 0.75),
clip_outliers: bool = False,
):
self.feature_range = feature_range
self.global_scale = global_scale
self.robust = robust
self.quantile_range = quantile_range
self.clip_outliers = clip_outliers
self.min_val = None
self.max_val = None
self.fitted = False
def fit(self, X: Union[torch.Tensor, "np.ndarray"]) -> "RobustMinMaxScaler":
"""Fit the scaler to the data by computing quantiles or min/max values.
Args:
X: Input tensor of shape (batch_size, features).
Returns:
self: Fitted scaler instance.
"""
if not isinstance(X, torch.Tensor):
X = torch.tensor(X, dtype=torch.float32)
if self.robust:
# Use quantiles for robust scaling
if self.global_scale:
# Global quantiles across all values
flat_X = X.flatten()
# If tensor is too large, use sampling for quantile calculation
max_samples = 1000000 # 1M samples max
if len(flat_X) > max_samples:
# Randomly sample from the tensor
indices = torch.randperm(len(flat_X))[:max_samples]
sampled_X = flat_X[indices]
self.min_val = torch.quantile(sampled_X, self.quantile_range[0])
self.max_val = torch.quantile(sampled_X, self.quantile_range[1])
else:
self.min_val = torch.quantile(flat_X, self.quantile_range[0])
self.max_val = torch.quantile(flat_X, self.quantile_range[1])
else:
# Feature-wise quantiles
self.min_val = torch.quantile(
X, self.quantile_range[0], dim=0, keepdim=True
)
self.max_val = torch.quantile(
X, self.quantile_range[1], dim=0, keepdim=True
)
else:
# Use traditional min/max
if self.global_scale:
# Global min/max across all values
self.min_val = torch.min(X)
self.max_val = torch.max(X)
else:
# Feature-wise min/max
self.min_val = torch.min(X, dim=0, keepdim=True)[0]
self.max_val = torch.max(X, dim=0, keepdim=True)[0]
self.fitted = True
return self
def transform(self, X: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Transform the data using fitted scaling parameters.
Args:
X: Input tensor of shape (batch_size, features).
Returns:
Scaled tensor in the target feature_range.
Raises:
ValueError: If scaler has not been fitted.
"""
if not self.fitted:
raise ValueError("Scaler must be fitted before transform")
if not isinstance(X, torch.Tensor):
X = torch.tensor(X, dtype=torch.float32)
# Clip outliers if using robust scaling
if self.robust and self.clip_outliers:
X = torch.clamp(X, min=self.min_val, max=self.max_val)
# Avoid division by zero
range_val = self.max_val - self.min_val
range_val = torch.where(range_val == 0, torch.ones_like(range_val), range_val)
# Scale to [0, 1]
scaled = (X - self.min_val) / range_val
# Scale to desired range
min_target, max_target = self.feature_range
return scaled * (max_target - min_target) + min_target
def fit_transform(self, X: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Fit the scaler and transform the data in one step.
Args:
X: Input tensor of shape (batch_size, features).
Returns:
Scaled tensor in the target feature_range.
"""
return self.fit(X).transform(X)
def inverse_transform(self, X: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Inverse transform scaled data back to original scale.
Args:
X: Scaled tensor.
Returns:
Tensor in original scale.
Raises:
ValueError: If scaler has not been fitted.
"""
if not self.fitted:
raise ValueError("Scaler must be fitted before inverse_transform")
if not isinstance(X, torch.Tensor):
X = torch.tensor(X, dtype=torch.float32)
min_target, max_target = self.feature_range
# Scale back to [0, 1]
normalized = (X - min_target) / (max_target - min_target)
# Scale back to original range
range_val = self.max_val - self.min_val
range_val = torch.where(range_val == 0, torch.ones_like(range_val), range_val)
return normalized * range_val + self.min_val
fit(self, X)¶
Fit the scaler to the data by computing quantiles or min/max values.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `X` | `Union[torch.Tensor, np.ndarray]` | Input tensor of shape (batch_size, features). | *required* |

Returns:

| Type | Description |
|---|---|
| `self` | Fitted scaler instance. |
Source code in hypercoast/emit_utils/preprocess.py
def fit(self, X: Union[torch.Tensor, "np.ndarray"]) -> "RobustMinMaxScaler":
"""Fit the scaler to the data by computing quantiles or min/max values.
Args:
X: Input tensor of shape (batch_size, features).
Returns:
self: Fitted scaler instance.
"""
if not isinstance(X, torch.Tensor):
X = torch.tensor(X, dtype=torch.float32)
if self.robust:
# Use quantiles for robust scaling
if self.global_scale:
# Global quantiles across all values
flat_X = X.flatten()
# If tensor is too large, use sampling for quantile calculation
max_samples = 1000000 # 1M samples max
if len(flat_X) > max_samples:
# Randomly sample from the tensor
indices = torch.randperm(len(flat_X))[:max_samples]
sampled_X = flat_X[indices]
self.min_val = torch.quantile(sampled_X, self.quantile_range[0])
self.max_val = torch.quantile(sampled_X, self.quantile_range[1])
else:
self.min_val = torch.quantile(flat_X, self.quantile_range[0])
self.max_val = torch.quantile(flat_X, self.quantile_range[1])
else:
# Feature-wise quantiles
self.min_val = torch.quantile(
X, self.quantile_range[0], dim=0, keepdim=True
)
self.max_val = torch.quantile(
X, self.quantile_range[1], dim=0, keepdim=True
)
else:
# Use traditional min/max
if self.global_scale:
# Global min/max across all values
self.min_val = torch.min(X)
self.max_val = torch.max(X)
else:
# Feature-wise min/max
self.min_val = torch.min(X, dim=0, keepdim=True)[0]
self.max_val = torch.max(X, dim=0, keepdim=True)[0]
self.fitted = True
return self
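For inputs with more than one million elements, fit estimates the global quantiles from a random 1M-element sample rather than the full tensor. A quick sketch (illustrative):

```python
import torch

from hypercoast.emit_utils.preprocess import RobustMinMaxScaler  # path assumed

torch.manual_seed(0)
X = torch.rand(2_000_000, 1)  # > 1M elements triggers the sampling path

scaler = RobustMinMaxScaler().fit(X)
print(scaler.min_val, scaler.max_val)  # close to 0.25 and 0.75 for uniform data
```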
fit_transform(self, X)¶
Fit the scaler and transform the data in one step.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `X` | `Union[torch.Tensor, np.ndarray]` | Input tensor of shape (batch_size, features). | *required* |

Returns:

| Type | Description |
|---|---|
| `Tensor` | Scaled tensor in the target feature_range. |
Source code in hypercoast/emit_utils/preprocess.py
def fit_transform(self, X: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Fit the scaler and transform the data in one step.
Args:
X: Input tensor of shape (batch_size, features).
Returns:
Scaled tensor in the target feature_range.
"""
return self.fit(X).transform(X)
inverse_transform(self, X)¶
Inverse transform scaled data back to original scale.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `X` | `Union[torch.Tensor, np.ndarray]` | Scaled tensor. | *required* |

Returns:

| Type | Description |
|---|---|
| `Tensor` | Tensor in original scale. |

Exceptions:

| Type | Description |
|---|---|
| `ValueError` | If scaler has not been fitted. |
Source code in hypercoast/emit_utils/preprocess.py
def inverse_transform(self, X: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Inverse transform scaled data back to original scale.
Args:
X: Scaled tensor.
Returns:
Tensor in original scale.
Raises:
ValueError: If scaler has not been fitted.
"""
if not self.fitted:
raise ValueError("Scaler must be fitted before inverse_transform")
if not isinstance(X, torch.Tensor):
X = torch.tensor(X, dtype=torch.float32)
min_target, max_target = self.feature_range
# Scale back to [0, 1]
normalized = (X - min_target) / (max_target - min_target)
# Scale back to original range
range_val = self.max_val - self.min_val
range_val = torch.where(range_val == 0, torch.ones_like(range_val), range_val)
return normalized * range_val + self.min_val
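A round-trip sketch (illustrative). Note that with clip_outliers=True, values clipped during transform cannot be recovered:

```python
import torch

from hypercoast.emit_utils.preprocess import RobustMinMaxScaler  # path assumed

X = torch.tensor([[1.0], [2.0], [3.0], [4.0]])

scaler = RobustMinMaxScaler(robust=False)  # plain min/max for an exact round trip
X_back = scaler.inverse_transform(scaler.fit_transform(X))
print(torch.allclose(X_back, X))  # True
```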
transform(self, X)¶
Transform the data using fitted scaling parameters.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `X` | `Union[torch.Tensor, np.ndarray]` | Input tensor of shape (batch_size, features). | *required* |

Returns:

| Type | Description |
|---|---|
| `Tensor` | Scaled tensor in the target feature_range. |

Exceptions:

| Type | Description |
|---|---|
| `ValueError` | If scaler has not been fitted. |
Source code in hypercoast/emit_utils/preprocess.py
def transform(self, X: Union[torch.Tensor, "np.ndarray"]) -> torch.Tensor:
"""Transform the data using fitted scaling parameters.
Args:
X: Input tensor of shape (batch_size, features).
Returns:
Scaled tensor in the target feature_range.
Raises:
ValueError: If scaler has not been fitted.
"""
if not self.fitted:
raise ValueError("Scaler must be fitted before transform")
if not isinstance(X, torch.Tensor):
X = torch.tensor(X, dtype=torch.float32)
# Clip outliers if using robust scaling
if self.robust and self.clip_outliers:
X = torch.clamp(X, min=self.min_val, max=self.max_val)
# Avoid division by zero
range_val = self.max_val - self.min_val
range_val = torch.where(range_val == 0, torch.ones_like(range_val), range_val)
# Scale to [0, 1]
scaled = (X - self.min_val) / range_val
# Scale to desired range
min_target, max_target = self.feature_range
return scaled * (max_target - min_target) + min_target
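A feature-wise sketch (global_scale=False) with illustrative values; each feature is scaled by its own statistics:

```python
import torch

from hypercoast.emit_utils.preprocess import RobustMinMaxScaler  # path assumed

# Two features on very different scales.
X = torch.tensor([[0.1, 100.0],
                  [0.2, 200.0],
                  [0.3, 300.0],
                  [0.4, 400.0]])

scaler = RobustMinMaxScaler(global_scale=False, robust=False)
print(scaler.fit_transform(X))
# tensor([[0.0000, 0.0000],
#         [0.3333, 0.3333],
#         [0.6667, 0.6667],
#         [1.0000, 1.0000]])
```

With global_scale=True, the second feature would dominate the fitted range and the first feature would collapse toward zero.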