cesl module¶
Utilities for working with the Coastal Ecosystem Spectral Library (CESL).
cesl_to_gdf(sample_ids=None, include_units=False, max_workers=8, timeout=30, skip_missing_coordinates=True, skip_errors=False, **search_kwargs)
¶
Convert CESL site metadata to a GeoPandas GeoDataFrame.
Source code in hypercoast/cesl.py
def cesl_to_gdf(
    sample_ids: Optional[Iterable[int]] = None,
    include_units: bool = False,
    max_workers: int = 8,
    timeout: int = 30,
    skip_missing_coordinates: bool = True,
    skip_errors: bool = False,
    **search_kwargs: Any,
) -> gpd.GeoDataFrame:
    """Build a point GeoDataFrame from CESL site records.

    Every argument is forwarded unchanged to :func:`get_cesl_sites`; see that
    function for the meaning of each option.

    Args:
        sample_ids (Iterable[int], optional): CESL sample IDs to convert.
        include_units (bool, optional): Forwarded to ``get_cesl_sites``.
            Defaults to False.
        max_workers (int, optional): Forwarded to ``get_cesl_sites``.
            Defaults to 8.
        timeout (int, optional): Forwarded to ``get_cesl_sites``. Defaults to 30.
        skip_missing_coordinates (bool, optional): Forwarded to
            ``get_cesl_sites``. Defaults to True.
        skip_errors (bool, optional): Forwarded to ``get_cesl_sites``.
            Defaults to False.
        **search_kwargs: Extra keyword arguments forwarded to
            ``get_cesl_sites``.

    Returns:
        geopandas.GeoDataFrame: One row per site record, with point geometry
            built from the ``longitude`` and ``latitude`` columns and CRS
            ``EPSG:4326``. An empty GeoDataFrame (same CRS) is returned when
            no records are available.

    Raises:
        ImportError: If geopandas is not installed.
    """
    # geopandas is an optional dependency, so import it lazily and fail with a
    # message that names the missing package.
    try:
        import geopandas as gpd
    except ImportError as exc:
        raise ImportError(
            "geopandas is required to convert CESL sites to a GeoDataFrame."
        ) from exc
    import pandas as pd

    sites = get_cesl_sites(
        sample_ids=sample_ids,
        include_units=include_units,
        max_workers=max_workers,
        timeout=timeout,
        skip_missing_coordinates=skip_missing_coordinates,
        skip_errors=skip_errors,
        **search_kwargs,
    )

    if sites:
        table = pd.DataFrame(sites)
        points = gpd.points_from_xy(table["longitude"], table["latitude"])
        return gpd.GeoDataFrame(table, geometry=points, crs="EPSG:4326")

    # Nothing to convert: still hand back a frame with a geometry column and CRS.
    return gpd.GeoDataFrame(geometry=[], crs="EPSG:4326")
cesl_to_geojson(output=None, sample_ids=None, include_units=False, max_workers=8, timeout=30, skip_missing_coordinates=True, skip_errors=False, **search_kwargs)
¶
Create a GeoJSON feature collection for CESL sites.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
output |
str |
Output GeoJSON path. If provided, the GeoJSON is written to disk. |
None |
sample_ids |
Iterable[int] |
CESL sample IDs to export. |
None |
include_units |
bool |
Whether to preserve units in properties. Defaults to False. |
False |
max_workers |
int |
Number of worker threads used to fetch metadata. Defaults to 8. |
8 |
timeout |
int |
Request timeout in seconds. Defaults to 30. |
30 |
skip_missing_coordinates |
bool |
Whether to skip samples without coordinates. Defaults to True. |
True |
skip_errors |
bool |
Whether to skip samples that fail for any reason (e.g. non-JSON API responses). Defaults to False. |
False |
**search_kwargs |
Any |
Additional arguments passed to `search_cesl` when `sample_ids` is not provided. |
{} |
Returns:
| Type | Description |
|---|---|
dict[str, Any] |
A GeoJSON FeatureCollection. |
Source code in hypercoast/cesl.py
def cesl_to_geojson(
    output: Optional[str] = None,
    sample_ids: Optional[Iterable[int]] = None,
    include_units: bool = False,
    max_workers: int = 8,
    timeout: int = 30,
    skip_missing_coordinates: bool = True,
    skip_errors: bool = False,
    **search_kwargs: Any,
) -> Dict[str, Any]:
    """Assemble CESL sites into a GeoJSON FeatureCollection.

    Site records are fetched with :func:`get_cesl_sites` and each record is
    turned into a feature. When ``output`` is given, the collection is also
    written to that path as UTF-8 JSON (indented by two spaces), creating the
    parent directory if needed.

    Args:
        output (str, optional): Destination GeoJSON path. When omitted,
            nothing is written to disk.
        sample_ids (Iterable[int], optional): CESL sample IDs to export.
        include_units (bool, optional): Whether to preserve units in
            properties. Defaults to False.
        max_workers (int, optional): Number of worker threads used to fetch
            metadata. Defaults to 8.
        timeout (int, optional): Request timeout in seconds. Defaults to 30.
        skip_missing_coordinates (bool, optional): Whether to skip samples
            without coordinates. Defaults to True.
        skip_errors (bool, optional): Whether to skip samples that fail for
            any reason (e.g. non-JSON API responses). Defaults to False.
        **search_kwargs: Additional arguments passed to :func:`search_cesl`
            when ``sample_ids`` is not provided.

    Returns:
        dict[str, Any]: A GeoJSON FeatureCollection.
    """
    sites = get_cesl_sites(
        sample_ids=sample_ids,
        include_units=include_units,
        max_workers=max_workers,
        timeout=timeout,
        skip_missing_coordinates=skip_missing_coordinates,
        skip_errors=skip_errors,
        **search_kwargs,
    )
    features = [_build_feature(site) for site in sites]
    collection = {"type": "FeatureCollection", "features": features}

    if output is None:
        return collection

    # Resolve to an absolute path first so the parent directory is never "".
    target = os.path.abspath(output)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(collection, handle, indent=2)
    return collection
get_cesl_metadata(sample_id, include_units=False, timeout=30, crosswalk='speclib')
¶
Retrieve metadata for a CESL sample.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
sample_id |
int |
CESL sample ID. |
required |
include_units |
bool |
Whether to preserve CESL units metadata. Defaults to False. |
False |
timeout |
int |
Request timeout in seconds. Defaults to 30. |
30 |
crosswalk |
str |
Metadata crosswalk type. Defaults to `"speclib"`. |
'speclib' |
Returns:
| Type | Description |
|---|---|
dict[str, Any] |
Normalized metadata for the sample. |
Source code in hypercoast/cesl.py
def get_cesl_metadata(
    sample_id: int,
    include_units: bool = False,
    timeout: int = 30,
    crosswalk: str = "speclib",
) -> Dict[str, Any]:
    """Fetch and normalize the metadata record of one CESL sample.

    Args:
        sample_id (int): CESL sample ID.
        include_units (bool, optional): Whether to preserve CESL units
            metadata. Defaults to False.
        timeout (int, optional): Request timeout in seconds. Defaults to 30.
        crosswalk (str, optional): Metadata crosswalk type sent with the
            request. Defaults to ``"speclib"``.

    Returns:
        dict[str, Any]: Normalized metadata for the sample.
    """
    endpoint = f"sample/{sample_id}/metadata"
    query = {"format": "json", "crosswalk": crosswalk}
    response = _request_cesl(endpoint, params=query, timeout=timeout)

    # Unwrap the response payload before normalizing it.
    raw_metadata = _get_payload(response, _METADATA_KEY_CANDIDATES)
    return _normalize_cesl_metadata(raw_metadata, include_units=include_units)
get_cesl_sites(sample_ids=None, include_units=False, max_workers=8, timeout=30, skip_missing_coordinates=True, skip_errors=False, **search_kwargs)
¶
Retrieve CESL site metadata for a set of sample IDs.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
sample_ids |
Iterable[int] |
CESL sample IDs to retrieve. Defaults to the full CESL catalog or a filtered catalog search. |
None |
include_units |
bool |
Whether to preserve units in metadata. Defaults to False. |
False |
max_workers |
int |
Number of worker threads used to fetch metadata. Defaults to 8. |
8 |
timeout |
int |
Request timeout in seconds. Defaults to 30. |
30 |
skip_missing_coordinates |
bool |
Whether to skip samples without coordinates. Defaults to True. |
True |
skip_errors |
bool |
Whether to skip samples that fail for any reason (e.g. non-JSON API responses). A warning is emitted for each skipped sample. Defaults to False. |
False |
**search_kwargs |
Any |
Additional arguments passed to `search_cesl` when `sample_ids` is not provided. |
{} |
Returns:
| Type | Description |
|---|---|
list[dict[str, Any]] |
Normalized site records including coordinates. |
Source code in hypercoast/cesl.py
def get_cesl_sites(
    sample_ids: Optional[Iterable[int]] = None,
    include_units: bool = False,
    max_workers: int = 8,
    timeout: int = 30,
    skip_missing_coordinates: bool = True,
    skip_errors: bool = False,
    **search_kwargs: Any,
) -> List[Dict[str, Any]]:
    """Retrieve CESL site metadata for a set of sample IDs.

    Metadata is fetched concurrently in batches of 50 samples. Each returned
    record is the sample's metadata plus ``sample_id``, ``latitude`` and
    ``longitude`` entries, and the result is sorted by ``sample_id``.

    Args:
        sample_ids (Iterable[int], optional): CESL sample IDs to retrieve.
            Defaults to the full CESL catalog or a filtered catalog search.
        include_units (bool, optional): Whether to preserve units in metadata.
            Defaults to False.
        max_workers (int, optional): Number of worker threads used to fetch
            metadata. Defaults to 8.
        timeout (int, optional): Request timeout in seconds. Defaults to 30.
        skip_missing_coordinates (bool, optional): Whether to skip samples
            without coordinates. Defaults to True.
        skip_errors (bool, optional): Whether to skip samples that fail for any
            reason (e.g. non-JSON API responses). A warning is emitted for each
            skipped sample. Defaults to False.
        **search_kwargs: Additional arguments passed to :func:`search_cesl` when
            ``sample_ids`` is not provided.

    Returns:
        list[dict[str, Any]]: Normalized site records including coordinates.

    Raises:
        _MissingCoordinateError: If a sample has no usable coordinates and
            ``skip_missing_coordinates`` is False.
        RuntimeError: If fetching a sample fails for any other reason and
            ``skip_errors`` is False. The original error is chained as the
            cause.
    """
    if sample_ids is None:
        sample_ids = search_cesl(timeout=timeout, **search_kwargs)
    # Materialize once: the IDs are sliced into batches below.
    sample_ids = list(sample_ids)

    def fetch_site(sample_id: int) -> Dict[str, Any]:
        """Fetch one sample's metadata and attach its ID and coordinates."""
        metadata = get_cesl_metadata(
            sample_id=sample_id, include_units=include_units, timeout=timeout
        )
        latitude = _extract_coordinate(metadata, _LATITUDE_KEYS, "latitude")
        longitude = _extract_coordinate(metadata, _LONGITUDE_KEYS, "longitude")
        return {
            **metadata,
            "sample_id": sample_id,
            "latitude": latitude,
            "longitude": longitude,
        }

    # Submitting in bounded batches limits how many requests are queued at once.
    _BATCH_SIZE = 50
    records: List[Dict[str, Any]] = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for batch_start in range(0, len(sample_ids), _BATCH_SIZE):
            batch = sample_ids[batch_start : batch_start + _BATCH_SIZE]
            futures = {
                executor.submit(fetch_site, sample_id): sample_id for sample_id in batch
            }
            try:
                for future in as_completed(futures):
                    sample_id = futures[future]
                    try:
                        records.append(future.result())
                    except _MissingCoordinateError:
                        if not skip_missing_coordinates:
                            raise
                    except Exception as exc:
                        if skip_errors:
                            warnings.warn(
                                f"Skipping sample {sample_id}: {exc}",
                                RuntimeWarning,
                                stacklevel=2,
                            )
                        else:
                            raise RuntimeError(
                                f"Failed to retrieve CESL metadata for sample {sample_id}."
                            ) from exc
            except BaseException:
                # Leaving the ``with`` block waits for every submitted future.
                # Cancel the ones that have not started so a failure (or a
                # KeyboardInterrupt) is not delayed by requests whose results
                # would be discarded anyway. Running/finished futures ignore
                # the cancel call.
                for pending in futures:
                    pending.cancel()
                raise
    # Completion order is nondeterministic; sort for a stable result.
    records.sort(key=lambda record: record["sample_id"])
    return records
get_cesl_spectrum(sample_id, spectrum_key=None, timeout=30)
¶
Retrieve the wavelength and spectrum values for a CESL sample.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
sample_id |
int |
CESL sample ID. |
required |
spectrum_key |
str |
Name of the spectrum field to extract.
Defaults to the first non-`wavelength` field returned by the API. |
None |
timeout |
int |
Request timeout in seconds. Defaults to 30. |
30 |
Returns:
| Type | Description |
|---|---|
pandas.DataFrame |
A DataFrame with `wavelength` and spectrum columns. |
Source code in hypercoast/cesl.py
def get_cesl_spectrum(
    sample_id: int, spectrum_key: Optional[str] = None, timeout: int = 30
) -> pd.DataFrame:
    """Retrieve the wavelength and spectrum values for a CESL sample.

    Field names in the API payload are matched case-insensitively, both for
    the wavelength field and for ``spectrum_key``.

    Args:
        sample_id (int): CESL sample ID.
        spectrum_key (str, optional): Name of the spectrum field to extract.
            Defaults to the first non-``wavelength`` field returned by the API.
        timeout (int, optional): Request timeout in seconds. Defaults to 30.

    Returns:
        pandas.DataFrame: A DataFrame with ``wavelength`` and spectrum columns.
            The spectrum column is named after the field as spelled in the
            payload. ``attrs`` carries ``sample_id`` and ``spectrum_key``.

    Raises:
        KeyError: If the payload has no wavelength field, has no spectral
            field at all, or has no field matching ``spectrum_key``.
    """
    import pandas as pd

    data = _request_cesl(
        f"sample/{sample_id}/data", params={"format": "json"}, timeout=timeout
    )
    payload = _get_payload(data, _DATA_KEY_CANDIDATES)

    # Spectrum fields are matched case-insensitively below, so the wavelength
    # field must be too; otherwise a payload keyed e.g. "Wavelength" would be
    # rejected here. An exact match wins when present.
    if "wavelength" in payload:
        wavelength_key = "wavelength"
    else:
        wavelength_key = next(
            (key for key in payload if key.lower() == "wavelength"), None
        )
    if wavelength_key is None:
        raise KeyError("CESL spectrum response does not contain 'wavelength'.")

    if spectrum_key is None:
        spectrum_keys = [key for key in payload if key.lower() != "wavelength"]
        if not spectrum_keys:
            raise KeyError("CESL spectrum response does not contain any spectral data.")
        spectrum_key = spectrum_keys[0]
    else:
        # Map lowercase name -> name as spelled in the payload.
        matching_keys = {
            key.lower(): key for key in payload if key.lower() != "wavelength"
        }
        resolved_key = matching_keys.get(spectrum_key.lower())
        if resolved_key is None:
            raise KeyError(f"Could not find spectrum key '{spectrum_key}'.")
        spectrum_key = resolved_key

    # The wavelength column is always named "wavelength", whatever the
    # payload's spelling, so callers can rely on a fixed column name.
    spectrum = pd.DataFrame(
        {"wavelength": payload[wavelength_key], spectrum_key: payload[spectrum_key]}
    )
    spectrum.attrs["sample_id"] = sample_id
    spectrum.attrs["spectrum_key"] = spectrum_key
    return spectrum
plot_cesl_spectrum(sample_id, spectrum_key=None, ax=None, title=None, xlabel='Wavelength (nm)', ylabel=None, figsize=None, x_range=None, y_range=None, timeout=30, **kwargs)
¶
Plot a CESL spectrum for a selected sample.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
sample_id |
int |
CESL sample ID. |
required |
spectrum_key |
str |
Name of the spectrum field to plot. |
None |
ax |
matplotlib.axes.Axes |
Existing axes to plot on. |
None |
title |
str |
Plot title. Defaults to `CESL Sample <id>`. |
None |
xlabel |
str |
X-axis label. Defaults to `Wavelength (nm)`. |
'Wavelength (nm)' |
ylabel |
str |
Y-axis label. Defaults to the selected spectrum key. |
None |
figsize |
Sequence[float] |
Figure size passed to `matplotlib.pyplot.subplots` when `ax` is not provided. |
None |
x_range |
Sequence[float] |
Two-element x-axis range used to exclude wavelength outliers from the visible plot extent. |
None |
y_range |
Sequence[float] |
Two-element y-axis range used to exclude reflectance outliers from the visible plot extent. |
None |
timeout |
int |
Request timeout in seconds. Defaults to 30. |
30 |
**kwargs |
Any |
Additional keyword arguments passed to `Axes.plot`. |
{} |
Returns:
| Type | Description |
|---|---|
matplotlib.axes.Axes |
The axes containing the plot. |
Source code in hypercoast/cesl.py
def plot_cesl_spectrum(
    sample_id: int,
    spectrum_key: Optional[str] = None,
    ax: Optional[plt.Axes] = None,
    title: Optional[str] = None,
    xlabel: str = "Wavelength (nm)",
    ylabel: Optional[str] = None,
    figsize: Optional[Sequence[float]] = None,
    x_range: Optional[Sequence[float]] = None,
    y_range: Optional[Sequence[float]] = None,
    timeout: int = 30,
    **kwargs: Any,
) -> plt.Axes:
    """Draw the spectrum of one CESL sample as a line plot.

    The spectrum is fetched with :func:`get_cesl_spectrum` and plotted against
    wavelength. A legend is drawn unless ``label=None`` is passed explicitly.

    Args:
        sample_id (int): CESL sample ID.
        spectrum_key (str, optional): Name of the spectrum field to plot.
        ax (matplotlib.axes.Axes, optional): Existing axes to plot on. A new
            figure and axes are created when omitted.
        title (str, optional): Plot title. Defaults to ``CESL Sample <id>``.
        xlabel (str, optional): X-axis label. Defaults to ``Wavelength (nm)``.
        ylabel (str, optional): Y-axis label. Defaults to the selected spectrum
            key.
        figsize (Sequence[float], optional): Figure size passed to
            ``matplotlib.pyplot.subplots`` when ``ax`` is not provided.
        x_range (Sequence[float], optional): Two-element x-axis range used to
            exclude wavelength outliers from the visible plot extent.
        y_range (Sequence[float], optional): Two-element y-axis range used to
            exclude reflectance outliers from the visible plot extent.
        timeout (int, optional): Request timeout in seconds. Defaults to 30.
        **kwargs: Additional keyword arguments passed to ``Axes.plot``. A
            ``label`` entry overrides the default ``Sample <id>`` legend label.

    Returns:
        matplotlib.axes.Axes: The axes containing the plot.
    """
    import matplotlib.pyplot as plt

    frame = get_cesl_spectrum(
        sample_id=sample_id, spectrum_key=spectrum_key, timeout=timeout
    )
    # Use the key as resolved by get_cesl_spectrum (covers spectrum_key=None).
    resolved_key = frame.attrs["spectrum_key"]

    if ax is None:
        ax = plt.subplots(figsize=figsize)[1]

    # Pull ``label`` out of kwargs so it is not passed to ``plot`` twice.
    legend_label = kwargs.pop("label", f"Sample {sample_id}")
    ax.plot(frame["wavelength"], frame[resolved_key], label=legend_label, **kwargs)

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel if ylabel else resolved_key)
    ax.set_title(title if title else f"CESL Sample {sample_id}")

    # Apply only the axis limits the caller actually supplied.
    for limits, apply_limits in ((x_range, ax.set_xlim), (y_range, ax.set_ylim)):
        if limits is not None:
            apply_limits(limits)

    if legend_label is not None:
        ax.legend()
    return ax
search_cesl(bbox=None, circle=None, publish_date_start=None, publish_date_end=None, taxonomy=None, biomass=None, coverage=None, timeout=30)
¶
Search the CESL catalog and return matching sample IDs.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
bbox |
Sequence[float] |
Bounding box formatted as `(north, south, east, west)`. |
None |
circle |
Sequence[float] |
Circular search formatted as `(latitude, longitude, radius_km)`. |
None |
publish_date_start |
str |
ISO8601 start date for published samples. |
None |
publish_date_end |
str |
ISO8601 end date for published samples. |
None |
taxonomy |
str |
Binomial or taxonomy path filter. |
None |
biomass |
bool |
Whether biomass measurements are required. |
None |
coverage |
float |
Minimum percentage coverage threshold. |
None |
timeout |
int |
Request timeout in seconds. Defaults to 30. |
30 |
Returns:
| Type | Description |
|---|---|
list[int] |
Matching CESL sample IDs. |
Source code in hypercoast/cesl.py
def search_cesl(
    bbox: Optional[Sequence[float]] = None,
    circle: Optional[Sequence[float]] = None,
    publish_date_start: Optional[str] = None,
    publish_date_end: Optional[str] = None,
    taxonomy: Optional[str] = None,
    biomass: Optional[bool] = None,
    coverage: Optional[float] = None,
    timeout: int = 30,
) -> List[int]:
    """Query the CESL catalog and return the IDs of matching samples.

    Only filters that are not ``None`` are sent with the request.

    Args:
        bbox (Sequence[float], optional): Bounding box formatted as
            ``(north, south, east, west)``.
        circle (Sequence[float], optional): Circular search formatted as
            ``(latitude, longitude, radius_km)``.
        publish_date_start (str, optional): ISO8601 start date for published
            samples.
        publish_date_end (str, optional): ISO8601 end date for published
            samples.
        taxonomy (str, optional): Binomial or taxonomy path filter.
        biomass (bool, optional): Whether biomass measurements are required.
            Sent as a lowercase string (``"true"``/``"false"``).
        coverage (float, optional): Minimum percentage coverage threshold.
        timeout (int, optional): Request timeout in seconds. Defaults to 30.

    Returns:
        list[int]: Matching CESL sample IDs; an empty list when the response
            payload has no ``ids`` entry.
    """
    bbox_value = _format_catalog_param(bbox, 4, "bbox")
    circle_value = _format_catalog_param(circle, 3, "circle")

    # (query name, value) pairs in the order they are added to the request.
    optional_filters = (
        ("bbox", bbox_value),
        ("circle", circle_value),
        ("publish_date_start", publish_date_start),
        ("publish_date_end", publish_date_end),
        ("taxonomy", taxonomy),
        ("biomass", None if biomass is None else str(biomass).lower()),
        ("coverage", coverage),
    )

    params: Dict[str, Any] = {"format": "json"}
    params.update(
        (name, value) for name, value in optional_filters if value is not None
    )

    response = _request_cesl("catalog", params=params, timeout=timeout)
    catalog = _get_payload(response, _CATALOG_KEY_CANDIDATES)
    return catalog.get("ids", [])