xarray provides N-dimensional labeled arrays and Datasets for scientific data. pip install xarray. import xarray as xr. DataArray: da = xr.DataArray(np.zeros((3,4)), dims=["lat","lon"], coords={"lat":[0,1,2],"lon":[10,11,12,13]}, attrs={"units":"K"}). Dataset: ds = xr.Dataset({"temp": da, "precip": da2}). Label select: ds.sel(lat=1), ds.sel(lat=slice(0,2)), ds.sel(time="2024-01"). Integer index: ds.isel(lat=0, lon=slice(0,2)). Nearest: ds.sel(lat=1.5, method="nearest"). Where: da.where(da > 0). Fillna: da.fillna(0.0). Assign: ds.assign(temp_C=ds.temp - 273.15). Groupby: ds.groupby("time.month").mean(). Resample: ds.resample(time="1ME").mean(). Rolling: da.rolling(time=7, center=True).mean(). Coarsen: da.coarsen(lat=2, lon=2).mean(). Interp: ds.interp(lat=np.linspace(0,89,180)). Apply ufunc: xr.apply_ufunc(np.sqrt, da, dask="parallelized", output_dtypes=[float]). Open NetCDF: ds = xr.open_dataset("file.nc", chunks={"time":100}). Multi-file: ds = xr.open_mfdataset("path/*.nc", combine="by_coords"). Write: ds.to_netcdf("out.nc", encoding={"temp":{"zlib":True,"complevel":4}}). Zarr: ds.to_zarr("store.zarr", consolidated=True). Concat: xr.concat([ds1, ds2], dim="time"). Merge: xr.merge([ds_a, ds_b]). Stack: ds.stack(grid=["lat","lon"]). Transpose: da.transpose("lon","lat","time"). Claude Code generates xarray climate data pipelines, multi-file NetCDF aggregators, and labeled array transformations.
CLAUDE.md for xarray
## xarray Stack
- Version: xarray >= 2024.0 | pip install "xarray[io]" for netcdf4/zarr
- DataArray: xr.DataArray(data, dims=[...], coords={...}, attrs={...})
- Dataset: xr.Dataset({"var": da}) | ds["var"] or ds.var
- Indexing: .sel(dim=value/slice) label-based | .isel(dim=int) positional
- Aggregation: .groupby("time.month").mean() | .resample(time="1ME").mean()
- I/O: xr.open_dataset/open_mfdataset with chunks={} for lazy Dask loading
- Write: .to_netcdf(encoding={"var":{"zlib":True}}) | .to_zarr(consolidated=True)
- Backends: netcdf4, h5netcdf, zarr, scipy — install separately
xarray N-D Labeled Array Pipeline
# science/xarray_pipeline.py — N-D labeled arrays with xarray
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Any, Union
import numpy as np
import pandas as pd
import xarray as xr
# Suppress xarray's CF-serialization warnings (e.g. lossy int16 packing round-trips).
# FIX: the bare name `SerializationWarning` was never imported — this line raised
# NameError at import time. The class lives in the xarray namespace.
warnings.filterwarnings("ignore", category=xr.SerializationWarning, module="xarray")
# ── 0. DataArray / Dataset creation ──────────────────────────────────────────
def make_temperature_dataset(
    n_lat: int = 36,
    n_lon: int = 72,
    n_times: int = 365,
    start: str = "2024-01-01",
) -> xr.Dataset:
    """
    Build a synthetic temperature + precipitation Dataset.

    Shows how dims, coords, attrs, and global metadata fit together.
    """
    lat_vals = np.linspace(-87.5, 87.5, n_lat)
    lon_vals = np.linspace(-177.5, 177.5, n_lon)
    time_vals = pd.date_range(start, periods=n_times, freq="D")
    rng = np.random.default_rng(42)
    cube_shape = (n_times, n_lat, n_lon)

    # Temperature: base value + latitudinal gradient + annual cycle + noise.
    pole_cooling = -0.5 * np.abs(lat_vals)           # colder near poles
    annual_cycle = 10 * np.sin(2 * np.pi * np.arange(n_times) / 365)
    kelvin_base = 283.15                             # ~10°C in K
    temp_field = (
        kelvin_base
        + pole_cooling[np.newaxis, :, np.newaxis]
        + annual_cycle[:, np.newaxis, np.newaxis]
        + rng.normal(0, 2, cube_shape)
    )
    precip_field = rng.exponential(2.0, cube_shape)  # mm/day, skewed

    temperature = xr.DataArray(
        temp_field.astype("float32"),
        dims=["time", "lat", "lon"],
        attrs={"units": "K", "long_name": "Air Temperature at 2m",
               "standard_name": "air_temperature"},
    )
    precipitation = xr.DataArray(
        precip_field.astype("float32"),
        dims=["time", "lat", "lon"],
        attrs={"units": "mm/day", "long_name": "Daily Precipitation",
               "standard_name": "precipitation_flux"},
    )
    coords = {
        "lat": xr.DataArray(lat_vals, dims=["lat"], attrs={"units": "degrees_north"}),
        "lon": xr.DataArray(lon_vals, dims=["lon"], attrs={"units": "degrees_east"}),
        "time": xr.DataArray(time_vals, dims=["time"]),
    }
    global_attrs = {
        "title": "Synthetic Climate Dataset",
        "Conventions": "CF-1.8",
        "source": "xarray_pipeline.py demo",
    }
    return xr.Dataset(
        {"temperature": temperature, "precipitation": precipitation},
        coords=coords,
        attrs=global_attrs,
    )
# ── 1. Indexing and selection ─────────────────────────────────────────────────
def extract_region(
    ds: xr.Dataset,
    lat_range: tuple[float, float],
    lon_range: tuple[float, float],
    variables: list[str] | None = None,
) -> xr.Dataset:
    """Spatial subset by lat/lon bounding box.

    Parameters
    ----------
    ds : Dataset with ``lat``/``lon`` coordinates; label slicing requires
        a monotonically sorted index along each dimension.
    lat_range, lon_range : (min, max) bounds — label slices are inclusive
        on both ends.
    variables : optional list of data variables to keep; None keeps all.
        (Fixed the implicit-Optional annotation ``list[str] = None``.)

    Returns
    -------
    xr.Dataset
        The subset — a lazy view; no data is copied.
    """
    subset = ds.sel(
        lat=slice(lat_range[0], lat_range[1]),
        lon=slice(lon_range[0], lon_range[1]),
    )
    # Truthiness also skips an empty list, matching original behavior.
    if variables:
        subset = subset[variables]
    return subset
def extract_time_window(
    ds: xr.Dataset,
    start_date: str,
    end_date: str,
) -> xr.Dataset:
    """Temporal subset by date strings (both endpoints inclusive)."""
    window = slice(start_date, end_date)
    return ds.sel(time=window)
def nearest_point(
    ds: xr.Dataset,
    lat: float,
    lon: float,
) -> xr.Dataset:
    """Extract the grid point nearest to a lat/lon location."""
    # method="nearest" snaps both requested labels to the closest
    # coordinate values instead of requiring exact matches.
    return ds.sel(method="nearest", lat=lat, lon=lon)
# ── 2. Aggregation and statistics ─────────────────────────────────────────────
def seasonal_means(ds: xr.Dataset, variable: str) -> xr.DataArray:
    """
    Mean over each meteorological season (DJF, MAM, JJA, SON).

    Returns a DataArray indexed by season label.
    """
    field = ds[variable]
    by_season = field.groupby("time.season")
    return by_season.mean(dim="time")
def monthly_climatology(ds: xr.Dataset, variable: str) -> xr.DataArray:
    """Monthly means averaged over all years — the climatological mean."""
    by_month = ds[variable].groupby("time.month")
    return by_month.mean(dim="time")
def anomaly(ds: xr.Dataset, variable: str) -> xr.DataArray:
    """
    Compute anomaly: observation minus monthly climatology.

    Groupby arithmetic broadcasts the 12-value climatology back over
    every timestamp, so no manual alignment is needed.
    """
    climatology = monthly_climatology(ds, variable)
    grouped_obs = ds[variable].groupby("time.month")
    return grouped_obs - climatology
def spatial_mean(
    da: xr.DataArray,
    lat_weights: bool = True,
) -> xr.DataArray:
    """
    Area-weighted spatial mean (cos-lat weighting).

    Grid cells shrink toward the poles, so an unweighted mean over-counts
    high latitudes — critical for global averages.

    BUG FIX: the previous implementation normalized cos(lat) weights to sum
    to 1 over ``lat`` only, then *summed* over both ``lat`` and ``lon``,
    inflating the result by a factor of n_lon (e.g. ~10,000 K "global mean"
    temperatures). ``DataArray.weighted(...).mean(...)`` computes the proper
    weighted mean and also handles NaNs correctly.
    """
    reduce_dims = [d for d in ["lat", "lon"] if d in da.dims]
    if lat_weights and "lat" in da.dims:
        weights = np.cos(np.deg2rad(da.lat))
        return da.weighted(weights).mean(dim=reduce_dims)
    return da.mean(dim=reduce_dims)
def resample_monthly(ds: xr.Dataset) -> xr.Dataset:
    """Downsample daily data to monthly mean ("1ME" = month-end frequency)."""
    monthly_bins = ds.resample(time="1ME")
    return monthly_bins.mean()
def rolling_mean(
    da: xr.DataArray,
    window: int = 7,
    dim: str = "time",
    min_count: int | None = None,
) -> xr.DataArray:
    """N-day rolling mean with optional minimum valid count.

    Parameters
    ----------
    window : window length in samples along ``dim``.
    dim : dimension to roll over.
    min_count : minimum number of non-NaN values required per window
        (forwarded as ``min_periods``); None requires a full window.
        (Fixed the implicit-Optional annotation ``int = None``.)
    """
    # center=True labels each window by its midpoint rather than its end.
    roller = da.rolling({dim: window}, center=True, min_periods=min_count)
    return roller.mean()
def coarsen_spatial(
    ds: xr.Dataset,
    factor: int = 4,
    method: str = "mean",  # "mean" | "sum" | "max"
) -> xr.Dataset:
    """Reduce spatial resolution by block aggregation."""
    # boundary="trim" drops trailing cells that don't fill a complete block.
    blocks = ds.coarsen(lat=factor, lon=factor, boundary="trim")
    reducer = getattr(blocks, method)
    return reducer()
# ── 3. Masking and filling ────────────────────────────────────────────────────
def mask_invalid(da: xr.DataArray, valid_range: tuple[float, float]) -> xr.DataArray:
"""Set values outside [vmin, vmax] to NaN."""
vmin, vmax = valid_range
return da.where((da >= vmin) & (da <= vmax))
def land_sea_mask(
ds: xr.Dataset,
mask_da: xr.DataArray, # 1=land, 0=ocean
keep: str = "land",
) -> xr.Dataset:
"""Mask to land or ocean grid cells."""
if keep == "land":
return ds.where(mask_da == 1)
return ds.where(mask_da == 0)
def fill_gaps(
da: xr.DataArray,
method: str = "interpolate", # "interpolate" | "constant" | "ffill"
fill_value: float = 0.0,
limit: int = None,
) -> xr.DataArray:
"""Fill missing values along the time dimension."""
if method == "interpolate":
return da.interpolate_na(dim="time", limit=limit)
elif method == "ffill":
return da.ffill(dim="time", limit=limit)
return da.fillna(fill_value)
# ── 4. Transformations ────────────────────────────────────────────────────────
def convert_units(
da: xr.DataArray,
from_unit: str,
to_unit: str,
) -> xr.DataArray:
"""
Common unit conversions for climate variables.
Returns a new DataArray with updated attrs.
"""
conversions = {
("K", "C"): lambda x: x - 273.15,
("C", "K"): lambda x: x + 273.15,
("K", "F"): lambda x: (x - 273.15) * 9/5 + 32,
("m/s", "km/h"): lambda x: x * 3.6,
("m/s", "knots"): lambda x: x * 1.944,
("Pa", "hPa"): lambda x: x / 100.0,
("mm/s", "mm/day"): lambda x: x * 86400,
}
key = (from_unit, to_unit)
if key not in conversions:
raise ValueError(f"No conversion defined for {from_unit} → {to_unit}")
result = conversions[key](da)
result.attrs = {**da.attrs, "units": to_unit}
return result
def interpolate_to_grid(
    ds: xr.Dataset,
    new_lat: np.ndarray,
    new_lon: np.ndarray,
    method: str = "linear",  # "linear" | "nearest" | "cubic"
) -> xr.Dataset:
    """Regrid to a new lat/lon grid by interpolation."""
    targets = {"lat": new_lat, "lon": new_lon}
    return ds.interp(method=method, **targets)
def apply_func_vectorized(
    da: xr.DataArray,
    fn,  # numpy ufunc or vectorized function
    output_dtype=float,
    dask: str = "parallelized",
) -> xr.DataArray:
    """
    Apply a numpy function while preserving xarray coordinates.

    dask='parallelized' keeps Dask-backed arrays lazy.
    """
    ufunc_opts = {"dask": dask, "output_dtypes": [output_dtype]}
    return xr.apply_ufunc(fn, da, **ufunc_opts)
# ── 5. Combining datasets ─────────────────────────────────────────────────────
def concat_along_time(datasets: list[xr.Dataset]) -> xr.Dataset:
    """Concatenate a list of Datasets along the time dimension."""
    # Inputs must agree on all non-time dims for concatenation to align.
    combined = xr.concat(datasets, dim="time")
    return combined
def merge_variables(datasets: list[xr.Dataset]) -> xr.Dataset:
    """Merge Datasets with different variables into one."""
    # compat="no_conflicts" permits overlapping variables only when their
    # overlapping values agree.
    merged = xr.merge(datasets, compat="no_conflicts")
    return merged
def stack_ensemble(
    members: list[xr.Dataset],
    member_ids: list[str],
    dim_name: str = "member",
) -> xr.Dataset:
    """Stack multiple model runs along a new ensemble dimension.

    FIX: the coordinate DataArray is now given an explicit ``name`` —
    ``xr.concat`` requires a *named* DataArray (or Variable) passed as
    ``dim`` to attach the new coordinate values to the result.
    """
    coord = xr.DataArray(member_ids, dims=[dim_name], name=dim_name)
    return xr.concat(members, dim=coord)
# ── 6. File I/O ───────────────────────────────────────────────────────────────
def save_netcdf(
    ds: xr.Dataset,
    path: str,
    compress: bool = True,
    complevel: int = 4,
    float_vars: list[str] | None = None,
) -> None:
    """
    Write Dataset to NetCDF4 with optional compression.

    Parameters
    ----------
    ds : Dataset to write.
    path : destination file path.
    compress : enable zlib deflate + byte shuffle per variable.
    complevel : deflate level (1-9); 4 trades size vs. speed reasonably.
    float_vars : explicit variables to encode; defaults to all
        float32/float64 data variables. (Fixed the implicit-Optional
        annotation ``list[str] = None``.)

    Notes
    -----
    float32 variables are packed to int16 with scale_factor=0.01, which can
    only represent values roughly within ±327.67 — fine for the demo fields
    here, but verify the range before using on other data.
    """
    encoding = {}
    variables = float_vars or [
        v for v in ds.data_vars if ds[v].dtype in [np.float32, np.float64]
    ]
    for var in variables:
        enc: dict = {}
        if compress:
            enc.update({"zlib": True, "complevel": complevel, "shuffle": True})
        # Efficient integer packing for float32 fields (see Notes for range).
        if ds[var].dtype == np.float32:
            enc.update({"dtype": "int16", "scale_factor": 0.01, "_FillValue": -9999})
        encoding[var] = enc
    # Encode time as CF standard
    if "time" in ds.coords:
        encoding["time"] = {"units": "days since 1900-01-01", "calendar": "standard"}
    ds.to_netcdf(path, encoding=encoding, format="NETCDF4")
    print(f"Saved {path} ({Path(path).stat().st_size / 1e6:.1f} MB)")
def save_zarr(
    ds: xr.Dataset,
    store: str,
    chunks: dict | None = None,
    consolidated: bool = True,
) -> None:
    """Write Dataset to Zarr store with optional rechunking.

    Parameters
    ----------
    chunks : optional Dask chunk spec applied before writing. (Fixed the
        implicit-Optional annotation ``dict = None``.)
    consolidated : write consolidated metadata, which speeds up opening
        stores with many variables/chunks.

    Notes
    -----
    mode="w" overwrites any existing store at ``store``.
    """
    if chunks:
        ds = ds.chunk(chunks)
    ds.to_zarr(store, mode="w", consolidated=consolidated)
    print(f"Saved Zarr store: {store}")
def open_multi_file(
    pattern: str,
    variable: str | None = None,
    chunks: dict | None = None,
    combine: str = "by_coords",  # "by_coords" | "nested"
) -> xr.Dataset:
    """
    Open multiple NetCDF files as a single lazy Dataset (Dask-backed).

    Pattern: "data/cmip6/tas_*.nc"

    Parameters
    ----------
    variable : if given, each file is pre-filtered to just that variable
        before combining. (Fixed the implicit-Optional annotation.)
    chunks : Dask chunk spec; defaults to {"time": 100}.
    combine : how files are stitched together ("by_coords" infers order
        from coordinate values).
    """
    ds = xr.open_mfdataset(
        pattern,
        combine=combine,
        chunks=chunks or {"time": 100},
        parallel=True,
        # Drop everything but the requested variable per file — cheaper
        # than subsetting after the combine.
        preprocess=lambda d: d[[variable]] if variable else d,
    )
    return ds
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # End-to-end demo wiring together the helpers defined above.
    print("xarray N-D Labeled Array Demo")
    print("=" * 50)
    # Create synthetic dataset (smaller grid than the defaults for speed).
    ds = make_temperature_dataset(n_lat=18, n_lon=36, n_times=365)
    # NOTE(review): dict(ds.dims) triggers a FutureWarning on recent xarray
    # (mapping behavior of Dataset.dims is deprecated in favor of ds.sizes)
    # — confirm against the pinned xarray version.
    print(f"\nDataset: {dict(ds.dims)}")
    print(f"Variables: {list(ds.data_vars)}")
    print(f"Coords: {list(ds.coords)}")
    temp = ds["temperature"]
    print(f"\ntemp shape: {temp.shape}, dtype: {temp.dtype}")
    print(f"temp attrs: {temp.attrs}")
    # Indexing: nearest-neighbor point extraction and bounding-box subset.
    print("\nIndexing:")
    point = nearest_point(ds, lat=45, lon=0)
    print(f" Nearest point (45N, 0E) — time series length: {len(point.time)}")
    region = extract_region(ds, lat_range=(20, 70), lon_range=(-60, 60))
    print(f" Region subset shape: {dict(region.dims)}")
    # Aggregations: seasonal / monthly climatology and global spatial mean.
    print("\nAggregations:")
    seasonal = seasonal_means(ds, "temperature")
    print(f" Seasonal means seasons: {seasonal.season.values}")
    monthly = monthly_climatology(ds, "temperature")
    print(f" Monthly climatology shape: {monthly.shape}")
    # Cos-lat weighted mean of the time-mean temperature field.
    glob_mean = spatial_mean(temp.mean("time"), lat_weights=True)
    print(f" Global mean temperature: {float(glob_mean):.2f} K")
    # Unit conversion: Kelvin to Celsius on the time-mean field.
    print("\nUnit conversion:")
    temp_C = convert_units(ds["temperature"].mean("time"), "K", "C")
    print(f" Mean temp in C range: {float(temp_C.min()):.1f} to {float(temp_C.max()):.1f}")
    # Interpolation of a single time slice to a denser lat/lon grid.
    print("\nInterpolation:")
    fine_lat = np.linspace(-87.5, 87.5, 36)
    fine_lon = np.linspace(-177.5, 177.5, 72)
    ds_fine = interpolate_to_grid(ds.isel(time=0), fine_lat, fine_lon)
    print(f" Regridded shape: {dict(ds_fine.dims)}")
    print("\nDone — use xr.open_dataset('file.nc') for real NetCDF files.")
For the NumPy + Pandas alternative — NumPy arrays lose coordinate labels immediately after slicing while xarray’s da.sel(lat=45, method="nearest") returns a DataArray with all coordinates intact, groupby("time.month").mean() computes monthly climatologies without calendar arithmetic, and xr.open_mfdataset("data/*.nc", parallel=True, chunks={"time":100}) lazily opens hundreds of NetCDF files as a single Dask-backed Dataset that never needs to fit in memory all at once, because data is only read on .compute(). For the Pandas multi-index alternative for multi-dimensional data — Pandas MultiIndex requires storing N-D arrays as long-format tables with repeated coordinate values while xarray’s Dataset stores a 4D field as a single (time, level, lat, lon) array, apply_ufunc broadcasts numpy ufuncs across all coordinates without reshaping, and interp_like(other_ds) regrids to another Dataset’s grid in one call, replacing nested scipy.interpolate.interp2d loops. The Claude Skills 360 bundle includes xarray skill sets covering DataArray and Dataset creation with dims/coords/attrs, sel/isel/where label indexing, groupby seasonal and monthly climatology, resample/rolling/coarsen aggregation, apply_ufunc for numpy vectorization, open_mfdataset Dask lazy loading, to_netcdf with CF encoding, to_zarr chunked storage, concat/merge/stack ensemble combination, and spatial area-weighted mean. Start with the free tier to try N-D labeled array code generation.