
Draft and test the extract_external methods

Draft extract_external

Success

Add required dependencies to pyproject.toml

Add the required dependencies for the Open-Meteo API to the project.optional-dependencies.runner section of pyproject.toml. Since openmeteo-requests and retry-requests are not available on conda-forge, the tool.pyproject2conda.dependencies section of pyproject.toml must also be updated so that pyproject2conda builds conda environment yaml files that install these packages with pip instead of directly with conda.

pyproject.toml
#:schema https://json.schemastore.org/pyproject.json
# https://github.com/tamasfe/taplo/issues/620#issuecomment-2625975106

[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "able-workflow-weather-example"
# Dynamic versioning is not supported by Snakemake modules
version = "0.1.0-rc20250624-1200"
description = "A snakemake workflow with an associated python package containing modules for datasets, features, and models for extracting, transforming, and loading weather data."
authors = [
    { name = "Michael Kane", email = "mi.kane@northeastern.edu" }
]
license-files = ["LICENSE"]
readme = "README.md"
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
]
requires-python = ">=3.11,<3.13"

# Minimal runtime dependencies for package to read and validate (schemas)
# data produced by previously executed workflows using this same package
dependencies = [
  "loguru>=0.7.2",
  "numpy>=2.2.5",
  "pandas>=2.2.3",
  "pandera>=0.24",
  "pyarrow>=20.0.0",
  "pydantic>=2.11.4",
  "python-dotenv>=1.1.0",
  "tqdm>=4.67.1",
  "typer>=0.15.3",
]

[project.optional-dependencies]

# These extras are required to run the python package within
# a snakemake workflow. In order to build the
# documentation for the package, all of the optional-dependencies
# must be able to be installed simultaneously.
runner = [
  "snakemake>=9.3.3",
  # open-meteo API dependencies
  "openmeteo-requests",
  "requests-cache",
  "retry-requests",
  # Add additional dependencies for new modules and ETL process here.
]

[dependency-groups]
# Optional dependencies for development, CI, and Jupyter workflows
# that are not inherently part of the package.

# Minimal snakemake workflow dependencies
# Replaces `workflow/envs/global.yaml` since `global.yaml`
# cannot be cached before a workflow goes offline.
snakemake = [
  "snakemake>=9.3.3",
]
workflow = [
  { include-group = "snakemake" },
  "json5>=0.12.0",
  "loguru>=0.7.2",
  "graphviz>=0.20.3",
  "pandas>=2.2.3",
  "pyproject2conda>=0.19.1",
  "ruamel.yaml>=0.18.6",
]

# Jupyter / data-viz workflows
notebook = [
  "ipython>=9.2.0",
  "itables>=2.3.0",
  "jupyterlab>=4.4.2",
  "matplotlib>=3.10.1",
  "nbconvert>=7.16.6",
  "notebook>=7.4.2",    # needed by papermill
  "papermill>=2.6.0",
  "seaborn>=0.13.2",
]

docs = [
  { include-group = "workflow" },
  "lxml>=5.4.0",
  "mike>=2.1.3",
  "mkdocs>=1.6.1",
  "mkdocs-include-markdown-plugin>=7.1.6",
  "mkdocs-inline-select-svg-plugin>=0.1.0",
  "mkdocs-jupyter>=0.25.1",
  "mkdocs-gen-files>=0.5.0",
  "mkdocs-git-revision-date-localized-plugin>=1.2.9",
  "mkdocs-literate-nav>=0.6.2",
  "mkdocs-macros-plugin>=1.3.7",
  "mkdocs-material>=9.6.14",
  "mkdocs-mermaid2-plugin>=1.2.1",
  "mkdocs-section-index>=0.3.10",
  "mkdocstrings-python>=1.16.10",
  "notebook>=7.4.2",
  "pymdown-extensions>=10.15",
]

lint = [
  "black>=24.3,<25.0", # Currently being held back by `snakefmt`
  "ruff>=0.11.8",
  "snakefmt>=0.11.0",

  # Add jupyter extras to black
  # See https://github.com/psf/black/blob/main/pyproject.toml
  "ipython>=7.8.0",
  "tokenize-rt>=3.2.0",
]

test = [
  "pytest>=8.3.5",
  "pytest-cov>=6.1.1",
  "pytest-sugar>=1.0.0",
  "pytest-order>=1.3.0",
  "pytest-remotedata>=0.4.1",
]

tox = [
  "tox>=4.25.0",
]

typecheck = [
  "mypy>=1.15.0",
  # stubs
  "loguru>=0.7.2",
  "pandas>=2.2.3",
  "pandas-stubs>=2.2.3",
  "pandera>=0.24",
  "pydantic>=2.11.4",
  "types-tqdm>=4.67.0",
  "types-openpyxl>=3.1.5",
  "typer>=0.15.3",
]

# Adding and updating copier template
copier = [
  "copier>=9.7.1",
  "loguru>=0.7.2",
  "ruamel.yaml>=0.18.12",
  "typer>=0.16.0",

  # jinja2 extensions
  "cookiecutter>=2.6.0",
  "copier-templates-extensions>=0.3.1",
  "jinja2-jsonschema>=0.3.0",
]

# development / CI
dev = [
  "pip>=25.1.1",
  "pre-commit>=4.2.0",
  { include-group = "copier" },
  { include-group = "docs" },
  { include-group = "lint" },
  { include-group = "notebook" },
  { include-group = "test" },
  { include-group = "tox" },
  { include-group = "typecheck" },
  { include-group = "workflow" },
]

[project.urls]
Repository = "https://github.com/NEU-ABLE-LAB/able-workflow-weather-example"
Documentation = "https://github.com/NEU-ABLE-LAB/able-workflow-weather-example"
Homepage = "https://github.com/NEU-ABLE-LAB/able-workflow-weather-example"

[tool.coverage.run]
parallel = true
# TODO-copier-package Add docs and hooks python
source = [
    "able_weather",
    "workflow/scripts",
]
omit   = [
    "tests/*",                       # skip everything in tests/
    "**/conftest.py",                # skip pytest helpers
    "**/__init__.py",                # often trivial
]

[tool.coverage.report]
exclude_lines = [
    "pragma: no cover",
    "if TYPE_CHECKING:",
    "if __name__ == \"__main__\":",
]

[tool.mypy]
python_version = "3.11"
plugins = [
  # https://pandera.readthedocs.io/en/stable/mypy_integration.html
  "pandera.mypy",
  # https://docs.pydantic.dev/1.10/mypy_plugin/
  "pydantic.mypy",
]
strict = true
exclude = '(^|/)tests/.*\.py$'
files = ["able_weather"]
ignore_missing_imports = true

[[tool.mypy.overrides]]
# Snakemake does not provide type hints for its modules,
# so we need to ignore them
module = "snakemake.*"
follow_imports = "skip"

[[tool.mypy.overrides]]
module = "tests.*"
ignore_errors = true

[tool.pydantic-mypy]
# Mypy will error if any Pydantic model field is not annotated with a type.
# This ensures you don't accidentally leave a field without a type, which
# would be treated as `Any`.
warn_untyped_fields = true
# With this, the plugin makes the generated `__init__` for Pydantic models
# require exact types (no coercion allowed in type checking). For example, if
# a field is `age: int` and you call `User(age="123")`, Mypy will error
# because `"123"` is a str (even though Pydantic might coerce it at runtime).
# This is useful to enforce correct types at compile time.
init_typed          = true
# Ensures that if you pass an unexpected field to a model's constructor,
# Mypy will flag it (even if Pydantic might allow it due to `extra` settings).
# This aligns with using `extra="forbid"` at runtime.
init_forbid_extra   = true

[tool.pyproject2conda]
channels = [
    "conda-forge",
]
template-python = "workflow/envs/pyproject2conda/able_weather-py{py}-{env}"
template = "workflow/envs/pyproject2conda/able_weather-{env}"

[tool.pyproject2conda.dependencies]
copier-templates-extensions = { pip = true }
jinja2-jsonschema = { pip = true }
# Install GraphViz to visualize the Snakemake DAG
graphviz = { channel = "conda-forge", packages = ["graphviz>=12.2.1"]}
mkdocs-inline-select-svg-plugin = { pip = true }
mkdocs-mermaid2-plugin = { pip = true }
mkdocs-section-index = { pip = true }
mkdocstrings = { skip = true, packages = ["mkdocstrings-python>=1.16.10"] }
openmeteo-requests = { pip = true }
retry-requests = { pip = true }
snakemake = { channel = "bioconda"}
snakefmt = { channel = "bioconda"}

# --- PyProject2Conda environments ---
# When adding a new environment, add it to
# `config/config.yaml` `CONDA.ENVS` list too.

[tool.pyproject2conda.envs."core"]
style = ["yaml"]
header = true
python = "highest"

[tool.pyproject2conda.envs."core-runner"]
style = ["yaml"]
header = true
python = "highest"
extras-or-groups = ["runner"]

[tool.pyproject2conda.envs."dev"]
style = ["yaml"]
header = true
python = "highest"
extras-or-groups = ["dev"]

[tool.pyproject2conda.envs."dev-runner"]
style = ["yaml"]
header = true
python = "highest"
extras-or-groups = ["runner","dev"]

[tool.pyproject2conda.envs."docs"]
style = ["yaml"]
header = true
python = "highest"
extras-or-groups = ["runner","docs"]

[tool.pyproject2conda.envs."notebook"]
style = ["yaml"]
header = true
python = "highest"
extras-or-groups = ["notebook"]

[tool.pyproject2conda.envs."tox"]
style = ["yaml"]
header = true
python = "highest"
skip-package = true
extras-or-groups = ["tox"]
template-python = "workflow/envs/able_weather-py{py}-{env}"
template = "workflow/envs/able_weather-{env}"

[tool.pyproject2conda.envs."workflow"]
style = ["yaml"]
header = true
python = "highest"
skip-package = true
extras-or-groups = ["workflow"]
template-python = "workflow/envs/able_weather-py{py}-{env}"
template = "workflow/envs/able_weather-{env}"

[tool.pytest.ini_options]
markers = [
    # `remote_data` marker is defined by `pytest-remotedata`
]

[tool.ruff]
line-length = 79
src = ["able_weather", "workflow/scripts"]
include = [
    "pyproject.toml",
    "able_weather/**/*.py",
    "tests/**/*.py",
    "workflow/scripts/**/*.py",
]

[tool.ruff.lint]
extend-select = [
    "I",  # Import sorting
    "T201",  # Do not allow `print()` statements. Encourage `loguru` instead.
]

[tool.ruff.lint.flake8-tidy-imports.banned-api]
"logging".msg  = "Use Loguru instead."
"warnings".msg = "Use Loguru instead."

[tool.ruff.lint.isort]
known-first-party = ["able_weather"]

[tool.setuptools.packages.find]
include = ["able_weather*"]

[tool.snakefmt]
line_length = 79
include = '\.smk$|^Snakefile$'

This example adapts the sample client code generated by the Open-Meteo API documentation.

extract_external.py
"""
Extract data from the Open-Meteo API and return it as a Pandas DataFrame.
"""

import openmeteo_requests
import pandas as pd
import requests_cache
from loguru import logger
from retry_requests import retry


def extract_open_meteo_data(
    latitude: float,
    longitude: float,
    start_date: str,
    end_date: str,
) -> pd.DataFrame:
    """
    Extract weather data from the Open-Meteo API for a given
    latitude, longitude, and date range.

    Args:
        latitude (float): Latitude in decimal degrees.
        longitude (float): Longitude in decimal degrees.
        start_date (str): Start date in YYYY-MM-DD format.
        end_date (str): End date in YYYY-MM-DD format.

    Returns:
        pd.DataFrame: DataFrame containing the weather data.
    """

    # Define the hourly weather variables to extract
    hourly_variables = [
        "temperature_2m",
        "relative_humidity_2m",
        "wind_speed_10m",
        "cloud_cover",
        "snowfall",
        "snow_depth",
        "rain",
        "apparent_temperature",
        "dew_point_2m",
        "precipitation",
    ]

    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)

    # Make sure all required weather variables are listed here
    # The order of variables in hourly or daily is important
    # to assign them correctly below.
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": hourly_variables,
    }
    responses = openmeteo.weather_api(url, params=params)

    # Process first location. Add a for-loop for multiple locations or weather models
    response = responses[0]
    logger.debug(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
    logger.debug(f"Elevation {response.Elevation()} m asl")
    logger.debug(f"Timezone {response.Timezone()}{response.TimezoneAbbreviation()}")
    logger.debug(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

    # Process hourly data. The order of variables needs to be the same as requested.
    hourly = response.Hourly()
    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
    hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()
    hourly_wind_speed_10m = hourly.Variables(2).ValuesAsNumpy()
    hourly_cloud_cover = hourly.Variables(3).ValuesAsNumpy()
    hourly_snowfall = hourly.Variables(4).ValuesAsNumpy()
    hourly_snow_depth = hourly.Variables(5).ValuesAsNumpy()
    hourly_rain = hourly.Variables(6).ValuesAsNumpy()
    hourly_apparent_temperature = hourly.Variables(7).ValuesAsNumpy()
    hourly_dew_point_2m = hourly.Variables(8).ValuesAsNumpy()
    hourly_precipitation = hourly.Variables(9).ValuesAsNumpy()

    hourly_data = {
        "date": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left",
        )
    }

    hourly_data["temperature_2m"] = hourly_temperature_2m
    hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m
    hourly_data["wind_speed_10m"] = hourly_wind_speed_10m
    hourly_data["cloud_cover"] = hourly_cloud_cover
    hourly_data["snowfall"] = hourly_snowfall
    hourly_data["snow_depth"] = hourly_snow_depth
    hourly_data["rain"] = hourly_rain
    hourly_data["apparent_temperature"] = hourly_apparent_temperature
    hourly_data["dew_point_2m"] = hourly_dew_point_2m
    hourly_data["precipitation"] = hourly_precipitation

    hourly_dataframe = pd.DataFrame(data=hourly_data)

    return hourly_dataframe
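
For a quick sanity check outside the Snakemake workflow, the function can also be called directly from a Python session. The sketch below is only illustrative: the import path mirrors the one used by the unit test later in this section, and the coordinates and dates are example values.

from able_weather.datasets.weather.open_meteo.runner import extract_external

# One day of hourly data for a single, illustrative location
df = extract_external.extract_open_meteo_data(
    latitude=42.3346515,
    longitude=-71.086777,
    start_date="2023-01-01",
    end_date="2023-01-02",
)
df.head()  # inspect the first few rows interactively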

Check lint and typecheck extract_external

tox run-parallel --quiet -f py312 lint -f py312 typecheck

Test extract_external

Success

Write a minimal unit test for extract_external. This unit test should consume a small but real test dataset from one of the following sources:

  • a small test dataset committed to the datasets/tests/ directory of this git repo (see the sketch after this list),
  • a remote data source (e.g., a web API) queried with a relatively small request and marked with the remote_data pytest marker, or
  • if you have not yet pared down the data on disk into a test dataset small enough to commit to the git repo (<2 MB), use the remote_data marker and have the test consume data from disk that is not committed to the git repo. Ideally, these tests should still read <100 MB of data and be able to run on a laptop.
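
If you choose the first option, the test does not call the API at all; it only validates a small, previously extracted sample committed to the repo. A minimal sketch, assuming a hypothetical parquet file at datasets/tests/open_meteo_sample.parquet (this file is not part of this example):

import pandas as pd


def test_extract_external_committed_sample() -> None:
    """Validate a small committed sample of previously extracted data."""
    # Hypothetical path; commit a <2 MB sample here if you choose this option.
    df = pd.read_parquet("datasets/tests/open_meteo_sample.parquet")

    assert not df.empty
    assert "date" in df.columns
    assert "temperature_2m" in df.columns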

Tip

Typically, a "unit test" should not require access to remote resources or the full dataset on local disk. However, the purpose of this test is to ensure that the real data is as expected.

If you ask an AI agent to write this unit test, it might try to monkeypatch the web API and feed synthetic data to the extract_external function. It will make assumptions about what the test data or remote data looks like rather than actually testing against the real data.

This unit test requests a single day of data for a single location from extract_external. Since it uses the Open-Meteo web API, the remote_data pytest marker is applied. (Note: the pytest-remotedata package is not typed, so mypy complains about the untyped decorator @pytest.mark.remote_data. To keep mypy happy, we tell it to ignore this type issue.)

@pytest.mark.remote_data  # type: ignore[misc]
def test_extract_external_remote_data() -> None:
test_extract_external.py
"""
Unit tests for `extract_open_meteo_data`.

The function under test calls the Open-Meteo REST API through the
`openmeteo_requests` client, wrapped in a cached / retrying `requests` session.
This test issues a small real request (one location, one day) and is marked
with `remote_data`, so it only runs when remote data access is enabled.
"""

from __future__ import annotations

import pandas as pd
import pytest

# Module under test
from able_weather.datasets.weather.open_meteo.runner import (
    extract_external,
)


@pytest.mark.remote_data  # type: ignore[misc]
def test_extract_external_remote_data() -> None:
    """
    Test the `extract_external` function with remote data.
    This test uses the `remote_data` marker because it accesses
    the Open-Meteo API and retrieves real weather data.
    """
    latitude = 42.3346515
    longitude = -71.086777
    start_date = "2023-01-01"
    end_date = "2023-01-02"

    df = extract_external.extract_open_meteo_data(
        latitude=latitude,
        longitude=longitude,
        start_date=start_date,
        end_date=end_date,
    )

    assert isinstance(df, pd.DataFrame)
    assert not df.empty
    assert "date" in df.columns
    assert "temperature_2m" in df.columns
    assert "relative_humidity_2m" in df.columns
    assert "wind_speed_10m" in df.columns
    assert "cloud_cover" in df.columns
    assert "snowfall" in df.columns
    assert "snow_depth" in df.columns
    assert "rain" in df.columns
    assert "apparent_temperature" in df.columns
    assert "dew_point_2m" in df.columns
    assert "precipitation" in df.columns

Run the test

The most reliable way to run the test is through tox. The following command runs all of the unit tests for the package modules that require the runner extras.

tox run -e py312-package-unit-runner

By default, tox will skip tests marked with remote_data. If your test uses the remote_data marker, you can tell tox to run it anyway by passing the option through to pytest:

tox run -e py312-package-unit-runner -- --remote-data=any
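
Alternatively, with the *-dev-runner conda environment activated, you can invoke pytest on the file directly; this is a sketch that assumes the test file lives under the repository's tests/ directory:

pytest tests/test_extract_external.py --remote-data=any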

Debug the test

If the tests fail and you need to debug, or you simply want to run this test file on its own, you can use the VSCode Python debugger. You must first have the full *-dev-runner conda environment installed, which you can create or update with the following command; then use the Python Environments extension to select it as the default environment.

snakemake conda_update

Open the file you want to test, test_extract_external.py, then open the VSCode Command Palette (++ctrl+shift+p++) and type the following, or click the caret symbol next to the play button in the top-right corner of the file and select the "Debug using launch.json" option.

Python Debugger: Debug using launch.json

Then select "PyTest Debugger: Current Test" and choose the option to run with remote data if needed.

You can then set breakpoints, watch variables, and use the DEBUG CONSOLE to help get your tests passing.

Commit and CI

Commit the changes, push to GitHub, and ensure all the continuous integration tests pass. NOTE: The CI tests will skip any tests marked with remote_data.