Skip to content

Instantly share code, notes, and snippets.

@anna-geller
Created June 27, 2021 14:48
Show Gist options
  • Save anna-geller/0600e287de9a239a5ca71fe31cb9eddd to your computer and use it in GitHub Desktop.
Save anna-geller/0600e287de9a239a5ca71fe31cb9eddd to your computer and use it in GitHub Desktop.
Script to run data tests locally
import logging
import pandas as pd
from src.timeseries_data_generator import TimeseriesGenerator
from src.timeseries_data_test_runner import TimeseriesDataTestRunner
# Passed as ``s3_path`` to TimeseriesDataTestRunner in _run_data_tests.
DATASET = "local_test_without_s3"
# Default start/end bounds handed to TimeseriesGenerator by the scenarios below.
DEFAULT_START_DATE = "2021-07-01"
DEFAULT_END_DATE = "2021-07-31 23:59"
# Earlier end bound used by test_incomplete_data so the series stops before the end of July.
SKEWED_END_DATE = "2021-07-25 23:59"
def _run_data_tests(df: pd.DataFrame) -> None:
    """Run the time series data tests against ``df``.

    A ``TimeseriesDataTestRunner`` is created with ``s3_path=DATASET`` and
    its ``run_data_tests`` method is invoked on the given frame.

    Args:
        df: The data frame to validate.
    """
    runner = TimeseriesDataTestRunner(s3_path=DATASET)
    runner.run_data_tests(df)
def test_happy_path() -> None:
    """Run the data tests on an unmodified series over the default date range.

    NOTE(review): presumably the baseline scenario that is expected to pass
    every check -- confirm against the test runner.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    _run_data_tests(series)
def test_incorrect_order_of_columns() -> None:
    """Run the data tests on a frame with columns ordered value, timestamp.

    This is the reverse of the column order expected for a time series.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    swapped_columns = ["value", "timestamp"]
    reordered = series[swapped_columns]
    _run_data_tests(reordered)
def test_incomplete_data() -> None:
    """Run the data tests on a series that does not cover all of July.

    The series is generated up to ``SKEWED_END_DATE`` instead of
    ``DEFAULT_END_DATE``.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=SKEWED_END_DATE,
    )
    truncated = generator.get_timeseries()
    _run_data_tests(truncated)
def test_missing_nr_values() -> None:
    """Run the data tests on a series whose "value" cell at label 0 is None."""
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    # Blank out a single numeric cell to simulate a missing measurement.
    series.at[0, "value"] = None
    _run_data_tests(series)
def test_missing_timestamp() -> None:
    """Run the data tests on a series whose "timestamp" cell at label 0 is None."""
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    # Blank out a single timestamp cell to simulate a missing time point.
    series.at[0, "timestamp"] = None
    _run_data_tests(series)
def test_incorrect_data_type_nr_column() -> None:
    """Run the data tests on a series whose "value" column is cast to float.

    NOTE(review): presumably the runner expects a non-float dtype for this
    column -- confirm against the test runner.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    values_as_float = series["value"].astype(float)
    series["value"] = values_as_float
    _run_data_tests(series)
def test_incorrect_data_type_dt_column() -> None:
    """Run the data tests on a series whose "timestamp" column is cast to str."""
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    timestamps_as_text = series["timestamp"].astype(str)
    series["timestamp"] = timestamps_as_text
    _run_data_tests(series)
def test_incorrect_value_range() -> None:
    """Run the data tests on a series generated with min_value=0 and max_value=120.

    NOTE(review): presumably these bounds fall outside the range the runner
    accepts -- confirm against the test runner.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
        min_value=0,
        max_value=120,
    )
    out_of_range = generator.get_timeseries()
    _run_data_tests(out_of_range)
if __name__ == "__main__":
    # Configure logging before any scenario runs; the level is set to INFO.
    logging.basicConfig(
        level="INFO",
        format="[%(levelname)s] [%(name)s] [%(asctime)s]: %(message)s",
    )
    # Scenarios are executed one after another, in the order listed here.
    scenarios = (
        test_happy_path,
        test_incorrect_order_of_columns,
        test_incomplete_data,
        test_missing_nr_values,
        test_missing_timestamp,
        test_incorrect_data_type_nr_column,
        test_incorrect_data_type_dt_column,
        test_incorrect_value_range,
    )
    for scenario in scenarios:
        scenario()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment