Created
June 27, 2021 14:48
-
-
Save anna-geller/0600e287de9a239a5ca71fe31cb9eddd to your computer and use it in GitHub Desktop.
Script to run data tests locally
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import pandas as pd | |
from src.timeseries_data_generator import TimeseriesGenerator | |
from src.timeseries_data_test_runner import TimeseriesDataTestRunner | |
# Passed as ``s3_path`` to ``TimeseriesDataTestRunner`` by ``_run_data_tests``.
DATASET = "local_test_without_s3"

# Start/end dates handed to ``TimeseriesGenerator`` by most scenarios below.
DEFAULT_START_DATE = "2021-07-01"
DEFAULT_END_DATE = "2021-07-31 23:59"

# Earlier end date used by ``test_incomplete_data`` (July 25 instead of July 31).
SKEWED_END_DATE = "2021-07-25 23:59"
def _run_data_tests(df: pd.DataFrame) -> None:
    """Run the data tests on ``df`` through a ``TimeseriesDataTestRunner``.

    The runner is built with ``s3_path=DATASET``; nothing is returned.
    """
    runner = TimeseriesDataTestRunner(s3_path=DATASET)
    runner.run_data_tests(df)
def test_happy_path() -> None:
    """Run the data tests on an unmodified series over the default dates."""
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    _run_data_tests(generator.get_timeseries())
def test_incorrect_order_of_columns() -> None:
    """Run the data tests on a frame ordered value-then-timestamp.

    That is the opposite of the column order expected in a time series.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    swapped_columns = ["value", "timestamp"]
    _run_data_tests(series[swapped_columns])
def test_incomplete_data() -> None:
    """Run the data tests on a series that does not cover all of July.

    The series is generated up to ``SKEWED_END_DATE`` instead of
    ``DEFAULT_END_DATE``.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=SKEWED_END_DATE,
    )
    _run_data_tests(generator.get_timeseries())
def test_missing_nr_values() -> None:
    """Run the data tests after setting ``value`` at row label 0 to ``None``."""
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    series.at[0, "value"] = None
    _run_data_tests(series)
def test_missing_timestamp() -> None:
    """Run the data tests after setting ``timestamp`` at row label 0 to ``None``."""
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    series.at[0, "timestamp"] = None
    _run_data_tests(series)
def test_incorrect_data_type_nr_column() -> None:
    """Run the data tests after casting the ``value`` column to ``float``.

    NOTE(review): presumably the runner expects a different numeric dtype
    for this column -- confirm against ``TimeseriesDataTestRunner``.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    values_as_float = series["value"].astype(float)
    series["value"] = values_as_float
    _run_data_tests(series)
def test_incorrect_data_type_dt_column() -> None:
    """Run the data tests after casting the ``timestamp`` column to ``str``.

    NOTE(review): presumably the runner expects a datetime dtype for this
    column -- confirm against ``TimeseriesDataTestRunner``.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
    )
    series = generator.get_timeseries()
    timestamps_as_text = series["timestamp"].astype(str)
    series["timestamp"] = timestamps_as_text
    _run_data_tests(series)
def test_incorrect_value_range() -> None:
    """Run the data tests on a series generated with explicit value bounds.

    The generator is given ``min_value=0`` and ``max_value=120``.
    NOTE(review): presumably this exceeds the range the runner accepts --
    confirm against ``TimeseriesDataTestRunner``.
    """
    generator = TimeseriesGenerator(
        start_date=DEFAULT_START_DATE,
        end_date=DEFAULT_END_DATE,
        min_value=0,
        max_value=120,
    )
    series = generator.get_timeseries()
    _run_data_tests(series)
if __name__ == "__main__":
    # Configure logging at INFO level with a level/name/time prefix.
    logging.basicConfig(
        level="INFO",
        format="[%(levelname)s] [%(name)s] [%(asctime)s]: %(message)s",
    )

    # Scenarios run one after another in this order; an exception raised by
    # one of them prevents the remaining ones from running.
    scenarios = (
        test_happy_path,
        test_incorrect_order_of_columns,
        test_incomplete_data,
        test_missing_nr_values,
        test_missing_timestamp,
        test_incorrect_data_type_nr_column,
        test_incorrect_data_type_dt_column,
        test_incorrect_value_range,
    )
    for scenario in scenarios:
        scenario()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment