Skip to content

Instantly share code, notes, and snippets.

@kirel
Created May 6, 2016 08:49
Show Gist options
  • Save kirel/0d7dcc6e2da354605162d62e6223ea65 to your computer and use it in GitHub Desktop.
Save kirel/0d7dcc6e2da354605162d62e6223ea65 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import re
from ftplib import FTP
import zipfile
import StringIO
# Host that weather_for_stations() connects to and logs in to without
# credentials.
FTP_SERVER = "ftp-cdc.dwd.de"

# Server-side directories whose file listings are searched for station archives.
PATH_HISTORICAL = "pub/CDC/observations_germany/climate/daily/kl/historical"
PATH_RECENT = "pub/CDC/observations_germany/climate/daily/kl/recent"

# Regex templates; the single %s is filled with the station id zero-padded to
# five characters. The first two select zip archives from the directory
# listings, the third selects the data file inside an archive.
RE_TEMPLATE_HISTORICAL = r"tageswerte_%s_.*_hist\.zip$"
RE_TEMPLATE_RECENT = r"tageswerte_KL_%s_akt\.zip$"
RE_TEMPLATE_CSV = r"produkt_klima_Tageswerte_.*_%s\.txt"
def weather_for_stations(station_ids):
    """Download and combine the daily "kl" climate records of several stations.

    For every station id the historical and recent directory listings on
    ``FTP_SERVER`` are searched for matching zip archives. Each archive is
    downloaded into memory, the data file inside it is parsed as a
    semicolon-separated table, and all tables are concatenated.

    Args:
        station_ids: Iterable of station identifiers. Each one is converted
            with ``str()`` and left-padded with zeros to five characters
            before it is inserted into the file-name patterns.

    Returns:
        A ``pandas.DataFrame`` with whitespace-stripped column names, the
        ``eor`` column removed, every ``-999`` replaced by ``NaN`` and
        ``MESS_DATUM`` parsed as ``%Y%m%d`` dates. ``MESS_DATUM`` is used as
        the index and is also kept as a regular column.

    Raises:
        ValueError: If no archive was found for any of the requested
            stations, or if a downloaded archive contains no member matching
            the expected data-file name.
    """
    # io.BytesIO exists on Python 2.6+ and Python 3 and is the right buffer
    # type for binary zip data; the Python-2-only StringIO module is not.
    # Imported locally so the block does not depend on a new file-level import.
    from io import BytesIO

    frames = []
    ftp = FTP(FTP_SERVER)
    try:
        ftp.login()
        historical_files = ftp.nlst(PATH_HISTORICAL)
        recent_files = ftp.nlst(PATH_RECENT)

        for station_id in station_ids:
            padded_id = str(station_id).zfill(5)
            re_historical = re.compile(RE_TEMPLATE_HISTORICAL % padded_id)
            re_recent = re.compile(RE_TEMPLATE_RECENT % padded_id)
            re_csv = re.compile(RE_TEMPLATE_CSV % padded_id)

            # Historical archives first, then recent ones, so the row order
            # of the result matches the order of the listings.
            paths = [p for p in historical_files if re_historical.search(p)]
            paths += [p for p in recent_files if re_recent.search(p)]

            for path in paths:
                zip_buffer = BytesIO()
                ftp.retrbinary('RETR %s' % path, zip_buffer.write)

                with zipfile.ZipFile(zip_buffer) as archive:
                    csv_name = next(
                        (name for name in archive.namelist()
                         if re_csv.match(name)),
                        None,
                    )
                    if csv_name is None:
                        # A bare next() would leak StopIteration, which can
                        # silently end an enclosing generator in the caller.
                        raise ValueError(
                            'no data file for station %s in %s'
                            % (padded_id, path)
                        )
                    csv_bytes = archive.read(csv_name)

                # skipfooter is only supported by the python engine; naming
                # it avoids pandas' ParserWarning about the implicit fallback.
                # NOTE(review): the skipped last line is presumably a
                # non-data trailer in the source files - confirm.
                frame = pd.read_csv(
                    BytesIO(csv_bytes), sep=';', skipfooter=1, engine='python'
                )
                # Strip header padding per frame so that differently padded
                # headers still line up as the same columns in the concat.
                frame.columns = frame.columns.str.strip()
                frames.append(frame)
    finally:
        # Always release the control connection, even when a download or
        # parse step fails.
        ftp.close()

    if not frames:
        raise ValueError('no weather archives found for the given stations')

    df = pd.concat(frames)
    # NOTE(review): 'eor' is presumably an end-of-record marker column with
    # no data content - confirm against the data-set description.
    del df['eor']
    # -999 is treated as the missing-value sentinel.
    df = df.replace(-999, np.nan)
    df['MESS_DATUM'] = pd.to_datetime(df['MESS_DATUM'], format='%Y%m%d')
    df.index = df['MESS_DATUM']
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment