Source code for aftercovid.data.data_hopkins

"""
Loads data from :epkg:`CSSE Johns Hopkins`.
"""
import numpy
import pandas
from ..preprocess import (
    ts_normalise_negative_values, ts_moving_average,
    ts_remove_decreasing_values)


population = {
    'Belgium': 11.5e6,
    'France': 67e6,
    'Germany': 83e6,
    'Spain': 47e6,
    'Italy': 60e6,
    'UK': 67e6,
}


[docs]def download_hopkins_data(kind='deaths', country='France'): """ Downloads data from :epkg:`CSSE Johns Hopkins` for a particular country. :param kind: `'deaths'`, `'confirmed'` or `'recovered'` :param country: `'France'`, `'UK'`, ... :return: dataframe .. runpython:: :showcode: from aftercovid.data import download_hopkins_data df = download_hopkins_data() print(df.tail()) """ url = ( "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/" "master/csse_covid_19_data/" "csse_covid_19_time_series/time_series_covid19_%s_global.csv" % kind) df = pandas.read_csv(url) eur = df[df['Country/Region'].isin([country]) & df['Province/State'].isna()] tf = eur.T.iloc[4:] tf.columns = [kind] return tf
[docs]def extract_hopkins_data(kinds=('deaths', 'confirmed', 'recovered'), country='France', delay=21, raw=False): """ Downloads data from :epkg:`CSSE Johns Hopkins` and infers the number of current positive cases in a very simple way. :param kinds: series to extracts, by default `('deaths', 'confirmed', 'recovered')` :param country: `'France'`, `'UK'`, ... :param delay: the function assumes after 21 days, a confirmed case moves is not positive anymore :param raw: if True, returns the raw data as well :return: dataframe .. runpython:: :showcode: from aftercovid.data import extract_hopkins_data df = extract_hopkins_data() print(df.tail()) """ total = population[country] dfs = [] for k in kinds: df = download_hopkins_data(k, country) dfs.append(df) conc0 = pandas.concat(dfs, axis=1) for c in conc0: conc0[c] = ts_remove_decreasing_values(conc0[c].astype(numpy.int64)) conc = conc0.copy() infected = conc['confirmed'] - (conc['deaths'] + conc['recovered']) conf30 = infected[:-delay] recovered = conc['recovered'].values.copy() recovered[delay:] += conf30 delta_conf = conc['confirmed'].values[1:] - conc['confirmed'].values[:-1] infected = conc['confirmed'].values * 0 infected[:] = conc['confirmed'] - (conc['deaths'] + recovered) infected[1:] = numpy.maximum(1, numpy.maximum(infected[1:], delta_conf)) infected[20:] = numpy.maximum(10, infected[20:]) infected[60:] = numpy.maximum(100, infected[60:]) conc['recovered'] = recovered conc['infected'] = infected conc['safe'] = total - conc.drop('confirmed', axis=1).sum(axis=1) if raw: return conc, conc0 return conc
def preprocess_hopkins_data(df): """ Improves the differentiated series by removing negative values. :param df: dataframe returned by :func:`extract_hopkins_data <aftercovid.data.extract_hopkins_data>` :return: (smoothed differentiated series, preprocessed dataframe) """ total = df.drop('confirmed', axis=1).sum(axis=1) total = list(total)[0] diff = df.diff() diff['deaths'] = ts_normalise_negative_values(diff['deaths'], extreme=2) diff['recovered'] = ts_normalise_negative_values( diff['recovered'], extreme=2) diff['confirmed'] = ts_normalise_negative_values( diff['confirmed'], extreme=2) mov = ts_moving_average(diff, n=7, center=True) df2 = mov.cumsum() df2['safe'] = total - df2.drop(['confirmed', 'safe'], axis=1).sum(axis=1) return mov, df2