!pip install folium

Collecting folium
  Downloading folium-0.10.1-py2.py3-none-any.whl (91 kB)
Requirement already satisfied: numpy in c:\users\shane\lib\site-packages (from folium) (1.18.1)
Requirement already satisfied: jinja2>=2.9 in c:\users\shane\lib\site-packages (from folium) (2.11.0)
Collecting branca>=0.3.0
  Downloading branca-0.4.0-py3-none-any.whl (25 kB)
Requirement already satisfied: requests in c:\users\shane\lib\site-packages (from folium) (2.22.0)
Requirement already satisfied: MarkupSafe>=0.23 in c:\users\shane\lib\site-packages (from jinja2>=2.9->folium) (1.1.1)
Requirement already satisfied: six in c:\users\shane\lib\site-packages (from branca>=0.3.0->folium) (1.14.0)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\shane\lib\site-packages (from requests->folium) (2018.1.18)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\users\shane\lib\site-packages (from requests->folium) (3.0.4)
Requirement already satisfied: idna<2.9,>=2.5 in c:\users\shane\lib\site-packages (from requests->folium) (2.7)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\users\shane\lib\site-packages (from requests->folium) (1.24.3)
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1

import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"
from plotly.subplots import make_subplots
import folium 
from folium import plugins
from tqdm.notebook import tqdm as tqdm


from pathlib import Path
data_dir = Path('../')

import os
os.listdir(data_dir)

import warnings
warnings.filterwarnings('ignore')

# Load the cleaned case time series; the 'Date' column is parsed to datetime64.
cleaned_data = pd.read_csv('./covid_19_clean_complete.csv', parse_dates=['Date'])

# Normalise the column names to short lowercase identifiers in a single pass.
# rename() ignores keys that are not present in the frame, so the entries for
# other layouts of the data set ('ObservationDate', 'Last Update') are harmless.
cleaned_data.rename(columns={'ObservationDate': 'date',
                             'Date': 'date',
                             'Province/State': 'state',
                             'Country/Region': 'country',
                             'Last Update': 'last_updated',
                             'Confirmed': 'confirmed',
                             'Deaths': 'deaths',
                             'Recovered': 'recovered'
                             }, inplace=True)

# Case-count columns that are treated together below.
cases = ['confirmed', 'deaths', 'recovered', 'active']

# Active cases = confirmed - deaths - recovered
cleaned_data['active'] = cleaned_data['confirmed'] - cleaned_data['deaths'] - cleaned_data['recovered']

# Replace 'Mainland China' with just 'China'.
cleaned_data['country'] = cleaned_data['country'].replace('Mainland China', 'China')

# Fill missing values: an empty province name and zero for missing counts.
cleaned_data[['state']] = cleaned_data[['state']].fillna('')
cleaned_data[cases] = cleaned_data[cases].fillna(0)

# NOTE: this is an alias, not a copy -- `data` and `cleaned_data` are the same object.
data = cleaned_data

display(data.head())
# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19220 entries, 0 to 19219
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   state      19220 non-null  object        
 1   country    19220 non-null  object        
 2   Lat        19220 non-null  float64       
 3   Long       19220 non-null  float64       
 4   date       19220 non-null  datetime64[ns]
 5   confirmed  19220 non-null  float64       
 6   deaths     19220 non-null  float64       
 7   recovered  19220 non-null  float64       
 8   active     19220 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 1.3+ MB

None

# Report the date span covered by the loaded data to check it is up to date.
first_entry = data['date'].min()
last_entry = data['date'].max()

print("External Data")
print(f"Earliest Entry: {first_entry}")
print(f"Last Entry:     {last_entry}")
print(f"Total Days:     {last_entry - first_entry}")

External Data
Earliest Entry: 2020-01-22 00:00:00
Last Entry:     2020-03-23 00:00:00
Total Days:     61 days 00:00:00

Infections and Fatalities Worldwide¶

# Sum confirmed cases and deaths over all countries/provinces for each day.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (gb['a', 'b']) is deprecated in pandas, and the grouping key 'date' is not
# listed again because reset_index() turns it back into a column.
group = data.groupby('date')[['confirmed', 'deaths']].sum().reset_index()

# Cumulative confirmed cases worldwide.
fig = px.line(group, x="date", y="confirmed",
              title="Worldwide Confirmed Cases Over Time")

fig.show()

# Cumulative deaths worldwide.
fig = px.line(group, x="date", y="deaths",
              title="Worldwide Deaths Over Time")

fig.show()

Normalized Infection and Fatality Ratio¶

(normalized by country population size)¶

def p2f(x):
    """
    Convert a percentage string such as ``"61 %"`` to a fraction (``0.61``).

    params:
        x: raw cell value; expected to be a string ending in '%'
    returns:
        The percentage divided by 100 as a float, or ``np.nan`` when the
        value is not a string or does not contain a parsable number.
    """
    try:
        return float(x.strip('%'))/100
    # Only conversion problems are swallowed: a non-string input has no
    # .strip (AttributeError) and non-numeric text fails in float()
    # (ValueError / TypeError). Anything else is allowed to propagate.
    except (AttributeError, TypeError, ValueError):
        return np.nan

def age2int(x):
    """
    Convert a median-age cell value to an integer.

    params:
        x: raw cell value, e.g. ``"38"``
    returns:
        ``int(x)``, or ``np.nan`` when the value cannot be converted
        (for example a placeholder such as ``"N.A."``).
    """
    try:
        return int(x)
    # Only conversion failures map to NaN; other exceptions propagate.
    except (TypeError, ValueError, OverflowError):
        return np.nan

def fert2float(x):
    """
    Convert a fertility-rate cell value to a float.

    params:
        x: raw cell value, e.g. ``"1.7"``
    returns:
        ``float(x)``, or ``np.nan`` when the value cannot be converted
        (for example a placeholder such as ``"N.A."``).
    """
    try:
        return float(x)
    # Only conversion failures map to NaN; other exceptions propagate.
    except (TypeError, ValueError):
        return np.nan


# Read the per-country population table, converting the text columns
# (urban share, fertility rate, median age) to numbers while parsing.
countries_df = pd.read_csv(
    "./population_by_country_2020.csv",
    converters={
        'Urban Pop %': p2f,
        'Fert. Rate': fert2float,
        'Med. Age': age2int,
    },
)

# Short, lowercase column names that match the case data.
countries_df = countries_df.rename(columns={
    'Country (or dependency)': 'country',
    'Population (2020)': 'population',
    'Density (P/Km²)': 'density',
    'Fert. Rate': 'fertility',
    'Med. Age': "age",
    'Urban Pop %': 'urban_percentage',
})

# The case data calls the United States 'US'.
countries_df['country'] = countries_df['country'].replace('United States', 'US')

# Keep only the columns used later on.
countries_df = countries_df[["country", "population", "density", "fertility", "age", "urban_percentage"]]

countries_df.head()

# Attach the demographic columns to every case row. merge() defaults to an
# inner join, so rows whose country name has no exact match in countries_df
# are dropped here.
data = pd.merge(data, countries_df, on='country')

# Snapshot of the most recent day in the data set.
cleaned_latest = data[data['date'] == data['date'].max()]

# Per country: total confirmed cases over all provinces. The population is
# repeated on every province row by the merge, so its mean is the country value.
# (List-style column selection; tuple-style selection on a GroupBy is deprecated.)
flg = (cleaned_latest.groupby('country')[['confirmed', 'population']]
       .agg({'confirmed': 'sum', 'population': 'mean'})
       .reset_index())

# Share of the population with a confirmed infection, in percent.
flg['infectionRate'] = round((flg['confirmed']/flg['population'])*100, 5)

# Ignore countries with 100 or fewer confirmed cases, then rank by rate.
temp = flg[flg['confirmed']>100]
temp = temp.sort_values('infectionRate', ascending=False)

# temp is already sorted, so the ten highest rates are simply its first ten
# rows; they are passed in reverse order to the horizontal bar chart.
fig = px.bar(temp[:10][::-1],
             x = 'infectionRate', y = 'country',
             title='% of infected people by country', text='infectionRate', height=800, orientation='h',
             color_discrete_sequence=['red']
            )
fig.show()

It makes sense that these are countries with small populations, i.e. San Marino, Andorra, and Luxembourg; however, the infection percentage is still relatively low (<0.5%).¶

# Per day and country: confirmed cases summed over all provinces (the same
# aggregation the latest-day bar chart uses) and the country population,
# which is identical on every province row.
# (List-style column selection; tuple-style selection on a GroupBy is deprecated.)
formated_gdf = (data.groupby(['date', 'country'])[['confirmed', 'population']]
                .agg({'confirmed': 'sum', 'population': 'max'}))
formated_gdf = formated_gdf.reset_index()

# The animation frames are labelled with the date as text (MM/DD/YYYY).
formated_gdf['date'] = pd.to_datetime(formated_gdf['date'])
formated_gdf['date'] = formated_gdf['date'].dt.strftime('%m/%d/%Y')

# Share of the population with a confirmed infection, in percent.
formated_gdf['infectionRate'] = round((formated_gdf['confirmed']/formated_gdf['population'])*100, 8)

fig = px.scatter_geo(formated_gdf, locations="country", locationmode='country names',
                     color="infectionRate", size='infectionRate', hover_name="country",
                     range_color= [0, 0.2],
                     projection="natural earth", animation_frame="date",
                     title='COVID-19: Spread Over Time (Normalized by Country Population)', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()

# Snapshot of the most recent day in the data set.
cleaned_latest = data[data['date'] == data['date'].max()]

# Country totals over all provinces.
# (List-style column selection; tuple-style selection on a GroupBy is deprecated.)
flg = cleaned_latest.groupby('country')[['confirmed', 'deaths', 'recovered', 'active']].sum().reset_index()

# Deaths per 100 confirmed cases.
flg['mortalityRate'] = round((flg['deaths']/flg['confirmed'])*100, 2)

# Ignore countries with 100 or fewer confirmed cases, then rank by rate.
temp = flg[flg['confirmed']>100]
temp = temp.sort_values('mortalityRate', ascending=False)

# temp is already sorted, so the ten highest rates are simply its first ten
# rows; they are passed in reverse order to the horizontal bar chart.
fig = px.bar(temp[:10][::-1],
             x = 'mortalityRate', y = 'country',
             title='Deaths per 100 Confirmed Cases', text='mortalityRate', height=800, orientation='h',
             color_discrete_sequence=['darkred']
            )
fig.show()

# Per day and country: confirmed cases and deaths summed over all provinces,
# matching the aggregation used for the latest-day mortality bar chart.
# (List-style column selection; tuple-style selection on a GroupBy is deprecated.)
formated_gdf = data.groupby(['date', 'country'])[['confirmed', 'deaths']].sum()
formated_gdf = formated_gdf.reset_index()

# The animation frames are labelled with the date as text (MM/DD/YYYY).
formated_gdf['date'] = pd.to_datetime(formated_gdf['date'])
formated_gdf['date'] = formated_gdf['date'].dt.strftime('%m/%d/%Y')

# Deaths per 100 confirmed cases; 0/0 yields NaN for days without any cases.
formated_gdf['mortalityRate'] = round((formated_gdf['deaths']/formated_gdf['confirmed'])*100, 2)

# NaN rates are replaced by 0 for plotting.
fig = px.scatter_geo(formated_gdf.fillna(0), locations="country", locationmode='country names',
                     color="mortalityRate", size='mortalityRate', hover_name="country",
                     range_color= [0, 10],
                     projection="natural earth", animation_frame="date",
                     title='COVID-19: Mortality Rate in % by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()

ICU Beds per Country¶

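Ratio of hospital beds per 1000 people, used here as a proxy for ICU capacity (the World Bank indicator SH.MED.BEDS.ZS loaded below counts hospital beds, not ICU beds specifically). The US is low at 2.9 - not good.¶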

# Load the beds-per-1000-people table (one column per year) and align its
# country names with the ones used in the case data.
# NOTE(review): the file name carries the indicator code SH.MED.BEDS.ZS, which
# presumably is the World Bank "hospital beds" series rather than ICU beds -- confirm.
icu_df = pd.read_csv("./API_SH.MED.BEDS.ZS_DS2_en_csv_v2_887506.csv")
icu_df['Country Name'] = icu_df['Country Name'].replace('United States', 'US')
icu_df['Country Name'] = icu_df['Country Name'].replace('Russian Federation', 'Russia')
icu_df['Country Name'] = icu_df['Country Name'].replace('Iran, Islamic Rep.', 'Iran')
icu_df['Country Name'] = icu_df['Country Name'].replace('Egypt, Arab Rep.', 'Egypt')
icu_df['Country Name'] = icu_df['Country Name'].replace('Venezuela, RB', 'Venezuela')
data['country'] = data['country'].replace('Czechia', 'Czech Republic')

# We wish to have the most recent values, thus we need to go through every year and extract the most recent one, if it exists.
icu_cleaned = pd.DataFrame()
icu_cleaned["country"] = icu_df["Country Name"]
icu_cleaned["icu"] = np.nan

# Walking the years in ascending order lets later years overwrite earlier
# ones, so each country ends up with its latest available value.
for year in range(1960, 2020):
    year_df = icu_df[str(year)].dropna()
    # Single .loc call on the frame: the chained form
    # icu_cleaned["icu"].loc[...] = ... may write to a temporary copy and is
    # not guaranteed to update icu_cleaned.
    icu_cleaned.loc[year_df.index, "icu"] = year_df.values

# Inner join (merge() default): countries without a matching name are dropped.
data = pd.merge(data, icu_cleaned, on='country')

# Make sure no province name is missing, then work on a view of the data
# without the 'state' column.
data['state'] = data['state'].fillna('')
temp = data.drop(columns='state')

# Rows of the most recent day, reduced to one value per country.
latest = temp.loc[temp['date'] == max(temp['date'])].reset_index()
latest_grouped = latest.groupby('country')['icu'].mean().reset_index()


# Ten countries with the highest value, passed in reverse order to the
# horizontal bar chart.
fig = px.bar(
    latest_grouped.sort_values('icu', ascending=False).head(10).iloc[::-1],
    x='icu',
    y='country',
    title='Ratio of ICU Beds per 1000 People',
    text='icu',
    orientation='h',
    color_discrete_sequence=['green'],
)
fig.show()

# World map of the same per-country value.
fig = px.choropleth(
    latest_grouped,
    locations="country",
    locationmode='country names',
    color="icu",
    hover_name="country",
    range_color=[1,15],
    color_continuous_scale="algae",
    title='Ratio of ICU beds per 1000 people',
)
# fig.update(layout_coloraxis_showscale=False)
fig.show()

Temperature Data¶

# Load the weather table and align its country names with the case data.
df_temperature = pd.read_csv("./temperature_dataframe.csv")
df_temperature['country'] = df_temperature['country'].replace({'USA': 'US', 'UK': 'United Kingdom'})

# Keep the join keys and the weather measurements; reset_index() adds the old
# row labels as an 'index' column, and 'province' becomes 'state' so it
# matches the case data.
df_temperature = (
    df_temperature[["country", "province", "date", "humidity", "sunHour", "tempC", "windspeedKmph"]]
    .reset_index()
    .rename(columns={'province': 'state'})
)

# Same key types as the case data: datetime dates and '' for a missing province.
df_temperature["date"] = pd.to_datetime(df_temperature['date'])
df_temperature['state'] = df_temperature['state'].fillna('')


df_temperature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16677 entries, 0 to 16676
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   index          16677 non-null  int64         
 1   country        16677 non-null  object        
 2   state          16677 non-null  object        
 3   date           16677 non-null  datetime64[ns]
 4   humidity       16500 non-null  float64       
 5   sunHour        16500 non-null  float64       
 6   tempC          16500 non-null  float64       
 7   windspeedKmph  16500 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(1), object(2)
memory usage: 1.0+ MB

# Join the weather measurements onto the case rows; only rows present in both
# tables for the same country, province and day are kept.
data = pd.merge(data, df_temperature, how='inner', on=['country', 'date', 'state'])

# Persist the combined table.
data.to_csv("countries_icu_temp.csv")

data.head()

# NOTE: alias, not a copy -- train_data and data are the same object here.
train_data = data
print(train_data.shape)
train_data.head()

(15177, 20)

Regression Model¶

Using a decision tree regressor (`DecisionTreeRegressor`; a random forest is not actually fitted below), we use each country's input variables to predict the reported number of confirmed infections and deaths as the targets. We also extract the relative feature importances.¶

# Minimum infection rate (in percent of the population) a row must reach.
threshold = 0

# Confirmed cases as a percentage of the population, rounded to 5 decimals.
train_data['infectionRate'] = ((train_data['confirmed'] / train_data['population']) * 100).round(5)

# Keep the rows at or above the threshold (rows with a NaN rate fail the test).
train_data = train_data.loc[train_data['infectionRate'] >= threshold]
print(train_data.shape)

(15177, 21)

# Remove identifiers, coordinates, dates and the columns derived from the
# targets, then discard every row that still has a missing value.
train_data = train_data.drop(
    columns=[
        "country",
        "active",
        "recovered",
        "infectionRate",
        "state",
        "Lat",
        "Long",
        "date",
        "index",
    ]
).dropna()

# Targets: the two case counts. Features: every remaining column.
y = train_data[["confirmed", "deaths"]]
X = train_data.drop(columns=["confirmed", "deaths"])

display(X.head())
print(X.shape, y.shape)

(14520, 10) (14520, 2)

import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise correlation matrix of all remaining columns (features and targets).
cm = train_data.corr()
plt.figure(figsize=(20,10))
# annot=True writes the correlation value into each cell of the heatmap.
sns.heatmap(cm, annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x1d2f88aa2e8>

Train and Evaluate Model (Decision Tree)¶

from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)

# Split into training and evaluation data:
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_log_error, make_scorer
def rmsle(y_true, y_pred):
    """
    Root Mean Squared Logarithmic Error between targets and predictions.
    params:
        y_true: numpy array of ground truth
        y_pred: numpy array of predictions
    returns:
        Square root of sklearn's mean_squared_log_error for the two arrays.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

# Wrap rmsle so cross_val_score can use it as its `scoring` callable.
# NOTE(review): make_scorer treats scores as "greater is better" by default;
# that is fine for reporting, but revisit before using this scorer for model
# selection, since RMSLE is an error (lower is better).
rmsle_scorer = make_scorer(rmsle)

# Hold out 20% of the rows for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = tts(X, y, test_size= 0.2, random_state=42, shuffle=True)

# Decision tree for the confirmed-case target, split on mean absolute error.
model_infected = DecisionTreeRegressor(criterion="mae", random_state=42)

# 5-fold cross-validated RMSLE on the training split.
scores = cross_val_score(
    model_infected,
    X_train,
    y_train["confirmed"],
    scoring=rmsle_scorer,
    cv=5,
)

print("Cross Validation of Confirmed Cases: Mean = {}, std = {}".format(np.mean(scores), np.std(scores)))

# Fit on the whole training split and score the held-out validation rows.
model_infected.fit(X_train, y_train["confirmed"])
result_infected = rmsle(y_val["confirmed"], model_infected.predict(X_val))
print("Validation Infected set RMSLE: {}".format(result_infected))

Cross Validation of Confirmed Cases: Mean = 1.709191346175162, std = 0.08849664321754487
Validation Infected set RMSLE: 1.8249734268769162

# Decision tree for the deaths target, split on mean absolute error.
model_deaths = DecisionTreeRegressor(criterion="mae", random_state=42)

# 5-fold cross-validated RMSLE on the training split.
scores = cross_val_score(
    model_deaths,
    X_train,
    y_train["deaths"],
    scoring=rmsle_scorer,
    cv=5,
)

print("Cross Validation of Fatal Cases: Mean = {}, std = {}".format(np.mean(scores), np.std(scores)))

# Fit on the whole training split and score the held-out validation rows.
model_deaths.fit(X_train, y_train["deaths"])
result_deaths = rmsle(y_val["deaths"], model_deaths.predict(X_val))
print("Validation Death set RMSLE: {}".format(result_deaths))

Cross Validation of Fatal Cases: Mean = 0.7289154951349002, std = 0.06238107924858306
Validation Death set RMSLE: 0.7328272681602619

# Final evaluation: unweighted mean of the two validation RMSLE values.
print("Final Validation score: {}".format(np.mean([result_infected, result_deaths])))

Final Validation score: 1.2789003475185892

Extract Features for Infections¶

# Refit both trees on all rows (training + validation) before reading their
# feature importances. fit() returns the estimator, so the names keep
# referring to the same model objects.
model_infected = model_infected.fit(X, y["confirmed"])
model_deaths = model_deaths.fit(X, y["deaths"])

def show_feature_importance(forest, feature_names=None):
    """
    Print the features of a fitted tree-based model ranked by importance and
    plot the ranking as a bar chart.

    params:
        forest: fitted estimator exposing ``feature_importances_``
        feature_names: names of the columns the model was fitted on, in
            fitting order. Defaults to the columns of the module-level
            feature frame ``X``, which keeps the one-argument call working.
    raises:
        ValueError: if the number of names differs from the number of
            importances.
    """
    if feature_names is None:
        # Backward-compatible default: the notebook's global feature frame.
        feature_names = X.columns

    importances = np.asarray(forest.feature_importances_)
    names = np.asarray(feature_names)
    if len(names) != len(importances):
        raise ValueError(
            "Got {} feature names for {} importances".format(len(names), len(importances))
        )

    # Feature positions ordered from most to least important.
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for rank, idx in enumerate(indices, start=1):
        print("{}, Feature: {}, Importance: {}".format(rank, names[idx], importances[idx]))

    # Plot the feature importances in the same order
    n_features = len(indices)
    plt.figure(figsize=(20,10))
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices], color="r", align="center")
    plt.xticks(range(n_features), names[indices], rotation='vertical')
    plt.xlim([-1, n_features])
    plt.show()

show_feature_importance(model_infected)

Feature ranking:
1, Feature: tempC, Importance: 0.23468829939161234
2, Feature: population, Importance: 0.20000156935776625
3, Feature: sunHour, Importance: 0.17830127484162486
4, Feature: humidity, Importance: 0.17314201118428982
5, Feature: age, Importance: 0.1063226808815607
6, Feature: windspeedKmph, Importance: 0.09353764627722243
7, Feature: icu, Importance: 0.006484063171881324
8, Feature: urban_percentage, Importance: 0.0032524939710505785
9, Feature: fertility, Importance: 0.002900696271729065
10, Feature: density, Importance: 0.0013692646512625463

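We see that temperature, population, hours of sunlight, humidity, and age are the most important features for predicting confirmed cases, surprisingly much more so than ICU beds, the percentage of the population that is urban, or population density. (Feature importance measures how much a feature contributes to the tree's splits; it does not show whether the relationship is positive or negative.)¶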

I have a lot to say about the link with temperature, but I'm still working on that. This is a work in progress, obviously.¶

show_feature_importance(model_deaths)

Feature ranking:
1, Feature: tempC, Importance: 0.3251088534107426
2, Feature: windspeedKmph, Importance: 0.22931785195935728
3, Feature: population, Importance: 0.2133526850507975
4, Feature: sunHour, Importance: 0.15239477503628723
5, Feature: humidity, Importance: 0.06240928882438295
6, Feature: icu, Importance: 0.010643444605708718
7, Feature: density, Importance: 0.006773101112723731
8, Feature: urban_percentage, Importance: 0.0
9, Feature: age, Importance: 0.0
10, Feature: fertility, Importance: 0.0

	country	population	density	fertility	age	urban_percentage
0	China	1439323776	153	1.7	38.0	0.61
1	India	1380004385	464	2.2	28.0	0.35
2	US	331002651	36	1.8	38.0	0.83
3	Indonesia	273523615	151	2.3	30.0	0.56
4	Pakistan	220892340	287	3.6	23.0	0.35

	country	Lat	Long	date	confirmed	recovered	active	population	density	fertility	age	urban_percentage	icu	index	humidity	sunHour	tempC	windspeedKmph
0	Thailand	15.0	101.0	2020-01-22	2.0	0.0	2.0	69799978	137	1.5	40.0	0.51	2.1	12118	55.0	8.7	36.0	11.0
1	Thailand	15.0	101.0	2020-01-23	3.0	0.0	3.0	69799978	137	1.5	40.0	0.51	2.1	12119	59.0	11.6	35.0	14.0
2	Thailand	15.0	101.0	2020-01-24	5.0	0.0	5.0	69799978	137	1.5	40.0	0.51	2.1	12120	60.0	11.6	35.0	16.0
3	Thailand	15.0	101.0	2020-01-25	7.0	0.0	7.0	69799978	137	1.5	40.0	0.51	2.1	12121	63.0	11.6	32.0	22.0
4	Thailand	15.0	101.0	2020-01-26	8.0	2.0	6.0	69799978	137	1.5	40.0	0.51	2.1	12122	54.0	11.6	34.0	15.0

	country	Lat	Long	date	confirmed	recovered	active	population	density	fertility	age	urban_percentage	icu	index	humidity	sunHour	tempC	windspeedKmph
0	Thailand	15.0	101.0	2020-01-22	2.0	0.0	2.0	69799978	137	1.5	40.0	0.51	2.1	12118	55.0	8.7	36.0	11.0
1	Thailand	15.0	101.0	2020-01-23	3.0	0.0	3.0	69799978	137	1.5	40.0	0.51	2.1	12119	59.0	11.6	35.0	14.0
2	Thailand	15.0	101.0	2020-01-24	5.0	0.0	5.0	69799978	137	1.5	40.0	0.51	2.1	12120	60.0	11.6	35.0	16.0
3	Thailand	15.0	101.0	2020-01-25	7.0	0.0	7.0	69799978	137	1.5	40.0	0.51	2.1	12121	63.0	11.6	32.0	22.0
4	Thailand	15.0	101.0	2020-01-26	8.0	2.0	6.0	69799978	137	1.5	40.0	0.51	2.1	12122	54.0	11.6	34.0	15.0

	population	density	fertility	age	urban_percentage	icu	humidity	sunHour	tempC	windspeedKmph
0	69799978	137	1.5	40.0	0.51	2.1	55.0	8.7	36.0	11.0
1	69799978	137	1.5	40.0	0.51	2.1	59.0	11.6	35.0	14.0
2	69799978	137	1.5	40.0	0.51	2.1	60.0	11.6	35.0	16.0
3	69799978	137	1.5	40.0	0.51	2.1	63.0	11.6	32.0	22.0
4	69799978	137	1.5	40.0	0.51	2.1	54.0	11.6	34.0	15.0

	country	Lat	Long	date	confirmed	active
0	Thailand	15.0000	101.0000	2020-01-22	2.0	2.0
1	Japan	36.0000	138.0000	2020-01-22	2.0	2.0
2	Singapore	1.2833	103.8333	2020-01-22	0.0	0.0
3	Nepal	28.1667	84.2500	2020-01-22	0.0	0.0
4	Malaysia	2.5000	112.5000	2020-01-22	0.0	0.0