In [5]:
!pip install folium
In [5]:
import os
import warnings
from pathlib import Path

import folium
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from folium import plugins
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm as tqdm

pio.templates.default = "plotly_dark"

data_dir = Path('../')
In [8]:
# Load the cleaned COVID-19 time series; 'Date' is parsed to datetime on read.
cleaned_data = pd.read_csv('./covid_19_clean_complete.csv', parse_dates=['Date'])

# Normalise column names to the lower-case names every later cell uses.
# `rename` silently skips keys that are not present, so unused entries are
# harmless no-ops.
# NOTE(review): the original mapping only produced 'confirmed', yet the code
# below reads 'deaths', 'recovered', 'country' and 'state'. The source names
# 'Deaths', 'Recovered', 'Country/Region' and 'Province/State' are assumed
# from the usual layout of this CSV -- confirm against the file header.
cleaned_data = cleaned_data.rename(columns={
    'ObservationDate': 'date',
    'Date': 'date',
    'Last Update': 'last_updated',
    'Province/State': 'state',
    'Country/Region': 'country',
    'Confirmed': 'confirmed',
    'Deaths': 'deaths',
    'Recovered': 'recovered',
})

# cases
cases = ['confirmed', 'deaths', 'recovered', 'active']

# Treat missing counts as 0 *before* deriving 'active'; otherwise a single
# missing deaths/recovered value would turn 'active' into NaN and then into 0.
count_columns = ['confirmed', 'deaths', 'recovered']
cleaned_data[count_columns] = cleaned_data[count_columns].fillna(0)

# Active cases = confirmed - deaths - recovered
cleaned_data['active'] = cleaned_data['confirmed'] - cleaned_data['deaths'] - cleaned_data['recovered']

# Replace 'Mainland China' with just 'China'
cleaned_data['country'] = cleaned_data['country'].replace('Mainland China', 'China')

# Rows without a province/state get an empty string instead of NaN
cleaned_data['state'] = cleaned_data['state'].fillna('')

data = cleaned_data

In [9]:
# Sanity check: report the date range covered by the loaded data.
print("External Data")
first_entry = data['date'].min()
last_entry = data['date'].max()
print(f"Earliest Entry: {first_entry}")
print(f"Last Entry:     {last_entry}")
print(f"Total Days:     {last_entry - first_entry}")
Infections and Fatalities Worldwide

In [10]:
# Worldwide daily totals. The value columns are selected with a list:
# tuple-style selection on a GroupBy raises in pandas >= 2.0, and 'date' is
# already the grouping key so it must not be selected a second time.
group = data.groupby('date')[['confirmed', 'deaths']].sum().reset_index()

fig = px.line(group, x="date", y="confirmed",
              title="Worldwide Confirmed Cases Over Time")
# Render now: `fig` is rebound below, so this chart would otherwise never appear.
fig.show()

fig = px.line(group, x="date", y="deaths",
              title="Worldwide Deaths Over Time")
fig.show()

Normalized Infection and Fatality Ratio

(normalized by country population size)
In [12]:
def p2f(x):
    """Convert a percentage string such as ``'83 %'`` to a fraction (``0.83``).

    Parameters
    ----------
    x : str
        Raw cell text; any leading/trailing ``%`` characters are stripped.

    Returns
    -------
    float
        The parsed number divided by 100, or ``np.nan`` when the text is not
        numeric or ``x`` is not a string.
    """
    try:
        return float(x.strip('%')) / 100
    except (ValueError, AttributeError):
        # Non-numeric placeholder text or a non-string value.
        return np.nan

def age2int(x):
    """Convert an age value to ``int``.

    Parameters
    ----------
    x : str or number
        Raw cell value.

    Returns
    -------
    int or float
        ``int(x)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return int(x)
    except (ValueError, TypeError):
        # Non-integer text or a missing value.
        return np.nan

def fert2float(x):
    """Convert a fertility-rate value to ``float``.

    Parameters
    ----------
    x : str or number
        Raw cell value.

    Returns
    -------
    float
        ``float(x)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return float(x)
    except (ValueError, TypeError):
        # Non-numeric text or a missing value.
        return np.nan

# Country-level demographics. Three text columns are parsed while reading via
# the converter functions; the columns are then given short snake_case names.
column_converters = {
    'Urban Pop %': p2f,
    'Fert. Rate': fert2float,
    'Med. Age': age2int,
}
short_column_names = {
    'Country (or dependency)': 'country',
    'Population (2020)': 'population',
    'Density (P/Km²)': 'density',
    'Fert. Rate': 'fertility',
    'Med. Age': "age",
    'Urban Pop %': 'urban_percentage',
}
countries_df = (
    pd.read_csv("./population_by_country_2020.csv", converters=column_converters)
    .rename(columns=short_column_names)
)

# Use the same spelling for the United States as the case data.
countries_df['country'] = countries_df['country'].replace('United States', 'US')

# Keep only the columns used downstream.
kept_columns = ["country", "population", "density", "fertility", "age", "urban_percentage"]
countries_df = countries_df[kept_columns]

In [13]:
# Inner join (the pandas default): rows whose country has no demographics entry are dropped.
data = data.merge(countries_df, how='inner', on='country')
In [14]:
# Latest snapshot: only the rows stamped with the most recent date.
cleaned_latest = data[data['date'] == data['date'].max()]

# Per country: total confirmed cases and the (constant) population.
# Columns are selected with a list; tuple-style selection on a GroupBy raises
# in pandas >= 2.0.
flg = (
    cleaned_latest.groupby('country')[['confirmed', 'population']]
    .agg({'confirmed': 'sum', 'population': 'mean'})
    .reset_index()
)

# Share of the population with a confirmed infection, in percent.
flg['infectionRate'] = ((flg['confirmed'] / flg['population']) * 100).round(5)
# Ignore countries with very few cases; their rates are too noisy to rank.
temp = flg[flg['confirmed'] > 100]
temp = temp.sort_values('infectionRate', ascending=False)

# NOTE(review): this call was truncated in the source (callee and closing
# parenthesis missing). It is reconstructed as a horizontal px.bar of the ten
# highest rates; any trailing styling argument that was lost is not restored.
fig = px.bar(temp.sort_values(by="infectionRate", ascending=False)[:10][::-1],
             x='infectionRate', y='country',
             title='% of infected people by country', text='infectionRate', height=800, orientation='h')
fig.show()
It makes sense that countries with small populations (e.g. San Marino, Andorra, and Luxembourg) top this list; even so, the infection percentage is still relatively low (<0.5%).
In [15]:
# Per (date, country): the largest confirmed count and the population.
# Columns are selected with a list; tuple-style selection on a GroupBy raises
# in pandas >= 2.0.
formated_gdf = data.groupby(['date', 'country'])[['confirmed', 'population']].max()
formated_gdf = formated_gdf.reset_index()
# The animation frame label must be a string; rows are already in date order
# from the groupby, so formatting afterwards keeps the frames chronological.
formated_gdf['date'] = pd.to_datetime(formated_gdf['date'])
formated_gdf['date'] = formated_gdf['date'].dt.strftime('%m/%d/%Y')
formated_gdf['infectionRate'] = ((formated_gdf['confirmed'] / formated_gdf['population']) * 100).round(8)

fig = px.scatter_geo(formated_gdf, locations="country", locationmode='country names',
                     color="infectionRate", size='infectionRate', hover_name="country",
                     range_color=[0, 0.2],
                     projection="natural earth", animation_frame="date",
                     title='COVID-19: Spread Over Time (Normalized by Country Population)', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
# Without an explicit show() the figure is built but never rendered.
fig.show()
In [16]:
# Latest snapshot: only the rows stamped with the most recent date.
cleaned_latest = data[data['date'] == data['date'].max()]

# Per-country totals. Columns are selected with a list; tuple-style selection
# on a GroupBy raises in pandas >= 2.0.
flg = cleaned_latest.groupby('country')[['confirmed', 'deaths', 'recovered', 'active']].sum().reset_index()

# Deaths per 100 confirmed cases.
flg['mortalityRate'] = ((flg['deaths'] / flg['confirmed']) * 100).round(2)
# Ignore countries with very few cases; their rates are too noisy to rank.
temp = flg[flg['confirmed'] > 100]
temp = temp.sort_values('mortalityRate', ascending=False)

# NOTE(review): this call was truncated in the source (callee and closing
# parenthesis missing). It is reconstructed as a horizontal px.bar of the ten
# highest rates; any trailing styling argument that was lost is not restored.
fig = px.bar(temp.sort_values(by="mortalityRate", ascending=False)[:10][::-1],
             x='mortalityRate', y='country',
             title='Deaths per 100 Confirmed Cases', text='mortalityRate', height=800, orientation='h')
fig.show()
In [17]:
# Per (date, country): the largest confirmed and death counts.
# Columns are selected with a list; tuple-style selection on a GroupBy raises
# in pandas >= 2.0.
formated_gdf = data.groupby(['date', 'country'])[['confirmed', 'deaths']].max()
formated_gdf = formated_gdf.reset_index()
# The animation frame label must be a string; rows are already in date order
# from the groupby, so formatting afterwards keeps the frames chronological.
formated_gdf['date'] = pd.to_datetime(formated_gdf['date'])
formated_gdf['date'] = formated_gdf['date'].dt.strftime('%m/%d/%Y')
formated_gdf['mortalityRate'] = ((formated_gdf['deaths'] / formated_gdf['confirmed']) * 100).round(2)

# fillna(0): 0 deaths / 0 confirmed gives NaN, which is plotted as a rate of 0.
fig = px.scatter_geo(formated_gdf.fillna(0), locations="country", locationmode='country names',
                     color="mortalityRate", size='mortalityRate', hover_name="country",
                     range_color=[0, 10],
                     projection="natural earth", animation_frame="date",
                     title='COVID-19: Mortality Rate in % by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
# Without an explicit show() the figure is built but never rendered.
fig.show()

ICU Beds per Country

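Ratio of ICU beds per 1,000 people. The US is low at 2.9, which is not good. (Note: the file loaded below is the World Bank `SH.MED.BEDS.ZS` indicator, which reports hospital beds per 1,000 people rather than ICU beds specifically.)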
In [19]:
icu_df = pd.read_csv("./API_SH.MED.BEDS.ZS_DS2_en_csv_v2_887506.csv")

# Align country spellings in the beds table with those used in the case data.
country_aliases = {
    'United States': 'US',
    'Russian Federation': 'Russia',
    'Iran, Islamic Rep.': 'Iran',
    'Egypt, Arab Rep.': 'Egypt',
    'Venezuela, RB': 'Venezuela',
}
icu_df['Country Name'] = icu_df['Country Name'].replace(country_aliases)

# The opposite direction for Czechia: rename it in the case data instead.
data['country'] = data['country'].replace('Czechia', 'Czech Republic')
In [20]:
# We want the most recent value per country: walk the year columns in
# ascending order so that later years overwrite earlier ones where present.
icu_cleaned = pd.DataFrame()
icu_cleaned["country"] = icu_df["Country Name"]
icu_cleaned["icu"] = np.nan

for year in range(1960, 2020):
    year_df = icu_df[str(year)].dropna()
    # Assign through a single .loc on the frame. The chained form
    # `icu_cleaned["icu"].loc[...] = ...` writes to an intermediate Series and
    # is not guaranteed to update the frame (SettingWithCopy / copy-on-write).
    icu_cleaned.loc[year_df.index, "icu"] = year_df.values
In [21]:
# Inner join (the pandas default): rows whose country has no beds entry are dropped.
data = data.merge(icu_cleaned, how='inner', on='country')
In [22]:
data['state'] = data['state'].fillna('')
# Everything except the province/state column.
temp = data[[col for col in data.columns if col != 'state']]

# Latest snapshot, then one beds value per country.
latest = temp[temp['date'] == temp['date'].max()].reset_index()
latest_grouped = latest.groupby('country')['icu'].mean().reset_index()

# NOTE(review): this call was truncated in the source (callee and frame
# missing). It is reconstructed as a horizontal px.bar of the ten countries
# with the highest 'icu' value, taken from `latest_grouped`.
fig = px.bar(latest_grouped.sort_values('icu', ascending=False)[:10][::-1],
             x='icu', y='country',
             title='Ratio of ICU Beds per 1000 People', text='icu', orientation='h', color_discrete_sequence=['green'])
fig.show()
In [23]:
# World map coloured by the per-country 'icu' value; the colour scale is
# clamped to [1, 15].
fig = px.choropleth(latest_grouped, locations="country",
                    locationmode='country names', color="icu",
                    hover_name="country", range_color=[1, 15],
                    title='Ratio of ICU beds per 1000 people')
# fig.update(layout_coloraxis_showscale=False)
# Without an explicit show() the figure is built but never rendered.
fig.show()

Temperature Data

In [26]:
df_temperature = pd.read_csv("./temperature_dataframe.csv")

# Align country spellings with the case data.
df_temperature['country'] = df_temperature['country'].replace({'USA': 'US', 'UK': 'United Kingdom'})

weather_columns = ["country", "province", "date", "humidity", "sunHour", "tempC", "windspeedKmph"]
# reset_index() without drop=True also adds the previous row index as an
# 'index' column; 'province' is renamed so the frame shares the 'state' key.
df_temperature = (
    df_temperature[weather_columns]
    .reset_index()
    .rename(columns={'province': 'state'})
)

# Same dtypes and missing-value convention as the case data's join keys.
df_temperature["date"] = pd.to_datetime(df_temperature['date'])
df_temperature['state'] = df_temperature['state'].fillna('')
In [27]:
# Keep only rows that have a weather record for the same country, state and date.
data = pd.merge(data, df_temperature, how='inner', on=['country', 'date', 'state'])
In [28]:
In [29]:
# Work on a copy: a plain alias would make later column assignments on
# `train_data` silently modify `data` as well.
train_data = data.copy()
Regression Model

Using a tree-based regressor (the cells below fit a `DecisionTreeRegressor`, not a random forest), we use each country's input variables to predict the most recent number of infections and deaths as targets. We also extract the relative feature importance.
In [30]:
# Minimum infection rate (in percent) a row needs to be kept for training.
threshold = 0
infection_pct = (train_data['confirmed'] / train_data['population']) * 100
train_data['infectionRate'] = infection_pct.round(5)
# Rows whose rate is NaN fail the comparison and are filtered out as well.
above_threshold = train_data['infectionRate'] >= threshold
train_data = train_data[above_threshold]
In [31]:
# Columns that must not become model inputs:
#   * 'active' and 'infectionRate' are computed from the targets,
#   * 'recovered' is a same-day outcome count, not a country attribute,
#   * 'index' is a row-number artefact of an earlier reset_index().
# NOTE(review): the drop list was empty in the source, which left these and
# all text/datetime columns in X. This list is a reconstruction -- confirm it
# against the intended feature set. errors="ignore" keeps the cell working
# when a listed column is absent.
non_feature_columns = ["active", "recovered", "infectionRate", "index"]
train_data = train_data.drop(non_feature_columns, axis=1, errors="ignore")

# Tree regressors need numeric inputs, so text and datetime columns
# (country, state, date, ...) are removed by dtype; then drop incomplete rows.
train_data = train_data.select_dtypes(include="number").dropna()

y = train_data[["confirmed", "deaths"]]
X = train_data.drop(["confirmed", "deaths"], axis=1)

print(X.shape, y.shape)
In [32]:
import matplotlib.pyplot as plt
import seaborn as sns
# Correlate numeric columns only: DataFrame.corr() raises on text/datetime
# columns in pandas >= 2.0 instead of silently skipping them.
cm = train_data.select_dtypes(include="number").corr()
sns.heatmap(cm, annot=True)
Train and Evaluate Model (Decision Tree)

In [33]:
from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
In [34]:
# Split into training and evaluation data:
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_log_error, make_scorer
def rmsle(y_true, y_pred):
    """Compute the Root Mean Squared Logarithmic Error of a prediction set.

    Parameters
    ----------
    y_true : numpy array of ground truth
    y_pred : numpy array of predictions

    Returns
    -------
    float
        Square root of scikit-learn's ``mean_squared_log_error``.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Wrap the metric for cross_val_score. make_scorer's default treats the value
# as a score (no sign flip), so the reported numbers are plain RMSLE values.
rmsle_scorer = make_scorer(rmsle)

# Shuffled 80/20 split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = tts(
    X,
    y,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)
In [35]:
# Model for confirmed cases.
# NOTE(review): scikit-learn 1.0 renamed criterion "mae" to "absolute_error"
# and 1.2 removed the old spelling; change the string when upgrading.
model_infected = DecisionTreeRegressor(random_state=42, criterion="mae")

# NOTE(review): the data arguments of this call and the fit statement below
# were truncated in the source; they are reconstructed from the leftover
# `y_train["confirmed"]` fragment and the X_train / y_train split.
scores = cross_val_score(model_infected, X_train, y_train["confirmed"],
                         cv=5, scoring=rmsle_scorer)

print("Cross Validation of Confirmed Cases: Mean = {}, std = {}".format(scores.mean(), scores.std()))

# cross_val_score only fits clones, so fit the estimator itself before predicting.
model_infected.fit(X_train, y_train["confirmed"])
result_infected = rmsle(y_val["confirmed"], model_infected.predict(X_val))
print("Validation Infected set RMSLE: {}".format(result_infected))
In [36]:
# Model for deaths.
# NOTE(review): scikit-learn 1.0 renamed criterion "mae" to "absolute_error"
# and 1.2 removed the old spelling; change the string when upgrading.
model_deaths = DecisionTreeRegressor(random_state=42, criterion="mae")

# NOTE(review): the data arguments of this call and the fit statement below
# were truncated in the source; they are reconstructed from the leftover
# `y_train["deaths"]` fragment and the X_train / y_train split.
scores = cross_val_score(model_deaths, X_train, y_train["deaths"],
                         cv=5, scoring=rmsle_scorer)

print("Cross Validation of Fatal Cases: Mean = {}, std = {}".format(scores.mean(), scores.std()))

# cross_val_score only fits clones, so fit the estimator itself before predicting.
model_deaths.fit(X_train, y_train["deaths"])
result_deaths = rmsle(y_val["deaths"], model_deaths.predict(X_val))
print("Validation Death set RMSLE: {}".format(result_deaths))
In [44]:
# Final evaluation: average of the two validation RMSLE values.
validation_scores = [result_infected, result_deaths]
final_score = np.mean(validation_scores)
print("Final Validation score: {}".format(final_score))
Final Validation score: 1.2789003475185892

Extract Features for Infections

In [38]:
# Refit both models on the full feature matrix before reading the importances.
# NOTE(review): both statements were truncated in the source (only the
# `, y[...])` tail survived); reconstructed as `.fit(X, y[...])`.
model_infected = model_infected.fit(X, y["confirmed"])
model_deaths = model_deaths.fit(X, y["deaths"])
In [39]:
def show_feature_importance(forest, feature_names=None):
    """Print a sorted ranking of a model's feature importances and plot it.

    Parameters
    ----------
    forest : fitted tree-based estimator
        Must expose ``feature_importances_``.
    feature_names : sequence of str, optional
        Names aligned with ``forest.feature_importances_``. When omitted, the
        columns of the notebook-level feature frame ``X`` are used.
    """
    importances = np.asarray(forest.feature_importances_)
    names = np.asarray(X.columns if feature_names is None else feature_names)
    # Indices of the features from most to least important.
    indices = np.argsort(importances)[::-1]
    n_features = len(importances)

    # Print the feature ranking
    print("Feature ranking:")

    for rank, idx in enumerate(indices, start=1):
        print("{}, Feature: {}, Importance: {}".format(rank, names[idx], importances[idx]))

    # Plot the feature importances on a fresh figure so repeated calls do not
    # draw on top of each other.
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices], color="r", align="center")
    plt.xticks(range(n_features), names[indices], rotation='vertical')
    plt.xlim([-1, n_features])
    plt.show()
In [40]:
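The feature importances suggest that temperature, population, hours of sunlight, humidity, and age are the strongest predictors of infection counts; surprisingly, they matter much more than ICU beds, the percentage of the population that is urban, or population density. (Note that feature importance measures how much a variable contributes to the prediction, not whether its relationship with infections is positive or negative.)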
In [41]:
