In [5]:
!pip install folium
Collecting folium
  Downloading folium-0.10.1-py2.py3-none-any.whl (91 kB)
Requirement already satisfied: numpy in c:\users\shane\lib\site-packages (from folium) (1.18.1)
Requirement already satisfied: jinja2>=2.9 in c:\users\shane\lib\site-packages (from folium) (2.11.0)
Collecting branca>=0.3.0
  Downloading branca-0.4.0-py3-none-any.whl (25 kB)
Requirement already satisfied: requests in c:\users\shane\lib\site-packages (from folium) (2.22.0)
Requirement already satisfied: MarkupSafe>=0.23 in c:\users\shane\lib\site-packages (from jinja2>=2.9->folium) (1.1.1)
Requirement already satisfied: six in c:\users\shane\lib\site-packages (from branca>=0.3.0->folium) (1.14.0)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\shane\lib\site-packages (from requests->folium) (2018.1.18)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\users\shane\lib\site-packages (from requests->folium) (3.0.4)
Requirement already satisfied: idna<2.9,>=2.5 in c:\users\shane\lib\site-packages (from requests->folium) (2.7)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\users\shane\lib\site-packages (from requests->folium) (1.24.3)
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1
In [5]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"
from plotly.subplots import make_subplots
import folium 
from folium import plugins
from tqdm.notebook import tqdm as tqdm


from pathlib import Path
data_dir = Path('../')

import os
os.listdir(data_dir)

import warnings
warnings.filterwarnings('ignore')
In [8]:
# Load the case time series; 'Date' is parsed up front so that date filters and
# group-bys further down operate on real datetimes.
cleaned_data = pd.read_csv('./covid_19_clean_complete.csv', parse_dates=['Date'])

# Normalise the column names. rename() silently ignores keys that are not present,
# so the mapping can cover the header variants of different exports of this dataset.
cleaned_data = cleaned_data.rename(columns={'ObservationDate': 'date',
                                            'Date': 'date',
                                            'Province/State': 'state',
                                            'Country/Region': 'country',
                                            'Last Update': 'last_updated',
                                            'Confirmed': 'confirmed',
                                            'Deaths': 'deaths',
                                            'Recovered': 'recovered'
                                            })

# cases
cases = ['confirmed', 'deaths', 'recovered', 'active']

# replacing Mainland china with just China
cleaned_data['country'] = cleaned_data['country'].replace('Mainland China', 'China')

# filling missing values
# The reported counts are filled BEFORE 'active' is derived: otherwise a single missing
# count turns 'active' into NaN, which would then be filled with 0 instead of
# confirmed - deaths - recovered.
cleaned_data['state'] = cleaned_data['state'].fillna('')
reported_counts = ['confirmed', 'deaths', 'recovered']
cleaned_data[reported_counts] = cleaned_data[reported_counts].fillna(0)

# Active Case = confirmed - deaths - recovered
cleaned_data['active'] = cleaned_data['confirmed'] - cleaned_data['deaths'] - cleaned_data['recovered']

data = cleaned_data

display(data.head())
# info() prints its report itself and returns None, so it must not be wrapped in display().
data.info()
state country Lat Long date confirmed deaths recovered active
0 Thailand 15.0000 101.0000 2020-01-22 2.0 0.0 0.0 2.0
1 Japan 36.0000 138.0000 2020-01-22 2.0 0.0 0.0 2.0
2 Singapore 1.2833 103.8333 2020-01-22 0.0 0.0 0.0 0.0
3 Nepal 28.1667 84.2500 2020-01-22 0.0 0.0 0.0 0.0
4 Malaysia 2.5000 112.5000 2020-01-22 0.0 0.0 0.0 0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19220 entries, 0 to 19219
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   state      19220 non-null  object        
 1   country    19220 non-null  object        
 2   Lat        19220 non-null  float64       
 3   Long       19220 non-null  float64       
 4   date       19220 non-null  datetime64[ns]
 5   confirmed  19220 non-null  float64       
 6   deaths     19220 non-null  float64       
 7   recovered  19220 non-null  float64       
 8   active     19220 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 1.3+ MB
None
In [9]:
# Check if the data is updated: report the first and last observation date and the span between them.
first_day = data['date'].min()
last_day = data['date'].max()

print("External Data")
print(f"Earliest Entry: {first_day}")
print(f"Last Entry:     {last_day}")
print(f"Total Days:     {last_day - first_day}")
External Data
Earliest Entry: 2020-01-22 00:00:00
Last Entry:     2020-03-23 00:00:00
Total Days:     61 days 00:00:00

Infections and Fatalities Worldwide

In [10]:
# Daily worldwide totals. The columns are selected with a list (tuple-style selection on a
# GroupBy is deprecated), and the grouping key 'date' is not selected again because
# reset_index() already brings it back as a column.
group = data.groupby('date')[['confirmed', 'deaths']].sum().reset_index()

# One line chart per metric.
for metric, chart_title in (("confirmed", "Worldwide Confirmed Cases Over Time"),
                            ("deaths", "Worldwide Deaths Over Time")):
    fig = px.line(group, x="date", y=metric, title=chart_title)
    fig.show()

Normalized Infection and Fatality Ratio

(normalized by country population size)
In [12]:
def p2f(x):
    """
    Convert urban percentage to float

    Strips '%' characters from both ends of ``x`` and divides the remaining number
    by 100, e.g. ``'61 %'`` -> ``0.61``.

    Returns ``np.nan`` when ``x`` is not a string or when the remainder is not a
    number (e.g. ``'N.A.'``).
    """
    try:
        return float(x.strip('%'))/100
    except (AttributeError, TypeError, ValueError):
        # Only genuine conversion failures become NaN; anything else
        # (KeyboardInterrupt, SystemExit, ...) is allowed to propagate.
        return np.nan

def age2int(x):
    """
    Convert Age to integer

    Returns ``int(x)``; values that ``int`` cannot convert (e.g. ``'N.A.'``, ``None``,
    or a decimal string such as ``'38.5'``) yield ``np.nan``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures become NaN; anything else
        # (KeyboardInterrupt, SystemExit, ...) is allowed to propagate.
        return np.nan

def fert2float(x):
    """
    Convert Fertility Rate to float

    Returns ``float(x)``; values that ``float`` cannot convert (e.g. ``'N.A.'`` or
    ``None``) yield ``np.nan``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures become NaN; anything else
        # (KeyboardInterrupt, SystemExit, ...) is allowed to propagate.
        return np.nan


# Per-column parsers applied while reading: they turn the textual percentage, fertility
# and median-age fields into numbers (NaN when a value cannot be parsed).
population_converters = {'Urban Pop %': p2f,
                         'Fert. Rate': fert2float,
                         'Med. Age': age2int}

# Short names for the columns used in the rest of the notebook.
population_column_names = {'Country (or dependency)': 'country',
                           'Population (2020)': 'population',
                           'Density (P/Km²)': 'density',
                           'Fert. Rate': 'fertility',
                           'Med. Age': "age",
                           'Urban Pop %': 'urban_percentage'}

countries_df = pd.read_csv(
    "./population_by_country_2020.csv", converters=population_converters
).rename(columns=population_column_names)

# Use the same label for the United States as the case data does.
countries_df['country'] = countries_df['country'].replace('United States', 'US')

# Keep only the attributes that are merged into the case data later.
countries_df = countries_df[["country", "population", "density", "fertility", "age", "urban_percentage"]]

countries_df.head()
Out[12]:
country population density fertility age urban_percentage
0 China 1439323776 153 1.7 38.0 0.61
1 India 1380004385 464 2.2 28.0 0.35
2 US 331002651 36 1.8 38.0 0.83
3 Indonesia 273523615 151 2.3 30.0 0.56
4 Pakistan 220892340 287 3.6 23.0 0.35
In [13]:
# Attach the country attributes to every case row. merge() defaults to an inner join,
# so rows whose country label has no match in countries_df are dropped.
data = data.merge(countries_df, on='country')
In [14]:
# Snapshot of the most recent day in the data.
cleaned_latest = data[data['date'] == data['date'].max()]

# Country totals: confirmed cases are summed over provinces, while the population is a
# country-level attribute repeated on every row, so its mean is the population itself.
# Columns are selected with a list; tuple-style selection on a GroupBy is deprecated.
flg = (cleaned_latest
       .groupby('country')[['confirmed', 'population']]
       .agg({'confirmed': 'sum', 'population': 'mean'})
       .reset_index())

# Share of the population with a confirmed infection, in percent.
flg['infectionRate'] = round((flg['confirmed']/flg['population'])*100, 5)

# Ignore countries with 100 or fewer confirmed cases, then rank once by infection rate.
temp = flg[flg['confirmed']>100]
temp = temp.sort_values('infectionRate', ascending=False)

# Top 10, reversed so that the horizontal bar chart shows the highest rate at the top.
fig = px.bar(temp[:10][::-1],
             x = 'infectionRate', y = 'country',
             title='% of infected people by country', text='infectionRate', height=800, orientation='h',
             color_discrete_sequence=['red']
            )
fig.show()
It makes sense that the countries at the top have small populations (e.g. San Marino, Andorra, and Luxembourg); however, even there the infection percentage is still relatively low (<0.5%).
In [15]:
# Country totals per day. Confirmed cases are summed over provinces (the same aggregation
# as the latest-day bar chart) instead of taking the largest single province; population
# is a country-level value repeated on every row, so max() simply returns it.
# Columns are selected with a list; tuple-style selection on a GroupBy is deprecated.
formated_gdf = (data
                .groupby(['date', 'country'])[['confirmed', 'population']]
                .agg({'confirmed': 'sum', 'population': 'max'})
                .reset_index())

# The animation frames are labelled with a formatted date string.
formated_gdf['date'] = pd.to_datetime(formated_gdf['date'])
formated_gdf['date'] = formated_gdf['date'].dt.strftime('%m/%d/%Y')

# Share of the population with a confirmed infection, in percent.
formated_gdf['infectionRate'] = round((formated_gdf['confirmed']/formated_gdf['population'])*100, 8)

fig = px.scatter_geo(formated_gdf, locations="country", locationmode='country names',
                     color="infectionRate", size='infectionRate', hover_name="country",
                     range_color= [0, 0.2],
                     projection="natural earth", animation_frame="date",
                     title='COVID-19: Spread Over Time (Normalized by Country Population)', color_continuous_scale="portland")
fig.show()
In [16]:
# Snapshot of the most recent day in the data.
cleaned_latest = data[data['date'] == data['date'].max()]

# Country totals of every case type (summed over provinces). Columns are selected with a
# list; tuple-style selection on a GroupBy is deprecated.
flg = cleaned_latest.groupby('country')[['confirmed', 'deaths', 'recovered', 'active']].sum().reset_index()

# Deaths per 100 confirmed cases.
flg['mortalityRate'] = round((flg['deaths']/flg['confirmed'])*100, 2)

# Ignore countries with 100 or fewer confirmed cases, then rank once by mortality rate.
temp = flg[flg['confirmed']>100]
temp = temp.sort_values('mortalityRate', ascending=False)

# Top 10, reversed so that the horizontal bar chart shows the highest rate at the top.
fig = px.bar(temp[:10][::-1],
             x = 'mortalityRate', y = 'country',
             title='Deaths per 100 Confirmed Cases', text='mortalityRate', height=800, orientation='h',
             color_discrete_sequence=['darkred']
            )
fig.show()
In [17]:
# Country totals per day. Counts are summed over provinces (the same aggregation as the
# latest-day bar chart) instead of taking the largest single province.
# Columns are selected with a list; tuple-style selection on a GroupBy is deprecated.
formated_gdf = data.groupby(['date', 'country'])[['confirmed', 'deaths']].sum()
formated_gdf = formated_gdf.reset_index()

# The animation frames are labelled with a formatted date string.
formated_gdf['date'] = pd.to_datetime(formated_gdf['date'])
formated_gdf['date'] = formated_gdf['date'].dt.strftime('%m/%d/%Y')

# Deaths per 100 confirmed cases. Days without confirmed cases divide by zero and give
# NaN (0/0) or inf (x/0); both are mapped to 0 so they can be used as marker sizes.
formated_gdf['mortalityRate'] = round((formated_gdf['deaths']/formated_gdf['confirmed'])*100, 2)
plot_frame = formated_gdf.replace([np.inf, -np.inf], np.nan).fillna(0)

fig = px.scatter_geo(plot_frame, locations="country", locationmode='country names',
                     color="mortalityRate", size='mortalityRate', hover_name="country",
                     range_color= [0, 10],
                     projection="natural earth", animation_frame="date",
                     title='COVID-19: Mortality Rate in % by country', color_continuous_scale="portland")
fig.show()

ICU Beds per Country

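Beds per 1,000 people, taken from the World Bank `SH.MED.BEDS.ZS` series (hospital beds per 1,000 people, used here as a stand-in for ICU capacity). The US is low at 2.9 - not good.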
In [19]:
# Country labels used by the bed dataset, mapped to the labels used in the case data.
bed_country_aliases = {
    'United States': 'US',
    'Russian Federation': 'Russia',
    'Iran, Islamic Rep.': 'Iran',
    'Egypt, Arab Rep.': 'Egypt',
    'Venezuela, RB': 'Venezuela',
}

# NOTE(review): the SH.MED.BEDS.ZS code in the file name looks like the World Bank
# hospital-beds indicator rather than an ICU-specific one - confirm.
icu_df = pd.read_csv("./API_SH.MED.BEDS.ZS_DS2_en_csv_v2_887506.csv")
icu_df['Country Name'] = icu_df['Country Name'].replace(bed_country_aliases)

# The reverse adjustment for one label: the case data is renamed to match the bed dataset.
data['country'] = data['country'].replace('Czechia', 'Czech Republic')
In [20]:
# We wish to have the most recent values, thus we need to go through every year and extract the most recent one, if it exists.
icu_cleaned = pd.DataFrame()
icu_cleaned["country"] = icu_df["Country Name"]
icu_cleaned["icu"] = np.nan

for year in range(1960, 2020):
    year_df = icu_df[str(year)].dropna()
    icu_cleaned["icu"].loc[year_df.index] = year_df.values
In [21]:
# Attach the bed ratio to every case row. merge() defaults to an inner join, so countries
# without a matching label in icu_cleaned are dropped.
data = data.merge(icu_cleaned, on='country')
In [22]:
# Make sure the province column has no missing values, then work on a view without it.
data['state'] = data['state'].fillna('')
temp = data.drop(columns='state')

# Rows of the most recent day, reduced to one bed ratio per country.
on_latest_day = temp['date'] == temp['date'].max()
latest = temp[on_latest_day].reset_index()
latest_grouped = latest.groupby('country')['icu'].mean().reset_index()

# Ten highest ratios, reversed so that the horizontal bar chart shows the highest at the top.
top_ten_beds = latest_grouped.sort_values('icu', ascending=False)[:10][::-1]

fig = px.bar(top_ten_beds,
             x='icu', y='country',
             orientation='h',
             text='icu',
             color_discrete_sequence=['green'],
             title='Ratio of ICU Beds per 1000 People')
fig.show()
In [23]:
# World map of the bed ratio per country; the colour scale is clamped to the range 1-15.
fig = px.choropleth(
    latest_grouped,
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="icu",
    color_continuous_scale="algae",
    range_color=[1,15],
    title='Ratio of ICU beds per 1000 people',
)
fig.show()

Temperature Data

In [26]:
# Columns taken over from the weather file.
weather_columns = ["country", "province", "date", "humidity", "sunHour", "tempC", "windspeedKmph"]

df_temperature = pd.read_csv("./temperature_dataframe.csv")

# Use the same country labels as the case data.
df_temperature['country'] = df_temperature['country'].replace({'USA': 'US', 'UK': 'United Kingdom'})

# reset_index() without drop keeps the previous row labels as an extra 'index' column;
# 'province' is renamed so that it lines up with the 'state' column of the case data.
df_temperature = (
    df_temperature[weather_columns]
    .reset_index()
    .rename(columns={'province': 'state'})
)

# Same dtypes / missing-value convention as the case data, so the frames can be joined.
df_temperature["date"] = pd.to_datetime(df_temperature['date'])
df_temperature['state'] = df_temperature['state'].fillna('')


df_temperature.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16677 entries, 0 to 16676
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   index          16677 non-null  int64         
 1   country        16677 non-null  object        
 2   state          16677 non-null  object        
 3   date           16677 non-null  datetime64[ns]
 4   humidity       16500 non-null  float64       
 5   sunHour        16500 non-null  float64       
 6   tempC          16500 non-null  float64       
 7   windspeedKmph  16500 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(1), object(2)
memory usage: 1.0+ MB
In [27]:
# Join the weather readings onto the case rows; only rows with a reading for the same
# country, province and day survive the inner join. The result is saved for reuse.
weather_join_keys = ['country','date', 'state']
data = pd.merge(data, df_temperature, how='inner', on=weather_join_keys)
data.to_csv("countries_icu_temp.csv")
In [28]:
# Preview the first rows of the merged frame.
data.head()
Out[28]:
state country Lat Long date confirmed deaths recovered active population density fertility age urban_percentage icu index humidity sunHour tempC windspeedKmph
0 Thailand 15.0 101.0 2020-01-22 2.0 0.0 0.0 2.0 69799978 137 1.5 40.0 0.51 2.1 12118 55.0 8.7 36.0 11.0
1 Thailand 15.0 101.0 2020-01-23 3.0 0.0 0.0 3.0 69799978 137 1.5 40.0 0.51 2.1 12119 59.0 11.6 35.0 14.0
2 Thailand 15.0 101.0 2020-01-24 5.0 0.0 0.0 5.0 69799978 137 1.5 40.0 0.51 2.1 12120 60.0 11.6 35.0 16.0
3 Thailand 15.0 101.0 2020-01-25 7.0 0.0 0.0 7.0 69799978 137 1.5 40.0 0.51 2.1 12121 63.0 11.6 32.0 22.0
4 Thailand 15.0 101.0 2020-01-26 8.0 0.0 2.0 6.0 69799978 137 1.5 40.0 0.51 2.1 12122 54.0 11.6 34.0 15.0
In [29]:
# Work on an independent copy: with a plain assignment train_data would be just another
# name for `data`, and every column added for modelling would also show up in `data`.
train_data = data.copy()
print(train_data.shape)
train_data.head()
(15177, 20)
Out[29]:
state country Lat Long date confirmed deaths recovered active population density fertility age urban_percentage icu index humidity sunHour tempC windspeedKmph
0 Thailand 15.0 101.0 2020-01-22 2.0 0.0 0.0 2.0 69799978 137 1.5 40.0 0.51 2.1 12118 55.0 8.7 36.0 11.0
1 Thailand 15.0 101.0 2020-01-23 3.0 0.0 0.0 3.0 69799978 137 1.5 40.0 0.51 2.1 12119 59.0 11.6 35.0 14.0
2 Thailand 15.0 101.0 2020-01-24 5.0 0.0 0.0 5.0 69799978 137 1.5 40.0 0.51 2.1 12120 60.0 11.6 35.0 16.0
3 Thailand 15.0 101.0 2020-01-25 7.0 0.0 0.0 7.0 69799978 137 1.5 40.0 0.51 2.1 12121 63.0 11.6 32.0 22.0
4 Thailand 15.0 101.0 2020-01-26 8.0 0.0 2.0 6.0 69799978 137 1.5 40.0 0.51 2.1 12122 54.0 11.6 34.0 15.0

Regression Model

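Using a tree-based regressor (the cells below fit scikit-learn's `DecisionTreeRegressor`; a Random Forest would be an ensemble of such trees), we use each country's input variables to predict the number of infections and deaths (the targets) for each daily record. We also extract the relative feature importance.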
In [30]:
# Minimum infection rate (in percent of the population) a row needs in order to be kept.
threshold = 0

# assign() returns a new frame, so the helper column is not written into whichever frame
# train_data was created from. Rows with an undefined rate (NaN) fail the comparison
# below and are dropped as well.
train_data = train_data.assign(
    infectionRate=((train_data['confirmed'] / train_data['population']) * 100).round(5)
)
train_data = train_data[train_data['infectionRate'] >= threshold]
print(train_data.shape)
(15177, 21)
In [31]:
# Identifier, bookkeeping and derived columns that must not be used as model inputs.
non_feature_columns = [
    "country",
    "active",
    "recovered",
    "infectionRate",
    "state",
    "Lat",
    "Long",
    "date",
    "index",
]
# The two quantities the models predict.
target_columns = ["confirmed", "deaths"]

# Remove the non-feature columns, then every row that still has a missing value.
train_data = train_data.drop(columns=non_feature_columns).dropna()

y = train_data[target_columns]
X = train_data.drop(columns=target_columns)

display(X.head())
print(X.shape, y.shape)
population density fertility age urban_percentage icu humidity sunHour tempC windspeedKmph
0 69799978 137 1.5 40.0 0.51 2.1 55.0 8.7 36.0 11.0
1 69799978 137 1.5 40.0 0.51 2.1 59.0 11.6 35.0 14.0
2 69799978 137 1.5 40.0 0.51 2.1 60.0 11.6 35.0 16.0
3 69799978 137 1.5 40.0 0.51 2.1 63.0 11.6 32.0 22.0
4 69799978 137 1.5 40.0 0.51 2.1 54.0 11.6 34.0 15.0
(14520, 10) (14520, 2)
In [32]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation of all remaining columns (features and targets), drawn as an
# annotated heatmap on an explicitly created 20x10 axes.
cm = train_data.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20,10))
sns.heatmap(cm, annot=True, ax=heatmap_ax)
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d2f88aa2e8>

Train and Evaluate Model (Decision Tree)

In [33]:
from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
In [34]:
# Split into training and evaluation data:
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_log_error, make_scorer
def rmsle(y_true, y_pred):
    """
    Root Mean Squared Logarithmic Error between targets and predictions.
    params:
        y_true: numpy array of ground truth
        y_pred: numpy array of predictions
    returns:
        the square root of sklearn's mean_squared_log_error for the two arrays
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

# Wrap rmsle so it can be passed as `scoring=` to cross_val_score.
# NOTE(review): make_scorer defaults to greater_is_better=True although RMSLE is an error
# (lower is better). That is harmless while the scores are only printed, but it would rank
# models the wrong way round if this scorer were ever used for model selection.
rmsle_scorer = make_scorer(rmsle)

# Hold out 20% of the rows (shuffled, fixed seed) for validation.
X_train, X_val, y_train, y_val = tts(X, y, test_size= 0.2, random_state=42, shuffle=True)
In [35]:
# Single decision tree for the confirmed-case target.
# NOTE(review): newer scikit-learn releases spell this criterion "absolute_error" - confirm
# against the installed version before upgrading.
model_infected = DecisionTreeRegressor(criterion="mae", random_state=42)

# 5-fold cross-validation on the training split, scored with RMSLE.
scores = cross_val_score(model_infected,
                         X_train,
                         y_train["confirmed"],
                         scoring=rmsle_scorer,
                         cv=5)
cv_mean, cv_std = scores.mean(), scores.std()
print("Cross Validation of Confirmed Cases: Mean = {}, std = {}".format(cv_mean, cv_std))

# Fit on the whole training split and score the held-out validation rows.
model_infected.fit(X_train, y_train["confirmed"])
val_predictions = model_infected.predict(X_val)
result_infected = rmsle(y_val["confirmed"], val_predictions)
print("Validation Infected set RMSLE: {}".format(result_infected))
Cross Validation of Confirmed Cases: Mean = 1.709191346175162, std = 0.08849664321754487
Validation Infected set RMSLE: 1.8249734268769162
In [36]:
# Single decision tree for the deaths target.
# NOTE(review): newer scikit-learn releases spell this criterion "absolute_error" - confirm
# against the installed version before upgrading.
model_deaths = DecisionTreeRegressor(criterion="mae", random_state=42)

# 5-fold cross-validation on the training split, scored with RMSLE.
scores = cross_val_score(model_deaths,
                         X_train,
                         y_train["deaths"],
                         scoring=rmsle_scorer,
                         cv=5)
cv_mean, cv_std = scores.mean(), scores.std()
print("Cross Validation of Fatal Cases: Mean = {}, std = {}".format(cv_mean, cv_std))

# Fit on the whole training split and score the held-out validation rows.
model_deaths.fit(X_train, y_train["deaths"])
val_predictions = model_deaths.predict(X_val)
result_deaths = rmsle(y_val["deaths"], val_predictions)
print("Validation Death set RMSLE: {}".format(result_deaths))
Cross Validation of Fatal Cases: Mean = 0.7289154951349002, std = 0.06238107924858306
Validation Death set RMSLE: 0.7328272681602619
In [44]:
# Final Evaluation: unweighted mean of the two validation RMSLE values
print("Final Validation score: {}".format(np.mean([result_infected, result_deaths])))
Final Validation score: 1.2789003475185892

Extract Features for Infections

In [38]:
# Refit both trees on all rows (training + validation) before reading their feature
# importances. fit() returns the estimator itself, so no re-binding of the names is needed.
for fitted_model, target_column in ((model_infected, "confirmed"), (model_deaths, "deaths")):
    fitted_model.fit(X, y[target_column])
In [39]:
def show_feature_importance(forest, feature_names=None):
    """
    Prints the features of a fitted tree-based model ranked by importance
    (most important first) and plots the ranking as a bar chart.
    params:
        forest: fitted estimator exposing `feature_importances_`
        feature_names: optional sequence of feature labels, in the column order the
            estimator was fitted on. When omitted, the estimator's own
            `feature_names_in_` is used if it has one, otherwise the columns of the
            notebook-level feature frame `X`.
    raises:
        ValueError: if the number of labels differs from the number of importances
    """
    importances = forest.feature_importances_

    # Resolve the labels: explicit argument -> names stored on the estimator -> global X.
    if feature_names is None:
        feature_names = getattr(forest, "feature_names_in_", None)
    if feature_names is None:
        feature_names = X.columns
    feature_names = np.asarray(feature_names)

    n_features = len(importances)
    if len(feature_names) != n_features:
        raise ValueError(
            "Got {} feature names for {} importances".format(len(feature_names), n_features)
        )

    # Feature positions ordered from most to least important.
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")

    for rank, position in enumerate(indices, start=1):
        print("{}, Feature: {}, Importance: {}".format(rank, feature_names[position], importances[position]))

    # Plot the feature importances of the forest
    bar_positions = range(n_features)
    plt.figure(figsize=(20,10))
    plt.title("Feature importances")
    plt.bar(bar_positions, importances[indices], color="r", align="center")
    plt.xticks(bar_positions, feature_names[indices], rotation='vertical')
    plt.xlim([-1, n_features])
    plt.show()
In [40]:
# Feature ranking of the tree fitted on confirmed cases.
show_feature_importance(model_infected)
Feature ranking:
1, Feature: tempC, Importance: 0.23468829939161234
2, Feature: population, Importance: 0.20000156935776625
3, Feature: sunHour, Importance: 0.17830127484162486
4, Feature: humidity, Importance: 0.17314201118428982
5, Feature: age, Importance: 0.1063226808815607
6, Feature: windspeedKmph, Importance: 0.09353764627722243
7, Feature: icu, Importance: 0.006484063171881324
8, Feature: urban_percentage, Importance: 0.0032524939710505785
9, Feature: fertility, Importance: 0.002900696271729065
10, Feature: density, Importance: 0.0013692646512625463
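We see that temperature, population, hours of sunlight, humidity, and age are the most important features for predicting the number of confirmed cases - surprisingly, much more so than ICU beds, the percentage of the population that is urban, or population density. Note that feature importance only measures how much the tree relies on a feature; it does not tell us whether the relationship is positive or negative.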
In [41]:
# Feature ranking of the tree fitted on deaths.
show_feature_importance(model_deaths)
Feature ranking:
1, Feature: tempC, Importance: 0.3251088534107426
2, Feature: windspeedKmph, Importance: 0.22931785195935728
3, Feature: population, Importance: 0.2133526850507975
4, Feature: sunHour, Importance: 0.15239477503628723
5, Feature: humidity, Importance: 0.06240928882438295
6, Feature: icu, Importance: 0.010643444605708718
7, Feature: density, Importance: 0.006773101112723731
8, Feature: urban_percentage, Importance: 0.0
9, Feature: age, Importance: 0.0
10, Feature: fertility, Importance: 0.0