import gc
import os
from pathlib import Path
import random
import sys
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.display import display, HTML
# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.templates.default = "plotly_dark"
# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
# --- setup ---
# Use the fully qualified option name. The abbreviated pattern 'max_columns'
# is matched as a regex against every registered option and becomes ambiguous
# (OptionError) once more than one option name contains it, e.g.
# 'styler.render.max_columns'.
pd.set_option('display.max_columns', 50)

# Load the cleaned case counts; the 'Date' column is parsed to datetime on read.
cleaned_data = pd.read_csv('./covid_19_clean_complete.csv', parse_dates=['Date'])
# Normalise column names. rename() ignores keys that are not present, so this
# mapping tolerates CSV variants that lack some of these columns.
cleaned_data.rename(columns={'ObservationDate': 'date',
                             'Province/State': 'state',
                             'Country/Region': 'country',
                             'Last Update': 'last_updated',
                             'Confirmed': 'confirmed',
                             'Deaths': 'deaths',
                             'Recovered': 'recovered'
                             }, inplace=True)

# cases
cases = ['confirmed', 'deaths', 'recovered', 'active']

# filling missing values
# The three reported counts are filled BEFORE 'active' is derived; otherwise a
# single missing component turns 'active' into NaN, which would then be
# replaced by 0 instead of the value implied by the remaining counts.
cleaned_data[['state']] = cleaned_data[['state']].fillna('')
cleaned_data[cases[:-1]] = cleaned_data[cases[:-1]].fillna(0)

# Active Case = confirmed - deaths - recovered
cleaned_data['active'] = cleaned_data['confirmed'] - cleaned_data['deaths'] - cleaned_data['recovered']

# replacing Mainland china with just China
cleaned_data['country'] = cleaned_data['country'].replace('Mainland China', 'China')

cleaned_data.rename(columns={'Date': 'date'}, inplace=True)

# `data` is the working name used from here on (same object, not a copy).
data = cleaned_data

display(data.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() (that would additionally show "None").
data.info()

# Check if the data is updated
print("External Data")
print(f"Earliest Entry: {data['date'].min()}")
print(f"Last Entry: {data['date'].max()}")
print(f"Total Days: {data['date'].max() - data['date'].min()}")
def p2f(x):
    """
    Convert urban percentage to float.

    Strips any leading/trailing ``%`` characters from ``x`` and divides the
    numeric value by 100, so ``"57 %"`` becomes ``0.57``.

    Returns ``np.nan`` when ``x`` is not a string-like object or its content
    is not a number (for example ``"N.A."``).
    """
    try:
        return float(x.strip('%')) / 100
    except (AttributeError, TypeError, ValueError):
        # Only conversion failures mean "missing"; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        return np.nan
def age2int(x):
    """
    Convert Age to integer.

    Returns ``int(x)`` when the conversion succeeds and ``np.nan`` (a float)
    when ``x`` cannot be interpreted as an integer, e.g. ``"N.A."`` or
    ``None``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "missing"; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        return np.nan
def fert2float(x):
    """
    Convert Fertility Rate to float.

    Returns ``float(x)`` when the conversion succeeds and ``np.nan`` when
    ``x`` cannot be interpreted as a number, e.g. ``"N.A."`` or ``None``.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # Only conversion failures mean "missing"; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        return np.nan
# Country-level demographics. Three text-formatted columns are parsed while
# the CSV is read, the headers are shortened, 'United States' is respelled as
# 'US', and only the columns used for the join are kept.
countries_df = (
    pd.read_csv(
        "./population_by_country_2020.csv",
        converters={
            'Med. Age': age2int,
            'Fert. Rate': fert2float,
            'Urban Pop %': p2f,
        },
    )
    .rename(columns={
        'Country (or dependency)': 'country',
        'Population (2020)': 'population',
        'Density (P/Km²)': 'density',
        'Fert. Rate': 'fertility',
        'Med. Age': "age",
        'Urban Pop %': 'urban_percentage',
    })
    .assign(country=lambda frame: frame['country'].replace('United States', 'US'))
    [["country", "population", "density", "fertility", "age", "urban_percentage"]]
)
countries_df.head()

# Default (inner) join: rows whose country has no demographic entry are dropped.
data = data.merge(countries_df, on='country')
# Weather observations per country / province / date.
df_temperature = pd.read_csv("./temperature_dataframe.csv")
# Harmonise country spellings before joining on 'country'.
df_temperature['country'] = df_temperature['country'].replace('USA', 'US')
df_temperature['country'] = df_temperature['country'].replace('UK', 'United Kingdom')
# NOTE(review): reset_index() without drop=True adds an 'index' column that the
# merge below carries into `data`; kept as-is in case later code relies on it.
df_temperature = df_temperature[["country", "province", "date", "humidity", "sunHour", "tempC", "windspeedKmph"]].reset_index()
df_temperature.rename(columns={'province': 'state'}, inplace=True)
# Match the dtype and the empty-string convention of the join keys in `data`.
df_temperature["date"] = pd.to_datetime(df_temperature['date'])
df_temperature['state'] = df_temperature['state'].fillna('')
# df_temperature.info()

# Inner join: only (country, date, state) combinations with weather data remain.
data = data.merge(df_temperature, on=['country', 'date', 'state'], how='inner')

# Deaths per confirmed case. A zero denominator is mapped to NaN first so rows
# without confirmed cases give NaN rather than +/-inf, which would otherwise
# distort describe() and any later aggregation.
data['mortality_rate'] = data['deaths'] / data['confirmed'].replace(0, np.nan)
data.head()
data.describe()
# Per-date, per-country aggregates. Dates are formatted as '%m/%d/%Y' strings
# in all three frames so they share an identical 'date' key.

# Mean temperature and humidity. Columns are selected with a list: the
# tuple-style selection `[...]['tempC', 'humidity']` is no longer accepted by
# current pandas.
temp_gdf = data.groupby(['date', 'country'])[['tempC', 'humidity']].mean()
temp_gdf = temp_gdf.reset_index()
temp_gdf['date'] = pd.to_datetime(temp_gdf['date'])
temp_gdf['date'] = temp_gdf['date'].dt.strftime('%m/%d/%Y')
temp_gdf['tempC_pos'] = temp_gdf['tempC'] - temp_gdf['tempC'].min()  # To use it with size

# Maximum wind speed. The date column is derived from wind_gdf itself rather
# than from temp_gdf, so it does not depend on the two frames happening to
# have identical row order.
wind_gdf = data.groupby(['date', 'country'])['windspeedKmph'].max()
wind_gdf = wind_gdf.reset_index()
wind_gdf['date'] = pd.to_datetime(wind_gdf['date'])
wind_gdf['date'] = wind_gdf['date'].dt.strftime('%m/%d/%Y')

# Total confirmed cases and deaths (summed over states/provinces).
target_gdf = data.groupby(['date', 'country'])[['confirmed', 'deaths']].sum()
target_gdf = target_gdf.reset_index()
target_gdf['date'] = pd.to_datetime(target_gdf['date'])
target_gdf['date'] = target_gdf['date'].dt.strftime('%m/%d/%Y')
# Weather-only world maps, animated over the 'date' column.
# Keyword arguments shared by both figures.
_weather_map_kwargs = dict(
    locations="country",
    locationmode='country names',
    hover_name="country",
    projection="natural earth",
    animation_frame="date",
    color_continuous_scale="portland",
)

# Temperature: colour is 'tempC', marker size comes from 'tempC_pos'.
fig = px.scatter_geo(
    temp_gdf.fillna(0),
    color="tempC",
    size='tempC_pos',
    range_color=[-20, 45],
    title='Temperature by country',
    **_weather_map_kwargs,
)
fig.show()

# Humidity: the same column drives both colour and marker size.
fig = px.scatter_geo(
    temp_gdf.fillna(0),
    color="humidity",
    size='humidity',
    range_color=[0, 100],
    title='COVID-19: Humidity by country',
    **_weather_map_kwargs,
)
fig.show()
# Combine case totals with the weather aggregates per (date, country).
gdf = pd.merge(target_gdf, temp_gdf, on=['date', 'country'])
# log1p keeps zero counts finite while compressing the range of the counts.
gdf['confirmed_log1p'] = np.log1p(gdf['confirmed'])
gdf['deaths_log1p'] = np.log1p(gdf['deaths'])
# Deaths per confirmed case. A zero denominator is mapped to NaN first so the
# ratio is NaN rather than inf; inf would survive a later fillna(0) and is not
# a usable marker size.
gdf['mortality_rate'] = gdf['deaths'] / gdf['confirmed'].replace(0, np.nan)
gdf = pd.merge(gdf, wind_gdf, on=['date', 'country'])
# Case figures against weather: one animated world map per pairing, shown in
# the order listed. Each entry is (colour column, size column, colour range,
# title).
_case_weather_maps = [
    ("tempC", 'confirmed', [-20, 45],
     'COVID-19: Confirmed VS Temperature by country'),
    ("tempC", 'confirmed_log1p', [-20, 45],
     'COVID-19: log1p(confirmed) VS Temperature by country'),
    ("tempC", 'deaths', [-20, 45],
     'COVID-19: deaths VS temperature by country'),
    ("tempC", 'mortality_rate', [-20, 45],
     'COVID-19: Mortality rate VS Temperature by country'),
    ("humidity", 'confirmed_log1p', [0, 100],
     'COVID-19: log1p(confirmed) VS Humidity by country'),
    ("humidity", 'mortality_rate', [0, 100],
     'COVID-19: Mortality rate VS humidity by country'),
    ("windspeedKmph", 'confirmed_log1p', [0, 40],
     'COVID-19: log1p(Confirmed) VS Wind speed by country'),
]

for _color_col, _size_col, _color_range, _title in _case_weather_maps:
    # `fig` is rebound on every pass, so it ends up holding the last map.
    fig = px.scatter_geo(
        gdf.fillna(0),
        locations="country",
        locationmode='country names',
        color=_color_col,
        size=_size_col,
        hover_name="country",
        range_color=_color_range,
        projection="natural earth",
        animation_frame="date",
        title=_title,
        color_continuous_scale="portland",
    )
    fig.show()