COVID-19: Effect of temperature/humidity with visualization¶

import gc
import os
from pathlib import Path
import random
import sys

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import scipy as sp


import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.templates.default = "plotly_dark"

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# --- setup ---
# Use the fully-qualified option key. The bare pattern 'max_columns' is
# resolved by regex search over all option names; on newer pandas it also
# matches 'styler.render.max_columns' and set_option raises OptionError.
pd.set_option('display.max_columns', 50)

# Load the case time series; the 'Date' column is parsed to datetime on read.
cleaned_data = pd.read_csv('./covid_19_clean_complete.csv', parse_dates=['Date'])

# Normalise the column names in a single pass. rename() skips mapping keys
# that are not present in the frame, so the extra aliases are harmless.
cleaned_data = cleaned_data.rename(columns={
    'ObservationDate': 'date',
    'Date': 'date',
    'Province/State': 'state',
    'Country/Region': 'country',
    'Last Update': 'last_updated',
    'Confirmed': 'confirmed',
    'Deaths': 'deaths',
    'Recovered': 'recovered',
})

# cases
cases = ['confirmed', 'deaths', 'recovered', 'active']

# Active Case = confirmed - deaths - recovered
cleaned_data['active'] = (
    cleaned_data['confirmed'] - cleaned_data['deaths'] - cleaned_data['recovered']
)

# replacing Mainland china with just China
cleaned_data['country'] = cleaned_data['country'].replace('Mainland China', 'China')

# filling missing values: empty string for the province, 0 for every count
cleaned_data['state'] = cleaned_data['state'].fillna('')
cleaned_data[cases] = cleaned_data[cases].fillna(0)

# `data` is the working name used from here on (same object, not a copy).
data = cleaned_data

display(data.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only appended a stray "None" to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19220 entries, 0 to 19219
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   state      19220 non-null  object        
 1   country    19220 non-null  object        
 2   Lat        19220 non-null  float64       
 3   Long       19220 non-null  float64       
 4   date       19220 non-null  datetime64[ns]
 5   confirmed  19220 non-null  float64       
 6   deaths     19220 non-null  float64       
 7   recovered  19220 non-null  float64       
 8   active     19220 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 1.3+ MB

None

# Check if the data is updated: report the first and last date in `data`
# and the span between them, one item per output line.
print(
    "External Data",
    f"Earliest Entry: {data['date'].min()}",
    f"Last Entry:     {data['date'].max()}",
    f"Total Days:     {data['date'].max() - data['date'].min()}",
    sep="\n",
)

External Data
Earliest Entry: 2020-01-22 00:00:00
Last Entry:     2020-03-23 00:00:00
Total Days:     61 days 00:00:00

def p2f(x):
    """
    Convert urban percentage to float

    Strips leading/trailing ``%`` characters from ``x``, parses the rest as a
    float and divides it by 100 (e.g. ``'61 %'`` -> ``0.61``).

    Returns ``np.nan`` when ``x`` is not a string or its content is not
    numeric. Only conversion errors are caught, so KeyboardInterrupt and
    SystemExit are no longer swallowed.
    """
    try:
        return float(x.strip('%')) / 100
    except (AttributeError, TypeError, ValueError):
        # AttributeError: x has no .strip (e.g. None / NaN)
        # TypeError:      .strip rejected the argument (e.g. bytes input)
        # ValueError:     remaining text is not a number (e.g. 'N.A.')
        return np.nan

def age2int(x):
    """
    Convert Age to integer

    Returns ``int(x)``. When ``x`` cannot be converted (non-numeric text such
    as ``'N.A.'``, ``None``, or an infinite float) ``np.nan`` is returned
    instead, so the result is a float in that case.

    Only conversion errors are caught, so KeyboardInterrupt and SystemExit
    are no longer swallowed.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

def fert2float(x):
    """
    Convert Fertility Rate to float

    Returns ``float(x)``, or ``np.nan`` when ``x`` cannot be converted
    (non-numeric text such as ``'N.A.'``, ``None``, or an integer too large
    for a float).

    Only conversion errors are caught, so KeyboardInterrupt and SystemExit
    are no longer swallowed.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan


# Country-level population data. The converters parse the percentage,
# fertility and median-age text columns while reading, yielding NaN for
# values that cannot be parsed.
countries_df = pd.read_csv(
    "./population_by_country_2020.csv",
    converters={
        'Urban Pop %': p2f,
        'Fert. Rate': fert2float,
        'Med. Age': age2int,
    },
)

# Short, lowercase column names.
countries_df = countries_df.rename(
    columns={
        'Country (or dependency)': 'country',
        'Population (2020)': 'population',
        'Density (P/Km²)': 'density',
        'Fert. Rate': 'fertility',
        'Med. Age': "age",
        'Urban Pop %': 'urban_percentage',
    }
)

# Relabel the United States as 'US' (presumably to match the naming in the
# case data) and keep only the columns used downstream.
countries_df['country'] = countries_df['country'].replace({'United States': 'US'})
countries_df = countries_df[
    ["country", "population", "density", "fertility", "age", "urban_percentage"]
]

countries_df.head()

# Default (inner) join on country: rows without a match on either side are dropped.
data = data.merge(countries_df, on='country')

Adding Temperature Data¶

# Weather observations per country / province / date.
df_temperature = pd.read_csv("./temperature_dataframe.csv")

# Relabel both countries in one replace call.
df_temperature['country'] = df_temperature['country'].replace(
    {'USA': 'US', 'UK': 'United Kingdom'}
)

# Keep the weather columns and rename province -> state so the merge keys
# line up. NOTE: reset_index() is called without drop=True, so the previous
# row index is kept as an extra 'index' column.
df_temperature = (
    df_temperature[
        ["country", "province", "date", "humidity", "sunHour", "tempC", "windspeedKmph"]
    ]
    .reset_index()
    .rename(columns={'province': 'state'})
)
df_temperature["date"] = pd.to_datetime(df_temperature['date'])
df_temperature['state'] = df_temperature['state'].fillna('')

# Inner join: only (country, date, state) combinations present in both frames survive.
data = pd.merge(data, df_temperature, on=['country', 'date', 'state'], how='inner')

# deaths / confirmed; rows with zero confirmed cases yield NaN (0 / 0).
data['mortality_rate'] = data['deaths'].div(data['confirmed'])

data.head()

data.describe()

Temperature by Country¶

# Mean temperature and humidity per (date, country).
# The columns are selected with a list: tuple-style selection
# (['tempC', 'humidity'] without the inner brackets) triggers a FutureWarning
# on older pandas and raises on pandas 2.x.
temp_gdf = data.groupby(['date', 'country'])[['tempC', 'humidity']].mean()
temp_gdf = temp_gdf.reset_index()
# Dates become 'MM/DD/YYYY' strings for use as animation frame labels.
temp_gdf['date'] = pd.to_datetime(temp_gdf['date'])
temp_gdf['date'] = temp_gdf['date'].dt.strftime('%m/%d/%Y')

# Shift temperatures so the minimum is 0: marker sizes must be non-negative.
temp_gdf['tempC_pos'] = temp_gdf['tempC'] - temp_gdf['tempC'].min()  # To use it with size

# Maximum wind speed per (date, country).
wind_gdf = data.groupby(['date', 'country'])['windspeedKmph'].max()
wind_gdf = wind_gdf.reset_index()
# Format wind_gdf's own dates. The original read temp_gdf['date'] here, which
# only gave the right result because both frames share the same row order.
wind_gdf['date'] = pd.to_datetime(wind_gdf['date'])
wind_gdf['date'] = wind_gdf['date'].dt.strftime('%m/%d/%Y')

c:\users\shane\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.

# Total confirmed cases and deaths per (date, country).
# The columns are selected with a list: tuple-style selection triggers a
# FutureWarning on older pandas and raises on pandas 2.x.
target_gdf = data.groupby(['date', 'country'])[['confirmed', 'deaths']].sum()
target_gdf = target_gdf.reset_index()
# Dates become 'MM/DD/YYYY' strings, matching the format used for the
# animation frames.
target_gdf['date'] = pd.to_datetime(target_gdf['date'])
target_gdf['date'] = target_gdf['date'].dt.strftime('%m/%d/%Y')

c:\users\shane\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.

# Animated world map (one frame per date): marker colour is tempC,
# marker size is tempC_pos. Missing values are plotted as 0.
fig = px.scatter_geo(
    temp_gdf.fillna(0),
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="tempC",
    size='tempC_pos',
    range_color=[-20, 45],
    color_continuous_scale="portland",
    animation_frame="date",
    projection="natural earth",
    title='Temperature by country',
)
# Optional: fig.update(layout_coloraxis_showscale=False) hides the colour bar.
fig.show()

Makes sense, it's colder away from the equator, etc etc.¶

# Animated world map (one frame per date): humidity drives both marker
# colour and marker size. Missing values are plotted as 0.
fig = px.scatter_geo(
    temp_gdf.fillna(0),
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="humidity",
    size='humidity',
    range_color=[0, 100],
    color_continuous_scale="portland",
    animation_frame="date",
    projection="natural earth",
    title='COVID-19: Humidity by country',
)
# Optional: fig.update(layout_coloraxis_showscale=False) hides the colour bar.
fig.show()

There seems to be no real correlation between humidity and location like there is with temperature. We do see relatively low humidity in China, while humidity is consistently high in Europe.¶

# Combine the per-(date, country) case totals with the weather averages.
gdf = target_gdf.merge(temp_gdf, on=['date', 'country'])

# log1p keeps zero counts at 0 while compressing the large ones.
gdf['confirmed_log1p'] = np.log1p(gdf['confirmed'])
gdf['deaths_log1p'] = np.log1p(gdf['deaths'])

# deaths / confirmed; rows with zero confirmed cases yield NaN (0 / 0).
gdf['mortality_rate'] = gdf['deaths'].div(gdf['confirmed'])

# Add the per-(date, country) maximum wind speed.
gdf = gdf.merge(wind_gdf, on=['date', 'country'])

Weather-correlation with COVID19 spread¶

The size of the circle represents the # of confirmed cases of those infected with coronavirus, the color represents the temperature.¶

We see that the coronavirus outbreak started in China while the temperature was cold, but I cannot see any correlation between higher temperatures and a slowdown in the infection rate.¶

We also see that the coronavirus spread in Europe while the temperature was moderate, around 20C.¶

This is contrary to my initial hypothesis that high temperature is the major factor in coronavirus spread¶

# Animated world map (one frame per date): marker colour is tempC,
# marker size is the confirmed count. Missing values are plotted as 0.
fig = px.scatter_geo(
    gdf.fillna(0),
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="tempC",
    size='confirmed',
    range_color=[-20, 45],
    color_continuous_scale="portland",
    animation_frame="date",
    projection="natural earth",
    title='COVID-19: Confirmed VS Temperature by country',
)
# Optional: fig.update(layout_coloraxis_showscale=False) hides the colour bar.
fig.show()

Changing the visualization: circle size now uses a logarithmic scale to show how the coronavirus spreads everywhere (not just in the major countries). The purpose is to show that it still spreads worldwide despite high temperatures.¶

# Animated world map (one frame per date): marker colour is tempC,
# marker size is log1p(confirmed). Missing values are plotted as 0.
fig = px.scatter_geo(
    gdf.fillna(0),
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="tempC",
    size='confirmed_log1p',
    range_color=[-20, 45],
    color_continuous_scale="portland",
    animation_frame="date",
    projection="natural earth",
    title='COVID-19: log1p(confirmed) VS Temperature by country',
)
# Optional: fig.update(layout_coloraxis_showscale=False) hides the colour bar.
fig.show()

Let's look at deaths vs. temperature. Deaths are high in China, Europe, the US, and Iran, which all lie in temperate regions. These are also regions with high population; I believe about 90% of the world's population lives within this range of latitudes.¶

# Animated world map (one frame per date): marker colour is tempC,
# marker size is the death count. Missing values are plotted as 0.
fig = px.scatter_geo(
    gdf.fillna(0),
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="tempC",
    size='deaths',
    range_color=[-20, 45],
    color_continuous_scale="portland",
    animation_frame="date",
    projection="natural earth",
    title='COVID-19: deaths VS temperature by country',
)
# Optional: fig.update(layout_coloraxis_showscale=False) hides the colour bar.
fig.show()

Let's look at mortality rate instead of total deaths. It does not seem to correlate much with either region or temperature. Generally, the mortality rate is high in the early stage (most likely because few tests had been carried out), but countries seem to converge to around 3%.¶

# Animated world map (one frame per date): marker colour is tempC,
# marker size is the mortality rate. Missing values (including the NaN
# mortality rate of rows without confirmed cases) are plotted as 0.
fig = px.scatter_geo(
    gdf.fillna(0),
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="tempC",
    size='mortality_rate',
    range_color=[-20, 45],
    color_continuous_scale="portland",
    animation_frame="date",
    projection="natural earth",
    title='COVID-19: Mortality rate VS Temperature by country',
)
# Optional: fig.update(layout_coloraxis_showscale=False) hides the colour bar.
fig.show()

Humidity¶

Corona spread in China with low humidity, and Europe where humidity was relatively high. Unsure of this correlation at this point.¶

# Animated world map (one frame per date): marker colour is humidity,
# marker size is log1p(confirmed). Missing values are plotted as 0.
fig = px.scatter_geo(
    gdf.fillna(0),
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="humidity",
    size='confirmed_log1p',
    range_color=[0, 100],
    color_continuous_scale="portland",
    animation_frame="date",
    projection="natural earth",
    title='COVID-19: log1p(confirmed) VS Humidity by country',
)
# Optional: fig.update(layout_coloraxis_showscale=False) hides the colour bar.
fig.show()

Same thing with mortality rate.¶

# Animated world map (one frame per date): marker colour is humidity,
# marker size is the mortality rate. Missing values are plotted as 0.
fig = px.scatter_geo(
    gdf.fillna(0),
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="humidity",
    size='mortality_rate',
    range_color=[0, 100],
    color_continuous_scale="portland",
    animation_frame="date",
    projection="natural earth",
    title='COVID-19: Mortality rate VS humidity by country',
)
# Optional: fig.update(layout_coloraxis_showscale=False) hides the colour bar.
fig.show()

Windspeed¶

My hypothesis is that greater wind speed enables the virus to travel farther and infect more people within a shorter amount of time, i.e. before it dies. Wind speed seems to be relatively high in Europe - maybe this can offset the effects of higher humidity/temperature there?¶

# Animated world map (one frame per date): marker colour is windspeedKmph,
# marker size is log1p(confirmed). Missing values are plotted as 0.
fig = px.scatter_geo(
    gdf.fillna(0),
    locations="country",
    locationmode='country names',
    hover_name="country",
    color="windspeedKmph",
    size='confirmed_log1p',
    range_color=[0, 40],
    color_continuous_scale="portland",
    animation_frame="date",
    projection="natural earth",
    title='COVID-19: log1p(Confirmed) VS Wind speed by country',
)
# Optional: fig.update(layout_coloraxis_showscale=False) hides the colour bar.
fig.show()

Overall, I thought the effect of Temperature, Humidity, and Windspeed was damning. Now, I am not so sure. Further work must be done to see if the effects of windspeed make up for the negative effects of temperature and humidity and vice versa. This is obviously a work in progress.¶

	country	Lat	Long	date	confirmed	active
0	Thailand	15.0000	101.0000	2020-01-22	2.0	2.0
1	Japan	36.0000	138.0000	2020-01-22	2.0	2.0
2	Singapore	1.2833	103.8333	2020-01-22	0.0	0.0
3	Nepal	28.1667	84.2500	2020-01-22	0.0	0.0
4	Malaysia	2.5000	112.5000	2020-01-22	0.0	0.0

	country	population	density	fertility	age	urban_percentage
0	China	1439323776	153	1.7	38.0	0.61
1	India	1380004385	464	2.2	28.0	0.35
2	US	331002651	36	1.8	38.0	0.83
3	Indonesia	273523615	151	2.3	30.0	0.56
4	Pakistan	220892340	287	3.6	23.0	0.35

	country	Lat	Long	date	confirmed	recovered	active	population	density	fertility	age	urban_percentage	index	humidity	sunHour	tempC	windspeedKmph
0	Thailand	15.0	101.0	2020-01-22	2.0	0.0	2.0	69799978	137	1.5	40.0	0.51	12118	55.0	8.7	36.0	11.0
1	Thailand	15.0	101.0	2020-01-23	3.0	0.0	3.0	69799978	137	1.5	40.0	0.51	12119	59.0	11.6	35.0	14.0
2	Thailand	15.0	101.0	2020-01-24	5.0	0.0	5.0	69799978	137	1.5	40.0	0.51	12120	60.0	11.6	35.0	16.0
3	Thailand	15.0	101.0	2020-01-25	7.0	0.0	7.0	69799978	137	1.5	40.0	0.51	12121	63.0	11.6	32.0	22.0
4	Thailand	15.0	101.0	2020-01-26	8.0	2.0	6.0	69799978	137	1.5	40.0	0.51	12122	54.0	11.6	34.0	15.0

	Lat	Long	confirmed	deaths	recovered	active	population	density	fertility	age	urban_percentage	index	humidity	sunHour	tempC	windspeedKmph	mortality_rate
count	15657.000000	15657.000000	15657.000000	15657.000000	15657.000000	15657.000000	1.565700e+04	15657.000000	15417.000000	15417.000000	15298.000000	15657.000000	15480.000000	15480.000000	15480.000000	15480.000000	6103.000000
mean	26.622529	4.269820	297.513892	10.591684	112.918886	174.003321	2.811105e+08	266.094335	2.114075	35.420121	0.700277	8352.579421	66.191150	8.528236	15.711563	12.274871	0.013510
std	23.183574	81.431773	3450.997620	154.363301	1865.935756	1980.167965	4.652968e+08	1710.974999	0.866366	7.105604	0.181491	4864.268666	19.206303	2.518260	10.536874	7.016770	0.061879
min	-41.454500	-157.498300	0.000000	0.000000	0.000000	0.000000	3.812800e+04	0.000000	1.200000	17.000000	0.150000	0.000000	5.000000	1.500000	-21.000000	1.000000	0.000000
25%	13.444300	-72.710700	0.000000	0.000000	0.000000	0.000000	8.655535e+06	36.000000	1.700000	32.000000	0.610000	4034.000000	55.000000	6.300000	9.000000	7.000000	0.000000
50%	33.040600	9.501800	0.000000	0.000000	0.000000	0.000000	4.373376e+07	67.000000	1.800000	38.000000	0.790000	8428.000000	72.000000	8.700000	14.000000	11.000000	0.000000
75%	42.506300	69.345100	10.000000	0.000000	0.000000	6.000000	3.310027e+08	153.000000	2.100000	40.000000	0.830000	12702.000000	80.000000	11.000000	24.000000	16.000000	0.008000
max	72.000000	174.886000	67800.000000	4825.000000	58946.000000	50633.000000	1.439324e+09	26337.000000	6.100000	48.000000	0.980000	16676.000000	99.000000	14.000000	45.000000	65.000000	1.000000