COVID-19: Effect of temperature/humidity with visualization

In [4]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import scipy as sp


import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.templates.default = "plotly_dark"

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# --- setup ---
pd.set_option('max_columns', 50)
In [6]:
cleaned_data = pd.read_csv('./covid_19_clean_complete.csv', parse_dates=['Date'])

cleaned_data.rename(columns={'ObservationDate': 'date', 
                     'Province/State':'state',
                     'Country/Region':'country',
                     'Last Update':'last_updated',
                     'Confirmed': 'confirmed',
                     'Deaths':'deaths',
                     'Recovered':'recovered'
                    }, inplace=True)

# cases 
cases = ['confirmed', 'deaths', 'recovered', 'active']

# Active Case = confirmed - deaths - recovered
cleaned_data['active'] = cleaned_data['confirmed'] - cleaned_data['deaths'] - cleaned_data['recovered']

# replacing Mainland china with just China
cleaned_data['country'] = cleaned_data['country'].replace('Mainland China', 'China')

# filling missing values 
cleaned_data[['state']] = cleaned_data[['state']].fillna('')
cleaned_data[cases] = cleaned_data[cases].fillna(0)
cleaned_data.rename(columns={'Date':'date'}, inplace=True)

data = cleaned_data

display(data.head())
display(data.info())
state country Lat Long date confirmed deaths recovered active
0 Thailand 15.0000 101.0000 2020-01-22 2.0 0.0 0.0 2.0
1 Japan 36.0000 138.0000 2020-01-22 2.0 0.0 0.0 2.0
2 Singapore 1.2833 103.8333 2020-01-22 0.0 0.0 0.0 0.0
3 Nepal 28.1667 84.2500 2020-01-22 0.0 0.0 0.0 0.0
4 Malaysia 2.5000 112.5000 2020-01-22 0.0 0.0 0.0 0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19220 entries, 0 to 19219
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   state      19220 non-null  object        
 1   country    19220 non-null  object        
 2   Lat        19220 non-null  float64       
 3   Long       19220 non-null  float64       
 4   date       19220 non-null  datetime64[ns]
 5   confirmed  19220 non-null  float64       
 6   deaths     19220 non-null  float64       
 7   recovered  19220 non-null  float64       
 8   active     19220 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 1.3+ MB
None
In [7]:
# Check if the data is updated
print("External Data")
print(f"Earliest Entry: {data['date'].min()}")
print(f"Last Entry:     {data['date'].max()}")
print(f"Total Days:     {data['date'].max() - data['date'].min()}")
External Data
Earliest Entry: 2020-01-22 00:00:00
Last Entry:     2020-03-23 00:00:00
Total Days:     61 days 00:00:00
In [8]:
def p2f(x):
    """
    Convert urban percentage to float
    """
    try:
        return float(x.strip('%'))/100
    except:
        return np.nan

def age2int(x):
    """
    Convert Age to integer
    """
    try:
        return int(x)
    except:
        return np.nan

def fert2float(x):
    """
    Convert Fertility Rate to float
    """
    try:
        return float(x)
    except:
        return np.nan


countries_df = pd.read_csv("./population_by_country_2020.csv", converters={'Urban Pop %':p2f,
                                                                                                             'Fert. Rate':fert2float,
                                                                                                             'Med. Age':age2int})
countries_df.rename(columns={'Country (or dependency)': 'country',
                             'Population (2020)' : 'population',
                             'Density (P/Km²)' : 'density',
                             'Fert. Rate' : 'fertility',
                             'Med. Age' : "age",
                             'Urban Pop %' : 'urban_percentage'}, inplace=True)



countries_df['country'] = countries_df['country'].replace('United States', 'US')
countries_df = countries_df[["country", "population", "density", "fertility", "age", "urban_percentage"]]

countries_df.head()
Out[8]:
country population density fertility age urban_percentage
0 China 1439323776 153 1.7 38.0 0.61
1 India 1380004385 464 2.2 28.0 0.35
2 US 331002651 36 1.8 38.0 0.83
3 Indonesia 273523615 151 2.3 30.0 0.56
4 Pakistan 220892340 287 3.6 23.0 0.35
In [9]:
data = pd.merge(data, countries_df, on='country')

Adding Temperature Data

In [11]:
df_temperature = pd.read_csv("./temperature_dataframe.csv")
df_temperature['country'] = df_temperature['country'].replace('USA', 'US')
df_temperature['country'] = df_temperature['country'].replace('UK', 'United Kingdom')
df_temperature = df_temperature[["country", "province", "date", "humidity", "sunHour", "tempC", "windspeedKmph"]].reset_index()
df_temperature.rename(columns={'province': 'state'}, inplace=True)
df_temperature["date"] = pd.to_datetime(df_temperature['date'])
df_temperature['state'] = df_temperature['state'].fillna('')
# df_temperature.info()
In [12]:
data = data.merge(df_temperature, on=['country','date', 'state'], how='inner')
data['mortality_rate'] = data['deaths'] / data['confirmed']
In [13]:
data.head()
Out[13]:
state country Lat Long date confirmed deaths recovered active population density fertility age urban_percentage index humidity sunHour tempC windspeedKmph mortality_rate
0 Thailand 15.0 101.0 2020-01-22 2.0 0.0 0.0 2.0 69799978 137 1.5 40.0 0.51 12118 55.0 8.7 36.0 11.0 0.0
1 Thailand 15.0 101.0 2020-01-23 3.0 0.0 0.0 3.0 69799978 137 1.5 40.0 0.51 12119 59.0 11.6 35.0 14.0 0.0
2 Thailand 15.0 101.0 2020-01-24 5.0 0.0 0.0 5.0 69799978 137 1.5 40.0 0.51 12120 60.0 11.6 35.0 16.0 0.0
3 Thailand 15.0 101.0 2020-01-25 7.0 0.0 0.0 7.0 69799978 137 1.5 40.0 0.51 12121 63.0 11.6 32.0 22.0 0.0
4 Thailand 15.0 101.0 2020-01-26 8.0 0.0 2.0 6.0 69799978 137 1.5 40.0 0.51 12122 54.0 11.6 34.0 15.0 0.0
In [14]:
data.describe()
Out[14]:
Lat Long confirmed deaths recovered active population density fertility age urban_percentage index humidity sunHour tempC windspeedKmph mortality_rate
count 15657.000000 15657.000000 15657.000000 15657.000000 15657.000000 15657.000000 1.565700e+04 15657.000000 15417.000000 15417.000000 15298.000000 15657.000000 15480.000000 15480.000000 15480.000000 15480.000000 6103.000000
mean 26.622529 4.269820 297.513892 10.591684 112.918886 174.003321 2.811105e+08 266.094335 2.114075 35.420121 0.700277 8352.579421 66.191150 8.528236 15.711563 12.274871 0.013510
std 23.183574 81.431773 3450.997620 154.363301 1865.935756 1980.167965 4.652968e+08 1710.974999 0.866366 7.105604 0.181491 4864.268666 19.206303 2.518260 10.536874 7.016770 0.061879
min -41.454500 -157.498300 0.000000 0.000000 0.000000 0.000000 3.812800e+04 0.000000 1.200000 17.000000 0.150000 0.000000 5.000000 1.500000 -21.000000 1.000000 0.000000
25% 13.444300 -72.710700 0.000000 0.000000 0.000000 0.000000 8.655535e+06 36.000000 1.700000 32.000000 0.610000 4034.000000 55.000000 6.300000 9.000000 7.000000 0.000000
50% 33.040600 9.501800 0.000000 0.000000 0.000000 0.000000 4.373376e+07 67.000000 1.800000 38.000000 0.790000 8428.000000 72.000000 8.700000 14.000000 11.000000 0.000000
75% 42.506300 69.345100 10.000000 0.000000 0.000000 6.000000 3.310027e+08 153.000000 2.100000 40.000000 0.830000 12702.000000 80.000000 11.000000 24.000000 16.000000 0.008000
max 72.000000 174.886000 67800.000000 4825.000000 58946.000000 50633.000000 1.439324e+09 26337.000000 6.100000 48.000000 0.980000 16676.000000 99.000000 14.000000 45.000000 65.000000 1.000000

Temperature by Country

In [15]:
temp_gdf = data.groupby(['date', 'country'])['tempC', 'humidity'].mean()
temp_gdf = temp_gdf.reset_index()
temp_gdf['date'] = pd.to_datetime(temp_gdf['date'])
temp_gdf['date'] = temp_gdf['date'].dt.strftime('%m/%d/%Y')

temp_gdf['tempC_pos'] = temp_gdf['tempC'] - temp_gdf['tempC'].min()  # To use it with size

wind_gdf = data.groupby(['date', 'country'])['windspeedKmph'].max()
wind_gdf = wind_gdf.reset_index()
wind_gdf['date'] = pd.to_datetime(temp_gdf['date'])
wind_gdf['date'] = wind_gdf['date'].dt.strftime('%m/%d/%Y')
c:\users\shane\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.

In [16]:
target_gdf = data.groupby(['date', 'country'])['confirmed', 'deaths'].sum()
target_gdf = target_gdf.reset_index()
target_gdf['date'] = pd.to_datetime(target_gdf['date'])
target_gdf['date'] = target_gdf['date'].dt.strftime('%m/%d/%Y')
c:\users\shane\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.

In [17]:
fig = px.scatter_geo(temp_gdf.fillna(0), locations="country", locationmode='country names', 
                     color="tempC", size='tempC_pos', hover_name="country", 
                     range_color= [-20, 45], 
                     projection="natural earth", animation_frame="date", 
                     title='Temperature by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()

Makes sense, it's colder away from the equator, etc etc.

In [18]:
fig = px.scatter_geo(temp_gdf.fillna(0), locations="country", locationmode='country names', 
                     color="humidity", size='humidity', hover_name="country", 
                     range_color= [0, 100], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: Humidity by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()
Seems to be no real correlation between humidity and location like there is with temperature. We do see relatively low humidity in China, while humidity is always high in Europe.
In [19]:
gdf = pd.merge(target_gdf, temp_gdf, on=['date', 'country'])
gdf['confirmed_log1p'] = np.log1p(gdf['confirmed'])
gdf['deaths_log1p'] = np.log1p(gdf['deaths'])
gdf['mortality_rate'] = gdf['deaths'] / gdf['confirmed']

gdf = pd.merge(gdf, wind_gdf, on=['date', 'country'])

Weather-correlation with COVID19 spread

The size of the circle represents the # of confirmed cases of those infected with coronavirus, the color represents the temperature.
we see that corona started in China when the temperature was cold, but it does not seem to show a correlation that I can see between higher temperatures and a slow down in infection rate.
we also see that corona spread in Europe when the temperature was medium, around 20C.
This is contrary to my initial hypothesis that high temperature is the major factor in coronavirus spread
In [20]:
fig = px.scatter_geo(gdf.fillna(0), locations="country", locationmode='country names', 
                     color="tempC", size='confirmed', hover_name="country", 
                     range_color= [-20, 45], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: Confirmed VS Temperature by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()
Changing the visualization. Now circle size is adjusted for a logarithmic scale to show how it corona spreads throughout (not just by major countries) Purpose of this is to show that corona still does spread worldwide despite high temperatures
In [21]:
fig = px.scatter_geo(gdf.fillna(0), locations="country", locationmode='country names', 
                     color="tempC", size='confirmed_log1p', hover_name="country", 
                     range_color= [-20, 45], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: log1p(confirmed) VS Temperature by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()

Let's look at deaths vs. temp, we see it's high in China, Europe, US, and Iran. These are in a temperate region. These are also regions with high population, I believe 90% of the world's population is along this range of latitude lines.

In [22]:
fig = px.scatter_geo(gdf.fillna(0), locations="country", locationmode='country names', 
                     color="tempC", size='deaths', hover_name="country", 
                     range_color= [-20, 45], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: deaths VS temperature by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()
let's look at mortality rate instead of total deaths. It seems to not have much of a correlation with region nor temperature. Generally, I see that mortality rate is high at the beginning stage (most likely due to low total inspection number), but that countries seem to generally converge around 3%
In [23]:
fig = px.scatter_geo(gdf.fillna(0), locations="country", locationmode='country names', 
                     color="tempC", size='mortality_rate', hover_name="country", 
                     range_color= [-20, 45], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: Mortality rate VS Temperature by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()

Humidity

Corona spread in China with low humidity, and Europe where humidity was relatively high. Unsure of this correlation at this point.
In [24]:
fig = px.scatter_geo(gdf.fillna(0), locations="country", locationmode='country names', 
                     color="humidity", size='confirmed_log1p', hover_name="country", 
                     range_color= [0, 100], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: log1p(confirmed) VS Humidity by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()
Same thing with mortality rate.
In [25]:
fig = px.scatter_geo(gdf.fillna(0), locations="country", locationmode='country names', 
                     color="humidity", size='mortality_rate', hover_name="country", 
                     range_color= [0, 100], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: Mortality rate VS humidity by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()

Windspeed

My hypothesis being that greater windspeed enables the virus to reach further distances and infect a greater number of people within a shorter amount of time, aka before it dies. Seems wind speed is relatively high in Europe - maybe this can offset the effects of higher humidity/temperature in Europe?
In [26]:
fig = px.scatter_geo(gdf.fillna(0), locations="country", locationmode='country names', 
                     color="windspeedKmph", size='confirmed_log1p', hover_name="country", 
                     range_color= [0, 40], 
                     projection="natural earth", animation_frame="date", 
                     title='COVID-19: log1p(Confirmed) VS Wind speed by country', color_continuous_scale="portland")
# fig.update(layout_coloraxis_showscale=False)
fig.show()

Overall, I thought the effect of Temperature, Humidity, and Windspeed was damning. Now, I am not so sure. Further work must be done to see if the effects of windspeed make up for the negative effects of temperature and humidity and vice versa. This is obviously a work in progress.

In [ ]: