Predictive Analytics Technical Specialist — written test
# Only consider countries with a Humanitarian Response Plan (HRP) in place.
# NOTE(review): the variable is spelled "hpr" but the acronym is HRP — kept
# as-is because every later cell references this exact name.
hpr_countries = ['AFG','BFA','BDI','CMR','CAF','TCD','COD','ETH','HTI','IRQ','LBY','MLI','MMR','NER','NGA','PSE','SOM','SSD','SDN','SYR','UKR','VEN','YEM']
# Input data files: WHO daily case/death counts, World Bank total population
# (2018), INFORM COVID-19 risk index, Natural Earth admin-0 boundaries.
who_file = 'data/un_test/Data_WHO_Coronavirus_Covid-19_Cases_and_Deaths-WHO-COVID-19-global-data.csv'
worldbank_2018_file = 'data/un_test/Worldbank_total_population.xls'
inform_covid19_index_file = 'data/un_test/inform-covid-19-risk-index-v012.xlsx'
shapefile = 'data/un_test/ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp'
# Load WHO daily COVID-19 case/death data and keep only HRP countries.
# Parse the report-date column by NAME rather than by position: the original
# used parse_dates=[4], which silently breaks if the column order changes.
# ('date_epicrv' is the date column — it is used as a datetime further below.)
who_df = (pd.read_csv(who_file, parse_dates=['date_epicrv'])
          .query("ISO_3_CODE.isin(@hpr_countries)")
          )
# Sanity check: every HRP country must be present in the WHO data.
assert who_df['ISO_3_CODE'].nunique() == len(hpr_countries)
who_df.head(1)  # notebook-style peek; no effect when run as a script
# World Bank total-population workbook: first sheet, three header rows skipped.
_wb_raw = pd.read_excel(worldbank_2018_file, sheet_name=0, skiprows=[0, 1, 2])
# Restrict to the HRP country list.
_wb_mask = _wb_raw['Country Code'].isin(hpr_countries)
worldbank_2018_df = _wb_raw.loc[_wb_mask]
# Every HRP country must appear in the population table.
assert worldbank_2018_df['Country Code'].nunique() == len(hpr_countries)
worldbank_2018_df.head(1)
# INFORM COVID-19 risk index: named sheet; rows 0 and 2 are skipped so the
# middle row supplies the column headers.
inform_covid19_index_df = pd.read_excel(
    inform_covid19_index_file,
    sheet_name='INFORM COVID-19 RISK 2020 (a-z)',
    skiprows=[0, 2],
)
# Keep only HRP countries (boolean mask instead of .query()).
inform_covid19_index_df = inform_covid19_index_df[
    inform_covid19_index_df['ISO3'].isin(hpr_countries)
]
# All HRP countries must be covered by the risk index.
assert inform_covid19_index_df['ISO3'].nunique() == len(hpr_countries)
inform_covid19_index_df.head(1)
# Question 1: latest cumulative cases/deaths per country, ranked by cases.
# Sort by report date before taking the last row per country — the original
# took groupby(...).tail(1) directly, which silently relied on the CSV
# already being in chronological order.
who_df_g = who_df.sort_values('date_epicrv').groupby(['ISO_3_CODE'])
q1_res_df = (who_df_g.tail(1)[['ADM0_NAME', 'ISO_3_CODE', 'CumCase', 'CumDeath']]
             .sort_values(by='CumCase', ascending=False))
# (Also fixes the "Question1" typo in the printed message.)
print(f"Question 1: The three countries with the absolute largest number of cases are: {list(q1_res_df[['ADM0_NAME','CumCase']].head(3).itertuples(index=False, name=None))}.")
# Question 2: cumulative cases per 100,000 population (World Bank 2018 totals).
_pop = worldbank_2018_df[['2018', 'Country Code']]
q2_merged_df = q1_res_df.merge(_pop, how='left',
                               left_on='ISO_3_CODE', right_on='Country Code')
# Cast population to int as in the original pipeline (raises if any value
# is missing — guaranteed present by the earlier assert on the source table).
q2_merged_df['2018'] = q2_merged_df['2018'].astype(int)
q2_merged_df['CumCase_rel'] = q2_merged_df['CumCase'] / q2_merged_df['2018'] * 100000
q2_res_df = q2_merged_df.sort_values(by='CumCase_rel', ascending=False)
q2_res_df.head(1)
print(f"Question 2: The three countries with the highest number of cases per 100000 are: {list(q2_res_df[['ADM0_NAME','CumCase_rel']].head(3).itertuples(index=False, name=None))}.")
# Question 3: case fatality ratio (CFR) = deaths per 100 cases.
q3_df = q1_res_df.copy()
q3_df['cfr'] = 100 * q3_df['CumDeath'] / q3_df['CumCase']
# Rank countries from highest to lowest CFR.
q3_res_df = q3_df.sort_values(by=['cfr'], ascending=False)
print(f"Question 3: The three countries with the highest CFR are: {list(q3_res_df[['ADM0_NAME','cfr']].head(3).itertuples(index=False, name=None))}.")
# source: https://towardsdatascience.com/a-complete-guide-to-an-interactive-geographical-map-using-python-f4c5197e23e0
import geopandas as gpd

# Load Natural Earth admin-0 boundaries; keep name, ISO code and geometry,
# renaming to the short column names the plotting code expects.
gdf = (gpd.read_file(shapefile)[['ADMIN', 'ADM0_A3', 'geometry']]
       .rename(columns={'ADMIN': 'country', 'ADM0_A3': 'country_code'}))
gdf.head(1)
# Attach CFR results; left join keeps every polygon (non-HRP countries -> NaN).
merged = gdf.merge(merged_right := q3_df, how='left',
                   left_on='country_code', right_on='ISO_3_CODE')
import json

# geopandas' GeoDataFrame.to_json() already returns a GeoJSON string, so the
# original json.loads(...) -> json.dumps(...) round trip was a no-op (it only
# re-serialized the same content); build the string in one step.
# `json` stays imported to avoid disturbing any later use.
json_data = merged.to_json()
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar
from bokeh.palettes import brewer

# Wrap the GeoJSON string as a Bokeh data source for the patch glyphs.
geosource = GeoJSONDataSource(geojson=json_data)

# Sequential multi-hue palette, reversed so the darkest shade marks the
# highest CFR.
palette = brewer['YlGnBu'][8][::-1]

# Linear colour scale from 0 up to the largest observed CFR.
color_mapper = LinearColorMapper(palette=palette, low=0, high=max(q3_df.cfr))

# Horizontal colour bar to be rendered below the map.
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=8,
                     width=500, height=20, border_line_color=None,
                     location=(0, 0), orientation='horizontal')

# Map figure: no toolbar, no grid lines.
p = figure(title='Case fatality ratio for COVID-19 pandemic in countries with a Humanitarian Response Plan',
           plot_height=600, plot_width=950, toolbar_location=None)
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

# One patch per country, filled by its CFR through the colour mapper.
p.patches('xs', 'ys', source=geosource,
          fill_color={'field': 'cfr', 'transform': color_mapper},
          line_color='black', line_width=0.25, fill_alpha=1)
p.add_layout(color_bar, 'below')

# Configure inline notebook output (the explicit show(p) call was left
# disabled in the original and stays disabled here).
output_notebook()
#show(p)
# source: https://github.com/CrazyDaffodils/Interactive-Choropleth-Map-Using-Python/blob/master/bokeh-app/coronavirus_geocode_csv.ipynb
import xgboost
import shap
shap.initjs()

# Question 5: which of the three INFORM COVID-19 dimensions contributes most
# to (log) cases per 100k?
cols_for_analysis = ['ISO3','COVID-19 HAZARD & EXPOSURE','COVID-19 VULNERABILITY','COVID-19 LACK OF COPING CAPACITY']
q5_merged = (inform_covid19_index_df[cols_for_analysis]
             .merge(q2_res_df, how='left', left_on='ISO3', right_on='ISO_3_CODE')
             .drop(['ISO_3_CODE', 'Country Code', 'CumCase', '2018'], axis=1)
             )
# Cast the indicator columns explicitly to float, as in the original pipeline.
for _col in cols_for_analysis[1:]:
    q5_merged[_col] = q5_merged[_col].astype(float)

# Train a small XGBoost regressor on log cases-per-100k.
# The original trained and explained the IDENTICAL model twice (copy-pasted
# cell); a single pass is kept here.
X = q5_merged[cols_for_analysis[1:]]
y = np.log(q5_merged['CumCase_rel'].tolist())
model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 400)

# Explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models).
# NOTE(review): exponentiating SHAP values of a log-target is questionable —
# SHAP values are additive in log space — kept to preserve the original plots.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap.summary_plot(np.exp(shap_values), X, plot_type="bar")
q5_merged.head()
shap.summary_plot(np.exp(shap_values), X)
# (Also fixes the "outcomeis" typo in the printed message.)
print(f"Question 5: I calculated SHAP values for the three features. The contribution of the features to the outcome is in this order: 'COVID-19 HAZARD & EXPOSURE' (1.61),'COVID-19 VULNERABILITY' (1.22 SHAP value),'COVID-19 LACK OF COPING CAPACITY (1.1)' ")
# Question 5 (alt): repeat the analysis with ALL INFORM indicators as features,
# dropping metadata columns first.
q5_merged = (inform_covid19_index_df.drop(['COUNTRY','P2P','Lack of Reliability (*)','Number of Missing Indicators','% of Missing Indicators'], axis=1)
             .merge(q2_res_df, how='left', left_on='ISO3', right_on='ISO_3_CODE')
             .drop(['ISO_3_CODE', 'Country Code', 'CumCase', '2018'], axis=1)
             )
# Cast the numeric indicator columns to float in one loop (the original had
# one statement per column).  'Inequality' and 'GBV' are excluded from the
# model below — the original's commented-out code suggests they contain
# non-numeric markers such as 'x'; TODO confirm against the workbook.
_float_cols = ['COVID-19 HAZARD & EXPOSURE', 'COVID-19 VULNERABILITY',
               'COVID-19 LACK OF COPING CAPACITY', 'Health Conditions',
               'Demographic and co-morbidity', 'Access to health care',
               'Infrastructure']
for _col in _float_cols:
    q5_merged[_col] = q5_merged[_col].astype(float)
# Encode the categorical risk class as ordered integers starting at 1.
q5_merged['COVID-19 RISK CLASS'] = pd.factorize(q5_merged['COVID-19 RISK CLASS'], sort=True)[0] + 1

# Train and explain exactly as in the three-feature variant above.
X = q5_merged.drop(['ISO3', 'ADM0_NAME', 'Inequality', 'GBV', 'CumCase_rel'], axis=1)
y = np.log(q5_merged['CumCase_rel'].tolist())
model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 400)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap.summary_plot(np.exp(shap_values), X, plot_type="bar")
print(f"Question 5 (alt): When I understand the question as : How much is the contribution of the three mentioned features compared to other features, I get that features like 'Development&Deprivation', 'Food Security', and 'Movements' have a bigger influence on the output.")
# todo: If time: https://github.com/delunapaolo/tree-explainer/tree/master/notebooks
# source: https://www.mathebibel.de/verdopplungszeit
# Question 6: estimate the case doubling time for Afghanistan from the mean
# daily growth rate: T_double = ln(2) / ln(1 + r_mean).
q6_subset_df = who_df.query('ISO_3_CODE == "AFG"').copy()  # .copy() avoids SettingWithCopyWarning on the column assignments below
q6_subset_df.head(2)
# Days since the first recorded infection (assumed 2020-02-24 — TODO confirm).
first_date = pd.Timestamp('2020-02-24', tz='UTC')
q6_subset_df['days'] = q6_subset_df['date_epicrv'].map(lambda d: (d - first_date).days)
# Daily growth rate r_i = (C_i - C_{i-1}) / C_{i-1}.
# BUG FIX: the original looped with .loc[i, ...] over 0..n-1, but the query
# result keeps who_df's original (non-consecutive) index, so those labels
# addressed the wrong rows; it also divided by the CURRENT day's count
# instead of the previous day's.  Series.pct_change() computes the correct
# ratio, vectorized, with NaN in the first row (excluded by the mean below).
q6_subset_df['growth_rate'] = q6_subset_df['CumCase'].pct_change()
mean_growth_rate = 1 + np.mean(q6_subset_df['growth_rate'])
estimated_doubling_time = math.log(2) / math.log(mean_growth_rate)
print(f"Question 6: The estimated doubling time is ~ {round(estimated_doubling_time, 2)} days.")
Question 2
Imagine that you are responding to a request from the OCHA country office in Nigeria for a COVID-19 model to support the COVID-19 Humanitarian Response Plan. The country office is interested in understanding where they should preposition resources, what are the key areas and groups that are expected to be most impacted and what are the most successful non-pharmaceutical interventions that can be implemented in Nigeria. After reviewing the COVID-19 models available, the Predictive Analytics team has decided to adapt a model initially developed by the Imperial College of London for the UK and the US to the Nigerian context.
Please describe how you would plan this analysis and what are the key issues and main challenges that you expect to face. Identify:
Question 1:
The main stakeholders are
- OCHA country office, ideally a focal point as main point of contact for coordination
- Authorities in Nigeria:
- Ministry of health with their newly set-up advisory committee on COVID-19 as well as the response task force
- Due to the ongoing conflict in the North-East of Nigeria, stakeholders of the regions of Borno, Adamawa and Yobe (e.g. the Nigerian Army, the Department of State Security Services, the Nigerian Human Rights Commission, as well as the Nigerian Emergency Management Agency, the National Commission for Refugees, Migrants and IDPs, and the members of the National IHL Committee)
- NGOs that deliver care and have access to vulnerable/risk groups, including in remote areas
- International organisations such as the WHO, the International Committee of the Red Cross, as well as the Nigerian Red Cross
- The humanitarian data center team, to agree on an engagement strategy as well as to learn from models potentially implemented previously in other HRP countries.
- The authors at the Imperial College of London to understand how and why they made adjustments to their model and to get feedback on our own plan to modify their model to work in the Nigerian context.
Question 2:
The model needs to be modified in the following areas to model the particularities of Nigeria. Examples of differences are: people in Nigeria are on average 27 years younger, 3.1 times less obese, ~50% less likely to have access to improved drinking water, and 3.7 times less likely to have internet access. These factors might be relevant for the chosen model parameters as well as for the final delivery of services and implementation of measures.
Besides adjusting for the differences in population characteristics (density, age, livelihood, etc.), the model should take into account the following features that are prominent in Nigeria:
- Current stressors on the health system, e.g. potential outbreaks/occurrences of other infectious disease, vaccination campaigns.
- Access to health services/ health insurance.
- High-risk areas with conflicts.
- High-risk populations in refugee camps.
- Populations in remote areas that are difficult to reach.
- Differences in expenditures on health care.
- Potential difficulties in accessing food and water in addition to health crisis.
Question 3:
Most of the dataset to develop this model can be found on HDX. The main areas of dataset are:
- Census data on population distribution as well as age/gender/disabilities/livelihood to calibrate population-level model parameters
- Data on vulnerable/risk groups, e.g. data of ongoing conflict, internally displaced people
- Data on health, including health site mappings, health indicators
- Data on aid effectiveness or other humanitarian aid initiatives in the region where coordination is required
Question 4:
- Kick-off meeting with responsible stakeholders to perform a needs analysis
- Check with colleagues with experience in either developing similar models or working with authorities in Nigeria.
- Present model results in a way easy to understand e.g. build a COVID-19 dashboard showcasing the model's main findings and recommendation or generate a fact sheet that can be given to local authorities and health care professionals.
- Present the model's results in the form of recommendations for priorities in the implementation of measures.
- Integration of new data as they become available in order to provide the most up-to-date view/ findings of the Corona pandemic and thereby the best possible response recommendations.
- Monitor and evaluate if the findings and recommendations are understood and have an effect by actively seeking feedback. Adjust the model if necessary
- Set up a database of relevant information from various sources covering recommendations for different areas of COVID-19 response, e.g. gender,children, risk groups (elderly, pregnant women, other diseases), IDPs, trafficking