# install.packages("tidyverse") # install package (run once)
library(tidyverse)# load package
## Import a csv file
# The path is relative, so it is resolved against the working directory.
TB <- read.csv("TB_burden_countries_2020-01-09.csv")
View(TB)

# We can view some summary statistics with summary(),
# but the output is not easy to read
summary(TB)

# We can use library stargazer to produce better summary output
library(stargazer)
stargazer(TB, type = "text")
# How can you add the median to stargazer output? (use the help)

# Useful tidyverse functions are filter and select
# select helps us filter out columns
# e.g., select all columns from 'country' to 'e_pop_num'
TB1 <- TB %>%
  select(country:e_pop_num)
View(TB1)
# filter helps us keep only the observations that meet some criteria
# e.g., keep only the rows belonging to region EUR
# ("EUR" is quoted because it is a string value, not an object name)
TB2 <- TB %>%
  filter(g_whoregion == "EUR")
View(TB2)
# We can also apply both transformations together:
# first keep the columns from 'country' to 'e_pop_num',
# then keep only the rows whose region is the string "EUR"
TB3 <- TB %>%
  select(country:e_pop_num) %>%
  filter(g_whoregion == "EUR")
View(TB3)
# with base R we would have needed the following code to get TB3
# TB3 <- TB[TB$g_whoregion == "EUR", 1:7]
# which can get complicated and difficult to read the more operations
# we perform on TB

# Another useful function is gather
# it can be used to stack variables one below the other
# in the code below I am stacking one below the other all the variables
# from 'e_mort_exc_tbhiv_100k' to 'e_mort_num_hi'
# (note: tidyr now recommends pivot_longer() for new code; gather() still works)
TB4 <- TB %>%
  gather(e_mort_exc_tbhiv_100k:e_mort_num_hi,
         key = e_mort_key,
         value = e_mort_value)
View(TB4)
# gather composes with other verbs such as select:
# keep the columns from 'country' to 'e_pop_num' plus the mortality columns,
# then stack the mortality columns into a key/value pair
TB %>%
  select(country:e_pop_num, e_mort_exc_tbhiv_100k:e_mort_num_hi) %>%
  gather(
    key = "e_mort_key",
    value = "e_mort_value",
    e_mort_exc_tbhiv_100k:e_mort_num_hi
  )
# mutate adds new variables to the data
# e.g., a new variable holding the population divided by 100,000
TB5 <- TB %>%
  mutate(e_pop_100k = e_pop_num / 1e5)
View(TB5)
# we can use group_by to perform operations like mutate by groups,
# like calculating the average population by region and year
# and adding it as a new variable
TB6 <- TB %>%
  group_by(g_whoregion, year) %>%
  mutate(avg_pop = mean(e_pop_num)) %>%
  # drop the grouping so later operations on TB6 are not silently per-group
  ungroup()
View(TB6)
# if we only want a summary by group, summarise returns one row per
# region/year combination instead of adding a column to every row
group_by(TB, g_whoregion, year) %>%
  summarise(avg_pop = mean(e_pop_num))
# finally, we can also chain other R functions like lm()
# the dot tells the pipe to pass the data as lm's `data` argument
TB %>%
  mutate(e_pop_mil = e_pop_num / 1e6) %>%
  lm(e_inc_100k ~ e_pop_mil, data = .) %>%
  summary()
# we can use stargazer instead of summary to get a better output
# (type must be the string "text"; an unquoted text would refer to an object)
TB %>%
  mutate(e_pop_mil = e_pop_num / 10^6) %>%
  lm(e_inc_100k ~ e_pop_mil, data = .) %>%
  stargazer(type = "text")
# finally we can chain group_by and lm to get a linear model by group
# however, we need package broom (install it first if it is missing)
library(broom)
summary.ols <- TB %>%
  mutate(e_pop_mil = e_pop_num / 10^6) %>%
  group_by(country) %>%
  # .x is the subset of rows for the current country;
  # tidy() turns the fitted model into a data frame of coefficients
  group_modify(~ tidy(lm(e_inc_100k ~ e_pop_mil, data = .x))) %>%
  # drop the grouping so later operations on the result are not per-country
  ungroup()
View(summary.ols)
# Questions
# Is it sensible to remove all NAs in dataset TB?
# Take e_mort_100k as response, can you build a regression model based on this dataset? Which variables would you choose as covariates?
# Hint: Use the TB_data_dictionary file to understand the meaning of each variable.
# Which model has the smallest MSE?
# Reviews
# There are no reviews yet.