Let us load the data we will need:
Three common ways to generate basic graphics in R are via - base R
- lattice
- ggplot2
We will skip base R
graphics since ggplot2
will be the graphics package for this class. Let us see how we use it, starting with a simple bar-chart
Remember the typical options…
bar-chart
histogram/box-plot/area-chart
scatter-plot/hex-bin
ggplot2
can work with unit-level data (i.e., one row per observation) AND it can also work with calculated values stored in a data-frame (like our survey-weighted estimates from BRFSS/DHS)
we will see both ways in action
(educ)
ggplot(data = dhs.df14, aes(x = educat, group = wealthq, fill = wealthq)) +
geom_bar(position = "dodge") +
facet_wrap(~ educat)
ggplot(data = dhs.df14, aes(x = educat, group = wealthq, fill = wealthq, y = ..prop..)) +
geom_bar(position = "dodge") +
facet_wrap(~ wealthq, ncol = 2) +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "bottom")
These are all unweighted estimates; how would we plot the weighted estimates?
library(survey)
library(srvyr)
tab.1 <- svymean(~ educat, dhs14_design, na.rm = TRUE)
tab.1 <- as.data.frame(tab.1) * 100
tab.1 <- round(tab.1, digits = 1)
tab.1
mean SE
educatno education 24.0 0.6
educatincomplete primary 6.1 0.2
educatcomplete primary 4.1 0.2
educatincomplete secondary 13.3 0.3
educatcomplete secondary 38.5 0.6
educathigher 13.9 0.5
tab.1p <- tab.1 %>%
mutate(educ = rownames(tab.1),
educ = stringr::str_replace(educ, "educat", ""),
educ = stringr::str_to_title(educ),
educ = ordered(educ,
levels = c("No Education", "Incomplete Primary",
"Complete Primary", "Incomplete Secondary",
"Complete Secondary", "Higher")
)
)
knitr::kable(tab.1p)
mean | SE | educ |
---|---|---|
24.0 | 0.6 | No Education |
6.1 | 0.2 | Incomplete Primary |
4.1 | 0.2 | Complete Primary |
13.3 | 0.3 | Incomplete Secondary |
38.5 | 0.6 | Complete Secondary |
13.9 | 0.5 | Higher |
tab.ew <- svyby(~educat, ~wealthq, dhs14_design, svymean, na.rm = TRUE)
tab.ew <- tab.ew[, c(1:7)]
tab.ew2 <- tab.ew[, c(2:7)] * 100
tab.ew2 <- round(tab.ew2, digits = 1)
knitr::kable(tab.ew2)
educatno education | educatincomplete primary | educatcomplete primary | educatincomplete secondary | educatcomplete secondary | educathigher | |
---|---|---|---|---|---|---|
poorest | 47.7 | 8.6 | 4.3 | 13.8 | 22.7 | 3.0 |
poorer | 37.4 | 8.9 | 4.9 | 14.8 | 30.4 | 3.6 |
middle | 18.9 | 6.1 | 3.7 | 14.6 | 46.5 | 10.2 |
richer | 14.0 | 5.1 | 5.0 | 13.8 | 44.8 | 17.3 |
richest | 5.4 | 2.3 | 2.8 | 9.4 | 45.3 | 34.8 |
library(tidyr)
tab.ew3 <- tab.ew2 %>%
mutate(Wealth = rownames(tab.ew2)) %>%
gather(Education, Percent, 1:6) %>%
mutate(Education = substring(Education, 7),
Education = stringr::str_to_title(Education)
) %>%
mutate(Education2 = ordered(Education,
levels = c("No Education", "Incomplete Primary",
"Complete Primary", "Incomplete Secondary",
"Complete Secondary", "Higher")
),
Wealth = stringr::str_to_title(Wealth)
) %>%
mutate(Wealth2 = ordered(Wealth,
levels = c("Poorest", "Poorer",
"Middle", "Richer",
"Richest")
)
)
knitr::kable(tab.ew3)
Wealth | Education | Percent | Education2 | Wealth2 |
---|---|---|---|---|
Poorest | No Education | 47.7 | No Education | Poorest |
Poorer | No Education | 37.4 | No Education | Poorer |
Middle | No Education | 18.9 | No Education | Middle |
Richer | No Education | 14.0 | No Education | Richer |
Richest | No Education | 5.4 | No Education | Richest |
Poorest | Incomplete Primary | 8.6 | Incomplete Primary | Poorest |
Poorer | Incomplete Primary | 8.9 | Incomplete Primary | Poorer |
Middle | Incomplete Primary | 6.1 | Incomplete Primary | Middle |
Richer | Incomplete Primary | 5.1 | Incomplete Primary | Richer |
Richest | Incomplete Primary | 2.3 | Incomplete Primary | Richest |
Poorest | Complete Primary | 4.3 | Complete Primary | Poorest |
Poorer | Complete Primary | 4.9 | Complete Primary | Poorer |
Middle | Complete Primary | 3.7 | Complete Primary | Middle |
Richer | Complete Primary | 5.0 | Complete Primary | Richer |
Richest | Complete Primary | 2.8 | Complete Primary | Richest |
Poorest | Incomplete Secondary | 13.8 | Incomplete Secondary | Poorest |
Poorer | Incomplete Secondary | 14.8 | Incomplete Secondary | Poorer |
Middle | Incomplete Secondary | 14.6 | Incomplete Secondary | Middle |
Richer | Incomplete Secondary | 13.8 | Incomplete Secondary | Richer |
Richest | Incomplete Secondary | 9.4 | Incomplete Secondary | Richest |
Poorest | Complete Secondary | 22.7 | Complete Secondary | Poorest |
Poorer | Complete Secondary | 30.4 | Complete Secondary | Poorer |
Middle | Complete Secondary | 46.5 | Complete Secondary | Middle |
Richer | Complete Secondary | 44.8 | Complete Secondary | Richer |
Richest | Complete Secondary | 45.3 | Complete Secondary | Richest |
Poorest | Higher | 3.0 | Higher | Poorest |
Poorer | Higher | 3.6 | Higher | Poorer |
Middle | Higher | 10.2 | Higher | Middle |
Richer | Higher | 17.3 | Higher | Richer |
Richest | Higher | 34.8 | Higher | Richest |
ggplot(data = tab.ew3, aes(x = Education2, y = Percent, group = Wealth2, fill = Education2)) +
geom_bar(stat = "identity") +
facet_wrap(~Wealth2, ncol = 5) +
theme(legend.position = "bottom", axis.text.x = element_text(angle = 90))
hemoglobinData <- read.csv(url("http://whitlockschluter.zoology.ubc.ca/wp-content/data/chapter02/chap02e3cHumanHemoglobinElevation.csv"))
ggplot(data = hemoglobinData, aes(x = hemoglobin)) + geom_histogram(fill="cornflowerblue", color = "white") +
labs(x = "Hemoglobin in grams (gm) per deciliter (dL)",
y = "Frequency",
title = "Histogram of Hemoglobin Concentration in Adult Males")
But this masks four different populations …
ggplot(data = hemoglobinData, aes(x = hemoglobin, group = population, fill = population)) +
geom_histogram(color = "white") +
facet_wrap(~population) +
labs(x = "Hemoglobin in grams (gm) per deciliter (dL)",
y = "Frequency",
title = "Histogram of Hemoglobin Concentration in Adult Males")
One useful design element with breakouts is placing in relief the consolidated data (i.e., the distribution without break-outs)
ggplot(data = hemoglobinData, aes(x = hemoglobin, fill = population)) +
geom_histogram(bins = 10, color = "white") +
geom_histogram(data = hemoglobinData[, -3], bins = 10, fill = "grey", alpha = .5) +
facet_wrap(~population)
The distribution of hemoglobin in four populations (the US being the reference group), part of a study looking at the impacy of altitude on hemoglobin concentration (courtesy Whitlock and Schluter).
library(ggridges)
ggplot(hemoglobinData, aes(x = hemoglobin, y = population)) +
geom_density_ridges(scale = 3, alpha = 0.3, aes(fill = population)) +
labs(title = 'Hemoglobin Concentration Levels', subtitle = 'in Four populations') + theme_ridges() +
theme(axis.title.y = element_blank(), legend.position = "none")
They are visually appealing when comparing a large number of groups on a single continuous variable and using simple facet-wrap
or other options would be infeasible.
ggplot(data = hemoglobinData, aes(y = hemoglobin, fill = population, x = population)) +
geom_boxplot() +
coord_flip() +
theme(legend.position = "none")
Let us read-in the 2018 County Health Rankings data and then see how unhealthy days correlate
library(readxl)
chr18 <- read_excel("data/2018CountyHealthRankingsData-v1.xlsx", sheet = "Ranked Measure Data", skip = 1)
ggplot(data = chr18, aes(x = `Physically Unhealthy Days`, y = `Mentally Unhealthy Days`)) +
geom_point()
library(patchwork)
p1 <- ggplot(data = hemoglobinData, aes(y = hemoglobin, fill = population, x = population)) +
geom_boxplot() +
coord_flip() +
theme(legend.position = "none")
p2 <- ggplot(data = dhs.df14, aes(x = educ, group = wealthq, fill = wealthq, y = ..prop..)) +
geom_bar(position = "dodge") +
facet_wrap(~ wealthq, ncol = 2) +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "bottom")
p3 <- ggplot(hemoglobinData, aes(x = hemoglobin, y = population)) +
geom_density_ridges(scale = 3, alpha = 0.3, aes(fill = population)) +
labs(title = 'Hemoglobin Concentration Levels',
subtitle = 'in Four populations') +
theme_ridges() +
theme(axis.title.y = element_blank(), legend.position = "none")
(p1 + p2) + p3 + plot_layout(ncol = 1)
p3 + (p1 + p3) + plot_layout(ncol = 1)
Several packages available to generate maps but we will focus on ggplot2
, rgdal
and leaflet
First things first: Make sure you extract all files in the egyshp1.zip
archive I sent to you and put them in data/egyshp1/
. These were secured from this site
These files are also available from other locations, including diva-gis, here and several other locations
Make sure you have rgdal
and any dependencies asked for installed as well
Assuming the files were extracted and are in data/egyshp1/
we can read the shapefile and plot it to make sure it read fine
OGR data source with driver: ESRI Shapefile
Source: "/Users/ruhil/Documents/github/aniruhilorg/workshops/fulbright/handouts/data/egyshp1", layer: "egy_admbnda_adm1_capmas_20170421"
with 27 features
It has 16 fields
plot(egyshp1)
Maps are useful for showing distributions of variables by using colors to show different values or grouped values
For example, say I want to show the distribution of the highest wealth quintile in each Governorate. I’ll need a few things.
centroid
of each Governorate so I can label each Governorateggplot2
We have the shapefile so let us rebuild the survey-weighted estimates
Next we build the centroids
egypt1 <- fortify(egyshp1, region = "ADM1_EN")
egypt1map <- egypt1 %>%
arrange(id, group, order)
library(sp)
getLabelPoint <- function(govname){
Polygon(govname[c("long", "lat")])@labpt
}
centroids = by(egypt1map, egypt1map$id, getLabelPoint)
centroids2 <- do.call("rbind.data.frame", centroids)
centroids2$govname = rownames(centroids)
names(centroids2) <- c("clong", "clat", "govname")
head(centroids2)
clong clat govname
1 29.74268 30.83925 Alexandria
2 31.64003 27.33632 Assiut
3 32.58306 23.36420 Aswan
4 30.27643 30.62397 Behera
5 31.07534 28.80985 Beni Suef
6 31.59982 29.95994 Cairo
Now a simple map that uses a unique color for and labels each Governorate
library(ggthemes)
library(ggrepel)
ggplot() +
geom_polygon(data = egypt1map,
aes(x = long, y = lat, group = group, fill = id), color = "black") +
geom_text_repel(data = centroids2,
aes(x = clong, y = clat, label = govname), size = 2.25, force = 10) +
coord_fixed(1.3) +
theme_map() +
theme(legend.position = "none")
Now we replace the arbitray fill colors with colors based on the percent in the Highest wealth quintile in each governorate
… but this requires us to merge tab4
with egypt1map
, using governorate names as the merge key Unfortunately, the spellings differ slightly so we have to fix that first
tab.4b <- tab.4 %>%
select(c(11, 1:5)) %>%
tidyr::gather(Wealth, percent, 2:6) %>%
mutate(Wealth = stringr::str_to_title(Wealth))
tab.4b$Wealth <- ordered(tab.4b$Wealth,
levels = c("Poorest", "Poorer", "Middle", "Richer", "Richest")
)
tab.map <- tab.4b %>%
mutate(Governorate = gsub("Assuit", "Assiut", Governorate),
Governorate = gsub("Kafr El-Sheikh", "Kafr El-Shikh", Governorate),
Governorate = gsub("Kalyubia", "Kalyoubia", Governorate),
Governorate = gsub("Matroh", "Matrouh", Governorate),
Governorate = gsub("Menya", "Menia", Governorate),
Governorate = gsub("Souhag", "Suhag", Governorate)
)
map.df <- merge(tab.map, egypt1map, by.x = "Governorate", by.y = "id", all = TRUE)
map.df2 <- map.df %>%
filter(Wealth == "Richest") %>%
arrange(Governorate, group, order) %>%
mutate(Wealth = factor(Wealth, ordered = FALSE))
and now the map …
library(viridis)
ggplot() +
geom_polygon(data = map.df2, aes(x = long, y = lat, group = group, fill = percent),
color = "black") +
geom_text_repel(data = centroids2, aes(x = clong, y = clat, label = govname), size = 3.5, color = "red") +
coord_fixed(1.3) +
theme_map() +
labs(fill = "Highest Quintile (%)") +
theme(legend.position = "bottom") +
scale_fill_viridis("cividis", direction = -1)
.. and now adding specific locations
leaflet(width = "100%", height = "60%") %>%
setView(lat = 26.878860, lng = 32.324557, zoom = 5) %>%
addTiles() %>%
addMarkers(lat = 30.024065, lng = 31.207363,
popup = c("Cairo University, 26F4+4W Giza, Al Omraneyah, Egypt")) %>%
addMarkers(lat = 23.997723, lng = 32.859877,
popup = c("Aswan University, XVW5+JX Tingar, Manteqet as Sad Al Aali, Qism Aswan, Egypt")
)