Required reading

Resources

Data viz in R


Why we plot

1. To not lie

Avoid statistical fallacies.

2. To be clear

Our brains see patterns in distance, size, color, shape more easily than in numerals. Most analytic claims are about variation (comparisons and/or relationships/trends). Any numeric claim can be visualized.

“A picture is worth a thousand words.”

*and far, far, far superior to ten numbers!

“Always, always, always, plot the data” - Dr. Jenny Brian


tidy() model output with broom

See these slides, this vignette, and this example.

lm(height ~ mass, data = starwars)
## 
## Call:
## lm(formula = height ~ mass, data = starwars)
## 
## Coefficients:
## (Intercept)         mass  
##   171.28536      0.02807
lm(height ~ mass, data = starwars) %>%
  tidy(conf.int = TRUE)
## # A tibble: 2 x 7
##   term        estimate std.error statistic  p.value  conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
## 1 (Intercept) 171.        5.34       32.1  3.73e-38  161.      182.    
## 2 mass          0.0281    0.0275      1.02 3.12e- 1   -0.0270    0.0832
"The p-value for `r m1$term[2]` is `r m1$p.value[2]`."

tidy() model output is easy to plot

m1 <- lm(birth_year ~ height + mass, data = starwars) %>% 
  tidy(conf.int = TRUE) 

m1 %>% 
  ggplot( aes(y = term, x = estimate) ) +
  geom_point() + 
  geom_errorbarh( aes(xmin = conf.low, xmax = conf.high) ) +
  geom_vline(xintercept = 0) # vertical line at 0

"The p-value for `r m1$term[2]` is `r round( m1$p.value[2], 3)`."

“The p-value for height is 0.001.”


Omit the intercept

m1 <- lm(birth_year ~ height + mass, data = starwars) %>% 
  tidy(conf.int = TRUE) 

m1 %>%
  filter(term != "(Intercept)") %>% 
  ggplot( aes(y = term, x = estimate) ) +
  geom_point() + 
  geom_errorbarh( aes(xmin = conf.low, xmax = conf.high)) +
  geom_vline(xintercept = 0) # vertical line at 0

"The p-value for `r m1$term[2]` is `r round( m1$p.value[2], 3)`."

“The p-value for height is 0.001.”


Pointrange

m1 <- lm(birth_year ~ height + mass, data = starwars) %>% 
  tidy(conf.int = TRUE) 

m1 %>% 
  filter(term != "(Intercept)") %>% 
  ggplot( aes(x = term, y = estimate) ) +
  # geom_pointrange() is like geom_point() + geom_segment()
  geom_pointrange( aes(ymin = conf.low, ymax = conf.high) ) + 
  geom_hline(yintercept = 0) # horizontal line at 0

"The p-value for `r m1$term[2]` is `r round( m1$p.value[2], 3)`."

“The p-value for height is 0.001.”


Pointrange + coord_flip()

m1 <- lm(birth_year ~ height + mass, data = starwars) %>% 
  tidy(conf.int = TRUE) 

m1 %>% 
  filter(term != "(Intercept)") %>% 
  ggplot( aes(x = term, y = estimate) ) +
  # geom_pointrange() is like geom_point + geom_segment
  geom_pointrange( aes(ymin = conf.low, ymax = conf.high) ) + 
  geom_hline(yintercept = 0) + # "horizontal" line at 0
  coord_flip() # flip x and y axes

"The p-value for `r m1$term[2]` is `r round( m1$p.value[2], 3)`."

“The p-value for height is 0.001.”


Always plot the data.

starwars %>% 
ggplot( aes(x = height, y = birth_year) ) + 
  geom_point( aes(size = mass) ) + 
  geom_label( aes(label = ifelse(birth_year > 750, name, NA)), color = "red")


Ideally, contrast data with claims.

starwars %>%   
  mutate(height_estimate = m1$estimate[1] + height*m1$estimate[2] )  %>% 
ggplot( aes(x = height, y = birth_year) ) + 
  geom_point( aes(size = mass) ) +  
  geom_line( aes(y = height_estimate) ) + 
  geom_ribbon( aes( ymax = height_estimate + 1.96*sum(m1$std.error) ,
                    ymin = height_estimate - 1.96*sum(m1$std.error)),
               alpha =.2)

*Note, for simplicity, I set mass equal to 0. What would be a better approach?


geom_smooth() conditional mean

starwars %>%   
ggplot( aes(x = height, y = birth_year) ) + 
  geom_point( aes(size = mass) ) +  
  geom_smooth(formula =  "y ~ x", method = lm)

(quick & dirty bivariate regression)


!

starwars %>% filter(name != "Yoda") %>% 
ggplot( aes(x = height, y = birth_year) ) + 
  geom_point( aes(size = mass) ) + 
  geom_smooth(formula =  "y ~ x", method = "lm")  


!!

starwars %>% filter(!name %in% c("Yoda", "Jabba Desilijic Tiure") ) %>% 
ggplot( aes(x = height, y = birth_year) ) + 
  geom_point( aes(size = mass) ) + 
  geom_smooth(formula =  "y ~ x", method = "lm")  


Regression fit by group

starwars %>%
  mutate(Human = species == "Human") %>% 
ggplot( aes(x = height, y = birth_year) ) + 
  geom_point( aes(size = mass) ) + 
  geom_smooth(formula =  "y ~ x", method = "lm", aes(color = Human) )  



Truth, beauty, ggplot()


aesthetics map variables onto things people can see (position, color, etc.)

geoms are plot layers (points, lines, etc.)

scales are how we show variation (negative = “red”, thousands = value/1000, etc.)

facets are mini-plots comparing subsets (“small multiples”)

starwars %>%   
  unnest(films) %>% 
  filter(films %in% c("Return of the Jedi", "The Force Awakens")) %>% 
ggplot( aes(x = birth_year, y = height, color = species) ) + 
  geom_point( aes(size = mass), alpha = .5) + 
  geom_text(aes(label = name),  hjust = 0, vjust = 0, check_overlap = T) + 
  scale_x_continuous(limits = c(0, 1000) ) + 
  facet_wrap("films") + # facet by film
  labs(x = "Age", y = "Height", color = "Species", size = "Mass")


Mapping

world <- map_data("world")  

head(world)
##        long      lat group order region subregion
## 1 -69.89912 12.45200     1     1  Aruba      <NA>
## 2 -69.89571 12.42300     1     2  Aruba      <NA>
## 3 -69.94219 12.43853     1     3  Aruba      <NA>
## 4 -70.00415 12.50049     1     4  Aruba      <NA>
## 5 -70.06612 12.54697     1     5  Aruba      <NA>
## 6 -70.05088 12.59707     1     6  Aruba      <NA>
world %>% filter(region == "Aruba") %>% 
ggplot( aes(x = long, y = lat, label = order) ) + geom_label()  


world %>%   
ggplot( aes(x = long, y = lat) ) +  
  # A map layer of country shapes (geom_polygon connects the dots)
  geom_polygon( aes(group = group), fill = NA, color = "grey") +
  # We are here: 
  geom_point(aes(x=-89.4012, y=43.0731), color = "Red") 


wi <- map_data("county", "wisconsin")

wi %>% 
  group_by(subregion) %>%
ggplot( aes(x = long, y = lat, group = group) ) +
  geom_point() +
  geom_polygon(fill = NA, color = "grey60")


wi <- map_data("county", "wisconsin")

wi %>% 
  group_by(subregion) %>%
  # Find the center of each subregion
  mutate(center_lat = mean(range(lat) ),
         center_long = mean(range(long) ) ) %>%
ggplot( aes(x = long, y = lat, group = group) ) +
  geom_polygon(fill = NA, color = "grey60") +
  geom_text( aes(x = center_long, y = center_lat, label = subregion), size = 2, angle = 45, check_overlap = T)


scale


Using GIS data with sf (see full example, adapted from Kieran Healy).

canada_cd <- st_read("data/canada_cd_sim.geojson", quiet = TRUE) 

canada_cd %>% 
ggplot() + 
  geom_sf(mapping = aes(fill = PRNAME)) 


Change projection with st_transform:

canada_cd %>% 
  # Lambert Conformal Conic projection
  st_transform(crs = "+proj=lcc +lat_1=49 +lat_2=77 +lon_0=-91.52 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs") %>%
ggplot() + 
  geom_sf(mapping = aes(fill = PRNAME)) 


Transformations

Summary stats with stat= {count, bin, density, smooth, ecdf, quantile, etc.}. stat applies some function to make a new variable (besides regression, nothing you could not have wrangled with dplyr summary or mutate).


Distributions with geom_histogram

lincoln_weather %>% 
ggplot( aes(x = `Mean Temperature [F]`) ) +
  geom_histogram(binwidth = 10) + # i.e. geom_bar(stat = count, binwidth = 10)
  facet_grid(Month ~ .) + 
  theme(strip.text.y = element_text(angle = 0))


Distributions with geom_density

lincoln_weather %>% 
ggplot( aes(x = `Mean Temperature [F]`) ) +
  geom_density() +
  facet_grid(Month ~ .) + 
  theme(strip.text.y = element_text(angle = 0))


ggridges, lets us use a y scale instead of facets:

library(ggridges)

lincoln_weather %>% 
ggplot( aes(x = `Mean Temperature [F]`, y = Month, fill = ..x..) ) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c() +
  labs(title = "Temperatures in Lincoln NE in 2016", fill = "Temp. [F]")


Saving figures

*DO NOT save mouse-y style


ggsave() in .R

ggplot(...) 
ggsave(here("Figs/figure-name.png"), width = 3, height = 3) # or .pdf etc.

In .Rmd, figures save when you knit

knitr::opts_chunk$set(here(fig.path="Figs/"),
                      fig.height = 3,
                      fig.width = 3)

Chunk names = figure names!