ggplot
tips and tutorials from Dr. Jenny Brian’s [stats class]((http://stat545.com/graph00_index.html)ggplot2
cheatsheetmap_data()
from the maps
package. Advanced maps: r-spatial
tutorial, sf
cheatsheat and cartogrphy
cheatsheetmagick
Avoid statistical fallacies.
Our brains see patterns in distance, size, color, shape more easily than in numerals. Most analytic claims are about variation (comparisons and/or relationships/trends). Any numeric claim can be visualized.
“A picture is worth a thousand words.”
–
*and far, far, far superior to ten numbers!
–
“Always, always, always, plot the data” - Dr. Jenny Brian
tidy()
model output with broom
See these slides, this vignette, and this example.
lm(height ~ mass, data = starwars)
##
## Call:
## lm(formula = height ~ mass, data = starwars)
##
## Coefficients:
## (Intercept) mass
## 171.28536 0.02807
lm(height ~ mass, data = starwars) %>%
tidy(conf.int = TRUE)
## # A tibble: 2 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 171. 5.34 32.1 3.73e-38 161. 182.
## 2 mass 0.0281 0.0275 1.02 3.12e- 1 -0.0270 0.0832
"The p-value for `r m1$term[2]` is `r m1$p.value[2]`."
tidy()
model output is easy to plotm1 <- lm(birth_year ~ height + mass, data = starwars) %>%
tidy(conf.int = TRUE)
m1 %>%
ggplot( aes(y = term, x = estimate) ) +
geom_point() +
geom_errorbarh( aes(xmin = conf.low, xmax = conf.high) ) +
geom_vline(xintercept = 0) # vertical line at 0
"The p-value for `r m1$term[2]` is `r round( m1$p.value[2], 3)`."
“The p-value for height is 0.001.”
m1 <- lm(birth_year ~ height + mass, data = starwars) %>%
tidy(conf.int = TRUE)
m1 %>%
filter(term != "(Intercept)") %>%
ggplot( aes(y = term, x = estimate) ) +
geom_point() +
geom_errorbarh( aes(xmin = conf.low, xmax = conf.high)) +
geom_vline(xintercept = 0) # vertical line at 0
"The p-value for `r m1$term[2]` is `r round( m1$p.value[2], 3)`."
“The p-value for height is 0.001.”
m1 <- lm(birth_year ~ height + mass, data = starwars) %>%
tidy(conf.int = TRUE)
m1 %>%
filter(term != "(Intercept)") %>%
ggplot( aes(x = term, y = estimate) ) +
# geom_pointrange() is like geom_point() + geom_segment()
geom_pointrange( aes(ymin = conf.low, ymax = conf.high) ) +
geom_hline(yintercept = 0) # horizontal line at 0
"The p-value for `r m1$term[2]` is `r round( m1$p.value[2], 3)`."
“The p-value for height is 0.001.”
m1 <- lm(birth_year ~ height + mass, data = starwars) %>%
tidy(conf.int = TRUE)
m1 %>%
filter(term != "(Intercept)") %>%
ggplot( aes(x = term, y = estimate) ) +
# geom_pointrange() is like geom_point + geom_segment
geom_pointrange( aes(ymin = conf.low, ymax = conf.high) ) +
geom_hline(yintercept = 0) + # "horizontal" line at 0
coord_flip() # flip x and y axes
"The p-value for `r m1$term[2]` is `r round( m1$p.value[2], 3)`."
“The p-value for height is 0.001.”
–
starwars %>%
ggplot( aes(x = height, y = birth_year) ) +
geom_point( aes(size = mass) ) +
geom_label( aes(label = ifelse(birth_year > 750, name, NA)), color = "red")
starwars %>%
mutate(height_estimate = m1$estimate[1] + height*m1$estimate[2] ) %>%
ggplot( aes(x = height, y = birth_year) ) +
geom_point( aes(size = mass) ) +
geom_line( aes(y = height_estimate) ) +
geom_ribbon( aes( ymax = height_estimate + 1.96*sum(m1$std.error) ,
ymin = height_estimate - 1.96*sum(m1$std.error)),
alpha =.2)
*Note, for simplicity, I set mass equal to 0. What would be a better approach?
geom_smooth()
conditional meanstarwars %>%
ggplot( aes(x = height, y = birth_year) ) +
geom_point( aes(size = mass) ) +
geom_smooth(formula = "y ~ x", method = lm)
(quick & dirty bivariate regression)
starwars %>% filter(name != "Yoda") %>%
ggplot( aes(x = height, y = birth_year) ) +
geom_point( aes(size = mass) ) +
geom_smooth(formula = "y ~ x", method = "lm")
starwars %>% filter(!name %in% c("Yoda", "Jabba Desilijic Tiure") ) %>%
ggplot( aes(x = height, y = birth_year) ) +
geom_point( aes(size = mass) ) +
geom_smooth(formula = "y ~ x", method = "lm")
starwars %>%
mutate(Human = species == "Human") %>%
ggplot( aes(x = height, y = birth_year) ) +
geom_point( aes(size = mass) ) +
geom_smooth(formula = "y ~ x", method = "lm", aes(color = Human) )
ggplot()
aes
thetics map variables onto things people can see (position, color, etc.)
geom
s are plot layers (points, lines, etc.)
scale
s are how we show variation (negative = “red”, thousands = value/1000, etc.)
facet
s are mini-plots comparing subsets (“small multiples”)
starwars %>%
unnest(films) %>%
filter(films %in% c("Return of the Jedi", "The Force Awakens")) %>%
ggplot( aes(x = birth_year, y = height, color = species) ) +
geom_point( aes(size = mass), alpha = .5) +
geom_text(aes(label = name), hjust = 0, vjust = 0, check_overlap = T) +
scale_x_continuous(limits = c(0, 1000) ) +
facet_wrap("films") + # facet by film
labs(x = "Age", y = "Height", color = "Species", size = "Mass")
world <- map_data("world")
head(world)
## long lat group order region subregion
## 1 -69.89912 12.45200 1 1 Aruba <NA>
## 2 -69.89571 12.42300 1 2 Aruba <NA>
## 3 -69.94219 12.43853 1 3 Aruba <NA>
## 4 -70.00415 12.50049 1 4 Aruba <NA>
## 5 -70.06612 12.54697 1 5 Aruba <NA>
## 6 -70.05088 12.59707 1 6 Aruba <NA>
world %>% filter(region == "Aruba") %>%
ggplot( aes(x = long, y = lat, label = order) ) + geom_label()
world %>%
ggplot( aes(x = long, y = lat) ) +
# A map layer of country shapes (geom_polygon connects the dots)
geom_polygon( aes(group = group), fill = NA, color = "grey") +
# We are here:
geom_point(aes(x=-89.4012, y=43.0731), color = "Red")
wi <- map_data("county", "wisconsin")
wi %>%
group_by(subregion) %>%
ggplot( aes(x = long, y = lat, group = group) ) +
geom_point() +
geom_polygon(fill = NA, color = "grey60")
wi <- map_data("county", "wisconsin")
wi %>%
group_by(subregion) %>%
# Find the center of each subregion
mutate(center_lat = mean(range(lat) ),
center_long = mean(range(long) ) ) %>%
ggplot( aes(x = long, y = lat, group = group) ) +
geom_polygon(fill = NA, color = "grey60") +
geom_text( aes(x = center_long, y = center_lat, label = subregion), size = 2, angle = 45, check_overlap = T)
scale
Using GIS data with sf
(see full example, adapted from Kieran Healy).
canada_cd <- st_read("data/canada_cd_sim.geojson", quiet = TRUE)
canada_cd %>%
ggplot() +
geom_sf(mapping = aes(fill = PRNAME))
Change projection with st_transform
:
canada_cd %>%
# Lambert Conformal Conic projection
st_transform(crs = "+proj=lcc +lat_1=49 +lat_2=77 +lon_0=-91.52 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs") %>%
ggplot() +
geom_sf(mapping = aes(fill = PRNAME))
Summary stats with stat
= {count
, bin
, density
, smooth
, ecdf
, quantile
, etc.}. stat
applies some function to make a new variable (besides regression, nothing you could not have wrangled with dplyr
summary
or mutate
).
geom_histogram
lincoln_weather %>%
ggplot( aes(x = `Mean Temperature [F]`) ) +
geom_histogram(binwidth = 10) + # i.e. geom_bar(stat = count, binwidth = 10)
facet_grid(Month ~ .) +
theme(strip.text.y = element_text(angle = 0))
geom_density
lincoln_weather %>%
ggplot( aes(x = `Mean Temperature [F]`) ) +
geom_density() +
facet_grid(Month ~ .) +
theme(strip.text.y = element_text(angle = 0))
ggridges
, lets us use a y
scale instead of facets:
library(ggridges)
lincoln_weather %>%
ggplot( aes(x = `Mean Temperature [F]`, y = Month, fill = ..x..) ) +
geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
scale_fill_viridis_c() +
labs(title = "Temperatures in Lincoln NE in 2016", fill = "Temp. [F]")
*DO NOT save mouse-y style
ggsave()
in .Rggplot(...)
ggsave(here("Figs/figure-name.png"), width = 3, height = 3) # or .pdf etc.
knitr::opts_chunk$set(here(fig.path="Figs/"),
fig.height = 3,
fig.width = 3)