2019-11-04
dplyr / purrr for efficient data manipulationbroomtibbleggplot using relevant aestheticsinteractive session
Tutorial based on the great conference by Hadley Wickham
nest()mtcars %>%
group_nest(cyl) %>%
mutate(model = map(data, ~lm(mpg ~ wt, data = .x)),
summary = map(model, summary),
r_squared = map_dbl(summary, "r.squared"))
# A tibble: 3 x 5
cyl data model summary r_squared
<dbl> <list> <list> <list> <dbl>
1 4 <tibble [11 × 10]> <lm> <smmry.lm> 0.509
2 6 <tibble [7 × 10]> <lm> <smmry.lm> 0.465
3 8 <tibble [14 × 10]> <lm> <smmry.lm> 0.423gapminder05:00
gapminder packagegapminder and tidyverse packages%>% to pass gapminder to ggplot()life expectency (lifeExp in y) ~ year (x)geom_line()library(gapminder) gapminder %>% ggplot(aes(x = year, y = lifeExp, group = country)) + geom_line()
04:00
mutate() named year1950 which is:year - 1950group_nest() the tibble by country and continentby_countrygroup_nest()by_country <- gapminder %>% mutate(year1950 = year - 1950) %>% group_nest(continent, country) by_country
# A tibble: 142 x 3 continent country data <fct> <fct> <list> 1 Africa Algeria <tibble [12 × 5]> 2 Africa Angola <tibble [12 × 5]> 3 Africa Benin <tibble [12 × 5]> 4 Africa Botswana <tibble [12 × 5]> 5 Africa Burkina Faso <tibble [12 × 5]> 6 Africa Burundi <tibble [12 × 5]> 7 Africa Cameroon <tibble [12 × 5]> 8 Africa Central African Republic <tibble [12 × 5]> 9 Africa Chad <tibble [12 × 5]> 10 Africa Comoros <tibble [12 × 5]> # … with 132 more rows
datayear1950 will help to get meaningful interceptscontinent to keep it along with countrygapminder %>% filter(country == "Germany") %>% select(-country, -continent)
# A tibble: 12 x 4
year lifeExp pop gdpPercap
<int> <dbl> <int> <dbl>
1 1952 67.5 69145952 7144.
2 1957 69.1 71019069 10188.
3 1962 70.3 73739117 12902.
4 1967 70.8 76368453 14746.
5 1972 71 78717088 18016.
6 1977 72.5 78160773 20513.
7 1982 73.8 78335266 22032.
8 1987 74.8 77718298 24639.
9 1992 76.1 80597764 26505.
10 1997 77.3 82011073 27789.
11 2002 78.7 82350671 30036.
12 2007 79.4 82400996 32170.by_country %>% filter(country == "Germany")
# A tibble: 1 x 3 continent country data <fct> <fct> <list> 1 Europe Germany <tibble [12 × 5]>
by_country %>% filter(country == "Germany") %>% unnest(data)
# A tibble: 12 x 7 continent country year lifeExp pop gdpPercap year1950 <fct> <fct> <int> <dbl> <int> <dbl> <dbl> 1 Europe Germany 1952 67.5 69145952 7144. 2 2 Europe Germany 1957 69.1 71019069 10188. 7 3 Europe Germany 1962 70.3 73739117 12902. 12 4 Europe Germany 1967 70.8 76368453 14746. 17 5 Europe Germany 1972 71 78717088 18016. 22 6 Europe Germany 1977 72.5 78160773 20513. 27 7 Europe Germany 1982 73.8 78335266 22032. 32 8 Europe Germany 1987 74.8 77718298 24639. 37 9 Europe Germany 1992 76.1 80597764 26505. 42 10 Europe Germany 1997 77.3 82011073 27789. 47 11 Europe Germany 2002 78.7 82350671 30036. 52 12 Europe Germany 2007 79.4 82400996 32170. 57
06:00
by_countrymodel with linear regressions of lifeExp on year1950by_country_lmmutate or summarise?data), do you need to use map?by_country_lm <- by_country %>% mutate(model = map(data, ~ lm(lifeExp ~ year1950, data = .x))) by_country_lm
# A tibble: 142 x 4 continent country data model <fct> <fct> <list> <list> 1 Africa Algeria <tibble [12 × 5]> <lm> 2 Africa Angola <tibble [12 × 5]> <lm> 3 Africa Benin <tibble [12 × 5]> <lm> 4 Africa Botswana <tibble [12 × 5]> <lm> 5 Africa Burkina Faso <tibble [12 × 5]> <lm> 6 Africa Burundi <tibble [12 × 5]> <lm> 7 Africa Cameroon <tibble [12 × 5]> <lm> 8 Africa Central African Republic <tibble [12 × 5]> <lm> 9 Africa Chad <tibble [12 × 5]> <lm> 10 Africa Comoros <tibble [12 × 5]> <lm> # … with 132 more rows
03:00
data columnlifeExp ~ year1950 for Bulgaria by unnesting datafilter() for the desired countryunnest() raw dataggplot()by_country_lm %>% mutate(n = map_int(data, nrow)) %>% select(country, n)
# A tibble: 142 x 2 country n <fct> <int> 1 Algeria 12 2 Angola 12 3 Benin 12 4 Botswana 12 5 Burkina Faso 12 6 Burundi 12 7 Cameroon 12 8 Central African Republic 12 9 Chad 12 10 Comoros 12 # … with 132 more rows
by_country_lm %>% mutate(n = map_int(data, nrow)) %>% distinct(n)
# A tibble: 1 x 1
n
<int>
1 12by_country_lm %>% filter(country == "Bulgaria") %>% unnest(data) %>% ggplot(aes(x = year1950, y = lifeExp)) + geom_line()
06:00
summary for the linear model of Rwandafilter() for the desired countryxth element, use the pluck("model", x) purrr syntaxsummary()by_country_lm %>%
filter(country == "Rwanda") %>%
pluck("model", 1) %>%
summary()
Call:
lm(formula = lifeExp ~ year1950, data = .x)
Residuals:
Min 1Q Median 3Q Max
-17.310 -1.445 2.410 3.073 6.021
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 42.83361 3.74890 11.426 4.63e-07 ***
year1950 -0.04583 0.10969 -0.418 0.685
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.558 on 10 degrees of freedom
Multiple R-squared: 0.01716, Adjusted R-squared: -0.08112
F-statistic: 0.1746 on 1 and 10 DF, p-value: 0.6849\(r^2\) is close to 0, linearity sounds broken
broom will cleanup lm elements into tibbles
06:00
by_country_lm, add 4 new columns:
glance, using the broom function on the model columntidy, using the broom function on the model columnaugment, using the broom function on the model columnrsq from the glance columnmodelsmap when dealing with a list columnmap, shortcut with quotes (like "r.squared") extract the specified variablemap takes and returns a list. Use map_dbl() to coerce output to doubleslibrary(broom)
models <- by_country_lm %>%
mutate(glance = map(model, glance),
tidy = map(model, tidy),
augment = map(model, augment),
rsq = map_dbl(glance, "r.squared"))
models
# A tibble: 142 x 8 continent country data model glance tidy augment rsq <fct> <fct> <list> <list> <list> <list> <list> <dbl> 1 Africa Algeria <tibble… <lm> <tibble… <tibbl… <tibble … 0.985 2 Africa Angola <tibble… <lm> <tibble… <tibbl… <tibble … 0.888 3 Africa Benin <tibble… <lm> <tibble… <tibbl… <tibble … 0.967 4 Africa Botswana <tibble… <lm> <tibble… <tibbl… <tibble … 0.0340 5 Africa Burkina Faso <tibble… <lm> <tibble… <tibbl… <tibble … 0.919 6 Africa Burundi <tibble… <lm> <tibble… <tibbl… <tibble … 0.766 7 Africa Cameroon <tibble… <lm> <tibble… <tibbl… <tibble … 0.680 8 Africa Central Afr… <tibble… <lm> <tibble… <tibbl… <tibble … 0.493 9 Africa Chad <tibble… <lm> <tibble… <tibbl… <tibble … 0.872 10 Africa Comoros <tibble… <lm> <tibble… <tibbl… <tibble … 0.997 # … with 132 more rows
05:00
country ~ rsqrsq): snake plotto reorder discrete values:
factorforcats packagefct_reorder() to reorder according to a continuous variablelibrary(forcats)
models %>%
ggplot(aes(x = rsq,
y = fct_reorder(country,
rsq))) +
geom_point(aes(colour = continent),
alpha = 0.5) +
theme_classic(18) +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = c(0.25, 0.75)) +
guides(color = guide_legend(
override.aes = list(alpha = 1))) +
labs(x = "r square",
y = "Country") 04:00
unnest column datalifeExp ~ year with linesarrange(col) will sort according to coltop_n(x, col) not only sort col but return only x top entriestop_n(x, desc(col)) same but sort from lowest valuesmodels %>%
top_n(20, desc(rsq)) %>%
unnest(data) %>%
ggplot(aes(x = year, y = lifeExp)) +
geom_line(aes(colour = continent)) +
facet_wrap(~ country) +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
legend.position = "bottom")models %>%
top_n(20, rsq) %>%
unnest(data) %>%
ggplot(aes(x = year, y = lifeExp)) +
geom_line(aes(colour = continent)) +
facet_wrap(~ country) +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
legend.position = "bottom")year1950?year?filter(models, country == "Germany") %>% unnest(tidy) %>% select(rsq:estimate)
# A tibble: 2 x 6
rsq augment p.value statistic std.error estimate
<dbl> <list> <dbl> <dbl> <dbl> <dbl>
1 0.990 <tibble [12 × 9]> 7.65e-21 282. 0.238 67.1
2 0.990 <tibble [12 × 9]> 3.15e-11 30.7 0.00696 0.214# A tibble: 2 x 3
rsq term estimate
<dbl> <chr> <dbl>
1 0.990 (Intercept) -350.
2 0.990 year 0.21404:00
tidy column)
continent, country and rsq columns(Intercept) name which needs to be called between backsticks ‘`’)scale_size_area() for lisibility)geom_smooth(method = "loess")models %>% unnest(tidy) %>% select(continent, country, rsq, term, estimate) %>% spread(term, estimate) %>% ggplot(aes(x = `(Intercept)`, y = year1950)) + geom_point(aes(colour = continent, size = rsq)) + geom_smooth(se = FALSE, method = "loess") + scale_size_area() + labs(x = "Life expectancy (1950)", y = "Yearly improvement")
library(gganimate)
gapminder %>%
ggplot(aes(x = gdpPercap,
y = lifeExp,
size = pop,
color = continent)) +
transition_time(year) +
ease_aes("linear") +
scale_size(range = c(2, 12)) +
geom_point() +
theme_bw(16) +
labs(title = "Year: {frame_time}",
x = "GDP per capita",
y = "life expectancy") +
scale_x_log10() -> p
animate(p)
anim_save("gapminder2.gif")