2019-11-04
dplyr
/ purrr
for efficient data manipulationbroom
tibble
ggplot
using relevant aestheticsinteractive session
Tutorial based on the great conference by Hadley Wickham
nest()
mtcars %>% group_nest(cyl) %>% mutate(model = map(data, ~lm(mpg ~ wt, data = .x)), summary = map(model, summary), r_squared = map_dbl(summary, "r.squared"))
# A tibble: 3 x 5 cyl data model summary r_squared <dbl> <list> <list> <list> <dbl> 1 4 <tibble [11 × 10]> <lm> <smmry.lm> 0.509 2 6 <tibble [7 × 10]> <lm> <smmry.lm> 0.465 3 8 <tibble [14 × 10]> <lm> <smmry.lm> 0.423
gapminder
05:00
gapminder
packagegapminder
and tidyverse
packages%>%
to pass gapminder
to ggplot()
life expectency
(lifeExp
in y
) ~ year
(x
)geom_line()
library(gapminder) gapminder %>% ggplot(aes(x = year, y = lifeExp, group = country)) + geom_line()
04:00
mutate()
named year1950
which is:year
- 1950group_nest()
the tibble by country
and continent
by_country
group_nest()
by_country <- gapminder %>% mutate(year1950 = year - 1950) %>% group_nest(continent, country) by_country
# A tibble: 142 x 3 continent country data <fct> <fct> <list> 1 Africa Algeria <tibble [12 × 5]> 2 Africa Angola <tibble [12 × 5]> 3 Africa Benin <tibble [12 × 5]> 4 Africa Botswana <tibble [12 × 5]> 5 Africa Burkina Faso <tibble [12 × 5]> 6 Africa Burundi <tibble [12 × 5]> 7 Africa Cameroon <tibble [12 × 5]> 8 Africa Central African Republic <tibble [12 × 5]> 9 Africa Chad <tibble [12 × 5]> 10 Africa Comoros <tibble [12 × 5]> # … with 132 more rows
data
year1950
will help to get meaningful interceptscontinent
to keep it along with country
gapminder %>% filter(country == "Germany") %>% select(-country, -continent)
# A tibble: 12 x 4 year lifeExp pop gdpPercap <int> <dbl> <int> <dbl> 1 1952 67.5 69145952 7144. 2 1957 69.1 71019069 10188. 3 1962 70.3 73739117 12902. 4 1967 70.8 76368453 14746. 5 1972 71 78717088 18016. 6 1977 72.5 78160773 20513. 7 1982 73.8 78335266 22032. 8 1987 74.8 77718298 24639. 9 1992 76.1 80597764 26505. 10 1997 77.3 82011073 27789. 11 2002 78.7 82350671 30036. 12 2007 79.4 82400996 32170.
by_country %>% filter(country == "Germany")
# A tibble: 1 x 3 continent country data <fct> <fct> <list> 1 Europe Germany <tibble [12 × 5]>
by_country %>% filter(country == "Germany") %>% unnest(data)
# A tibble: 12 x 7 continent country year lifeExp pop gdpPercap year1950 <fct> <fct> <int> <dbl> <int> <dbl> <dbl> 1 Europe Germany 1952 67.5 69145952 7144. 2 2 Europe Germany 1957 69.1 71019069 10188. 7 3 Europe Germany 1962 70.3 73739117 12902. 12 4 Europe Germany 1967 70.8 76368453 14746. 17 5 Europe Germany 1972 71 78717088 18016. 22 6 Europe Germany 1977 72.5 78160773 20513. 27 7 Europe Germany 1982 73.8 78335266 22032. 32 8 Europe Germany 1987 74.8 77718298 24639. 37 9 Europe Germany 1992 76.1 80597764 26505. 42 10 Europe Germany 1997 77.3 82011073 27789. 47 11 Europe Germany 2002 78.7 82350671 30036. 52 12 Europe Germany 2007 79.4 82400996 32170. 57
06:00
by_country
model
with linear regressions of lifeExp
on year1950
by_country_lm
mutate
or summarise
?data
), do you need to use map
?by_country_lm <- by_country %>% mutate(model = map(data, ~ lm(lifeExp ~ year1950, data = .x))) by_country_lm
# A tibble: 142 x 4 continent country data model <fct> <fct> <list> <list> 1 Africa Algeria <tibble [12 × 5]> <lm> 2 Africa Angola <tibble [12 × 5]> <lm> 3 Africa Benin <tibble [12 × 5]> <lm> 4 Africa Botswana <tibble [12 × 5]> <lm> 5 Africa Burkina Faso <tibble [12 × 5]> <lm> 6 Africa Burundi <tibble [12 × 5]> <lm> 7 Africa Cameroon <tibble [12 × 5]> <lm> 8 Africa Central African Republic <tibble [12 × 5]> <lm> 9 Africa Chad <tibble [12 × 5]> <lm> 10 Africa Comoros <tibble [12 × 5]> <lm> # … with 132 more rows
03:00
data
columnlifeExp
~ year1950
for Bulgaria by unnesting data
filter()
for the desired countryunnest()
raw data
ggplot()
by_country_lm %>% mutate(n = map_int(data, nrow)) %>% select(country, n)
# A tibble: 142 x 2 country n <fct> <int> 1 Algeria 12 2 Angola 12 3 Benin 12 4 Botswana 12 5 Burkina Faso 12 6 Burundi 12 7 Cameroon 12 8 Central African Republic 12 9 Chad 12 10 Comoros 12 # … with 132 more rows
by_country_lm %>% mutate(n = map_int(data, nrow)) %>% distinct(n)
# A tibble: 1 x 1 n <int> 1 12
by_country_lm %>% filter(country == "Bulgaria") %>% unnest(data) %>% ggplot(aes(x = year1950, y = lifeExp)) + geom_line()
06:00
summary
for the linear model of Rwandafilter()
for the desired countryx
th element, use the pluck("model", x)
purrr
syntaxsummary()
by_country_lm %>% filter(country == "Rwanda") %>% pluck("model", 1) %>% summary()
Call: lm(formula = lifeExp ~ year1950, data = .x) Residuals: Min 1Q Median 3Q Max -17.310 -1.445 2.410 3.073 6.021 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 42.83361 3.74890 11.426 4.63e-07 *** year1950 -0.04583 0.10969 -0.418 0.685 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 6.558 on 10 degrees of freedom Multiple R-squared: 0.01716, Adjusted R-squared: -0.08112 F-statistic: 0.1746 on 1 and 10 DF, p-value: 0.6849
\(r^2\) is close to 0, linearity sounds broken
broom
will cleanup lm elements into tibbles
06:00
by_country_lm
, add 4 new columns:
glance
, using the broom function on the model
columntidy
, using the broom function on the model
columnaugment
, using the broom function on the model
columnrsq
from the glance
columnmodels
map
when dealing with a list columnmap
, shortcut with quotes (like "r.squared"
) extract the specified variablemap
takes and returns a list. Use map_dbl()
to coerce output to doubleslibrary(broom) models <- by_country_lm %>% mutate(glance = map(model, glance), tidy = map(model, tidy), augment = map(model, augment), rsq = map_dbl(glance, "r.squared")) models
# A tibble: 142 x 8 continent country data model glance tidy augment rsq <fct> <fct> <list> <list> <list> <list> <list> <dbl> 1 Africa Algeria <tibble… <lm> <tibble… <tibbl… <tibble … 0.985 2 Africa Angola <tibble… <lm> <tibble… <tibbl… <tibble … 0.888 3 Africa Benin <tibble… <lm> <tibble… <tibbl… <tibble … 0.967 4 Africa Botswana <tibble… <lm> <tibble… <tibbl… <tibble … 0.0340 5 Africa Burkina Faso <tibble… <lm> <tibble… <tibbl… <tibble … 0.919 6 Africa Burundi <tibble… <lm> <tibble… <tibbl… <tibble … 0.766 7 Africa Cameroon <tibble… <lm> <tibble… <tibbl… <tibble … 0.680 8 Africa Central Afr… <tibble… <lm> <tibble… <tibbl… <tibble … 0.493 9 Africa Chad <tibble… <lm> <tibble… <tibbl… <tibble … 0.872 10 Africa Comoros <tibble… <lm> <tibble… <tibbl… <tibble … 0.997 # … with 132 more rows
05:00
country
~ rsq
rsq
): snake plotto reorder discrete values:
factor
forcats
packagefct_reorder()
to reorder according to a continuous variablelibrary(forcats) models %>% ggplot(aes(x = rsq, y = fct_reorder(country, rsq))) + geom_point(aes(colour = continent), alpha = 0.5) + theme_classic(18) + theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), legend.position = c(0.25, 0.75)) + guides(color = guide_legend( override.aes = list(alpha = 1))) + labs(x = "r square", y = "Country")
04:00
unnest
column data
lifeExp
~ year
with linesarrange(col)
will sort according to col
top_n(x, col)
not only sort col
but return only x
top entriestop_n(x, desc(col))
same but sort from lowest valuesmodels %>% top_n(20, desc(rsq)) %>% unnest(data) %>% ggplot(aes(x = year, y = lifeExp)) + geom_line(aes(colour = continent)) + facet_wrap(~ country) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom")
models %>% top_n(20, rsq) %>% unnest(data) %>% ggplot(aes(x = year, y = lifeExp)) + geom_line(aes(colour = continent)) + facet_wrap(~ country) + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "bottom")
year1950
?year
?filter(models, country == "Germany") %>% unnest(tidy) %>% select(rsq:estimate)
# A tibble: 2 x 6 rsq augment p.value statistic std.error estimate <dbl> <list> <dbl> <dbl> <dbl> <dbl> 1 0.990 <tibble [12 × 9]> 7.65e-21 282. 0.238 67.1 2 0.990 <tibble [12 × 9]> 3.15e-11 30.7 0.00696 0.214
# A tibble: 2 x 3 rsq term estimate <dbl> <chr> <dbl> 1 0.990 (Intercept) -350. 2 0.990 year 0.214
04:00
tidy
column)
continent
, country
and rsq
columns(Intercept)
name which needs to be called between backsticks ‘`’)scale_size_area()
for lisibility)geom_smooth(method = "loess")
models %>% unnest(tidy) %>% select(continent, country, rsq, term, estimate) %>% spread(term, estimate) %>% ggplot(aes(x = `(Intercept)`, y = year1950)) + geom_point(aes(colour = continent, size = rsq)) + geom_smooth(se = FALSE, method = "loess") + scale_size_area() + labs(x = "Life expectancy (1950)", y = "Yearly improvement")
library(gganimate) gapminder %>% ggplot(aes(x = gdpPercap, y = lifeExp, size = pop, color = continent)) + transition_time(year) + ease_aes("linear") + scale_size(range = c(2, 12)) + geom_point() + theme_bw(16) + labs(title = "Year: {frame_time}", x = "GDP per capita", y = "life expectancy") + scale_x_log10() -> p animate(p) anim_save("gapminder2.gif")