# Packages ----
library(tidyverse)
library(countdown)
library(scales)
library(ggthemes)
library(glue)
library(gt)         # "great" tables
library(MoMAColors) # color palettes inspired by artworks at MoMA

# Default ggplot2 theme ----
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# Width of printed code output ----
options(width = 65)

# knitr figure defaults ----
knitr::opts_chunk$set(
  fig.width = 7,        # 7" width
  fig.asp = 0.618,      # the golden ratio
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300             # higher dpi, sharper image
)
Transforming and reshaping a single data frame (cont.)
From last time: Monthly bookings
Any questions about recreating any aspect of this plot?
From last time: Monthly bookings
Reveal below for code for creating the previous plot.
Code
# Import the TidyTuesday hotel bookings data, then order the arrival months
# by `month.name` and derive a `season` factor from the arrival month.
hotels <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv"
) |>
  mutate(
    arrival_date_month = fct_relevel(arrival_date_month, month.name),
    season = case_when(
      arrival_date_month %in% c("December", "January", "February") ~ "Winter",
      arrival_date_month %in% c("March", "April", "May") ~ "Spring",
      arrival_date_month %in% c("June", "July", "August") ~ "Summer",
      TRUE ~ "Fall" # every remaining month
    ),
    season = fct_relevel(season, "Winter", "Spring", "Summer", "Fall")
  )

# Count bookings per season / hotel / arrival month and draw one line per
# hotel, with points whose shape and color encode the season.
hotels |>
  count(season, hotel, arrival_date_month) |>
  ggplot(
    aes(
      x = arrival_date_month,
      y = n,
      group = hotel,
      linetype = hotel
    )
  ) +
  geom_line(linewidth = 0.8, color = "cornsilk4") +
  geom_point(
    aes(shape = season, color = season),
    size = 4,
    show.legend = FALSE # only the linetype legend is shown
  ) +
  scale_x_discrete(labels = month.abb) +
  scale_color_colorblind() +
  scale_shape_manual(
    values = c("circle", "square", "diamond", "triangle")
  ) +
  labs(
    x = "Arrival month",
    y = "Number of bookings",
    linetype = NULL,
    title = "Number of monthly bookings",
    subtitle = "July 2015 to August 2017",
    caption = "Source: Antonio, Almeida and Nunes (2019) | TidyTuesday"
  ) +
  coord_cartesian(clip = "off") +
  theme(
    # NOTE(review): a numeric `legend.position` is deprecated as of
    # ggplot2 3.5.0 (replacement: `legend.position = "inside"` together with
    # `legend.position.inside`); kept as-is to preserve current behavior.
    legend.position = c(0.12, 0.9),
    legend.box.background = element_rect(fill = "white", color = "white"),
    plot.subtitle = element_text(color = "cornsilk4"),
    plot.caption = element_text(color = "cornsilk4")
  )
A few takeaways
forcats::fct_relevel() in a mutate() is useful for custom ordering of levels of a factor variable
summarize() after group_by() with multiple variables results in a message about the grouping structure of the resulting data frame – the message can be suppressed by defining .groups (e.g., .groups = "drop" or .groups = "keep")
summarize() also lets you get away with being sloppy and not naming your new column, but that’s not recommended!
Rowwise operations
We want to calculate the total number of guests for each booking. Why does the following not work?
# A tibble: 119,390 × 4
adults children babies guests
<dbl> <dbl> <dbl> <dbl>
1 2 0 0 NA
2 2 0 0 NA
3 1 0 0 NA
4 1 0 0 NA
5 2 0 0 NA
6 2 0 0 NA
7 2 0 0 NA
8 2 0 0 NA
9 2 0 0 NA
10 2 0 0 NA
# ℹ 119,380 more rows
Rowwise operations
# Total number of guests per booking: `rowwise()` makes `sum()` operate on
# one row at a time instead of on the whole columns.
hotels |>
  select(adults, children, babies) |>
  rowwise() |>
  mutate(guests = sum(adults, children, babies)) |>
  filter(adults > 0, children > 0, babies > 0) # to show sum works
# A tibble: 4 × 4
hotel is_canceled mean_stays_in_weekend_nights mean_stays_in_week_nights
<chr> <dbl> <dbl> <dbl>
1 City Hotel 0 0.801 2.12
2 City Hotel 1 0.788 2.27
3 Resort Hotel 0 1.13 3.01
4 Resort Hotel 1 1.34 3.44
# A tibble: 10 × 2
name profession
<chr> <chr>
1 Ada Lovelace Mathematician
2 Marie Curie Physicist and Chemist
3 Janaki Ammal Botanist
4 Chien-Shiung Wu Physicist
5 Katherine Johnson Mathematician
6 Rosalind Franklin Chemist
7 Vera Rubin Astronomer
8 Gladys West Mathematician
9 Flossie Wong-Staal Virologist and Molecular Biologist
10 Jennifer Doudna Biochemist
dates
# A tibble: 8 × 3
name birth_year death_year
<chr> <dbl> <dbl>
1 Janaki Ammal 1897 1984
2 Chien-Shiung Wu 1912 1997
3 Katherine Johnson 1918 2020
4 Rosalind Franklin 1920 1958
5 Vera Rubin 1928 2016
6 Gladys West 1930 NA
7 Flossie Wong-Staal 1947 NA
8 Jennifer Doudna 1964 NA
works
# A tibble: 9 × 2
name known_for
<chr> <chr>
1 Ada Lovelace first computer algorithm
2 Marie Curie theory of radioactivity, first woman Nobel Prize win
3 Janaki Ammal hybrid species, biodiversity protection
4 Chien-Shiung Wu experiment overturning theory of parity
5 Katherine Johnson orbital mechanics critical to sending first Americans into space
6 Vera Rubin existence of dark matter
7 Gladys West mathematical modeling of the shape of the Earth
8 Flossie Wong-Staal first to clone HIV and map its genes, which led to test for virus
9 Jennifer Doudna one of the primary developers of CRISPR
Desired output
# A tibble: 10 × 5
name profession birth_year death_year known_for
<chr> <chr> <dbl> <dbl> <chr>
1 Ada Lovelace Mathematician NA NA first computer algor…
2 Marie Curie Physicist and Chemist NA NA theory of radioactiv…
3 Janaki Ammal Botanist 1897 1984 hybrid species, biod…
4 Chien-Shiung Wu Physicist 1912 1997 experiment overturni…
5 Katherine Johnson Mathematician 1918 2020 orbital mechanics cr…
6 Rosalind Franklin Chemist 1920 1958 <NA>
7 Vera Rubin Astronomer 1928 2016 existence of dark ma…
8 Gladys West Mathematician 1930 NA mathematical modelin…
9 Flossie Wong-Staal Virologist and Molecular Biologist 1947 NA first to clone HIV a…
10 Jennifer Doudna Biochemist 1964 NA one of the primary d…
# A tibble: 10 × 4
name profession birth_year death_year
<chr> <chr> <dbl> <dbl>
1 Ada Lovelace Mathematician NA NA
2 Marie Curie Physicist and Chemist NA NA
3 Janaki Ammal Botanist 1897 1984
4 Chien-Shiung Wu Physicist 1912 1997
5 Katherine Johnson Mathematician 1918 2020
6 Rosalind Franklin Chemist 1920 1958
7 Vera Rubin Astronomer 1928 2016
8 Gladys West Mathematician 1930 NA
9 Flossie Wong-Staal Virologist and Molecular Biologist 1947 NA
10 Jennifer Doudna Biochemist 1964 NA
# A tibble: 10 × 4
name birth_year death_year known_for
<chr> <dbl> <dbl> <chr>
1 Janaki Ammal 1897 1984 hybrid species, biodiversity protection
2 Chien-Shiung Wu 1912 1997 experiment overturning theory of parity
3 Katherine Johnson 1918 2020 orbital mechanics critical to sending first Americans i…
4 Rosalind Franklin 1920 1958 <NA>
5 Vera Rubin 1928 2016 existence of dark matter
6 Gladys West 1930 NA mathematical modeling of the shape of the Earth
7 Flossie Wong-Staal 1947 NA first to clone HIV and map its genes, which led to test…
8 Jennifer Doudna 1964 NA one of the primary developers of CRISPR
9 Ada Lovelace NA NA first computer algorithm
10 Marie Curie NA NA theory of radioactivity, first woman Nobel Prize win
inner_join()
# Inner join of `x` and `y`, written in pipe form.
x |>
  inner_join(y)
# A tibble: 2 × 3
id value_x value_y
<dbl> <chr> <chr>
1 1 x1 y1
2 2 x2 y2
inner_join()
# Inner join of `dates` and `works`.
# The key is spelled out instead of relying on the natural-join default
# (all shared column names), which emits a "Joining with ..." message and
# would silently change if the two tables ever gained another common column.
dates |>
  inner_join(works, by = "name")
# A tibble: 7 × 4
name birth_year death_year known_for
<chr> <dbl> <dbl> <chr>
1 Janaki Ammal 1897 1984 hybrid species, biodiversity protection
2 Chien-Shiung Wu 1912 1997 experiment overturning theory of parity
3 Katherine Johnson 1918 2020 orbital mechanics critical to sending first Americans in…
4 Vera Rubin 1928 2016 existence of dark matter
5 Gladys West 1930 NA mathematical modeling of the shape of the Earth
6 Flossie Wong-Staal 1947 NA first to clone HIV and map its genes, which led to test …
7 Jennifer Doudna 1964 NA one of the primary developers of CRISPR
semi_join()
# Semi join of `x` against `y`, written in pipe form.
x |>
  semi_join(y)
# A tibble: 2 × 2
id value_x
<dbl> <chr>
1 1 x1
2 2 x2
semi_join()
# Semi join of `dates` against `works`.
# The key is spelled out instead of relying on the natural-join default
# (all shared column names), which emits a "Joining with ..." message and
# would silently change if the two tables ever gained another common column.
dates |>
  semi_join(works, by = "name")
# A tibble: 7 × 3
name birth_year death_year
<chr> <dbl> <dbl>
1 Janaki Ammal 1897 1984
2 Chien-Shiung Wu 1912 1997
3 Katherine Johnson 1918 2020
4 Vera Rubin 1928 2016
5 Gladys West 1930 NA
6 Flossie Wong-Staal 1947 NA
7 Jennifer Doudna 1964 NA
anti_join()
# Anti join of `x` against `y`, written in pipe form.
x |>
  anti_join(y)
# A tibble: 1 × 2
id value_x
<dbl> <chr>
1 3 x3
anti_join()
# Anti join of `dates` against `works`.
# The key is spelled out instead of relying on the natural-join default
# (all shared column names), which emits a "Joining with ..." message and
# would silently change if the two tables ever gained another common column.
dates |>
  anti_join(works, by = "name")
# A tibble: 1 × 3
name birth_year death_year
<chr> <dbl> <dbl>
1 Rosalind Franklin 1920 1958
# A tibble: 10 × 5
name profession birth_year death_year known_for
<chr> <chr> <dbl> <dbl> <chr>
1 Ada Lovelace Mathematician NA NA first computer algor…
2 Marie Curie Physicist and Chemist NA NA theory of radioactiv…
3 Janaki Ammal Botanist 1897 1984 hybrid species, biod…
4 Chien-Shiung Wu Physicist 1912 1997 experiment overturni…
5 Katherine Johnson Mathematician 1918 2020 orbital mechanics cr…
6 Rosalind Franklin Chemist 1920 1958 <NA>
7 Vera Rubin Astronomer 1928 2016 existence of dark ma…
8 Gladys West Mathematician 1930 NA mathematical modelin…
9 Flossie Wong-Staal Virologist and Molecular Biologist 1947 NA first to clone HIV a…
10 Jennifer Doudna Biochemist 1964 NA one of the primary d…
*_join() functions
From dplyr
Incredibly useful for bringing datasets with common information (e.g., unique identifier) together
Use the by argument when the names of the columns containing the common information are not the same across datasets
Always check that the numbers of rows and columns of the resulting dataset make sense