Visualizing time series data II

Lecture 6

Dr. Mine Çetinkaya-Rundel

Duke University
STA 313 - Spring 2026

Warm up

Dataviz of the day

What, if anything, is wrong with the following visualization of median yearly total compensation for data scientists in the US by years of experience (Source: Glassdoor)?

Dataviz of the day - annotated

Dataviz of the day - revisited

Code
library(tidyverse)
library(scales)
ds_salaries <- tribble(
  ~experience , ~salary , ~label        ,
   0.5        ,  117276 , "0-1 years"   ,
   2          ,  128403 , "1-3 years"   ,
   5          ,  141390 , "4-6 years"   ,
   8          ,  152966 , "7-9 years"   ,
  12          ,  166818 , "10-14 years" ,
  15          ,  189884 , "15+ years"
)

p_kdnuggets <- ggplot(
  ds_salaries,
  aes(x = experience * 3, y = salary)
) +
  geom_text(
    aes(y = salary + 20000, label = str_to_upper(label)),
    color = "lightgray",
    fontface = "bold",
    size = 3
  ) +
  geom_text(
    aes(y = salary + 10000, label = dollar(salary)),
    color = "#75b6be",
    fontface = "bold",
    size = 4
  ) +
  theme_void() +
  theme(
    plot.background = element_rect(fill = "black"),
    plot.title = element_text(
      color = "#e8c95c",
      face = "bold",
      hjust = 0.5,
      size = 25
    ),
    axis.title.x = element_text(
      color = "white",
      size = 10,
      margin = margin(t = 2.5, b = 2.5, l = 0, r = 0)
    )
  ) +
  labs(
    x = "Median yearly total compensation for data scientists in the US by years of experience (Glassdoor)",
    y = NULL,
    title = "Data Science Salaries\nby Experience Level"
  )

p_kdnuggets +
  geom_col(width = 4, fill = "#e8c95c")

Dataviz of the day - revisited further

Code
ggplot(
  ds_salaries,
  aes(x = experience * 3, y = salary)
) +
  geom_line(linewidth = 2, color = "#e8c95c") +
  geom_point(
    size = 4,
    color = "#e8c95c",
    fill = "black",
    shape = "circle filled",
    stroke = 2
  ) +
  labs(
    x = "Median yearly total compensation for data scientists in the US by years of experience (Glassdoor)",
    y = NULL,
    title = "Data Science Salaries\nby Experience Level"
  ) +
  geom_text(
    aes(y = salary + 15000, label = str_to_upper(label)),
    color = "lightgray",
    fontface = "bold",
    size = 3
  ) +
  geom_text(
    aes(y = salary + 10000, label = dollar(salary)),
    color = "#75b6be",
    fontface = "bold",
    size = 4
  ) +
  theme_void() +
  theme(
    plot.background = element_rect(fill = "black"),
    plot.title = element_text(
      color = "#e8c95c",
      face = "bold",
      hjust = 0.5,
      size = 25
    ),
    axis.title.x = element_text(
      color = "white",
      size = 10,
      margin = margin(t = 2.5, b = 2.5, l = 0, r = 0)
    )
  )

Announcements

  • HW 1 recap:

    • Those with failing actions got a chance to resolve without penalty (one-time-only!)
    • Late submissions: Due today at 5 pm
  • Project 1 teams are posted at vizdata-s26/teams

  • Tomorrow’s lab:

    • Submit team name
    • Work on proposal
    • All team members must be there! If you have an excused absence, reach out to your team members today
    • Time permitting: Start HW 2

Setup

# load packages
library(tidyverse)
library(scales)
library(ggthemes)
library(colorspace)
library(ggrepel)
library(ggpp) # enhances support for data labels + annotations
library(broom)
library(popthemes) # pak::pak("johnmackintosh/popthemes")

# set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 7, # 7" width
  fig.asp = 0.618, # the golden ratio
  fig.retina = 3, # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300 # higher dpi, sharper image
)

From last time

Data: Unemployment

Data load and prep code
unemp_bachelor <- read_csv(here::here(
  "slides/06",
  "data/fred",
  "CGBD25O.csv"
)) |>
  rename(date = observation_date, unemp_rate = CGBD25O) |>
  mutate(degree = "Bachelor's")
unemp_master <- read_csv(here::here(
  "slides/06",
  "data/fred",
  "CGMD25O.csv"
)) |>
  rename(date = observation_date, unemp_rate = CGMD25O) |>
  mutate(degree = "Master's")
unemp_doctoral <- read_csv(here::here(
  "slides/06",
  "data/fred",
  "CGDD25O.csv"
)) |>
  rename(date = observation_date, unemp_rate = CGDD25O) |>
  mutate(degree = "Doctoral")
unemp <- bind_rows(unemp_bachelor, unemp_master, unemp_doctoral) |>
  mutate(degree = fct_relevel(degree, "Bachelor's", "Master's", "Doctoral"))
unemp
# A tibble: 936 × 3
   date       unemp_rate degree    
   <date>          <dbl> <fct>     
 1 2000-01-01        2.1 Bachelor's
 2 2000-02-01        1.8 Bachelor's
 3 2000-03-01        1.7 Bachelor's
 4 2000-04-01        1.5 Bachelor's
 5 2000-05-01        1.7 Bachelor's
 6 2000-06-01        1.8 Bachelor's
 7 2000-07-01        1.9 Bachelor's
 8 2000-08-01        2.3 Bachelor's
 9 2000-09-01        2   Bachelor's
10 2000-10-01        1.6 Bachelor's
# ℹ 926 more rows

Unemployment for Bachelor’s & Master’s

In ae-04, finish recreating the following visualization.

Livecoding

Reveal below for code developed during live coding session.

Code
unemp_bachelor_master_wider <- unemp |>
  filter(degree %in% c("Bachelor's", "Master's")) |>
  pivot_wider(names_from = degree, values_from = unemp_rate)

recession_end_dates <- c(
  ymd("2001-11-01"),
  ymd("2009-06-01"),
  ymd("2020-04-01")
)
endpoints <- c(
  ymd("2000-01-01"),
  ymd("2025-12-01")
)
gray_dark_hex <- "#595959"
gray_light_hex <- "#737373"

ggplot(unemp_bachelor_master_wider, aes(x = `Bachelor's`, y = `Master's`)) +
  geom_line(
    aes(color = date),
    linewidth = 1,
    alpha = 0.5,
    show.legend = FALSE
  ) +
  scale_x_continuous(
    labels = label_percent(scale = 1),
    breaks = seq(0, 10, 2),
    limit = c(0, 12)
  ) +
  scale_y_continuous(
    labels = label_percent(scale = 1),
    breaks = seq(0, 10, 2),
    limit = c(0, 8)
  ) +
  geom_text_repel(
    data = unemp_bachelor_master_wider |>
      filter(date %in% recession_end_dates),
    aes(label = as.character(format(date, "%b %e, %Y"))),
    position = ggpp::position_nudge_keep(x = 2)
  ) +
  # recessions
  geom_text_repel(
    data = unemp_bachelor_master_wider |>
      filter(date %in% endpoints),
    aes(
      label = paste(
        as.character(format(date, "%b %e, %Y")),
        c("(Start date)", "(End date)")
      )
    ),
    position = ggpp::position_nudge_keep(x = 3, y = -1),
    color = gray_dark_hex
  ) +
  annotate(
    geom = "label",
    x = 4,
    y = 6.5,
    label = str_wrap(
      "Dates annotated in black indicate end dates of U.S. recessions determined by the National Bureau of Economic Research (NBER).",
      30
    ),
    size = 3,
    hjust = 0,
    fill = "white"
  ) +
  # endpoints
  annotate(
    geom = "label",
    x = 8,
    y = 1,
    label = str_wrap(
      "Dates annotated in gray indicate start and end dates of the data.",
      25
    ),
    size = 3,
    hjust = 0,
    fill = "white",
    color = gray_dark_hex
  ) +
  # diagonal
  geom_abline(
    slope = 1,
    intercept = 0,
    linetype = "dotted",
    color = gray_light_hex
  ) +
  annotate(
    geom = "segment",
    x = 8,
    xend = 8.8,
    y = 8,
    yend = 8,
    linewidth = 0.2,
    color = gray_light_hex
  ) +
  annotate(
    geom = "label",
    x = 9,
    y = 8,
    label = str_wrap(
      "Dotted line represents equal unemployment rate for Master's and Bachelor's degree holders.",
      30
    ),
    size = 3,
    hjust = 0,
    fill = gray_light_hex,
    color = "white"
  ) +
  labs(
    title = "Unemployment rate",
    subtitle = "25 years and over",
    x = "Bachelor's degree holders",
    y = "Master's degree holders",
    caption = "Source: U.S. Bureau of Labor Statistics via FRED"
  ) +
  theme(plot.caption = element_text(color = "darkgray", hjust = 0)) +
  coord_equal(clip = "off")
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_line()`).

Critical requirements

Connected scatter plots require:

  1. Clear directional indicators (arrows, color gradients)
  2. Temporal scale information
  3. Careful annotation

Without these elements, the visualization becomes confusing.

Recap

Choosing the right approach

The choice depends on:

  • Data characteristics: Density, regularity, number of series
  • Analytical goals: Trends vs. individual values
  • Audience familiarity: Connected scatter plots require more reader sophistication

Takeaways

  • Time imposes structure – use it with connected visualizations
  • Progress from points → lines → areas based on data density and goals
  • For multiple series, use direct labeling over legends
  • Connected scatter plots reveal cyclical patterns but require careful design
  • Always consider whether lines represent data or interpolation

Working with dates

Reminder: Air Quality Index

  • The AQI is the Environmental Protection Agency’s index for reporting air quality

  • Higher values of AQI indicate worse air quality

AQI Basics for Ozone and Particle Pollution

AQI data

  • Source: EPA’s Daily Air Quality Tracker

  • 2016 - 2025 AQI (Ozone and PM2.5 combined) for Durham-Chapel Hill, NC core-based statistical area (CBSA), one file per year

  • 2016 - 2025 AQI (Ozone and PM2.5 combined) for San Francisco-Oakland-Hayward, CA CBSA, one file per year

2025 Durham-Chapel Hill

dim(dch_2025)
[1] 365  11
names(dch_2025)
 [1] "date"                      "aqi_value"                
 [3] "main_pollutant"            "site_name"                
 [5] "site_id"                   "source"                   
 [7] "x24_year_high_2000_2023"   "x24_year_low_2000_2023"   
 [9] "x5_year_average_2019_2023" "date_of_24_year_high"     
[11] "date_of_24_year_low"      

First look

This plot looks quite bizarre. What might be going on?

ggplot(dch_2025, aes(x = date, y = aqi_value, group = 1)) +
  geom_line()

Peek at data

dch_2025 |>
  select(date, aqi_value, site_name, site_id)
# A tibble: 365 × 4
   date       aqi_value site_name     site_id    
   <chr>      <chr>     <chr>         <chr>      
 1 01/01/2025 18        Durham Armory 37-063-0015
 2 01/02/2025 33        Durham Armory 37-063-0015
 3 01/03/2025 37        Durham Armory 37-063-0015
 4 01/04/2025 16        Durham Armory 37-063-0015
 5 01/05/2025 19        Durham Armory 37-063-0015
 6 01/06/2025 23        Durham Armory 37-063-0015
 7 01/07/2025 25        Durham Armory 37-063-0015
 8 01/08/2025 27        Durham Armory 37-063-0015
 9 01/09/2025 .         <NA>          <NA>       
10 01/10/2025 .         <NA>          <NA>       
# ℹ 355 more rows

Transforming date

Using lubridate::mdy():

dch_2025 |>
  mutate(date = mdy(date))
# A tibble: 365 × 11
   date       aqi_value main_pollutant site_name     site_id     source
   <date>     <chr>     <chr>          <chr>         <chr>       <chr> 
 1 2025-01-01 18        PM2.5          Durham Armory 37-063-0015 AQS   
 2 2025-01-02 33        PM2.5          Durham Armory 37-063-0015 AQS   
 3 2025-01-03 37        PM2.5          Durham Armory 37-063-0015 AQS   
 4 2025-01-04 16        PM2.5          Durham Armory 37-063-0015 AQS   
 5 2025-01-05 19        PM2.5          Durham Armory 37-063-0015 AQS   
 6 2025-01-06 23        PM2.5          Durham Armory 37-063-0015 AQS   
 7 2025-01-07 25        PM2.5          Durham Armory 37-063-0015 AQS   
 8 2025-01-08 27        PM2.5          Durham Armory 37-063-0015 AQS   
 9 2025-01-09 .         <NA>           <NA>          <NA>        <NA>  
10 2025-01-10 .         <NA>           <NA>          <NA>        <NA>  
# ℹ 355 more rows
# ℹ 5 more variables: x24_year_high_2000_2023 <dbl>,
#   x24_year_low_2000_2023 <dbl>, x5_year_average_2019_2023 <dbl>,
#   date_of_24_year_high <chr>, date_of_24_year_low <chr>

Transforming AQI values

What does this warning mean?

dch_2025 |>
  mutate(aqi_value = as.numeric(aqi_value))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `aqi_value = as.numeric(aqi_value)`.
Caused by warning:
! NAs introduced by coercion
# A tibble: 365 × 11
   date       aqi_value main_pollutant site_name     site_id     source
   <chr>          <dbl> <chr>          <chr>         <chr>       <chr> 
 1 01/01/2025        18 PM2.5          Durham Armory 37-063-0015 AQS   
 2 01/02/2025        33 PM2.5          Durham Armory 37-063-0015 AQS   
 3 01/03/2025        37 PM2.5          Durham Armory 37-063-0015 AQS   
 4 01/04/2025        16 PM2.5          Durham Armory 37-063-0015 AQS   
 5 01/05/2025        19 PM2.5          Durham Armory 37-063-0015 AQS   
 6 01/06/2025        23 PM2.5          Durham Armory 37-063-0015 AQS   
 7 01/07/2025        25 PM2.5          Durham Armory 37-063-0015 AQS   
 8 01/08/2025        27 PM2.5          Durham Armory 37-063-0015 AQS   
 9 01/09/2025        NA <NA>           <NA>          <NA>        <NA>  
10 01/10/2025        NA <NA>           <NA>          <NA>        <NA>  
# ℹ 355 more rows
# ℹ 5 more variables: x24_year_high_2000_2023 <dbl>,
#   x24_year_low_2000_2023 <dbl>, x5_year_average_2019_2023 <dbl>,
#   date_of_24_year_high <chr>, date_of_24_year_low <chr>

Investigating AQI values

  • Take a peek at distinct values of AQI
dch_2025 |>
  distinct(aqi_value) |>
  pull()
 [1] "18" "33" "37" "16" "19" "23" "25" "27" "."  "42" "48" "55" "30" "52" "32"
[16] "54" "21" "31" "40" "59" "57" "44" "49" "51" "43" "46" "45" "14" "34" "15"
[31] "28" "36" "65" "74" "68" "38" "67" "47" "64" "50" "53" "61" "58" "56" "39"
[46] "41" "77" "71" "35" "66" "62" "26" "63" "24" "29" "9"  "22" "17"
  • "." likely indicates NA, and it’s causing the entire column to be read in as characters

Rewind, and start over

dch_2025 <- read_csv(
  here::here(
    "slides/06",
    "data/durham-chapel-hill/ad_aqi_tracker_data-2022.csv"
  ),
  na = c(".", "")
) |>
  janitor::clean_names() |>
  mutate(date = mdy(date))

glimpse(dch_2025)
Rows: 365
Columns: 11
$ date                      <date> 2022-01-01, 2022-01-02, 2022-01-03, 2022-01…
$ aqi_value                 <dbl> 22, 12, 10, 21, 35, 29, 15, 28, 28, 15, 25, …
$ main_pollutant            <chr> "PM2.5", "PM2.5", "PM2.5", "PM2.5", "PM2.5",…
$ site_name                 <chr> "Durham Armory", "Durham Armory", "Durham Ar…
$ site_id                   <chr> "37-063-0015", "37-063-0015", "37-063-0015",…
$ source                    <chr> "AQS", "AQS", "AQS", "AQS", "AQS", "AQS", "A…
$ x20_year_high_2000_2019   <dbl> 111, 76, 66, 61, 83, 71, 75, 76, 57, 71, 81,…
$ x20_year_low_2000_2019    <dbl> 10, 8, 14, 9, 8, 15, 18, 15, 19, 20, 14, 5, …
$ x5_year_average_2015_2019 <dbl> 39.2, 36.8, 38.2, 30.4, 26.0, 32.4, 37.0, 36…
$ date_of_20_year_high      <chr> "01/01/2000", "01/02/2005", "01/03/2004", "0…
$ date_of_20_year_low       <chr> "01/01/2007", "01/02/2012", "01/03/2012", "0…

Another look

ggplot(dch_2025, aes(x = date, y = aqi_value, group = 1)) +
  geom_line()

What’s missing?

What’s missing in this plot?

Filling in the gaps

Filling in the gaps - Code

dch_2025 |>
  filter(!is.na(aqi_value)) |>
  ggplot(aes(x = date, y = aqi_value, group = 1)) +
  geom_line(linewidth = 0.8) +
  labs(
    x = NULL,
    y = "AQI",
    title = "Ozone and PM2.5 Daily AQI Values",
    subtitle = "Durham-Chapel Hill, NC",
    caption = "\nSource: EPA Daily Air Quality Tracker"
  )
dch_2025 |>
  mutate(
    impute = if_else(is.na(aqi_value), TRUE, FALSE),
    impute = if_else(is.na(lead(aqi_value)), TRUE, impute),
    aqi_value = if_else(
      is.na(aqi_value),
      (lag(aqi_value) + lead(aqi_value)) / 2,
      aqi_value
    )
  ) |>
  ggplot(aes(x = date, y = aqi_value, group = 1)) +
  geom_line(
    aes(color = impute),
    show.legend = FALSE,
    linewidth = 0.8
  ) +
  scale_color_manual(values = c("black", "cornflowerblue")) +
  labs(
    x = NULL,
    y = "AQI",
    title = "Ozone and PM2.5 Daily AQI Values",
    subtitle = "Durham-Chapel Hill, NC",
    caption = "\nSource: EPA Daily Air Quality Tracker"
  )

Calculating cumulatives

Cumulatives over time

  • When visualizing time series data, a somewhat common task is to calculate cumulatives over time and plot them

  • In our example we’ll calculate the number of days with “good” AQI (\(\le\) 50) and plot that value on the y-axis and the date on the x-axis

Calculating cumulatives

Step 1. Arrange your data

dch_2025 |>
  select(date, aqi_value) |>
  filter(!is.na(aqi_value)) |>
  arrange(date)
# A tibble: 364 × 2
   date       aqi_value
   <date>         <dbl>
 1 2022-01-01        22
 2 2022-01-02        12
 3 2022-01-03        10
 4 2022-01-04        21
 5 2022-01-05        35
 6 2022-01-06        29
 7 2022-01-07        15
 8 2022-01-08        28
 9 2022-01-09        28
10 2022-01-10        15
# ℹ 354 more rows

Calculating cumulatives

Step 2. Identify good days

dch_2025 |>
  select(date, aqi_value) |>
  filter(!is.na(aqi_value)) |>
  arrange(date) |>
  mutate(good_aqi = if_else(aqi_value <= 50, 1, 0))
# A tibble: 364 × 3
   date       aqi_value good_aqi
   <date>         <dbl>    <dbl>
 1 2022-01-01        22        1
 2 2022-01-02        12        1
 3 2022-01-03        10        1
 4 2022-01-04        21        1
 5 2022-01-05        35        1
 6 2022-01-06        29        1
 7 2022-01-07        15        1
 8 2022-01-08        28        1
 9 2022-01-09        28        1
10 2022-01-10        15        1
# ℹ 354 more rows

Calculating cumulatives

Step 3. Sum over time

dch_2025 |>
  select(date, aqi_value) |>
  filter(!is.na(aqi_value)) |>
  arrange(date) |>
  mutate(
    good_aqi = if_else(aqi_value <= 50, 1, 0),
    cumsum_good_aqi = cumsum(good_aqi)
  ) |>
  print(n = 30)
# A tibble: 364 × 4
   date       aqi_value good_aqi cumsum_good_aqi
   <date>         <dbl>    <dbl>           <dbl>
 1 2022-01-01        22        1               1
 2 2022-01-02        12        1               2
 3 2022-01-03        10        1               3
 4 2022-01-04        21        1               4
 5 2022-01-05        35        1               5
 6 2022-01-06        29        1               6
 7 2022-01-07        15        1               7
 8 2022-01-08        28        1               8
 9 2022-01-09        28        1               9
10 2022-01-10        15        1              10
11 2022-01-11        25        1              11
12 2022-01-12        36        1              12
13 2022-01-13        49        1              13
14 2022-01-14        35        1              14
15 2022-01-15        28        1              15
16 2022-01-16        19        1              16
17 2022-01-17        19        1              17
18 2022-01-18        24        1              18
19 2022-01-19        46        1              19
20 2022-01-20        38        1              20
21 2022-01-21        19        1              21
22 2022-01-22        28        1              22
23 2022-01-23        49        1              23
24 2022-01-24        38        1              24
25 2022-01-25        48        1              25
26 2022-01-26        34        1              26
27 2022-01-27        47        1              27
28 2022-01-28        57        0              27
29 2022-01-29        26        1              28
30 2022-01-30        28        1              29
# ℹ 334 more rows

Plotting cumulatives

dch_2025 |>
  select(date, aqi_value) |>
  filter(!is.na(aqi_value)) |>
  arrange(date) |>
  mutate(
    good_aqi = if_else(aqi_value <= 50, 1, 0),
    cumsum_good_aqi = cumsum(good_aqi)
  ) |>
  ggplot(aes(x = date, y = cumsum_good_aqi, group = 1)) +
  geom_line() +
  scale_x_date(date_labels = "%b %Y") +
  labs(
    x = NULL,
    y = "Number of days",
    title = "Cumulative number of good AQI days (AQI < 50)",
    subtitle = "Durham-Chapel Hill, NC",
    caption = "\nSource: EPA Daily Air Quality Tracker"
  ) +
  theme(plot.title.position = "plot")

Detrending

Detrending

  • Detrending is removing prominent long-term trend in time series to specifically highlight any notable deviations

  • Let’s demonstrate using multiple years of AQI data

Multiple years of Durham-Chapel Hill data

dch_files <- fs::dir_ls(here::here("slides/06", "data/durham-chapel-hill"))
dch_files
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2016.csv
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2017.csv
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2018.csv
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2019.csv
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2020.csv
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2021.csv
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2022.csv
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2023.csv
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2024.csv
/Users/mine/Desktop/teaching/Duke/sta313-s26/vizdata-s26/slides/06/data/durham-chapel-hill/ad_aqi_tracker_data-2025.csv

Reading multiple files

dch <- map_dfr(
  dch_files,
  ~ read_csv(.x, na = c(".", ""), col_select = Date:Source)
)

dch <- dch |>
  janitor::clean_names() |>
  mutate(
    date = mdy(date),
    good_aqi = if_else(aqi_value <= 50, 1, 0)
  ) |>
  filter(!is.na(aqi_value)) |>
  arrange(date) |>
  mutate(cumsum_good_aqi = cumsum(good_aqi), .after = aqi_value)

dch
# A tibble: 3,627 × 8
   date       aqi_value cumsum_good_aqi main_pollutant site_name  site_id source
   <date>         <dbl>           <dbl> <chr>          <chr>      <chr>   <chr> 
 1 2016-01-01        32               1 PM2.5          Durham Ar… 37-063… AQS   
 2 2016-01-02        37               2 PM2.5          Durham Ar… 37-063… AQS   
 3 2016-01-03        45               3 PM2.5          Durham Ar… 37-063… AQS   
 4 2016-01-04        33               4 PM2.5          Durham Ar… 37-063… AQS   
 5 2016-01-05        27               5 PM2.5          Durham Ar… 37-063… AQS   
 6 2016-01-06        39               6 PM2.5          Durham Ar… 37-063… AQS   
 7 2016-01-07        39               7 PM2.5          Durham Ar… 37-063… AQS   
 8 2016-01-08        31               8 PM2.5          Durham Ar… 37-063… AQS   
 9 2016-01-09        20               9 PM2.5          Durham Ar… 37-063… AQS   
10 2016-01-10        20              10 PM2.5          Durham Ar… 37-063… AQS   
# ℹ 3,617 more rows
# ℹ 1 more variable: good_aqi <dbl>

Plot trend since 2016

dch |>
  ggplot(aes(x = date, y = cumsum_good_aqi, group = 1)) +
  geom_smooth(method = "lm", color = "pink") +
  geom_line() +
  scale_x_date(
    expand = expansion(mult = 0.07),
    date_labels = "%Y"
  ) +
  labs(
    x = NULL,
    y = "Number of days",
    title = "Cumulative number of good AQI days (AQI < 50)",
    subtitle = "Durham-Chapel Hill, NC",
    caption = "\nSource: EPA Daily Air Quality Tracker"
  ) +
  theme(plot.title.position = "plot")
`geom_smooth()` using formula = 'y ~ x'

Detrend

Step 1. Fit a simple linear regression

m <- lm(cumsum_good_aqi ~ date, data = dch)

m

Call:
lm(formula = cumsum_good_aqi ~ date, data = dch)

Coefficients:
(Intercept)         date  
 -1.295e+04    7.702e-01  

Detrend

Step 2. Augment the data with model results (using broom::augment())

dch_aug <- augment(m)

dch_aug
# A tibble: 3,627 × 8
   cumsum_good_aqi date       .fitted .resid    .hat .sigma   .cooksd .std.resid
             <dbl> <date>       <dbl>  <dbl>   <dbl>  <dbl>     <dbl>      <dbl>
 1               1 2016-01-01  -13.3    14.3 0.00110   45.4 0.0000546      0.315
 2               2 2016-01-02  -12.5    14.5 0.00110   45.4 0.0000563      0.320
 3               3 2016-01-03  -11.8    14.8 0.00110   45.4 0.0000580      0.325
 4               4 2016-01-04  -11.0    15.0 0.00110   45.4 0.0000598      0.330
 5               5 2016-01-05  -10.2    15.2 0.00110   45.4 0.0000616      0.335
 6               6 2016-01-06   -9.45   15.4 0.00109   45.4 0.0000634      0.340
 7               7 2016-01-07   -8.68   15.7 0.00109   45.4 0.0000653      0.345
 8               8 2016-01-08   -7.90   15.9 0.00109   45.4 0.0000671      0.350
 9               9 2016-01-09   -7.13   16.1 0.00109   45.4 0.0000690      0.355
10              10 2016-01-10   -6.36   16.4 0.00109   45.4 0.0000710      0.361
# ℹ 3,617 more rows

Detrend

Step 3. Divide the observed value of cumsum_good_aqi by the respective value in the long-term trend (i.e., .fitted)

dch_aug <- dch_aug |>
  mutate(ratio = cumsum_good_aqi / .fitted, .after = .fitted)

dch_aug
# A tibble: 3,627 × 9
   cumsum_good_aqi date       .fitted   ratio .resid    .hat .sigma   .cooksd
             <dbl> <date>       <dbl>   <dbl>  <dbl>   <dbl>  <dbl>     <dbl>
 1               1 2016-01-01  -13.3  -0.0752   14.3 0.00110   45.4 0.0000546
 2               2 2016-01-02  -12.5  -0.160    14.5 0.00110   45.4 0.0000563
 3               3 2016-01-03  -11.8  -0.255    14.8 0.00110   45.4 0.0000580
 4               4 2016-01-04  -11.0  -0.364    15.0 0.00110   45.4 0.0000598
 5               5 2016-01-05  -10.2  -0.489    15.2 0.00110   45.4 0.0000616
 6               6 2016-01-06   -9.45 -0.635    15.4 0.00109   45.4 0.0000634
 7               7 2016-01-07   -8.68 -0.807    15.7 0.00109   45.4 0.0000653
 8               8 2016-01-08   -7.90 -1.01     15.9 0.00109   45.4 0.0000671
 9               9 2016-01-09   -7.13 -1.26     16.1 0.00109   45.4 0.0000690
10              10 2016-01-10   -6.36 -1.57     16.4 0.00109   45.4 0.0000710
# ℹ 3,617 more rows
# ℹ 1 more variable: .std.resid <dbl>

Visualize detrended data

dch_aug |>
  ggplot(aes(x = date, y = ratio, group = 1)) +
  geom_hline(yintercept = 1, color = "gray") +
  geom_line() +
  scale_x_date(
    expand = expansion(mult = 0.07),
    date_labels = "%Y"
  ) +
  labs(
    x = NULL,
    y = "Number of days\n(detrended)",
    title = "Cumulative number of good AQI days (AQI < 50)",
    subtitle = "Durham-Chapel Hill, NC",
    caption = "\nSource: EPA Daily Air Quality Tracker"
  ) +
  # title aligned to the entire plot (minus any space for margins and plot tag)
  theme(plot.title.position = "plot")

Air Quality in Durham



barely anything interesting happening!

let’s look at data from somewhere with a bit more “interesting” air quality data…

Read in multiple years of SF data

sf_files <- fs::dir_ls(here::here("slides/06/", "data/san-francisco"))

sf <- map_dfr(
  sf_files,
  ~ read_csv(.x, na = c(".", ""), col_select = Date:Source)
)

sf <- sf |>
  janitor::clean_names() |>
  mutate(
    date = mdy(date),
    good_aqi = if_else(aqi_value <= 50, 1, 0)
  ) |>
  filter(!is.na(aqi_value)) |>
  arrange(date) |>
  mutate(
    cumsum_good_aqi = cumsum(good_aqi),
    .after = aqi_value
  )

sf
# A tibble: 3,314 × 8
   date       aqi_value cumsum_good_aqi main_pollutant site_name  site_id source
   <date>         <dbl>           <dbl> <chr>          <chr>      <chr>   <chr> 
 1 2016-01-01        32               1 PM2.5          Durham Ar… 37-063… AQS   
 2 2016-01-02        37               2 PM2.5          Durham Ar… 37-063… AQS   
 3 2016-01-03        45               3 PM2.5          Durham Ar… 37-063… AQS   
 4 2016-01-04        33               4 PM2.5          Durham Ar… 37-063… AQS   
 5 2016-01-05        27               5 PM2.5          Durham Ar… 37-063… AQS   
 6 2016-01-06        39               6 PM2.5          Durham Ar… 37-063… AQS   
 7 2016-01-07        39               7 PM2.5          Durham Ar… 37-063… AQS   
 8 2016-01-08        31               8 PM2.5          Durham Ar… 37-063… AQS   
 9 2016-01-09        20               9 PM2.5          Durham Ar… 37-063… AQS   
10 2016-01-10        20              10 PM2.5          Durham Ar… 37-063… AQS   
# ℹ 3,304 more rows
# ℹ 1 more variable: good_aqi <dbl>

Plot trend since 2016 - good AQI

sf |>
  ggplot(aes(x = date, y = cumsum_good_aqi, group = 1)) +
  geom_smooth(method = "lm", color = "pink") +
  geom_line() +
  scale_x_date(
    expand = expansion(mult = 0.07),
    date_labels = "%Y"
  ) +
  labs(
    x = NULL,
    y = "Number of days",
    title = "Cumulative number of good AQI days (AQI < 50)",
    subtitle = "San Francisco-Oakland-Hayward, CA",
    caption = "\nSource: EPA Daily Air Quality Tracker"
  ) +
  theme(plot.title.position = "plot")
`geom_smooth()` using formula = 'y ~ x'

Detrend

  1. Fit a simple linear regression
m_sf <- lm(cumsum_good_aqi ~ date, data = sf)
  1. Augment the data with model results
sf_aug <- augment(m_sf)
  1. Divide the observed value of cumsum_good_aqi by the respective value in the long-term trend (i.e., .fitted)
sf_aug <- sf_aug |>
  mutate(ratio = cumsum_good_aqi / .fitted, .after = .fitted)

Visualize detrended data

sf_aug |>
  ggplot(aes(x = date, y = ratio, group = 1)) +
  geom_hline(yintercept = 1, color = "gray") +
  geom_line() +
  scale_x_date(
    expand = expansion(mult = 0.07),
    date_labels = "%Y"
  ) +
  labs(
    x = NULL,
    y = "Number of days\n(detrended)",
    title = "Cumulative number of good AQI days (AQI < 50)",
    subtitle = "San Francisco-Oakland-Hayward, CA",
    caption = "\nSource: EPA Daily Air Quality Tracker"
  ) +
  theme(plot.title.position = "plot")

Detrending

  • In step 2 we fit a very simple model

  • Depending on the complexity you’re trying to capture you might choose to fit a much more complex model

  • You can also decompose the trend into multiple trends, e.g. monthly, long-term, seasonal, etc.

Lag and lead

Lag and lead

  • lag(): “previous” value in a vector (likely also in time)
  • lead(): “next” value in a vector
  • Useful for comparing values behind of or ahead of the current values.
dch_2025 |>
  slice_head(n = 10) |>
  mutate(
    aqi_lag = lag(aqi_value),
    aqi_lead = lead(aqi_value)
  ) |>
  select(date, contains("aqi"))
# A tibble: 10 × 4
   date       aqi_value aqi_lag aqi_lead
   <date>         <dbl>   <dbl>    <dbl>
 1 2022-01-01        22      NA       12
 2 2022-01-02        12      22       10
 3 2022-01-03        10      12       21
 4 2022-01-04        21      10       35
 5 2022-01-05        35      21       29
 6 2022-01-06        29      35       15
 7 2022-01-07        15      29       28
 8 2022-01-08        28      15       28
 9 2022-01-09        28      28       15
10 2022-01-10        15      28       NA

Lag and lead further

dch_2025 |>
  slice_head(n = 10) |>
  mutate(
    aqi_lag_1 = lag(aqi_value),
    aqi_lag_2 = lag(aqi_value, n = 2),
    aqi_lead_1 = lead(aqi_value),
    aqi_lead_3 = lead(aqi_value, n = 3),
  ) |>
  select(date, contains("aqi"))
# A tibble: 10 × 6
   date       aqi_value aqi_lag_1 aqi_lag_2 aqi_lead_1 aqi_lead_3
   <date>         <dbl>     <dbl>     <dbl>      <dbl>      <dbl>
 1 2022-01-01        22        NA        NA         12         21
 2 2022-01-02        12        22        NA         10         35
 3 2022-01-03        10        12        22         21         29
 4 2022-01-04        21        10        12         35         15
 5 2022-01-05        35        21        10         29         28
 6 2022-01-06        29        35        21         15         28
 7 2022-01-07        15        29        35         28         15
 8 2022-01-08        28        15        29         28         NA
 9 2022-01-09        28        28        15         15         NA
10 2022-01-10        15        28        28         NA         NA

Gantt chart

Gantt chart

Generally, a horizontal bar chart that illustrates a project schedule.

Or…

[Time permitting] Livecoding

Time permitting: Recreate the gantt chart in the previous slide in ae-05.

Livecoding

Reveal below for code developed during live coding session.

Transform
professions <- read_csv(here::here(
  "slides/06",
  "data/scientists/professions.csv"
))
dates <- read_csv(here::here("slides/06", "data/scientists/dates.csv"))
works <- read_csv(here::here("slides/06", "data/scientists/works.csv"))

scientists <- professions |>
  left_join(dates, by = join_by(name)) |>
  left_join(works, by = join_by(name))

scientists_longer <- scientists |>
  mutate(
    birth_year = case_when(
      name == "Ada Lovelace" ~ 1815,
      name == "Marie Curie" ~ 1867,
      TRUE ~ birth_year
    ),
    death_year = case_when(
      name == "Ada Lovelace" ~ 1852,
      name == "Marie Curie" ~ 1934,
      TRUE ~ death_year
    ),
    status = if_else(is.na(death_year), "alive", "deceased"),
    death_year = if_else(is.na(death_year), 2027, death_year),
    known_for = if_else(
      name == "Rosalind Franklin",
      "understanding of the molecular structures of DNA ",
      known_for
    )
  ) |>
  pivot_longer(
    cols = contains("year"),
    names_to = "year_type",
    values_to = "year"
  ) |>
  mutate(death_year_fake = if_else(year == 2027, TRUE, FALSE))
Plot
ggplot(
  scientists_longer,
  aes(
    x = year,
    y = fct_reorder(name, as.numeric(factor(profession))),
    group = name,
    color = profession
  )
) +
  geom_point(aes(shape = death_year_fake), show.legend = FALSE) +
  geom_line(aes(linetype = status), show.legend = FALSE) +
  geom_text(
    aes(y = name, label = str_wrap(known_for, 50)),
    x = 2030,
    show.legend = FALSE,
    hjust = 0,
    size = 4
  ) +
  geom_text(
    aes(label = profession),
    x = 1809,
    y = Inf,
    hjust = 1,
    vjust = 1,
    show.legend = FALSE,
    fontface = "bold"
  ) +
  scale_shape_manual(values = c("circle", NA)) +
  scale_linetype_manual(values = c("dashed", "solid")) +
  scale_color_manual(values = pop_palette("aqua")) +
  scale_x_continuous(expand = c(0.01, 0), breaks = seq(1820, 2026, 40)) +
  coord_cartesian(clip = "off") +
  facet_grid(
    profession ~ .,
    scales = "free_y",
    space = "free_y",
    switch = "x"
  ) +
  labs(
    x = "Year",
    y = NULL,
    title = "10 women in science who changed the world",
    caption = "Source: Discover magazine"
  ) +
  theme_gray(base_size = 14) +
  theme(
    plot.margin = unit(c(1, 20, 1, 10), "lines"),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    plot.caption = element_text(hjust = 1.7), # manual hack
    strip.background = element_blank(),
    strip.text = element_blank(),
    axis.title.x = element_text(hjust = 0),
    panel.grid.minor = element_blank()
  )