Workshop script

Below is the final version of the script we worked on together during the workshop session.

In-class exercises

library(tidyverse)

# we'll be looking at data on Groundhog predictions
groundhogs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-01-30/groundhogs.csv')
predictions <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-01-30/predictions.csv')

head(predictions)

# find groundhog predictions from 2020
filter(predictions, year == 2020)

# find groundhog predictions from 2020 and 2021
filter(predictions, year == 2020 | year == 2021)

filter(predictions, year %in% c(2020, 2021))

# find predictions between 1900 and 2000
predictions |>
  filter(year >= 1900 & year <= 2000)

# sort our predictions by year
predictions |> arrange(year)

# sort our predictions by year, descending 
predictions |> arrange(desc(year))

# find unique rows in the predictions dataset
predictions |> distinct()

# list the unique years in the predictions dataset
predictions |> distinct(year)

# list unique combinations of groundhog id and year in the predictions dataset
predictions |> distinct(year, id)

# list the first instance of unique combinations of groundhog id and year in the predictions dataset
# and keep all columns
predictions |> distinct(year, id, .keep_all = TRUE)

# check for duplicate predictions
predictions |> distinct(id, year) # since the number of distinct combinations of year and id is the same as the number of rows in the dataset, we know we don't have any duplicates to remove

# create a subset of your data where "shadow" has a value of either TRUE or FALSE, and sort the result by descending year.
predictions <- predictions |>
  filter(shadow %in% c(TRUE, FALSE)) |>
  # distinct(year, id, .keep_all = TRUE) |> # if we found duplicates, we'd need this line!
  arrange(desc(year))

# group predictions by year 
predictions |>
  group_by(year)

# how many predictions were made in each year?
predictions |>
  group_by(year) |>
  summarize(n_predictions = n())

# How many different groundhogs made predictions each year?
predictions |>
  group_by(year) |>
  summarize(n_groundhogs = n_distinct(id)) |>
  arrange(desc(n_groundhogs))

# What is the first year each groundhog made a prediction?
predictions |>
  group_by(id) |>
  summarize(first_prediction = min(year))

# Let's return to our dataframe with the number of predictions in each year. 
# How would we add a column for the number of shadows seen in each year?
predictions |>
  group_by(year) |>
  summarize(n_predictions = n(),
            n_shadows = sum(shadow == TRUE))

# Create a dataframe with 3 variables: 
# groundhog id
# the number of total predictions each groundhog has made
# the number of times each groundhog has seen its shadow
predictions |>
  group_by(id) |>
  summarize(n_predictions = n(),
            n_shadows = sum(shadow == TRUE))

# calculate how many characters are in the details field and put the variable after id
predictions |>
  mutate(details_length = nchar(details), .after = id)

# create a column that indicates whether the prediction was made by Punxsutawney Phil
predictions |> 
  mutate(phil = if_else(id == 1, 'TRUE', 'FALSE'))

# create a column that indicates the century of the predictions
predictions |> 
  mutate(century = case_when(year < 1900 ~ 19,
                             year < 2000 & year >= 1900 ~ 20,
                             year >= 2000 ~ 21))

# Working off of our table with the number of predictions and number of shadows seen per year, lets:
# Add a column called shadow_percent that gives the percentage of groundhogs that saw their shadow that year
# Filter for years that had at least 5 predictions
# Keep only the variables year and shadow_percent, and rename year to prediction_year 
# Assign the result to a variable year_predictions 
year_predictions <- predictions |>
  group_by(year) |>
  summarize(n_predictions = n(),
            n_shadows = sum(shadow == TRUE)) |>
  mutate(shadow_percent = n_shadows/n_predictions) |>
  filter(n_predictions > 5) |>
  select(year, shadow_percent) |>
  rename(prediction_year = year)

# add the variables from the groundhogs table to the predictions table and save the result as predictions_groundhogs
predictions_groundhogs <- left_join(predictions, groundhogs, join_by(id == id))

# add the variables from predictions to the groundhogs table 
right_join(predictions, groundhogs, join_by(id == id))

left_join(groundhogs, predictions, join_by(id == id))

# add variables from groundhogs to predictions where keys appear in both tables
inner_join(predictions, groundhogs, join_by(id == id))

# add variables from groundhogs to predictions. Add rows even if the groundhog isn't in groundhog_predictions
full_join(predictions, groundhogs, join_by(id == id))

# Using our joined predictions_groundhogs table, let's create a summary table comparing the rate at which real groundhogs vs alternative groundhogs saw their shadows each year.
predictions_groundhogs |>
  group_by(year, is_groundhog) |>
  summarize(n_predictions = n(),
            n_shadows = sum(shadow == TRUE)) |>
  mutate(percent_shadow = n_shadows/n_predictions) |>
  arrange(desc(year), is_groundhog)

Bonus Exercises

#### BONUS EXERCISE ANSWERS ####

# Write code to calculate the column predictions_count in groundhogs
groundhogs |> left_join(predictions |>
                          group_by(id) |>
                          summarize(predictions_count = n()))

# Write code to calculate the column is_groundhog in groundhogs
groundhogs |>
  mutate(is_groundhog = if_else(type =='Groundhog', TRUE, FALSE))

# Calculate the proportion of groundhogs from each country that make predictions each year
predictions |>
  left_join(groundhogs) |>
  group_by(year, country) |>
  summarize(n = n()) |>
  mutate(percent = n/sum(n))

# Add a column to groundhogs indicating the first year each groundhog saw its shadow
groundhogs |>
  left_join(predictions |>
              group_by(id) |>
              filter(shadow == TRUE) |>
              summarize(first_shadow = min(year)))

All materials on this site are licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License (CC BY-NC-SA 4.0), attributable to Data Services, Johns Hopkins University.