Below is the final version of the script we worked on together during the workshop session.
In-class exercises
library(tidyverse)# we'll be looking at data on Groundhog predictionsgroundhogs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-01-30/groundhogs.csv')predictions <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-01-30/predictions.csv')head(predictions)
# find groundhog predictions from 2020filter(predictions, year ==2020)
# find groundhog predictions from 2020 and 2021filter(predictions, year ==2020| year ==2021)
filter(predictions, year %in%c(2020, 2021))
# find predictions between 1900 and 2000predictions |>filter(year >=1900& year <=2000)
# sort our predictions by yearpredictions |>arrange(year)
# sort our predictions by year, descending predictions |>arrange(desc(year))
# find unique rows in the predictions datasetpredictions |>distinct()
# list the unique years in the predictions datasetpredictions |>distinct(year)
# list unique combinations of groundhog id and year in the predictions datasetpredictions |>distinct(year, id)
# list the first instance of unique combinations of groundhog id and year in the predictions dataset# and keep all columnspredictions |>distinct(year, id, .keep_all =TRUE)
# check for duplicate predictionspredictions |>distinct(id, year) # since the number of distinct combinations of year and id is the same as the number of rows in the dataset, we know we don't have any duplicates to remove
# create a subset of your data where "shadow" has a value of either TRUE or FALSE, and sort the result by descending year.predictions <- predictions |>filter(shadow %in%c(TRUE, FALSE)) |># distinct(year, id, .keep_all = TRUE) |> # if we found duplicates, we'd need this line!arrange(desc(year))# group predictions by year predictions |>group_by(year)
# how many predictions were made in each year?predictions |>group_by(year) |>summarize(n_predictions =n())
# How many different groundhogs made predictions each year?predictions |>group_by(year) |>summarize(n_groundhogs =n_distinct(id)) |>arrange(desc(n_groundhogs))
# What is the first year each groundhog made a prediction?predictions |>group_by(id) |>summarize(first_prediction =min(year))
# Let's return to our dataframe with the number of predictions in each year. # How would we add a column for the number of shadows seen in each year?predictions |>group_by(year) |>summarize(n_predictions =n(),n_shadows =sum(shadow ==TRUE))
# Create a dataframe with 3 variables: # groundhog id# the number of total predictions each groundhog has made# the number of times each groundhog has seen its shadowpredictions |>group_by(id) |>summarize(n_predictions =n(),n_shadows =sum(shadow ==TRUE))
# calculate how many characters are in the details field and put the variable after idpredictions |>mutate(details_length =nchar(details), .after = id)
# create a column that indicates whether the prediction was made by Punxsutawney Philpredictions |>mutate(phil =if_else(id ==1, 'TRUE', 'FALSE'))
# create a column that indicates the century of the predictionspredictions |>mutate(century =case_when(year <1900~19, year <2000& year >=1900~20, year >=2000~21))
# Working off of our table with the number of predictions and number of shadows seen per year, lets:# Add a column called shadow_percent that gives the percentage of groundhogs that saw their shadow that year# Filter for years that had at least 5 predictions# Keep only the variables year and shadow_percent, and rename year to prediction_year # Assign the result to a variable year_predictions year_predictions <- predictions |>group_by(year) |>summarize(n_predictions =n(),n_shadows =sum(shadow ==TRUE)) |>mutate(shadow_percent = n_shadows/n_predictions) |>filter(n_predictions >5) |>select(year, shadow_percent) |>rename(prediction_year = year)# add the variables from the groundhogs table to the predictions table and save the result as predictions_groundhogspredictions_groundhogs <-left_join(predictions, groundhogs, join_by(id == id))# add the variables from predictions to the groundhogs table right_join(predictions, groundhogs, join_by(id == id))
# add variables from groundhogs to predictions where keys appear in both tablesinner_join(predictions, groundhogs, join_by(id == id))
# add variables from groundhogs to predictions. Add rows even if the groundhog isn't in groundhog_predictionsfull_join(predictions, groundhogs, join_by(id == id))
# Using our joined predictions_groundhogs table, let's create a summary table comparing the rate at which real groundhogs vs alternative groundhogs saw their shadows each year.predictions_groundhogs |>group_by(year, is_groundhog) |>summarize(n_predictions =n(),n_shadows =sum(shadow ==TRUE)) |>mutate(percent_shadow = n_shadows/n_predictions) |>arrange(desc(year), is_groundhog)
Bonus Exercises
#### BONUS EXERCISE ANSWERS ##### Write code to calculate the column predictions_count in groundhogsgroundhogs |>left_join(predictions |>group_by(id) |>summarize(predictions_count =n()))
# Write code to calculate the column is_groundhog in groundhogsgroundhogs |>mutate(is_groundhog =if_else(type =='Groundhog', TRUE, FALSE))
# Calculate the proportion of groundhogs from each country that make predictions each yearpredictions |>left_join(groundhogs) |>group_by(year, country) |>summarize(n =n()) |>mutate(percent = n/sum(n))
# Add a column to groundhogs indicating the first year each groundhog saw its shadowgroundhogs |>left_join(predictions |>group_by(id) |>filter(shadow ==TRUE) |>summarize(first_shadow =min(year)))