Created
November 29, 2023 09:25
-
-
Save mrcaseb/d26dcf0236c1787f7c41dc095f8cb32c to your computer and use it in GitHub Desktop.
How to bring Statsbomb tracking data into a tidy form
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## HOW TO USE THIS ## | |
# Go to https://github.com/statsbomb/amf-open-data/tree/main#getting-started | |
# and download the zipped json files containing the individual seasons, available via AWS S3 | |
# | |
# Alternatively click the below links directly | |
# https://statsbomb-amf-open-data.s3.eu-west-2.amazonaws.com/tracking/SB_tracking_TB12DB_2021.zip | |
# https://statsbomb-amf-open-data.s3.eu-west-2.amazonaws.com/tracking/SB_tracking_TB12DB_2022.zip | |
# | |
# Unzip those files to a local directory. That directory will include deeply nested json files | |
# Loop over those files and put their path in the below function. The function will | |
# parse the nested json and either return a tidy data frame into memory (don't do this) or | |
# write it to disc as compressed csv (please do this) | |
# | |
# Let's say you have all the statsbomb json files saved in the folder "data-raw/raw_sb" | |
# and you want parsed csv files in the folder "data-raw/stats-bomb-parsed", | |
# then you could run something like | |
# purrr::walk(list.files("data-raw/raw_sb/", full.names = TRUE), parse_statsbomb_tracking, write_to = "data-raw/stats-bomb-parsed")
# | |
# NB: An average machine needs 25-30 seconds to process one game. The 2021 and 2022 | |
# data consist of 19+18 = 37 games so this could easily run 20 minutes!! | |
#' Parse one StatsBomb tracking JSON file into a tidy data frame
#'
#' Reads a single game file with `RJSONIO::fromJSON()`, flattens the nested
#' `plays` -> `tracks` -> `steps` structure into one row per tracking step,
#' and prepends the game-level information to every row.
#'
#' @param path_to_json Path to exactly one tracking JSON file. The file name
#'   (without extension) is only used for the progress message.
#' @param write_to Either an existing local directory or `NULL`. If a
#'   directory is given, the parsed data is written there as
#'   `<nfl_game_id>.csv.gz` via `data.table::fwrite()`. If `NULL`, nothing is
#'   written and the parsed data is returned instead.
#'
#' @return If `write_to` is `NULL`, a tibble with the game info columns
#'   followed by the unnested tracking columns. Otherwise the function is
#'   called for its side effect and returns the value of
#'   `data.table::fwrite()`.
#'
#' @export
parse_statsbomb_tracking <- function(path_to_json,
                                     write_to = "data-raw/stats-bomb-tracking") {
  # Named conditions must be passed through `...`; inside `exprs = { }` the
  # `"msg" = cond` form is parsed as an assignment and the message is lost.
  stopifnot(
    "path_to_json must be a character vector" = is.character(path_to_json),
    "Can only parse one game at a time" = length(path_to_json) == 1,
    "write_to must be a valid local directory or NULL" =
      is.null(write_to) || dir.exists(write_to)
  )

  # `g` is interpolated by cli into the progress messages below.
  g <- basename(path_to_json) |> tools::file_path_sans_ext()

  cli::cli_progress_step(
    msg = "Processing {.pkg {g}}. Please keep cool, this may take a while...",
    msg_done = "Processed {.pkg {g}}"
  )

  raw <- RJSONIO::fromJSON(path_to_json, simplify = FALSE)

  # Game level info: all top-level non-list entries plus the two team
  # records, which are unpacked into columns prefixed "home_" and "away_".
  game_info <- purrr::discard(raw, is.list) |>
    tibble::as_tibble() |>
    dplyr::mutate(
      home = tibble::as_tibble(raw$home_team),
      away = tibble::as_tibble(raw$away_team)
    ) |>
    tidyr::unpack(c("home", "away"), names_sep = "_")

  df <- tibble::tibble(r = raw$plays) |>
    tidyr::unnest_wider(r) |>
    # Number of plays times 22, because data per player on the field
    tidyr::unnest_longer(tracks) |>
    # Add track info with 8 additional columns
    tidyr::unnest_wider(tracks, names_sep = "_") |>
    # Add player info with 5 additional columns
    tidyr::unnest_wider(tracks_player) |>
    # Now extend by the frames of each player.
    # The number of frames varies per play AND per player
    tidyr::unnest_longer(tracks_steps) |>
    # The tracking data is now again in a list that needs to be expanded
    tidyr::unnest_wider(tracks_steps)

  # Bind with game info and finish
  out <- dplyr::bind_cols(game_info, df)

  # Drop the large intermediates before writing / returning.
  rm(raw, game_info, df)

  # No output directory requested: hand the data back to the caller.
  if (is.null(write_to)) {
    return(out)
  }

  # `write_to` was validated above, so it is an existing directory here.
  game_id <- out$nfl_game_id[[1]]
  save_path <- file.path(write_to, paste0(game_id, ".csv.gz"))
  data.table::fwrite(out, save_path)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment