plnnr/interpolatepw_multiple_inputs.R

## interpolatepw_multiple_inputs.R
## Load libraries, set options, source scripts ---------------------------------
library(tidyverse)
library(tidycensus)
library(sf)
library(tigris)
library(furrr)

options(
  scipen = 999,
  digits = 4,
  tigris_class = "sf",
  tigris_use_cache = T
)

## Generate spatial data
STATES_TO_DOWNLOAD <- c("41", "53")
COUNTIES_TO_DOWNLOAD <- c("41005", "41009", "41051", "41067", "41071", "53011", "53059")
TARGET_EPSG <- 2913

TRACTS.SF <- map_dfr(STATES_TO_DOWNLOAD, .f = function(STATE){
  tigris::tracts(STATE, cb = TRUE, year = 2020) %>%
    filter(substr(GEOID, 1, 5) %in% COUNTIES_TO_DOWNLOAD)}) %>%
  st_transform(TARGET_EPSG)

TRACTS.2019.SF <- map_dfr(STATES_TO_DOWNLOAD, .f = function(STATE){
  tigris::tracts(STATE, cb = TRUE, year = 2019) %>%
    filter(substr(GEOID, 1, 5) %in% COUNTIES_TO_DOWNLOAD)}) %>%
  st_transform(TARGET_EPSG)

TRACTS_2010t2020.SF <- rbind(
  mutate(TRACTS.2019.SF, year = 2010),
  mutate(TRACTS.2019.SF, year = 2015),
  mutate(TRACTS.SF, year = 2020)
)

STATE_BLOCKS.2020 <- map_dfr(STATES_TO_DOWNLOAD, .f = function(STATE){
  tigris::blocks(STATE, year = 2020) %>%
    filter(substr(GEOID20, 1, 5) %in% COUNTIES_TO_DOWNLOAD)}) %>%
  st_transform(TARGET_EPSG)

## Load CPI inflation adjustments (CPI-U-RS)
cpi <- rio::import("1_data/0_resources/cpi-u-rs_1950-current.xlsx") %>%
  select(year, inflation_factor = inflation_factor_2020)

## Download data using multi-threading -----------------------------------------

## Enable multi-core processing
plan(multisession, workers = availableCores())
set.seed(123)

core_tables <- c("B25106", "B03002", "B19013", "B25010", "B15002")

## Grab data for selected years and selected tables
acs_table_query_longitudinal <- future_map_dfr(
  .x = c(2010, 2015, 2020),
  .f = function(acs_year){
    map_dfr(
      .x = core_tables,
      .f = function(acs_table){
        query <- get_acs(geography = "tract",
                         table = acs_table,
                         year = acs_year,
                         state = STATES_TO_DOWNLOAD) %>%
          filter(substr(GEOID, 1, 5) %in% COUNTIES_TO_DOWNLOAD) %>%
          mutate(agg = "count",
                 year = acs_year) %>%
          select(-NAME)
      })
  })

## Determine weighting methods -------------------------------------------------

## Variables that are population-based should be weighted by population
## Variables that are unit-based should be weighted by households
## Variables that are aggregate may need to be re-weighted???
# c("B25106", "B03002", "B19013", "B25010", "B15002")

## Population-based tables:
# B03002
# B15002

## Unit-based tables:
# B25010
# B25106
# B19013

## Extensive
# B03002
# B15002
# B25106

## Intensive (extensive == FALSE)
# B25010
# B19013

## Interpolate results ---------------------------------------------------------
## In this section, we interpolate data from 2010 and 2015 to 2020 census tracts.
## Since the use of the weighting variables and the use of extensive/intensive
## weighting mechanisms vary (count data vs aggregate data like medians), we must
## specify the right combination of tables and weighting columns. We do this by
## carefully specifying the list arguments for the pmap() function.

## First we transform our long data to wide data and append spatial data
wide_query <- acs_table_query_longitudinal %>%
  pivot_wider(id_cols = c(GEOID, year), names_from = variable, values_from = estimate) %>%
  left_join(., TRACTS_2010t2020.SF, by = c("GEOID", "year")) %>%
  st_as_sf()

## Set up list arguments for pmap() function
years <- c(rep(2015, 3),
           rep(2010, 3))

tables <- list(c("B25106"), c("B25010", "B19013"), c("B03002", "B15002"),
               c("B25106"), c("B25010", "B19013"), c("B03002", "B15002"))

weightcols <- c("HOUSING20", "HOUSING20", "POP20",
                "HOUSING20", "HOUSING20", "POP20")

extensives <- c(TRUE, FALSE, TRUE,
                TRUE, FALSE, TRUE)

## Generate list of filtered data based on the years and tables
dfs <- pmap(.l = list(years, tables),
            .f = function(.year, .table){
              # Expand .table in case of multiple tables
              .table <- paste(.table)
              wide_query %>%
                filter(year == .year) %>%
                select(GEOID, starts_with(c({{.table}})))
            })

## Interpolate all of the data
interpolated <- future_pmap(
  .l = list(dfs, years, weightcols, extensives),
  .f = function(.dat, .year, .weightcol, .ext){
    interpolate_pw(from = .dat,
                   to = wide_query %>%
                     filter(year == 2020) %>%
                     select(GEOID),
                   to_id = "GEOID",
                   weights = STATE_BLOCKS.2020,
                   weight_column = .weightcol,
                   crs = TARGET_EPSG,
                   extensive = .ext) %>%
      mutate(year = .year) %>%
      relocate(year, .after = GEOID) %>%
      st_drop_geometry()
  },
  .options = furrr_options(seed = TRUE))

## Bind the list together, cast to long format, and standardize the format
## TODO there has to be a more elegant way to do this step...
interpolated.df <- rbind(
  bind_cols(interpolated[1:3], .name_repair = "unique") %>%
    select(-c(GEOID...49, GEOID...55, year...50, year...56)) %>%
    rename(GEOID = GEOID...1, year = year...2),
  bind_cols(interpolated[4:6], .name_repair = "unique") %>%
    select(-c(GEOID...49, GEOID...55, year...50, year...56)) %>%
    rename(GEOID = GEOID...1, year = year...2)
) %>%
  pivot_longer(cols = -c(GEOID, year),
               names_to = "variable",
               values_to = "estimate") %>%
  mutate(moe = 0,
         agg = "count") %>%
  select(names(acs_table_query_longitudinal))

  ## Optionally add spatial data and generate a quick map
interpolated.sf <- interpolated.df %>%
  left_join(., TRACTS.SF, by = "GEOID") %>%
  st_as_sf()

interpolated.sf %>%
  filter(year == 2015,
         variable == "B25106_001") %>%
  mapview(zcol = "estimate")
	## Load libraries, set options, source scripts ---------------------------------
	library(tidyverse)
	library(tidycensus)
	library(sf)
	library(tigris)
	library(furrr)

	options(
	scipen = 999,
	digits = 4,
	tigris_class = "sf",
	tigris_use_cache = T
	)

	## Generate spatial data
	STATES_TO_DOWNLOAD <- c("41", "53")
	COUNTIES_TO_DOWNLOAD <- c("41005", "41009", "41051", "41067", "41071", "53011", "53059")
	TARGET_EPSG <- 2913

	TRACTS.SF <- map_dfr(STATES_TO_DOWNLOAD, .f = function(STATE){
	tigris::tracts(STATE, cb = TRUE, year = 2020) %>%
	filter(substr(GEOID, 1, 5) %in% COUNTIES_TO_DOWNLOAD)}) %>%
	st_transform(TARGET_EPSG)

	TRACTS.2019.SF <- map_dfr(STATES_TO_DOWNLOAD, .f = function(STATE){
	tigris::tracts(STATE, cb = TRUE, year = 2019) %>%
	filter(substr(GEOID, 1, 5) %in% COUNTIES_TO_DOWNLOAD)}) %>%
	st_transform(TARGET_EPSG)

	TRACTS_2010t2020.SF <- rbind(
	mutate(TRACTS.2019.SF, year = 2010),
	mutate(TRACTS.2019.SF, year = 2015),
	mutate(TRACTS.SF, year = 2020)
	)

	STATE_BLOCKS.2020 <- map_dfr(STATES_TO_DOWNLOAD, .f = function(STATE){
	tigris::blocks(STATE, year = 2020) %>%
	filter(substr(GEOID20, 1, 5) %in% COUNTIES_TO_DOWNLOAD)}) %>%
	st_transform(TARGET_EPSG)

	## Load CPI inflation adjustments (CPI-U-RS)
	cpi <- rio::import("1_data/0_resources/cpi-u-rs_1950-current.xlsx") %>%
	select(year, inflation_factor = inflation_factor_2020)

	## Download data using multi-threading -----------------------------------------

	## Enable multi-core processing
	plan(multisession, workers = availableCores())
	set.seed(123)

	core_tables <- c("B25106", "B03002", "B19013", "B25010", "B15002")

	## Grab data for selected years and selected tables
	acs_table_query_longitudinal <- future_map_dfr(
	.x = c(2010, 2015, 2020),
	.f = function(acs_year){
	map_dfr(
	.x = core_tables,
	.f = function(acs_table){
	query <- get_acs(geography = "tract",
	table = acs_table,
	year = acs_year,
	state = STATES_TO_DOWNLOAD) %>%
	filter(substr(GEOID, 1, 5) %in% COUNTIES_TO_DOWNLOAD) %>%
	mutate(agg = "count",
	year = acs_year) %>%
	select(-NAME)
	})
	})

	## Determine weighting methods -------------------------------------------------

	## Variables that are population-based should be weighted by population
	## Variables that are unit-based should be weighted by households
	## Variables that are aggregate may need to be re-weighted???
	# c("B25106", "B03002", "B19013", "B25010", "B15002")

	## Population-based tables:
	# B03002
	# B15002

	## Unit-based tables:
	# B25010
	# B25106
	# B19013

	## Extensive
	# B03002
	# B15002
	# B25106

	## Intensive (extensive == FALSE)
	# B25010
	# B19013

	## Interpolate results ---------------------------------------------------------
	## In this section, we interpolate data from 2010 and 2015 to 2020 census tracts.
	## Since the use of the weighting variables and the use of extensive/intensive
	## weighting mechanisms vary (count data vs aggregate data like medians), we must
	## specify the right combination of tables and weighting columns. We do this by
	## carefully specifying the list arguments for the pmap() function.

	## First we transform our long data to wide data and append spatial data
	wide_query <- acs_table_query_longitudinal %>%
	pivot_wider(id_cols = c(GEOID, year), names_from = variable, values_from = estimate) %>%
	left_join(., TRACTS_2010t2020.SF, by = c("GEOID", "year")) %>%
	st_as_sf()

	## Set up list arguments for pmap() function
	years <- c(rep(2015, 3),
	rep(2010, 3))

	tables <- list(c("B25106"), c("B25010", "B19013"), c("B03002", "B15002"),
	c("B25106"), c("B25010", "B19013"), c("B03002", "B15002"))

	weightcols <- c("HOUSING20", "HOUSING20", "POP20",
	"HOUSING20", "HOUSING20", "POP20")

	extensives <- c(TRUE, FALSE, TRUE,
	TRUE, FALSE, TRUE)

	## Generate list of filtered data based on the years and tables
	dfs <- pmap(.l = list(years, tables),
	.f = function(.year, .table){
	# Expand .table in case of multiple tables
	.table <- paste(.table)
	wide_query %>%
	filter(year == .year) %>%
	select(GEOID, starts_with(c({{.table}})))
	})

	## Interpolate all of the data
	interpolated <- future_pmap(
	.l = list(dfs, years, weightcols, extensives),
	.f = function(.dat, .year, .weightcol, .ext){
	interpolate_pw(from = .dat,
	to = wide_query %>%
	filter(year == 2020) %>%
	select(GEOID),
	to_id = "GEOID",
	weights = STATE_BLOCKS.2020,
	weight_column = .weightcol,
	crs = TARGET_EPSG,
	extensive = .ext) %>%
	mutate(year = .year) %>%
	relocate(year, .after = GEOID) %>%
	st_drop_geometry()
	},
	.options = furrr_options(seed = TRUE))

	## Bind the list together, cast to long format, and standardize the format
	## TODO there has to be a more elegant way to do this step...
	interpolated.df <- rbind(
	bind_cols(interpolated[1:3], .name_repair = "unique") %>%
	select(-c(GEOID...49, GEOID...55, year...50, year...56)) %>%
	rename(GEOID = GEOID...1, year = year...2),
	bind_cols(interpolated[4:6], .name_repair = "unique") %>%
	select(-c(GEOID...49, GEOID...55, year...50, year...56)) %>%
	rename(GEOID = GEOID...1, year = year...2)
	) %>%
	pivot_longer(cols = -c(GEOID, year),
	names_to = "variable",
	values_to = "estimate") %>%
	mutate(moe = 0,
	agg = "count") %>%
	select(names(acs_table_query_longitudinal))

	## Optionally add spatial data and generate a quick map
	interpolated.sf <- interpolated.df %>%
	left_join(., TRACTS.SF, by = "GEOID") %>%
	st_as_sf()

	interpolated.sf %>%
	filter(year == 2015,
	variable == "B25106_001") %>%
	mapview(zcol = "estimate")