#' Harmonize a legacy private-foundation (PF) core/SOI file
#'
#' Reads the file at `path`, upper-cases its column names, keeps the columns
#' that have a non-empty mapping in `xwalk_pf`, renames them to their
#' harmonized names, and adds the derived columns `TAX_YEAR` and `EIN2`.
#' A summary of the run is written to `logger`.
#'
#' @param path Length-one character path to the legacy file; it is read with
#'   `data.table::fread()`. The year reported in the log is parsed from this
#'   string.
#' @param xwalk_pf Crosswalk table with columns `VAR_NAME_OLD` (legacy column
#'   name) and `VAR_NAME_NEW` (harmonized column name).
#' @param logger Logger object passed to `log4r::info()`.
#' @return A data.table holding the harmonized columns plus `TAX_YEAR` and
#'   `EIN2`.
harmonize_pf <- function(path, xwalk_pf, logger) {
  # Extract the calendar year for the legacy core/soi dataset
  year <- stringr::str_extract(path, "(19|20)\\d{2}")
  if (is.na(year)) {
    # Fall back to a two-digit year, which is taken to be in the 2000s.
    # Leave `year` as NA when the path holds no digits instead of "20NA".
    short_year <- stringr::str_extract(path, "\\d{2}")
    if (!is.na(short_year)) {
      year <- paste0("20", short_year)
    }
  }

  # Read file into data.table object
  pf_legacy_dt <- data.table::fread(path)
  # data.table::setnames() renames by reference. stats::setNames() returns a
  # renamed copy, so calling it without assignment leaves the names untouched.
  data.table::setnames(pf_legacy_dt, toupper(names(pf_legacy_dt)))

  # Get column names
  legacy_colnames <- names(pf_legacy_dt)

  # Crosswalk rows whose legacy column is present and has a usable new name
  xwalk_matched <- xwalk_pf |>
    dplyr::filter(
      VAR_NAME_OLD %in% legacy_colnames,
      !is.na(VAR_NAME_NEW),
      VAR_NAME_NEW != ""
    )
  # setnames() rejects duplicated `old` names, so keep only the first mapping
  # of each legacy column and remember which ones were repeated.
  repeated_old <- unique(
    xwalk_matched$VAR_NAME_OLD[duplicated(xwalk_matched$VAR_NAME_OLD)]
  )
  xwalk_sample <- dplyr::distinct(xwalk_matched, VAR_NAME_OLD,
                                  .keep_all = TRUE)

  # Get column names that are absent from the crosswalk
  unharmonized_cols <- setdiff(legacy_colnames, xwalk_sample$VAR_NAME_OLD)

  # Perform harmonization
  harmonizable_cols <- xwalk_sample$VAR_NAME_OLD
  pf_legacy_sample <- pf_legacy_dt[, ..harmonizable_cols]
  data.table::setnames(
    pf_legacy_sample,
    xwalk_sample$VAR_NAME_OLD,
    xwalk_sample$VAR_NAME_NEW
  )

  # Duplicated column names - need to log
  harmonized_names <- names(pf_legacy_sample)
  dup_columns <- harmonized_names[duplicated(harmonized_names)]

  # Create column containing tax year. The F9_00_* names are harmonized
  # names, so they must be looked up in the renamed table rather than in the
  # vector of legacy names.
  if ("F9_00_FISCAL_YEAR_END" %in% harmonized_names) {
    pf_legacy_sample[, TAX_YEAR := substr(F9_00_FISCAL_YEAR_END, 1, 4)]
  } else if ("F9_00_TAX_PERIOD_END_DATE" %in% harmonized_names) {
    pf_legacy_sample[, TAX_YEAR := substr(F9_00_TAX_PERIOD_END_DATE, 1, 4)]
  } else {
    stop(
      "Cannot derive TAX_YEAR for '", path, "': neither ",
      "F9_00_FISCAL_YEAR_END nor F9_00_TAX_PERIOD_END_DATE is present ",
      "after harmonization.",
      call. = FALSE
    )
  }

  # Create EIN2 column via the project helper format_ein()
  if (!"F9_00_ORG_EIN" %in% harmonized_names) {
    stop(
      "Cannot derive EIN2 for '", path, "': F9_00_ORG_EIN is not present ",
      "after harmonization.",
      call. = FALSE
    )
  }
  pf_legacy_sample[, EIN2 := format_ein(F9_00_ORG_EIN, to = "id")]

  # Log outputs
  log4r::info(logger, message = paste("Year", year))
  log4r::info(
    logger,
    message = paste("Unharmonized cols:",
                    paste0(unharmonized_cols, collapse = ", "))
  )
  log4r::info(
    logger,
    message = paste("Harmonized cols:",
                    paste0(harmonizable_cols, collapse = ", "))
  )
  log4r::info(
    logger,
    message = paste("Duplicated cols:", paste0(dup_columns, collapse = ", "))
  )
  if (length(repeated_old) > 0) {
    log4r::info(
      logger,
      message = paste("Legacy cols with repeated crosswalk rows (first kept):",
                      paste0(repeated_old, collapse = ", "))
    )
  }

  pf_legacy_sample
}
Generalize the following harmonization function in
R/pf_harmonization_test.R. The function should:
@Cprinvil1996 Create a branch titled iss3 and submit a PR