Skip to content

Harmonization function #3

@Thiyaghessan

Description

@Thiyaghessan

Generalize the following harmonization function in R/pf_harmonization_test.R

The function should:

  • Work with any dataset given a crosswalk
  • Come with the appropriate roxygen2 documentation

@Cprinvil1996 Create a branch titled iss3 and submit a PR

#' Harmonize the column names of a legacy dataset using a crosswalk
#'
#' Reads the file at `path`, upper-cases its column names, keeps only the
#' columns that the crosswalk maps to a non-empty new name, renames them,
#' and adds the derived columns `TAX_YEAR` and `EIN2`. A summary of the
#' harmonized, unharmonized and duplicated columns is written to `logger`.
#'
#' @param path Path to the legacy file, read with [data.table::fread()].
#'   A four-digit year (19xx/20xx) in the path, or failing that the first
#'   two-digit run (prefixed with "20"), is logged as the dataset year.
#' @param xwalk_pf Crosswalk data frame with character columns
#'   `VAR_NAME_OLD` (legacy column name) and `VAR_NAME_NEW` (harmonized
#'   column name). Rows whose `VAR_NAME_NEW` is `NA` or `""` are ignored.
#'   If an old name appears more than once, only its first row is used.
#' @param logger A logger object passed to [log4r::info()].
#'
#' @return A `data.table` holding the harmonized columns plus `TAX_YEAR`
#'   and `EIN2`.
#'
#' @details The harmonized data must contain `F9_00_ORG_EIN` and either
#'   `F9_00_FISCAL_YEAR_END` or `F9_00_TAX_PERIOD_END_DATE`; otherwise the
#'   derived-column steps fail. `EIN2` is produced by the project helper
#'   `format_ein()` (defined outside this function).
harmonize_pf <- function(path, xwalk_pf, logger) {

  # Extract the calendar year for the legacy core/soi dataset. Fall back to
  # a two-digit year; leave `year` as NA when the path has no digits at all
  # rather than building the string "20NA".
  year <- stringr::str_extract(path, "(19|20)\\d{2}")
  if (is.na(year)) {
    short_year <- stringr::str_extract(path, "\\d{2}")
    if (!is.na(short_year)) {
      year <- paste0("20", short_year)
    }
  }

  # Read file into data.table object and upper-case the column names.
  # data.table::setnames() renames by reference; stats::setNames() returns a
  # modified copy, so calling it without assignment has no effect.
  pf_legacy_dt <- data.table::fread(path)
  data.table::setnames(pf_legacy_dt, toupper(names(pf_legacy_dt)))

  # Get column names
  legacy_colnames <- names(pf_legacy_dt)

  # Create xwalk sample: rows that apply to this file and have a usable
  # new name
  xwalk_sample <- xwalk_pf |>
    dplyr::filter(VAR_NAME_OLD %in% legacy_colnames,
                  !is.na(VAR_NAME_NEW),
                  VAR_NAME_NEW != "")

  # Keep one mapping per old name: data.table::setnames() stops when `old`
  # contains duplicates, and each legacy column can only be renamed once.
  xwalk_sample <- xwalk_sample[!duplicated(xwalk_sample$VAR_NAME_OLD), ]

  # Get column names that are absent from the crosswalk
  unharmonized_cols <- setdiff(legacy_colnames, xwalk_sample$VAR_NAME_OLD)

  # Perform harmonization
  harmonizable_cols <- xwalk_sample$VAR_NAME_OLD
  pf_legacy_sample <- pf_legacy_dt[, ..harmonizable_cols]
  data.table::setnames(pf_legacy_sample,
                       xwalk_sample$VAR_NAME_OLD,
                       xwalk_sample$VAR_NAME_NEW)

  # Duplicated column names (several old names mapped to the same new
  # name) - need to log
  harmonized_names <- names(pf_legacy_sample)
  dup_columns <- harmonized_names[duplicated(harmonized_names)]

  # Create column containing tax year. The check runs against the renamed
  # columns because F9_00_FISCAL_YEAR_END is read from the harmonized data.
  if ("F9_00_FISCAL_YEAR_END" %in% harmonized_names) {
    pf_legacy_sample[, TAX_YEAR := substr(F9_00_FISCAL_YEAR_END, 1, 4)]
  } else {
    pf_legacy_sample[, TAX_YEAR := substr(F9_00_TAX_PERIOD_END_DATE, 1, 4)]
  }

  # Create EIN2 Column with check
  pf_legacy_sample[, EIN2 := format_ein(F9_00_ORG_EIN, to = "id")]

  # Log outputs
  log4r::info(logger, message = paste("Year", year))
  log4r::info(
    logger,
    message = paste("Unharmonized cols:",
                    paste0(unharmonized_cols, collapse = ", "))
  )
  log4r::info(
    logger,
    message = paste("Harmonized cols:",
                    paste0(harmonizable_cols, collapse = ", "))
  )
  log4r::info(
    logger,
    message = paste("Duplicated cols:", paste0(dup_columns, collapse = ", "))
  )

  pf_legacy_sample
}

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions