diff --git a/R/build_pnadc_panel.R b/R/build_pnadc_panel.R index 6ac7fcb..55e25fe 100644 --- a/R/build_pnadc_panel.R +++ b/R/build_pnadc_panel.R @@ -24,23 +24,24 @@ build_pnadc_panel <- function(dat, panel) { ########################### ## Bind Global Variables ## ########################### - - UPA <- V1008 <- V1014 <- id_dom <- UF <- V20082 <- V20081 <- rs_valid <- unmatched_basic <- NULL - V2008 <- V2007 <- id_ind <- V2003 <- V1016 <- appearances <- V1016 <- id_rs <- unmatched_adv <- NULL - + + UPA <- V1008 <- V1014 <- id_dom <- V20082 <- V20081 <- V2008 <- V2007 <- NULL + Ano <- Trimestre <- id_ind <- num_appearances <- V2003 <- id_rs <- NULL + num_appearances_rs <- q_count_ind <- q_count_rs <- NULL + ############################# ## Define Basic Parameters ## ############################# - + # Check if the panel type is 'none'; if so, return the original data if (panel == "none") { return(dat) } - + ########################## ## Basic Identification ## ########################## - + # If the panel type is not 'none', perform basic identification steps if (panel != "none") { # Household identifier combines UPA, V1008, and V1014, creating an unique number for every combination of those variables, all through the function cur_group_id @@ -49,7 +50,7 @@ build_pnadc_panel <- function(dat, panel) { id_dom = dplyr::cur_group_id(), .by = c(UPA, V1008, V1014) ) - + # Individual id combines the household id, sex (V2007), and date of birth (V20082, V20081, V2008), creating an unique number for every combination of those variables, all through the function cur_group_id dat <- dat %>% dplyr::mutate( @@ -63,7 +64,7 @@ build_pnadc_panel <- function(dat, panel) { dplyr::add_count(id_ind, Ano, Trimestre, name = "num_appearances") %>% # counts number of times that each id_ind appears dplyr::mutate( id_ind = dplyr::case_when( - num_appearances != 1 ~ NA, + num_appearances != 1 ~ NA_real_, .default = id_ind )) @@ -71,25 +72,25 @@ build_pnadc_panel <- function(dat, panel) { dat <- dat %>% dplyr::mutate( id_ind = dplyr::case_when( - V2008 == "99" | V20081 == "99" | V20082 == "9999" ~ NA, + V2008 == "99" | V20081 == "99" | V20082 == "9999" ~ NA_real_, .default = id_ind ) ) } - + ############################# ## Advanced Identification ## ############################# - + ## Stage 1: - + if (!(panel %in% c("none", "basic"))) { m <- max(dat$id_ind, na.rm = TRUE) # to avoid overlap between id numbers # id_rs are always higher numbers than id_ind # remove NAs otherwise m is NA and all id_rs are NAs - + # advanced identification is only run on previously unmatched individuals - + dat <- dat %>% dplyr::mutate( id_rs = dplyr::cur_group_id() + m, @@ -105,7 +106,7 @@ build_pnadc_panel <- function(dat, panel) { dplyr::add_count(id_rs, Ano, Trimestre, name = "num_appearances_rs") %>% # counts number of times that each id_ind appears dplyr::mutate( id_rs = dplyr::case_when( - num_appearances_rs != 1 ~ NA, + num_appearances_rs != 1 ~ NA_real_, .default = id_rs )) @@ -113,7 +114,7 @@ build_pnadc_panel <- function(dat, panel) { dat <- dat %>% dplyr::mutate( id_rs = dplyr::case_when( - V2008 == "99" | V20081 == "99" ~ NA, + V2008 == "99" | V20081 == "99" ~ NA_real_, .default = id_rs ) ) @@ -130,7 +131,7 @@ build_pnadc_panel <- function(dat, panel) { .by = id_rs ) %>% dplyr::mutate( - id_final = dplyr::case_when( + id_rs = dplyr::case_when( # perfect tracking with basic identification q_count_ind == 5 ~ id_ind, @@ -138,8 +139,8 @@ build_pnadc_panel <- function(dat, panel) { # no more than 5 appearances (or else errors are introduced) q_count_rs > q_count_ind & q_count_rs <= 5 ~ id_rs, - # if else stick to id_ind - TRUE ~ id_ind + # fallback: prefer id_ind if available, otherwise keep id_rs + TRUE ~ dplyr::coalesce(id_ind, id_rs) ) ) @@ -148,27 +149,36 @@ build_pnadc_panel <- function(dat, panel) { ########################## ## Pasting panel number ## ########################## - + # to avoid overlap when binding more than one panel (all ids are just counts from 1, ..., N) - + # ifelse guards against as.hexmode(NA) which returns the string "NA" instead of a true NA + # basic panel if (panel != "none") { - dat$id_ind <- paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_ind)) - + dat$id_ind <- ifelse( + is.na(dat$id_ind), + NA_character_, + paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_ind)) + ) + } # advanced panel if (!(panel %in% c("none", "basic"))) { - dat$id_rs <- paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_rs)) + dat$id_rs <- ifelse( + is.na(dat$id_rs), + NA_character_, + paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_rs)) + ) } - + ################# ## Return Data ## ################# - + # Return the modified dataset return(dat) -} +} \ No newline at end of file diff --git a/R/calculate_panel_attrition.R b/R/calculate_panel_attrition.R new file mode 100644 index 0000000..960bee9 --- /dev/null +++ b/R/calculate_panel_attrition.R @@ -0,0 +1,97 @@ +#' Create an attrition table for a PNADc panel file +#' +#' @param data Input data frame (PNADc panel). +#' @param panel ID strategy: "basic" (id_ind), "advanced" (id_rs), or "households" (id_dom). +#' +#' @return A data frame with 8 columns detailing unconditional and conditional attrition. +calculate_panel_attrition <- function(data, panel) { + + # 1. Standardize ID column based on panel type + id_col <- switch(panel, + "basic" = "id_ind", + "advanced" = "id_rs", + "households" = "id_dom", + stop("Invalid panel type. Use 'basic', 'advanced', or 'households'.")) + + message(paste0(stringr::str_to_title(panel), " panel attrition calculated.")) + + # 2. Preparation: Filter for those present in Wave 1 and rename ID + df_prepared <- data %>% + dplyr::mutate(V1016 = as.integer(V1016)) %>% + dplyr::rename(individual_id = !!id_col) + + ids_w1 <- df_prepared %>% + dplyr::filter(V1016 == 1) %>% + dplyr::pull(individual_id) %>% + unique() + + n_w1 <- length(ids_w1) + + # 3. Create a wide presence matrix (1 if present, 0 if absent) + # This makes logical comparisons across waves much faster + presence_matrix <- df_prepared %>% + dplyr::filter(individual_id %in% ids_w1) %>% + dplyr::select(individual_id, V1016) %>% + dplyr::mutate(present = 1) %>% + tidyr::pivot_wider(names_from = V1016, + values_from = present, + names_prefix = "w", + values_fill = 0) + + # 4. Initialize the results storage + results <- data.frame( + wave_step = character(), + n_missing_uncond = numeric(), + n_found_uncond = numeric(), + pct_found_uncond = numeric(), + n_missing_cond = numeric(), + n_found_cond = numeric(), + pct_found_cond_w1 = numeric(), + pct_found_cond_prev = numeric(), + stringsAsFactors = FALSE + ) + + # 5. Calculate metrics for each transition (1->2, 2->3, 3->4, 4->5) + # For conditional, we track the 'survivors' who were in ALL previous waves + current_cond_ids <- ids_w1 + + for (i in 1:4) { + prev_w <- paste0("w", i) + next_w <- paste0("w", i + 1) + + # --- Unconditional Logic --- + # Who was in Wave i? + ids_in_prev <- presence_matrix$individual_id[presence_matrix[[prev_w]] == 1] + # Out of those, who is in Wave i + 1? + found_uncond <- presence_matrix %>% + dplyr::filter(!!rlang::sym(prev_w) == 1, !!rlang::sym(next_w) == 1) %>% + nrow() + + # --- Conditional Logic --- + # Who was in ALL waves from 1 to i? (stored in current_cond_ids) + # Out of those, who is also in Wave i + 1? + found_cond_df <- presence_matrix %>% + dplyr::filter(individual_id %in% current_cond_ids, !!rlang::sym(next_w) == 1) + + n_found_cond <- nrow(found_cond_df) + n_missing_cond <- length(current_cond_ids) - n_found_cond + + # Update denominator for next conditional step + prev_cond_total <- length(current_cond_ids) + current_cond_ids <- found_cond_df$individual_id + + # --- Assemble Row --- + results[i, ] <- list( + wave_step = paste("Wave", i, "to", i + 1), + n_missing_uncond = length(ids_in_prev) - found_uncond, + n_found_uncond = found_uncond, + pct_found_uncond = (found_uncond / n_w1) * 100, + n_missing_cond = n_missing_cond, + n_found_cond = n_found_cond, + pct_found_cond_w1 = (n_found_cond / n_w1) * 100, + pct_found_cond_prev = (n_found_cond / prev_cond_total) * 100 + ) + } + + return(results) +} \ No newline at end of file