datazoompuc · bernardo-sieira · Mar 23, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026
diff --git a/R/build_pnadc_panel.R b/R/build_pnadc_panel.R
@@ -24,23 +24,24 @@ build_pnadc_panel <- function(dat, panel) {
   ###########################
   ## Bind Global Variables ##
   ###########################
-
-  UPA <- V1008 <- V1014 <- id_dom <- UF <- V20082 <- V20081 <- rs_valid <-  unmatched_basic <- NULL
-  V2008 <- V2007 <- id_ind <- V2003 <- V1016 <- appearances <- V1016 <- id_rs <- unmatched_adv <- NULL
-
+
+  UPA <- V1008 <- V1014 <- id_dom <- V20082 <- V20081 <- V2008 <- V2007 <- NULL
+  Ano <- Trimestre <- id_ind <- num_appearances <- V2003 <- id_rs <- NULL
+  num_appearances_rs <- q_count_ind <- q_count_rs <- NULL
+
   #############################
   ## Define Basic Parameters ## 
   #############################
-
+  
   # Check if the panel type is 'none'; if so, return the original data
   if (panel == "none") {
     return(dat)
   }
-
+  
   ##########################
   ## Basic Identification ##
   ##########################
-
+  
   # If the panel type is not 'none', perform basic identification steps
   if (panel != "none") {
     # Household identifier combines UPA, V1008, and V1014, creating an unique number for every combination of those variables, all through the function cur_group_id
@@ -49,7 +50,7 @@ build_pnadc_panel <- function(dat, panel) {
         id_dom = dplyr::cur_group_id(),
         .by = c(UPA, V1008, V1014)
       )
-
+    
     # Individual id combines the household id, sex (V2007), and date of birth (V20082, V20081, V2008), creating an unique number for every combination of those variables, all through the function cur_group_id
     dat <- dat %>%
       dplyr::mutate(
@@ -63,33 +64,33 @@ build_pnadc_panel <- function(dat, panel) {
       dplyr::add_count(id_ind, Ano, Trimestre, name = "num_appearances") %>% # counts number of times that each id_ind appears
       dplyr::mutate(
         id_ind = dplyr::case_when(
-          num_appearances != 1 ~ NA,
+          num_appearances != 1 ~ NA_real_,
           .default = id_ind
         ))
 
     # missing values
 
     dat <- dat %>% dplyr::mutate(
       id_ind = dplyr::case_when(
-        V2008 == "99" | V20081 == "99" | V20082 == "9999" ~ NA,
+        V2008 == "99" | V20081 == "99" | V20082 == "9999" ~ NA_real_,
         .default = id_ind
       )
     )
   }
-
+  
   #############################
   ## Advanced Identification ##
   #############################
-
+  
   ## Stage 1:
-
+  
   if (!(panel %in% c("none", "basic"))) {
     m <- max(dat$id_ind, na.rm = TRUE) # to avoid overlap between id numbers
     # id_rs are always higher numbers than id_ind
     # remove NAs otherwise m is NA and all id_rs are NAs
-
+    
     # advanced identification is only run on previously unmatched individuals
-
+    
     dat <- dat %>%
       dplyr::mutate(
         id_rs = dplyr::cur_group_id() + m,
@@ -105,15 +106,15 @@ build_pnadc_panel <- function(dat, panel) {
       dplyr::add_count(id_rs, Ano, Trimestre, name = "num_appearances_rs") %>% # counts number of times that each id_ind appears
       dplyr::mutate(
         id_rs = dplyr::case_when(
-          num_appearances_rs != 1 ~ NA,
+          num_appearances_rs != 1 ~ NA_real_,
           .default = id_rs
         ))
 
     # missing values
 
     dat <- dat %>% dplyr::mutate(
       id_rs = dplyr::case_when(
-        V2008 == "99" | V20081 == "99" ~ NA,
+        V2008 == "99" | V20081 == "99" ~ NA_real_,
         .default = id_rs
       )
     )
@@ -130,16 +131,16 @@ build_pnadc_panel <- function(dat, panel) {
         .by = id_rs
       ) %>%
       dplyr::mutate(
-        id_final = dplyr::case_when(
+        id_rs = dplyr::case_when(
           # perfect tracking with basic identification
           q_count_ind == 5 ~ id_ind,
 
           # id_rs takes over if it finds more quarters than id_ind does
           # no more than 5 appearances (or else errors are introduced)
           q_count_rs > q_count_ind & q_count_rs <= 5 ~ id_rs,
 
-          # if else stick to id_ind
-          TRUE ~ id_ind
+          # fallback: prefer id_ind if available, otherwise keep id_rs
+          TRUE ~ dplyr::coalesce(id_ind, id_rs)
         )
       )
 
@@ -148,27 +149,36 @@ build_pnadc_panel <- function(dat, panel) {
   ##########################
   ## Pasting panel number ##
   ##########################
-
+  
   # to avoid overlap when binding more than one panel (all ids are just counts from 1, ..., N)
-
+  # ifelse guards against as.hexmode(NA) which returns the string "NA" instead of a true NA
+
   # basic panel
   if (panel != "none") {
 
-    dat$id_ind <- paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_ind))
-
+    dat$id_ind <- ifelse(
+      is.na(dat$id_ind),
+      NA_character_,
+      paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_ind))
+    )
+
   }
 
   # advanced panel
   if (!(panel %in% c("none", "basic"))) {
 
-    dat$id_rs <- paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_rs))
+    dat$id_rs <- ifelse(
+      is.na(dat$id_rs),
+      NA_character_,
+      paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_rs))
+    )
 
   }
-
+  
   #################
   ## Return Data ##
   #################
-
+  
   # Return the modified dataset
   return(dat)
-}
+}
diff --git a/R/calculate_panel_attrition.R b/R/calculate_panel_attrition.R
@@ -0,0 +1,97 @@
+#' Create an attrition table for a PNADc panel file
+#'
+#' @param data Input data frame (PNADc panel).
+#' @param panel ID strategy: "basic" (id_ind), "advanced" (id_rs), or "households" (id_dom).
+#'
+#' @return A data frame with 8 columns detailing unconditional and conditional attrition.
+calculate_panel_attrition <- function(data, panel) {
+
+  # 1. Standardize ID column based on panel type
+  id_col <- switch(panel,
+                   "basic"      = "id_ind",
+                   "advanced"   = "id_rs",
+                   "households" = "id_dom",
+                   stop("Invalid panel type. Use 'basic', 'advanced', or 'households'."))
+
+  message(paste0(stringr::str_to_title(panel), " panel attrition calculated."))
+
+  # 2. Preparation: Filter for those present in Wave 1 and rename ID
+  df_prepared <- data %>%
+    dplyr::mutate(V1016 = as.integer(V1016)) %>%
+    dplyr::rename(individual_id = !!id_col)
+
+  ids_w1 <- df_prepared %>%
+    dplyr::filter(V1016 == 1) %>%
+    dplyr::pull(individual_id) %>%
+    unique()
+
+  n_w1 <- length(ids_w1)
+
+  # 3. Create a wide presence matrix (1 if present, 0 if absent)
+  # This makes logical comparisons across waves much faster
+  presence_matrix <- df_prepared %>%
+    dplyr::filter(individual_id %in% ids_w1) %>%
+    dplyr::select(individual_id, V1016) %>%
+    dplyr::mutate(present = 1) %>%
+    tidyr::pivot_wider(names_from = V1016, 
+                       values_from = present, 
+                       names_prefix = "w", 
+                       values_fill = 0)
+
+  # 4. Initialize the results storage
+  results <- data.frame(
+    wave_step               = character(),
+    n_missing_uncond        = numeric(),
+    n_found_uncond          = numeric(),
+    pct_found_uncond        = numeric(),
+    n_missing_cond          = numeric(),
+    n_found_cond            = numeric(),
+    pct_found_cond_w1       = numeric(),
+    pct_found_cond_prev     = numeric(),
+    stringsAsFactors        = FALSE
+  )
+
+  # 5. Calculate metrics for each transition (1->2, 2->3, 3->4, 4->5)
+  # For conditional, we track the 'survivors' who were in ALL previous waves
+  current_cond_ids <- ids_w1
+
+  for (i in 1:4) {
+    prev_w <- paste0("w", i)
+    next_w <- paste0("w", i + 1)
+
+    # --- Unconditional Logic ---
+    # Who was in Wave i?
+    ids_in_prev <- presence_matrix$individual_id[presence_matrix[[prev_w]] == 1]
+    # Out of those, who is in Wave i + 1?
+    found_uncond <- presence_matrix %>%
+      dplyr::filter(!!rlang::sym(prev_w) == 1, !!rlang::sym(next_w) == 1) %>%
+      nrow()
+
+    # --- Conditional Logic ---
+    # Who was in ALL waves from 1 to i? (stored in current_cond_ids)
+    # Out of those, who is also in Wave i + 1?
+    found_cond_df <- presence_matrix %>%
+      dplyr::filter(individual_id %in% current_cond_ids, !!rlang::sym(next_w) == 1)
+
+    n_found_cond <- nrow(found_cond_df)
+    n_missing_cond <- length(current_cond_ids) - n_found_cond
+
+    # Update denominator for next conditional step
+    prev_cond_total <- length(current_cond_ids)
+    current_cond_ids <- found_cond_df$individual_id
+
+    # --- Assemble Row ---
+    results[i, ] <- list(
+      wave_step           = paste("Wave", i, "to", i + 1),
+      n_missing_uncond    = length(ids_in_prev) - found_uncond,
+      n_found_uncond      = found_uncond,
+      pct_found_uncond    = (found_uncond / n_w1) * 100,
+      n_missing_cond      = n_missing_cond,
+      n_found_cond        = n_found_cond,
+      pct_found_cond_w1   = (n_found_cond / n_w1) * 100,
+      pct_found_cond_prev = (n_found_cond / prev_cond_total) * 100
+    )
+  }
+
+  return(results)
+}