Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 38 additions & 28 deletions R/build_pnadc_panel.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,24 @@ build_pnadc_panel <- function(dat, panel) {
###########################
## Bind Global Variables ##
###########################

UPA <- V1008 <- V1014 <- id_dom <- UF <- V20082 <- V20081 <- rs_valid <- unmatched_basic <- NULL
V2008 <- V2007 <- id_ind <- V2003 <- V1016 <- appearances <- V1016 <- id_rs <- unmatched_adv <- NULL


UPA <- V1008 <- V1014 <- id_dom <- V20082 <- V20081 <- V2008 <- V2007 <- NULL
Ano <- Trimestre <- id_ind <- num_appearances <- V2003 <- id_rs <- NULL
num_appearances_rs <- q_count_ind <- q_count_rs <- NULL

#############################
## Define Basic Parameters ##
#############################

# Check if the panel type is 'none'; if so, return the original data
if (panel == "none") {
return(dat)
}

##########################
## Basic Identification ##
##########################

# If the panel type is not 'none', perform basic identification steps
if (panel != "none") {
# Household identifier combines UPA, V1008, and V1014, creating an unique number for every combination of those variables, all through the function cur_group_id
Expand All @@ -49,7 +50,7 @@ build_pnadc_panel <- function(dat, panel) {
id_dom = dplyr::cur_group_id(),
.by = c(UPA, V1008, V1014)
)

# Individual id combines the household id, sex (V2007), and date of birth (V20082, V20081, V2008), creating an unique number for every combination of those variables, all through the function cur_group_id
dat <- dat %>%
dplyr::mutate(
Expand All @@ -63,33 +64,33 @@ build_pnadc_panel <- function(dat, panel) {
dplyr::add_count(id_ind, Ano, Trimestre, name = "num_appearances") %>% # counts number of times that each id_ind appears
dplyr::mutate(
id_ind = dplyr::case_when(
num_appearances != 1 ~ NA,
num_appearances != 1 ~ NA_real_,
.default = id_ind
))

# missing values

dat <- dat %>% dplyr::mutate(
id_ind = dplyr::case_when(
V2008 == "99" | V20081 == "99" | V20082 == "9999" ~ NA,
V2008 == "99" | V20081 == "99" | V20082 == "9999" ~ NA_real_,
.default = id_ind
)
)
}

#############################
## Advanced Identification ##
#############################

## Stage 1:

if (!(panel %in% c("none", "basic"))) {
m <- max(dat$id_ind, na.rm = TRUE) # to avoid overlap between id numbers
# id_rs are always higher numbers than id_ind
# remove NAs otherwise m is NA and all id_rs are NAs

# advanced identification is only run on previously unmatched individuals

dat <- dat %>%
dplyr::mutate(
id_rs = dplyr::cur_group_id() + m,
Expand All @@ -105,15 +106,15 @@ build_pnadc_panel <- function(dat, panel) {
dplyr::add_count(id_rs, Ano, Trimestre, name = "num_appearances_rs") %>% # counts number of times that each id_ind appears
dplyr::mutate(
id_rs = dplyr::case_when(
num_appearances_rs != 1 ~ NA,
num_appearances_rs != 1 ~ NA_real_,
.default = id_rs
))

# missing values

dat <- dat %>% dplyr::mutate(
id_rs = dplyr::case_when(
V2008 == "99" | V20081 == "99" ~ NA,
V2008 == "99" | V20081 == "99" ~ NA_real_,
.default = id_rs
)
)
Expand All @@ -130,16 +131,16 @@ build_pnadc_panel <- function(dat, panel) {
.by = id_rs
) %>%
dplyr::mutate(
id_final = dplyr::case_when(
id_rs = dplyr::case_when(
# perfect tracking with basic identification
q_count_ind == 5 ~ id_ind,

# id_rs takes over if it finds more quarters than id_ind does
# no more than 5 appearances (or else errors are introduced)
q_count_rs > q_count_ind & q_count_rs <= 5 ~ id_rs,

# if else stick to id_ind
TRUE ~ id_ind
# fallback: prefer id_ind if available, otherwise keep id_rs
TRUE ~ dplyr::coalesce(id_ind, id_rs)
)
)

Expand All @@ -148,27 +149,36 @@ build_pnadc_panel <- function(dat, panel) {
##########################
## Pasting panel number ##
##########################

# to avoid overlap when binding more than one panel (all ids are just counts from 1, ..., N)

# ifelse guards against as.hexmode(NA) which returns the string "NA" instead of a true NA

# basic panel
if (panel != "none") {

dat$id_ind <- paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_ind))

dat$id_ind <- ifelse(
is.na(dat$id_ind),
NA_character_,
paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_ind))
)

}

# advanced panel
if (!(panel %in% c("none", "basic"))) {

dat$id_rs <- paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_rs))
dat$id_rs <- ifelse(
is.na(dat$id_rs),
NA_character_,
paste0(as.hexmode(dat$V1014), as.hexmode(dat$id_rs))
)

}

#################
## Return Data ##
#################

# Return the modified dataset
return(dat)
}
}
97 changes: 97 additions & 0 deletions R/calculate_panel_attrition.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#' Create an attrition table for a PNADc panel file
#'
#' @param data Input data frame (PNADc panel).
#' @param panel ID strategy: "basic" (id_ind), "advanced" (id_rs), or "households" (id_dom).
#'
#' @return A data frame with 8 columns detailing unconditional and conditional attrition.
calculate_panel_attrition <- function(data, panel) {

# 1. Standardize ID column based on panel type
id_col <- switch(panel,
"basic" = "id_ind",
"advanced" = "id_rs",
"households" = "id_dom",
stop("Invalid panel type. Use 'basic', 'advanced', or 'households'."))

message(paste0(stringr::str_to_title(panel), " panel attrition calculated."))

# 2. Preparation: Filter for those present in Wave 1 and rename ID
df_prepared <- data %>%
dplyr::mutate(V1016 = as.integer(V1016)) %>%
dplyr::rename(individual_id = !!id_col)

ids_w1 <- df_prepared %>%
dplyr::filter(V1016 == 1) %>%
dplyr::pull(individual_id) %>%
unique()

n_w1 <- length(ids_w1)

# 3. Create a wide presence matrix (1 if present, 0 if absent)
# This makes logical comparisons across waves much faster
presence_matrix <- df_prepared %>%
dplyr::filter(individual_id %in% ids_w1) %>%
dplyr::select(individual_id, V1016) %>%
dplyr::mutate(present = 1) %>%
tidyr::pivot_wider(names_from = V1016,
values_from = present,
names_prefix = "w",
values_fill = 0)

# 4. Initialize the results storage
results <- data.frame(
wave_step = character(),
n_missing_uncond = numeric(),
n_found_uncond = numeric(),
pct_found_uncond = numeric(),
n_missing_cond = numeric(),
n_found_cond = numeric(),
pct_found_cond_w1 = numeric(),
pct_found_cond_prev = numeric(),
stringsAsFactors = FALSE
)

# 5. Calculate metrics for each transition (1->2, 2->3, 3->4, 4->5)
# For conditional, we track the 'survivors' who were in ALL previous waves
current_cond_ids <- ids_w1

for (i in 1:4) {
prev_w <- paste0("w", i)
next_w <- paste0("w", i + 1)

# --- Unconditional Logic ---
# Who was in Wave i?
ids_in_prev <- presence_matrix$individual_id[presence_matrix[[prev_w]] == 1]
# Out of those, who is in Wave i + 1?
found_uncond <- presence_matrix %>%
dplyr::filter(!!rlang::sym(prev_w) == 1, !!rlang::sym(next_w) == 1) %>%
nrow()

# --- Conditional Logic ---
# Who was in ALL waves from 1 to i? (stored in current_cond_ids)
# Out of those, who is also in Wave i + 1?
found_cond_df <- presence_matrix %>%
dplyr::filter(individual_id %in% current_cond_ids, !!rlang::sym(next_w) == 1)

n_found_cond <- nrow(found_cond_df)
n_missing_cond <- length(current_cond_ids) - n_found_cond

# Update denominator for next conditional step
prev_cond_total <- length(current_cond_ids)
current_cond_ids <- found_cond_df$individual_id

# --- Assemble Row ---
results[i, ] <- list(
wave_step = paste("Wave", i, "to", i + 1),
n_missing_uncond = length(ids_in_prev) - found_uncond,
n_found_uncond = found_uncond,
pct_found_uncond = (found_uncond / n_w1) * 100,
n_missing_cond = n_missing_cond,
n_found_cond = n_found_cond,
pct_found_cond_w1 = (n_found_cond / n_w1) * 100,
pct_found_cond_prev = (n_found_cond / prev_cond_total) * 100
)
}

return(results)
}