From 84e395dde56037704fb242d7cf599ae37494716e Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Mon, 9 Mar 2026 00:16:02 -0300 Subject: [PATCH 01/11] fst and parquet added. It still needs improvement in doc, but is working. --- DESCRIPTION | 2 + R/load_pnadc.R | 114 ++++++++++++++++++++++++++++++------------------- 2 files changed, 71 insertions(+), 45 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4a851b4..09a6572 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,8 +17,10 @@ Description: Functions to download and treat Brazilian social data from a License: MIT + file LICENSE URL: https://datazoom.com.br/en/ Imports: + arrow, data.table, dplyr, + fst, PNADcIBGE, purrr, readr, diff --git a/R/load_pnadc.R b/R/load_pnadc.R index d12ef65..2f783f7 100644 --- a/R/load_pnadc.R +++ b/R/load_pnadc.R @@ -26,9 +26,13 @@ #' } #' @export -load_pnadc <- function(save_to = getwd(), years, - quarters = 1:4, panel = "advanced", - raw_data = FALSE) { +load_pnadc <- function(save_to = getwd(), + years, + quarters = 1:4, + panel = "advanced", + raw_data = FALSE, + save_quarters = FALSE, # Doesn't work if panel = "none" + panel_format = c(".csv", ".parquet")) { # Check if PNADcIBGE namespace is already attached if (!"PNADcIBGE" %in% .packages()) { # If not attached, attach it @@ -62,7 +66,8 @@ load_pnadc <- function(save_to = getwd(), years, param$panel <- panel # which panel algorithm (none, basic or advanced) should be applied to this data, check our READ-ME for greater explanation param$raw_data <- raw_data # A command to define if the user would like to download the raw data from the IBGE website directly param$save_to <- save_to # the directory in which the user desires to save the files downloaded - + panel_format <- match.arg(panel_format) + # Check if quarter is a list; if not, wrap it in a list and repeat it for each year if (!is.list(quarters)) { param$quarters <- rep(list(quarters), length(years)) @@ -120,7 +125,7 @@ load_pnadc <- function(save_to = 
getwd(), years, #<<- stabilishing a variable inside the function that continues to exist outside the function, it is not just local to the function's current context file_path <- file.path( - param$save_to, paste0("pnadc_", year, "_", quarter, ".rds") # defining the file's names to a certain format: year= 2022, quarter=3, file -> pnadc_2022_3.rds + param$save_to, paste0("pnadc_", year, "_", quarter, ".fst") # defining the file's names to a certain format: year= 2022, quarter=3, file -> pnadc_2022_3.fst ) # runs data cleaning if desired @@ -136,7 +141,7 @@ load_pnadc <- function(save_to = getwd(), years, paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n") ) - readr::write_rds(df, file_path, compress = "gz") # saving the file into the user's computer + fst::write_fst(as.data.frame(df), file_path, compress = 50) # saving the file into the user's computer return(file_path) } @@ -149,6 +154,7 @@ load_pnadc <- function(save_to = getwd(), years, ## Return Raw Data if (param$panel == "none") { + return(paste("Quarters saved to", param$save_to)) } @@ -161,54 +167,58 @@ load_pnadc <- function(save_to = getwd(), years, panel_list <- unique(panel_list) # listing all the panels included in the quarters downloaded - # set up .csv file paths for each panel such as "pnadc_panel_2.csv" + # set up file paths for each panel according to panel_format panel_files <- purrr::map( panel_list, function(panel) { + ext <- if (panel_format == ".parquet") ".parquet" else ".csv" file_path <- file.path( - param$save_to, paste0("pnadc", "_panel_", panel, ".csv") + param$save_to, paste0("pnadc", "_panel_", panel, ext) ) - file_path } ) # write an empty dataframe into each - purrr::map( - panel_files, - function(path) { - readr::write_csv(data.frame(), path, col_names = cnames) - } - ) + if (panel_format == ".csv") { + purrr::map( + panel_files, + function(path) { + readr::write_csv(data.frame(), path, col_names = cnames) + } + ) + } - # read each of the source files, split into 
panels, and append - # to their corresponding .csv files + # read each of the source .fst files, split into panels, and append + # to their corresponding panel files # we use the .csv files because they have a appending propriety, meaning that they can receive new information without having the older one deleted # for the R users, you can simply think as literally doing a rbind() into those files, but in a much more efficient way - purrr::map( - source_files, # source_files= the .rds files with the data that were downloaded way before in this function before - function(file) { - dat <- readr::read_rds(file) %>% - split(.$V1014) - - dat %>% - purrr::imap( - function(df, panel) { - file_path <- file.path( - param$save_to, paste0("pnadc", "_panel_", panel, ".csv") - ) - - message(paste("Compiling panel", panel, "to", file_path, "\n")) - - readr::write_csv(df, file_path, append = TRUE) # append=TRUE allows us to add new info without deleting the older one, as comented above - } - ) - } - ) + purrr::map(source_files, # source_files= the .fst files with the data that were downloaded way before in this function before + function(file) { + dat <- fst::read_fst(file) %>% + split(.$V1014) + + dat %>% + purrr::imap(function(df, panel) { + ext <- if (panel_format == ".parquet") + ".parquet" + else + ".csv" + file_path <- file.path(param$save_to, paste0("pnadc", "_panel_", panel, ext)) + + message(paste("Compiling panel", panel, "to", file_path, "\n")) + + if (panel_format == "csv") { + readr::write_csv(df, file_path, append = TRUE) # append=TRUE allows us to add new info without deleting the older one, as comented above + } else { + arrow::write_parquet(df, sink = file_path) + } + }) + }) ########################## ## Panel Identification ## @@ -240,16 +250,30 @@ load_pnadc <- function(save_to = getwd(), years, function(path) { message(paste("Running", param$panel, "identification on", path, "\n")) - df <- data.table::fread( - path, - col.names = cnames, - colClasses = ctypes 
- ) %>% - build_pnadc_panel(panel = param$panel) - - readr::write_csv(df, path) + if (panel_format == "csv") { + df <- data.table::fread( + path, + col.names = cnames, + colClasses = ctypes + ) %>% + build_pnadc_panel(panel = param$panel) + readr::write_csv(df, path) + } else { + df <- arrow::read_parquet(path) %>% + build_pnadc_panel(panel = param$panel) + arrow::write_parquet(df, sink = path) + } } ) + + # delete .fst quarter files if user did not ask to keep them + # (only applies when panel != "none"; when panel = "none" quarters are always kept) + if (!save_quarters) { + purrr::walk(source_files, function(f) { + if (file.exists(f)) file.remove(f) + }) + } + } #################### From 773c4e2f4dbc63e2b0c492f80f8441b55581c234 Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Mon, 9 Mar 2026 00:17:19 -0300 Subject: [PATCH 02/11] mistakes --- R/load_pnadc.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/load_pnadc.R b/R/load_pnadc.R index 2f783f7..117117a 100644 --- a/R/load_pnadc.R +++ b/R/load_pnadc.R @@ -212,7 +212,7 @@ load_pnadc <- function(save_to = getwd(), message(paste("Compiling panel", panel, "to", file_path, "\n")) - if (panel_format == "csv") { + if (panel_format == ".csv") { readr::write_csv(df, file_path, append = TRUE) # append=TRUE allows us to add new info without deleting the older one, as comented above } else { arrow::write_parquet(df, sink = file_path) @@ -250,7 +250,7 @@ load_pnadc <- function(save_to = getwd(), function(path) { message(paste("Running", param$panel, "identification on", path, "\n")) - if (panel_format == "csv") { + if (panel_format == ".csv") { df <- data.table::fread( path, col.names = cnames, From 45b78b79f73e3690280392680a4998da53876da2 Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Mon, 9 Mar 2026 00:31:41 -0300 Subject: [PATCH 03/11] in-file doc --- R/load_pnadc.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/R/load_pnadc.R 
b/R/load_pnadc.R index 117117a..d9e40a3 100644 --- a/R/load_pnadc.R +++ b/R/load_pnadc.R @@ -7,9 +7,13 @@ #' @param quarters The quarters within those years to be downloaded. Can be a numeric vector or a list of vectors, for different quarters per year. #' @param panel A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")} #' @param raw_data A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables. +###CHANGE +#' @param save_quarters A \code{logical}. If \code{TRUE}, keeps each quarter saved as a .fst file after the panel is built. If \code{FALSE} (default), the .fst files are deleted after use. Ignored when \code{panel = "none"} (quarters are always kept in that case). +#' @param panel_format A \code{character} choosing the output format for panel files: ".csv" (default) or ".parquet". +###CHANGE #' #' @return A message indicating the successful save of panel files. -#' +#' #' @import data.table #' @import PNADcIBGE #' @importFrom magrittr `%>%` @@ -20,12 +24,15 @@ #' save_to = "Directory/You/Would/like/to/save/the/files", #' years = 2016, #' quarters = 1:4, -#' panel = "basic", -#' raw_data = FALSE +#' panel = "advanced", +#' raw_data = FALSE, +#' save_quarters = TRUE, +#' pane_format = ".parquet" #' ) #' } #' @export + load_pnadc <- function(save_to = getwd(), years, quarters = 1:4, From 9f6aa15f8175d3ad80bfd7b81a80d571543d6701 Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Mon, 9 Mar 2026 00:42:04 -0300 Subject: [PATCH 04/11] Doc + mistakes --- R/load_pnadc.R | 2 -- vignettes/LOAD_PNADC.Rmd | 51 +++++++++++++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/R/load_pnadc.R b/R/load_pnadc.R index d9e40a3..f421744 100644 --- a/R/load_pnadc.R +++ b/R/load_pnadc.R @@ -7,10 +7,8 @@ #' @param quarters The quarters within those years to be downloaded. 
Can be a numeric vector or a list of vectors, for different quarters per year. #' @param panel A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")} #' @param raw_data A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables. -###CHANGE #' @param save_quarters A \code{logical}. If \code{TRUE}, keeps each quarter saved as a .fst file after the panel is built. If \code{FALSE} (default), the .fst files are deleted after use. Ignored when \code{panel = "none"} (quarters are always kept in that case). #' @param panel_format A \code{character} choosing the output format for panel files: ".csv" (default) or ".parquet". -###CHANGE #' #' @return A message indicating the successful save of panel files. #' diff --git a/vignettes/LOAD_PNADC.Rmd b/vignettes/LOAD_PNADC.Rmd index 8e1ed2b..4b80836 100644 --- a/vignettes/LOAD_PNADC.Rmd +++ b/vignettes/LOAD_PNADC.Rmd @@ -23,13 +23,17 @@ The `load_pnadc` function is a wrapper for [*`get_pnadc`*](https://www.rdocument Default ```{r eval=FALSE} + load_pnadc( save_to = getwd(), years, quarters = 1:4, panel = "advanced", - raw_data = FALSE + raw_data = FALSE, + save_trimestres = FALSE, + panel_format = ".csv" ) + ``` To download PNADC data for all quarters of 2022 and 2023, with advanced identification, simply run @@ -63,6 +67,28 @@ load_pnadc( ``` +To download PNADC data and keep the quarterly `.fst` files after the panel is built, run + +```{r eval=FALSE} +load_pnadc( + save_to = "Directory/You/Would/like/to/save/the/files", + years = 2022, + save_trimestres = TRUE +) +``` + +To download PNADC data and save the panel in Parquet format instead of CSV, run + +```{r eval=FALSE} +load_pnadc( + save_to = "Directory/You/Would/like/to/save/the/files", + years = 2022, + panel_format = ".parquet" +) +``` + + + *** **Options:** @@ -76,7 +102,7 @@ load_pnadc( 4. **panel**: Which panel algorithm to apply to this data. 
There are three options: - * `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. + * `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. Quarterly `.fst` files are always kept when `panel = "none"`. * `basic`: Performs basic identification steps for creating households and individual identifiers for panel construction * `advanced`: Performs advanced identification steps for creating households and individual identifiers for panel construction. @@ -84,21 +110,30 @@ load_pnadc( 5. **raw_data**: A command to define if the user would like to download the raw or treated data. There are two options: * `TRUE`: if you want the PNADC variables as they come. * `FALSE`: if you want the treated version of the PNADC variables. + + 6. **save_trimestres**: A command to define whether the quarterly `.fst` files should be kept after the panel is built. There are two options: + * `TRUE`: the `.fst` files for each quarter are kept in `save_to` after the panel is built. + * `FALSE` (default): the `.fst` files are deleted after the panel is built. Ignored when `panel = "none"` (files are always kept in that case). + + 7. **panel_format**: The file format for the output panel files. There are two options: + * `".csv"` (default): panel files are saved as `.csv`. + * `".parquet"`: panel files are saved as `.parquet`, using the `arrow` package. Parquet files are faster to read and more space-efficient than CSV. *** **Details:** The function performs the following steps: - 1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to download the data and save in the `save_to` directory, in files named `pnadc_year_quarter.rds`. If the `raw_data` option is `FALSE`, some PNADC variables are treated at this stage. + + 1. 
Loop over years and quarters using `PNADcIBGE::get_pnadc` to download the data and save in the `save_to` directory, in files named `pnadc_year_quarter.fst`. If the `raw_data` option is `FALSE`, some PNADC variables are treated at this stage. - 2. Split the data into panels, by reading each `.rds` file and filtering by the quarter variable `V1014`. Data from each panel `x` is saved to `pnad_panel_x.csv`. The use of `.csv` allows for data from each quarter to be appended on top of the previous ones, making the process faster. + 2. Split the data into panels, by reading each `.fst` file and filtering by the quarter variable `V1014`. Data from each panel `x` is saved to `pnadc_panel_x.csv` or `pnadc_panel_x.parquet`, depending on `panel_format`. The use of `.csv` allows for data from each quarter to be appended on top of the previous ones, making the process faster. 3. Read each panel file and apply the identification algorithms defined in the `build_pnadc_panel`. + 4. If `save_trimestres = FALSE` (default), the intermediate `.fst` quarter files are deleted after the panel is built. + + * The identification algorithms in `build_pnadc_panel` are drawn from Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): "Sobre o painel da Pesquisa Mensal de Emprego (PME) do IBGE". 
-*** - - - +*** \ No newline at end of file From 4914f4e7705bbff6be1f5ecfebc6886b22503fe3 Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Mon, 9 Mar 2026 00:48:41 -0300 Subject: [PATCH 05/11] doc build --- README.md | 82 ++++++++++++++++++++++++++++++++++------------- man/load_pnadc.Rd | 14 ++++++-- 2 files changed, 70 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 4ca0b98..19c77ba 100644 --- a/README.md +++ b/README.md @@ -45,18 +45,6 @@ install.packages("devtools") devtools::install_github("datazoompuc/datazoom.social") ``` - ## Warning: replacing previous import 'data.table::first' by 'dplyr::first' when - ## loading 'datazoom.social' - - ## Warning: replacing previous import 'data.table::last' by 'dplyr::last' when - ## loading 'datazoom.social' - - ## Warning: replacing previous import 'data.table::between' by 'dplyr::between' - ## when loading 'datazoom.social' - - ## Warning: replacing previous import 'data.table::transpose' by - ## 'purrr::transpose' when loading 'datazoom.social' - ## Data @@ -86,12 +74,15 @@ build a Panel. Default ``` r + load_pnadc( save_to = getwd(), years, quarters = 1:4, panel = "advanced", - raw_data = FALSE + raw_data = FALSE, + save_trimestres = FALSE, + panel_format = ".csv" ) ``` @@ -128,6 +119,28 @@ load_pnadc( ) ``` +To download PNADC data and keep the quarterly `.fst` files after the +panel is built, run + +``` r +load_pnadc( + save_to = "Directory/You/Would/like/to/save/the/files", + years = 2022, + save_trimestres = TRUE +) +``` + +To download PNADC data and save the panel in Parquet format instead of +CSV, run + +``` r +load_pnadc( + save_to = "Directory/You/Would/like/to/save/the/files", + years = 2022, + panel_format = ".parquet" +) +``` + ------------------------------------------------------------------------ **Options:** @@ -147,6 +160,7 @@ load_pnadc( - `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. 
+ Quarterly `.fst` files are always kept when `panel = "none"`. - `basic`: Performs basic identification steps for creating households and individual identifiers for panel construction - `advanced`: Performs advanced identification steps for creating @@ -158,6 +172,24 @@ load_pnadc( - `TRUE`: if you want the PNADC variables as they come. - `FALSE`: if you want the treated version of the PNADC variables. +6. **save_trimestres**: A command to define whether the quarterly + `.fst` files should be kept after the panel is built. There are two + options: + + - `TRUE`: the `.fst` files for each quarter are kept in `save_to` + after the panel is built. + - `FALSE` (default): the `.fst` files are deleted after the panel is + built. Ignored when `panel = "none"` (files are always kept in + that case). + +7. **panel_format**: The file format for the output panel files. There + are two options: + + - `".csv"` (default): panel files are saved as `.csv`. + - `".parquet"`: panel files are saved as `.parquet`, using the + `arrow` package. Parquet files are faster to read and more + space-efficient than CSV. + ------------------------------------------------------------------------ **Details:** @@ -166,18 +198,22 @@ The function performs the following steps: 1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to download the data and save in the `save_to` directory, in files - named `pnadc_year_quarter.rds`. If the `raw_data` option is `FALSE`, + named `pnadc_year_quarter.fst`. If the `raw_data` option is `FALSE`, some PNADC variables are treated at this stage. -2. Split the data into panels, by reading each `.rds` file and +2. Split the data into panels, by reading each `.fst` file and filtering by the quarter variable `V1014`. Data from each panel `x` - is saved to `pnad_panel_x.csv`. The use of `.csv` allows for data - from each quarter to be appended on top of the previous ones, making - the process faster. 
+ is saved to `pnadc_panel_x.csv` or `pnadc_panel_x.parquet`, + depending on `panel_format`. The use of `.csv` allows for data from + each quarter to be appended on top of the previous ones, making the + process faster. 3. Read each panel file and apply the identification algorithms defined in the `build_pnadc_panel`. +4. If `save_trimestres = FALSE` (default), the intermediate `.fst` + quarter files are deleted after the panel is built. + - The identification algorithms in `build_pnadc_panel` are drawn from Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): “Sobre o painel da Pesquisa Mensal de Emprego (PME) do IBGE”. @@ -270,18 +306,18 @@ the advanced algorithm in each interview. DataZoom is developed by a team at Pontifícia Universidade Católica do Rio de Janeiro (PUC-Rio), Department of Economics. Our official website -is at: . +is at: . To cite package `datazoom.social` in publications use: > Data Zoom (2023). Data Zoom: Simplifying Access To Brazilian > Microdata. -> +> A BibTeX entry for LaTeX users is: - @Unpublished{DataZoom2023, + @Unpublished{DataZoom2024, author = {Data Zoom}, title = {Data Zoom: Simplifying Access To Brazilian Microdata}, - url = {https://datazoom.com.br/en/}, - year = {2023}} + url = {https://www.econ.puc-rio.br/datazoom/english/index.html}, + year = {2024}} diff --git a/man/load_pnadc.Rd b/man/load_pnadc.Rd index 6493141..68929a9 100644 --- a/man/load_pnadc.Rd +++ b/man/load_pnadc.Rd @@ -9,7 +9,9 @@ load_pnadc( years, quarters = 1:4, panel = "advanced", - raw_data = FALSE + raw_data = FALSE, + save_quarters = FALSE, + panel_format = c(".csv", ".parquet") ) } \arguments{ @@ -22,6 +24,10 @@ load_pnadc( \item{panel}{A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")}} \item{raw_data}{A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables.} + +\item{save_quarters}{A \code{logical}. 
If \code{TRUE}, keeps each quarter saved as a .fst file after the panel is built. If \code{FALSE} (default), the .fst files are deleted after use. Ignored when \code{panel = "none"} (quarters are always kept in that case).} + +\item{panel_format}{A \code{character} choosing the output format for panel files: ".csv" (default) or ".parquet".} } \value{ A message indicating the successful save of panel files. @@ -35,8 +41,10 @@ load_pnadc( save_to = "Directory/You/Would/like/to/save/the/files", years = 2016, quarters = 1:4, - panel = "basic", - raw_data = FALSE + panel = "advanced", + raw_data = FALSE, + save_quarters = TRUE, + pane_format = ".parquet" ) } } From 6e0f7444f0054e0f8d273eab2ebe5c4880c14d4d Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Mon, 16 Mar 2026 01:14:23 -0300 Subject: [PATCH 06/11] using parquet into the process and changing the delivery option --- R/load_pnadc.R | 283 ++++++++++++++++++++++++------------------------- 1 file changed, 139 insertions(+), 144 deletions(-) diff --git a/R/load_pnadc.R b/R/load_pnadc.R index f421744..0b37753 100644 --- a/R/load_pnadc.R +++ b/R/load_pnadc.R @@ -7,8 +7,16 @@ #' @param quarters The quarters within those years to be downloaded. Can be a numeric vector or a list of vectors, for different quarters per year. #' @param panel A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")} #' @param raw_data A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables. -#' @param save_quarters A \code{logical}. If \code{TRUE}, keeps each quarter saved as a .fst file after the panel is built. If \code{FALSE} (default), the .fst files are deleted after use. Ignored when \code{panel = "none"} (quarters are always kept in that case). -#' @param panel_format A \code{character} choosing the output format for panel files: ".csv" (default) or ".parquet". 
+#' @param save_options A \code{logical} vector of length 2 controlling how files are saved. +#' The first element controls whether quarter files are kept after the panel is built; +#' the second controls whether output files are saved as CSV (\code{TRUE}) or Parquet (\code{FALSE}). +#' \itemize{ +#' \item \code{c(TRUE, TRUE)} (default): saves quarters parquet; saves panels as CSV. +#' \item \code{c(FALSE, TRUE)}: does not keep quarter parquet; saves panels as CSV. +#' \item \code{c(TRUE, FALSE)}: saves quarters parquet; saves panels as Parquet. +#' \item \code{c(FALSE, FALSE)}: does not keep quarter parquet; saves panels as Parquet. +#' } +#' When \code{panel = "none"}, the quarter parquet is always kept regardless of \code{save_options[1]}. #' #' @return A message indicating the successful save of panel files. #' @@ -24,20 +32,27 @@ #' quarters = 1:4, #' panel = "advanced", #' raw_data = FALSE, -#' save_quarters = TRUE, -#' pane_format = ".parquet" +#' save_options = c(TRUE, FALSE) #' ) #' } #' @export -load_pnadc <- function(save_to = getwd(), +load_pnadc <- function(save_to = getwd(), years, - quarters = 1:4, + quarters = 1:4, panel = "advanced", raw_data = FALSE, - save_quarters = FALSE, # Doesn't work if panel = "none" - panel_format = c(".csv", ".parquet")) { + save_options = c(TRUE, TRUE)) { + + # Validate save_options + if (!is.logical(save_options) || length(save_options) != 2) { + stop("`save_options` must be a logical vector of length 2, e.g. 
c(TRUE, TRUE).") + } + + save_quarters <- save_options[1] # keep quarter parquet on disk after panel is built + panels_as_csv <- save_options[2] # TRUE = panels saved as .csv, FALSE = panels saved as .parquet + # Check if PNADcIBGE namespace is already attached if (!"PNADcIBGE" %in% .packages()) { # If not attached, attach it @@ -71,7 +86,6 @@ load_pnadc <- function(save_to = getwd(), param$panel <- panel # which panel algorithm (none, basic or advanced) should be applied to this data, check our READ-ME for greater explanation param$raw_data <- raw_data # A command to define if the user would like to download the raw data from the IBGE website directly param$save_to <- save_to # the directory in which the user desires to save the files downloaded - panel_format <- match.arg(panel_format) # Check if quarter is a list; if not, wrap it in a list and repeat it for each year if (!is.list(quarters)) { @@ -103,21 +117,20 @@ load_pnadc <- function(save_to = getwd(), panel_list <- c() cnames <- NULL - # download to the saving directory + # Download all quarters, collecting each as a named data frame in a list + # The list will be saved as a single parquet file: pnadc_quarters.parquet + # Each element is named "year_quarter" for traceability - source_files <- purrr::map2( - param$years, param$quarters, # looping over the two parallel vector of years and quarters (this was previoulsy done in a "for" structure, but qwe optimized it) - + quarters_df_list <- list() + + purrr::map2( + param$years, param$quarters, function(year, quarter) { - base::message( - paste0("Downloading PNADC ", year, " Q", quarter, "\n") # just generating a message so the user knows which file is being downloaded now - ) + base::message(paste0("Downloading PNADC ", year, " Q", quarter, "\n")) - df <- get_pnadc( - year = year, quarter = quarter, labels = FALSE, design = FALSE) # downloading the file, design= FALSE returns to us just the dataframe with all variables in the PNADc) + df <- get_pnadc(year = 
year, quarter = quarter, labels = FALSE, design = FALSE) - # get_pnadc returns a message and the NULL object when download fails due to non-existing file if (is.null(df)) { return(NULL) @@ -126,12 +139,7 @@ load_pnadc <- function(save_to = getwd(), df <- df %>% dplyr::mutate(dplyr::across(dplyr::everything(), as.numeric)) - panel_list <<- c(panel_list, unique(df$V1014)) # registering, for every quarter, the panel's which the quarter's observations are included (every OBS is just included in one panel, but there should be OBS inserted in 2 to 3 panels for every quarter, check our READ-ME or the IBGE's website about the rotation scheme for PNADc surveys) - #<<- stabilishing a variable inside the function that continues to exist outside the function, it is not just local to the function's current context - - file_path <- file.path( - param$save_to, paste0("pnadc_", year, "_", quarter, ".fst") # defining the file's names to a certain format: year= 2022, quarter=3, file -> pnadc_2022_3.fst - ) + panel_list <<- c(panel_list, unique(df$V1014)) # runs data cleaning if desired if (!param$raw_data) { @@ -140,145 +148,132 @@ load_pnadc <- function(save_to = getwd(), cnames <<- names(df) - # download each quarter to a separate file - - base::message( - paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n") - ) + key <- paste0(year, "_", quarter) + quarters_df_list[[key]] <<- df - fst::write_fst(as.data.frame(df), file_path, compress = 50) # saving the file into the user's computer - - return(file_path) + base::message(paste0("Processed PNADC ", year, " Q", quarter, "\n")) } } ) - # erase NULL observations from source_files list - source_files <- purrr::compact(source_files) + # Remove NULL entries (failed downloads) + quarters_df_list <- purrr::compact(quarters_df_list) + + # Save all quarters to a single parquet file (list of data frames as separate row groups / named list) + quarters_parquet_path <- file.path(param$save_to, "pnadc_quarters.parquet") + + 
base::message(paste0("Saving all quarters to ", quarters_parquet_path, "\n")) + + # Write as a single parquet by binding all quarters and adding a key column + quarters_combined <- dplyr::bind_rows( + purrr::imap(quarters_df_list, function(df, key) { + dplyr::mutate(df, .quarter_key = key) + }) + ) + arrow::write_parquet(quarters_combined, sink = quarters_parquet_path) - ## Return Raw Data + ## Return Raw Data (panel = "none") if (param$panel == "none") { - - return(paste("Quarters saved to", param$save_to)) + return(paste("Quarters saved to", quarters_parquet_path)) } ################# ## Panel Files ## ################# - if (param$panel != "none") { - ## Split data into panels - - panel_list <- unique(panel_list) # listing all the panels included in the quarters downloaded - - # set up file paths for each panel according to panel_format - - panel_files <- purrr::map( - panel_list, - function(panel) { - ext <- if (panel_format == ".parquet") ".parquet" else ".csv" - file_path <- file.path( - param$save_to, paste0("pnadc", "_panel_", panel, ext) - ) - file_path - } - ) - - # write an empty dataframe into each - - if (panel_format == ".csv") { - purrr::map( - panel_files, - function(path) { - readr::write_csv(data.frame(), path, col_names = cnames) - } - ) + panel_list <- unique(panel_list) + + # Determine panel file extension + panel_ext <- if (panels_as_csv) ".csv" else ".parquet" + + panel_files <- purrr::map( + panel_list, + function(p) { + file.path(param$save_to, paste0("pnadc_panel_", p, panel_ext)) } + ) + + # Initialize empty CSV panel files if needed (for appending) + if (panels_as_csv) { + purrr::map(panel_files, function(path) { + readr::write_csv(data.frame(), path, col_names = TRUE) + }) + } + + # Lazy-load the quarters parquet and split into panels + # We use arrow::open_dataset for lazy/columnar reading + base::message("Splitting quarters into panels...\n") + + quarters_dataset <- arrow::open_dataset(quarters_parquet_path) + + # Only load the 
columns needed: V1014 (panel assignment) + all cnames + # (lazy scan: only materialise when we filter by panel) + + purrr::walk(panel_list, function(p) { + base::message(paste("Compiling panel", p, "\n")) - # read each of the source .fst files, split into panels, and append - # to their corresponding panel files - - # we use the .csv files because they have a appending propriety, meaning that they can receive new information without having the older one deleted - # for the R users, you can simply think as literally doing a rbind() into those files, but in a much more efficient way - - purrr::map(source_files, # source_files= the .fst files with the data that were downloaded way before in this function before - function(file) { - dat <- fst::read_fst(file) %>% - split(.$V1014) - - dat %>% - purrr::imap(function(df, panel) { - ext <- if (panel_format == ".parquet") - ".parquet" - else - ".csv" - file_path <- file.path(param$save_to, paste0("pnadc", "_panel_", panel, ext)) - - message(paste("Compiling panel", panel, "to", file_path, "\n")) - - if (panel_format == ".csv") { - readr::write_csv(df, file_path, append = TRUE) # append=TRUE allows us to add new info without deleting the older one, as comented above - } else { - arrow::write_parquet(df, sink = file_path) - } - }) - }) - - ########################## - ## Panel Identification ## - ########################## - - # defining column types + # Lazy filter — only reads rows for this panel from the parquet + panel_df <- quarters_dataset %>% + dplyr::filter(V1014 == p) %>% + dplyr::select(dplyr::all_of(cnames)) %>% # drop the .quarter_key helper column + dplyr::collect() - if (param$raw_data) { - ctypes <- readr::cols(.default = readr::col_number()) - } + ext <- if (panels_as_csv) ".csv" else ".parquet" + file_path <- file.path(param$save_to, paste0("pnadc_panel_", p, ext)) - else { - ctypes <- readr::cols( - .default = readr::col_number(), - regiao = readr::col_character(), - sigla_uf = readr::col_character(), - sexo 
= readr::col_character(), - faixa_idade = readr::col_character(), - faixa_educ = readr::col_character(), - cnae_2dig = readr::col_character(), - cod_2dig = readr::col_character() - ) + if (panels_as_csv) { + readr::write_csv(panel_df, file_path, append = TRUE) + } else { + arrow::write_parquet(panel_df, sink = file_path) } - - # read each file in panel_files and apply the identification algorithms defined in the build_pnadc_panel.R - - purrr::map( - panel_files, - function(path) { - message(paste("Running", param$panel, "identification on", path, "\n")) - - if (panel_format == ".csv") { - df <- data.table::fread( - path, - col.names = cnames, - colClasses = ctypes - ) %>% - build_pnadc_panel(panel = param$panel) - readr::write_csv(df, path) - } else { - df <- arrow::read_parquet(path) %>% - build_pnadc_panel(panel = param$panel) - arrow::write_parquet(df, sink = path) - } - } + }) + + ########################## + ## Panel Identification ## + ########################## + + # defining column types (only needed for CSV reading) + if (param$raw_data) { + ctypes <- readr::cols(.default = readr::col_number()) + } else { + ctypes <- readr::cols( + .default = readr::col_number(), + regiao = readr::col_character(), + sigla_uf = readr::col_character(), + sexo = readr::col_character(), + faixa_idade = readr::col_character(), + faixa_educ = readr::col_character(), + cnae_2dig = readr::col_character(), + cod_2dig = readr::col_character() ) - - # delete .fst quarter files if user did not ask to keep them - # (only applies when panel != "none"; when panel = "none" quarters are always kept) - if (!save_quarters) { - purrr::walk(source_files, function(f) { - if (file.exists(f)) file.remove(f) - }) + } + + purrr::map( + panel_files, + function(path) { + message(paste("Running", param$panel, "identification on", path, "\n")) + + if (panels_as_csv) { + df <- data.table::fread( + path, + col.names = cnames, + colClasses = ctypes + ) %>% + build_pnadc_panel(panel = param$panel) + 
readr::write_csv(df, path) + } else { + df <- arrow::read_parquet(path) %>% + build_pnadc_panel(panel = param$panel) + arrow::write_parquet(df, sink = path) + } } - + ) + + # Delete the quarter parquet if the user did not ask to keep it + # (ignored when panel = "none" — quarters always kept in that case) + if (!save_quarters) { + if (file.exists(quarters_parquet_path)) file.remove(quarters_parquet_path) } #################### @@ -287,7 +282,7 @@ load_pnadc <- function(save_to = getwd(), return(paste("Panel files saved to", param$save_to)) } - + ###################### ## Data Engineering ## ###################### From f67c1fcf62e773a4f51fe34862c4682ac37f183a Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Mon, 16 Mar 2026 01:33:36 -0300 Subject: [PATCH 07/11] Doc Adding the table showing the beggining and the end of each panel and updating the doc --- DESCRIPTION | 1 - README.html | 782 +++++++++++++++++++++++++++++++++++++++ README.md | 91 +++-- man/load_pnadc.Rd | 19 +- vignettes/LOAD_PNADC.Rmd | 57 ++- 5 files changed, 886 insertions(+), 64 deletions(-) create mode 100644 README.html diff --git a/DESCRIPTION b/DESCRIPTION index 09a6572..534157d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,7 +20,6 @@ Imports: arrow, data.table, dplyr, - fst, PNADcIBGE, purrr, readr, diff --git a/README.html b/README.html new file mode 100644 index 0000000..f3b25c7 --- /dev/null +++ b/README.html @@ -0,0 +1,782 @@ + + + + + + + + + + + + + +README + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

+ +
+

datazoom.social

+ +

Languages Commits Open Issues Closed Issues Files Followers

+

The datazoom.social package facilitates access to official Brazilian +social data.

+

This package is in the development stage - more datasets will be released +soon.

+

In this first version of the package, the focus is only on the Continuous +PNAD. We allow for many quarters to be easily downloaded and read, +as well as identifying individuals across time, forming a panel.

+
+
+

Installation

+ + + + + +

You can install the development version of +datazoom.social from GitHub with:

+
install.packages("devtools")
+devtools::install_github("datazoompuc/datazoom.social")
+
+

Data

+
+ + + +
+ ++++ + + + + + + + + + + +
Continuous PNADDownload PNADC of a range of quarters
Panel +IdentificationBuild a Panel of PNADC individuals
+
+ +
+

Continuous PNAD

+

The load_pnadc function is a wrapper for get_pnadc +from the package PNADcIBGE, with added identification +algorithms to build a Panel.

+
+

Panel Structure:

+

The table below shows the first and last quarter +(ANOtrimestre, e.g. 20121 = 2012 Q1) covered +by each PNADC rotating panel:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PanelStartEnd
12012120124
22012120141
32013220152
42014320163
52015420174
62017120191
72018220202
82019320213
92020420224
102022120241
112023220252
122024320263
132025420274
142027120291
+
+

Usage:

+

Default

+

+load_pnadc(
+  save_to = getwd(),
+  years,
+  quarters = 1:4,
+  panel = "advanced",
+  raw_data = FALSE,
+  save_options = c(TRUE, TRUE)
+)
+

To download PNADC data for all quarters of 2022 and 2023, with +advanced identification, simply run

+
load_pnadc(
+  save_to = "Directory/You/Would/like/to/save/the/files",
+  years = 2022:2023
+)
+

To download PNADC data for all of 2022, but only the first quarter of +2023, run

+
load_pnadc(
+  save_to = "Directory/You/Would/like/to/save/the/files",
+  years = 2022:2023,
+  quarters = list(1:4, 1)
+)
+

To download PNADC data without any variable treatment or +identification (e.g., for all quarters of 2021), run

+
load_pnadc(
+  save_to = "Directory/You/Would/like/to/save/the/files",
+  years = 2021,
+  panel = "none",
+  raw_data = TRUE
+)
+

To download PNADC data, keep the quarters parquet on disk, and save +panels as Parquet, run

+
load_pnadc(
+  save_to = "Directory/You/Would/like/to/save/the/files",
+  years = 2022,
+  save_options = c(TRUE, FALSE)
+)
+

To download PNADC data and save panels as CSV but discard the +intermediate quarters parquet, run

+
load_pnadc(
+  save_to = "Directory/You/Would/like/to/save/the/files",
+  years = 2022,
+  save_options = c(FALSE, TRUE)
+)
+
+

Options:

+
    +
  1. save_to: The directory in which the user desires +to save the downloaded files.

  2. +
  3. years: picks the years for which the data will +be downloaded

  4. +
  5. quarters: The quarters within those years to be +downloaded. Can be either a vector such as 1:4 for +consistent quarters across years, or a list of vectors, if quarters are +different for each year.

  6. +
  7. panel: Which panel algorithm to apply to this +data. There are three options:

    +
      +
    • none: No panel is built. If +raw_data = TRUE, returns the original data. Otherwise, +creates some extra treated variables. The intermediate quarters parquet +is always kept when panel = "none".
    • +
    • basic: Performs basic identification steps for creating +households and individual identifiers for panel construction
    • +
    • advanced: Performs advanced identification steps for +creating households and individual identifiers for panel +construction.
    • +
  8. +
  9. raw_data: A command to define if the user would +like to download the raw or treated data. There are two options:

    +
      +
    • TRUE: if you want the PNADC variables as they +come.
    • +
    • FALSE: if you want the treated version of the PNADC +variables.
    • +
  10. +
  11. save_options: A logical vector of length 2 +controlling file saving behaviour:

    +
      +
    • c(TRUE, TRUE) (default): keeps the intermediate +quarters parquet after panel is built; saves panel files as +.csv.
    • +
    • c(FALSE, TRUE): deletes the quarters parquet after use; +saves panel files as .csv.
    • +
    • c(TRUE, FALSE): keeps the quarters parquet; saves panel +files as .parquet (a list of panel data frames).
    • +
    • c(FALSE, FALSE): deletes the quarters parquet after +use; saves panel files as .parquet.
    • +
  12. +
+
+

Details:

+

The function performs the following steps:

+
    +
  1. Loop over years and quarters using +PNADcIBGE::get_pnadc to download the data. All quarters are +collected in memory and saved together into a single +pnadc_quarters.parquet file in save_to. If the +raw_data option is FALSE, some PNADC variables +are treated at this stage.

  2. +
  3. Split the data into panels by lazy-loading the parquet and +filtering by the panel variable V1014. Data from each panel +x is saved to pnadc_panel_x.csv or +pnadc_panel_x.parquet, depending on +save_options[2].

  4. +
  5. Read each panel file and apply the identification algorithms +defined in build_pnadc_panel.

  6. +
  7. If save_options[1] = FALSE, the intermediate +quarters parquet is deleted after the panels are built.

  8. +
+
    +
  • The identification algorithms in build_pnadc_panel are +drawn from Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): +“Sobre o painel da Pesquisa Mensal de Emprego (PME) do IBGE”.
  • +
+
+
+
+
+

PNAD Panel Identification

+

Description

+

Our load_pnadc function uses the internal function +build_pnadc_panel to identify households and individuals +across quarters. The method used for the identification is based on the +paper of Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): +“Sobre o painel da Pesquisa Mensal de Emprego (PME) do IBGE”.

+
+
+
+

Basic Identification

+

The household identifier – stored as id_dom – combines +the variables:

+
    +
  • UF – State;

  • +
  • UPA – Primary Sampling Unit - PSU;

  • +
  • V1008 – Household;

  • +
  • V1014 – Panel Number;

  • +
+

In order to create a unique number for every combination of those +variables.

+
+

The basic individual identifier – stored as id_ind – +combines the household id with:

+
    +
  • V2003 – Order number: individual’s unique number +within their household;

  • +
  • V2007 – Sex;

  • +
  • Date of Birth – [V20082 (year), +V20081 (month), V2008 (day)];

  • +
+

In order to create a unique number for every combination of those +variables.

+
+
+
+

Advanced Identification

+

The advanced identifier is saved as id_rs. For +individuals who were not matched in all interviews, we relax some +assumptions to increase matching power. Under the assumption that the +date of birth is often misreported, we take individuals who are +either:

+
    +
  1. Head of the household or their partner

  2. +
  3. Child of the head of the household, 25 or older

  4. +
+

For these observations, we run the basic identification again, but +allowing the year of birth to be wrong. We also include the order +number.

+
+
+

Attrition

+

The tables below show the levels of attrition obtained using the +basic and advanced identification algorithms, and compare them to the +attrition levels obtained in the Stata datazoom_social +package.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
InterviewPercentage found (R)Percentage found (Stata)
1100.0100.0
286.285.7
378.577.5
473.271.6
569.166.8
+

Attrition for Panel 2

+

Each cell is the percentage of PNADC observations that are identified +by the advanced algorithm in each interview.

+
+
+ +
+

Credits

+

DataZoom is developed by a team at Pontifícia Universidade Católica +do Rio de Janeiro (PUC-Rio), Department of Economics. Our official +website is at: https://www.econ.puc-rio.br/datazoom/.

+

To cite package datazoom.social in publications use:

+
+

Data Zoom (2023). Data Zoom: Simplifying Access To Brazilian +Microdata.
+https://www.econ.puc-rio.br/datazoom/english/index.html

+
+

A BibTeX entry for LaTeX users is:

+
@Unpublished{DataZoom2024,
+    author = {Data Zoom},
+    title = {Data Zoom: Simplifying Access To Brazilian Microdata},
+    url = {https://www.econ.puc-rio.br/datazoom/english/index.html},
+    year = {2024}}
+
+ + + + + + + + + + + + + + + + + + + + diff --git a/README.md b/README.md index 19c77ba..a8a2ea3 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,30 @@ build a Panel. ------------------------------------------------------------------------ +**Panel Structure:** + +The table below shows the first and last quarter (`ANOtrimestre`, e.g. +`20121` = 2012 Q1) covered by each PNADC rotating panel: + +| Panel | Start | End | +|------:|------:|------:| +| 1 | 20121 | 20124 | +| 2 | 20121 | 20141 | +| 3 | 20132 | 20152 | +| 4 | 20143 | 20163 | +| 5 | 20154 | 20174 | +| 6 | 20171 | 20191 | +| 7 | 20182 | 20202 | +| 8 | 20193 | 20213 | +| 9 | 20204 | 20224 | +| 10 | 20221 | 20241 | +| 11 | 20232 | 20252 | +| 12 | 20243 | 20263 | +| 13 | 20254 | 20274 | +| 14 | 20271 | 20291 | + +------------------------------------------------------------------------ + **Usage:** Default @@ -81,8 +105,7 @@ load_pnadc( quarters = 1:4, panel = "advanced", raw_data = FALSE, - save_trimestres = FALSE, - panel_format = ".csv" + save_options = c(TRUE, TRUE) ) ``` @@ -119,25 +142,25 @@ load_pnadc( ) ``` -To download PNADC data and keep the quarterly `.fst` files after the -panel is built, run +To download PNADC data, keep the quarters parquet on disk, and save +panels as Parquet, run ``` r load_pnadc( save_to = "Directory/You/Would/like/to/save/the/files", years = 2022, - save_trimestres = TRUE + save_options = c(TRUE, FALSE) ) ``` -To download PNADC data and save the panel in Parquet format instead of -CSV, run +To download PNADC data and save panels as CSV but discard the +intermediate quarters parquet, run ``` r load_pnadc( save_to = "Directory/You/Would/like/to/save/the/files", years = 2022, - panel_format = ".parquet" + save_options = c(FALSE, TRUE) ) ``` @@ -160,7 +183,8 @@ load_pnadc( - `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. - Quarterly `.fst` files are always kept when `panel = "none"`. 
+ The intermediate quarters parquet is always kept when + `panel = "none"`. - `basic`: Performs basic identification steps for creating households and individual identifiers for panel construction - `advanced`: Performs advanced identification steps for creating @@ -172,23 +196,17 @@ load_pnadc( - `TRUE`: if you want the PNADC variables as they come. - `FALSE`: if you want the treated version of the PNADC variables. -6. **save_trimestres**: A command to define whether the quarterly - `.fst` files should be kept after the panel is built. There are two - options: - - - `TRUE`: the `.fst` files for each quarter are kept in `save_to` - after the panel is built. - - `FALSE` (default): the `.fst` files are deleted after the panel is - built. Ignored when `panel = "none"` (files are always kept in - that case). - -7. **panel_format**: The file format for the output panel files. There - are two options: +6. **save_options**: A logical vector of length 2 controlling file + saving behaviour: - - `".csv"` (default): panel files are saved as `.csv`. - - `".parquet"`: panel files are saved as `.parquet`, using the - `arrow` package. Parquet files are faster to read and more - space-efficient than CSV. + - `c(TRUE, TRUE)` (default): keeps the intermediate quarters parquet + after panel is built; saves panel files as `.csv`. + - `c(FALSE, TRUE)`: deletes the quarters parquet after use; saves + panel files as `.csv`. + - `c(TRUE, FALSE)`: keeps the quarters parquet; saves panel files as + `.parquet` (a list of panel data frames). + - `c(FALSE, FALSE)`: deletes the quarters parquet after use; saves + panel files as `.parquet`. ------------------------------------------------------------------------ @@ -197,22 +215,21 @@ load_pnadc( The function performs the following steps: 1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to - download the data and save in the `save_to` directory, in files - named `pnadc_year_quarter.fst`. 
If the `raw_data` option is `FALSE`, - some PNADC variables are treated at this stage. + download the data. All quarters are collected in memory and saved + together into a single `pnadc_quarters.parquet` file in `save_to`. + If the `raw_data` option is `FALSE`, some PNADC variables are + treated at this stage. -2. Split the data into panels, by reading each `.fst` file and - filtering by the quarter variable `V1014`. Data from each panel `x` - is saved to `pnadc_panel_x.csv` or `pnadc_panel_x.parquet`, - depending on `panel_format`. The use of `.csv` allows for data from - each quarter to be appended on top of the previous ones, making the - process faster. +2. Split the data into panels by lazy-loading the parquet and filtering + by the panel variable `V1014`. Data from each panel `x` is saved to + `pnadc_panel_x.csv` or `pnadc_panel_x.parquet`, depending on + `save_options[2]`. 3. Read each panel file and apply the identification algorithms defined - in the `build_pnadc_panel`. + in `build_pnadc_panel`. -4. If `save_trimestres = FALSE` (default), the intermediate `.fst` - quarter files are deleted after the panel is built. +4. If `save_options[1] = FALSE`, the intermediate quarters parquet is + deleted after the panels are built. - The identification algorithms in `build_pnadc_panel` are drawn from Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): “Sobre o diff --git a/man/load_pnadc.Rd b/man/load_pnadc.Rd index 68929a9..c06927f 100644 --- a/man/load_pnadc.Rd +++ b/man/load_pnadc.Rd @@ -10,8 +10,7 @@ load_pnadc( quarters = 1:4, panel = "advanced", raw_data = FALSE, - save_quarters = FALSE, - panel_format = c(".csv", ".parquet") + save_options = c(TRUE, TRUE) ) } \arguments{ @@ -25,9 +24,16 @@ load_pnadc( \item{raw_data}{A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables.} -\item{save_quarters}{A \code{logical}. If \code{TRUE}, keeps each quarter saved as a .fst file after the panel is built. 
If \code{FALSE} (default), the .fst files are deleted after use. Ignored when \code{panel = "none"} (quarters are always kept in that case).} - -\item{panel_format}{A \code{character} choosing the output format for panel files: ".csv" (default) or ".parquet".} +\item{save_options}{A \code{logical} vector of length 2 controlling how files are saved. +The first element controls whether quarter files are kept after the panel is built; +the second controls whether output files are saved as CSV (\code{TRUE}) or Parquet (\code{FALSE}). +\itemize{ +\item \code{c(TRUE, TRUE)} (default): saves quarters parquet; saves panels as CSV. +\item \code{c(FALSE, TRUE)}: does not keep quarter parquet; saves panels as CSV. +\item \code{c(TRUE, FALSE)}: saves quarters parquet; saves panels as Parquet. +\item \code{c(FALSE, FALSE)}: does not keep quarter parquet; saves panels as Parquet. +} +When \code{panel = "none"}, the quarter parquet is always kept regardless of \code{save_options[1]}.} } \value{ A message indicating the successful save of panel files. @@ -43,8 +49,7 @@ load_pnadc( quarters = 1:4, panel = "advanced", raw_data = FALSE, - save_quarters = TRUE, - pane_format = ".parquet" + save_options = c(TRUE, FALSE) ) } } diff --git a/vignettes/LOAD_PNADC.Rmd b/vignettes/LOAD_PNADC.Rmd index 4b80836..88d1cbc 100644 --- a/vignettes/LOAD_PNADC.Rmd +++ b/vignettes/LOAD_PNADC.Rmd @@ -17,6 +17,29 @@ knitr::opts_chunk$set( The `load_pnadc` function is a wrapper for [*`get_pnadc`*](https://www.rdocumentation.org/packages/PNADcIBGE/versions/0.7.0/topics/get_pnadc) from the package `PNADcIBGE`, with added identification algorithms to build a Panel. +*** +**Panel Structure:** + +The table below shows the first and last quarter (`ANOtrimestre`, e.g. 
+`20121` = 2012 Q1) covered by each PNADC rotating panel: + +| Panel | Start | End | +|------:|------:|------:| +| 1 | 20121 | 20124 | +| 2 | 20121 | 20141 | +| 3 | 20132 | 20152 | +| 4 | 20143 | 20163 | +| 5 | 20154 | 20174 | +| 6 | 20171 | 20191 | +| 7 | 20182 | 20202 | +| 8 | 20193 | 20213 | +| 9 | 20204 | 20224 | +| 10 | 20221 | 20241 | +| 11 | 20232 | 20252 | +| 12 | 20243 | 20263 | +| 13 | 20254 | 20274 | +| 14 | 20271 | 20291 | + *** **Usage:** @@ -30,8 +53,7 @@ load_pnadc( quarters = 1:4, panel = "advanced", raw_data = FALSE, - save_trimestres = FALSE, - panel_format = ".csv" + save_options = c(TRUE, TRUE) ) ``` @@ -66,24 +88,23 @@ load_pnadc( ) ``` - -To download PNADC data and keep the quarterly `.fst` files after the panel is built, run +To download PNADC data, keep the quarters parquet on disk, and save panels as Parquet, run ```{r eval=FALSE} load_pnadc( save_to = "Directory/You/Would/like/to/save/the/files", years = 2022, - save_trimestres = TRUE + save_options = c(TRUE, FALSE) ) ``` -To download PNADC data and save the panel in Parquet format instead of CSV, run +To download PNADC data and save panels as CSV but discard the intermediate quarters parquet, run ```{r eval=FALSE} load_pnadc( save_to = "Directory/You/Would/like/to/save/the/files", years = 2022, - panel_format = ".parquet" + save_options = c(FALSE, TRUE) ) ``` @@ -102,7 +123,7 @@ load_pnadc( 4. **panel**: Which panel algorithm to apply to this data. There are three options: - * `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. Quarterly `.fst` files are always kept when `panel = "none"`. + * `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. The intermediate quarters parquet is always kept when `panel = "none"`. 
* `basic`: Performs basic identification steps for creating households and individual identifiers for panel construction * `advanced`: Performs advanced identification steps for creating households and individual identifiers for panel construction. @@ -111,13 +132,11 @@ load_pnadc( * `TRUE`: if you want the PNADC variables as they come. * `FALSE`: if you want the treated version of the PNADC variables. - 6. **save_trimestres**: A command to define whether the quarterly `.fst` files should be kept after the panel is built. There are two options: - * `TRUE`: the `.fst` files for each quarter are kept in `save_to` after the panel is built. - * `FALSE` (default): the `.fst` files are deleted after the panel is built. Ignored when `panel = "none"` (files are always kept in that case). - - 7. **panel_format**: The file format for the output panel files. There are two options: - * `".csv"` (default): panel files are saved as `.csv`. - * `".parquet"`: panel files are saved as `.parquet`, using the `arrow` package. Parquet files are faster to read and more space-efficient than CSV. + 6. **save_options**: A logical vector of length 2 controlling file saving behaviour: + * `c(TRUE, TRUE)` (default): keeps the intermediate quarters parquet after panel is built; saves panel files as `.csv`. + * `c(FALSE, TRUE)`: deletes the quarters parquet after use; saves panel files as `.csv`. + * `c(TRUE, FALSE)`: keeps the quarters parquet; saves panel files as `.parquet` (a list of panel data frames). + * `c(FALSE, FALSE)`: deletes the quarters parquet after use; saves panel files as `.parquet`. *** **Details:** @@ -125,13 +144,13 @@ load_pnadc( The function performs the following steps: - 1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to download the data and save in the `save_to` directory, in files named `pnadc_year_quarter.fst`. If the `raw_data` option is `FALSE`, some PNADC variables are treated at this stage. + 1. 
Loop over years and quarters using `PNADcIBGE::get_pnadc` to download the data. All quarters are collected in memory and saved together into a single `pnadc_quarters.parquet` file in `save_to`. If the `raw_data` option is `FALSE`, some PNADC variables are treated at this stage. - 2. Split the data into panels, by reading each `.fst` file and filtering by the quarter variable `V1014`. Data from each panel `x` is saved to `pnadc_panel_x.csv` or `pnadc_panel_x.parquet`, depending on `panel_format`. The use of `.csv` allows for data from each quarter to be appended on top of the previous ones, making the process faster. + 2. Split the data into panels by lazy-loading the parquet and filtering by the panel variable `V1014`. Data from each panel `x` is saved to `pnadc_panel_x.csv` or `pnadc_panel_x.parquet`, depending on `save_options[2]`. - 3. Read each panel file and apply the identification algorithms defined in the `build_pnadc_panel`. + 3. Read each panel file and apply the identification algorithms defined in `build_pnadc_panel`. - 4. If `save_trimestres = FALSE` (default), the intermediate `.fst` quarter files are deleted after the panel is built. + 4. If `save_options[1] = FALSE`, the intermediate quarters parquet is deleted after the panels are built. * The identification algorithms in `build_pnadc_panel` are drawn from Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): "Sobre o painel da Pesquisa Mensal de Emprego (PME) do IBGE". From 361dbb62a36e66d3914208f2f842f045d61b19ef Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Sun, 22 Mar 2026 19:12:37 -0300 Subject: [PATCH 08/11] correcting mistake --- R/load_pnadc.R | 300 ++++++++++++++++++++++++------------------------- 1 file changed, 146 insertions(+), 154 deletions(-) diff --git a/R/load_pnadc.R b/R/load_pnadc.R index 0b37753..1ddd1de 100644 --- a/R/load_pnadc.R +++ b/R/load_pnadc.R @@ -7,20 +7,23 @@ #' @param quarters The quarters within those years to be downloaded. 
Can be a numeric vector or a list of vectors, for different quarters per year. #' @param panel A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")} #' @param raw_data A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables. -#' @param save_options A \code{logical} vector of length 2 controlling how files are saved. -#' The first element controls whether quarter files are kept after the panel is built; -#' the second controls whether output files are saved as CSV (\code{TRUE}) or Parquet (\code{FALSE}). +#' @param save_files A \code{logical} vector of length 2. Controls whether quarterly +#' files are saved and in which format all files are saved. Panel files are +#' always saved. There are four possible combinations: #' \itemize{ -#' \item \code{c(TRUE, TRUE)} (default): saves quarters parquet; saves panels as CSV. -#' \item \code{c(FALSE, TRUE)}: does not keep quarter parquet; saves panels as CSV. -#' \item \code{c(TRUE, FALSE)}: saves quarters parquet; saves panels as Parquet. -#' \item \code{c(FALSE, FALSE)}: does not keep quarter parquet; saves panels as Parquet. +#' \item \code{c(TRUE, TRUE)}: saves quarterly and panel files in +#' \code{.csv} format. This is the default. +#' \item \code{c(TRUE, FALSE)}: saves quarterly and panel files in +#' \code{.parquet} format. +#' \item \code{c(FALSE, TRUE)}: does not save quarterly files; panel files +#' are saved in \code{.csv} format. +#' \item \code{c(FALSE, FALSE)}: does not save quarterly files; panel files +#' are saved in \code{.parquet} format. #' } -#' When \code{panel = "none"}, the quarter parquet is always kept regardless of \code{save_options[1]}. #' #' @return A message indicating the successful save of panel files. 
#' -#' @import data.table +#' @importFrom data.table fread #' @import PNADcIBGE #' @importFrom magrittr `%>%` #' @@ -30,29 +33,15 @@ #' save_to = "Directory/You/Would/like/to/save/the/files", #' years = 2016, #' quarters = 1:4, -#' panel = "advanced", -#' raw_data = FALSE, -#' save_options = c(TRUE, FALSE) +#' panel = "basic", +#' raw_data = FALSE #' ) #' } #' @export - -load_pnadc <- function(save_to = getwd(), - years, - quarters = 1:4, - panel = "advanced", - raw_data = FALSE, - save_options = c(TRUE, TRUE)) { - - # Validate save_options - if (!is.logical(save_options) || length(save_options) != 2) { - stop("`save_options` must be a logical vector of length 2, e.g. c(TRUE, TRUE).") - } - - save_quarters <- save_options[1] # keep quarter parquet on disk after panel is built - panels_as_csv <- save_options[2] # TRUE = panels saved as .csv, FALSE = panels saved as .parquet - +load_pnadc <- function(save_to = getwd(), years, + quarters = 1:4, panel = "advanced", + raw_data = FALSE, save_files = c(TRUE, TRUE)) { # Check if PNADcIBGE namespace is already attached if (!"PNADcIBGE" %in% .packages()) { # If not attached, attach it @@ -81,12 +70,14 @@ load_pnadc <- function(save_to = getwd(), # The param list contains the various objects that will be used as parameters for this function param <- list() - param$years <- years # the years the user would like to download - param$quarters <- quarters # the quarters within those years to be downloaded - param$panel <- panel # which panel algorithm (none, basic or advanced) should be applied to this data, check our READ-ME for greater explanation - param$raw_data <- raw_data # A command to define if the user would like to download the raw data from the IBGE website directly - param$save_to <- save_to # the directory in which the user desires to save the files downloaded - + param$years <- years # the years the user would like to download + param$quarters <- quarters # the quarters within those years to be downloaded + 
param$panel <- panel # which panel algorithm (none, basic or advanced) should be applied to this data, check our READ-ME for greater explanation + param$raw_data <- raw_data # A command to define if the user would like to download the raw data from the IBGE website directly + param$save_to <- save_to # the directory in which the user desires to save the files downloaded + param$save_quarters <- save_files[1] # whether to save quarterly files to disk + param$csv <- save_files[2] # if TRUE, saves as .csv; if FALSE, saves as .parquet + # Check if quarter is a list; if not, wrap it in a list and repeat it for each year if (!is.list(quarters)) { param$quarters <- rep(list(quarters), length(years)) @@ -105,7 +96,7 @@ load_pnadc <- function(save_to = getwd(), # generaring these two paralell vectors of years and quarter to loop over - param$years <- unlist(param$years) + param$years <- unlist(param$years) param$quarters <- unlist(param$quarters) ################## @@ -115,22 +106,22 @@ load_pnadc <- function(save_to = getwd(), # store info on all panels and column names panel_list <- c() - cnames <- NULL - - # Download all quarters, collecting each as a named data frame in a list - # The list will be saved as a single parquet file: pnadc_quarters.parquet - # Each element is named "year_quarter" for traceability + cnames <- NULL - quarters_df_list <- list() + # download to the saving directory - purrr::map2( - param$years, param$quarters, + source_files <- purrr::map2( + param$years, param$quarters, # looping over the two parallel vector of years and quarters (this was previoulsy done in a "for" structure, but qwe optimized it) function(year, quarter) { - base::message(paste0("Downloading PNADC ", year, " Q", quarter, "\n")) + base::message( + paste0("Downloading PNADC ", year, " Q", quarter, "\n") # just generating a message so the user knows which file is being downloaded now + ) - df <- get_pnadc(year = year, quarter = quarter, labels = FALSE, design = FALSE) + df <- 
get_pnadc( + year = year, quarter = quarter, labels = FALSE, design = FALSE) # downloading the file, design= FALSE returns to us just the dataframe with all variables in the PNADc) + # get_pnadc returns a message and the NULL object when download fails due to non-existing file if (is.null(df)) { return(NULL) @@ -139,7 +130,8 @@ load_pnadc <- function(save_to = getwd(), df <- df %>% dplyr::mutate(dplyr::across(dplyr::everything(), as.numeric)) - panel_list <<- c(panel_list, unique(df$V1014)) + panel_list <<- c(panel_list, unique(df$V1014)) # registering, for every quarter, the panel's which the quarter's observations are included (every OBS is just included in one panel, but there should be OBS inserted in 2 to 3 panels for every quarter, check our READ-ME or the IBGE's website about the rotation scheme for PNADc surveys) + #<<- stabilishing a variable inside the function that continues to exist outside the function, it is not just local to the function's current context # runs data cleaning if desired if (!param$raw_data) { @@ -148,132 +140,132 @@ load_pnadc <- function(save_to = getwd(), cnames <<- names(df) - key <- paste0(year, "_", quarter) - quarters_df_list[[key]] <<- df - - base::message(paste0("Processed PNADC ", year, " Q", quarter, "\n")) + if (param$save_quarters) { + if (param$csv) { + file_path <- file.path( + param$save_to, paste0("pnadc_", year, "_", quarter, ".csv") + ) + base::message( + paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n") + ) + readr::write_csv(df, file_path) + } else { + file_path <- file.path( + param$save_to, paste0("pnadc_", year, "_", quarter, ".parquet") + ) + base::message( + paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n") + ) + arrow::write_parquet(df, file_path) + } + return(file_path) + } else { + return(df) # return the df in memory instead of the file path + } } } ) - # Remove NULL entries (failed downloads) - quarters_df_list <- purrr::compact(quarters_df_list) - - # Save all quarters to a 
single parquet file (list of data frames as separate row groups / named list) - quarters_parquet_path <- file.path(param$save_to, "pnadc_quarters.parquet") - - base::message(paste0("Saving all quarters to ", quarters_parquet_path, "\n")) - - # Write as a single parquet by binding all quarters and adding a key column - quarters_combined <- dplyr::bind_rows( - purrr::imap(quarters_df_list, function(df, key) { - dplyr::mutate(df, .quarter_key = key) - }) - ) - arrow::write_parquet(quarters_combined, sink = quarters_parquet_path) + # erase NULL observations from source_files list + source_files <- purrr::compact(source_files) - ## Return Raw Data (panel = "none") + ## Return Raw Data if (param$panel == "none") { - return(paste("Quarters saved to", quarters_parquet_path)) + return(paste("Quarters saved to", param$save_to)) } ################# ## Panel Files ## ################# - panel_list <- unique(panel_list) - - # Determine panel file extension - panel_ext <- if (panels_as_csv) ".csv" else ".parquet" - - panel_files <- purrr::map( - panel_list, - function(p) { - file.path(param$save_to, paste0("pnadc_panel_", p, panel_ext)) - } - ) - - # Initialize empty CSV panel files if needed (for appending) - if (panels_as_csv) { - purrr::map(panel_files, function(path) { - readr::write_csv(data.frame(), path, col_names = TRUE) - }) - } - - # Lazy-load the quarters parquet and split into panels - # We use arrow::open_dataset for lazy/columnar reading - base::message("Splitting quarters into panels...\n") - - quarters_dataset <- arrow::open_dataset(quarters_parquet_path) - - # Only load the columns needed: V1014 (panel assignment) + all cnames - # (lazy scan: only materialise when we filter by panel) - - purrr::walk(panel_list, function(p) { - base::message(paste("Compiling panel", p, "\n")) + if (param$panel != "none") { + ## Split data into panels - # Lazy filter — only reads rows for this panel from the parquet - panel_df <- quarters_dataset %>% - dplyr::filter(V1014 == p) 
%>% - dplyr::select(dplyr::all_of(cnames)) %>% # drop the .quarter_key helper column - dplyr::collect() + panel_list <- unique(panel_list) # listing all the panels included in the quarters downloaded - ext <- if (panels_as_csv) ".csv" else ".parquet" - file_path <- file.path(param$save_to, paste0("pnadc_panel_", p, ext)) + # set up file paths for each panel such as "pnadc_panel_2.csv" or "pnadc_panel_2.parquet" - if (panels_as_csv) { - readr::write_csv(panel_df, file_path, append = TRUE) - } else { - arrow::write_parquet(panel_df, sink = file_path) - } - }) - - ########################## - ## Panel Identification ## - ########################## - - # defining column types (only needed for CSV reading) - if (param$raw_data) { - ctypes <- readr::cols(.default = readr::col_number()) - } else { - ctypes <- readr::cols( - .default = readr::col_number(), - regiao = readr::col_character(), - sigla_uf = readr::col_character(), - sexo = readr::col_character(), - faixa_idade = readr::col_character(), - faixa_educ = readr::col_character(), - cnae_2dig = readr::col_character(), - cod_2dig = readr::col_character() + panel_files <- purrr::map( + panel_list, + function(p) { + ext <- if (param$csv) ".csv" else ".parquet" + file_path <- file.path( + param$save_to, paste0("pnadc_panel_", p, ext) + ) + file_path + } ) - } - - purrr::map( - panel_files, - function(path) { - message(paste("Running", param$panel, "identification on", path, "\n")) - - if (panels_as_csv) { - df <- data.table::fread( - path, - col.names = cnames, - colClasses = ctypes + + # read each of the source files, split into panels, and compile them + + # we use the .csv files because they have a appending propriety, meaning that they can receive new information without having the older one deleted + # for the R users, you can simply think as literally doing a rbind() into those files, but in a much more efficient way + + panel_data <- purrr::map( + panel_list, + function(p) { + purrr::map( + source_files, + 
function(file) { + dat <- if (param$save_quarters) { + if (param$csv) { + readr::read_csv(file, show_col_types = FALSE) + } else { + arrow::read_parquet(file) + } + } else { + file # already a df in memory + } + dat %>% dplyr::filter(V1014 == p) + } ) %>% - build_pnadc_panel(panel = param$panel) - readr::write_csv(df, path) - } else { - df <- arrow::read_parquet(path) %>% - build_pnadc_panel(panel = param$panel) - arrow::write_parquet(df, sink = path) + purrr::list_rbind() } + ) + + ########################## + ## Panel Identification ## + ########################## + + # defining column types + + if (param$raw_data) { + ctypes <- readr::cols(.default = readr::col_number()) + } else { + ctypes <- readr::cols( + .default = readr::col_number(), + regiao = readr::col_character(), + sigla_uf = readr::col_character(), + sexo = readr::col_character(), + faixa_idade = readr::col_character(), + faixa_educ = readr::col_character(), + cnae_2dig = readr::col_character(), + cod_2dig = readr::col_character() + ) } - ) - - # Delete the quarter parquet if the user did not ask to keep it - # (ignored when panel = "none" — quarters always kept in that case) - if (!save_quarters) { - if (file.exists(quarters_parquet_path)) file.remove(quarters_parquet_path) + + # apply the identification algorithms defined in build_pnadc_panel.R and save panel files + + purrr::map2( + panel_data, panel_files, + function(dat, path) { + message(paste("Running", param$panel, "identification on panel", "\n")) + + df <- dat %>% + build_pnadc_panel(panel = param$panel) + + if (param$csv) { + message(paste("Compiling panel to", path, "\n")) + readr::write_csv(df, path) + } else { + message(paste("Compiling panel to", path, "\n")) + arrow::write_parquet(df, path) + } + + return(df) + } + ) } #################### @@ -282,7 +274,7 @@ load_pnadc <- function(save_to = getwd(), return(paste("Panel files saved to", param$save_to)) } - + ###################### ## Data Engineering ## ###################### @@ 
-378,7 +370,7 @@ treat_pnadc <- function(df) { dplyr::mutate( faixa_educ = dplyr::case_match( VD3004, - 1 ~ "Sem instru\u00e7\u00a3o", + 1 ~ "Sem instru\u00e7\u00e3o", 2 ~ "1 a 7 anos de estudo", 3 ~ "8 a 11 anos de estudo", 4:6 ~ "9 a 14 anos de estudo", From c4f75490084249cf95684f19d0d3300eb158d3b4 Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Sun, 22 Mar 2026 19:43:05 -0300 Subject: [PATCH 09/11] vignette adjust --- vignettes/LOAD_PNADC.Rmd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vignettes/LOAD_PNADC.Rmd b/vignettes/LOAD_PNADC.Rmd index 88d1cbc..ccb47ce 100644 --- a/vignettes/LOAD_PNADC.Rmd +++ b/vignettes/LOAD_PNADC.Rmd @@ -98,7 +98,7 @@ load_pnadc( ) ``` -To download PNADC data and save panels as CSV but discard the intermediate quarters parquet, run +To download PNADC data and save panels as CSV but discard the intermediate quarters, run ```{r eval=FALSE} load_pnadc( @@ -133,10 +133,10 @@ load_pnadc( * `FALSE`: if you want the treated version of the PNADC variables. 6. **save_options**: A logical vector of length 2 controlling file saving behaviour: - * `c(TRUE, TRUE)` (default): keeps the intermediate quarters parquet after panel is built; saves panel files as `.csv`. - * `c(FALSE, TRUE)`: deletes the quarters parquet after use; saves panel files as `.csv`. - * `c(TRUE, FALSE)`: keeps the quarters parquet; saves panel files as `.parquet` (a list of panel data frames). - * `c(FALSE, FALSE)`: deletes the quarters parquet after use; saves panel files as `.parquet`. + * `c(TRUE, TRUE)` (default): keeps the intermediate quarters after panel is built; saves all files as `.csv`. + * `c(FALSE, TRUE)`: deletes the quarters after use; saves panel files as `.csv`. + * `c(TRUE, FALSE)`: keeps the quarters; saves all files as `.parquet` (a list of panel data frames). + * `c(FALSE, FALSE)`: deletes the quarters after use; saves panel files as `.parquet`. 
*** **Details:** From abd14fecf3150492762956efda85cf3d1e8efb50 Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Sun, 22 Mar 2026 19:56:35 -0300 Subject: [PATCH 10/11] mistake solving --- NAMESPACE | 2 +- R/load_pnadc.R | 16 ++++++++-------- README.md | 16 ++++++++-------- man/load_pnadc.Rd | 30 +++++++++++++++++------------- 4 files changed, 34 insertions(+), 30 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 9eec8de..e186f97 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,5 +3,5 @@ export(build_pnadc_panel) export(load_pnadc) import(PNADcIBGE) -import(data.table) +importFrom(data.table,fread) importFrom(magrittr,`%>%`) diff --git a/R/load_pnadc.R b/R/load_pnadc.R index 1ddd1de..c8f09cd 100644 --- a/R/load_pnadc.R +++ b/R/load_pnadc.R @@ -7,7 +7,7 @@ #' @param quarters The quarters within those years to be downloaded. Can be a numeric vector or a list of vectors, for different quarters per year. #' @param panel A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")} #' @param raw_data A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables. -#' @param save_files A \code{logical} vector of length 2. Controls whether quarterly +#' @param save_options A \code{logical} vector of length 2. Controls whether quarterly #' files are saved and in which format all files are saved. Panel files are #' always saved. 
There are four possible combinations: #' \itemize{ @@ -27,21 +27,21 @@ #' @import PNADcIBGE #' @importFrom magrittr `%>%` #' -#' @examples -#' \dontrun{ +#' @examplesIf interactive() +#' #' load_pnadc( #' save_to = "Directory/You/Would/like/to/save/the/files", #' years = 2016, #' quarters = 1:4, #' panel = "basic", -#' raw_data = FALSE +#' raw_data = FALSE, +#' save_options = c(FALSE, FALSE) #' ) -#' } #' @export load_pnadc <- function(save_to = getwd(), years, quarters = 1:4, panel = "advanced", - raw_data = FALSE, save_files = c(TRUE, TRUE)) { + raw_data = FALSE, save_options = c(TRUE, TRUE)) { # Check if PNADcIBGE namespace is already attached if (!"PNADcIBGE" %in% .packages()) { # If not attached, attach it @@ -75,8 +75,8 @@ load_pnadc <- function(save_to = getwd(), years, param$panel <- panel # which panel algorithm (none, basic or advanced) should be applied to this data, check our READ-ME for greater explanation param$raw_data <- raw_data # A command to define if the user would like to download the raw data from the IBGE website directly param$save_to <- save_to # the directory in which the user desires to save the files downloaded - param$save_quarters <- save_files[1] # whether to save quarterly files to disk - param$csv <- save_files[2] # if TRUE, saves as .csv; if FALSE, saves as .parquet + param$save_quarters <- save_options[1] # whether to save quarterly files to disk + param$csv <- save_options[2] # if TRUE, saves as .csv; if FALSE, saves as .parquet # Check if quarter is a list; if not, wrap it in a list and repeat it for each year if (!is.list(quarters)) { diff --git a/README.md b/README.md index a8a2ea3..666e519 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,7 @@ load_pnadc( ``` To download PNADC data and save panels as CSV but discard the -intermediate quarters parquet, run +intermediate quarters, run ``` r load_pnadc( @@ -199,14 +199,14 @@ load_pnadc( 6. 
**save_options**: A logical vector of length 2 controlling file saving behaviour: - - `c(TRUE, TRUE)` (default): keeps the intermediate quarters parquet - after panel is built; saves panel files as `.csv`. - - `c(FALSE, TRUE)`: deletes the quarters parquet after use; saves - panel files as `.csv`. - - `c(TRUE, FALSE)`: keeps the quarters parquet; saves panel files as + - `c(TRUE, TRUE)` (default): keeps the intermediate quarters after + panel is built; saves all files as `.csv`. + - `c(FALSE, TRUE)`: deletes the quarters after use; saves panel + files as `.csv`. + - `c(TRUE, FALSE)`: keeps the quarters; saves all files as `.parquet` (a list of panel data frames). - - `c(FALSE, FALSE)`: deletes the quarters parquet after use; saves - panel files as `.parquet`. + - `c(FALSE, FALSE)`: deletes the quarters after use; saves panel + files as `.parquet`. ------------------------------------------------------------------------ diff --git a/man/load_pnadc.Rd b/man/load_pnadc.Rd index c06927f..ae91ce4 100644 --- a/man/load_pnadc.Rd +++ b/man/load_pnadc.Rd @@ -24,16 +24,19 @@ load_pnadc( \item{raw_data}{A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables.} -\item{save_options}{A \code{logical} vector of length 2 controlling how files are saved. -The first element controls whether quarter files are kept after the panel is built; -the second controls whether output files are saved as CSV (\code{TRUE}) or Parquet (\code{FALSE}). +\item{save_options}{A \code{logical} vector of length 2. Controls whether quarterly +files are saved and in which format all files are saved. Panel files are +always saved. There are four possible combinations: \itemize{ -\item \code{c(TRUE, TRUE)} (default): saves quarters parquet; saves panels as CSV. -\item \code{c(FALSE, TRUE)}: does not keep quarter parquet; saves panels as CSV. -\item \code{c(TRUE, FALSE)}: saves quarters parquet; saves panels as Parquet. 
-\item \code{c(FALSE, FALSE)}: does not keep quarter parquet; saves panels as Parquet. -} -When \code{panel = "none"}, the quarter parquet is always kept regardless of \code{save_options[1]}.} +\item \code{c(TRUE, TRUE)}: saves quarterly and panel files in +\code{.csv} format. This is the default. +\item \code{c(TRUE, FALSE)}: saves quarterly and panel files in +\code{.parquet} format. +\item \code{c(FALSE, TRUE)}: does not save quarterly files; panel files +are saved in \code{.csv} format. +\item \code{c(FALSE, FALSE)}: does not save quarterly files; panel files +are saved in \code{.parquet} format. +}} } \value{ A message indicating the successful save of panel files. @@ -42,14 +45,15 @@ A message indicating the successful save of panel files. This function downloads PNADC data and applies panel identification algorithms } \examples{ -\dontrun{ +\dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} + load_pnadc( save_to = "Directory/You/Would/like/to/save/the/files", years = 2016, quarters = 1:4, - panel = "advanced", + panel = "basic", raw_data = FALSE, - save_options = c(TRUE, FALSE) + save_options = c(FALSE, FALSE) ) -} +\dontshow{\}) # examplesIf} } From e4108f69b963f60e15472c2c17f8d9469610483f Mon Sep 17 00:00:00 2001 From: Bernardo Furlanetto Sieira Date: Sun, 22 Mar 2026 22:08:36 -0300 Subject: [PATCH 11/11] Delete README.html --- README.html | 782 ---------------------------------------------------- 1 file changed, 782 deletions(-) delete mode 100644 README.html diff --git a/README.html b/README.html deleted file mode 100644 index f3b25c7..0000000 --- a/README.html +++ /dev/null @@ -1,782 +0,0 @@ - - - - - - - - - - - - - -README - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - -

- -
-

datazoom.social

- -

Languages Commits Open Issues Closed Issues Files Followers

-

The datazoom.social package facilitates access to official Brazilian -social data.

-

This package is in development stage - more datasets will be released -soon.

-

In this first version of the package, the focus is only on the Continuous -PNAD. We allow for many quarters to be easily downloaded and read, -as well as identifying individuals across time, forming a panel.

-
-
-

Installation

- - - - - -

You can install the development version of -datazoom.social from GitHub with:

-
install.packages("devtools")
-devtools::install_github("datazoompuc/datazoom.social")
-
-

Data

- - - - -
- ---- - - - - - - - - - - -
Continuous PNADDownload PNADC of a range of quarters
Panel -IdentificationBuild a Panel of PNADC individuals
-
-
-
-

Continuous PNAD

-

The load_pnadc function is a wrapper for get_pnadc -from the package PNADcIBGE, with added identification -algorithms to build a Panel.

-
-

Panel Structure:

-

The table below shows the first and last quarter -(ANOtrimestre, e.g. 20121 = 2012 Q1) covered -by each PNADC rotating panel:

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PanelStartEnd
12012120124
22012120141
32013220152
42014320163
52015420174
62017120191
72018220202
82019320213
92020420224
102022120241
112023220252
122024320263
132025420274
142027120291
-
-

Usage:

-

Default

-

-load_pnadc(
-  save_to = getwd(),
-  years,
-  quarters = 1:4,
-  panel = "advanced",
-  raw_data = FALSE,
-  save_options = c(TRUE, TRUE)
-)
-

To download PNADC data for all quarters of 2022 and 2023, with -advanced identification, simply run

-
load_pnadc(
-  save_to = "Directory/You/Would/like/to/save/the/files",
-  years = 2022:2023
-)
-

To download PNADC data for all of 2022, but only the first quarter of -2023, run

-
load_pnadc(
-  save_to = "Directory/You/Would/like/to/save/the/files",
-  years = 2022:2023,
-  quarters = list(1:4, 1)
-)
-

To download PNADC data without any variables treatment or -identification (e.g., for all quarters of 2021), run

-
load_pnadc(
-  save_to = "Directory/You/Would/like/to/save/the/files",
-  years = 2021,
-  panel = "none",
-  raw_data = TRUE
-)
-

To download PNADC data, keep the quarters parquet on disk, and save -panels as Parquet, run

-
load_pnadc(
-  save_to = "Directory/You/Would/like/to/save/the/files",
-  years = 2022,
-  save_options = c(TRUE, FALSE)
-)
-

To download PNADC data and save panels as CSV but discard the -intermediate quarters parquet, run

-
load_pnadc(
-  save_to = "Directory/You/Would/like/to/save/the/files",
-  years = 2022,
-  save_options = c(FALSE, TRUE)
-)
-
-

Options:

-
    -
  1. save_to: The directory in which the user desires -to save the downloaded files.

  2. -
  3. years: picks the years for which the data will -be downloaded

  4. -
  5. quarters: The quarters within those years to be -downloaded. Can be either a vector such as 1:4 for -consistent quarters across years, or a list of vectors, if quarters are -different for each year.

  6. -
  7. panel: Which panel algorithm to apply to this -data. There are three options:

    -
      -
    • none: No panel is built. If -raw_data = TRUE, returns the original data. Otherwise, -creates some extra treated variables. The intermediate quarters parquet -is always kept when panel = "none".
    • -
    • basic: Performs basic identification steps for creating -households and individual identifiers for panel construction
    • -
    • advanced: Performs advanced identification steps for -creating households and individual identifiers for panel -construction.
    • -
  8. -
  9. raw_data: A command to define if the user would -like to download the raw or treated data. There are two options:

    -
      -
    • TRUE: if you want the PNADC variables as they -come.
    • -
    • FALSE: if you want the treated version of the PNADC -variables.
    • -
  10. -
  11. save_options: A logical vector of length 2 -controlling file saving behaviour:

    -
      -
    • c(TRUE, TRUE) (default): keeps the intermediate -quarters parquet after panel is built; saves panel files as -.csv.
    • -
    • c(FALSE, TRUE): deletes the quarters parquet after use; -saves panel files as .csv.
    • -
    • c(TRUE, FALSE): keeps the quarters parquet; saves panel -files as .parquet (a list of panel data frames).
    • -
    • c(FALSE, FALSE): deletes the quarters parquet after -use; saves panel files as .parquet.
    • -
  12. -
-
-

Details:

-

The function performs the following steps:

-
    -
  1. Loop over years and quarters using -PNADcIBGE::get_pnadc to download the data. All quarters are -collected in memory and saved together into a single -pnadc_quarters.parquet file in save_to. If the -raw_data option is FALSE, some PNADC variables -are treated at this stage.

  2. -
  3. Split the data into panels by lazy-loading the parquet and -filtering by the panel variable V1014. Data from each panel -x is saved to pnadc_panel_x.csv or -pnadc_panel_x.parquet, depending on -save_options[2].

  4. -
  5. Read each panel file and apply the identification algorithms -defined in build_pnadc_panel.

  6. -
  7. If save_options[1] = FALSE, the intermediate -quarters parquet is deleted after the panels are built.

  8. -
-
    -
  • The identification algorithms in build_pnadc_panel are -drawn from Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): -“Sobre o painel da Pesquisa Mensal de Emprego (PME) do IBGE”.
  • -
-
-
-
-
-

PNAD Panel Identification

-

Description

-

Our load_pnadc function uses the internal function -build_pnadc_panel to identify households and individuals -across quarters. The method used for the identification is based on the -paper of Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): -“Sobre o painel da Pesquisa Mensal de Emprego (PME) do IBGE”.

-
-
-
-

Basic Identification

-

The household identifier – stored as id_dom – combines -the variables:

-
    -
  • UF – State;

  • -
  • UPA – Primary Sampling Unit - PSU;

  • -
  • V1008 – Household;

  • -
  • V1014 – Panel Number;

  • -
-

In order to create a unique number for every combination of those -variables.

-
-

The basic individual identifier – stored as id_ind – -combines the household id with:

-
    -
  • V2003 – Order number: individual’s unique number -within their household;

  • -
  • V2007 – Sex;

  • -
  • Date of Birth – [V20082 (year), -V20081 (month), V2008 (day)];

  • -
-

In order to create a unique number for every combination of those -variables.

-
-
-
-

Advanced Identification

-

The advanced identifier is saved as id_rs. On -individuals who were not matched on all interviews, we relax some -assumptions to increase matching power. Under the assumption that the -date of birth is often misreported, we take individuals who are -either:

-
    -
  1. Head of the household or their partner

  2. -
  3. Child of the head of the household, 25 or older

  4. -
-

For these observations, we run the basic identification again, but -allowing the year of birth to be wrong. We also include the order -number.

-
-
-

Attrition

-

The tables below show the levels of attrition obtained using the -basic and advanced identification algorithms, and compare them to the -attrition levels obtained in the Stata datazoom_social -package.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
InterviewPercentage found (R)Percentage found (Stata)
1100.0100.0
286.285.7
378.577.5
473.271.6
569.166.8
-

Attrition for Panel 2

-

Each cell is the percentage of PNADC observations that are identified -by the advanced algorithm in each interview.

-
-
-
-
-

Credits

-

DataZoom is developed by a team at Pontifícia Universidade Católica -do Rio de Janeiro (PUC-Rio), Department of Economics. Our official -website is at: https://www.econ.puc-rio.br/datazoom/.

-

To cite package datazoom.social in publications use:

-
-

Data Zoom (2023). Data Zoom: Simplifying Access To Brazilian -Microdata.
-https://www.econ.puc-rio.br/datazoom/english/index.html

-
-

A BibTeX entry for LaTeX users is:

-
@Unpublished{DataZoom2024,
-    author = {Data Zoom},
-    title = {Data Zoom: Simplifying Access To Brazilian Microdata},
-    url = {https://www.econ.puc-rio.br/datazoom/english/index.html},
-    year = {2024}}
-
- - - - -
- - - - - - - - - - - - - - -