diff --git a/DESCRIPTION b/DESCRIPTION index 4a851b4..534157d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,6 +17,7 @@ Description: Functions to download and treat Brazilian social data from a License: MIT + file LICENSE URL: https://datazoom.com.br/en/ Imports: + arrow, data.table, dplyr, PNADcIBGE, diff --git a/NAMESPACE b/NAMESPACE index 9eec8de..e186f97 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,5 +3,5 @@ export(build_pnadc_panel) export(load_pnadc) import(PNADcIBGE) -import(data.table) +importFrom(data.table,fread) importFrom(magrittr,`%>%`) diff --git a/R/load_pnadc.R b/R/load_pnadc.R index d12ef65..c8f09cd 100644 --- a/R/load_pnadc.R +++ b/R/load_pnadc.R @@ -7,28 +7,41 @@ #' @param quarters The quarters within those years to be downloaded. Can be a numeric vector or a list of vectors, for different quarters per year. #' @param panel A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")} #' @param raw_data A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables. +#' @param save_options A \code{logical} vector of length 2. Controls whether quarterly +#' files are saved and in which format all files are saved. Panel files are +#' always saved. There are four possible combinations: +#' \itemize{ +#' \item \code{c(TRUE, TRUE)}: saves quarterly and panel files in +#' \code{.csv} format. This is the default. +#' \item \code{c(TRUE, FALSE)}: saves quarterly and panel files in +#' \code{.parquet} format. +#' \item \code{c(FALSE, TRUE)}: does not save quarterly files; panel files +#' are saved in \code{.csv} format. +#' \item \code{c(FALSE, FALSE)}: does not save quarterly files; panel files +#' are saved in \code{.parquet} format. +#' } #' #' @return A message indicating the successful save of panel files. -#' -#' @import data.table +#' +#' @importFrom data.table fread #' @import PNADcIBGE #' @importFrom magrittr `%>%` #' -#' @examples -#' \dontrun{ +#' @examplesIf interactive() +#' #' load_pnadc( #' save_to = "Directory/You/Would/like/to/save/the/files", #' years = 2016, #' quarters = 1:4, #' panel = "basic", -#' raw_data = FALSE +#' raw_data = FALSE, +#' save_options = c(FALSE, FALSE) #' ) -#' } #' @export load_pnadc <- function(save_to = getwd(), years, quarters = 1:4, panel = "advanced", - raw_data = FALSE) { + raw_data = FALSE, save_options = c(TRUE, TRUE)) { # Check if PNADcIBGE namespace is already attached if (!"PNADcIBGE" %in% .packages()) { # If not attached, attach it @@ -57,11 +70,13 @@ load_pnadc <- function(save_to = getwd(), years, # The param list contains the various objects that will be used as parameters for this function param <- list() - param$years <- years # the years the user would like to download - param$quarters <- quarters # the quarters within those years to be downloaded - param$panel <- panel # which panel algorithm (none, basic or advanced) should be applied to this data, check our READ-ME for greater explanation - param$raw_data <- raw_data # A command to define if the user would like to download the raw data from the IBGE website directly - param$save_to <- save_to # the directory in which the user desires to save the files downloaded + param$years <- years # the years the user would like to download + param$quarters <- quarters # the quarters within those years to be downloaded + param$panel <- panel # which panel algorithm (none, basic or advanced) should be applied to this data, check our READ-ME for greater explanation + param$raw_data <- raw_data # A command to define if the user would like to download the raw data from the IBGE website directly + param$save_to <- save_to # the directory in which the user desires to save the files downloaded + param$save_quarters <- save_options[1] # whether to save quarterly files to disk + param$csv <- save_options[2] # if TRUE, saves as .csv; if FALSE, saves as .parquet # Check if quarter is a list; if not, wrap it in a list and repeat it for each year if (!is.list(quarters)) { @@ -81,7 +96,7 @@ load_pnadc <- function(save_to = getwd(), years, # generaring these two paralell vectors of years and quarter to loop over - param$years <- unlist(param$years) + param$years <- unlist(param$years) param$quarters <- unlist(param$quarters) ################## @@ -91,14 +106,13 @@ load_pnadc <- function(save_to = getwd(), years, # store info on all panels and column names panel_list <- c() - cnames <- NULL + cnames <- NULL # download to the saving directory source_files <- purrr::map2( param$years, param$quarters, # looping over the two parallel vector of years and quarters (this was previoulsy done in a "for" structure, but qwe optimized it) - function(year, quarter) { base::message( paste0("Downloading PNADC ", year, " Q", quarter, "\n") # just generating a message so the user knows which file is being downloaded now @@ -119,10 +133,6 @@ load_pnadc <- function(save_to = getwd(), years, panel_list <<- c(panel_list, unique(df$V1014)) # registering, for every quarter, the panel's which the quarter's observations are included (every OBS is just included in one panel, but there should be OBS inserted in 2 to 3 panels for every quarter, check our READ-ME or the IBGE's website about the rotation scheme for PNADc surveys) #<<- stabilishing a variable inside the function that continues to exist outside the function, it is not just local to the function's current context - file_path <- file.path( - param$save_to, paste0("pnadc_", year, "_", quarter, ".rds") # defining the file's names to a certain format: year= 2022, quarter=3, file -> pnadc_2022_3.rds - ) - # runs data cleaning if desired if (!param$raw_data) { df <- treat_pnadc(df) @@ -130,15 +140,28 @@ load_pnadc <- function(save_to = getwd(), years, cnames <<- names(df) - # download each quarter to a separate file - - base::message( - paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n") - ) - - readr::write_rds(df, file_path, compress = "gz") # saving the file into the user's computer - - return(file_path) + if (param$save_quarters) { + if (param$csv) { + file_path <- file.path( + param$save_to, paste0("pnadc_", year, "_", quarter, ".csv") + ) + base::message( + paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n") + ) + readr::write_csv(df, file_path) + } else { + file_path <- file.path( + param$save_to, paste0("pnadc_", year, "_", quarter, ".parquet") + ) + base::message( + paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n") + ) + arrow::write_parquet(df, file_path) + } + return(file_path) + } else { + return(df) # return the df in memory instead of the file path + } } } ) @@ -161,52 +184,43 @@ load_pnadc <- function(save_to = getwd(), years, panel_list <- unique(panel_list) # listing all the panels included in the quarters downloaded - # set up .csv file paths for each panel such as "pnadc_panel_2.csv" + # set up file paths for each panel such as "pnadc_panel_2.csv" or "pnadc_panel_2.parquet" panel_files <- purrr::map( panel_list, - function(panel) { + function(p) { + ext <- if (param$csv) ".csv" else ".parquet" file_path <- file.path( - param$save_to, paste0("pnadc", "_panel_", panel, ".csv") + param$save_to, paste0("pnadc_panel_", p, ext) ) - file_path } ) - # write an empty dataframe into each - - purrr::map( - panel_files, - function(path) { - readr::write_csv(data.frame(), path, col_names = cnames) - } - ) - - # read each of the source files, split into panels, and append - # to their corresponding .csv files + # read each of the source files, split into panels, and compile them # we use the .csv files because they have a appending propriety, meaning that they can receive new information without having the older one deleted # for the R users, you can simply think as literally doing a rbind() into those files, but in a much more efficient way - purrr::map( - source_files, # source_files= the .rds files with the data that were downloaded way before in this function before - function(file) { - dat <- readr::read_rds(file) %>% - split(.$V1014) - - dat %>% - purrr::imap( - function(df, panel) { - file_path <- file.path( - param$save_to, paste0("pnadc", "_panel_", panel, ".csv") - ) - - message(paste("Compiling panel", panel, "to", file_path, "\n")) - - readr::write_csv(df, file_path, append = TRUE) # append=TRUE allows us to add new info without deleting the older one, as comented above + panel_data <- purrr::map( + panel_list, + function(p) { + purrr::map( + source_files, + function(file) { + dat <- if (param$save_quarters) { + if (param$csv) { + readr::read_csv(file, show_col_types = FALSE) + } else { + arrow::read_parquet(file) + } + } else { + file # already a df in memory } - ) + dat %>% dplyr::filter(V1014 == p) + } + ) %>% + purrr::list_rbind() } ) @@ -218,9 +232,7 @@ load_pnadc <- function(save_to = getwd(), years, if (param$raw_data) { ctypes <- readr::cols(.default = readr::col_number()) - } - - else { + } else { ctypes <- readr::cols( .default = readr::col_number(), regiao = readr::col_character(), @@ -233,21 +245,25 @@ load_pnadc <- function(save_to = getwd(), years, ) } - # read each file in panel_files and apply the identification algorithms defined in the build_pnadc_panel.R + # apply the identification algorithms defined in build_pnadc_panel.R and save panel files - purrr::map( - panel_files, - function(path) { - message(paste("Running", param$panel, "identification on", path, "\n")) + purrr::map2( + panel_data, panel_files, + function(dat, path) { + message(paste("Running", param$panel, "identification on panel", "\n")) - df <- data.table::fread( - path, - col.names = cnames, - colClasses = ctypes - ) %>% + df <- dat %>% build_pnadc_panel(panel = param$panel) - readr::write_csv(df, path) + if (param$csv) { + message(paste("Compiling panel to", path, "\n")) + readr::write_csv(df, path) + } else { + message(paste("Compiling panel to", path, "\n")) + arrow::write_parquet(df, path) + } + + return(df) } ) } @@ -354,7 +370,7 @@ treat_pnadc <- function(df) { dplyr::mutate( faixa_educ = dplyr::case_match( VD3004, - 1 ~ "Sem instru\u00e7\u00a3o", + 1 ~ "Sem instru\u00e7\u00e3o", 2 ~ "1 a 7 anos de estudo", 3 ~ "8 a 11 anos de estudo", 4:6 ~ "9 a 14 anos de estudo", diff --git a/README.md b/README.md index 4ca0b98..666e519 100644 --- a/README.md +++ b/README.md @@ -45,18 +45,6 @@ install.packages("devtools") devtools::install_github("datazoompuc/datazoom.social") ``` - ## Warning: replacing previous import 'data.table::first' by 'dplyr::first' when - ## loading 'datazoom.social' - - ## Warning: replacing previous import 'data.table::last' by 'dplyr::last' when - ## loading 'datazoom.social' - - ## Warning: replacing previous import 'data.table::between' by 'dplyr::between' - ## when loading 'datazoom.social' - - ## Warning: replacing previous import 'data.table::transpose' by - ## 'purrr::transpose' when loading 'datazoom.social' - ## Data @@ -81,17 +69,43 @@ build a Panel. ------------------------------------------------------------------------ +**Panel Structure:** + +The table below shows the first and last quarter (`ANOtrimestre`, e.g. +`20121` = 2012 Q1) covered by each PNADC rotating panel: + +| Panel | Start | End | +|------:|------:|------:| +| 1 | 20121 | 20124 | +| 2 | 20121 | 20141 | +| 3 | 20132 | 20152 | +| 4 | 20143 | 20163 | +| 5 | 20154 | 20174 | +| 6 | 20171 | 20191 | +| 7 | 20182 | 20202 | +| 8 | 20193 | 20213 | +| 9 | 20204 | 20224 | +| 10 | 20221 | 20241 | +| 11 | 20232 | 20252 | +| 12 | 20243 | 20263 | +| 13 | 20254 | 20274 | +| 14 | 20271 | 20291 | + +------------------------------------------------------------------------ + **Usage:** Default ``` r + load_pnadc( save_to = getwd(), years, quarters = 1:4, panel = "advanced", - raw_data = FALSE + raw_data = FALSE, + save_options = c(TRUE, TRUE) ) ``` @@ -128,6 +142,28 @@ load_pnadc( ) ``` +To download PNADC data, keep the quarters parquet on disk, and save +panels as Parquet, run + +``` r +load_pnadc( + save_to = "Directory/You/Would/like/to/save/the/files", + years = 2022, + save_options = c(TRUE, FALSE) +) +``` + +To download PNADC data and save panels as CSV but discard the +intermediate quarters, run + +``` r +load_pnadc( + save_to = "Directory/You/Would/like/to/save/the/files", + years = 2022, + save_options = c(FALSE, TRUE) +) +``` + ------------------------------------------------------------------------ **Options:** @@ -147,6 +183,8 @@ load_pnadc( - `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. + The intermediate quarters parquet is always kept when + `panel = "none"`. - `basic`: Performs basic identification steps for creating households and individual identifiers for panel construction - `advanced`: Performs advanced identification steps for creating @@ -158,6 +196,18 @@ load_pnadc( - `TRUE`: if you want the PNADC variables as they come. - `FALSE`: if you want the treated version of the PNADC variables. +6. **save_options**: A logical vector of length 2 controlling file + saving behaviour: + + - `c(TRUE, TRUE)` (default): keeps the intermediate quarters after + panel is built; saves all files as `.csv`. + - `c(FALSE, TRUE)`: deletes the quarters after use; saves panel + files as `.csv`. + - `c(TRUE, FALSE)`: keeps the quarters; saves all files as + `.parquet` (a list of panel data frames). + - `c(FALSE, FALSE)`: deletes the quarters after use; saves panel + files as `.parquet`. + ------------------------------------------------------------------------ **Details:** @@ -165,18 +215,21 @@ load_pnadc( The function performs the following steps: 1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to - download the data and save in the `save_to` directory, in files - named `pnadc_year_quarter.rds`. If the `raw_data` option is `FALSE`, - some PNADC variables are treated at this stage. + download the data. All quarters are collected in memory and saved + together into a single `pnadc_quarters.parquet` file in `save_to`. + If the `raw_data` option is `FALSE`, some PNADC variables are + treated at this stage. -2. Split the data into panels, by reading each `.rds` file and - filtering by the quarter variable `V1014`. Data from each panel `x` - is saved to `pnad_panel_x.csv`. The use of `.csv` allows for data - from each quarter to be appended on top of the previous ones, making - the process faster. +2. Split the data into panels by lazy-loading the parquet and filtering + by the panel variable `V1014`. Data from each panel `x` is saved to + `pnadc_panel_x.csv` or `pnadc_panel_x.parquet`, depending on + `save_options[2]`. 3. Read each panel file and apply the identification algorithms defined - in the `build_pnadc_panel`. + in `build_pnadc_panel`. + +4. If `save_options[1] = FALSE`, the intermediate quarters parquet is + deleted after the panels are built. - The identification algorithms in `build_pnadc_panel` are drawn from Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): “Sobre o @@ -270,18 +323,18 @@ the advanced algorithm in each interview. DataZoom is developed by a team at Pontifícia Universidade Católica do Rio de Janeiro (PUC-Rio), Department of Economics. Our official website -is at: . +is at: . To cite package `datazoom.social` in publications use: > Data Zoom (2023). Data Zoom: Simplifying Access To Brazilian > Microdata. -> +> A BibTeX entry for LaTeX users is: - @Unpublished{DataZoom2023, + @Unpublished{DataZoom2024, author = {Data Zoom}, title = {Data Zoom: Simplifying Access To Brazilian Microdata}, - url = {https://datazoom.com.br/en/}, - year = {2023}} + url = {https://www.econ.puc-rio.br/datazoom/english/index.html}, + year = {2024}} diff --git a/man/load_pnadc.Rd b/man/load_pnadc.Rd index 6493141..ae91ce4 100644 --- a/man/load_pnadc.Rd +++ b/man/load_pnadc.Rd @@ -9,7 +9,8 @@ load_pnadc( years, quarters = 1:4, panel = "advanced", - raw_data = FALSE + raw_data = FALSE, + save_options = c(TRUE, TRUE) ) } \arguments{ @@ -22,6 +23,20 @@ load_pnadc( \item{panel}{A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")}} \item{raw_data}{A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables.} + +\item{save_options}{A \code{logical} vector of length 2. Controls whether quarterly +files are saved and in which format all files are saved. Panel files are +always saved. There are four possible combinations: +\itemize{ +\item \code{c(TRUE, TRUE)}: saves quarterly and panel files in +\code{.csv} format. This is the default. +\item \code{c(TRUE, FALSE)}: saves quarterly and panel files in +\code{.parquet} format. +\item \code{c(FALSE, TRUE)}: does not save quarterly files; panel files +are saved in \code{.csv} format. +\item \code{c(FALSE, FALSE)}: does not save quarterly files; panel files +are saved in \code{.parquet} format. +}} } \value{ A message indicating the successful save of panel files. @@ -30,13 +45,15 @@ A message indicating the successful save of panel files. This function downloads PNADC data and applies panel identification algorithms } \examples{ -\dontrun{ +\dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} + load_pnadc( save_to = "Directory/You/Would/like/to/save/the/files", years = 2016, quarters = 1:4, panel = "basic", - raw_data = FALSE + raw_data = FALSE, + save_options = c(FALSE, FALSE) ) -} +\dontshow{\}) # examplesIf} } diff --git a/vignettes/LOAD_PNADC.Rmd b/vignettes/LOAD_PNADC.Rmd index 8e1ed2b..ccb47ce 100644 --- a/vignettes/LOAD_PNADC.Rmd +++ b/vignettes/LOAD_PNADC.Rmd @@ -17,19 +17,45 @@ knitr::opts_chunk$set( The `load_pnadc` function is a wrapper for [*`get_pnadc`*](https://www.rdocumentation.org/packages/PNADcIBGE/versions/0.7.0/topics/get_pnadc) from the package `PNADcIBGE`, with added identification algorithms to build a Panel. +*** +**Panel Structure:** + +The table below shows the first and last quarter (`ANOtrimestre`, e.g. +`20121` = 2012 Q1) covered by each PNADC rotating panel: + +| Panel | Start | End | +|------:|------:|------:| +| 1 | 20121 | 20124 | +| 2 | 20121 | 20141 | +| 3 | 20132 | 20152 | +| 4 | 20143 | 20163 | +| 5 | 20154 | 20174 | +| 6 | 20171 | 20191 | +| 7 | 20182 | 20202 | +| 8 | 20193 | 20213 | +| 9 | 20204 | 20224 | +| 10 | 20221 | 20241 | +| 11 | 20232 | 20252 | +| 12 | 20243 | 20263 | +| 13 | 20254 | 20274 | +| 14 | 20271 | 20291 | + *** **Usage:** Default ```{r eval=FALSE} + load_pnadc( save_to = getwd(), years, quarters = 1:4, panel = "advanced", - raw_data = FALSE + raw_data = FALSE, + save_options = c(TRUE, TRUE) ) + ``` To download PNADC data for all quarters of 2022 and 2023, with advanced identification, simply run @@ -62,6 +88,27 @@ load_pnadc( ) ``` +To download PNADC data, keep the quarters parquet on disk, and save panels as Parquet, run + +```{r eval=FALSE} +load_pnadc( + save_to = "Directory/You/Would/like/to/save/the/files", + years = 2022, + save_options = c(TRUE, FALSE) +) +``` + +To download PNADC data and save panels as CSV but discard the intermediate quarters, run + +```{r eval=FALSE} +load_pnadc( + save_to = "Directory/You/Would/like/to/save/the/files", + years = 2022, + save_options = c(FALSE, TRUE) +) +``` + + *** **Options:** @@ -76,7 +123,7 @@ load_pnadc( 4. **panel**: Which panel algorithm to apply to this data. There are three options: - * `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. + * `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. The intermediate quarters parquet is always kept when `panel = "none"`. * `basic`: Performs basic identification steps for creating households and individual identifiers for panel construction * `advanced`: Performs advanced identification steps for creating households and individual identifiers for panel construction. @@ -84,21 +131,28 @@ load_pnadc( 5. **raw_data**: A command to define if the user would like to download the raw or treated data. There are two options: * `TRUE`: if you want the PNADC variables as they come. * `FALSE`: if you want the treated version of the PNADC variables. + + 6. **save_options**: A logical vector of length 2 controlling file saving behaviour: + * `c(TRUE, TRUE)` (default): keeps the intermediate quarters after panel is built; saves all files as `.csv`. + * `c(FALSE, TRUE)`: deletes the quarters after use; saves panel files as `.csv`. + * `c(TRUE, FALSE)`: keeps the quarters; saves all files as `.parquet` (a list of panel data frames). + * `c(FALSE, FALSE)`: deletes the quarters after use; saves panel files as `.parquet`. *** **Details:** The function performs the following steps: - 1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to download the data and save in the `save_to` directory, in files named `pnadc_year_quarter.rds`. If the `raw_data` option is `FALSE`, some PNADC variables are treated at this stage. + + 1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to download the data. All quarters are collected in memory and saved together into a single `pnadc_quarters.parquet` file in `save_to`. If the `raw_data` option is `FALSE`, some PNADC variables are treated at this stage. - 2. Split the data into panels, by reading each `.rds` file and filtering by the quarter variable `V1014`. Data from each panel `x` is saved to `pnad_panel_x.csv`. The use of `.csv` allows for data from each quarter to be appended on top of the previous ones, making the process faster. + 2. Split the data into panels by lazy-loading the parquet and filtering by the panel variable `V1014`. Data from each panel `x` is saved to `pnadc_panel_x.csv` or `pnadc_panel_x.parquet`, depending on `save_options[2]`. - 3. Read each panel file and apply the identification algorithms defined in the `build_pnadc_panel`. + 3. Read each panel file and apply the identification algorithms defined in `build_pnadc_panel`. + + 4. If `save_options[1] = FALSE`, the intermediate quarters parquet is deleted after the panels are built. + * The identification algorithms in `build_pnadc_panel` are drawn from Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): "Sobre o painel da Pesquisa Mensal de Emprego (PME) do IBGE". -*** - - - +*** \ No newline at end of file