diff --git a/DESCRIPTION b/DESCRIPTION
index 4a851b4..534157d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,6 +17,7 @@ Description: Functions to download and treat Brazilian social data from a
License: MIT + file LICENSE
URL: https://datazoom.com.br/en/
Imports:
+ arrow,
data.table,
dplyr,
PNADcIBGE,
diff --git a/NAMESPACE b/NAMESPACE
index 9eec8de..e186f97 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,5 +3,5 @@
export(build_pnadc_panel)
export(load_pnadc)
import(PNADcIBGE)
-import(data.table)
+importFrom(data.table,fread)
importFrom(magrittr,`%>%`)
diff --git a/R/load_pnadc.R b/R/load_pnadc.R
index d12ef65..c8f09cd 100644
--- a/R/load_pnadc.R
+++ b/R/load_pnadc.R
@@ -7,28 +7,41 @@
#' @param quarters The quarters within those years to be downloaded. Can be a numeric vector or a list of vectors, for different quarters per year.
#' @param panel A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")}
#' @param raw_data A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables.
+#' @param save_options A \code{logical} vector of length 2. Controls whether quarterly
+#' files are saved and in which format all files are saved. Panel files are
+#' always saved. There are four possible combinations:
+#' \itemize{
+#' \item \code{c(TRUE, TRUE)}: saves quarterly and panel files in
+#' \code{.csv} format. This is the default.
+#' \item \code{c(TRUE, FALSE)}: saves quarterly and panel files in
+#' \code{.parquet} format.
+#' \item \code{c(FALSE, TRUE)}: does not save quarterly files; panel files
+#' are saved in \code{.csv} format.
+#' \item \code{c(FALSE, FALSE)}: does not save quarterly files; panel files
+#' are saved in \code{.parquet} format.
+#' }
#'
#' @return A message indicating the successful save of panel files.
-#'
-#' @import data.table
+#'
+#' @importFrom data.table fread
#' @import PNADcIBGE
#' @importFrom magrittr `%>%`
#'
-#' @examples
-#' \dontrun{
+#' @examplesIf interactive()
+#'
#' load_pnadc(
#' save_to = "Directory/You/Would/like/to/save/the/files",
#' years = 2016,
#' quarters = 1:4,
#' panel = "basic",
-#' raw_data = FALSE
+#' raw_data = FALSE,
+#' save_options = c(FALSE, FALSE)
#' )
-#' }
#' @export
load_pnadc <- function(save_to = getwd(), years,
quarters = 1:4, panel = "advanced",
- raw_data = FALSE) {
+ raw_data = FALSE, save_options = c(TRUE, TRUE)) {
# Check if PNADcIBGE namespace is already attached
if (!"PNADcIBGE" %in% .packages()) {
# If not attached, attach it
@@ -57,11 +70,13 @@ load_pnadc <- function(save_to = getwd(), years,
# The param list contains the various objects that will be used as parameters for this function
param <- list()
- param$years <- years # the years the user would like to download
- param$quarters <- quarters # the quarters within those years to be downloaded
- param$panel <- panel # which panel algorithm (none, basic or advanced) should be applied to this data, check our READ-ME for greater explanation
- param$raw_data <- raw_data # A command to define if the user would like to download the raw data from the IBGE website directly
- param$save_to <- save_to # the directory in which the user desires to save the files downloaded
+ param$years <- years # the years the user would like to download
+ param$quarters <- quarters # the quarters within those years to be downloaded
+ param$panel <- panel # which panel algorithm (none, basic or advanced) should be applied to this data, check our READ-ME for greater explanation
+ param$raw_data <- raw_data # A command to define if the user would like to download the raw data from the IBGE website directly
+ param$save_to <- save_to # the directory in which the user desires to save the files downloaded
+ param$save_quarters <- save_options[1] # whether to save quarterly files to disk
+ param$csv <- save_options[2] # if TRUE, saves as .csv; if FALSE, saves as .parquet
# Check if quarter is a list; if not, wrap it in a list and repeat it for each year
if (!is.list(quarters)) {
@@ -81,7 +96,7 @@ load_pnadc <- function(save_to = getwd(), years,
# generaring these two paralell vectors of years and quarter to loop over
- param$years <- unlist(param$years)
+ param$years <- unlist(param$years)
param$quarters <- unlist(param$quarters)
##################
@@ -91,14 +106,13 @@ load_pnadc <- function(save_to = getwd(), years,
# store info on all panels and column names
panel_list <- c()
- cnames <- NULL
+ cnames <- NULL
# download to the saving directory
source_files <- purrr::map2(
param$years, param$quarters, # looping over the two parallel vector of years and quarters (this was previoulsy done in a "for" structure, but qwe optimized it)
-
function(year, quarter) {
base::message(
paste0("Downloading PNADC ", year, " Q", quarter, "\n") # just generating a message so the user knows which file is being downloaded now
@@ -119,10 +133,6 @@ load_pnadc <- function(save_to = getwd(), years,
panel_list <<- c(panel_list, unique(df$V1014)) # registering, for every quarter, the panel's which the quarter's observations are included (every OBS is just included in one panel, but there should be OBS inserted in 2 to 3 panels for every quarter, check our READ-ME or the IBGE's website about the rotation scheme for PNADc surveys)
#<<- stabilishing a variable inside the function that continues to exist outside the function, it is not just local to the function's current context
- file_path <- file.path(
- param$save_to, paste0("pnadc_", year, "_", quarter, ".rds") # defining the file's names to a certain format: year= 2022, quarter=3, file -> pnadc_2022_3.rds
- )
-
# runs data cleaning if desired
if (!param$raw_data) {
df <- treat_pnadc(df)
@@ -130,15 +140,28 @@ load_pnadc <- function(save_to = getwd(), years,
cnames <<- names(df)
- # download each quarter to a separate file
-
- base::message(
- paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n")
- )
-
- readr::write_rds(df, file_path, compress = "gz") # saving the file into the user's computer
-
- return(file_path)
+ if (param$save_quarters) {
+ if (param$csv) {
+ file_path <- file.path(
+ param$save_to, paste0("pnadc_", year, "_", quarter, ".csv")
+ )
+ base::message(
+ paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n")
+ )
+ readr::write_csv(df, file_path)
+ } else {
+ file_path <- file.path(
+ param$save_to, paste0("pnadc_", year, "_", quarter, ".parquet")
+ )
+ base::message(
+ paste0("Saving ", year, " Q", quarter, " to\n", file_path, "\n")
+ )
+ arrow::write_parquet(df, file_path)
+ }
+ return(file_path)
+ } else {
+ return(df) # return the df in memory instead of the file path
+ }
}
}
)
@@ -161,52 +184,43 @@ load_pnadc <- function(save_to = getwd(), years,
panel_list <- unique(panel_list) # listing all the panels included in the quarters downloaded
- # set up .csv file paths for each panel such as "pnadc_panel_2.csv"
+ # set up file paths for each panel such as "pnadc_panel_2.csv" or "pnadc_panel_2.parquet"
panel_files <- purrr::map(
panel_list,
- function(panel) {
+ function(p) {
+ ext <- if (param$csv) ".csv" else ".parquet"
file_path <- file.path(
- param$save_to, paste0("pnadc", "_panel_", panel, ".csv")
+ param$save_to, paste0("pnadc_panel_", p, ext)
)
-
file_path
}
)
- # write an empty dataframe into each
-
- purrr::map(
- panel_files,
- function(path) {
- readr::write_csv(data.frame(), path, col_names = cnames)
- }
- )
-
- # read each of the source files, split into panels, and append
- # to their corresponding .csv files
+ # read each of the source files, split into panels, and compile them
# we use the .csv files because they have a appending propriety, meaning that they can receive new information without having the older one deleted
# for the R users, you can simply think as literally doing a rbind() into those files, but in a much more efficient way
- purrr::map(
- source_files, # source_files= the .rds files with the data that were downloaded way before in this function before
- function(file) {
- dat <- readr::read_rds(file) %>%
- split(.$V1014)
-
- dat %>%
- purrr::imap(
- function(df, panel) {
- file_path <- file.path(
- param$save_to, paste0("pnadc", "_panel_", panel, ".csv")
- )
-
- message(paste("Compiling panel", panel, "to", file_path, "\n"))
-
- readr::write_csv(df, file_path, append = TRUE) # append=TRUE allows us to add new info without deleting the older one, as comented above
+ panel_data <- purrr::map(
+ panel_list,
+ function(p) {
+ purrr::map(
+ source_files,
+ function(file) {
+ dat <- if (param$save_quarters) {
+ if (param$csv) {
+ readr::read_csv(file, show_col_types = FALSE)
+ } else {
+ arrow::read_parquet(file)
+ }
+ } else {
+ file # already a df in memory
}
- )
+ dat %>% dplyr::filter(V1014 == p)
+ }
+ ) %>%
+ purrr::list_rbind()
}
)
@@ -218,9 +232,7 @@ load_pnadc <- function(save_to = getwd(), years,
if (param$raw_data) {
ctypes <- readr::cols(.default = readr::col_number())
- }
-
- else {
+ } else {
ctypes <- readr::cols(
.default = readr::col_number(),
regiao = readr::col_character(),
@@ -233,21 +245,25 @@ load_pnadc <- function(save_to = getwd(), years,
)
}
- # read each file in panel_files and apply the identification algorithms defined in the build_pnadc_panel.R
+ # apply the identification algorithms defined in build_pnadc_panel.R and save panel files
- purrr::map(
- panel_files,
- function(path) {
- message(paste("Running", param$panel, "identification on", path, "\n"))
+ purrr::map2(
+ panel_data, panel_files,
+ function(dat, path) {
+ message(paste("Running", param$panel, "identification on panel", "\n"))
- df <- data.table::fread(
- path,
- col.names = cnames,
- colClasses = ctypes
- ) %>%
+ df <- dat %>%
build_pnadc_panel(panel = param$panel)
- readr::write_csv(df, path)
+ if (param$csv) {
+ message(paste("Compiling panel to", path, "\n"))
+ readr::write_csv(df, path)
+ } else {
+ message(paste("Compiling panel to", path, "\n"))
+ arrow::write_parquet(df, path)
+ }
+
+ return(df)
}
)
}
@@ -354,7 +370,7 @@ treat_pnadc <- function(df) {
dplyr::mutate(
faixa_educ = dplyr::case_match(
VD3004,
- 1 ~ "Sem instru\u00e7\u00a3o",
+ 1 ~ "Sem instru\u00e7\u00e3o",
2 ~ "1 a 7 anos de estudo",
3 ~ "8 a 11 anos de estudo",
4:6 ~ "9 a 14 anos de estudo",
diff --git a/README.md b/README.md
index 4ca0b98..666e519 100644
--- a/README.md
+++ b/README.md
@@ -45,18 +45,6 @@ install.packages("devtools")
devtools::install_github("datazoompuc/datazoom.social")
```
- ## Warning: replacing previous import 'data.table::first' by 'dplyr::first' when
- ## loading 'datazoom.social'
-
- ## Warning: replacing previous import 'data.table::last' by 'dplyr::last' when
- ## loading 'datazoom.social'
-
- ## Warning: replacing previous import 'data.table::between' by 'dplyr::between'
- ## when loading 'datazoom.social'
-
- ## Warning: replacing previous import 'data.table::transpose' by
- ## 'purrr::transpose' when loading 'datazoom.social'
-
## Data
@@ -81,17 +69,43 @@ build a Panel.
------------------------------------------------------------------------
+**Panel Structure:**
+
+The table below shows the first and last quarter (`ANOtrimestre`, e.g.
+`20121` = 2012 Q1) covered by each PNADC rotating panel:
+
+| Panel | Start | End |
+|------:|------:|------:|
+| 1 | 20121 | 20124 |
+| 2 | 20121 | 20141 |
+| 3 | 20132 | 20152 |
+| 4 | 20143 | 20163 |
+| 5 | 20154 | 20174 |
+| 6 | 20171 | 20191 |
+| 7 | 20182 | 20202 |
+| 8 | 20193 | 20213 |
+| 9 | 20204 | 20224 |
+| 10 | 20221 | 20241 |
+| 11 | 20232 | 20252 |
+| 12 | 20243 | 20263 |
+| 13 | 20254 | 20274 |
+| 14 | 20271 | 20291 |
+
+------------------------------------------------------------------------
+
**Usage:**
Default
``` r
+
load_pnadc(
save_to = getwd(),
years,
quarters = 1:4,
panel = "advanced",
- raw_data = FALSE
+ raw_data = FALSE,
+ save_options = c(TRUE, TRUE)
)
```
@@ -128,6 +142,28 @@ load_pnadc(
)
```
+To download PNADC data, keep the quarterly files on disk, and save
+both quarterly and panel files as Parquet, run
+
+``` r
+load_pnadc(
+ save_to = "Directory/You/Would/like/to/save/the/files",
+ years = 2022,
+ save_options = c(TRUE, FALSE)
+)
+```
+
+To download PNADC data and save panels as CSV without writing the
+intermediate quarterly files to disk, run
+
+``` r
+load_pnadc(
+ save_to = "Directory/You/Would/like/to/save/the/files",
+ years = 2022,
+ save_options = c(FALSE, TRUE)
+)
+```
+
------------------------------------------------------------------------
**Options:**
@@ -147,6 +183,8 @@ load_pnadc(
- `none`: No panel is built. If `raw_data = TRUE`, returns the
original data. Otherwise, creates some extra treated variables.
+ The intermediate quarters parquet is always kept when
+ `panel = "none"`.
- `basic`: Performs basic identification steps for creating
households and individual identifiers for panel construction
- `advanced`: Performs advanced identification steps for creating
@@ -158,6 +196,18 @@ load_pnadc(
- `TRUE`: if you want the PNADC variables as they come.
- `FALSE`: if you want the treated version of the PNADC variables.
+6. **save_options**: A logical vector of length 2 controlling file
+ saving behaviour:
+
+ - `c(TRUE, TRUE)` (default): saves the quarterly files to disk;
+ quarterly and panel files are written as `.csv`.
+ - `c(FALSE, TRUE)`: does not save the quarterly files; panel
+ files are written as `.csv`.
+ - `c(TRUE, FALSE)`: saves the quarterly files to disk; quarterly
+ and panel files are written as `.parquet`.
+ - `c(FALSE, FALSE)`: does not save the quarterly files; panel
+ files are written as `.parquet`.
+
------------------------------------------------------------------------
**Details:**
@@ -165,18 +215,21 @@ load_pnadc(
The function performs the following steps:
1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to
- download the data and save in the `save_to` directory, in files
- named `pnadc_year_quarter.rds`. If the `raw_data` option is `FALSE`,
- some PNADC variables are treated at this stage.
+ download the data. If `save_options[1]` is `TRUE`, each quarter is
+ saved in `save_to` as `pnadc_year_quarter.csv` or `.parquet`;
+ otherwise it is kept in memory. If the `raw_data` option is
+ `FALSE`, some PNADC variables are treated at this stage.
-2. Split the data into panels, by reading each `.rds` file and
- filtering by the quarter variable `V1014`. Data from each panel `x`
- is saved to `pnad_panel_x.csv`. The use of `.csv` allows for data
- from each quarter to be appended on top of the previous ones, making
- the process faster.
+2. Split the data into panels by reading each quarter (from disk or
+ from memory) and filtering by the panel variable `V1014`. Each
+ panel `x` is saved to `pnadc_panel_x.csv` or
+ `pnadc_panel_x.parquet`, depending on `save_options[2]`.
3. Read each panel file and apply the identification algorithms defined
- in the `build_pnadc_panel`.
+ in `build_pnadc_panel`.
+
+4. If `save_options[1] = FALSE`, the quarterly data is never written
+ to disk; only the panel files are saved.
- The identification algorithms in `build_pnadc_panel` are drawn from
Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): “Sobre o
@@ -270,18 +323,18 @@ the advanced algorithm in each interview.
DataZoom is developed by a team at Pontifícia Universidade Católica do
Rio de Janeiro (PUC-Rio), Department of Economics. Our official website
-is at: .
+is at: .
To cite package `datazoom.social` in publications use:
> Data Zoom (2023). Data Zoom: Simplifying Access To Brazilian
> Microdata.
->
+>
A BibTeX entry for LaTeX users is:
- @Unpublished{DataZoom2023,
+ @Unpublished{DataZoom2024,
author = {Data Zoom},
title = {Data Zoom: Simplifying Access To Brazilian Microdata},
- url = {https://datazoom.com.br/en/},
- year = {2023}}
+ url = {https://www.econ.puc-rio.br/datazoom/english/index.html},
+ year = {2024}}
diff --git a/man/load_pnadc.Rd b/man/load_pnadc.Rd
index 6493141..ae91ce4 100644
--- a/man/load_pnadc.Rd
+++ b/man/load_pnadc.Rd
@@ -9,7 +9,8 @@ load_pnadc(
years,
quarters = 1:4,
panel = "advanced",
- raw_data = FALSE
+ raw_data = FALSE,
+ save_options = c(TRUE, TRUE)
)
}
\arguments{
@@ -22,6 +23,20 @@ load_pnadc(
\item{panel}{A \code{character} choosing the panel algorithm to apply ("none", "basic", or "advanced"). For details, check \code{vignette("BUILD_PNADC_PANEL")}}
\item{raw_data}{A \code{logical} setting the return of raw (\code{TRUE}) or processed (\code{FALSE}) variables.}
+
+\item{save_options}{A \code{logical} vector of length 2. Controls whether quarterly
+files are saved and in which format all files are saved. Panel files are
+always saved. There are four possible combinations:
+\itemize{
+\item \code{c(TRUE, TRUE)}: saves quarterly and panel files in
+\code{.csv} format. This is the default.
+\item \code{c(TRUE, FALSE)}: saves quarterly and panel files in
+\code{.parquet} format.
+\item \code{c(FALSE, TRUE)}: does not save quarterly files; panel files
+are saved in \code{.csv} format.
+\item \code{c(FALSE, FALSE)}: does not save quarterly files; panel files
+are saved in \code{.parquet} format.
+}}
}
\value{
A message indicating the successful save of panel files.
@@ -30,13 +45,15 @@ A message indicating the successful save of panel files.
This function downloads PNADC data and applies panel identification algorithms
}
\examples{
-\dontrun{
+\dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+
load_pnadc(
save_to = "Directory/You/Would/like/to/save/the/files",
years = 2016,
quarters = 1:4,
panel = "basic",
- raw_data = FALSE
+ raw_data = FALSE,
+ save_options = c(FALSE, FALSE)
)
-}
+\dontshow{\}) # examplesIf}
}
diff --git a/vignettes/LOAD_PNADC.Rmd b/vignettes/LOAD_PNADC.Rmd
index 8e1ed2b..ccb47ce 100644
--- a/vignettes/LOAD_PNADC.Rmd
+++ b/vignettes/LOAD_PNADC.Rmd
@@ -17,19 +17,45 @@ knitr::opts_chunk$set(
The `load_pnadc` function is a wrapper for [*`get_pnadc`*](https://www.rdocumentation.org/packages/PNADcIBGE/versions/0.7.0/topics/get_pnadc) from the package `PNADcIBGE`, with added identification algorithms to build a Panel.
+***
+**Panel Structure:**
+
+The table below shows the first and last quarter (`ANOtrimestre`, e.g.
+`20121` = 2012 Q1) covered by each PNADC rotating panel:
+
+| Panel | Start | End |
+|------:|------:|------:|
+| 1 | 20121 | 20124 |
+| 2 | 20121 | 20141 |
+| 3 | 20132 | 20152 |
+| 4 | 20143 | 20163 |
+| 5 | 20154 | 20174 |
+| 6 | 20171 | 20191 |
+| 7 | 20182 | 20202 |
+| 8 | 20193 | 20213 |
+| 9 | 20204 | 20224 |
+| 10 | 20221 | 20241 |
+| 11 | 20232 | 20252 |
+| 12 | 20243 | 20263 |
+| 13 | 20254 | 20274 |
+| 14 | 20271 | 20291 |
+
***
**Usage:**
Default
```{r eval=FALSE}
+
load_pnadc(
save_to = getwd(),
years,
quarters = 1:4,
panel = "advanced",
- raw_data = FALSE
+ raw_data = FALSE,
+ save_options = c(TRUE, TRUE)
)
+
```
To download PNADC data for all quarters of 2022 and 2023, with advanced identification, simply run
@@ -62,6 +88,27 @@ load_pnadc(
)
```
+To download PNADC data, keep the quarterly files on disk, and save both quarterly and panel files as Parquet, run
+
+```{r eval=FALSE}
+load_pnadc(
+ save_to = "Directory/You/Would/like/to/save/the/files",
+ years = 2022,
+ save_options = c(TRUE, FALSE)
+)
+```
+
+To download PNADC data and save panels as CSV without writing the intermediate quarterly files to disk, run
+
+```{r eval=FALSE}
+load_pnadc(
+ save_to = "Directory/You/Would/like/to/save/the/files",
+ years = 2022,
+ save_options = c(FALSE, TRUE)
+)
+```
+
+
***
**Options:**
@@ -76,7 +123,7 @@ load_pnadc(
4. **panel**: Which panel algorithm to apply to this data. There are three options:
- * `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables.
+ * `none`: No panel is built. If `raw_data = TRUE`, returns the original data. Otherwise, creates some extra treated variables. The intermediate quarters parquet is always kept when `panel = "none"`.
* `basic`: Performs basic identification steps for creating households and individual identifiers for panel construction
* `advanced`: Performs advanced identification steps for creating households and individual identifiers for panel construction.
@@ -84,21 +131,28 @@ load_pnadc(
5. **raw_data**: A command to define if the user would like to download the raw or treated data. There are two options:
* `TRUE`: if you want the PNADC variables as they come.
* `FALSE`: if you want the treated version of the PNADC variables.
+
+ 6. **save_options**: A logical vector of length 2 controlling file saving behaviour:
+ * `c(TRUE, TRUE)` (default): saves the quarterly files to disk; quarterly and panel files are written as `.csv`.
+ * `c(FALSE, TRUE)`: does not save the quarterly files; panel files are written as `.csv`.
+ * `c(TRUE, FALSE)`: saves the quarterly files to disk; quarterly and panel files are written as `.parquet`.
+ * `c(FALSE, FALSE)`: does not save the quarterly files; panel files are written as `.parquet`.
***
**Details:**
The function performs the following steps:
- 1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to download the data and save in the `save_to` directory, in files named `pnadc_year_quarter.rds`. If the `raw_data` option is `FALSE`, some PNADC variables are treated at this stage.
+
+ 1. Loop over years and quarters using `PNADcIBGE::get_pnadc` to download the data. If `save_options[1]` is `TRUE`, each quarter is saved in the `save_to` directory as `pnadc_year_quarter.csv` or `pnadc_year_quarter.parquet`, depending on `save_options[2]`; otherwise it is kept in memory. If the `raw_data` option is `FALSE`, some PNADC variables are treated at this stage.
- 2. Split the data into panels, by reading each `.rds` file and filtering by the quarter variable `V1014`. Data from each panel `x` is saved to `pnad_panel_x.csv`. The use of `.csv` allows for data from each quarter to be appended on top of the previous ones, making the process faster.
+ 2. Split the data into panels by reading each quarter (from disk or from memory) and filtering by the panel variable `V1014`. Data from each panel `x` is saved to `pnadc_panel_x.csv` or `pnadc_panel_x.parquet`, depending on `save_options[2]`.
- 3. Read each panel file and apply the identification algorithms defined in the `build_pnadc_panel`.
+ 3. Read each panel file and apply the identification algorithms defined in `build_pnadc_panel`.
+
+ 4. If `save_options[1] = FALSE`, the quarterly data is never written to disk; only the panel files are saved.
+
* The identification algorithms in `build_pnadc_panel` are drawn from Ribas, Rafael Perez, and Sergei Suarez Dillon Soares (2008): "Sobre o painel da Pesquisa Mensal de Emprego (PME) do IBGE".
-***
-
-
-
+***
\ No newline at end of file