diff --git a/DESCRIPTION b/DESCRIPTION index 875eb3d..21045fb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,15 +1,27 @@ Package: TeachingSampling Type: Package Title: Selection of Samples and Parameter Estimation in Finite Population +Version: 4.2.0 +Date: 2026-06-15 +Authors@R: c( + person("Hugo Andres", "Gutierrez Rojas", + email = "hagutierrezro@gmail.com", + role = c("aut", "cre")), + person("Yury Vanessa", "Ochoa Montes", + email = "yury.ochoa@urosario.edu.co", + role = "ctb", + comment = "kish_allocation function")) +Description: Allows the user to draw probabilistic samples and make + inferences from a finite population based on several sampling designs, + including simple random, systematic, Bernoulli, Poisson, PPS, + stratified, and cluster sampling. Provides Horvitz-Thompson, + Hansen-Hurwitz, and generalised regression (GREG) estimators of + totals, means, ratios, regression coefficients, and quantiles, + along with exact and approximate variance estimators. License: GPL (>= 2) -Version: 4.1.1 -Date: 2020-04-21 -Author: Hugo Andres Gutierrez Rojas -Maintainer: Hugo Andres Gutierrez Rojas -Depends: - R (>= 3.5), - dplyr, - magrittr -Description: Allows the user to draw probabilistic samples and make inferences from a finite population based on several sampling designs. 
+Depends: R (>= 3.5), dplyr, magrittr Encoding: UTF-8 -RoxygenNote: 7.1.0 +NeedsCompilation: no +URL: https://github.com/psirusteam/TeachingSampling +BugReports: https://github.com/psirusteam/TeachingSampling/issues +Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index adb0255..dd71e7d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,6 +48,7 @@ export(T.SIC) export(VarHT) export(VarSYGHT) export(Wk) +export(kish_allocation) export(nk) export(p.WR) import(stats) diff --git a/R.zip b/R.zip new file mode 100644 index 0000000..dac36ab Binary files /dev/null and b/R.zip differ diff --git a/R/Deltakl.r b/R/Deltakl.r index a98cbc9..ed67e8f 100644 --- a/R/Deltakl.r +++ b/R/Deltakl.r @@ -1,8 +1,49 @@ #' @export +#' +#' @title +#' Matrix of Joint Inclusion Probability Differences +#' @description +#' Computes the matrix \eqn{\Delta_{kl} = \pi_{kl} - \pi_k \pi_l} for all +#' pairs of units in a finite population. This matrix appears in the exact +#' Horvitz-Thompson variance formula. +#' @return +#' An \code{N x N} matrix where entry \eqn{(k, l)} equals +#' \eqn{\pi_{kl} - \pi_k \pi_l}. Diagonal entries equal +#' \eqn{\pi_k(1 - \pi_k)}. +#' @details +#' The matrix \eqn{\Delta} is central to the Horvitz-Thompson variance +#' estimator: +#' \deqn{V(\hat{t}_{y,\pi}) = \sum_k \sum_l \Delta_{kl} \frac{y_k}{\pi_k} +#' \frac{y_l}{\pi_l}} +#' It requires computing both first-order (\code{\link{Pik}}) and +#' second-order (\code{\link{Pikl}}) inclusion probabilities, so it is only +#' feasible for small populations. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. Recommended \code{N <= 15}. +#' @param n Sample size. +#' @param p Vector of probabilities for each possible sample in the support. +#' Must sum to 1. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. 
(2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{Pik}}, \code{\link{Pikl}}, \code{\link{VarHT}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' n <- 2 +#' p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) +#' sum(p) +#' # Variance-Covariance matrix of the sample membership indicators +#' Deltakl(N, n, p) -Deltakl <- function(N, n, p){ -Ind <- Ik(N,n) -P1 <- as.matrix(Pik(p, Ind)) -Delta <-Pikl(N,n,p)-(t(P1)%*%P1) -return(Delta) -} +Deltakl <- function(N, n, p) { + Ind <- Ik(N, n) + P1 <- as.matrix(Pik(p, Ind)) + Delta <- Pikl(N, n, p) - (t(P1) %*% P1) + return(Delta) +} \ No newline at end of file diff --git a/R/Domains.r b/R/Domains.r index 7d1686d..d06f853 100644 --- a/R/Domains.r +++ b/R/Domains.r @@ -1,12 +1,84 @@ #' @export +#' +#' @title +#' Domain Indicator Matrix +#' @description +#' Creates a binary indicator matrix that identifies the domain membership +#' of each unit in the sample. Each column corresponds to one domain +#' (level of \code{y}) and each row to one unit. +#' @return +#' A binary matrix of dimension \code{n x D}, where \code{D} is the number +#' of domains (levels of \code{y}). Entry \eqn{(k, d) = 1} if unit \eqn{k} +#' belongs to domain \eqn{d}, and 0 otherwise. Column names are the domain +#' labels. +#' @details +#' This function is useful for domain estimation, where population totals or +#' means must be estimated for subgroups of the population. The indicator +#' matrix can be multiplied element-wise with the variable of interest to +#' restrict estimation to each domain. +#' @author Hugo Andres Gutierrez Rojas +#' @param y A vector (factor or coercible to factor) identifying the domain +#' membership of each unit in the sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. 
Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{E.SI}}, \code{\link{E.STSI}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # This domain contains only two categories: "yes" and "no" +#' x <- as.factor(c("yes","yes","yes","no","no","no","no","yes","yes")) +#' Domains(x) +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a random sample of units according +#' # to a SI design +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- sample(N,n) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' # The variable SPAM is a domain of interest +#' Doma <- Domains(SPAM) +#' Doma +#' # HT estimation of the absolute domain size for every category in the domain +#' # of interest +#' E.SI(N,n,Doma) +#' +#' ############ +#' ## Example 3 +#' ############ +#' # Following with Example 2... 
+#' # The variables of interest are: Income, Employees and Taxes +#' # This function allows estimating the population totals of these variables for every +#' # category in the domain of interest SPAM +#' estima <- data.frame(Income, Employees, Taxes) +#' SPAM.no <- estima*Doma[,1] +#' SPAM.yes <- estima*Doma[,2] +#' E.SI(N,n,SPAM.no) +#' E.SI(N,n,SPAM.yes) -Domains<-function(y){ -y<-as.factor(y) -d<-as.double(y) -n<-length(d) -Dom<-matrix(0,n,max(d)) -colnames(Dom)<-levels(y) -for(k in 1: max(d)){ -Dom[,k]<-as.double(d==k)} -Dom +Domains <- function(y) { + y <- as.factor(y) + d <- as.double(y) + n <- length(d) + Dom <- matrix(0, n, max(d)) + colnames(Dom) <- levels(y) + for (k in 1:max(d)) { + Dom[, k] <- as.double(d == k) + } + Dom } \ No newline at end of file diff --git a/R/E.1SI.R b/R/E.1SI.R index 28a7d9e..6cb483a 100644 --- a/R/E.1SI.R +++ b/R/E.1SI.R @@ -62,7 +62,7 @@ E.1SI <- function(NI, nI, y, PSU) { Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) rownames(Total) = c("Estimation", "Standard Error", "CVE", - "DEFF") + "DEFF") colnames(Total) <- names(y) fI <- nI/NI @@ -79,6 +79,4 @@ E.1SI <- function(NI, nI, y, PSU) { Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } return(Total) -} - - +} \ No newline at end of file diff --git a/R/E.2SI.r b/R/E.2SI.r index 141fe7c..c44b522 100644 --- a/R/E.2SI.r +++ b/R/E.2SI.r @@ -1,32 +1,185 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Two Stage Simple Random Sampling +#' @description +#' Computes the Horvitz-Thompson estimator of the population total under a +#' two-stage simple random sampling without replacement design, where both +#' Primary Sampling Units (PSUs) and Secondary Sampling Units (SSUs) are +#' selected by simple random sampling without replacement. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error of the total. 
+#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect with respect to simple random sampling. +#' } +#' @details +#' The variance estimator decomposes into two components: the between-PSU +#' component and the within-PSU component, following the classical two-stage +#' variance decomposition of Sarndal et al. (1992). +#' @author Hugo Andres Gutierrez Rojas +#' @param NI Population size of Primary Sampling Units (PSUs). +#' @param nI Sample size of Primary Sampling Units (PSUs). +#' @param Ni Vector of population sizes of Secondary Sampling Units within +#' each selected PSU. +#' @param ni Vector of sample sizes of Secondary Sampling Units within +#' each selected PSU. +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every unit in the selected sample. +#' @param PSU Vector identifying the PSU membership of each unit in the sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{E.1SI}}, \code{\link{E.UC}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Uses Lucy data to draw a two-stage simple random sample +#' # according to a 2SI design. Zone is the clustering variable +#' data(Lucy) +#' attach(Lucy) +#' summary(Zone) +#' # The population of clusters or Primary Sampling Units +#' UI<-c("A","B","C","D","E") +#' NI <- length(UI) +#' # The sample size is nI=3 +#' nI <- 3 +#' # Selects the sample of PSUs +#' samI<-S.SI(NI,nI) +#' dataI<-UI[samI] +#' dataI +#' # The sampling frame of Secondary Sampling Unit is saved in Lucy1 ... 
Lucy3 +#' Lucy1<-Lucy[which(Zone==dataI[1]),] +#' Lucy2<-Lucy[which(Zone==dataI[2]),] +#' Lucy3<-Lucy[which(Zone==dataI[3]),] +#' # The size of every single PSU +#' N1<-dim(Lucy1)[1] +#' N2<-dim(Lucy2)[1] +#' N3<-dim(Lucy3)[1] +#' Ni<-c(N1,N2,N3) +#' # The sample size in every PSU is 135 Secondary Sampling Units +#' n1<-135 +#' n2<-135 +#' n3<-135 +#' ni<-c(n1,n2,n3) +#' # Selects a sample of Secondary Sampling Units inside the PSUs +#' sam1<-S.SI(N1,n1) +#' sam2<-S.SI(N2,n2) +#' sam3<-S.SI(N3,n3) +#' # The information about each Secondary Sampling Unit in the PSUs +#' # is saved in data1 ... data3 +#' data1<-Lucy1[sam1,] +#' data2<-Lucy2[sam2,] +#' data3<-Lucy3[sam3,] +#' # The information about each unit in the final selected sample is saved in data +#' data<-rbind(data1, data2, data3) +#' attach(data) +#' # The clustering variable is Zone +#' Cluster <- as.factor(as.integer(Zone)) +#' # The variables of interest are: Income, Employees and Taxes +#' # This information is stored in a data frame called estima +#' estima <- data.frame(Income, Employees, Taxes) +#' # Estimation of the Population total +#' E.2SI(NI,nI,Ni,ni,estima,Cluster) +#' +#' ######################################################## +#' ## Example 2 Total Census to the entire population +#' ######################################################## +#' # Uses Lucy data to draw a cluster random sample +#' # according to a SI design ... +#' # Zone is the clustering variable +#' data(Lucy) +#' attach(Lucy) +#' summary(Zone) +#' # The population of clusters +#' UI<-c("A","B","C","D","E") +#' NI <- length(UI) +#' # The sample size equals the population size of PSUs +#' nI <- NI +#' # Selects every single PSU +#' samI<-S.SI(NI,nI) +#' dataI<-UI[samI] +#' dataI +#' # The sampling frame of Secondary Sampling Unit is saved in Lucy1 ... 
Lucy5 +#' Lucy1<-Lucy[which(Zone==dataI[1]),] +#' Lucy2<-Lucy[which(Zone==dataI[2]),] +#' Lucy3<-Lucy[which(Zone==dataI[3]),] +#' Lucy4<-Lucy[which(Zone==dataI[4]),] +#' Lucy5<-Lucy[which(Zone==dataI[5]),] +#' # The size of every single PSU +#' N1<-dim(Lucy1)[1] +#' N2<-dim(Lucy2)[1] +#' N3<-dim(Lucy3)[1] +#' N4<-dim(Lucy4)[1] +#' N5<-dim(Lucy5)[1] +#' Ni<-c(N1,N2,N3,N4,N5) +#' # The sample size of Secondary Sampling Units equals the size of each PSU +#' n1<-N1 +#' n2<-N2 +#' n3<-N3 +#' n4<-N4 +#' n5<-N5 +#' ni<-c(n1,n2,n3,n4,n5) +#' # Selects every single Secondary Sampling Unit inside the PSU +#' sam1<-S.SI(N1,n1) +#' sam2<-S.SI(N2,n2) +#' sam3<-S.SI(N3,n3) +#' sam4<-S.SI(N4,n4) +#' sam5<-S.SI(N5,n5) +#' # The information about each unit in the cluster is saved in data1 ... data5 +#' data1<-Lucy1[sam1,] +#' data2<-Lucy2[sam2,] +#' data3<-Lucy3[sam3,] +#' data4<-Lucy4[sam4,] +#' data5<-Lucy5[sam5,] +#' # The information about each Secondary Sampling Unit +#' # in the sample (census) is saved in data +#' data<-rbind(data1, data2, data3, data4, data5) +#' attach(data) +#' # The clustering variable is Zone +#' Cluster <- as.factor(as.integer(Zone)) +#' # The variables of interest are: Income, Employees and Taxes +#' # This information is stored in a data frame called estima +#' estima <- data.frame(Income, Employees, Taxes) +#' # Estimation of the Population total +#' E.2SI(NI,nI,Ni,ni,estima,Cluster) +#' # Sampling error is null -E.2SI<-function(NI,nI,Ni,ni,y,PSU){ - y<-cbind(1,y) - y<-as.data.frame(y) +E.2SI <- function(NI, nI, Ni, ni, y, PSU) { + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - PSU<-as.factor(PSU) + PSU <- as.factor(PSU) - Total<-matrix(NA,nrow=4,ncol=dim(y)[2]) - rownames(Total)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Total)<-names(y) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) - f<-ni/Ni - F<-nI/NI + f <- 
ni/Ni + F <- nI/NI - for(k in 1:dim(y)[2]){ - ysum<- tapply(y[,k],PSU,sum) - s2i <- tapply(y[,k],PSU,var) - ti <- (1/f)*ysum - ty <- (1/F)*sum(ti) - part.1 <- NI^2/nI*(1-F)*var(ti) - part.2 <- NI/nI*sum(Ni^2/ni*(1-f)*s2i) - Vty <- part.1+part.2 - CVe<-100*sqrt(Vty)/ty - n<-length(y[,k]) - N<-(NI/nI)*sum(Ni) - VMAS<-(N^2)*(1-(n/N))*var(y[,k])/(n) - DEFF<-Vty/VMAS - Total[,k]<-c(ty,sqrt(Vty),CVe,DEFF) + for (k in 1:dim(y)[2]) { + ysum <- tapply(y[, k], PSU, sum) + s2i <- tapply(y[, k], PSU, var) + ti <- (1/f) * ysum + ty <- (1/F) * sum(ti) + part.1 <- NI^2/nI * (1 - F) * var(ti) + part.2 <- NI/nI * sum(Ni^2/ni * (1 - f) * s2i) + Vty <- part.1 + part.2 + CVe <- 100 * sqrt(Vty)/ty + n <- length(y[, k]) + N <- (NI/nI) * sum(Ni) + VMAS <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } return(Total) -} +} \ No newline at end of file diff --git a/R/E.BE.r b/R/E.BE.r index 7dc6cbf..171e134 100644 --- a/R/E.BE.r +++ b/R/E.BE.r @@ -1,22 +1,64 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Bernoulli Sampling +#' @description +#' Computes the Horvitz-Thompson estimator of the population total under a +#' Bernoulli sampling design, where each unit in the population is independently +#' selected with the same inclusion probability. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error of the total. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect with respect to simple random sampling. +#' } +#' @details +#' Under Bernoulli sampling, the sample size is random. The inclusion +#' probability is constant and equal to \code{prob} for all units. The +#' variance estimator accounts for the randomness of the sample size. 
+#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every unit in the selected sample. +#' @param prob Scalar. The (constant) inclusion probability used in the +#' Bernoulli sampling design. Must satisfy \code{0 < prob <= 1}. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{S.BE}}, \code{\link{E.SI}} +#' +#' @examples +#' data('Lucy') +#' attach(Lucy) +#' N <- nrow(Lucy) +#' prob <- 0.1 +#' sam <- S.BE(N, prob) +#' sam <- sam[sam != 0] +#' y <- data.frame(Income = Income[sam], Employees = Employees[sam]) +#' E.BE(y, prob) -E.BE<-function(y,prob){ - y<-cbind(1,y) - y<-as.data.frame(y) +E.BE <- function(y, prob) { + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - Total<-matrix(NA,nrow=4,ncol=dim(y)[2]) - rownames(Total)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Total)<-names(y) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) - for(k in 1:dim(y)[2]){ - ty<-sum(y[,k])/prob - Vty<-(1/prob)*((1/prob)-1)*sum(y[,k]^2) - CVe<-100*sqrt(Vty)/ty - n<-length(y[,k]) - N<-n/prob - VMAS<-(N^2)*(1-(n/N))*var(y[,k])/(n) - DEFF<-Vty/VMAS - Total[,k]<-c(ty,sqrt(Vty),CVe,DEFF) + for (k in 1:dim(y)[2]) { + ty <- sum(y[, k])/prob + Vty <- (1/prob) * ((1/prob) - 1) * sum(y[, k]^2) + CVe <- 100 * sqrt(Vty)/ty + n <- length(y[, k]) + N <- n/prob + VMAS <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } return(Total) } \ No newline at end of file diff --git a/R/E.Beta.r b/R/E.Beta.r index e717f45..9cc862c 100644 --- a/R/E.Beta.r +++ b/R/E.Beta.r @@ -1,49 +1,167 
@@ #' @export +#' +#' @title +#' Estimation of Regression Coefficients under Simple Random Sampling +#' @description +#' Computes the weighted least squares estimator of regression coefficients +#' for a finite population under simple random sampling without replacement. +#' Both the estimated coefficients and their estimated standard errors are +#' returned. +#' @return +#' A three-dimensional array with dimensions \code{[3, P, Q]}, where +#' \code{P} is the number of auxiliary variables and \code{Q} is the number +#' of variables of interest. The three rows correspond to: +#' \itemize{ +#' \item \code{Beta estimation}: Estimated regression coefficient. +#' \item \code{Standard Error}: Estimated standard error. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' } +#' @details +#' The estimator uses a working model with weights \eqn{V = 1/(\pi_k c_k)}, +#' where \eqn{\pi_k = n/N} under simple random sampling and \eqn{c_k} is an +#' optional variance-stabilising constant. The variance is estimated using +#' the residual-based sandwich approach of Sarndal et al. (1992). +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param n Sample size. +#' @param y Vector, matrix or data frame of variables of interest (response). +#' @param x Vector, matrix or data frame of auxiliary variables (predictors). +#' @param ck Optional variance-stabilising constant. Default is \code{1} +#' (homoscedastic model). +#' @param b0 Logical. If \code{TRUE}, an intercept column of ones is +#' prepended to \code{x}. Default is \code{FALSE}. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{GREG.SI}}, \code{\link{E.SI}} +#' +#' @examples +#' ###################################################################### +#' ## Example 1: Linear models involving continuous auxiliary information +#' ###################################################################### +#' +#' # Draws a simple random sample without replacement +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- S.SI(N, n) +#' # The information about the units in the sample +#' # is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' +#' ########### common mean model +#' +#' estima<-data.frame(Income, Employees, Taxes) +#' x <- rep(1,n) +#' E.Beta(N, n, estima,x,ck=1,b0=FALSE) +#' +#' +#' ########### common ratio model +#' +#' estima<-data.frame(Income) +#' x <- data.frame(Employees) +#' E.Beta(N, n, estima,x,ck=x,b0=FALSE) +#' +#' ########### Simple regression model without intercept +#' +#' estima<-data.frame(Income, Employees) +#' x <- data.frame(Taxes) +#' E.Beta(N, n, estima,x,ck=1,b0=FALSE) +#' +#' ########### Multiple regression model without intercept +#' +#' estima<-data.frame(Income) +#' x <- data.frame(Employees, Taxes) +#' E.Beta(N, n, estima,x,ck=1,b0=FALSE) +#' +#' ########### Simple regression model with intercept +#' +#' estima<-data.frame(Income, Employees) +#' x <- data.frame(Taxes) +#' E.Beta(N, n, estima,x,ck=1,b0=TRUE) +#' +#' ########### Multiple regression model with intercept +#' +#' estima<-data.frame(Income) +#' x <- data.frame(Employees, Taxes) +#' E.Beta(N, n, estima,x,ck=1,b0=TRUE) +#' +#' ############################################################### +#' ## Example 2: Linear models with discrete auxiliary information +#' ############################################################### +#' +#' # Draws a simple random sample without replacement +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- S.SI(N,n) +#' # The information about the 
sample units is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' # The auxiliary information +#' Doma<-Domains(Level) +#' +#' ########### Poststratified common mean model +#' +#' estima<-data.frame(Income, Employees, Taxes) +#' E.Beta(N, n, estima,Doma,ck=1,b0=FALSE) +#' +#' ########### Poststratified common ratio model +#' +#' estima<-data.frame(Income, Employees) +#' x<-Doma*Taxes +#' E.Beta(N, n, estima,x,ck=1,b0=FALSE) -E.Beta<-function(N, n, y, x, ck=1, b0=FALSE){ +E.Beta <- function(N, n, y, x, ck = 1, b0 = FALSE) { if (b0 == TRUE) { - x<-as.data.frame(cbind(1,x)) + x <- as.data.frame(cbind(1, x)) } - #--------------- Q <- dim(as.matrix(y))[2] P <- dim(as.matrix(x))[2] - #--------------- - Total<-array(NA,c(3,P,Q)) - rownames(Total)=c("Beta estimation", "Standard Error","CVE") - colnames(Total)<-names(x) - dimnames(Total)[[3]]<-names(y) - #--------------- - Pik <- rep(n/N,n) - for(q in 1:Q){ - yq<-as.matrix(y[,q]) - x<-as.matrix(x) - ck<-as.numeric(unlist(ck)) - V<-1/(Pik*ck) - bq<-solve(t(V*x)%*%x)%*%(t(V*x)%*%yq) - ek <- yq - x%*%bq - uk <- c(ek)*x - Varuk <- (N^2/n)*(1-(n/N))*var(uk) - P1 <- solve(t(V*x)%*%x) - Vbeta <- as.matrix(P1)%*%as.matrix(Varuk)%*%as.matrix(P1) + Total <- array(NA, c(3, P, Q)) + rownames(Total) <- c("Beta estimation", "Standard Error", "CVE") + colnames(Total) <- names(x) + dimnames(Total)[[3]] <- names(y) + Pik <- rep(n/N, n) + for (q in 1:Q) { + yq <- as.matrix(y[, q]) + x <- as.matrix(x) + ck <- as.numeric(unlist(ck)) + V <- 1/(Pik * ck) + bq <- solve(t(V * x) %*% x) %*% (t(V * x) %*% yq) + ek <- yq - x %*% bq + uk <- c(ek) * x + Varuk <- (N^2/n) * (1 - (n/N)) * var(uk) + P1 <- solve(t(V * x) %*% x) + Vbeta <- as.matrix(P1) %*% as.matrix(Varuk) %*% as.matrix(P1) Vbeta <- diag(Vbeta) - CVe <-100*sqrt(Vbeta)/bq - #--------------- - if(Q == 1){ - Total[1,,]<-bq - Total[2,,]<-sqrt(Vbeta) - Total[3,,]<-CVe + CVe <- 100 * sqrt(Vbeta)/bq + if (Q == 1) { + Total[1, , ] <- bq + Total[2, , ] <- 
sqrt(Vbeta) + Total[3, , ] <- CVe } - if(P == 1 & Q > 1){ - Total[1,,][q]<-bq - Total[2,,][q]<-sqrt(Vbeta) - Total[3,,][q]<-CVe + if (P == 1 & Q > 1) { + Total[1, , ][q] <- bq + Total[2, , ][q] <- sqrt(Vbeta) + Total[3, , ][q] <- CVe } - if(Q > 1 & P > 1){ - Total[1,,][,q]<-bq - Total[2,,][,q]<-sqrt(Vbeta) - Total[3,,][,q]<-CVe + if (Q > 1 & P > 1) { + Total[1, , ][, q] <- bq + Total[2, , ][, q] <- sqrt(Vbeta) + Total[3, , ][, q] <- CVe } - #--------------- } return(Total) } \ No newline at end of file diff --git a/R/E.PO.r b/R/E.PO.r index 8352335..b684a10 100644 --- a/R/E.PO.r +++ b/R/E.PO.r @@ -1,21 +1,64 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Poisson Sampling +#' @description +#' Computes the Horvitz-Thompson estimator of the population total under a +#' Poisson sampling design, where each unit is independently selected with +#' its own inclusion probability. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error of the total. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect with respect to simple random sampling. +#' } +#' @details +#' Under Poisson sampling, units are selected independently, so the variance +#' of the Horvitz-Thompson estimator has a simple unbiased estimator: +#' \eqn{\hat{V}(\hat{t}) = \sum_{k \in s} (1 - \pi_k)(y_k/\pi_k)^2}. +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every unit in the selected sample. +#' @param Pik Vector of first-order inclusion probabilities for each unit +#' in the sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. 
(2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{S.PO}}, \code{\link{E.piPS}} +#' +#' @examples +#' data('Lucy') +#' attach(Lucy) +#' N <- nrow(Lucy) +#' n <- 400 +#' Pik <- PikPPS(n, Employees) +#' sam <- S.PO(N, Pik) +#' sam <- sam[sam != 0] +#' y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +#' E.PO(y, Pik[sam]) -E.PO<-function(y,Pik){ - y<-cbind(1,y) - y<-as.data.frame(y) +E.PO <- function(y, Pik) { + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - Total<-matrix(NA,nrow=4,ncol=dim(y)[2]) - rownames(Total)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Total)<-names(y) - n<-length(Pik) - for(k in 1:dim(y)[2]){ - ty<-sum(y[,k]/Pik) - Vty<-sum((1-Pik)*((y[,k]/Pik)^2)) - CVe<-100*sqrt(Vty)/ty - N<-sum(1/Pik) - VMAS<-(N^2)*(1-(n/N))*var(y[,k])/(n) - DEFF<-Vty/VMAS - Total[,k]<-c(ty,sqrt(Vty),CVe,DEFF) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) + n <- length(Pik) + for (k in 1:dim(y)[2]) { + ty <- sum(y[, k]/Pik) + Vty <- sum((1 - Pik) * ((y[, k]/Pik)^2)) + CVe <- 100 * sqrt(Vty)/ty + N <- sum(1/Pik) + VMAS <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } return(Total) } \ No newline at end of file diff --git a/R/E.PPS.r b/R/E.PPS.r index 5403315..43d1d6e 100644 --- a/R/E.PPS.r +++ b/R/E.PPS.r @@ -1,21 +1,61 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under PPS With-Replacement Sampling +#' @description +#' Computes the Hansen-Hurwitz estimator of the population total under a +#' probability proportional to size with-replacement (PPS-WR) sampling design. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. 
+#' \item \code{Standard Error}: Estimated standard error of the total. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect with respect to simple random sampling. +#' } +#' @details +#' The Hansen-Hurwitz estimator is \eqn{\hat{t} = (1/m)\sum_{i=1}^m y_i/p_i}, +#' where \eqn{p_i} is the selection probability of the \eqn{i}-th draw and +#' \eqn{m} is the number of draws. +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every selected unit (with possible repetitions). +#' @param pk Vector of selection probabilities for each draw in the sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{S.PPS}}, \code{\link{HH}}, \code{\link{E.piPS}} +#' +#' @examples +#' data('Lucy') +#' attach(Lucy) +#' m <- 400 +#' res <- S.PPS(m, Employees) +#' sam <- res[, 1] +#' pk <- res[, 2] +#' y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +#' E.PPS(y, pk) -E.PPS<-function(y,pk){ - y<-cbind(1,y) - y<-as.data.frame(y) +E.PPS <- function(y, pk) { + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - Total<-matrix(NA,nrow=4,ncol=dim(y)[2]) - rownames(Total)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Total)<-names(y) - m<-length(pk) - for(k in 1:dim(y)[2]){ - ty<-sum(y[,k]/pk)/m - Vty<-(1/m)*(1/(m-1))*sum((y[,k]/pk-ty)^2) - CVe<-100*sqrt(Vty)/ty - N<-(1/m)*sum(1/pk) - VMAS<-(N^2)*(1-(m/N))*var(y[,k])/(m) - DEFF<-Vty/VMAS - Total[,k]<-c(ty,sqrt(Vty),CVe,DEFF) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) + m <- length(pk) + for (k in 
1:dim(y)[2]) { + ty <- sum(y[, k]/pk)/m + Vty <- (1/m) * (1/(m - 1)) * sum((y[, k]/pk - ty)^2) + CVe <- 100 * sqrt(Vty)/ty + N <- (1/m) * sum(1/pk) + VMAS <- (N^2) * (1 - (m/N)) * var(y[, k])/(m) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } return(Total) } \ No newline at end of file diff --git a/R/E.Quantile.r b/R/E.Quantile.r index 4b482ad..48c580b 100644 --- a/R/E.Quantile.r +++ b/R/E.Quantile.r @@ -1,41 +1,92 @@ #' @export +#' +#' @title +#' Estimation of Population Quantiles +#' @description +#' Computes a weighted quantile estimator for finite populations. When +#' inclusion probabilities are provided, the estimator uses the +#' Horvitz-Thompson weights \eqn{d_k = 1/\pi_k}; otherwise, equal weights +#' are assumed (simple random sampling). +#' @return +#' A numeric vector of length equal to the number of variables in \code{y}, +#' containing the estimated quantile for each variable. +#' @details +#' The estimator is based on the weighted empirical cumulative distribution +#' function. For each variable, units are sorted by their observed value, +#' cumulative weights are computed, and the quantile is located by +#' interpolation. +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every unit in the selected sample. +#' @param Qn Scalar in \eqn{(0, 1)}. The desired quantile level +#' (e.g. \code{0.5} for the median, \code{0.25} for the first quartile). +#' @param Pik Optional vector of first-order inclusion probabilities. If +#' omitted, equal probabilities are assumed. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{E.SI}}, \code{\link{E.piPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' y <- c(32, 34, 46, 89, 35) +#' x <- c(52, 60, 75, 100, 50) +#' z <- cbind(y, x) +#' Pik <- c(0.58, 0.34, 0.48, 0.33, 0.27) +#' E.Quantile(y, 0.5) +#' E.Quantile(x, 0.25) +#' E.Quantile(z, 0.75) +#' E.Quantile(z, 0.5, Pik) +#' ############ +#' ## Example 2 +#' ############ +#' data(Lucy) +#' attach(Lucy) +#' m <- 400 +#' res <- S.PPS(m, Income) +#' sam <- res[, 1] +#' pk.s <- res[, 2] +#' Pik.s <- 1 - (1 - pk.s)^m +#' data <- Lucy[sam, ] +#' attach(data) +#' estima <- data.frame(Income, Employees, Taxes) +#' E.Quantile(estima, 0.5, Pik.s) E.Quantile <- function(y, Qn, Pik) { -y<-as.data.frame(y) -Total<-rep(NA,dim(y)[2]) - + y <- as.data.frame(y) + Total <- rep(NA, dim(y)[2]) if (missing(Pik)) - Pik <- rep(1, dim(y)[1]) + Pik <- rep(1, dim(y)[1]) if (any(Pik < 0)) - stop("Probabilities must be positive.") - -w <- 1/Pik -n <- length(w) - -for(i in 1:dim(y)[2]){ - -ord <- order(y[,i]) -x <- y[ord,i] -w <- w[ord] -wcum <- cumsum(w) -wsum <- wcum[n] -wper <- wsum*Qn -lows <- (wcum <= wper) -k <- sum(lows) - if (k!=0 && k!=n){ - wlow <- wcum[k] + stop("Probabilities must be positive.") + w <- 1/Pik + n <- length(w) + for (i in 1:dim(y)[2]) { + ord <- order(y[, i]) + x <- y[ord, i] + w <- (1/Pik)[ord] + wcum <- cumsum(w) + wsum <- wcum[n] + wper <- wsum * Qn + lows <- (wcum <= wper) + k <- sum(lows) + if (k != 0 && k != n) { + wlow <- wcum[k] whigh <- wsum - wlow - if (whigh > wper) - Total[i]<-x[k+1] - else - Total[i]<-(wlow*x[k] + whigh*x[k+1]) / wsum - } - if (k == 0) { - Total[i] <- x[1] - } - if (k == n) { - Total[i] <- x[n] + if (whigh > wper) + Total[i] <- x[k + 1] + else + Total[i] <- (wlow * x[k] + whigh * x[k + 1])/wsum + } + if (k == 0) Total[i] <- x[1] + if (k == n) Total[i] <- x[n] } -} -return(Total) -} + return(Total) +} \ No newline at end of file diff --git a/R/E.SI.r 
b/R/E.SI.r index 064f427..dcfa53f 100644 --- a/R/E.SI.r +++ b/R/E.SI.r @@ -1,21 +1,139 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Simple Random Sampling Without +#' Replacement +#' @description +#' Computes the Horvitz-Thompson estimator of the population total under a +#' simple random sampling without replacement (SI) design. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error of the total. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect (always 1 under SI, included for +#' consistency with other estimators). +#' } +#' @details +#' Under simple random sampling without replacement, the Horvitz-Thompson +#' estimator reduces to \eqn{\hat{t}_y = N \bar{y}_s}, the expansion +#' estimator. The design effect is always 1 because SI is the reference design. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param n Sample size. +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every unit in the selected sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{S.SI}}, \code{\link{E.STSI}}, \code{\link{GREG.SI}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Uses the Lucy data to draw a random sample of units according to a SI design +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- S.SI(N,n) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' # The variables of interest are: Income, Employees and Taxes +#' # This information is stored in a data frame called estima +#' estima <- data.frame(Income, Employees, Taxes) +#' E.SI(N,n,estima) +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Following with Example 1. The variable SPAM is a domain of interest +#' Doma <- Domains(SPAM) +#' # This function allows to estimate the size of each domain in SPAM +#' estima <- data.frame(Doma) +#' E.SI(N,n,Doma) +#' +#' ############ +#' ## Example 3 +#' ############ +#' # Following with Example 1. The variable SPAM is a domain of interest +#' Doma <- Domains(SPAM) +#' # This function allows to estimate the parameters of the variables of interest +#' # for every category in the domain SPAM +#' estima <- data.frame(Income, Employees, Taxes) +#' SPAM.no <- cbind(Doma[,1], estima*Doma[,1]) +#' SPAM.yes <- cbind(Doma[,1], estima*Doma[,2]) +#' # Before running the following lines, notice that: +#' # The first column always indicates the population size +#' # The second column is an estimate of the size of the category in the domain SPAM +#' # The remaining columns estimates the parameters of interest +#' # within the corresponding category in the domain SPAM +#' E.SI(N,n,SPAM.no) +#' E.SI(N,n,SPAM.yes) +#' +#' ############ +#' ## Example 4 +#' ############ +#' # Following with Example 1. 
The variable SPAM is a domain of interest +#' # and the variable Zone is a population subgroup of interest +#' Doma <- Domains(SPAM) +#' estima <- Domains(Zone) +#' # Before running the following lines, notice that: +#' # The first column indicates whether the unit +#' # belongs to the first category of SPAM or not +#' # The remaining columns indicate whether the unit +#' # belongs to the categories of Zone +#' SPAM.no <- data.frame(SpamNO=Doma[,1], Zones=estima*Doma[,1]) +#' # Before running the following lines, notice that: +#' # The first column indicates whether the unit +#' # belongs to the second category of SPAM or not +#' # The remaining columns indicate whether the unit +#' # belongs to the categories of Zone +#' SPAM.yes <- data.frame(SpamYES=Doma[,2], Zones=estima*Doma[,2]) +#' # Before running the following lines, notice that: +#' # The first column always indicates the population size +#' # The second column is an estimate of the size of the +#' # first category in the domain SPAM +#' # The remaining columns estimate the size of the categories +#' # of Zone within the corresponding category of SPAM +#' # Finally, note that the sum of the point estimates of the last +#' # two columns gives exactly the point estimate in the second column +#' E.SI(N,n,SPAM.no) +#' # Before running the following lines, notice that: +#' # The first column always indicates the population size +#' # The second column is an estimate of the size of the +#' # second category in the domain SPAM +#' # The remaining columns estimate the size of the categories +#' # of Zone within the corresponding category of SPAM +#' # Finally, note that the sum of the point estimates of the last two +#' # columns gives exactly the point estimate in the second column +#' E.SI(N,n,SPAM.yes) -E.SI<-function(N,n,y){ - y<-cbind(1,y) - y<-as.data.frame(y) +E.SI <- function(N, n, y) { + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - Total<-matrix(NA,nrow=4,ncol=dim(y)[2]) -
rownames(Total)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Total)<-names(y) - pik<-matrix(n/N, nrow=n, ncol=1) - dk<-1/pik - for(k in 1:dim(y)[2]){ - ty<-sum(y[,k]*dk) - Vty<-(N^2)*(1-(n/N))*var(y[,k])/(n) - CVe<-100*sqrt(Vty)/ty - VMAS<-(N^2)*(1-(n/N))*var(y[,k])/(n) - DEFF<-Vty/VMAS - Total[,k]<-c(ty,sqrt(Vty),CVe,DEFF) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) + pik <- matrix(n/N, nrow = n, ncol = 1) + dk <- 1/pik + for (k in 1:dim(y)[2]) { + ty <- sum(y[, k] * dk) + Vty <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + CVe <- 100 * sqrt(Vty)/ty + VMAS <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } return(Total) } \ No newline at end of file diff --git a/R/E.STPPS.r b/R/E.STPPS.r index 3eae683..72ad038 100644 --- a/R/E.STPPS.r +++ b/R/E.STPPS.r @@ -1,45 +1,73 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Stratified PPS With-Replacement Sampling +#' @description +#' Computes the Hansen-Hurwitz estimator of the population total under a +#' stratified PPS with-replacement (STPPS) sampling design. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect with respect to simple random sampling. +#' } +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector, matrix or data frame of variables of interest. +#' @param pk Vector of selection probabilities for each draw in the sample. +#' @param mh Integer vector with the number of draws within each stratum. +#' @param S Vector identifying the stratum membership of each unit in the sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. 
(1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{S.STPPS}}, \code{\link{E.PPS}}, \code{\link{E.STpiPS}} +#' +#' @examples +#' # Uses the Lucy data to draw a stratified random sample +#' # according to a PPS design in each stratum +#' data(Lucy) +#' attach(Lucy) +#' m1 <- 83; m2 <- 100; m3 <- 200 +#' mh <- c(m1, m2, m3) +#' res <- S.STPPS(Level, Income, mh) +#' sam <- res[, 1] +#' pk <- res[, 2] +#' data <- Lucy[sam, ] +#' attach(data) +#' estima <- data.frame(Income, Employees, Taxes) +#' E.STPPS(estima, pk, mh, Level) -E.STPPS<-function(y,pk,mh,S){ - S<-as.factor(S) - y<-cbind(1,y) - y<-as.data.frame(y) +E.STPPS <- function(y, pk, mh, S) { + S <- as.factor(S) + S <- as.factor(as.integer(S)) + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - pk<-as.data.frame(pk) - - Strata<-array(NA,c(4,length(mh)+1,dim(y)[2])) - rownames(Strata)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Strata)<-c(levels(S),"Population") - dimnames(Strata)[[3]]<-names(y) - S<-as.factor(as.integer(S)) - - for(k in 1: length(mh)){ - e<-which(S==k) - ye<-y[e,] - pke<-pk[e,] - ye<-as.matrix(ye) - tye<-matrix(1,1,dim(ye)[1])%*%(ye/pke)/mh[k] - tye2<-t(matrix(tye,dim(ye)[2],mh[k])) - Vtye<-(1/mh[k])*(1/(mh[k]-1))*colSums((ye/pke-tye2)^2) - CVe<-100*sqrt(Vtye)/tye - Nh<-(1/mh[k])*sum(1/pke) - VMAS<-as.vector((Nh^2)*(1-(mh[k]/Nh))*diag(var(ye))/(mh[k])) - DEFF<-Vtye/VMAS - Strata[1,,][k,]<-tye - Strata[2,,][k,]<-sqrt(Vtye) - Strata[3,,][k,]<-CVe - Strata[4,,][k,]<-DEFF + H <- length(mh) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) + N <- sum(1/pk) + n <- length(pk) + for (k in 1:dim(y)[2]) { + ty <- 0 + Vty <- 0 + for (h in 1:H) { + yh <- y[which(S == h), k] + pkh <- pk[which(S == h)] 
+ HHh <- sum(yh/pkh)/mh[h] + ty <- ty + HHh + Vty <- Vty + (1/mh[h]) * (1/(mh[h] - 1)) * sum((yh/pkh - HHh)^2) + } + CVe <- 100 * sqrt(Vty)/ty + VMAS <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } - - m=sum(mh) - - for(i in 1:dim(y)[2]){ - Strata[1,,][(length(mh)+1),][i]<-sum(Strata[,,i][1,][1:length(mh)]) - Strata[2,,][(length(mh)+1),][i]<-sqrt(sum(Strata[,,i][2,][1:length(mh)]^2)) - Strata[3,,][(length(mh)+1),][i]<-100*Strata[2,,][(length(mh)+1),][i]/Strata[1,,][(length(mh)+1),][i] - N <- Strata[1, "Population", "N"] - VMAST<-(N^2)*(1-(m/N))*var(y[,i])/(m) - Strata[4,,][(length(mh)+1),][i]<-(Strata[2,,][(length(mh)+1),][i]^2)/(VMAST) - } - return(Strata) -} + return(Total) +} \ No newline at end of file diff --git a/R/E.STSI.r b/R/E.STSI.r index 17a7634..b76e1d1 100644 --- a/R/E.STSI.r +++ b/R/E.STSI.r @@ -1,41 +1,86 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Stratified Simple Random Sampling +#' @description +#' Computes the Horvitz-Thompson estimator of the population total under a +#' stratified simple random sampling without replacement (STSI) design. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect with respect to simple random sampling. +#' } +#' @author Hugo Andres Gutierrez Rojas +#' @param S Vector identifying the stratum membership of each unit in the sample. +#' @param Nh Integer vector with the population size of each stratum. +#' @param nh Integer vector with the sample size of each stratum. +#' @param y Vector, matrix or data frame of variables of interest. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. 
Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{S.STSI}}, \code{\link{E.SI}}, \code{\link{E.STpiPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' data(Lucy) +#' attach(Lucy) +#' N1 <- summary(Level)[[1]] +#' N2 <- summary(Level)[[2]] +#' N3 <- summary(Level)[[3]] +#' Nh <- c(N1, N2, N3) +#' n1 <- N1; n2 <- 100; n3 <- 200 +#' nh <- c(n1, n2, n3) +#' sam <- S.STSI(Level, Nh, nh) +#' data <- Lucy[sam, ] +#' attach(data) +#' estima <- data.frame(Income, Employees, Taxes) +#' E.STSI(Level, Nh, nh, estima) +#' ############ +#' ## Example 2 +#' ############ +#' # The variable SPAM is a domain of interest +#' Doma <- Domains(SPAM) +#' SPAM.no <- estima * Doma[, 1] +#' SPAM.yes <- estima * Doma[, 2] +#' E.STSI(Level, Nh, nh, Doma) +#' E.STSI(Level, Nh, nh, SPAM.no) +#' E.STSI(Level, Nh, nh, SPAM.yes) -E.STSI<-function(S,Nh,nh,y){ - S<-as.factor(S) - y<-cbind(1,y) - y<-as.data.frame(y) +E.STSI <- function(S, Nh, nh, y) { + S <- as.factor(S) + S <- as.factor(as.integer(S)) + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - - Strata<-array(NA,c(4,length(nh)+1,dim(y)[2])) - rownames(Strata)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Strata)<-c(levels(S),"Population") - dimnames(Strata)[[3]]<-names(y) - S<-as.factor(as.integer(S)) - - for(k in 1: length(nh)){ - e<-which(S==k) - ye<-y[e,] - ye<-as.matrix(ye) - tye<-matrix(1,1,dim(ye)[1])%*%(ye*(Nh[k]/nh[k])) - Vtye<-diag((Nh[k]^2)*(1-(nh[k]/Nh[k]))*var(ye)/(nh[k])) - CVe<-100*sqrt(Vtye)/tye - VMAS<-diag((Nh[k]^2)*(1-(nh[k]/Nh[k]))*var(ye)/(nh[k])) - DEFF<-Vtye/VMAS - Strata[1,,][k,]<-tye - Strata[2,,][k,]<-sqrt(Vtye) - Strata[3,,][k,]<-CVe - Strata[4,,][k,]<-DEFF + H <- length(Nh) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) + N <- 
sum(Nh) + n <- sum(nh) + fh <- nh/Nh + wh <- Nh/N + for (k in 1:dim(y)[2]) { + ty <- 0 + Vty <- 0 + for (h in 1:H) { + yh <- y[which(S == h), k] + ty <- ty + Nh[h] * mean(yh) + Vty <- Vty + Nh[h]^2 * (1 - fh[h]) * var(yh)/nh[h] + } + CVe <- 100 * sqrt(Vty)/ty + VMAS <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } - - N=sum(Nh) - n=sum(nh) - - for(i in 1:dim(y)[2]){ - Strata[1,,][(length(nh)+1),][i]<-sum(Strata[,,i][1,][1:length(nh)]) - Strata[2,,][(length(nh)+1),][i]<-sqrt(sum(Strata[,,i][2,][1:length(nh)]^2)) - Strata[3,,][(length(nh)+1),][i]<-100*(Strata[2,,][(length(nh)+1),][i])/Strata[1,,][(length(nh)+1),][i] - VMAST<-(N^2)*(1-(n/N))*var(y[,i])/(n) - Strata[4,,][(length(nh)+1),][i]<-(Strata[2,,][(length(nh)+1),][i])^2/(VMAST) - } - return(Strata) + return(Total) } \ No newline at end of file diff --git a/R/E.STpiPS.R b/R/E.STpiPS.R index bca7604..1dbc750 100644 --- a/R/E.STpiPS.R +++ b/R/E.STpiPS.R @@ -1,56 +1,83 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Stratified piPS Sampling +#' @description +#' Computes the Horvitz-Thompson estimator of the population total under a +#' stratified without-replacement probability proportional to size (piPS) +#' sampling design. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect with respect to simple random sampling. +#' } +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector, matrix or data frame of variables of interest. +#' @param Pik Vector of first-order inclusion probabilities for each unit +#' in the sample. +#' @param S Vector identifying the stratum membership of each unit in the sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. 
and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{S.STpiPS}}, \code{\link{E.piPS}}, \code{\link{E.STSI}} +#' +#' @examples +#' # Uses the Lucy data to draw a stratified random sample +#' # according to a piPS design in each stratum +#' data(Lucy) +#' attach(Lucy) +#' N1 <- summary(Level)[[1]] +#' N2 <- summary(Level)[[2]] +#' N3 <- summary(Level)[[3]] +#' nh <- c(N1, 100, 200) +#' S <- Level +#' x <- Employees +#' res <- S.STpiPS(S, x, nh) +#' sam <- res[, 1] +#' pik <- res[, 2] +#' data <- Lucy[sam, ] +#' attach(data) +#' estima <- data.frame(Income, Employees, Taxes) +#' E.STpiPS(estima, pik, Level) -E.STpiPS <- function(y, pik, S) { +E.STpiPS <- function(y, Pik, S) { S <- as.factor(S) + S <- as.factor(as.integer(S)) y <- cbind(1, y) y <- as.data.frame(y) names(y)[1] <- "N" - pik <- as.data.frame(pik) - nh <- c(table(S)) - - Strata <- array(NA, c(4, length(nh) + 1, dim(y)[2])) - rownames(Strata) = c("Estimation", "Standard Error", "CVE", "DEFF") - colnames(Strata) <- c(levels(S), "Population") - dimnames(Strata)[[3]] <- names(y) - S <- as.factor(as.integer(S)) - - for (k in 1:length(nh)) { - nhe <- nh[k] - e <- which(S == k) - ye <- y[e, ] - pike <- pik[e, ] - ye <- as.matrix(ye) - tye <- matrix(1, 1, dim(ye)[1]) %*% (ye/pike) - #------------------- - ck <- (1 - pike) * (nhe/(nhe - 1)) - P1 <- as.matrix(colSums(ck * ye/pike)) - P2 <- sum(ck) - ystar <- t(P1 %*% t(pike/P2)) - P3 <- ck/(pike^2) - #-------------------- - if(sum(pike) == nhe){ - Vtye <- rep(0, times = dim(P1)[1]) - } else { - Vtye <- as.vector(colSums(P3 * ((ye - ystar)^2))) - } - CVe <- 100 * sqrt(Vtye)/tye - Nhe <- sum(1/pike) - VMAS <- as.vector((Nhe^2) * (1 - (nhe/Nhe)) * diag(var(ye))/(nhe)) - DEFF <- Vtye/VMAS - Strata[1, , ][k, ] <- tye - Strata[2, , ][k, ] <- sqrt(Vtye) - Strata[3, , ][k, 
] <- CVe - Strata[4, , ][k, ] <- DEFF - } - - for (i in 1:dim(y)[2]) { - Strata[1, , ][(length(nh) + 1), ][i] <- sum(Strata[, , i][1, ][1:length(nh)]) - Strata[2, , ][(length(nh) + 1), ][i] <- sqrt(sum(Strata[, , i][2, ][1:length(nh)]^2)) - Strata[3, , ][(length(nh) + 1), ][i] <- 100 * Strata[2, , ][(length(nh) + 1), ][i]/Strata[1, , ][(length(nh) + 1), ][i] - N <- sum(1/pik) - n <- sum(nh) - VMAST <- (N^2) * (1 - (n/N)) * var(y[, i])/(n) - Strata[4, , ][(length(nh) + 1), ][i] <- (Strata[2, , ][(length(nh) + 1), ][i]^2)/(VMAST) + H <- length(levels(S)) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) + n <- length(Pik) + N <- sum(1/Pik) + for (k in 1:dim(y)[2]) { + ty <- 0 + Vty <- 0 + for (h in 1:H) { + yh <- y[which(S == h), k] + pikh <- Pik[which(S == h)] + nh <- length(pikh) + ck <- (1 - pikh) * (nh/(nh - 1)) + P1 <- sum(ck * yh/pikh) + P2 <- sum(ck) + ystar <- pikh * P1/P2 + P3 <- ck/(pikh^2) + Vty <- Vty + sum(P3 * ((yh - ystar)^2)) + ty <- ty + sum(yh/pikh) + } + CVe <- 100 * sqrt(Vty)/ty + VMAS <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } - return(Strata) -} + return(Total) +} \ No newline at end of file diff --git a/R/E.SY.r b/R/E.SY.r index 6d14a7c..f6f8fa6 100644 --- a/R/E.SY.r +++ b/R/E.SY.r @@ -1,21 +1,62 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Systematic Sampling +#' @description +#' Computes the Horvitz-Thompson estimator of the population total under a +#' systematic sampling design with sampling interval \code{a}. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error of the total. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). 
+#' \item \code{DEFF}: Design effect with respect to simple random sampling. +#' } +#' @details +#' Under systematic sampling the sample size is \eqn{n = N/a}. Because only +#' one systematic sample is observed, the variance cannot be estimated without +#' assumptions. Here, the variance is approximated by treating the systematic +#' sample as a simple random sample of the same size, which is a common +#' conservative approximation. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param a Sampling interval (skip). The expected sample size is \code{N/a}. +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every unit in the selected sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{S.SY}}, \code{\link{E.SI}} +#' +#' @examples +#' data('Lucy') +#' attach(Lucy) +#' N <- nrow(Lucy) +#' a <- 10 +#' sam <- S.SY(N, a) +#' y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +#' E.SY(N, a, y) -E.SY<-function(N,a,y){ - n<-N/a - y<-cbind(1,y) - y<-as.data.frame(y) +E.SY <- function(N, a, y) { + n <- N/a + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - Total<-matrix(NA,nrow=4,ncol=dim(y)[2]) - rownames(Total)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Total)<-names(y) - - for(k in 1:dim(y)[2]){ - ty<-a*sum(y[,k]) - Vty<-(N^2)*(1-(n/N))*var(y[,k])/(n) - CVe<-100*sqrt(Vty)/ty - VMAS<-(N^2)*(1-(n/N))*var(y[,k])/(n) - DEFF<-Vty/VMAS - Total[,k]<-c(ty,sqrt(Vty),CVe,DEFF) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) + for (k in 1:dim(y)[2]) { + ty <- a * sum(y[, k]) + Vty <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + CVe <- 100 * sqrt(Vty)/ty + VMAS <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } return(Total) } \ No newline at end of file diff --git a/R/E.WR.r b/R/E.WR.r index 9bde3c0..a564066 100644 --- a/R/E.WR.r +++ b/R/E.WR.r @@ -1,21 +1,60 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Simple Random Sampling With +#' Replacement +#' @description +#' Computes the Hansen-Hurwitz estimator of the population total under a +#' simple random sampling with replacement (WR) design. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error of the total. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect with respect to simple random sampling +#' without replacement. 
+#' } +#' @details +#' Under simple random sampling with replacement with \code{m} draws, the +#' Hansen-Hurwitz estimator is \eqn{\hat{t} = (N/m)\sum_{i=1}^m y_i}. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param m Number of draws (sample size with replacement). +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every draw in the sample (repetitions allowed). +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{S.WR}}, \code{\link{E.SI}} +#' +#' @examples +#' data('Lucy') +#' attach(Lucy) +#' N <- nrow(Lucy) +#' m <- 400 +#' sam <- S.WR(N, m) +#' y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +#' E.WR(N, m, y) -E.WR<-function(N,m,y){ - y<-cbind(1,y) - y<-as.data.frame(y) +E.WR <- function(N, m, y) { + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - Total<-matrix(NA,nrow=4,ncol=dim(y)[2]) - rownames(Total)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Total)<-names(y) - - for(k in 1:dim(y)[2]){ - - ty<-(N/m)*sum(y[,k]) - Vty<-(N^2/m)*var(y[,k]) - CVe<-100*sqrt(Vty)/ty - VMAS<-(N^2)*(1-(m/N))*var(y[,k])/(m) - DEFF<-Vty/VMAS - Total[,k]<-c(ty,sqrt(Vty),CVe,DEFF) + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) + for (k in 1:dim(y)[2]) { + ty <- (N/m) * sum(y[, k]) + Vty <- (N^2/m) * var(y[, k]) + CVe <- 100 * sqrt(Vty)/ty + VMAS <- (N^2) * (1 - (m/N)) * var(y[, k])/(m) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } return(Total) -} +} \ No newline at end of file diff --git a/R/E.piPS.r b/R/E.piPS.r index 4ccf1fe..cd2391e 100644 --- a/R/E.piPS.r +++ b/R/E.piPS.r @@ 
-1,32 +1,75 @@ #' @export +#' +#' @title +#' Estimation of the Population Total under Pi Probability Proportional to +#' Size Sampling +#' @description +#' Computes the Horvitz-Thompson estimator of the population total under a +#' without-replacement probability proportional to size (piPS) sampling design. +#' The variance is estimated using the Horvitz-Thompson variance approximation +#' based on first-order inclusion probabilities. +#' @return +#' A matrix with four rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: Estimated population total. +#' \item \code{Standard Error}: Estimated standard error of the total. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' \item \code{DEFF}: Design effect with respect to simple random sampling. +#' } +#' @details +#' When every sampled unit has inclusion probability 1 (\code{sum(Pik) == n}), +#' the variance is set to zero: all units were selected with certainty. +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every unit in the selected sample. +#' @param Pik Vector of first-order inclusion probabilities for each +#' unit in the sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas.
+#' +#' @seealso \code{\link{S.piPS}}, \code{\link{PikPPS}}, \code{\link{E.PO}} +#' +#' @examples +#' data('Lucy') +#' attach(Lucy) +#' N <- nrow(Lucy) +#' n <- 400 +#' x <- Employees +#' res <- S.piPS(n, x) +#' sam <- res[, 1] +#' Pik <- res[, 2] +#' y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +#' E.piPS(y, Pik) -E.piPS <-function(y,Pik){ - y<-cbind(1,y) - y<-as.data.frame(y) +E.piPS <- function(y, Pik) { + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "N" - Total<-matrix(NA,nrow=4,ncol=dim(y)[2]) - rownames(Total)=c("Estimation", "Standard Error","CVE","DEFF") - colnames(Total)<-names(y) - n<-length(Pik) - for(k in 1:dim(y)[2]){ - ty<-sum(y[,k]/Pik) - #------------------- - ck <- (1-Pik)*(n/(n-1)) - P1 <- sum(ck*y[,k]/Pik) - P2 <- sum(ck) - ystar <- Pik*P1/P2 - P3 <- ck/(Pik^2) - #-------------------- - if(sum(Pik) == n){ + Total <- matrix(NA, nrow = 4, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE", "DEFF") + colnames(Total) <- names(y) + n <- length(Pik) + for (k in 1:dim(y)[2]) { + ty <- sum(y[, k]/Pik) + ck <- (1 - Pik) * (n/(n - 1)) + P1 <- sum(ck * y[, k]/Pik) + P2 <- sum(ck) + ystar <- Pik * P1/P2 + P3 <- ck/(Pik^2) + if (sum(Pik) == n) { Vty <- 0 } else { - Vty <- sum(P3*((y[,k]-ystar)^2)) + Vty <- sum(P3 * ((y[, k] - ystar)^2)) } - CVe<-100*sqrt(Vty)/ty - N<-sum(1/Pik) - VMAS<-(N^2)*(1-(n/N))*var(y[,k])/(n) - DEFF<-Vty/VMAS - Total[,k]<-c(ty,sqrt(Vty),CVe,DEFF) + CVe <- 100 * sqrt(Vty)/ty + N <- sum(1/Pik) + VMAS <- (N^2) * (1 - (n/N)) * var(y[, k])/(n) + DEFF <- Vty/VMAS + Total[, k] <- c(ty, sqrt(Vty), CVe, DEFF) } return(Total) -} +} \ No newline at end of file diff --git a/R/GREG.SI.r b/R/GREG.SI.r index 45ef5c4..5f54082 100644 --- a/R/GREG.SI.r +++ b/R/GREG.SI.r @@ -1,26 +1,214 @@ #' @export +#' +#' @title +#' Generalised Regression Estimator under Simple Random Sampling +#' @description +#' Computes the Generalised Regression (GREG) estimator of the population +#' total under 
simple random sampling without replacement, using known +#' population totals of auxiliary variables to improve efficiency. +#' @return +#' A matrix with three rows and one column per variable of interest: +#' \itemize{ +#' \item \code{Estimation}: GREG estimated population total. +#' \item \code{Standard Error}: Estimated standard error. +#' \item \code{CVE}: Estimated coefficient of variation (in percentage). +#' } +#' @details +#' The GREG estimator is: +#' \deqn{\hat{t}_{GREG} = \hat{t}_{HT} + (\mathbf{t}_x - +#' \hat{\mathbf{t}}_{x,HT})^T \hat{\boldsymbol{\beta}}} +#' where \eqn{\hat{\boldsymbol{\beta}}} are the regression coefficients +#' estimated from the sample, \eqn{\mathbf{t}_x} are the known population +#' totals, and variance is estimated from the residuals. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param n Sample size. +#' @param y Vector, matrix or data frame of variables of interest. +#' @param x Vector, matrix or data frame of auxiliary variables observed +#' in the sample. +#' @param tx Vector of known population totals for the auxiliary variables. +#' @param b Matrix of regression coefficients (e.g. from \code{\link{E.Beta}}). +#' @param b0 Logical. If \code{TRUE}, an intercept column is prepended to +#' \code{x}. Default is \code{FALSE}. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{E.Beta}}, \code{\link{E.SI}}, \code{\link{Wk}} +#' +#' @examples +#' ###################################################################### +#' ## Example 1: Linear models involving continuous auxiliary information +#' ###################################################################### +#' +#' # Draws a simple random sample without replacement +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- S.SI(N,n) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' +#' ########### common mean model +#' +#' estima<-data.frame(Income, Employees, Taxes) +#' x <- rep(1,n) +#' model <- E.Beta(N, n, estima, x, ck=1,b0=FALSE) +#' b <- t(as.matrix(model[1,,])) +#' tx <- c(N) +#' GREG.SI(N,n,estima,x,tx, b, b0=FALSE) +#' +#' ########### common ratio model +#' +#' estima<-data.frame(Income) +#' x <- data.frame(Employees) +#' model <- E.Beta(N, n, estima, x, ck=x,b0=FALSE) +#' b <- t(as.matrix(model[1,,])) +#' tx <- sum(Lucy$Employees) +#' GREG.SI(N,n,estima,x,tx, b, b0=FALSE) +#' +#' ########### Simple regression model without intercept +#' +#' estima<-data.frame(Income, Employees) +#' x <- data.frame(Taxes) +#' model <- E.Beta(N, n, estima, x, ck=1,b0=FALSE) +#' b <- t(as.matrix(model[1,,])) +#' tx <- sum(Lucy$Taxes) +#' GREG.SI(N,n,estima,x,tx, b, b0=FALSE) +#' +#' ########### Multiple regression model without intercept +#' +#' estima<-data.frame(Income) +#' x <- data.frame(Employees, Taxes) +#' model <- E.Beta(N, n, estima, x, ck=1, b0=FALSE) +#' b <- as.matrix(model[1,,]) +#' tx <- c(sum(Lucy$Employees), sum(Lucy$Taxes)) +#' GREG.SI(N,n,estima,x,tx, b, b0=FALSE) +#' +#' ########### Simple regression model with intercept +#' +#' estima<-data.frame(Income, Employees) +#' x <- data.frame(Taxes) +#' model <- E.Beta(N, n, estima, x, ck=1,b0=TRUE) +#' b <- as.matrix(model[1,,]) +#' tx <- c(N, sum(Lucy$Taxes)) +#' 
GREG.SI(N,n,estima,x,tx, b, b0=TRUE) +#' +#' ########### Multiple regression model with intercept +#' +#' estima<-data.frame(Income) +#' x <- data.frame(Employees, Taxes) +#' model <- E.Beta(N, n, estima, x, ck=1,b0=TRUE) +#' b <- as.matrix(model[1,,]) +#' tx <- c(N, sum(Lucy$Employees), sum(Lucy$Taxes)) +#' GREG.SI(N,n,estima,x,tx, b, b0=TRUE) +#' +#' #################################################################### +#' ## Example 2: Linear models with discrete auxiliary information +#' #################################################################### +#' +#' # Draws a simple random sample without replacement +#' data(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- S.SI(N,n) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' +#' # The auxiliary information is of discrete type +#' Doma<-Domains(Level) +#' +#' ########### Poststratified common mean model +#' +#' estima<-data.frame(Income, Employees, Taxes) +#' model <- E.Beta(N, n, estima, Doma, ck=1,b0=FALSE) +#' b <- t(as.matrix(model[1,,])) +#' tx <- colSums(Domains(Lucy$Level)) +#' GREG.SI(N,n,estima,Doma,tx, b, b0=FALSE) +#' +#' ########### Poststratified common ratio model +#' +#' estima<-data.frame(Income, Employees) +#' x <- Doma*Taxes +#' model <- E.Beta(N, n, estima, x ,ck=1,b0=FALSE) +#' b <- as.matrix(model[1,,]) +#' tx <- colSums(Domains(Lucy$Level)*Lucy$Taxes) +#' GREG.SI(N,n,estima,x,tx, b, b0=FALSE) +#' +#' ###################################################################### +#' ## Example 3: Domain estimation through the poststratified estimator +#' ###################################################################### +#' +#' # Draws a simple random sample without replacement +#' data(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- S.SI(N,n) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data)
+#' +#' # The auxiliary information is discrete type +#' Doma<-Domains(Level) +#' +#' ########### Poststratified common mean model for the +#' # Income total in each poststratum ################### +#' +#' estima<-Doma*Income +#' model <- E.Beta(N, n, estima, Doma, ck=1, b0=FALSE) +#' b <- t(as.matrix(model[1,,])) +#' tx <- colSums(Domains(Lucy$Level)) +#' GREG.SI(N,n,estima,Doma,tx, b, b0=FALSE) +#' +#' ########### Poststratified common mean model for the +#' # Employees total in each poststratum ################### +#' +#' estima<-Doma*Employees +#' model <- E.Beta(N, n, estima, Doma, ck=1,b0=FALSE) +#' b <- t(as.matrix(model[1,,])) +#' tx <- colSums(Domains(Lucy$Level)) +#' GREG.SI(N,n,estima,Doma,tx, b, b0=FALSE) +#' +#' ########### Poststratified common mean model for the +#' # Taxes total in each poststratum ################### +#' +#' estima<-Doma*Taxes +#' model <- E.Beta(N, n, estima, Doma, ck=1, b0=FALSE) +#' b <- t(as.matrix(model[1,,])) +#' tx <- colSums(Domains(Lucy$Level)) +#' GREG.SI(N,n,estima,Doma,tx, b, b0=FALSE) -GREG.SI<-function(N,n,y,x,tx,b,b0=FALSE){ - y<-as.data.frame(y) - x<-as.matrix(x) - pik<-rep(n/N,n) - dk<-1/pik - if (b0 == TRUE){ - x<-as.matrix(cbind(1,x))} - - Total<-matrix(NA,nrow=3,ncol=dim(y)[2]) - rownames(Total)=c("Estimation", "Standard Error","CVE") - colnames(Total)<-names(y) - - for(k in 1:dim(y)[2]){ - - xHT <- t(x)%*%dk - yHT <- sum(y[,k]*dk) - ty <- yHT + (tx-t(xHT))%*%as.matrix(b[,k]) - e <- y[,k]-(x%*%as.matrix(b[,k])) - Vty <- (N^2)*(1-(n/N))*var(e)/(n) - CVe <- 100*sqrt(Vty)/ty - Total[,k] <- c(ty,sqrt(Vty),CVe) +GREG.SI <- function(N, n, y, x, tx, b, b0 = FALSE) { + y <- as.data.frame(y) + x <- as.matrix(x) + pik <- rep(n/N, n) + dk <- 1/pik + if (b0 == TRUE) { + x <- as.matrix(cbind(1, x)) + } + Total <- matrix(NA, nrow = 3, ncol = dim(y)[2]) + rownames(Total) <- c("Estimation", "Standard Error", "CVE") + colnames(Total) <- names(y) + for (k in 1:dim(y)[2]) { + xHT <- t(x) %*% dk + yHT <- sum(y[, k] * dk) + ty <- 
yHT + (tx - t(xHT)) %*% as.matrix(b[, k]) + e <- y[, k] - (x %*% as.matrix(b[, k])) + Vty <- (N^2) * (1 - (n/N)) * var(e)/(n) + CVe <- 100 * sqrt(Vty)/ty + Total[, k] <- c(ty, sqrt(Vty), CVe) } return(Total) } \ No newline at end of file diff --git a/R/HH.r b/R/HH.r index 0a32da5..218577e 100644 --- a/R/HH.r +++ b/R/HH.r @@ -1,17 +1,140 @@ #' @export +#' +#' @title +#' Hansen-Hurwitz Estimator of the Population Total +#' @description +#' Computes the Hansen-Hurwitz (HH) estimator of the population total under +#' a with-replacement sampling design, given the sample observations and +#' their selection probabilities. +#' @return +#' A numeric vector or matrix with the estimated total for each variable +#' of interest. +#' @details +#' The Hansen-Hurwitz estimator is: +#' \deqn{\hat{t}_{HH} = \frac{1}{m}\sum_{i=1}^m \frac{y_i}{p_i}} +#' where \eqn{p_i} is the selection probability of the \eqn{i}-th draw +#' and \eqn{m} is the number of draws. This estimator is design-unbiased +#' under any with-replacement sampling design. +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector or matrix of values of the variable(s) of interest for +#' units in the sample (with possible repetitions). +#' @param pk Vector of selection probabilities for each draw in the sample. +#' +#' @references +#' Hansen, M.H. and Hurwitz, W.N. (1943). On the theory of sampling from +#' finite populations. \emph{Annals of Mathematical Statistics}, 14, 333-362.\cr +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer. 
+#' +#' @seealso \code{\link{E.PPS}}, \code{\link{HT}}, \code{\link{S.PPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Vectors y1 and y2 give the values of the variables of interest +#' y1<-c(32, 34, 46, 89, 35) +#' y2<-c(1,1,1,0,0) +#' y3<-cbind(y1,y2) +#' # The population size is N=5 +#' N <- length(U) +#' # The sample size is m=2 +#' m <- 2 +#' # pk is the probability of selection of every single unit +#' pk <- c(0.35, 0.225, 0.175, 0.125, 0.125) +#' # Selection of a random sample with replacement +#' sam <- sample(5,2, replace=TRUE, prob=pk) +#' # The selected sample is +#' U[sam] +#' # The values of the variables of interest for the units in the sample +#' y1[sam] +#' y2[sam] +#' y3[sam,] +#' # The Hansen-Hurwitz estimator +#' HH(y1[sam],pk[sam]) +#' HH(y2[sam],pk[sam]) +#' HH(y3[sam,],pk[sam]) +#' +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a simple random sample with replacement +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' m <- 400 +#' sam <- sample(N,m,replace=TRUE) +#' # The vector of selection probabilities of units in the sample +#' pk <- rep(1/N,m) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' # The variables of interest are: Income, Employees and Taxes +#' # This information is stored in a data frame called estima +#' estima <- data.frame(Income, Employees, Taxes) +#' HH(estima, pk) +#' +#' ################################################################ +#' ## Example 3 HH is unbiased for with replacement sampling designs +#' ################################################################ +#' +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Vector y1 and y2 are the values of 
the variables of interest +#' y<-c(32, 34, 46, 89, 35) +#' # The population size is N=5 +#' N <- length(U) +#' # The sample size is m=2 +#' m <- 2 +#' # pk is the probability of selection of every single unit +#' pk <- c(0.35, 0.225, 0.175, 0.125, 0.125) +#' # p is the probability of selection of every possible sample +#' p <- p.WR(N,m,pk) +#' p +#' sum(p) +#' # The sample membership matrix for fixed size with replacement sampling designs +#' Ind <- nk(N,m) +#' Ind +#' # The support with the values of the elements +#' Qy <- SupportWR(N,m, ID=y) +#' Qy +#' # The support with the selection probabilities of the elements +#' Qp <- SupportWR(N,m, ID=pk) +#' Qp +#' # The HH estimates for every single sample in the support +#' HH1 <- HH(Qy[1,], Qp[1,])[1,] +#' HH2 <- HH(Qy[2,], Qp[2,])[1,] +#' HH3 <- HH(Qy[3,], Qp[3,])[1,] +#' HH4 <- HH(Qy[4,], Qp[4,])[1,] +#' HH5 <- HH(Qy[5,], Qp[5,])[1,] +#' HH6 <- HH(Qy[6,], Qp[6,])[1,] +#' HH7 <- HH(Qy[7,], Qp[7,])[1,] +#' HH8 <- HH(Qy[8,], Qp[8,])[1,] +#' HH9 <- HH(Qy[9,], Qp[9,])[1,] +#' HH10 <- HH(Qy[10,], Qp[10,])[1,] +#' HH11 <- HH(Qy[11,], Qp[11,])[1,] +#' HH12 <- HH(Qy[12,], Qp[12,])[1,] +#' HH13 <- HH(Qy[13,], Qp[13,])[1,] +#' HH14 <- HH(Qy[14,], Qp[14,])[1,] +#' HH15 <- HH(Qy[15,], Qp[15,])[1,] +#' # The HH estimates arranged in a vector +#' Est <- c(HH1, HH2, HH3, HH4, HH5, HH6, HH7, HH8, HH9, HH10, HH11, HH12, HH13, +#' HH14, HH15) +#' Est +#' # The HH estimator is actually design-unbiased +#' data.frame(Ind, Est, p) +#' sum(Est*p) +#' sum(y) -HH <- function(y,pk){ - y <- as.data.frame(y) - m <- length(pk) - Total <- matrix(NA,nrow=3,ncol=dim(y)[2]) - rownames(Total)=c("Estimation", "Standard Error","CVE") - colnames(Total) <- names(y) - - for(k in 1:dim(y)[2]){ - ty <- sum(y[,k]/pk)/m - Vty <- (1/m)*(1/(m-1))*sum((y[,k]/pk-ty)^2) - CVe <- 100*sqrt(Vty)/ty - Total[,k] <- c(ty,sqrt(Vty),CVe) - } - return(Total) +HH <- function(y, pk) { + y <- t(as.matrix(y)) + pk <- as.matrix(pk) + m <- length(pk) + result <- (1/m) * (y %*% (1/pk)) + result } \ No 
newline at end of file diff --git a/R/HT.r b/R/HT.r index 22cda7d..f595397 100644 --- a/R/HT.r +++ b/R/HT.r @@ -1,8 +1,310 @@ #' @export +#' +#' @title +#' Horvitz-Thompson Estimator of the Population Total +#' @description +#' Computes the Horvitz-Thompson (HT) estimator of the population total for +#' one or more variables of interest, given the sample observations and their +#' first-order inclusion probabilities. +#' @return +#' A numeric vector or matrix with the estimated total for each variable +#' of interest. +#' @details +#' The Horvitz-Thompson estimator is defined as: +#' \deqn{\hat{t}_{y,\pi} = \sum_{k \in s} \frac{y_k}{\pi_k}} +#' where \eqn{\pi_k} is the first-order inclusion probability of unit \eqn{k}. +#' This estimator is design-unbiased for any fixed-size sampling design. +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector or matrix of values of the variable(s) of interest for +#' units in the sample. +#' @param Pik Vector of first-order inclusion probabilities for each unit +#' in the sample. +#' +#' @references +#' Horvitz, D.G. and Thompson, D.J. (1952). A generalization of sampling +#' without replacement from a finite universe. +#' \emph{Journal of the American Statistical Association}, 47, 663-685.\cr +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer. 
+#' +#' @seealso \code{\link{VarHT}}, \code{\link{E.SI}}, \code{\link{E.piPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Uses the Lucy data to draw a simple random sample without replacement +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- sample(N,n) +#' # The vector of inclusion probabilities for each unit in the sample +#' pik <- rep(n/N,n) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' # The variables of interest are: Income, Employees and Taxes +#' # This information is stored in a data frame called estima +#' estima <- data.frame(Income, Employees, Taxes) +#' HT(estima, pik) +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a simple random sample with replacement +#' data(Lucy) +#' +#' N <- dim(Lucy)[1] +#' m <- 400 +#' sam <- sample(N,m,replace=TRUE) +#' # The vector of selection probabilities of units in the sample +#' pk <- rep(1/N,m) +#' # Computation of the inclusion probabilities +#' pik <- 1-(1-pk)^m +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' # The variables of interest are: Income, Employees and Taxes +#' # This information is stored in a data frame called estima +#' estima <- data.frame(Income, Employees, Taxes) +#' HT(estima, pik) +#' +#' ############ +#' ## Example 3 +#' ############ +#' # Without replacement sampling +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Vector y1 and y2 are the values of the variables of interest +#' y1<-c(32, 34, 46, 89, 35) +#' y2<-c(1,1,1,0,0) +#' y3<-cbind(y1,y2) +#' # The population size is N=5 +#' N <- length(U) +#' # The sample size is n=2 +#' n <- 2 +#' # The sample membership matrix for fixed size without replacement sampling designs 
+#' Ind <- Ik(N,n) +#' # p is the probability of selection of every possible sample +#' p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) +#' # Computation of the inclusion probabilities +#' inclusion <- Pik(p, Ind) +#' # Selection of a random sample +#' sam <- sample(5,2) +#' # The selected sample +#' U[sam] +#' # The inclusion probabilities for these two units +#' inclusion[sam] +#' # The values of the variables of interest for the units in the sample +#' y1[sam] +#' y2[sam] +#' y3[sam,] +#' # The Horvitz-Thompson estimator +#' HT(y1[sam],inclusion[sam]) +#' HT(y2[sam],inclusion[sam]) +#' HT(y3[sam,],inclusion[sam]) +#' +#' ############ +#' ## Example 4 +#' ############ +#' # Following Example 3... With replacement sampling +#' # The population size is N=5 +#' N <- length(U) +#' # The sample size is m=2 +#' m <- 2 +#' # pk is the probability of selection of every single unit +#' pk <- c(0.9, 0.025, 0.025, 0.025, 0.025) +#' # Computation of the inclusion probabilities +#' pik <- 1-(1-pk)^m +#' # Selection of a random sample with replacement +#' sam <- sample(5,2, replace=TRUE, prob=pk) +#' # The selected sample +#' U[sam] +#' # The inclusion probabilities for these two units +#' inclusion[sam] +#' # The values of the variables of interest for the units in the sample +#' y1[sam] +#' y2[sam] +#' y3[sam,] +#' # The Horvitz-Thompson estimator +#' HT(y1[sam],inclusion[sam]) +#' HT(y2[sam],inclusion[sam]) +#' HT(y3[sam,],inclusion[sam]) +#' +#' #################################################################### +#' ## Example 5 HT is unbiased for without replacement sampling designs +#' ## Fixed sample size +#' #################################################################### +#' +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Vector y1 and y2 are the values of the variables of interest +#' y<-c(32, 34, 46, 89, 35) +#' # The population size is N=5 +#' N <- length(U) +#' # The 
sample size is n=2 +#' n <- 2 +#' # The sample membership matrix for fixed size without replacement sampling designs +#' Ind <- Ik(N,n) +#' Ind +#' # p is the probability of selection of every possible sample +#' p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) +#' sum(p) +#' # Computation of the inclusion probabilities +#' inclusion <- Pik(p, Ind) +#' inclusion +#' sum(inclusion) +#' # The support with the values of the elements +#' Qy <-Support(N,n,ID=y) +#' Qy +#' # The HT estimates for every single sample in the support +#' HT1<- HT(y[Ind[1,]==1], inclusion[Ind[1,]==1]) +#' HT2<- HT(y[Ind[2,]==1], inclusion[Ind[2,]==1]) +#' HT3<- HT(y[Ind[3,]==1], inclusion[Ind[3,]==1]) +#' HT4<- HT(y[Ind[4,]==1], inclusion[Ind[4,]==1]) +#' HT5<- HT(y[Ind[5,]==1], inclusion[Ind[5,]==1]) +#' HT6<- HT(y[Ind[6,]==1], inclusion[Ind[6,]==1]) +#' HT7<- HT(y[Ind[7,]==1], inclusion[Ind[7,]==1]) +#' HT8<- HT(y[Ind[8,]==1], inclusion[Ind[8,]==1]) +#' HT9<- HT(y[Ind[9,]==1], inclusion[Ind[9,]==1]) +#' HT10<- HT(y[Ind[10,]==1], inclusion[Ind[10,]==1]) +#' # The HT estimates arranged in a vector +#' Est <- c(HT1, HT2, HT3, HT4, HT5, HT6, HT7, HT8, HT9, HT10) +#' Est +#' # The HT estimator is actually design-unbiased +#' data.frame(Ind, Est, p) +#' sum(Est*p) +#' sum(y) +#' +#' #################################################################### +#' ## Example 6 HT is unbiased for without replacement sampling designs +#' ## Random sample size +#' #################################################################### +#' +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Vector y contains the values of the variable of interest +#' y<-c(32, 34, 46, 89, 35) +#' # The population size is N=5 +#' N <- length(U) +#' # The sample membership matrix for random size without replacement sampling designs +#' Ind <- IkRS(N) +#' Ind +#' # p is the probability of selection of every possible sample +#' p <- c(0.59049, 0.06561, 
0.06561, 0.06561, 0.06561, 0.06561, 0.00729, 0.00729, +#' 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00729, 0.00081, +#' 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, 0.00081, +#' 0.00009, 0.00009, 0.00009, 0.00009, 0.00009, 0.00001) +#' sum(p) +#' # Computation of the inclusion probabilities +#' inclusion <- Pik(p, Ind) +#' inclusion +#' sum(inclusion) +#' # The support with the values of the elements +#' Qy <-SupportRS(N, ID=y) +#' Qy +#' # The HT estimates for every single sample in the support +#' HT1<- HT(y[Ind[1,]==1], inclusion[Ind[1,]==1]) +#' HT2<- HT(y[Ind[2,]==1], inclusion[Ind[2,]==1]) +#' HT3<- HT(y[Ind[3,]==1], inclusion[Ind[3,]==1]) +#' HT4<- HT(y[Ind[4,]==1], inclusion[Ind[4,]==1]) +#' HT5<- HT(y[Ind[5,]==1], inclusion[Ind[5,]==1]) +#' HT6<- HT(y[Ind[6,]==1], inclusion[Ind[6,]==1]) +#' HT7<- HT(y[Ind[7,]==1], inclusion[Ind[7,]==1]) +#' HT8<- HT(y[Ind[8,]==1], inclusion[Ind[8,]==1]) +#' HT9<- HT(y[Ind[9,]==1], inclusion[Ind[9,]==1]) +#' HT10<- HT(y[Ind[10,]==1], inclusion[Ind[10,]==1]) +#' HT11<- HT(y[Ind[11,]==1], inclusion[Ind[11,]==1]) +#' HT12<- HT(y[Ind[12,]==1], inclusion[Ind[12,]==1]) +#' HT13<- HT(y[Ind[13,]==1], inclusion[Ind[13,]==1]) +#' HT14<- HT(y[Ind[14,]==1], inclusion[Ind[14,]==1]) +#' HT15<- HT(y[Ind[15,]==1], inclusion[Ind[15,]==1]) +#' HT16<- HT(y[Ind[16,]==1], inclusion[Ind[16,]==1]) +#' HT17<- HT(y[Ind[17,]==1], inclusion[Ind[17,]==1]) +#' HT18<- HT(y[Ind[18,]==1], inclusion[Ind[18,]==1]) +#' HT19<- HT(y[Ind[19,]==1], inclusion[Ind[19,]==1]) +#' HT20<- HT(y[Ind[20,]==1], inclusion[Ind[20,]==1]) +#' HT21<- HT(y[Ind[21,]==1], inclusion[Ind[21,]==1]) +#' HT22<- HT(y[Ind[22,]==1], inclusion[Ind[22,]==1]) +#' HT23<- HT(y[Ind[23,]==1], inclusion[Ind[23,]==1]) +#' HT24<- HT(y[Ind[24,]==1], inclusion[Ind[24,]==1]) +#' HT25<- HT(y[Ind[25,]==1], inclusion[Ind[25,]==1]) +#' HT26<- HT(y[Ind[26,]==1], inclusion[Ind[26,]==1]) +#' HT27<- HT(y[Ind[27,]==1], inclusion[Ind[27,]==1]) +#' HT28<- 
HT(y[Ind[28,]==1], inclusion[Ind[28,]==1]) +#' HT29<- HT(y[Ind[29,]==1], inclusion[Ind[29,]==1]) +#' HT30<- HT(y[Ind[30,]==1], inclusion[Ind[30,]==1]) +#' HT31<- HT(y[Ind[31,]==1], inclusion[Ind[31,]==1]) +#' HT32<- HT(y[Ind[32,]==1], inclusion[Ind[32,]==1]) +#' # The HT estimates arranged in a vector +#' Est <- c(HT1, HT2, HT3, HT4, HT5, HT6, HT7, HT8, HT9, HT10, HT11, HT12, HT13, +#' HT14, HT15, HT16, HT17, HT18, HT19, HT20, HT21, HT22, HT23, HT24, HT25, HT26, +#' HT27, HT28, HT29, HT30, HT31, HT32) +#' Est +#' # The HT estimator is actually design-unbiased +#' data.frame(Ind, Est, p) +#' sum(Est*p) +#' sum(y) +#' +#' ################################################################ +#' ## Example 7 HT is unbiased for with replacement sampling designs +#' ################################################################ +#' +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Vector y contains the values of the variable of interest +#' y<-c(32, 34, 46, 89, 35) +#' # The population size is N=5 +#' N <- length(U) +#' # The sample size is m=2 +#' m <- 2 +#' # pk is the probability of selection of every single unit +#' pk <- c(0.35, 0.225, 0.175, 0.125, 0.125) +#' # p is the probability of selection of every possible sample +#' p <- p.WR(N,m,pk) +#' p +#' sum(p) +#' # The sample membership matrix for fixed size with replacement sampling designs +#' Ind <- IkWR(N,m) +#' Ind +#' # The support with the values of the elements +#' Qy <- SupportWR(N,m, ID=y) +#' Qy +#' # Computation of the inclusion probabilities +#' pik <- 1-(1-pk)^m +#' pik +#' # The HT estimates for every single sample in the support +#' HT1 <- HT(y[Ind[1,]==1], pik[Ind[1,]==1]) +#' HT2 <- HT(y[Ind[2,]==1], pik[Ind[2,]==1]) +#' HT3 <- HT(y[Ind[3,]==1], pik[Ind[3,]==1]) +#' HT4 <- HT(y[Ind[4,]==1], pik[Ind[4,]==1]) +#' HT5 <- HT(y[Ind[5,]==1], pik[Ind[5,]==1]) +#' HT6 <- HT(y[Ind[6,]==1], pik[Ind[6,]==1]) +#' HT7 <- HT(y[Ind[7,]==1], 
pik[Ind[7,]==1]) +#' HT8 <- HT(y[Ind[8,]==1], pik[Ind[8,]==1]) +#' HT9 <- HT(y[Ind[9,]==1], pik[Ind[9,]==1]) +#' HT10 <- HT(y[Ind[10,]==1], pik[Ind[10,]==1]) +#' HT11 <- HT(y[Ind[11,]==1], pik[Ind[11,]==1]) +#' HT12 <- HT(y[Ind[12,]==1], pik[Ind[12,]==1]) +#' HT13 <- HT(y[Ind[13,]==1], pik[Ind[13,]==1]) +#' HT14 <- HT(y[Ind[14,]==1], pik[Ind[14,]==1]) +#' HT15 <- HT(y[Ind[15,]==1], pik[Ind[15,]==1]) +#' # The HT estimates arranged in a vector +#' Est <- c(HT1, HT2, HT3, HT4, HT5, HT6, HT7, HT8, HT9, HT10, HT11, HT12, HT13, +#' HT14, HT15) +#' Est +#' # The HT is actually desgn-unbiased +#' data.frame(Ind, Est, p) +#' sum(Est*p) +#' sum(y) -HT<-function(y,Pik){ -y<-t(as.matrix(y)) -pik<-as.matrix(Pik) -HT<-y%*%(1/Pik) -HT +HT <- function(y, Pik) { + y <- t(as.matrix(y)) + pik <- as.matrix(Pik) + result <- y %*% (1/Pik) + result } \ No newline at end of file diff --git a/R/IPFP.r b/R/IPFP.r index 8259b3b..45df2e7 100644 --- a/R/IPFP.r +++ b/R/IPFP.r @@ -1,34 +1,98 @@ #' @export +#' +#' @title +#' Iterative Proportional Fitting Procedure (Raking) +#' @description +#' Adjusts a contingency table so that its row and column marginals match +#' known population totals, using the Iterative Proportional Fitting +#' Procedure (IPFP), also known as raking or RAS algorithm. +#' @return +#' A matrix with \code{nrow(Table) + 1} rows and \code{ncol(Table) + 1} +#' columns containing the adjusted cell counts, with an added row of +#' estimated column totals and an added column of estimated row totals. +#' @details +#' The algorithm alternates between row and column adjustments until +#' convergence. At each step, cells in each row (or column) are multiplied +#' by the ratio of the known marginal to the current estimated marginal. +#' Convergence is assessed by the sum of absolute differences between +#' known and estimated marginals. +#' @author Hugo Andres Gutierrez Rojas +#' @param Table A matrix or data frame of initial cell counts or weights to +#' be adjusted. 
+#' @param Col.knw Numeric vector of known column marginal totals. +#' @param Row.knw Numeric vector of known row marginal totals. +#' @param tol Convergence tolerance. The algorithm stops when the total +#' absolute deviation between known and estimated marginals is below +#' \code{tol}. Default is \code{0.0001}. +#' +#' @references +#' Deming, W.E. and Stephan, F.F. (1940). On a least squares adjustment of +#' a sampled frequency table when the expected marginal totals are known. +#' \emph{Annals of Mathematical Statistics}, 11(4), 427-444.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{Domains}}, \code{\link{Wk}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' Table <- matrix(c(80, 90, 10, 170, 80, 80, 150, 210, 130), 3, 3) +#' rownames(Table) <- c("a1", "a2", "a3") +#' colnames(Table) <- c("b1", "b2", "b3") +#' Col.knw <- c(150, 300, 550) +#' Row.knw <- c(430, 360, 210) +#' IPFP(Table, Col.knw, Row.knw, tol = 0.0001) +#' ############ +#' ## Example 2 +#' ############ +#' data(Lucy) +#' attach(Lucy) +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- sample(N, n) +#' data <- Lucy[sam, ] +#' attach(data) +#' Doma1 <- Domains(Level) +#' Doma2 <- Domains(SPAM) +#' SPAM.no <- Doma2[, 1] * Doma1 +#' SPAM.yes <- Doma2[, 2] * Doma1 +#' est1 <- E.SI(N, n, SPAM.no)[, 2:4] +#' est2 <- E.SI(N, n, SPAM.yes)[, 2:4] +#' Table <- cbind(est1[1, ], est2[1, ]) +#' Col.knw <- colSums(Domains(Lucy$SPAM)) +#' Row.knw <- colSums(Domains(Lucy$Level)) +#' IPFP(Table, Col.knw, Row.knw, tol = 0.0001) -IPFP <- function(Table,Col.knw,Row.knw,tol=0.0001) -{ -Table <- as.matrix(Table) -Col.est <- colSums(Table) -Row.est <- rowSums(Table) -I <- length(Row.knw) -J <- length(Col.knw) -Est <- Table -criterio <- sum(abs(Col.knw-Col.est)) + sum(abs(Row.knw-Row.est)) -while(criterio > tol){ -for(i in 1:I){ -for(j in 1:J){ -Est[i,j] <- 
Est[i,j]*Row.knw[i]/Row.est[i] -} -} -Col.est <- colSums(Est) -Row.est <- rowSums(Est) -criterio <- sum(abs(Col.knw-Col.est)) + sum(abs(Row.knw-Row.est)) -for(i in 1:I){ -for(j in 1:J){ -Est[i,j] <- Est[i,j]*Col.knw[j]/Col.est[j] -} -} -Col.est <- colSums(Est) -Row.est <- rowSums(Est) -criterio <- sum(abs(Col.knw-Col.est)) + sum(abs(Row.knw-Row.est)) -} -p1 <- rbind(Est,Col.est) -p2 <- cbind(p1,c(Row.est,sum(Row.est))) -colnames(p2)[J+1] <- c("Row.est") -return(p2) -} +IPFP <- function(Table, Col.knw, Row.knw, tol = 0.0001) { + Table <- as.matrix(Table) + Col.est <- colSums(Table) + Row.est <- rowSums(Table) + I <- length(Row.knw) + J <- length(Col.knw) + Est <- Table + criterio <- sum(abs(Col.knw - Col.est)) + sum(abs(Row.knw - Row.est)) + while (criterio > tol) { + for (i in 1:I) { + for (j in 1:J) { + Est[i, j] <- Est[i, j] * Row.knw[i]/Row.est[i] + } + } + Col.est <- colSums(Est) + Row.est <- rowSums(Est) + criterio <- sum(abs(Col.knw - Col.est)) + sum(abs(Row.knw - Row.est)) + for (i in 1:I) { + for (j in 1:J) { + Est[i, j] <- Est[i, j] * Col.knw[j]/Col.est[j] + } + } + Col.est <- colSums(Est) + Row.est <- rowSums(Est) + criterio <- sum(abs(Col.knw - Col.est)) + sum(abs(Row.knw - Row.est)) + } + p1 <- rbind(Est, Col.est) + p2 <- cbind(p1, c(Row.est, sum(Row.est))) + colnames(p2)[J + 1] <- c("Row.est") + return(p2) +} \ No newline at end of file diff --git a/R/Ik.r b/R/Ik.r index 40ec4c5..a8e6130 100644 --- a/R/Ik.r +++ b/R/Ik.r @@ -1,15 +1,50 @@ #' @export +#' +#' @title +#' Sample Membership Indicator Matrix +#' @description +#' Constructs the indicator matrix of the sampling support for a fixed-size +#' without-replacement design. Each row corresponds to one possible sample +#' and each column to one population unit. +#' @return +#' A binary matrix of dimension \code{choose(N, n) x N}, where entry +#' \eqn{(s, k) = 1} if unit \eqn{k} belongs to sample \eqn{s}, and 0 +#' otherwise. 
+#' @details +#' The full enumeration of all \code{choose(N, n)} possible samples is +#' computationally feasible only for small populations. For \code{N > 15} +#' this function will be very slow. It is intended primarily for theoretical +#' illustrations and teaching purposes. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. Recommended \code{N <= 15}. +#' @param n Sample size. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{Pik}}, \code{\link{Pikl}}, \code{\link{Support}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' n <- 2 +#' # The sample membership matrix +#' Ik(N, n) +#' # The first unit, Yves, belongs to the first four possible samples -Ik <- function(N,n){ -Q <- Support(N,n,ID=FALSE) -I <- matrix(0,choose(N,n),N) -for(i in 1:n){ -for(j in 1:choose(N,n)){ -for(k in 1:N){ -if (Q[j,i]==k) -I[j,k] <- 1 -} -} -} -I +Ik <- function(N, n) { + Q <- Support(N, n, ID = FALSE) + I <- matrix(0, choose(N, n), N) + for (i in 1:n) { + for (j in 1:choose(N, n)) { + for (k in 1:N) { + if (Q[j, i] == k) + I[j, k] <- 1 + } + } + } + I } \ No newline at end of file diff --git a/R/IkRS.r b/R/IkRS.r index cd5113b..a40d83b 100644 --- a/R/IkRS.r +++ b/R/IkRS.r @@ -1,9 +1,42 @@ #' @export +#' +#' @title +#' Sample Membership Indicator Matrix for All Possible Sample Sizes +#' @description +#' Constructs the indicator matrix of the complete sampling support, stacking +#' a row of zeros (the empty sample) and the indicator matrices for all sample sizes from 1 to \code{N}. This +#' covers every possible subset of the population, including the empty one. +#' @return +#' A binary matrix with \eqn{2^N} rows (one per subset of the population, including +#' the empty set as the first row of zeros) and \code{N} columns. 
Entry +#' \eqn{(s, k) = 1} if unit \eqn{k} belongs to subset \eqn{s}. +#' @details +#' This function calls \code{\link{Ik}} for each possible sample size +#' \eqn{n = 1, \ldots, N} and stacks the results beneath an initial row of zeros that represents the empty sample. It is intended for small +#' populations only (\code{N <= 10}) due to the exponential growth of the +#' support size. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. Recommended \code{N <= 10}. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{Ik}}, \code{\link{SupportRS}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' # The sample membership matrix for all sample sizes +#' IkRS(N) +#' # The first sample is a null one and the last sample is a census -IkRS <- function(N){ -sam <- matrix(0, ncol=N, nrow=1) -for(k in 1:N){ -sam<-rbind(sam, Ik(N,k)) -} -sam +IkRS <- function(N) { + sam <- matrix(0, ncol = N, nrow = 1) + for (k in 1:N) { + sam <- rbind(sam, Ik(N, k)) + } + sam } \ No newline at end of file diff --git a/R/IkWR.r b/R/IkWR.r index b6f9f3a..752e7e7 100644 --- a/R/IkWR.r +++ b/R/IkWR.r @@ -1,16 +1,50 @@ #' @export +#' +#' @title +#' Frequency Indicator Matrix for With-Replacement Sampling +#' @description +#' Constructs the indicator matrix of the with-replacement sampling support +#' for a population of size \code{N} and \code{m} draws. Each row corresponds +#' to one possible unordered outcome and each column to one population unit, +#' with entry \eqn{(s, k) = 1} if unit \eqn{k} was selected at least once +#' in outcome \eqn{s}. 
+#' @return +#' A binary matrix of dimension \code{choose(N+m-1, m) x N}, where entry +#' \eqn{(s, k) = 1} if unit \eqn{k} appears in the \eqn{s}-th outcome of +#' the with-replacement support, and 0 otherwise. +#' @details +#' The with-replacement support is enumerated via \code{\link{SupportWR}}. +#' This function is intended for small populations and few draws only, as the +#' support grows rapidly with \code{N} and \code{m}. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. Keep small due to combinatorial growth. +#' @param m Number of draws (sample size with replacement). +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{Ik}}, \code{\link{SupportWR}}, \code{\link{nk}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' m <- 2 +#' # The sample membership matrix for with-replacement sampling +#' IkWR(N, m) -IkWR <- function(N, m) -{ -Q <- SupportWR(N, m, ID = FALSE) -I <- matrix(0, choose(N+m-1, m), N) -for (i in 1:m) { -for (j in 1:choose(N+m-1, m)) { -for (k in 1:N) { -if (Q[j, i] == k) -I[j, k] <- 1 -} -} -} -I +IkWR <- function(N, m) { + Q <- SupportWR(N, m, ID = FALSE) + I <- matrix(0, choose(N + m - 1, m), N) + for (i in 1:m) { + for (j in 1:choose(N + m - 1, m)) { + for (k in 1:N) { + if (Q[j, i] == k) + I[j, k] <- 1 + } + } + } + I } \ No newline at end of file diff --git a/R/OrderWR.r b/R/OrderWR.r index 1e925d7..ab120ba 100644 --- a/R/OrderWR.r +++ b/R/OrderWR.r @@ -1,46 +1,83 @@ #' @export +#' +#' @title +#' Ordered With-Replacement Sampling Support +#' @description +#' Enumerates all ordered sequences of \code{m} draws from a population of +#' size \code{N} with replacement. 
Unlike \code{\link{SupportWR}}, this +#' function considers order, so sequences that differ only in draw order are +#' treated as distinct outcomes. +#' @return +#' A matrix with \code{N^m} rows and \code{m} columns, where each row is one +#' ordered sequence of draws. If \code{ID} is provided, population labels are +#' substituted for indices. +#' @details +#' The total number of ordered with-replacement sequences of size \code{m} +#' from \code{N} units is \eqn{N^m}. This grows rapidly and the function +#' should only be used for small \code{N} and \code{m}. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param m Number of draws. +#' @param ID Optional vector of population labels of length \code{N}. +#' If provided, labels are substituted for integer indices in the output. +#' If \code{FALSE} (default), integer indices are returned. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{SupportWR}}, \code{\link{IkWR}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' # Five possible ordered samples of size m=1 +#' OrderWR(N, 1) +#' OrderWR(N, 1, ID = U) +#' # 25 possible ordered samples of size m=2 +#' OrderWR(N, 2) +#' OrderWR(N, 2, ID = U) +#' # Note: ordered samples differ from unordered (SupportWR) +#' OrderWR(N, 2) +#' SupportWR(N, 2) -OrderWR<-function(N,m,ID=FALSE){ -b<-c(1:N) -grilla<-function(a){ -A<-seq(1:length(a)) -unoA <-rep(1,length(A)) -B<-seq(1:length(a)) -unoB <-rep(1,length(B)) -P1<-kronecker(A,unoB) -P2<-kronecker(unoA,B) -grid<-matrix(cbind(P1,P2),ncol=2) -return(grid) -} - -if(m==1){ -sam<-as.matrix(b) -} - -if(m==2){ -sam<-grilla(b) -} - -if(m>2){ -sam<-grilla(b) -for(l in 3:m){ -Sam1<-rep(0,l) -for(j in 1:dim(sam)[1]){ -for(k in 1:length(b)){ -Sam1<-rbind(Sam1,c(sam[j,],b[k])) -} } -sam<-Sam1[-1,] -} -} -if (is.logical(ID) == TRUE){return(sam)} -else{ -a<-dim(sam) -val<-matrix(NA,a[1],a[2]) -for(ii in 1:(dim(val)[1])){ -for(jj in 1:(dim(val)[2])){ -val[ii,jj]<-ID[sam[ii,jj]] -} -} -return(val) -} +OrderWR <- function(N, m, ID = FALSE) { + b <- c(1:N) + grilla <- function(a) { + A <- seq(1:length(a)) + unoA <- rep(1, length(A)) + B <- seq(1:length(a)) + unoB <- rep(1, length(B)) + P1 <- kronecker(A, unoB) + P2 <- kronecker(unoA, B) + grid <- matrix(cbind(P1, P2), ncol = 2) + return(grid) + } + if (m == 1) sam <- as.matrix(b) + if (m == 2) sam <- grilla(b) + if (m > 2) { + sam <- grilla(b) + for (l in 3:m) { + Sam1 <- rep(0, l) + for (j in 1:dim(sam)[1]) { + for (k in 1:length(b)) { + Sam1 <- rbind(Sam1, c(sam[j, ], b[k])) + } + } + sam <- Sam1[-1, ] + } + } + if (is.logical(ID) == TRUE) return(sam) + else { + a <- dim(sam) + val <- matrix(NA, a[1], a[2]) + for (ii in 1:(dim(val)[1])) { + for (jj in 1:(dim(val)[2])) { + val[ii, jj] <- ID[sam[ii, jj]] + } + } + return(val) + } } \ No newline at end of file diff --git a/R/Pik.r b/R/Pik.r index 
d3f4d39..4f281d9 100644 --- a/R/Pik.r +++ b/R/Pik.r @@ -1,7 +1,50 @@ #' @export +#' +#' @title +#' First-Order Inclusion Probabilities from a Sampling Design +#' @description +#' Computes the first-order inclusion probabilities for each unit in a finite +#' population, given the probability of each possible sample and the indicator +#' matrix of the sampling support. +#' @return +#' A row vector (1 x N matrix) of first-order inclusion probabilities +#' \eqn{\pi_k = P(k \in s)} for each unit \eqn{k} in the population. +#' @details +#' The inclusion probability of unit \eqn{k} is computed as the sum of the +#' probabilities of all samples that contain unit \eqn{k}: +#' \deqn{\pi_k = \sum_{s \ni k} p(s)} +#' The indicator matrix \code{Ind} (output of \code{\link{Ik}}) has one row +#' per possible sample and one column per population unit, with entry 1 if +#' unit \eqn{k} is in sample \eqn{s} and 0 otherwise. +#' @author Hugo Andres Gutierrez Rojas +#' @param p Vector of probabilities for each possible sample in the support. +#' Must sum to 1. +#' @param Ind Indicator matrix of the sampling support, as returned by +#' \code{\link{Ik}}. Rows are samples, columns are population units. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{Ik}}, \code{\link{Pikl}}, \code{\link{PikPPS}} +#' +#' @examples +#' # Population of size N = 5, sample size n = 2 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' n <- 2 +#' # Sample probabilities (one per possible sample) +#' p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) +#' Ind <- Ik(N, n) +#' pik <- Pik(p, Ind) +#' pik +#' # Check: inclusion probabilities sum to n +#' sum(pik) -Pik <- function(p, Ind){ -multip <- p*Ind -pik <- colSums(multip) -t(pik) +Pik <- function(p, Ind) { + multip <- p * Ind + pik <- colSums(multip) + t(pik) } \ No newline at end of file diff --git a/R/PikHol.r b/R/PikHol.r index 469bdab..5d3de94 100644 --- a/R/PikHol.r +++ b/R/PikHol.r @@ -1,18 +1,87 @@ #' @export +#' +#' @title +#' Optimal Inclusion Probabilities for Multiple Surveys (Holmberg) +#' @description +#' Computes optimal first-order inclusion probabilities for a population that +#' is surveyed on multiple occasions, minimising a measure of total variance +#' across surveys. This implements the approach of Holmberg (2002) for +#' coordinated sampling over time. +#' @return +#' A numeric vector of length \code{N} with the optimal inclusion probability +#' for each unit in the population. +#' @details +#' For each survey \eqn{k}, the initial inclusion probabilities are computed +#' via \code{\link{PikPPS}}. An optimal composite size measure is then derived +#' by combining the per-survey auxiliary variables through a weighted sum, and +#' the final inclusion probabilities are computed proportional to the square +#' root of this composite. The resulting sample size \code{n.st} is chosen to +#' minimise total variance subject to a relative precision target \code{e}. +#' @author Hugo Andres Gutierrez Rojas +#' @param n Integer vector of length \code{p} with the desired sample size +#' for each of the \code{p} surveys. 
+#' @param sigma Matrix of dimension \code{N x p} where column \eqn{k} contains +#' the auxiliary size variable for survey \eqn{k}. +#' @param e Scalar. Relative tolerance parameter controlling the precision +#' target across surveys. +#' @param Pi Optional matrix of dimension \code{N x p} with initial inclusion +#' probabilities for each survey. If omitted, \code{\link{PikPPS}} is used. +#' +#' @references +#' Holmberg, A. (2002). A multiparameter perspective on the choice of sampling +#' design in surveys. \emph{Statistics in Transition}, 5(6), 969-994.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{PikPPS}}, \code{\link{PikSTPPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' data(Lucy) +#' attach(Lucy) +#' N <- dim(Lucy)[1] +#' n <- c(350, 400) +#' sigy1 <- sqrt(Income^(1)) +#' sigy2 <- sqrt(Income^(2)) +#' sigma <- cbind(sigy1, sigy2) +#' Piks <- PikHol(n, sigma, 0.03) +#' n.opt <- round(sum(Piks)) +#' res <- S.piPS(n.opt, Piks) +#' sam <- res[, 1] +#' Pik.s <- res[, 2] +#' estima <- data.frame(Lucy$Income[sam], Lucy$Employees[sam]) +#' E.piPS(estima, Pik.s) +#' ############ +#' ## Example 2 - with custom inclusion probabilities +#' ############ +#' data(Lucy) +#' attach(Lucy) +#' N <- dim(Lucy)[1] +#' n <- c(350, 400) +#' sigy1 <- sqrt(Income^(1)) +#' sigy2 <- sqrt(Income^(2)) +#' sigma <- cbind(sigy1, sigy2) +#' pikas <- cbind(rep(400/N, N), rep(400/N, N)) +#' Piks <- PikHol(n, sigma, 0.03, pikas) +#' round(sum(Piks)) -PikHol <- function(n, sigma, e, Pi = PiDefault){ +PikHol <- function(n, sigma, e, Pi = NULL) { N <- dim(sigma)[1] p <- length(n) - PiDefault <- matrix(NA, nrow = N, ncol = p) - A <- matrix(NA, nrow = N, ncol = p) - for (k in 1:p) { - PiDefault[,k] <- PikPPS(n[k], sigma[,k]) + if (is.null(Pi)) { + Pi <- matrix(NA, nrow = N, ncol = p) + for (k in 1:p) { + Pi[, k] <- PikPPS(n[k], sigma[, 
k]) + } } + A <- matrix(NA, nrow = N, ncol = p) for (k in 1:p) { - A[,k] <- sigma[,k] ^ 2/(sum(((1 / Pi[,k]) - 1)*sigma[,k] ^ 2)) + A[, k] <- sigma[, k]^2/(sum(((1/Pi[, k]) - 1) * sigma[, k]^2)) } - aqk <- rowSums(A) - n.st <- ceiling(((sum(sqrt(aqk))) ^ 2)/((1 + e) * p + (sum(aqk)))) + aqk <- rowSums(A) + n.st <- ceiling(((sum(sqrt(aqk)))^2)/((1 + e) * p + (sum(aqk)))) pikopt <- PikPPS(n.st, sqrt(aqk)) return(pikopt) -} +} \ No newline at end of file diff --git a/R/PikPPS.r b/R/PikPPS.r index 0ebb695..0cb0b3f 100644 --- a/R/PikPPS.r +++ b/R/PikPPS.r @@ -1,15 +1,88 @@ #' @export +#' +#' @title +#' Inclusion Probabilities Proportional to Size +#' @description +#' Computes first-order inclusion probabilities proportional to an auxiliary +#' size variable \code{x} for a without-replacement sample of size \code{n}. +#' A sequential truncation algorithm ensures all probabilities are at most 1. +#' @return +#' A numeric vector of length \code{N} with the first-order inclusion +#' probability for each unit in the population. Values are in \code{(0, 1]}. +#' @details +#' The initial probabilities \eqn{\pi_k = n x_k / \sum x} may exceed 1 for +#' large units. The algorithm iteratively sets those probabilities to 1 and +#' redistributes the remaining sample size among the other units until all +#' probabilities are valid. The result satisfies \eqn{\sum \pi_k = n}. +#' @author Hugo Andres Gutierrez Rojas +#' @param n Desired sample size. +#' @param x Vector of length \code{N} with positive auxiliary size values +#' for each unit in the population. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{S.piPS}}, \code{\link{PikSTPPS}}, \code{\link{PikHol}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' x <- c(30,41,50,170,43,200) +#' n <- 3 +#' # Two elements yield values bigger than one +#' n*x/sum(x) +#' # With this function, all of the values are between zero and one +#' PikPPS(n,x) +#' # The sum is equal to the sample size +#' sum(PikPPS(n,x)) +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # The auxiliary information +#' x <- c(52, 60, 75, 100, 50) +#' # Gives the inclusion probabilities for the population according to a +#' # proportional to size design without replacement of size n=4 +#' pik <- PikPPS(4,x) +#' pik +#' # The sum of the inclusion probabilities is equal to the sample size +#' sum(pik) +#' +#' ############ +#' ## Example 3 +#' ############ +#' # Uses the Lucy data to compute the vector of inclusion probabilities +#' # according to a piPS without replacement design +#' data(Lucy) +#' attach(Lucy) +#' # The sample size +#' n=400 +#' # The selection probability of each unit is proportional to the variable Income +#' pik <- PikPPS(n,Income) +#' # The inclusion probabilities of the units in the population +#' pik +#' # The sum of the values in pik is equal to the sample size +#' sum(pik) +#' # According to the design some elements must be selected +#' # They are called forced inclusion units +#' which(pik==1) -PikPPS<-function(n,x){ -pik<- n*x/sum(x) -while((sum(pik>1))!=0){ -s<-which(pik>=1) -new=(1:length(pik))[-s] -pik[s]=1 -txnew<-sum(x[s]) -for(k in new){ -pik[k]<- (n-length(s))*x[k]/(sum(x)-txnew) -} -} -pik +PikPPS <- function(n, x) { + pik <- n * x/sum(x) + while ((sum(pik > 1)) != 0) { + s <- which(pik >= 1) + new <- (1:length(pik))[-s] + pik[s] <- 1 + txnew <- sum(x[s]) + for (k in new) { + pik[k] <- (n - length(s)) * x[k]/(sum(x) - txnew) + } + } + pik } \ No newline at end of file diff --git a/R/Pikl.r 
b/R/Pikl.r index 77892b1..fbb3a45 100644 --- a/R/Pikl.r +++ b/R/Pikl.r @@ -1,20 +1,53 @@ #' @export +#' +#' @title +#' Second-Order Inclusion Probabilities +#' @description +#' Computes the matrix of second-order inclusion probabilities +#' \eqn{\pi_{kl} = P(k \in s \text{ and } l \in s)} for all pairs of units +#' in a finite population of size \code{N} under a fixed-size sampling design. +#' @return +#' An \code{N x N} matrix where entry \eqn{(k, l)} is the probability that +#' both units \eqn{k} and \eqn{l} are included in the same sample. Diagonal +#' entries \eqn{(k,k)} equal the first-order inclusion probability \eqn{\pi_k}. +#' @details +#' The second-order inclusion probabilities are needed to compute the exact +#' Horvitz-Thompson variance estimator and the Sen-Yates-Grundy variance +#' estimator. This function enumerates the full sampling support via +#' \code{\link{Ik}} and is therefore only feasible for small populations +#' (\code{N <= 15}). +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. Keep small (recommended \code{N <= 15}) due to +#' the combinatorial enumeration of all possible samples. +#' @param n Sample size. +#' @param p Vector of probabilities for each possible sample in the support. +#' Must sum to 1. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{Pik}}, \code{\link{Deltakl}}, \code{\link{VarHT}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' n <- 2 +#' p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) +#' sum(p) +#' # Second-order inclusion probabilities +#' Pikl(N, n, p) -Pikl <- function(N,n,p){ -# The support -Sam <- Ik(N,n) -# Two columns for index k and index l -Ind <- OrderWR(N,2) -# Creation of the indicator vectors k and l -K <- matrix(c(Sam[,Ind]),ncol=2) -L <- t(t(K[,1])*K[,2]) -# Vectors of indicators k and l -# The first column is I11, the second is I12, etc.. -Ikl <- matrix(c(L),ncol=nrow(Ind)) -M <- p*Ikl -#Sum of the probabilities by column -O <- apply(M,2,sum) -# Creation of the matrix Pikl -P <- matrix(c(O),ncol=N) -return(P) -} +Pikl <- function(N, n, p) { + Sam <- Ik(N, n) + Ind <- OrderWR(N, 2) + K <- matrix(c(Sam[, Ind]), ncol = 2) + L <- t(t(K[, 1]) * K[, 2]) + Ikl <- matrix(c(L), ncol = nrow(Ind)) + M <- p * Ikl + O <- apply(M, 2, sum) + P <- matrix(c(O), ncol = N) + return(P) +} \ No newline at end of file diff --git a/R/S.BE.r b/R/S.BE.r index 743fbf3..92686ba 100644 --- a/R/S.BE.r +++ b/R/S.BE.r @@ -1,11 +1,65 @@ #' @export +#' +#' @title +#' Bernoulli Sampling +#' @description +#' Draws a Bernoulli sample from a finite population of size \code{N}. +#' Each unit is independently selected with the same inclusion probability +#' \code{prob}. +#' @return +#' A vector of length \code{N} where selected units contain their population +#' index and non-selected units contain \code{0}. +#' @details +#' The sample size under Bernoulli sampling is random, following a +#' Binomial(\code{N}, \code{prob}) distribution. To extract the selected +#' indices, use \code{sam[sam != 0]}. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param prob Scalar. Inclusion probability, must satisfy \code{0 < prob <= 1}. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. 
(1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{E.BE}}, \code{\link{S.PO}}, \code{\link{S.SI}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Draws a Bernoulli sample without replacement of expected size n=3 +#' # The inclusion probability is 0.6 for each unit in the population +#' sam <- S.BE(5,0.6) +#' sam +#' # The selected sample is +#' U[sam] +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a Bernoulli sample +#' +#' data(Lucy) +#' attach(Lucy) +#' N <- dim(Lucy)[1] +#' # The population size is 2396. If the expected sample size is 400 +#' # then, the inclusion probability must be 400/2396=0.1669 +#' sam <- S.BE(N,0.1669) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' data +#' dim(data) -S.BE<-function(N,prob){ -sam<-matrix(0,N,1) -U<-runif(N) -for(k in 1:N){ -if(U[k]<=prob) -sam[k]<-k - } -return(sam) +S.BE <- function(N, prob) { + sam <- matrix(0, N, 1) + U <- runif(N) + for (k in 1:N) { + if (U[k] <= prob) + sam[k] <- k + } + return(sam) } \ No newline at end of file diff --git a/R/S.PO.r b/R/S.PO.r index 3819c16..6bb7b84 100644 --- a/R/S.PO.r +++ b/R/S.PO.r @@ -1,11 +1,69 @@ #' @export +#' +#' @title +#' Poisson Sampling +#' @description +#' Draws a Poisson sample from a finite population of size \code{N}. +#' Each unit \eqn{k} is independently selected with its own inclusion +#' probability \eqn{\pi_k}. +#' @return +#' A vector of length \code{N} where selected units contain their population +#' index and non-selected units contain \code{0}. 
+#' @details +#' Poisson sampling is a generalisation of Bernoulli sampling that allows +#' unequal inclusion probabilities. The sample size is random. To extract +#' the selected indices, use \code{sam[sam != 0]}. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param Pik Vector of length \code{N} containing the first-order inclusion +#' probability for each unit in the population. Values must be in \code{(0, 1]}. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{E.PO}}, \code{\link{PikPPS}}, \code{\link{S.piPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Draws a Poisson sample without replacement of expected size sum(Pik) = 3.1 +#' # "Erik" is drawn in every possible sample because its inclusion probability is one +#' Pik <- c(0.5, 0.2, 1, 0.9, 0.5) +#' sam <- S.PO(5,Pik) +#' sam +#' # The selected sample is +#' U[sam] +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a Poisson sample +#' data(Lucy) +#' attach(Lucy) +#' N <- dim(Lucy)[1] +#' n <- 400 +#' Pik<-n*Income/sum(Income) +#' # No element of Pik is bigger than one +#' which(Pik>1) +#' # The selected sample +#' sam <- S.PO(N,Pik) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' data +#' dim(data) -S.PO<-function(N,Pik){ -sam<-matrix(0,N,1) -U<-runif(N) -for(k in 1:N){ -if(U[k]<=Pik[k]) -sam[k]<-k - } -return(sam) +S.PO <- function(N, Pik) { + sam <- matrix(0, N, 1) + U <- runif(N) + for (k in 1:N) { + if (U[k] <= Pik[k]) + sam[k] <- k + } + return(sam) } \ No newline at end of file diff 
--git a/R/S.PPS.r b/R/S.PPS.r index 157011d..3970694 100644 --- a/R/S.PPS.r +++ b/R/S.PPS.r @@ -1,14 +1,74 @@ #' @export +#' +#' @title +#' Probability Proportional to Size With-Replacement Sampling +#' @description +#' Draws a with-replacement sample of size \code{m} from a finite population +#' using probabilities proportional to an auxiliary size variable \code{x}. +#' @return +#' A matrix with \code{m} rows and two columns: +#' \itemize{ +#' \item Column 1 (\code{sam}): population indices of the selected units. +#' \item Column 2 (\code{pk}): selection probability of each draw. +#' } +#' @details +#' At each draw, unit \eqn{k} is selected with probability +#' \eqn{p_k = x_k / \sum x}. Since sampling is with replacement, the same +#' unit may appear more than once. Use \code{\link{E.PPS}} or \code{\link{HH}} +#' to estimate population totals from this sample. +#' @author Hugo Andres Gutierrez Rojas +#' @param m Number of draws (sample size with replacement). +#' @param x Vector of length \code{N} containing positive auxiliary size +#' values for each unit in the population. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{E.PPS}}, \code{\link{HH}}, \code{\link{S.piPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # The auxiliary information +#' x <- c(52, 60, 75, 100, 50) +#' # Draws a PPS sample with replacement of size m=3 +#' res <- S.PPS(3,x) +#' sam <- res[,1] +#' # The selected sample is +#' U[sam] +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a random sample according to a +#' # PPS with replacement design +#' data(Lucy) +#' attach(Lucy) +#' # The selection probability of each unit is proportional to the variable Income +#' m <- 400 +#' res<-S.PPS(400,Income) +#' # The selected sample +#' sam <- res[,1] +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' data +#' dim(data) -S.PPS<-function(m,x){ -N<-length(x) -pk<-x/sum(x) -cumpk<-cumsum(pk) -U<-runif(m) -ints<-cbind(c(0,cumpk[-N]),cumpk) -sam<-rep(0,m) -for(i in 1:m){ - sam[i]<-which(U[i]>ints[,1] & U[i] ints[, 1] & U[i] < ints[, 2]) + } + return(cbind(sam, pk[sam])) +} \ No newline at end of file diff --git a/R/S.SI.r b/R/S.SI.r index 52e6b7e..d0f0e70 100644 --- a/R/S.SI.r +++ b/R/S.SI.r @@ -1,15 +1,72 @@ #' @export +#' +#' @title +#' Simple Random Sampling Without Replacement +#' @description +#' Draws a simple random sample of size \code{n} without replacement from a +#' finite population of size \code{N} using the sequential algorithm of +#' Fan, Muller and Rezucha (1962). +#' @return +#' A vector of length \code{N} where selected units contain their population +#' index and non-selected units contain \code{0}. +#' @details +#' The sequential algorithm selects units one at a time by comparing a uniform +#' random variate with the conditional inclusion probability at each step, +#' ensuring exactly \code{n} units are selected. 
To extract the selected +#' indices, filter out the zeros: \code{sam[sam != 0]}. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param n Sample size. Must satisfy \code{n <= N}. +#' @param e Optional vector of \code{N} uniform random variates in \code{(0,1)}. +#' If omitted, \code{runif(N)} is used. Useful for reproducibility or +#' coordinated sampling. +#' +#' @references +#' Fan, C.T., Muller, M.E. and Rezucha, I. (1962). Development of sampling +#' plans by using sequential (item by item) selection techniques and digital +#' computers. \emph{Journal of the American Statistical Association}, +#' 57(298), 387-402.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{E.SI}}, \code{\link{S.STSI}}, \code{\link{S.SY}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Fixes the random numbers in order to select a sample +#' e <- c(0.4938, 0.7044, 0.4585, 0.6747, 0.0640) +#' # Draws a simple random sample without replacement of size n=3 +#' sam <- S.SI(5, 3, e) +#' sam +#' # The selected sample is +#' U[sam] +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a random sample according to a SI design +#' data(Lucy) +#' attach(Lucy) +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- S.SI(N, n) +#' # The information about the units in the sample +#' data <- Lucy[sam, ] +#' dim(data) -S.SI<-function(N,n,e=runif(N)) -{ -c<-matrix(0,N,1) -dec<-matrix(0,N,1) -sam<-matrix(0,N,1) -for(k in 1:N){ - c[k]<-(n-dec[k])/(N-k+1) - if(e[k] +#' @param S Vector of length \code{N} identifying the stratum membership of +#' each unit in the population. +#' @param x Vector of length \code{N} containing positive auxiliary size +#' values for each unit in the population. 
+#' @param mh Integer vector of length \code{H} specifying the number of +#' draws within each stratum. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{S.PPS}}, \code{\link{S.STpiPS}}, \code{\link{E.STPPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # The auxiliary information +#' x <- c(52, 60, 75, 100, 50) +#' # Vector Strata contains an indicator variable of stratum membership +#' Strata <- c("A", "A", "A", "B", "B") +#' # Then sample size in each stratum +#' mh <- c(2,2) +#' # Draws a stratified PPS sample with replacement of size n=4 +#' res <- S.STPPS(Strata, x, mh) +#' # The selected sample +#' sam <- res[,1] +#' U[sam] +#' # The selection probability of each unit selected to be in the sample +#' pk <- res[,2] +#' pk +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a stratified random sample +#' # according to a PPS design in each stratum +#' +#' data(Lucy) +#' attach(Lucy) +#' # Level is the stratifying variable +#' summary(Level) +#' # Defines the sample size at each stratum +#' m1<-70 +#' m2<-100 +#' m3<-200 +#' mh<-c(m1,m2,m3) +#' # Draws a stratified sample +#' res<-S.STPPS(Level, Income, mh) +#' # The selected sample +#' sam<-res[,1] +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' data +#' dim(data) +#' # The selection probability of each unit selected in the sample +#' pk <- res[,2] +#' pk -S.STPPS<-function(S,x,mh) -{ -S<-as.factor(S) -S<-as.factor(as.integer(S)) -cum<-cumsum(mh) -sam<-matrix(0,sum(mh)) -pk<-matrix(0,sum(mh)) - -for(k in 1: 
length(mh)) -{ -h<-which(S==k) -Nh<-length(x[h]) -pkh<-x[h]/sum(x[h]) -cumpk<-cumsum(pkh) -U<-runif(mh[k]) -ints<-cbind(c(0,cumpk[-Nh]),cumpk) -sam.h<-rep(0,mh[k]) -pk.h<-rep(0,mh[k]) - -for(i in 1:mh[k]){ - sam.h[i]<-which(U[i]>ints[,1] & U[i]1){ -sam[(cum[k-1]+1):(cum[k])]<-h[sam.h] -pk[(cum[k-1]+1):(cum[k])]<-pk.h -} - -} -total<-data.frame(sam,pk) -total +S.STPPS <- function(S, x, mh) { + S <- as.factor(S) + S <- as.factor(as.integer(S)) + cum <- cumsum(mh) + sam <- matrix(0, sum(mh)) + pk <- matrix(0, sum(mh)) + for (k in 1:length(mh)) { + h <- which(S == k) + Nh <- length(x[h]) + pkh <- x[h]/sum(x[h]) + cumpk <- cumsum(pkh) + U <- runif(mh[k]) + ints <- cbind(c(0, cumpk[-Nh]), cumpk) + sam.h <- rep(0, mh[k]) + for (i in 1:mh[k]) { + sam.h[i] <- which(U[i] > ints[, 1] & U[i] < ints[, 2]) + } + pk.h <- pkh[sam.h] + if (k == 1) { + sam[1:mh[k]] <- h[sam.h] + pk[1:mh[k]] <- pk.h + } + if (k > 1) { + sam[(cum[k-1]+1):(cum[k])] <- h[sam.h] + pk[(cum[k-1]+1):(cum[k])] <- pk.h + } + } + data.frame(sam, pk) } \ No newline at end of file diff --git a/R/S.STSI.r b/R/S.STSI.r index b2b366a..f1b7ba3 100644 --- a/R/S.STSI.r +++ b/R/S.STSI.r @@ -1,21 +1,70 @@ #' @export +#' +#' @title +#' Stratified Simple Random Sampling Without Replacement +#' @description +#' Draws a stratified simple random sample without replacement from a finite +#' population. Within each stratum, units are selected by simple random +#' sampling without replacement. +#' @return +#' A sorted vector of population indices of the selected units, of length +#' \code{sum(nh)}. +#' @details +#' The function selects \code{nh[h]} units from stratum \eqn{h} using +#' \code{base::sample}, and returns all selected indices sorted in ascending +#' order. Use \code{\link{E.STSI}} to estimate population totals from this +#' sample. +#' @author Hugo Andres Gutierrez Rojas +#' @param S Vector of length \code{N} identifying the stratum membership of +#' each unit in the population. 
+#' @param Nh Integer vector of length \code{H} with the population size of +#' each stratum. +#' @param nh Integer vector of length \code{H} with the sample size of each +#' stratum. Must satisfy \code{nh[h] <= Nh[h]} for all \code{h}. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{E.STSI}}, \code{\link{S.SI}}, \code{\link{S.STpiPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' Strata <- c("A", "A", "A", "B", "B") +#' Nh <- c(3, 2) +#' nh <- c(2, 1) +#' sam <- S.STSI(Strata, Nh, nh) +#' sam +#' U[sam] +#' ############ +#' ## Example 2 +#' ############ +#' data(Lucy) +#' attach(Lucy) +#' N1 <- summary(Level)[[1]] +#' N2 <- summary(Level)[[2]] +#' N3 <- summary(Level)[[3]] +#' Nh <- c(N1, N2, N3) +#' nh <- c(70, 100, 200) +#' sam <- S.STSI(Level, Nh, nh) +#' data <- Lucy[sam, ] +#' dim(data) -S.STSI<-function(S,Nh,nh) -{ -S<-as.factor(S) -S<-as.factor(as.integer(S)) -cum<-cumsum(nh) -sam<-matrix(0,sum(nh)) -for(k in 1: length(nh)){ -h<-which(S==k) -sam.h<-sample(Nh[k],nh[k]) -if(k==1){ -sam[1:nh[k]]<-h[sam.h] -} -if(k>1){ -sam[(cum[k-1]+1):(cum[k])]<-h[sam.h] -} -} -sort(sam) -} - +S.STSI <- function(S, Nh, nh) { + S <- as.factor(S) + S <- as.factor(as.integer(S)) + cum <- cumsum(nh) + sam <- matrix(0, sum(nh)) + for (k in 1:length(nh)) { + h <- which(S == k) + sam.h <- sample(Nh[k], nh[k]) + if (k == 1) sam[1:nh[k]] <- h[sam.h] + if (k > 1) sam[(cum[k-1]+1):(cum[k])] <- h[sam.h] + } + sort(sam) +} \ No newline at end of file diff --git a/R/S.STpiPS.R b/R/S.STpiPS.R index 5eb33ac..f9a7160 100644 --- a/R/S.STpiPS.R +++ b/R/S.STpiPS.R @@ -1,25 +1,84 @@ #' @export +#' +#' @title +#' Stratified Probability Proportional to Size 
Without-Replacement Sampling +#' @description +#' Draws a stratified sample where within each stratum units are selected +#' using a probability proportional to size without-replacement (piPS) design. +#' @return +#' A matrix with \code{sum(nh)} rows and two columns, sorted by population +#' index: +#' \itemize{ +#' \item Column 1: population indices of the selected units. +#' \item Column 2: first-order inclusion probabilities of the selected units. +#' } +#' @details +#' Within each stratum \eqn{h}, the function calls \code{\link{S.piPS}} to +#' draw \code{nh[h]} units with probabilities proportional to \code{x}. +#' The global population indices are preserved in the output. +#' @author Hugo Andres Gutierrez Rojas +#' @param S Vector of length \code{N} identifying the stratum membership of +#' each unit in the population. +#' @param x Vector of length \code{N} containing positive auxiliary size +#' values for each unit in the population. +#' @param nh Integer vector of length \code{H} specifying the sample size +#' within each stratum. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{S.piPS}}, \code{\link{S.STSI}}, \code{\link{E.STpiPS}}, +#' \code{\link{PikSTPPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' x <- c(52, 60, 75, 100, 50) +#' Strata <- c("A", "A", "A", "B", "B") +#' nh <- c(2, 2) +#' res <- S.STpiPS(Strata, x, nh) +#' sam <- res[, 1] +#' U[sam] +#' pik <- res[, 2] +#' pik +#' ############ +#' ## Example 2 +#' ############ +#' data(Lucy) +#' attach(Lucy) +#' N1 <- summary(Level)[[1]] +#' N2 <- summary(Level)[[2]] +#' N3 <- summary(Level)[[3]] +#' nh <- c(70, 100, 200) +#' res <- S.STpiPS(Level, Employees, nh) +#' sam <- res[, 1] +#' data <- Lucy[sam, ] +#' dim(data) +#' pik <- res[, 2] -S.STpiPS<-function(S, x, nh) -{ - S<-as.factor(S) - S<-as.factor(as.integer(S)) - res<-matrix(NA, nrow = sum(nh), ncol=2) - cum<-cumsum(nh) - - for(k in 1: length(nh)){ - h <- which(S==k) +S.STpiPS <- function(S, x, nh) { + S <- as.factor(S) + S <- as.factor(as.integer(S)) + res <- matrix(NA, nrow = sum(nh), ncol = 2) + cum <- cumsum(nh) + for (k in 1:length(nh)) { + h <- which(S == k) res.h <- S.piPS(nh[k], x[h]) - sam.h <- res.h[,1] - pik.h <- res.h[,2] - if(k==1){ - res[1:nh[k],1]<-h[sam.h] - res[1:nh[k],2]<-pik.h + sam.h <- res.h[, 1] + pik.h <- res.h[, 2] + if (k == 1) { + res[1:nh[k], 1] <- h[sam.h] + res[1:nh[k], 2] <- pik.h } - if(k>1){ - res[(cum[k-1]+1):(cum[k]),1]<-h[sam.h] - res[(cum[k-1]+1):(cum[k]),2]<-pik.h + if (k > 1) { + res[(cum[k-1]+1):(cum[k]), 1] <- h[sam.h] + res[(cum[k-1]+1):(cum[k]), 2] <- pik.h } } res[order(res[, 1]), ] -} +} \ No newline at end of file diff --git a/R/S.SY.r b/R/S.SY.r index 57a4f24..6130d48 100644 --- a/R/S.SY.r +++ b/R/S.SY.r @@ -1,16 +1,72 @@ #' @export +#' +#' @title +#' Systematic Sampling +#' @description +#' Draws a systematic sample from a finite population of size \code{N} using +#' a fixed sampling interval \code{a}. 
A random start \code{r} is chosen +#' uniformly from \code{1} to \code{a}, and every \code{a}-th unit thereafter +#' is selected. +#' @return +#' A vector containing the population indices of the selected units. +#' @details +#' The random start \code{r} is drawn from \code{sample(a, 1)}, and then +#' units \eqn{r, r+a, r+2a, \ldots} are selected. If \code{N} is not a +#' multiple of \code{a}, the sample size varies by one unit depending on the +#' random start. Use \code{\link{E.SY}} to estimate population totals. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param a Sampling interval (skip). The expected sample size is +#' approximately \code{N/a}. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{E.SY}}, \code{\link{S.SI}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # The population of size N=5 is divided in a=2 groups +#' # Draws a Systematic sample. 
+#' sam <- S.SY(5,2) +#' sam +#' # The selected sample is +#' U[sam] +#' # There are only two possible samples +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a Systematic sample +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' # The population is divided in 6 groups +#' # The selected sample +#' sam <- S.SY(N,6) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' data +#' dim(data) -S.SY<-function (N, a) -{ - r <- sample(a, 1) - c <- N - a * floor(N/a) - if (r <= c) - n <- floor((N/a)) + 1 - else n <- floor(N/a) - sam <- matrix(0, n, 1) - for (k in 0:n) { - sam[k] <- r + (a * (k - 1)) - } - sam -} - +S.SY <- function(N, a) { + r <- sample(a, 1) + c <- N - a * floor(N/a) + if (r <= c) + n <- floor((N/a)) + 1 + else + n <- floor(N/a) + sam <- matrix(0, n, 1) + for (k in 1:n) { + sam[k] <- r + (a * (k - 1)) + } + sam +} \ No newline at end of file diff --git a/R/S.WR.r b/R/S.WR.r index 4097b64..335dfa8 100644 --- a/R/S.WR.r +++ b/R/S.WR.r @@ -1,22 +1,73 @@ #' @export +#' +#' @title +#' Simple Random Sampling With Replacement +#' @description +#' Draws a simple random sample of size \code{m} with replacement from a +#' finite population of size \code{N}. Returns the index of every draw, so +#' a unit selected more than once is repeated. +#' @return +#' A vector of population indices of length \code{m}, where each element is +#' the index of a selected unit. Units may appear more than once. +#' @details +#' The number of times each unit is selected follows a multinomial +#' distribution with equal probabilities \eqn{1/N}. The function uses a +#' sequential binomial draw approach. Use \code{\link{E.WR}} to estimate +#' population totals. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param m Number of draws (sample size with replacement). +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J.
(1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{E.WR}}, \code{\link{S.SI}}, \code{\link{S.PPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Draws a simple random sample with replacement of size m=3 +#' sam <- S.WR(5,3) +#' sam +#' # The selected sample +#' U[sam] +#' +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data to draw a random sample of units according to a +#' # simple random sampling with replacement design +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' m <- 400 +#' sam<-S.WR(N,m) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' data +#' dim(data) -S.WR<-function(N,m){ -nk<-rep(0,N) - for(k in 1:N){ - suma<-sum(nk) - nk[k]<-rbinom(1,(m-suma),(1/(N-k+1))) - } -x<-which(nk>0) -w<-nk[x] -sam<-rep(x[1],w[1]) - -if(length(x)==1){ -return(sam)} - -if(length(x)>1){ -for(i in 2:length(x)){ -sam<-c(sam,rep(x[i],w[i])) - } -} -sam +S.WR <- function(N, m) { + nk <- rep(0, N) + for (k in 1:N) { + suma <- sum(nk) + nk[k] <- rbinom(1, (m - suma), (1/(N - k + 1))) + } + x <- which(nk > 0) + w <- nk[x] + sam <- rep(x[1], w[1]) + if (length(x) == 1) return(sam) + if (length(x) > 1) { + for (i in 2:length(x)) { + sam <- c(sam, rep(x[i], w[i])) + } + } + sam } \ No newline at end of file diff --git a/R/S.piPS.r b/R/S.piPS.r index d626b3d..b3fc038 100644 --- a/R/S.piPS.r +++ b/R/S.piPS.r @@ -1,36 +1,84 @@ #' @export +#' +#' @title +#' Probability Proportional to Size Without-Replacement Sampling (piPS) +#' @description +#' Draws a without-replacement sample of size \code{n} using a sequential +#' algorithm that produces inclusion
probabilities proportional to an +#' auxiliary size variable \code{x}. +#' @return +#' A matrix with \code{n} rows and two columns: +#' \itemize{ +#' \item Column 1: population indices of the selected units. +#' \item Column 2: first-order inclusion probabilities of the selected units. +#' } +#' @author Hugo Andres Gutierrez Rojas +#' @param n Sample size. +#' @param x Vector of length \code{N} with positive auxiliary size values. +#' @param e Optional vector of \code{N} uniform random variates in \code{(0,1)}. +#' If omitted, \code{runif(N)} is used. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{E.piPS}}, \code{\link{PikPPS}}, \code{\link{S.STPPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' x <- c(52, 60, 75, 100, 50) +#' # Draws a piPS sample without replacement of size n=3 +#' res <- S.piPS(3, x) +#' res +#' sam <- res[, 1] +#' U[sam] +#' ############ +#' ## Example 2 +#' ############ +#' # Uses the Lucy data +#' data(Lucy) +#' attach(Lucy) +#' res <- S.piPS(400, Income) +#' sam <- res[, 1] +#' Pik.s <- res[, 2] +#' data <- Lucy[sam, ] +#' dim(data) -S.piPS <- function (n, x, e = runif(length(x))) { - if(length(x) != 1){ - N <- length(x) - x1 <- sort(x, decreasing = TRUE) +S.piPS <- function(n, x, e = runif(length(x))) { + if (length(x) != 1) { + N <- length(x) + x1 <- sort(x, decreasing = TRUE) Pik <- PikPPS(n, x1) - V <- cumsum(Pik) - nk <- matrix(0, N, 1) - d <- matrix(0, N, 1) - I <- matrix(0, N, 1) + V <- cumsum(Pik) + nk <- matrix(0, N, 1) + d <- matrix(0, N, 1) + I <- matrix(0, N, 1) sam <- matrix(0, N, 1) if (e[1] < Pik[1]) { - I[1] <- 1 + I[1] <- 1 sam[1] <- 1 } for (k in 2:N) { nk[k] <- nk[k - 1] + I[k - 1] - 
d[k] <- Pik[k] * (n - nk[k])/(n - V[k - 1]) + d[k] <- Pik[k] * (n - nk[k])/(n - V[k - 1]) if (e[k] <= d[k]) { - I[k] <- 1 + I[k] <- 1 sam[k] <- cumsum(I[1:(k - 1)])[(k - 1)] + I[k] } } - samp <- rev(order(x))[which(sam != 0)] - Pik1 <- PikPPS(n, x) + samp <- rev(order(x))[which(sam != 0)] + Pik1 <- PikPPS(n, x) Pik.s <- Pik1[samp] return(cbind(samp, Pik.s)) } - - if(length(x) == 1){ + if (length(x) == 1) { Pik.s <- 1 - samp <- 1 + samp <- 1 return(cbind(samp, Pik.s)) } } \ No newline at end of file diff --git a/R/Support.r b/R/Support.r index b41f5ed..9d0b37f 100644 --- a/R/Support.r +++ b/R/Support.r @@ -1,15 +1,54 @@ #' @export #' @import stats +#' +#' @title +#' Sampling Support for Fixed-Size Without-Replacement Designs +#' @description +#' Enumerates all possible samples of size \code{n} from a population of +#' size \code{N}, returning the complete sampling support as a matrix. +#' @return +#' A matrix with \code{choose(N, n)} rows and \code{n} columns. Each row +#' contains the indices (or labels if \code{ID} is provided) of the units +#' in one possible sample. Samples are listed in lexicographic order. +#' @details +#' This function uses a combinatorial algorithm to enumerate all +#' \code{choose(N, n)} subsets of size \code{n} from \eqn{\{1, \ldots, N\}}. +#' It is intended for small populations only. For \code{N > 15} it becomes +#' very slow. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. Recommended \code{N <= 15}. +#' @param n Sample size. +#' @param ID Optional vector of population labels of length \code{N}. +#' If provided, labels replace integer indices in the output. +#' If \code{FALSE} (default), integer indices are returned. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{Ik}}, \code{\link{SupportWR}}, \code{\link{SupportRS}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' n <- 2 +#' # Ten possible samples of size n=2 +#' Support(N, n) +#' # Labeled support +#' Support(N, n, ID = U) +#' # Support showing values of y +#' y <- c(32, 34, 46, 89, 35) +#' Support(N, n, ID = y) Support <- function(N, n, ID = FALSE) { - m <- matrix(0, choose(N, n), n) + m <- matrix(0, choose(N, n), n) sam <- matrix(0, choose(N, n), n) - for (i in 1:n) - { + for (i in 1:n) { a <- 0 t <- i - for (r in 1:choose(N, n)) - { + for (r in 1:choose(N, n)) { a <- a + 1 B <- choose(N - t, n - i) if (a > B) { @@ -19,14 +58,10 @@ Support <- function(N, n, ID = FALSE) { if (t > N - n + i) { t <- m[r, i - 1] + 1 } - m[r, i] <- t + m[r, i] <- t sam[r, i] <- ID[t] } } - if (is.logical(ID) == TRUE) { - return(m) - } - else { - return(sam) - } + if (is.logical(ID) == TRUE) return(m) + else return(sam) } \ No newline at end of file diff --git a/R/SupportRS.r b/R/SupportRS.r index 464aa1b..339658a 100644 --- a/R/SupportRS.r +++ b/R/SupportRS.r @@ -1,15 +1,50 @@ #' @export +#' +#' @title +#' Complete Sampling Support for All Sample Sizes +#' @description +#' Enumerates all possible subsets of a population of size \code{N}, +#' covering all sample sizes from 1 to \code{N}. The result also includes the +#' empty set as the first row. +#' @return +#' A matrix with \eqn{2^N} rows and \code{N} columns. Each row is one subset, +#' with \code{NA} used as padding for subsets smaller than \code{N}. The first +#' row represents the empty set (all \code{NA}). +#' @details +#' This function stacks the outputs of \code{\link{Support}} for all sample +#' sizes \eqn{n = 1, \ldots, N}. It is only feasible for small populations +#' (\code{N <= 10}) due to exponential growth. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. Recommended \code{N <= 10}.
+#' @param ID Optional vector of population labels of length \code{N}. +#' If provided, labels replace integer indices in the output. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{Support}}, \code{\link{IkRS}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' # Complete support for all sample sizes +#' SupportRS(N) +#' # Labeled support +#' SupportRS(N, ID = U) -SupportRS <- function(N, ID=FALSE){ -sam <- matrix(NA, ncol=N, nrow=1) -for(k in 1:N){ -sam<-rbind(sam, cbind(Support(N,k),matrix(NA,ncol=N-k, nrow=choose(N,k)))) -} -if (is.logical(ID) == TRUE){ -return(sam) -} -else{ -sam<-matrix(ID[SupportRS(N)],nrow=2^N) -return(sam) -} +SupportRS <- function(N, ID = FALSE) { + sam <- matrix(NA, ncol = N, nrow = 1) + for (k in 1:N) { + sam <- rbind(sam, + cbind(Support(N, k), + matrix(NA, ncol = N - k, nrow = choose(N, k)))) + } + if (is.logical(ID) == TRUE) return(sam) + else { + sam <- matrix(ID[SupportRS(N)], nrow = 2^N) + return(sam) + } } \ No newline at end of file diff --git a/R/SupportWR.r b/R/SupportWR.r index 67f93bb..458a1de 100644 --- a/R/SupportWR.r +++ b/R/SupportWR.r @@ -1,36 +1,71 @@ #' @export +#' +#' @title +#' Sampling Support for With-Replacement Designs +#' @description +#' Enumerates all distinct unordered outcomes (multisets) of size \code{m} +#' drawn with replacement from a population of size \code{N}. +#' @return +#' A matrix with \code{choose(N+m-1, m)} rows and \code{m} columns. Each +#' row contains the (sorted) indices of one possible unordered outcome. +#' If \code{ID} is provided, population labels replace indices. 
+#' @details +#' The number of distinct unordered with-replacement outcomes of size \code{m} +#' from \code{N} units is \eqn{\binom{N+m-1}{m}}. This is much smaller than +#' the \eqn{N^m} ordered outcomes. The algorithm uses a nested loop to +#' generate all non-decreasing sequences of length \code{m} from +#' \eqn{\{1, \ldots, N\}}. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param m Number of draws (sample size with replacement). +#' @param ID Optional vector of population labels of length \code{N}. +#' If \code{FALSE} (default), integer indices are returned. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{IkWR}}, \code{\link{nk}}, \code{\link{p.WR}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' m <- 2 +#' # With-replacement support +#' SupportWR(N, m) +#' SupportWR(N, m, ID = U) +#' y <- c(32, 34, 46, 89, 35) +#' SupportWR(N, m, ID = y) -SupportWR <- function(N, m, ID=FALSE){ -S=0 -a=rep(1,m) -P1<-a -S=S+1 -k=m -while(k>0){ -while(a[k]1) -k=k-1 -if(a[k] 0) { + while (a[k] < N) { + a[k] <- a[k] + 1 + P1 <- rbind(P1, a) + S <- S + 1 + } + if (k > 1) k <- k - 1 + if (a[k] < N) { + a[k] <- a[k] + 1 + k1 <- k + 1 + a[k1:m] <- a[k] + P1 <- rbind(P1, a) + S <- S + 1 + k <- m + } else { + if (k == 1) k <- 0 + } + } + nr <- choose(N + m - 1, m) + P1 <- matrix(P1, nrow = nr) + sam <- matrix(ID[P1], nrow = nr) + if (is.logical(ID) == TRUE) return(P1) + else return(sam) +} \ No newline at end of file diff --git a/R/T.SIC.r b/R/T.SIC.r index 901d7cb..ce17696 100644 --- a/R/T.SIC.r +++ b/R/T.SIC.r @@ -1,26 +1,80 @@ #' @export +#' +#' @title +#' Cluster Totals for Single-Stage Cluster Sampling +#' @description +#' Computes the total of 
each variable of interest within each cluster +#' (Primary Sampling Unit) in a single-stage cluster sample. +#' @return +#' A matrix with one row per cluster and one column per variable of interest +#' (plus a first column \code{Ni} with the cluster size). Row names are the +#' cluster labels. +#' @details +#' This function aggregates the sample data by cluster, producing the cluster- +#' level totals needed for estimation under single-stage cluster sampling. +#' The output can be passed directly to \code{\link{E.1SI}} or \code{\link{E.SI}} +#' treating each cluster total as an observation. +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector, matrix or data frame containing the values of the +#' variables of interest for every unit in the sample. +#' @param Cluster Vector identifying the cluster (PSU) membership of each +#' unit in the sample. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{E.1SI}}, \code{\link{E.2SI}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' y1 <- c(32, 34, 46, 89, 35) +#' y2 <- c(1, 1, 1, 0, 0) +#' y3 <- cbind(y1, y2) +#' Cluster <- c("C1", "C2", "C1", "C2", "C1") +#' T.SIC(y1, Cluster) +#' T.SIC(y3, Cluster) +#' ############ +#' ## Example 2 - Cluster sampling with Lucy data +#' ############ +#' data(Lucy) +#' attach(Lucy) +#' UI <- c("A", "B", "C", "D", "E") +#' NI <- length(UI) +#' nI <- 2 +#' samI <- S.SI(NI, nI) +#' dataI <- UI[samI] +#' Lucy1 <- Lucy[which(Zone == dataI[1]), ] +#' Lucy2 <- Lucy[which(Zone == dataI[2]), ] +#' LucyI <- rbind(Lucy1, Lucy2) +#' attach(LucyI) +#' Cluster <- as.factor(as.integer(Zone)) +#' estima <- data.frame(Income, Employees, Taxes) +#' Ty <- T.SIC(estima, Cluster) +#' E.SI(NI, nI, Ty) -T.SIC<-function(y,Cluster){ - - Cluster<-as.factor(Cluster) - y<-cbind(1,y) - y<-as.data.frame(y) +T.SIC <- function(y, Cluster) { + Cluster <- as.factor(Cluster) + y <- cbind(1, y) + y <- as.data.frame(y) names(y)[1] <- "Ni" - - nI<-length(levels(Cluster)) - - Total<-matrix(NA,nrow=nI,ncol=dim(y)[2],) - rownames(Total)<-levels(Cluster) - colnames(Total)<-names(y) - Cluster<-as.factor(as.integer(Cluster)) - - for(k in 1: nI){ - e<-which(Cluster==k) - ye<-y[e,] - ye<-as.matrix(ye) - tye<-colSums(ye) - Total[k,]<-tye + nI <- length(levels(Cluster)) + Total <- matrix(NA, nrow = nI, ncol = dim(y)[2]) + rownames(Total) <- levels(Cluster) + colnames(Total) <- names(y) + Cluster <- as.factor(as.integer(Cluster)) + for (k in 1:nI) { + e <- which(Cluster == k) + ye <- y[e, ] + ye <- as.matrix(ye) + tye <- colSums(ye) + Total[k, ] <- tye } - Total<-as.matrix(Total) + Total <- as.matrix(Total) return(Total) } \ No newline at end of file diff --git a/R/VarHT.r b/R/VarHT.r index 66a178f..213e2ca 100644 --- a/R/VarHT.r +++ b/R/VarHT.r @@ -1,13 +1,57 @@ #' @export +#' +#' @title +#' Exact Variance of the 
Horvitz-Thompson Estimator +#' @description +#' Computes the exact variance of the Horvitz-Thompson estimator of the +#' population total for a given fixed-size without-replacement sampling design, +#' using the full sampling support. +#' @return +#' A scalar: the exact variance of the Horvitz-Thompson estimator +#' \eqn{V(\hat{t}_{y,\pi})}. +#' @details +#' The exact Horvitz-Thompson variance is: +#' \deqn{V(\hat{t}_{y,\pi}) = \sum_{k=1}^N \sum_{l=1}^N \Delta_{kl} +#' \frac{y_k}{\pi_k} \frac{y_l}{\pi_l}} +#' where \eqn{\Delta_{kl} = \pi_{kl} - \pi_k \pi_l}. This requires +#' enumerating the full support and is only feasible for small populations +#' (\code{N <= 15}). +#' @author Hugo Andres Gutierrez Rojas +#' @param y Vector of length \code{N} with the population values of the +#' variable of interest. +#' @param N Population size. Recommended \code{N <= 15}. +#' @param n Sample size. +#' @param p Vector of probabilities for each possible sample in the support. +#' Must sum to 1. +#' +#' @references +#' Horvitz, D.G. and Thompson, D.J. (1952). A generalization of sampling +#' without replacement from a finite universe. +#' \emph{Journal of the American Statistical Association}, 47, 663-685.\cr +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer. 
+#' +#' @seealso \code{\link{Deltakl}}, \code{\link{VarSYGHT}}, \code{\link{HT}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' y1 <- c(32, 34, 46, 89, 35) +#' y2 <- c(1, 1, 1, 0, 0) +#' N <- length(U) +#' n <- 2 +#' p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) +#' # Theoretical variance of the HT estimator +#' VarHT(y1, N, n, p) +#' VarHT(y2, N, n, p) -VarHT<-function(y, N, n, p){ -Ind <- Ik(N,n) -pi1 <- as.matrix(Pik(p, Ind)) -pi2 <- Pikl(N,n,p) -Delta <- Deltakl(N,n,p) -y <- t(as.matrix(y)) -ykylexp <- t(y/pi1)%*%(y/pi1) -A <- (Delta)*(ykylexp) -Var <- sum(A) -return(Var) +VarHT <- function(y, N, n, p) { + Ind <- Ik(N, n) + pi1 <- as.matrix(Pik(p, Ind)) + pi2 <- Pikl(N, n, p) + Delta <- Deltakl(N, n, p) + y <- t(as.matrix(y)) + ykylexp <- t(y/pi1) %*% (y/pi1) + A <- (Delta) * (ykylexp) + Var <- sum(A) + return(Var) } \ No newline at end of file diff --git a/R/VarSYGHT.R b/R/VarSYGHT.R index 462db6d..3352ba2 100644 --- a/R/VarSYGHT.R +++ b/R/VarSYGHT.R @@ -110,4 +110,4 @@ VarSYGHT <- function (y, N, n, p) } Resultado <- data.frame(I = Ind, p = p, Est.HT = Est.HT, Est.Var1 = Est.Var1, Est.Var2 = Est.Var2) return(Resultado) -} +} \ No newline at end of file diff --git a/R/Wk.r b/R/Wk.r index eabbb44..b8d0b82 100644 --- a/R/Wk.r +++ b/R/Wk.r @@ -1,16 +1,205 @@ #' @export +#' +#' @title +#' GREG Generalised Weights +#' @description +#' Computes the generalised regression (GREG) weights for each unit in the +#' sample. These weights incorporate both the sampling design weights and a +#' calibration adjustment based on known population totals of auxiliary +#' variables. +#' @return +#' A numeric vector of length \code{n} with the GREG weight for each unit +#' in the sample. 
+#' @details +#' The GREG weight for unit \eqn{k} is: +#' \deqn{w_k = \frac{1}{\pi_k} + \mathbf{x}_k^T +#' \left(\sum_s \frac{v_k \mathbf{x}_k \mathbf{x}_k^T}{\pi_k}\right)^{-1} +#' (\mathbf{t}_x - \hat{\mathbf{t}}_{x,\pi})} +#' where \eqn{v_k = 1/(\pi_k c_k)} and \eqn{c_k} is a variance-stabilising +#' constant. The GREG estimator is then \eqn{\hat{t}_{GREG} = \sum_s w_k y_k}. +#' @author Hugo Andres Gutierrez Rojas +#' @param x Vector or matrix of auxiliary variables observed in the sample. +#' @param tx Vector of known population totals of the auxiliary variables. +#' @param Pik Vector of first-order inclusion probabilities for each unit +#' in the sample. +#' @param ck Vector of variance-stabilising constants. Typically \code{ck = 1} +#' (homoscedastic) or \code{ck = x} (heteroscedastic). +#' @param b0 Logical. If \code{TRUE}, an intercept column is prepended to +#' \code{x}. Default is \code{FALSE}. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+#' +#' @seealso \code{\link{GREG.SI}}, \code{\link{E.Beta}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # Without replacement sampling +#' # Vector U contains the label of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Vector x is the auxiliary information and y is the variable of interest +#' x<-c(32, 34, 46, 89, 35) +#' y<-c(52, 60, 75, 100, 50) +#' # pik is some vector of inclusion probabilities in the sample +#' # In this case the sample size is equal to the population size +#' pik<-rep(1,5) +#' w1<-Wk(x,tx=236,pik,ck=1,b0=FALSE) +#' sum(x*w1) +#' # Draws a sample of size 2 without replacement +#' sam <- sample(5,2) +#' pik <- c (0.8,0.2,0.2,0.5,0.3) +#' # The auxiliary information and variable of interest in the selected sample +#' x.s<-x[sam] +#' y.s<-y[sam] +#' # The vector of inclusion probabilities in the selected sample +#' pik.s<-pik[sam] +#' # Calibration weights under some specific model +#' w2<-Wk(x.s,tx=236,pik.s,ck=1,b0=FALSE) +#' sum(x.s*w2) +#' +#' w3<-Wk(x.s,tx=c(5,236),pik.s,ck=1,b0=TRUE) +#' sum(w3) +#' sum(x.s*w3) +#' +#' w4<-Wk(x.s,tx=c(5,236),pik.s,ck=x.s,b0=TRUE) +#' sum(w4) +#' sum(x.s*w4) +#' +#' w5<-Wk(x.s,tx=236,pik.s,ck=x.s,b0=FALSE) +#' sum(x.s*w5) +#' +#' ###################################################################### +#' ## Example 2: Linear models involving continuous auxiliary information +#' ###################################################################### +#' +#' # Draws a simple random sample without replacement +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' Pik <- rep(n/N, n) +#' sam <- S.SI(N,n) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' +#' ########### common ratio model ################### +#' +#' estima<-data.frame(Income) +#' x <- Employees +#' tx <- sum(Lucy$Employees) +#' w <- Wk(x, tx, Pik, ck=1, b0=FALSE) +#'
sum(x*w) +#' tx +#' # The calibration estimation +#' colSums(estima*w) +#' +#' ########### Simple regression model without intercept ################### +#' +#' estima<-data.frame(Income, Employees) +#' x <- Taxes +#' tx <- sum(Lucy$Taxes) +#' w<-Wk(x,tx,Pik,ck=x,b0=FALSE) +#' sum(x*w) +#' tx +#' # The calibration estimation +#' colSums(estima*w) +#' +#' ########### Multiple regression model without intercept ################### +#' +#' estima<-data.frame(Income) +#' x <- cbind(Employees, Taxes) +#' tx <- c(sum(Lucy$Employees), sum(Lucy$Taxes)) +#' w <- Wk(x,tx,Pik,ck=1,b0=FALSE) +#' sum(x[,1]*w) +#' sum(x[,2]*w) +#' tx +#' # The calibration estimation +#' colSums(estima*w) +#' +#' ########### Simple regression model with intercept ################### +#' +#' estima<-data.frame(Income, Employees) +#' x <- Taxes +#' tx <- c(N,sum(Lucy$Taxes)) +#' w <- Wk(x,tx,Pik,ck=1,b0=TRUE) +#' sum(1*w) +#' sum(x*w) +#' tx +#' # The calibration estimation +#' colSums(estima*w) +#' +#' ########### Multiple regression model with intercept ################### +#' +#' estima<-data.frame(Income) +#' x <- cbind(Employees, Taxes) +#' tx <- c(N, sum(Lucy$Employees), sum(Lucy$Taxes)) +#' w <- Wk(x,tx,Pik,ck=1,b0=TRUE) +#' sum(1*w) +#' sum(x[,1]*w) +#' sum(x[,2]*w) +#' tx +#' # The calibration estimation +#' colSums(estima*w) +#' +#' #################################################################### +#' ## Example 3: Linear models involving discrete auxiliary information +#' #################################################################### +#' +#' # Draws a simple random sample without replacement +#' data(Lucy) +#' attach(Lucy) +#' +#' N <- dim(Lucy)[1] +#' n <- 400 +#' sam <- S.SI(N,n) +#' # The information about the units in the sample is stored in an object called data +#' data <- Lucy[sam,] +#' attach(data) +#' names(data) +#' # Vector of inclusion probabilities for units in the selected sample +#' Pik<-rep(n/N,n) +#' # The auxiliary information is discrete type +#' 
Doma<-Domains(Level) +#' +#' ########### Poststratified common mean model ################### +#' +#' estima<-data.frame(Income, Employees, Taxes) +#' tx <- colSums(Domains(Lucy$Level)) +#' w <- Wk(Doma,tx,Pik,ck=1,b0=FALSE) +#' sum(Doma[,1]*w) +#' sum(Doma[,2]*w) +#' sum(Doma[,3]*w) +#' tx +#' # The calibration estimation +#' colSums(estima*w) +#' +#' ########### Poststratified common ratio model ################### +#' +#' estima<-data.frame(Income, Employees) +#' x<-Doma*Taxes +#' tx <- colSums(Domains(Lucy$Level)) +#' w <- Wk(x,tx,Pik,ck=1,b0=FALSE) +#' sum(x[,1]*w) +#' sum(x[,2]*w) +#' sum(x[,3]*w) +#' tx +#' # The calibration estimation +#' colSums(estima*w) -Wk<-function(x,tx,Pik,ck,b0=FALSE){ - - if (b0 == TRUE){ - x<-as.matrix(cbind(1,x))} - if (b0 == FALSE){ - x<-as.matrix(x)} - - tx<-as.matrix(tx) - txpi<-as.matrix(t(x)%*%(1/Pik)) - V<-1/(Pik*ck) - - Wk<-(1/Pik)+((V*x)%*%solve(t(V*x)%*%x)%*%(tx-txpi)) - return(Wk) -} +Wk <- function(x, tx, Pik, ck, b0 = FALSE) { + if (b0 == TRUE) x <- as.matrix(cbind(1, x)) + if (b0 == FALSE) x <- as.matrix(x) + tx <- as.matrix(tx) + txpi <- as.matrix(t(x) %*% (1/Pik)) + V <- 1/(Pik * ck) + result <- (1/Pik) + ((V * x) %*% solve(t(V * x) %*% x) %*% (tx - txpi)) + return(result) +} \ No newline at end of file diff --git a/R/kish_allocation.R b/R/kish_allocation.R new file mode 100644 index 0000000..f6b16cf --- /dev/null +++ b/R/kish_allocation.R @@ -0,0 +1,89 @@ +#' @export +#' +#' @title +#' Kish Allocation for Stratified Sampling +#' @description +#' Computes the optimal sample size allocation across strata using the +#' Kish (1992) compromise allocation method, which interpolates between +#' uniform and proportional allocation through a design effect parameter \code{I}. +#' +#' @param n Integer. Total desired sample size. +#' @param N_h Named numeric vector. Population sizes for each stratum +#' \eqn{h = 1, \ldots, H}. +#' @param I Non-negative numeric. 
Intraclass correlation coefficient (ICC) +#' or design effect parameter controlling the allocation: +#' \itemize{ +#' \item \code{I = 0} → Uniform allocation (equal sample per stratum). +#' \item \code{I = Inf} → Proportional allocation (proportional to \eqn{N_h}). +#' \item \code{0 < I < Inf} → Compromise between uniform and proportional. +#' \item Recommended value: \code{I = 0.5} (Kish, 1992). +#' } +#' +#' @return A named integer vector of length \eqn{H} with the allocated sample +#' sizes per stratum. The values sum to approximately \code{n} (rounding may +#' cause a difference of ±1). +#' +#' @details +#' The Kish compromise allocation assigns sample sizes as: +#' \deqn{ +#' n_h = n \cdot \frac{\sqrt{I \, W_h^2 + H^{-2}}} +#' {\sum_{h=1}^{H} \sqrt{I \, W_h^2 + H^{-2}}} +#' } +#' where \eqn{W_h = N_h / N} is the stratum weight and \eqn{H} is the number +#' of strata. This formulation nests two classical allocations as limiting +#' cases: when \eqn{I = 0} the numerator reduces to \eqn{1/H} (uniform), +#' and as \eqn{I \to \infty} it is dominated by \eqn{W_h} (proportional). +#' +#' @references +#' Kish, L. (1992). Weighting for unequal \eqn{P_i}. +#' \emph{Journal of Official Statistics}, 8(2), 183–200. +#' +#' @author Yury Vanessa Ochoa Montes +#' +#' @seealso +#' \code{\link{E.STSI}} for estimation under stratified sampling, +#' \code{\link{S.STSI}} for stratified simple random sampling. 
+#' +#' @examples +#' N_h <- c( +#' Corozal = 41847, +#' Orange_Walk = 48175, +#' Belize = 57658, +#' Cayo = 78473, +#' Stann_Creek = 31347, +#' Toledo = 31711 +#' ) +#' +#' # Uniform allocation (I = 0) +#' kish_allocation(n = 3096, N_h = N_h, I = 0) +#' +#' # Proportional allocation (I -> Inf) +#' kish_allocation(n = 3096, N_h = N_h, I = 1e6) +#' +#' # Kish recommended compromise (I = 0.5) +#' kish_allocation(n = 3096, N_h = N_h, I = 0.5) + +kish_allocation <- function(n, N_h, I = 0.5) { + + if (!is.numeric(n) || length(n) != 1L || n <= 0 || n != round(n)) + stop("`n` must be a single positive integer.", call. = FALSE) + + if (!is.numeric(N_h) || any(N_h <= 0)) + stop("`N_h` must be a numeric vector of positive stratum sizes.", call. = FALSE) + + if (!is.numeric(I) || length(I) != 1L || I < 0) + stop("`I` must be a single non-negative number.", call. = FALSE) + + if (n > sum(N_h)) + stop("`n` cannot exceed the total population size sum(N_h).", call. = FALSE) + + H <- length(N_h) + W_h <- N_h / sum(N_h) + + num <- sqrt(I * W_h^2 + 1 / H^2) + n_h <- round(n * num / sum(num)) + + if (!is.null(names(N_h))) names(n_h) <- names(N_h) + + n_h +} \ No newline at end of file diff --git a/R/nk.r b/R/nk.r index 3783684..5196e9c 100644 --- a/R/nk.r +++ b/R/nk.r @@ -1,16 +1,49 @@ #' @export +#' +#' @title +#' Frequency Matrix for With-Replacement Sampling +#' @description +#' Constructs the frequency matrix of the with-replacement sampling support +#' for a population of size \code{N} and \code{m} draws. Each row corresponds +#' to one possible outcome and each column to one population unit, with entry +#' \eqn{(s, k)} equal to the number of times unit \eqn{k} was selected in +#' outcome \eqn{s}. +#' @return +#' An integer matrix of dimension \code{choose(N+m-1, m) x N}, where entry +#' \eqn{(s, k)} is the frequency of unit \eqn{k} in outcome \eqn{s}. 
+#' @details +#' Unlike \code{\link{IkWR}}, which records only whether a unit was selected, +#' this function records how many times each unit was selected. This is needed +#' for with-replacement estimators based on selection frequencies. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. Keep small due to combinatorial growth. +#' @param m Number of draws (sample size with replacement). +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{IkWR}}, \code{\link{SupportWR}}, \code{\link{p.WR}} +#' +#' @examples +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' N <- length(U) +#' m <- 2 +#' # Frequency matrix for with-replacement sampling +#' nk(N, m) -nk <- function(N, m) -{ -Q <- SupportWR(N, m, ID = FALSE) -I <- matrix(0, choose(N+m-1, m), N) -for (i in 1:m) { -for (j in 1:choose(N+m-1, m)) { -for (k in 1:N) { -if (Q[j, i] == k) -I[j, k] <- sum(as.double(Q[j,]==k)) -} -} -} -I +nk <- function(N, m) { + Q <- SupportWR(N, m, ID = FALSE) + I <- matrix(0, choose(N + m - 1, m), N) + for (i in 1:m) { + for (j in 1:choose(N + m - 1, m)) { + for (k in 1:N) { + if (Q[j, i] == k) + I[j, k] <- sum(as.double(Q[j, ] == k)) + } + } + } + I } \ No newline at end of file diff --git a/R/p.WR.r b/R/p.WR.r index ad03606..a229e67 100644 --- a/R/p.WR.r +++ b/R/p.WR.r @@ -1,12 +1,77 @@ #' @export +#' +#' @title +#' Sample Probabilities under With-Replacement Sampling +#' @description +#' Computes the probability of each possible outcome in the with-replacement +#' sampling support, given unit selection probabilities \code{pk}. +#' @return +#' A numeric vector of length \code{choose(N+m-1, m)} with the probability +#' of each distinct unordered outcome in the with-replacement support. 
+#' @details +#' For each distinct unordered outcome (multiset) in the support enumerated +#' by \code{\link{nk}}, the probability is computed as a multinomial +#' probability: +#' \deqn{p(s) = \frac{m!}{\prod_k n_k!} \prod_k p_k^{n_k}} +#' where \eqn{n_k} is the number of times unit \eqn{k} appears in outcome +#' \eqn{s} and \eqn{p_k} is the selection probability of unit \eqn{k}. +#' @author Hugo Andres Gutierrez Rojas +#' @param N Population size. +#' @param m Number of draws (sample size with replacement). +#' @param pk Vector of length \code{N} with selection probabilities for each +#' unit. Must sum to 1. +#' +#' @references +#' Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +#' \emph{Model Assisted Survey Sampling}. Springer.\cr +#' Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +#' y estimacion de parametros}. Editorial Universidad Santo Tomas. +#' +#' @seealso \code{\link{nk}}, \code{\link{SupportWR}}, \code{\link{S.PPS}} +#' +#' @examples +#' ############ +#' ## Example 1 +#' ############ +#' # With replacement simple random sampling +#' # Vector U contains the labels of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Vector pk contains the selection probabilities of the units in the finite population +#' pk <- c(0.2, 0.2, 0.2, 0.2, 0.2) +#' sum(pk) +#' N <- length(pk) +#' m <- 3 +#' # The sampling design +#' p <- p.WR(N, m, pk) +#' p +#' sum(p) +#' +#' ############ +#' ## Example 2 +#' ############ +#' # With replacement PPS random sampling +#' # Vector U contains the labels of a population of size N=5 +#' U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +#' # Vector x is the auxiliary information and y is the variable of interest +#' x<-c(32, 34, 46, 89, 35) +#' y<-c(52, 60, 75, 100, 50) +#' # Vector pk contains the selection probabilities of the units in the finite population +#' pk <- x/sum(x) +#' sum(pk) +#' N <- length(pk) +#' m <- 3 +#' # The sampling design +#' p <- p.WR(N, m, pk) +#' p
+#' sum(p) -p.WR <- function(N, m, pk){ -p <- rep(0,N) -I <- nk(N,m) -N <- dim(I)[1] -for(i in 1:N){ -ni <- c(I[i,]) -p[i] <- dmultinom(ni, prob=pk) -} -p +p.WR <- function(N, m, pk) { + p <- rep(0, N) + I <- nk(N, m) + N <- dim(I)[1] + for (i in 1:N) { + ni <- c(I[i, ]) + p[i] <- dmultinom(ni, prob = pk) + } + p } \ No newline at end of file diff --git a/README.md b/README.md index 25efb61..5f42955 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,163 @@ # TeachingSampling + ### An R package that draws complex samples and estimates complex parameters -This is the version control site for the `TeachingSampling` package. This software allows you to select samples from the most common (but somehow complex) sampling schemes. Along with this feature, the software allows you to estimate parameters such as totals, means, ratios, coefficient regressions, percentiles, medians, etc. +`TeachingSampling` allows you to select samples from the most common probabilistic sampling designs and estimate complex parameters such as totals, means, ratios, regression coefficients, and quantiles. + +The package is based on: -The software is based on the book written by the author of the package. +> Gutierrez, H. A. (2009). *Estrategias de muestreo: diseño de encuestas y estimación de parámetros*. Editorial Universidad Santo Tomás. -Gutierrez, H. A. (2009), *Estrategias de muestreo: diseno de encuestas y estimacion de parametros*. 
Editorial Universidad Santo Tomas +--- ## Installation -First, you need to install the `devtools` R package +### Stable version from CRAN + +```r +install.packages("TeachingSampling") ``` + +### Development version from GitHub + +```r install.packages("devtools") +devtools::install_github("psirusteam/TeachingSampling") ``` -Then load the `devtools` R package -``` -library(devtools) -``` +--- -Finally type -``` -install_github("psirusteam/TeachingSampling") +## Functions + +### Sampling designs + +| Function | Description | +|---|---| +| `S.SI()` | Simple random sampling without replacement | +| `S.SY()` | Systematic sampling | +| `S.BE()` | Bernoulli sampling | +| `S.PO()` | Poisson sampling | +| `S.WR()` | Simple random sampling with replacement | +| `S.PPS()` | PPS sampling with replacement | +| `S.piPS()` | PPS sampling without replacement | +| `S.STSI()` | Stratified simple random sampling | +| `S.STPPS()` | Stratified PPS sampling with replacement | +| `S.STpiPS()` | Stratified PPS sampling without replacement | + +### Inclusion probabilities + +| Function | Description | +|---|---| +| `PikPPS()` | Inclusion probabilities proportional to size | +| `PikSTPPS()` | Inclusion probabilities for stratified PPS | +| `PikHol()` | Optimal inclusion probabilities (Holmberg) | +| `Pik()` | First-order inclusion probabilities from design | +| `Pikl()` | Second-order inclusion probabilities | + +### Estimation + +| Function | Description | +|---|---| +| `E.SI()` | Estimation under simple random sampling | +| `E.SY()` | Estimation under systematic sampling | +| `E.BE()` | Estimation under Bernoulli sampling | +| `E.PO()` | Estimation under Poisson sampling | +| `E.WR()` | Estimation under with-replacement sampling | +| `E.PPS()` | Hansen-Hurwitz estimator under PPS-WR | +| `E.piPS()` | HT estimator under piPS sampling | +| `E.STSI()` | Estimation under stratified SI | +| `E.STPPS()` | Estimation under stratified PPS-WR | +| `E.STpiPS()` | Estimation under stratified piPS | 
+| `E.1SI()` | Estimation under single-stage cluster sampling | +| `E.2SI()` | Estimation under two-stage SI sampling | +| `E.UC()` | Estimation using the Ultimate Cluster method | +| `E.Quantile()` | Weighted quantile estimation | +| `E.Trim()` | Weight trimming and redistribution | + +### Regression and calibration + +| Function | Description | +|---|---| +| `E.Beta()` | Regression coefficient estimation | +| `GREG.SI()` | Generalised regression estimator | +| `Wk()` | GREG calibration weights | +| `IPFP()` | Iterative proportional fitting (raking) | + +### Variance estimation + +| Function | Description | +|---|---| +| `VarHT()` | Exact Horvitz-Thompson variance | +| `VarSYGHT()` | HT and Sen-Yates-Grundy variance estimators | +| `HT()` | Horvitz-Thompson estimator | +| `Deltakl()` | Matrix of joint inclusion probability differences | + +### Sampling support (small populations) + +| Function | Description | +|---|---| +| `Support()` | Sampling support for SI designs | +| `SupportWR()` | Sampling support for WR designs | +| `SupportRS()` | Complete support for all sample sizes | +| `Ik()` | Sample membership indicator matrix | +| `IkWR()` | Frequency indicator matrix for WR sampling | +| `IkRS()` | Indicator matrix for all sample sizes | +| `OrderWR()` | Ordered WR sampling support | +| `nk()` | Frequency matrix for WR sampling | +| `p.WR()` | Sample probabilities under WR sampling | + +### Allocation + +| Function | Description | +|---|---| +| `kish_allocation()` | Kish compromise allocation for stratified sampling | + +### Utilities + +| Function | Description | +|---|---| +| `Domains()` | Domain indicator matrix | +| `T.SIC()` | Cluster totals for single-stage sampling | + +--- + +## Usage example + +```r +library(TeachingSampling) + +data("Lucy") +N <- nrow(Lucy) +n <- 400 + +# Draw a simple random sample without replacement +sam <- S.SI(N, n) +sam <- sam[sam != 0] + +# Estimate population totals +y <- data.frame(Income = Lucy$Income[sam], + Expenditure = 
Lucy$Expenditure[sam]) + +E.SI(N, n, y) ``` -## Author -This package is maintained by Andrés Gutiérrez. Email: hagutierrezro@gmail.com +--- + +## Authors + +**Hugo Andrés Gutiérrez Rojas** — Package author and maintainer +Email: hagutierrezro@gmail.com +GitHub: [@psirusteam](https://github.com/psirusteam) + +**Yury Vanessa Ochoa Montes** +Email: yury.ochoa@urosario.edu.co + +--- + +## Support + +- 📖 [Reference manual (CRAN)](https://cran.r-project.org/web/packages/TeachingSampling/TeachingSampling.pdf) +- [CRAN page](https://cran.r-project.org/web/packages/TeachingSampling) +- [Report an issue](https://github.com/psirusteam/TeachingSampling/issues) -### Support or Contact -Having trouble with the `TeachingSampling` package? Check out the [documentation](http://cran.r-project.org/web/packages/TeachingSampling/TeachingSampling.pdf) or [contact support](https://github.com/psirusteam). Comments, amends, and critics are very welcome. This is the [CRAN site](http://cran.r-project.org/web/packages/TeachingSampling) of the last stable version of the package. \ No newline at end of file +Comments, corrections, and suggestions are always welcome. diff --git a/data/datalist b/data/datalist new file mode 100644 index 0000000..9b7de3d --- /dev/null +++ b/data/datalist @@ -0,0 +1,3 @@ +BigCity +BigLucy +Lucy diff --git a/man/BigCity.Rd b/man/BigCity.Rd deleted file mode 100644 index 3b7d5d8..0000000 --- a/man/BigCity.Rd +++ /dev/null @@ -1,52 +0,0 @@ -\name{BigCity} -\docType{data} -\alias{BigCity} -\title{Full Person-level Population Database} -\description{ -This data set corresponds to some socioeconomic variables from 150266 people of a city in a particular year. -} -\seealso{ -\code{\link{Lucy}, \link{BigLucy}} -} -\usage{data(BigCity)} -\format{ - \describe{ -\item{HHID}{The identifier of the household. It corresponds to an alphanumeric sequence (four letters and five digits).} -\item{PersonID}{The identifier of the person within the household. 
NOTE it is not a unique identifier of a person for the whole population. It corresponds to an alphanumeric sequence (five letters and two digits).} -\item{Stratum}{Households are located in geographic strata. There are 119 strata across the city.} -\item{PSU}{Households are clustered in cartographic segments defined as primary sampling units (PSU). There are 1664 PSU and they are nested within strata.} -\item{Zone}{Segments clustered within strata can be located within urban or rural areas along the city.} -\item{Sex}{Sex of the person.} -\item{Income}{Per capita monthly income.} -\item{Expenditure}{Per capita monthly expenditure.} -\item{Employment}{A person's employment status.} -\item{Poverty}{This variable indicates whether the person is poor or not. It depends on income.} -} -} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. -} - -\examples{ -data(BigCity) -attach(BigCity) - -estima <- data.frame(Income, Expenditure) -# The population totals -colSums(estima) -# Some parameters of interest -table(Poverty, Zone) -xtabs(Income ~ Poverty + Zone) -# Correlations among characteristics of interest -cor(estima) -# Some useful histograms -hist(Income) -hist(Expenditure) -# Some useful plots -boxplot(Income ~ Poverty) -barplot(table(Employment)) -pie(table(MaritalST)) -} -\keyword{datasets} diff --git a/man/BigLucy.rd b/man/BigLucy.rd deleted file mode 100644 index 29fb639..0000000 --- a/man/BigLucy.rd +++ /dev/null @@ -1,57 +0,0 @@ -\name{BigLucy} -\docType{data} -\alias{BigLucy} -\title{Full Business Population Database} -\description{ -This data set corresponds to some financial variables of 85396 industrial companies of a city in a particular fiscal year. 
-} -\seealso{ -\code{\link{Lucy}, \link{BigCity}} -} -\usage{data(BigLucy)} -\format{ - \describe{ -\item{ID}{The identifier of the company. It correspond to an alphanumeric sequence (two letters and three digits)} -\item{Ubication}{The address of the principal office of the company in the city} -\item{Level}{The industrial companies are discrimitnated according to the Taxes declared. -There are small, medium and big companies} -\item{Zone}{The country is divided by counties. A company belongs to a particular zone according to its cartographic location.} -\item{Income}{The total ammount of a company's earnings (or profit) in the previuos fiscal year. It is calculated by taking -revenues and adjusting for the cost of doing business} -\item{Employees}{The total number of persons working for the company in the previuos fiscal year} -\item{Taxes}{The total ammount of a company's income Tax} -\item{SPAM}{Indicates if the company uses the Internet and WEBmail options in order to make self-propaganda.} -\item{ISO}{Indicates if the company is certified by the International Organization for Standardization.} -\item{Years}{The age of the company.} -\item{Segments}{Cartographic segments by county. A segment comprises in average 10 companies located close to each other.} -} -} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. 
-} - -\examples{ -data(BigLucy) -attach(BigLucy) -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -# The population totals -colSums(estima) -# Some parameters of interest -table(SPAM,Level) -xtabs(Income ~ Level+SPAM) -# Correlations among characteristics of interest -cor(estima) -# Some useful histograms -hist(Income) -hist(Taxes) -hist(Employees) -# Some useful plots -boxplot(Income ~ Level) -barplot(table(Level)) -pie(table(SPAM)) -} -\keyword{datasets} diff --git a/man/Deltakl.rd b/man/Deltakl.rd index b088ed4..3cfe049 100644 --- a/man/Deltakl.rd +++ b/man/Deltakl.rd @@ -1,40 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Deltakl.r \name{Deltakl} \alias{Deltakl} -\title{Variance-Covariance Matrix of the Sample Membership Indicators for Fixed Size Without Replacement Sampling Designs} -\description{Computes the Variance-Covariance matrix of the sample membership indicators in the population given a -fixed sample size design} +\title{Matrix of Joint Inclusion Probability Differences} \usage{ Deltakl(N, n, p) } \arguments{ -\item{N}{Population size} -\item{n}{Sample size} -\item{p}{A vector containing the selection probabilities of a fixed size without replacement sampling design. The sum of the values of this vector must be one} +\item{N}{Population size. Recommended \code{N <= 15}.} + +\item{n}{Sample size.} + +\item{p}{Vector of probabilities for each possible sample in the support. +Must sum to 1.} } -\seealso{ -\code{\link{VarHT}, \link{Pikl}, \link{Pik}} +\value{ +An \code{N x N} matrix where entry \eqn{(k, l)} equals +\eqn{\pi_{kl} - \pi_k \pi_l}. Diagonal entries equal +\eqn{\pi_k(1 - \pi_k)}. 
} -\details{The \eqn{kl}th unit of the Variance-Covariance matrix of the sample membership indicators is defined as -\eqn{\Delta_{kl}=\pi_{kl}-\pi_k\pi_l} +\description{ +Computes the matrix \eqn{\Delta_{kl} = \pi_{kl} - \pi_k \pi_l} for all +pairs of units in a finite population. This matrix appears in the exact +Horvitz-Thompson variance formula. } -\value{The function returns a symmetric matrix of size \eqn{N \times N} containing the variances-covariances among the sample membership indicators for each pair of units in the finite population.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\details{ +The matrix \eqn{\Delta} is central to the Horvitz-Thompson variance +estimator: +\deqn{V(\hat{t}_{y,\pi}) = \sum_k \sum_l \Delta_{kl} \frac{y_k}{\pi_k} +\frac{y_l}{\pi_l}} +It requires computing both first-order (\code{\link{Pik}}) and +second-order (\code{\link{Pikl}}) inclusion probabilities, so it is only +feasible for small populations. } \examples{ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) -# The sample size is n=2 n <- 2 -# p is the probability of selection of every sample. p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) -# Note that the sum of the elements of this vector is one sum(p) -# Computation of the Variance-Covariance matrix of the sample membership indicators +# Variance-Covariance matrix of the sample membership indicators Deltakl(N, n, p) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. 
(2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{Pik}}, \code{\link{Pikl}}, \code{\link{VarHT}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/Domains.rd b/man/Domains.rd index 8a3f870..a7493fe 100644 --- a/man/Domains.rd +++ b/man/Domains.rd @@ -1,25 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Domains.r \name{Domains} \alias{Domains} -\title{Domains Indicator Matrix} -\description{Creates a matrix of domain indicator variables for every single unit in the selected sample or in the entire population} +\title{Domain Indicator Matrix} \usage{ Domains(y) } \arguments{ -\item{y}{Vector of the domain of interest containing the membership of each unit to a specified category of the domain} +\item{y}{A vector (factor or coercible to factor) identifying the domain +membership of each unit in the sample.} } -\seealso{ -\code{\link{E.SI}} +\value{ +A binary matrix of dimension \code{n x D}, where \code{D} is the number +of domains (levels of \code{y}). Entry \eqn{(k, d) = 1} if unit \eqn{k} +belongs to domain \eqn{d}, and 0 otherwise. Column names are the domain +labels. } -\details{Each value of y represents the domain which a specified unit belongs} -\value{The function returns a \eqn{n\times p} matrix, where \eqn{n} is the number of units in the selected -sample and \eqn{p} is the number of categories of the domain of interest. The values of this matrix are zero, if the unit does not -belongs to a specified category and one, otherwise.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. 
+\description{ +Creates a binary indicator matrix that identifies the domain membership +of each unit in the sample. Each column corresponds to one domain +(level of \code{y}) and each row to one unit. +} +\details{ +This function is useful for domain estimation, where population totals or +means must be estimated for subgroups of the population. The indicator +matrix can be multiplied element-wise with the variable of interest to +restrict estimation to each domain. } \examples{ ############ @@ -64,4 +70,15 @@ SPAM.yes <- estima*Doma[,2] E.SI(N,n,SPAM.no) E.SI(N,n,SPAM.yes) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{E.SI}}, \code{\link{E.STSI}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.2SI.rd b/man/E.2SI.rd index 363d861..fc84d23 100644 --- a/man/E.2SI.rd +++ b/man/E.2SI.rd @@ -1,29 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.2SI.r \name{E.2SI} \alias{E.2SI} -\title{Estimation of the Population Total under Two Stage Simple Random Sampling Without Replacement} -\description{Computes the Horvitz-Thompson estimator of the population total according to a 2SI sampling design} +\title{Estimation of the Population Total under Two Stage Simple Random Sampling} \usage{ E.2SI(NI, nI, Ni, ni, y, PSU) } \arguments{ -\item{NI}{Population size of Primary Sampling Units} -\item{nI}{Sample size of Primary Sampling Units} -\item{Ni}{Vector of population sizes of Secundary Sampling Units selected in the first draw} -\item{ni}{Vector of sample sizes of Secundary Sampling Units} -\item{y}{Vector, matrix or data frame containig the recollected information of the variables of interest for every -unit in the selected sample} -\item{PSU}{Vector identifying 
the membership to the strata of each unit in the population} +\item{NI}{Population size of Primary Sampling Units (PSUs).} + +\item{nI}{Sample size of Primary Sampling Units (PSUs).} + +\item{Ni}{Vector of population sizes of Secondary Sampling Units within +each selected PSU.} + +\item{ni}{Vector of sample sizes of Secondary Sampling Units within +each selected PSU.} + +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every unit in the selected sample.} + +\item{PSU}{Vector identifying the PSU membership of each unit in the sample.} } -\seealso{ -\code{\link{S.SI}} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. + \item \code{Standard Error}: Estimated standard error of the total. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling. } -\details{Returns the estimation of the population total of every single variable of interest, its estimated standard error and its estimated coefficient of variation} -\value{The function returns a data matrix whose columns correspond to the estimated parameters of the variables of interest} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Dise?o de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Horvitz-Thompson estimator of the population total under a +two-stage simple random sampling without replacement design, where both +Primary Sampling Units (PSUs) and Secondary Sampling Units (SSUs) are +selected by simple random sampling without replacement. 
+} +\details{ +The variance estimator decomposes into two components: the between-PSU +component and the within-PSU component, following the classical two-stage +variance decomposition of Sarndal et al. (1992). } \examples{ ############ @@ -140,4 +157,15 @@ estima <- data.frame(Income, Employees, Taxes) E.2SI(NI,nI,Ni,ni,estima,Cluster) # Sampling error is null } -\keyword{survey} \ No newline at end of file +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{E.1SI}}, \code{\link{E.UC}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.BE.rd b/man/E.BE.rd index ac0ca84..9719e96 100644 --- a/man/E.BE.rd +++ b/man/E.BE.rd @@ -1,41 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.BE.r \name{E.BE} \alias{E.BE} -\title{Estimation of the Population Total under Bernoulli Sampling Without Replacement} -\description{Computes the Horvitz-Thompson estimator of the population total according to a BE sampling design} +\title{Estimation of the Population Total under Bernoulli Sampling} \usage{ E.BE(y, prob) } \arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every unit in the selected sample} -\item{prob}{Inclusion probability for each unit in the population} +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every unit in the selected sample.} + +\item{prob}{Scalar. The (constant) inclusion probability used in the +Bernoulli sampling design. Must satisfy \code{0 < prob <= 1}.} } -\seealso{ -\code{\link{S.BE}} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. 
+ \item \code{Standard Error}: Estimated standard error of the total. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling. } -\details{Returns the estimation of the population total of every single variable of interest, its estimated standard error and its estimated coefficient of variation under an BE sampling design} -\value{The function returns a data matrix whose columns correspond to the estimated parameters of the variables of interest} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Horvitz-Thompson estimator of the population total under a +Bernoulli sampling design, where each unit in the population is independently +selected with the same inclusion probability. +} +\details{ +Under Bernoulli sampling, the sample size is random. The inclusion +probability is constant and equal to \code{prob} for all units. The +variance estimator accounts for the randomness of the sample size. 
} \examples{ -# Uses the Lucy data to draw a Bernoulli sample -data(Lucy) +data('Lucy') attach(Lucy) - -N <- dim(Lucy)[1] -n=400 -prob=n/N -sam <- S.BE(N,prob) -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -names(data) -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -E.BE(estima,prob) -} -\keyword{survey} \ No newline at end of file +N <- nrow(Lucy) +prob <- 0.1 +sam <- S.BE(N, prob) +sam <- sam[sam != 0] +y <- data.frame(Income = Income[sam], Employees = Employees[sam]) +E.BE(y, prob) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{S.BE}}, \code{\link{E.SI}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.Beta.rd b/man/E.Beta.rd index 9ac937f..7557eb4 100644 --- a/man/E.Beta.rd +++ b/man/E.Beta.rd @@ -1,28 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.Beta.r \name{E.Beta} \alias{E.Beta} -\title{Estimation of the population regression coefficients under SI designs} -\description{Computes the estimation of regression coefficients using the principles of the Horvitz-Thompson estimator} +\title{Estimation of Regression Coefficients under Simple Random Sampling} \usage{ -E.Beta(N, n, y, x, ck=1, b0=FALSE) +E.Beta(N, n, y, x, ck = 1, b0 = FALSE) } \arguments{ -\item{N}{The population size} -\item{n}{The sample size} -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every unit in the selected sample} -\item{x}{Vector, matrix or data frame containing the recollected auxiliary information for every unit in the selected 
sample} -\item{ck}{By default equals to one. It is a vector of weights induced by the structure of variance of the supposed model} -\item{b0}{By default FALSE. The intercept of the regression model} +\item{N}{Population size.} + +\item{n}{Sample size.} + +\item{y}{Vector, matrix or data frame of variables of interest (response).} + +\item{x}{Vector, matrix or data frame of auxiliary variables (predictors).} + +\item{ck}{Optional variance-stabilising constant. Default is \code{1} +(homoscedastic model).} + +\item{b0}{Logical. If \code{TRUE}, an intercept column of ones is +prepended to \code{x}. Default is \code{FALSE}.} } -\seealso{ -\code{\link{GREG.SI}} +\value{ +A three-dimensional array with dimensions \code{[3, P, Q]}, where +\code{P} is the number of auxiliary variables and \code{Q} is the number +of variables of interest. The three rows correspond to: +\itemize{ + \item \code{Beta estimation}: Estimated regression coefficient. + \item \code{Standard Error}: Estimated standard error. + \item \code{CVE}: Estimated coefficient of variation (in percentage). } -\details{Returns the estimation of the population regression coefficients in a supposed linear model, its estimated variance and its estimated coefficient of variation under an SI sampling design} -\value{The function returns a vector whose entries correspond to the estimated parameters of the regression coefficients} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the weighted least squares estimator of regression coefficients +for a finite population under simple random sampling without replacement. +Both the estimated coefficients and their estimated standard errors are +returned. 
+} +\details{ +The estimator uses a working model with weights \eqn{V = 1/(\pi_k c_k)}, +where \eqn{\pi_k = n/N} under simple random sampling and \eqn{c_k} is an +optional variance-stabilising constant. The variance is estimated using +the residual-based sandwich approach of Sarndal et al. (1992). } \examples{ ###################################################################### @@ -108,4 +127,15 @@ estima<-data.frame(Income, Employees) x<-Doma*Taxes E.Beta(N, n, estima,x,ck=1,b0=FALSE) } -\keyword{survey} \ No newline at end of file +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{GREG.SI}}, \code{\link{E.SI}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.PO.rd b/man/E.PO.rd index 5fc282c..8916a4b 100644 --- a/man/E.PO.rd +++ b/man/E.PO.rd @@ -1,45 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.PO.r \name{E.PO} \alias{E.PO} -\title{Estimation of the Population Total under Poisson Sampling Without Replacement} -\description{Computes the Horvitz-Thompson estimator of the population total according to a PO sampling design} +\title{Estimation of the Population Total under Poisson Sampling} \usage{ E.PO(y, Pik) } \arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every unit in the selected sample} -\item{Pik}{Vector of inclusion probabilities for each unit in the selected sample} +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every unit in the selected sample.} + +\item{Pik}{Vector of first-order inclusion probabilities for each unit +in the sample.} } -\seealso{ -\code{\link{S.PO}} +\value{ +A matrix with four rows and one column per variable of 
interest: +\itemize{ + \item \code{Estimation}: Estimated population total. + \item \code{Standard Error}: Estimated standard error of the total. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling. } -\details{Returns the estimation of the population total of every single variable of interest, its estimated standard error and its estimated coefficient of variation under a PO sampling design} -\value{The function returns a data matrix whose columns correspond to the estimated parameters of the variables of interest} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Horvitz-Thompson estimator of the population total under a +Poisson sampling design, where each unit is independently selected with +its own inclusion probability. +} +\details{ +Under Poisson sampling, units are selected independently, so the exact +variance of the Horvitz-Thompson estimator has a simple closed form: +\eqn{V(\hat{t}) = \sum_{k \in U} \pi_k (1 - \pi_k)(y_k/\pi_k)^2}. } \examples{ -# Uses the Lucy data to draw a Poisson sample -data(Lucy) +data('Lucy') attach(Lucy) -N <- dim(Lucy)[1] -# The population size is 2396.
The expected sample size is 400 -# The inclusion probability is proportional to the variable Income +N <- nrow(Lucy) n <- 400 -Pik<-n*Income/sum(Income) -# The selected sample -sam <- S.PO(N,Pik) -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -names(data) -# The inclusion probabilities of each unit in the selected smaple -inclusion <- Pik[sam] -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -E.PO(estima,inclusion) -} -\keyword{survey} \ No newline at end of file +Pik <- PikPPS(n, Employees) +sam <- S.PO(N, Pik) +sam <- sam[sam != 0] +y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +E.PO(y, Pik[sam]) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.PO}}, \code{\link{E.piPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.PPS.rd b/man/E.PPS.rd index 83ca3ae..6cf8992 100644 --- a/man/E.PPS.rd +++ b/man/E.PPS.rd @@ -1,44 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.PPS.r \name{E.PPS} \alias{E.PPS} -\title{Estimation of the Population Total under Probability Proportional to Size Sampling With Replacement} -\description{Computes the Hansen-Hurwitz estimator of the population total according to a probability proportional to size sampling with replacement design} +\title{Estimation of the Population Total under PPS With-Replacement Sampling} \usage{ E.PPS(y, pk) } \arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every unit in the selected sample} -\item{pk}{A vector containing selection probabilities for each unit in the sample} +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every selected unit (with possible repetitions).} + +\item{pk}{Vector of selection probabilities for each draw in the sample.} } -\seealso{ -\code{\link{S.PPS}, \link{HH}} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. + \item \code{Standard Error}: Estimated standard error of the total. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling. 
} -\details{Returns the estimation of the population total of every single variable of interest, its estimated standard error and its estimated coefficient of variation estimated under a probability proportional to size sampling with replacement design} -\value{The function returns a data matrix whose columns correspond to the estimated parameters of the variables of interest} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Hansen-Hurwitz estimator of the population total under a +probability proportional to size with-replacement (PPS-WR) sampling design. +} +\details{ +The Hansen-Hurwitz estimator is \eqn{\hat{t} = (1/m)\sum_{i=1}^m y_i/p_i}, +where \eqn{p_i} is the selection probability of the \eqn{i}-th draw and +\eqn{m} is the number of draws. 
} \examples{ -# Uses the Lucy data to draw a random sample according to a -# PPS with replacement design -data(Lucy) +data('Lucy') attach(Lucy) -# The selection probability of each unit is proportional to the variable Income -m <- 400 -res <- S.PPS(m,Income) -# The selected sample -sam <- res[,1] -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -names(data) -# pk.s is the selection probability of each unit in the selected sample -pk.s <- res[,2] -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -E.PPS(estima,pk.s) -} -\keyword{survey} \ No newline at end of file +m <- 400 +res <- S.PPS(m, Employees) +sam <- res[, 1] +pk <- res[, 2] +y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +E.PPS(y, pk) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.PPS}}, \code{\link{HH}}, \code{\link{E.piPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.Quantile.rd b/man/E.Quantile.rd index 047bf25..652b77d 100644 --- a/man/E.Quantile.rd +++ b/man/E.Quantile.rd @@ -1,78 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.Quantile.r \name{E.Quantile} \alias{E.Quantile} -\title{Estimation of a Population quantile} -\description{Computes the estimation of a population quantile using the principles of the Horvitz-Thompson estimator} +\title{Estimation of Population Quantiles} \usage{ E.Quantile(y, Qn, Pik) } \arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} -\item{Qn}{Quantile of interest} -\item{Pik}{A vector containing inclusion probabilities for each unit in the sample. -If missing, the function will assign the same weights to each unit in the sample} +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every unit in the selected sample.} + +\item{Qn}{Scalar in \eqn{(0, 1)}. The desired quantile level +(e.g. \code{0.5} for the median, \code{0.25} for the first quartile).} + +\item{Pik}{Optional vector of first-order inclusion probabilities. If +omitted, equal probabilities are assumed.} } -\seealso{ -\code{\link{HT}} +\value{ +A numeric vector of length equal to the number of variables in \code{y}, +containing the estimated quantile for each variable. } -\details{Returns the estimation of the population quantile of every single variable of interest} -\value{The function returns a vector whose entries correspond to the estimated quantiles of the variables of interest} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. 
(2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Computes a weighted quantile estimator for finite populations. When +inclusion probabilities are provided, the estimator uses the +Horvitz-Thompson weights \eqn{d_k = 1/\pi_k}; otherwise, equal weights +are assumed (simple random sampling). +} +\details{ +The estimator is based on the weighted empirical cumulative distribution +function. For each variable, units are sorted by their observed value, +cumulative weights are computed, and the quantile is located by +interpolation. } \examples{ ############ ## Example 1 ############ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") -# Vectors y and x give the values of the variables of interest -y<-c(32, 34, 46, 89, 35) -x<-c(52, 60, 75, 100, 50) -z<-cbind(y,x) -# Inclusion probabilities for a design of size n=2 -Pik<-c(0.58, 0.34, 0.48, 0.33, 0.27) -# Estimation of the sample median +y <- c(32, 34, 46, 89, 35) +x <- c(52, 60, 75, 100, 50) +z <- cbind(y, x) +Pik <- c(0.58, 0.34, 0.48, 0.33, 0.27) E.Quantile(y, 0.5) -# Estimation of the sample Q1 E.Quantile(x, 0.25) -# Estimation of the sample Q3 E.Quantile(z, 0.75) -# Estimation of the sample median E.Quantile(z, 0.5, Pik) - ############ ## Example 2 ############ -# Uses the Lucy data to draw a PPS sample with replacement - data(Lucy) attach(Lucy) - -# The selection probability of each unit is proportional to the variable Income -# The sample size is m=400 -m=400 -res <- S.PPS(m,Income) -# The selected sample -sam <- res[,1] -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -# The vector of selection probabilities of units in the sample -pk.s <- res[,2] -# The vector of inclusion probabilities of units in the sample -Pik.s<-1-(1-pk.s)^m -# The information about the sample units is stored in an object called 
data -data <- Lucy[sam,] +m <- 400 +res <- S.PPS(m, Income) +sam <- res[, 1] +pk.s <- res[, 2] +Pik.s <- 1 - (1 - pk.s)^m +data <- Lucy[sam, ] attach(data) -names(data) -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima estima <- data.frame(Income, Employees, Taxes) -# Estimation of sample median -E.Quantile(estima,0.5,Pik.s) +E.Quantile(estima, 0.5, Pik.s) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{E.SI}}, \code{\link{E.piPS}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} \ No newline at end of file diff --git a/man/E.SI.rd b/man/E.SI.rd index 774baa0..46fd8b2 100644 --- a/man/E.SI.rd +++ b/man/E.SI.rd @@ -1,27 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.SI.r \name{E.SI} \alias{E.SI} -\title{Estimation of the Population Total under Simple Random Sampling Without Replacement} -\description{Computes the Horvitz-Thompson estimator of the population total according to an SI sampling design} +\title{Estimation of the Population Total under Simple Random Sampling Without +Replacement} \usage{ E.SI(N, n, y) } \arguments{ -\item{N}{Population size} -\item{n}{Sample size} -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} +\item{N}{Population size.} + +\item{n}{Sample size.} + +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every unit in the selected sample.} } -\seealso{ -\code{\link{S.SI}} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. 
+ \item \code{Standard Error}: Estimated standard error of the total. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect (always 1 under SI, included for + consistency with other estimators). } -\details{Returns the estimation of the population total of every single variable of interest, its estimated standard error and its -estimated coefficient of variation under an SI sampling design} -\value{The function returns a data matrix whose columns correspond to the estimated parameters of the variables of interest} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Horvitz-Thompson estimator of the population total under a +simple random sampling without replacement (SI) design. +} +\details{ +Under simple random sampling without replacement, the Horvitz-Thompson +estimator reduces to \eqn{\hat{t}_y = N \bar{y}_s}, the expansion +estimator. The design effect is always 1 because SI is the reference design. } \examples{ ############ @@ -107,6 +118,16 @@ E.SI(N,n,SPAM.no) # Finnaly, note that the sum of the point estimates of the last two # columns gives exactly the point estimate in the second column E.SI(N,n,SPAM.yes) - } -\keyword{survey} \ No newline at end of file +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.SI}}, \code{\link{E.STSI}}, \code{\link{GREG.SI}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.STPPS.Rd b/man/E.STPPS.Rd new file mode 100644 index 0000000..b54f803 --- /dev/null +++ b/man/E.STPPS.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.STPPS.r +\name{E.STPPS} +\alias{E.STPPS} +\title{Estimation of the Population Total under Stratified PPS With-Replacement Sampling} +\usage{ +E.STPPS(y, pk, mh, S) +} +\arguments{ +\item{y}{Vector, matrix or data frame of variables of interest.} + +\item{pk}{Vector of selection probabilities for each draw in the sample.} + +\item{mh}{Integer vector with the number of draws within each stratum.} + +\item{S}{Vector identifying the stratum membership of each unit in the sample.} +} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. + \item \code{Standard Error}: Estimated standard error. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling. +} +} +\description{ +Computes the Hansen-Hurwitz estimator of the population total under a +stratified PPS with-replacement (STPPS) sampling design. +} +\examples{ +# Uses the Lucy data to draw a stratified random sample +# according to a PPS design in each stratum +data(Lucy) +attach(Lucy) +m1 <- 83; m2 <- 100; m3 <- 200 +mh <- c(m1, m2, m3) +res <- S.STPPS(Level, Income, mh) +sam <- res[, 1] +pk <- res[, 2] +data <- Lucy[sam, ] +attach(data) +estima <- data.frame(Income, Employees, Taxes) +E.STPPS(estima, pk, mh, Level) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.STPPS}}, \code{\link{E.PPS}}, \code{\link{E.STpiPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.STPPS.rd b/man/E.STPPS.rd deleted file mode 100644 index c205bdf..0000000 --- a/man/E.STPPS.rd +++ /dev/null @@ -1,57 +0,0 @@ -\name{E.STPPS} -\alias{E.STPPS} -\title{Estimation of the Population Total under Stratified Probability Proportional to Size Sampling With Replacement} -\description{Computes the Hansen-Hurwitz estimator of the population total according to a probability proportional to size -sampling with replacement design} -\usage{ -E.STPPS(y, pk, mh, S) -} -\arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} -\item{pk}{A vector containing selection probabilities for each unit in the sample} -\item{mh}{Vector of sample size in each stratum} -\item{S}{Vector identifying the membership to the strata of each unit in selected sample} -} -\seealso{ -\code{\link{S.STPPS}} -} -\details{Returns the estimation of the population total of every single variable of interest, its estimated standard error and its estimated coefficient of variation in all of the stratum and finally in the entire population} -\value{The function returns an array composed by several matrices representing each variable of interest. The columns of each matrix -correspond to the estimated parameters of the variables of interest in each stratum and in the entire population} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. 
-} -\examples{ -# Uses the Lucy data to draw a stratified random sample -# according to a PPS design in each stratum - -data(Lucy) -attach(Lucy) -# Level is the stratifying variable -summary(Level) -# Defines the sample size at each stratum -m1<-83 -m2<-100 -m3<-200 -mh<-c(m1,m2,m3) -# Draws a stratified sample -res<-S.STPPS(Level, Income, mh) -# The selected sample -sam<-res[,1] -# The selection probability of each unit in the selected sample -pk <- res[,2] -pk -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -names(data) -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -E.STPPS(estima,pk,mh,Level) -} -\keyword{survey} \ No newline at end of file diff --git a/man/E.STSI.Rd b/man/E.STSI.Rd new file mode 100644 index 0000000..395db8b --- /dev/null +++ b/man/E.STSI.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.STSI.r +\name{E.STSI} +\alias{E.STSI} +\title{Estimation of the Population Total under Stratified Simple Random Sampling} +\usage{ +E.STSI(S, Nh, nh, y) +} +\arguments{ +\item{S}{Vector identifying the stratum membership of each unit in the sample.} + +\item{Nh}{Integer vector with the population size of each stratum.} + +\item{nh}{Integer vector with the sample size of each stratum.} + +\item{y}{Vector, matrix or data frame of variables of interest.} +} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. + \item \code{Standard Error}: Estimated standard error. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling. 
+} +} +\description{ +Computes the Horvitz-Thompson estimator of the population total under a +stratified simple random sampling without replacement (STSI) design. +} +\examples{ +############ +## Example 1 +############ +data(Lucy) +attach(Lucy) +N1 <- summary(Level)[[1]] +N2 <- summary(Level)[[2]] +N3 <- summary(Level)[[3]] +Nh <- c(N1, N2, N3) +n1 <- N1; n2 <- 100; n3 <- 200 +nh <- c(n1, n2, n3) +sam <- S.STSI(Level, Nh, nh) +data <- Lucy[sam, ] +attach(data) +estima <- data.frame(Income, Employees, Taxes) +E.STSI(Level, Nh, nh, estima) +############ +## Example 2 +############ +# The variable SPAM is a domain of interest +Doma <- Domains(SPAM) +SPAM.no <- estima * Doma[, 1] +SPAM.yes <- estima * Doma[, 2] +E.STSI(Level, Nh, nh, Doma) +E.STSI(Level, Nh, nh, SPAM.no) +E.STSI(Level, Nh, nh, SPAM.yes) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.STSI}}, \code{\link{E.SI}}, \code{\link{E.STpiPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.STSI.rd b/man/E.STSI.rd deleted file mode 100644 index be94d78..0000000 --- a/man/E.STSI.rd +++ /dev/null @@ -1,73 +0,0 @@ -\name{E.STSI} -\alias{E.STSI} -\title{Estimation of the Population Total under Stratified Simple Random Sampling Without Replacement} -\description{Computes the Horvitz-Thompson estimator of the population total according to a STSI sampling design} -\usage{ -E.STSI(S, Nh, nh, y) -} -\arguments{ -\item{S}{Vector identifying the membership to the strata of each unit in the population} -\item{Nh}{Vector of stratum sizes} -\item{nh}{Vector of sample sizes in each stratum} -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} -} -\seealso{ -\code{\link{S.STSI}} -} -\details{Returns the estimation of the population total of every single variable of interest, its estimated standard error and its estimated coefficient of variation in all of the strata and finally in the entire population} -\value{The function returns an array composed by several matrices representing each variable of interest. The columns of each matrix -correspond to the estimated parameters of the variables of interest in each stratum and in the entire population} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. 
-} -\examples{ -############ -## Example 1 -############ -# Uses the Lucy data to draw a stratified random sample -# according to a SI design in each stratum - -data(Lucy) -attach(Lucy) -# Level is the stratifying variable -summary(Level) -# Defines the size of each stratum -N1<-summary(Level)[[1]] -N2<-summary(Level)[[2]] -N3<-summary(Level)[[3]] -N1;N2;N3 -Nh <- c(N1,N2,N3) -# Defines the sample size at each stratum -n1<-N1 -n2<-100 -n3<-200 -nh<-c(n1,n2,n3) -# Draws a stratified sample -sam <- S.STSI(Level, Nh, nh) -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -names(data) -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -E.STSI(Level,Nh,nh,estima) - -############ -## Example 2 -############ -# Following with Example 1. The variable SPAM is a domain of interest -Doma <- Domains(SPAM) -# This function allows to estimate the parameters of the variables of interest -# for every category in the domain SPAM -SPAM.no <- estima*Doma[,1] -SPAM.yes <- estima*Doma[,2] -E.STSI(Level, Nh, nh, Doma) -E.STSI(Level, Nh, nh, SPAM.no) -E.STSI(Level, Nh, nh, SPAM.yes) -} -\keyword{survey} \ No newline at end of file diff --git a/man/E.STpiPS.Rd b/man/E.STpiPS.Rd index 1161fa9..0409a83 100644 --- a/man/E.STpiPS.Rd +++ b/man/E.STpiPS.Rd @@ -1,62 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.STpiPS.R \name{E.STpiPS} \alias{E.STpiPS} -\title{Estimation of the Population Total under Stratified Probability Proportional to Size Sampling Without Replacement} -\description{Computes the Horvitz-Thompson estimator of the population total according to a probability proportional to size -sampling without replacement design in each stratum} +\title{Estimation of the Population Total under Stratified piPS Sampling} \usage{ -E.STpiPS(y, pik, S) +E.STpiPS(y, Pik, 
S) } \arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} -\item{pik}{A vector containing inclusion probabilities for each unit in the sample} -\item{S}{Vector identifying the membership to the strata of each unit in selected sample} +\item{y}{Vector, matrix or data frame of variables of interest.} + +\item{Pik}{Vector of first-order inclusion probabilities for each unit +in the sample.} + +\item{S}{Vector identifying the stratum membership of each unit in the sample.} } -\seealso{ -\code{\link{S.STpiPS}} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. + \item \code{Standard Error}: Estimated standard error. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling. } -\details{Returns the estimation of the population total of every single variable of interest, its estimated standard error, its estimated coefficient of variation and its corresponding DEFF in all of the strata and finally in the entire population} -\value{The function returns an array composed by several matrices representing each variable of interest. The columns of each matrix -correspond to the estimated parameters of the variables of interest in each stratum and in the entire population} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Horvitz-Thompson estimator of the population total under a +stratified without-replacement probability proportional to size (piPS) +sampling design. 
} \examples{ -# Uses the Lucy data to draw a stratified random sample -# according to a PPS design in each stratum - +# Uses the Lucy data to draw a stratified random sample +# according to a piPS design in each stratum data(Lucy) attach(Lucy) -# Level is the stratifying variable -summary(Level) - -# Defines the size of each stratum -N1<-summary(Level)[[1]] -N2<-summary(Level)[[2]] -N3<-summary(Level)[[3]] -N1;N2;N3 - -# Defines the sample size at each stratum -n1<-N1 -n2<-100 -n3<-200 -nh<-c(n1,n2,n3) -nh -# Draws a stratified sample -S <- Level -x <- Employees - +N1 <- summary(Level)[[1]] +N2 <- summary(Level)[[2]] +N3 <- summary(Level)[[3]] +nh <- c(N1, 100, 200) +S <- Level +x <- Employees res <- S.STpiPS(S, x, nh) -sam <- res[,1] -pik <- res[,2] - -data <- Lucy[sam,] +sam <- res[, 1] +pik <- res[, 2] +data <- Lucy[sam, ] attach(data) - estima <- data.frame(Income, Employees, Taxes) -E.STpiPS(estima,pik,Level) +E.STpiPS(estima, pik, Level) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.STpiPS}}, \code{\link{E.piPS}}, \code{\link{E.STSI}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} \ No newline at end of file diff --git a/man/E.SY.rd b/man/E.SY.rd index de27043..fc2e6f7 100644 --- a/man/E.SY.rd +++ b/man/E.SY.rd @@ -1,43 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.SY.r \name{E.SY} \alias{E.SY} -\title{Estimation of the Population Total under Systematic Sampling Without Replacement} -\description{Computes the Horvitz-Thompson estimator of the population total according to an SY sampling design} +\title{Estimation of the Population Total under Systematic Sampling} \usage{ E.SY(N, a, y) } \arguments{ -\item{N}{Population size} -\item{a}{Number of groups dividing the population} -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} +\item{N}{Population size.} + +\item{a}{Sampling interval (skip). The expected sample size is \code{N/a}.} + +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every unit in the selected sample.} } -\seealso{ -\code{\link{S.SY}} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. + \item \code{Standard Error}: Estimated standard error of the total. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling. } -\details{Returns the estimation of the population total of every single variable of interest, its estimated standard error and its estimated coefficient of variation under an SY sampling design} -\value{The function returns a data matrix whose columns correspond to the estimated parameters of the variables of interest} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. 
and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Horvitz-Thompson estimator of the population total under a +systematic sampling design with sampling interval \code{a}. +} +\details{ +Under systematic sampling the sample size is approximately \eqn{n = N/a}. Because only +one systematic sample is observed, the variance cannot be estimated without +assumptions. Here, the variance is approximated by treating the systematic +sample as a simple random sample of the same size. This tends to be conservative when the population +order is random or follows a trend in \code{y}, but it can understate the variance under periodicity. } \examples{ -# Uses the Lucy data to draw a Systematic sample -data(Lucy) +data('Lucy') attach(Lucy) - -N <- dim(Lucy)[1] -# The population is divided in 6 groups -# The selected sample -sam <- S.SY(N,6) -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -names(data) -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -E.SY(N,6,estima) -} -\keyword{survey} \ No newline at end of file +N <- nrow(Lucy) +a <- 10 +sam <- S.SY(N, a) +y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +E.SY(N, a, y) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.SY}}, \code{\link{E.SI}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.WR.rd b/man/E.WR.rd index c3ce6e4..1970d78 100644 --- a/man/E.WR.rd +++ b/man/E.WR.rd @@ -1,44 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.WR.r \name{E.WR} \alias{E.WR} -\title{Estimation of the Population Total under Simple Random Sampling With Replacement} -\description{Computes the Hansen-Hurwitz estimator of the population total according to a simple random -sampling with replacement design} +\title{Estimation of the Population Total under Simple Random Sampling With +Replacement} \usage{ E.WR(N, m, y) } \arguments{ -\item{N}{Population size} -\item{m}{Sample size} -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} +\item{N}{Population size.} + +\item{m}{Number of draws (sample size with replacement).} + +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every draw in the sample (repetitions allowed).} } -\seealso{ -\code{\link{S.WR}} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. + \item \code{Standard Error}: Estimated standard error of the total. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling + without replacement. } -\details{Returns the estimation of the population total of every single variable of interest, its estimated variance and its -estimated coefficient of variation estimated under an simple random with replacement design} -\value{The function returns a data matrix whose columns correspond to the estimated parameters of the variables of interest} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. 
and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Hansen-Hurwitz estimator of the population total under a +simple random sampling with replacement (WR) design. +} +\details{ +Under simple random sampling with replacement with \code{m} draws, the +Hansen-Hurwitz estimator is \eqn{\hat{t} = (N/m)\sum_{i=1}^m y_i}. } \examples{ -# Uses the Lucy data to draw a random sample according to a WR design -data(Lucy) +data('Lucy') attach(Lucy) - -N <- dim(Lucy)[1] -m <- 400 -sam <- S.WR(N,m) -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -names(data) -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -E.WR(N,m,estima) -} -\keyword{survey} \ No newline at end of file +N <- nrow(Lucy) +m <- 400 +sam <- S.WR(N, m) +y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +E.WR(N, m, y) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.WR}}, \code{\link{E.SI}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/E.piPS.rd b/man/E.piPS.rd index bba91a6..72587d0 100644 --- a/man/E.piPS.rd +++ b/man/E.piPS.rd @@ -1,50 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/E.piPS.r \name{E.piPS} \alias{E.piPS} -\title{Estimation of the Population Total under Probability Proportional to Size Sampling Without Replacement} -\description{Computes the Horvitz-Thompson estimator of the population total according to a \eqn{\pi}PS sampling design} +\title{Estimation of the Population Total under Pi Probability Proportional to +Size Sampling} \usage{ E.piPS(y, Pik) } \arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} -\item{Pik}{Vector of inclusion probabilities for each unit in the selected sample} +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every unit in the selected sample.} + +\item{Pik}{Vector of first-order inclusion probabilities for each +unit in the sample.} } -\seealso{ -\code{\link{S.piPS}} +\value{ +A matrix with four rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: Estimated population total. + \item \code{Standard Error}: Estimated standard error of the total. + \item \code{CVE}: Estimated coefficient of variation (in percentage). + \item \code{DEFF}: Design effect with respect to simple random sampling. } -\details{Returns the estimation of the population total of every single variable of interest, its estimated variance and its estimated coefficient of variation under a \eqn{\pi}PPS sampling design. 
This function uses the results of approximate expressions for -the estimated variance of the Horvitz-Thompson estimator} -\value{The function returns a data matrix whose columns correspond to the estimated parameters of the variables of interest} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Matei, A. and Tille, Y. (2005), Evaluation of Variance Approximations and Estimators in Maximun -Entropy Sampling with Unequal Probability and Fixed Sample Design. \emph{Journal of Official Statistics}. Vol 21, 4, 543-570.\cr -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Horvitz-Thompson estimator of the population total under a +without-replacement probability proportional to size (piPS) sampling design. +The variance is estimated using the Horvitz-Thompson variance approximation +based on first-order inclusion probabilities. +} +\details{ +When every inclusion probability equals one (i.e. \code{sum(Pik) == n}), +the variance is set to zero, since all sampled units were certain selections. 
} \examples{ -# Uses the Lucy data to draw a sample according to a piPS -# without replacement design -data(Lucy) +data('Lucy') attach(Lucy) -# The inclusion probability of each unit is proportional to the variable Income -# The selected sample of size n=400 +N <- nrow(Lucy) n <- 400 -res <- S.piPS(n, Income) -sam <- res[,1] -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -names(data) -# Pik.s is the inclusion probability of every single unit in the selected sample -Pik.s <- res[,2] -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -E.piPS(estima,Pik.s) -# Same results than HT function -HT(estima, Pik.s) -} -\keyword{survey} \ No newline at end of file +x <- Employees +res <- S.piPS(n, x) +sam <- res[, 1] +Pik <- res[, 2] +y <- data.frame(Income = Income[sam], Expenditure = Expenditure[sam]) +E.piPS(y, Pik) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.piPS}}, \code{\link{PikPPS}}, \code{\link{E.PO}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/GREG.SI.rd b/man/GREG.SI.rd index f77b9fb..32fd044 100644 --- a/man/GREG.SI.rd +++ b/man/GREG.SI.rd @@ -1,29 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GREG.SI.r \name{GREG.SI} \alias{GREG.SI} -\title{The Generalized Regression Estimator under SI sampling design} -\description{Computes the generalized regression estimator of the population total for several variables of interest under simple random sampling without replacement} +\title{Generalised Regression Estimator under Simple Random Sampling} \usage{ -GREG.SI(N, n, y, x, tx, b, b0=FALSE) +GREG.SI(N, n, y, x, tx, b, b0 = FALSE) } \arguments{ -\item{N}{The population size} -\item{n}{The sample size} -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} -\item{x}{Vector, matrix or data frame containing the recollected auxiliary information for every unit in the selected sample} -\item{tx}{Vector containing the populations totals of the auxiliary information} -\item{b}{Vector of estimated regression coefficients} -\item{b0}{By default FALSE. The intercept of the regression model} +\item{N}{Population size.} + +\item{n}{Sample size.} + +\item{y}{Vector, matrix or data frame of variables of interest.} + +\item{x}{Vector, matrix or data frame of auxiliary variables observed +in the sample.} + +\item{tx}{Vector of known population totals for the auxiliary variables.} + +\item{b}{Matrix of regression coefficients (e.g. from \code{\link{E.Beta}}).} + +\item{b0}{Logical. If \code{TRUE}, an intercept column is prepended to +\code{x}. Default is \code{FALSE}.} } -\seealso{ -\code{\link{E.Beta}} +\value{ +A matrix with three rows and one column per variable of interest: +\itemize{ + \item \code{Estimation}: GREG estimated population total. 
+ \item \code{Standard Error}: Estimated standard error. + \item \code{CVE}: Estimated coefficient of variation (in percentage). } -\value{The function returns a vector of total population estimates for each variable of interest, its estimated standard error and its estimated coefficient of variation.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Computes the Generalised Regression (GREG) estimator of the population +total under simple random sampling without replacement, using known +population totals of auxiliary variables to improve efficiency. +} +\details{ +The GREG estimator is: +\deqn{\hat{t}_{GREG} = \hat{t}_{HT} + (\mathbf{t}_x - +\hat{\mathbf{t}}_{x,HT})^T \hat{\boldsymbol{\beta}}} +where \eqn{\hat{\boldsymbol{\beta}}} are the regression coefficients +estimated from the sample, \eqn{\mathbf{t}_x} are the known population +totals, and variance is estimated from the residuals. } \examples{ ###################################################################### @@ -174,7 +193,17 @@ estima<-Doma*Taxes model <- E.Beta(N, n, estima, Doma, ck=1, b0=FALSE) b <- t(as.matrix(model[1,,])) tx <- colSums(Domains(Lucy$Level)) -GREG.SI(N,n,estima,Doma,tx, b, b0=FALSE) - +GREG.SI(N,n,estima,Doma,tx, b, b0=FALSE) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{E.Beta}}, \code{\link{E.SI}}, \code{\link{Wk}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} \ No newline at end of file diff --git a/man/HH.rd b/man/HH.Rd similarity index 70% rename from man/HH.rd rename to man/HH.Rd index 57a142b..fcd94da 100644 --- a/man/HH.rd +++ b/man/HH.Rd @@ -1,28 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HH.r \name{HH} \alias{HH} -\title{The Hansen-Hurwitz Estimator} -\description{Computes the Hansen-Hurwitz Estimator estimator of the population total for several variables of interest} +\title{Hansen-Hurwitz Estimator of the Population Total} \usage{ HH(y, pk) } \arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every unit in the selected sample} -\item{pk}{A vector containing selection probabilities for each unit in the selected sample} +\item{y}{Vector or matrix of values of the variable(s) of interest for +units in the sample (with possible repetitions).} + +\item{pk}{Vector of selection probabilities for each draw in the sample.} } -\seealso{ -\code{\link{HT}} +\value{ +A numeric vector or matrix with the estimated total for each variable +of interest. } -\details{The Hansen-Hurwitz estimator is given by -\deqn{\sum_{i=1}^m\frac{y_i}{p_i}} -where \eqn{y_i} is the value of the variables of interest for the \eqn{i}th unit, and \eqn{p_i} is its corresponding -selection probability. This estimator is restricted to with replacement sampling designs. +\description{ +Computes the Hansen-Hurwitz (HH) estimator of the population total under +a with-replacement sampling design, given the sample observations and +their selection probabilities. 
} -\value{The function returns a vector of total population estimates for each variable of interest, its estimated standard error and its estimated coefficient of variation.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\details{ +The Hansen-Hurwitz estimator is: +\deqn{\hat{t}_{HH} = \frac{1}{m}\sum_{i=1}^m \frac{y_i}{p_i}} +where \eqn{p_i} is the selection probability of the \eqn{i}-th draw +and \eqn{m} is the number of draws. This estimator is design-unbiased +under any with-replacement sampling design. } \examples{ ############ @@ -127,5 +131,15 @@ data.frame(Ind, Est, p) sum(Est*p) sum(y) } - -\keyword{survey} +\references{ +Hansen, M.H. and Hurwitz, W.N. (1943). On the theory of sampling from +finite populations. \emph{Annals of Mathematical Statistics}, 14, 333-362.\cr +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer. 
+} +\seealso{ +\code{\link{E.PPS}}, \code{\link{HT}}, \code{\link{S.PPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/HT.rd b/man/HT.rd index 0f84ddf..c272605 100644 --- a/man/HT.rd +++ b/man/HT.rd @@ -1,31 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HT.r \name{HT} \alias{HT} -\title{The Horvitz-Thompson Estimator} -\description{Computes the Horvitz-Thompson estimator of the population total for several -variables of interest} +\title{Horvitz-Thompson Estimator of the Population Total} \usage{ HT(y, Pik) } \arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} -\item{Pik}{A vector containing the inclusion probabilities for each unit in the selected sample} +\item{y}{Vector or matrix of values of the variable(s) of interest for +units in the sample.} + +\item{Pik}{Vector of first-order inclusion probabilities for each unit +in the sample.} } -\seealso{ -\code{\link{HH}} +\value{ +A numeric vector or matrix with the estimated total for each variable +of interest. } -\details{The Horvitz-Thompson estimator is given by -\deqn{\sum_{k \in U}\frac{y_k}{{\pi}_k}} -where \eqn{y_k} is the value of the variables of interest for the \eqn{k}th unit, and \eqn{{\pi}_k} -its corresponding inclusion probability. This estimator could be used for without replacement designs -as well as for with replacement designs. +\description{ +Computes the Horvitz-Thompson (HT) estimator of the population total for +one or more variables of interest, given the sample observations and their +first-order inclusion probabilities. } -\value{The function returns a vector of total population estimates for each variable of interest.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. 
A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\details{ +The Horvitz-Thompson estimator is defined as: +\deqn{\hat{t}_{y,\pi} = \sum_{k \in s} \frac{y_k}{\pi_k}} +where \eqn{\pi_k} is the first-order inclusion probability of unit \eqn{k}. +This estimator is design-unbiased for any sampling design with all \eqn{\pi_k > 0}. } \examples{ ############ @@ -300,4 +301,16 @@ data.frame(Ind, Est, p) sum(Est*p) sum(y) } -\keyword{survey} +\references{ +Horvitz, D.G. and Thompson, D.J. (1952). A generalization of sampling +without replacement from a finite universe. +\emph{Journal of the American Statistical Association}, 47, 663-685.\cr +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer. +} +\seealso{ +\code{\link{VarHT}}, \code{\link{E.SI}}, \code{\link{E.piPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/IPFP.rd b/man/IPFP.rd index 9b5a7d2..94ec3b5 100644 --- a/man/IPFP.rd +++ b/man/IPFP.rd @@ -1,73 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/IPFP.r \name{IPFP} \alias{IPFP} -\title{Iterative Proportional Fitting Procedure} -\description{Adjustment of a table on the margins} +\title{Iterative Proportional Fitting Procedure (Raking)} \usage{ -IPFP(Table, Col.knw, Row.knw, tol=0.0001) +IPFP(Table, Col.knw, Row.knw, tol = 1e-04) } \arguments{ -\item{Table}{A contingency table} -\item{Col.knw}{A vector containing the true totals of the columns} -\item{Row.knw}{A vector containing the true totals of the Rows} -\item{tol}{The control value, by default equal to 0.0001} +\item{Table}{A matrix or data frame of initial cell counts or weights to +be adjusted.} + +\item{Col.knw}{Numeric vector of known column marginal totals.} + +\item{Row.knw}{Numeric vector of known row marginal totals.} + +\item{tol}{Convergence tolerance. 
The algorithm stops when the total +absolute deviation between known and estimated marginals is below +\code{tol}. Default is \code{0.0001}.} } -\details{Adjust a contingency table on the know margins of the population with the Raking Ratio method} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Deming, W. & Stephan, F. (1940), On a least squares adjustment of a sampled frequency -table when the expected marginal totals are known. \emph{Annals of Mathematical Statistics}, 11, 427-444.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\value{ +A matrix with \code{nrow(Table) + 1} rows and \code{ncol(Table) + 1} +columns containing the adjusted cell counts, with an added row of +estimated column totals and an added column of estimated row totals. +} +\description{ +Adjusts a contingency table so that its row and column marginals match +known population totals, using the Iterative Proportional Fitting +Procedure (IPFP), also known as raking or RAS algorithm. +} +\details{ +The algorithm alternates between row and column adjustments until +convergence. At each step, cells in each row (or column) are multiplied +by the ratio of the known marginal to the current estimated marginal. +Convergence is assessed by the sum of absolute differences between +known and estimated marginals. 
} \examples{ ############ ## Example 1 ############ -# Some example of Ardilly and Tille -Table <- matrix(c(80,90,10,170,80,80,150,210,130),3,3) -rownames(Table) <- c("a1", "a2","a3") -colnames(Table) <- c("b1", "b2","b3") -# The table with labels -Table -# The known and true margins -Col.knw <- c(150,300,550) -Row.knw <- c(430,360,210) -# The adjusted table -IPFP(Table,Col.knw,Row.knw,tol=0.0001) - +Table <- matrix(c(80, 90, 10, 170, 80, 80, 150, 210, 130), 3, 3) +rownames(Table) <- c("a1", "a2", "a3") +colnames(Table) <- c("b1", "b2", "b3") +Col.knw <- c(150, 300, 550) +Row.knw <- c(430, 360, 210) +IPFP(Table, Col.knw, Row.knw, tol = 0.0001) ############ ## Example 2 ############ -# Draws a simple random sample data(Lucy) attach(Lucy) - -N<-dim(Lucy)[1] -n<-400 -sam<-sample(N,n) -data<-Lucy[sam,] +N <- dim(Lucy)[1] +n <- 400 +sam <- sample(N, n) +data <- Lucy[sam, ] attach(data) -dim(data) -# Two domains of interest -Doma1<-Domains(Level) -Doma2<-Domains(SPAM) -# Cross tabulate of domains -SPAM.no<-Doma2[,1]*Doma1 -SPAM.yes<-Doma2[,2]*Doma1 -# Estimation -E.SI(N,n,Doma1) -E.SI(N,n,Doma2) -est1 <-E.SI(N,n,SPAM.no)[,2:4] -est2 <-E.SI(N,n,SPAM.yes)[,2:4] -est1;est2 -# The contingency table estimated from above -Table <- cbind(est1[1,],est2[1,]) -rownames(Table) <- c("Big", "Medium","Small") -colnames(Table) <- c("SPAM.no", "SPAM.yes") -# The known and true margins +Doma1 <- Domains(Level) +Doma2 <- Domains(SPAM) +SPAM.no <- Doma2[, 1] * Doma1 +SPAM.yes <- Doma2[, 2] * Doma1 +est1 <- E.SI(N, n, SPAM.no)[, 2:4] +est2 <- E.SI(N, n, SPAM.yes)[, 2:4] +Table <- cbind(est1[1, ], est2[1, ]) Col.knw <- colSums(Domains(Lucy$SPAM)) -Row.knw<- colSums(Domains(Lucy$Level)) -# The adjusted table -IPFP(Table,Col.knw,Row.knw,tol=0.0001) +Row.knw <- colSums(Domains(Lucy$Level)) +IPFP(Table, Col.knw, Row.knw, tol = 0.0001) +} +\references{ +Deming, W.E. and Stephan, F.F. (1940). On a least squares adjustment of +a sampled frequency table when the expected marginal totals are known. 
+\emph{Annals of Mathematical Statistics}, 11(4), 427-444.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{Domains}}, \code{\link{Wk}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} \ No newline at end of file diff --git a/man/Ik.rd b/man/Ik.rd index be3a90a..61ce770 100644 --- a/man/Ik.rd +++ b/man/Ik.rd @@ -1,33 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Ik.r \name{Ik} \alias{Ik} -\title{Sample Membership Indicator} -\description{Creates a matrix of values (0, if the unit belongs to a specified sample and 1, otherwise) -for every possible sample under fixed sample size designs without replacement} +\title{Sample Membership Indicator Matrix} \usage{ Ik(N, n) } \arguments{ -\item{N}{Population size} -\item{n}{Sample size} +\item{N}{Population size. Recommended \code{N <= 15}.} + +\item{n}{Sample size.} } -\seealso{ -\code{\link{Support}, \link{Pik}} +\value{ +A binary matrix of dimension \code{choose(N, n) x N}, where entry +\eqn{(s, k) = 1} if unit \eqn{k} belongs to sample \eqn{s}, and 0 +otherwise. } -\value{The function returns a matrix of \eqn{binom(N)(n)} rows and \eqn{N} columns. The \eqn{k}th column corresponds to the sample -membership indicator, of the \eqn{k}th unit, to a possible sample.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Constructs the indicator matrix of the sampling support for a fixed-size +without-replacement design. Each row corresponds to one possible sample +and each column to one population unit. 
+} +\details{ +The full enumeration of all \code{choose(N, n)} possible samples is +computationally feasible only for small populations. For \code{N > 15} +this function will be very slow. It is intended primarily for theoretical +illustrations and teaching purposes. } \examples{ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) n <- 2 -# The sample membership matrix for fixed size without replacement sampling designs -Ik(N,n) -# The first unit, Yves, belongs to the first four possible samples +# The sample membership matrix +Ik(N, n) +# The first unit, Yves, belongs to the first four possible samples +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{Pik}}, \code{\link{Pikl}}, \code{\link{Support}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} diff --git a/man/IkRS.rd b/man/IkRS.rd index 8020380..59cc45a 100644 --- a/man/IkRS.rd +++ b/man/IkRS.rd @@ -1,32 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/IkRS.r \name{IkRS} \alias{IkRS} -\title{Sample Membership Indicator for Random Size sampling designs} -\description{Creates a matrix of values (0, if the unit belongs to a specified sample and 1, otherwise) -for every possible sample under random sample size designs without replacement} +\title{Sample Membership Indicator Matrix for All Possible Sample Sizes} \usage{ IkRS(N) } \arguments{ -\item{N}{Population size} +\item{N}{Population size. Recommended \code{N <= 10}.} } -\seealso{ -\code{\link{SupportRS}, \link{Pik}} +\value{ +A binary matrix with \eqn{2^N} rows (one per subset of the population, including +the empty set as the first row of zeros) and \code{N} columns. 
Entry +\eqn{(s, k) = 1} if unit \eqn{k} belongs to subset \eqn{s}. } -\value{The function returns a matrix of \eqn{2^N} rows and \eqn{N} columns. The \eqn{k}th column corresponds to the sample -membership indicator, of the \eqn{k}th unit, to a possible sample.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Constructs the indicator matrix of the complete sampling support: the empty +sample followed by the indicator matrices for all sample sizes from 1 to \code{N}. This +covers every possible subset of the population. +} +\details{ +This function calls \code{\link{Ik}} for each possible sample size +\eqn{n = 1, \ldots, N} and stacks the results. It is intended for small +populations only (\code{N <= 10}) due to the exponential growth of the +support size. } \examples{ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) -n <- 3 -# The sample membership matrix for fixed size without replacement sampling designs +# The sample membership matrix for all sample sizes IkRS(N) # The first sample is a null one and the last sample is a census } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{Ik}}, \code{\link{SupportRS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/IkWR.rd b/man/IkWR.rd index b49599b..9e8a5e7 100644 --- a/man/IkWR.rd +++ b/man/IkWR.rd @@ -1,32 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/IkWR.r \name{IkWR} \alias{IkWR} -\title{Sample Membership Indicator for with Replacements sampling designs} -\description{Creates a matrix of values (1, if the unit belongs to a specified sample and 0, otherwise) -for every possible sample under fixed sample size designs without replacement} +\title{Frequency Indicator Matrix for With-Replacement Sampling} \usage{ IkWR(N, m) } \arguments{ -\item{N}{Population size} -\item{m}{Sample size} +\item{N}{Population size. Keep small due to combinatorial growth.} + +\item{m}{Number of draws (sample size with replacement).} } -\seealso{ -\code{\link{nk}, \link{Support}, \link{Pik}} +\value{ +A binary matrix of dimension \code{choose(N+m-1, m) x N}, where entry +\eqn{(s, k) = 1} if unit \eqn{k} appears in the \eqn{s}-th outcome of +the with-replacement support, and 0 otherwise. } -\value{The function returns a matrix of \eqn{binom(N+m-1)(m)} rows and \eqn{N} columns. The \eqn{k}th column corresponds to the sample -membership indicator, of the \eqn{k}th unit, to a possible sample. It returns a value of 1, even if the element is selected more than once in a with replacement sample.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Constructs the indicator matrix of the with-replacement sampling support +for a population of size \code{N} and \code{m} draws. 
Each row corresponds +to one possible unordered outcome and each column to one population unit, +with entry \eqn{(s, k) = 1} if unit \eqn{k} was selected at least once +in outcome \eqn{s}. +} +\details{ +The with-replacement support is enumerated via \code{\link{SupportWR}}. +This function is intended for small populations and few draws only, as the +support grows rapidly with \code{N} and \code{m}. } \examples{ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) m <- 2 -# The sample membership matrix for fixed size without replacement sampling designs -IkWR(N,m) +# The sample membership matrix for with-replacement sampling +IkWR(N, m) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{Ik}}, \code{\link{SupportWR}}, \code{\link{nk}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} diff --git a/man/Lucy.rd b/man/Lucy.rd deleted file mode 100644 index 99aaf76..0000000 --- a/man/Lucy.rd +++ /dev/null @@ -1,54 +0,0 @@ -\name{Lucy} -\docType{data} -\alias{Lucy} -\title{Some Business Population Database} -\description{ -This data set corresponds to a random sample of BigLucy. It contains some financial variables of 2396 industrial companies of a city in a particular fiscal year. -} -\seealso{ -\code{\link{BigLucy}, \link{BigCity}} -} -\usage{data(Lucy)} -\format{ - \describe{ -\item{ID}{The identifier of the company. It correspond to an alphanumeric sequence (two letters and three digits)} -\item{Ubication}{The address of the principal office of the company in the city} -\item{Level}{The industrial companies are discrimitnated according to the Taxes declared. 
-There are small, medium and big companies} -\item{Zone}{The city is divided by geoghrafical zones. A company is classified in a particular zone according to its address} -\item{Income}{The total ammount of a company's earnings (or profit) in the previuos fiscal year. It is calculated by taking -revenues and adjusting for the cost of doing business} -\item{Employees}{The total number of persons working for the company in the previuos fiscal year} -\item{Taxes}{The total ammount of a company's income Tax} -\item{SPAM}{Indicates if the company uses the Internet and WEBmail options in order to make self-propaganda.} -} -} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. -} - -\examples{ -data(Lucy) -attach(Lucy) -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -# The population totals -colSums(estima) -# Some parameters of interest -table(SPAM,Level) -xtabs(Income ~ Level+SPAM) -# Correlations among characteristics of interest -cor(estima) -# Some useful histograms -hist(Income) -hist(Taxes) -hist(Employees) -# Some useful plots -boxplot(Income ~ Level) -barplot(table(Level)) -pie(table(SPAM)) -} -\keyword{datasets} diff --git a/man/OrderWR.rd b/man/OrderWR.rd index f9d31fb..0498237 100644 --- a/man/OrderWR.rd +++ b/man/OrderWR.rd @@ -1,54 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/OrderWR.r \name{OrderWR} \alias{OrderWR} -\title{Pseudo-Support for Fixed Size With Replacement Sampling Designs} -\description{Creates a matrix containing every possible ordered sample under fixed sample size with replacement designs} +\title{Ordered With-Replacement Sampling Support} \usage{ -OrderWR(N,m,ID=FALSE) +OrderWR(N, m, ID = FALSE) } 
\arguments{ -\item{N}{Population size} -\item{m}{Sample size} -\item{ID}{By default FALSE, a vector of values (numeric or string) identifying each unit in the population} +\item{N}{Population size.} + +\item{m}{Number of draws.} + +\item{ID}{Optional vector of population labels of length \code{N}. +If provided, labels are substituted for integer indices in the output. +If \code{FALSE} (default), integer indices are returned.} } -\seealso{ -\code{\link{SupportWR}, \link{Support}} +\value{ +A matrix with \code{N^m} rows and \code{m} columns, where each row is one +ordered sequence of draws. If \code{ID} is provided, population labels are +substituted for indices. } -\details{The number of samples in a with replacement support is not equal to the number -of ordered samples induced by a with replacement sampling design.} -\value{The function returns a matrix of \eqn{N^m} rows and \eqn{m} columns. Each row of this matrix -corresponds to a possible ordered sample.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}. The author acknowledges to Hanwen Zhang for valuable suggestions.} -\references{ -Tille, Y. (2006), \emph{Sampling Algorithms}. Springer\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas +\description{ +Enumerates all ordered sequences of \code{m} draws from a population of +size \code{N} with replacement. Unlike \code{\link{SupportWR}}, this +function considers order, so sequences that differ only in draw order are +treated as distinct outcomes. +} +\details{ +The total number of ordered with-replacement sequences of size \code{m} +from \code{N} units is \eqn{N^m}. This grows rapidly and the function +should only be used for small \code{N} and \code{m}. 
} \examples{ -# Vector U contains the label of a population U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) -# Under this context, there are five (5) possible ordered samples -OrderWR(N,1) -# The same output, but labeled -OrderWR(N,1,ID=U) -# y is the variable of interest -y<-c(32,34,46,89,35) -OrderWR(N,1,ID=y) - -# If the smaple size is m=2, there are (25) possible ordered samples -OrderWR(N,2) -# The same output, but labeled -OrderWR(N,2,ID=U) -# y is the variable of interest -y<-c(32,34,46,89,35) -OrderWR(N,2,ID=y) - -# Note that the number of ordered samples is not equal to the number of -# samples in a well defined with-replacement support -OrderWR(N,2) -SupportWR(N,2) - -OrderWR(N,4) -SupportWR(N,4) +# Five possible ordered samples of size m=1 +OrderWR(N, 1) +OrderWR(N, 1, ID = U) +# 25 possible ordered samples of size m=2 +OrderWR(N, 2) +OrderWR(N, 2, ID = U) +# Note: ordered samples differ from unordered (SupportWR) +OrderWR(N, 2) +SupportWR(N, 2) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{SupportWR}}, \code{\link{IkWR}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} diff --git a/man/Pik.rd b/man/Pik.rd index 1b2145f..b61bf7d 100644 --- a/man/Pik.rd +++ b/man/Pik.rd @@ -1,45 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Pik.r \name{Pik} \alias{Pik} -\title{Inclusion Probabilities for Fixed Size Without Replacement Sampling Designs} -\description{Computes the first-order inclusion probability of each unit in the population given a -fixed sample size design} +\title{First-Order Inclusion Probabilities from a Sampling Design} \usage{ Pik(p, Ind) } \arguments{ -\item{p}{A vector containing the selection probabilities of a fixed size without replacement sampling design. The sum of the values of this vector must be one} -\item{Ind}{A sample membership indicator matrix} +\item{p}{Vector of probabilities for each possible sample in the support. +Must sum to 1.} + +\item{Ind}{Indicator matrix of the sampling support, as returned by +\code{\link{Ik}}. Rows are samples, columns are population units.} } -\seealso{ -\code{\link{HT}} +\value{ +A row vector (1 x N matrix) of first-order inclusion probabilities +\eqn{\pi_k = P(k \in s)} for each unit \eqn{k} in the population. } -\details{The inclusion probability of the \eqn{k}th unit is defined as the probability that this unit will be -included in a sample, it is denoted by \eqn{\pi_k} and obtained from a given sampling design as follows: -\deqn{\pi_k=\sum_{s\ni k}p(s)} +\description{ +Computes the first-order inclusion probabilities for each unit in a finite +population, given the probability of each possible sample and the indicator +matrix of the sampling support. } -\value{The function returns a vector of inclusion probabilities for each unit in the finite population.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. 
(1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\details{ +The inclusion probability of unit \eqn{k} is computed as the sum of the +probabilities of all samples that contain unit \eqn{k}: +\deqn{\pi_k = \sum_{s \ni k} p(s)} +The indicator matrix \code{Ind} (output of \code{\link{Ik}}) has one row +per possible sample and one column per population unit, with entry 1 if +unit \eqn{k} is in sample \eqn{s} and 0 otherwise. } \examples{ -# Vector U contains the label of a population of size N=5 +# Population of size N = 5, sample size n = 2 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) -# The sample size is n=2 n <- 2 -# The sample membership matrix for fixed size without replacement sampling designs -Ind <- Ik(N,n) -# p is the probability of selection of every sample. +# Sample probabilities (one per possible sample) p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) -# Note that the sum of the elements of this vector is one -sum(p) -# Computation of the inclusion probabilities -inclusion <- Pik(p, Ind) -inclusion -# The sum of inclusion probabilities is equal to the sample size n=2 -sum(inclusion) -} -\keyword{survey} +Ind <- Ik(N, n) +pik <- Pik(p, Ind) +pik +# Check: inclusion probabilities sum to n +sum(pik) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{Ik}}, \code{\link{Pikl}}, \code{\link{PikPPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/PikHol.rd b/man/PikHol.rd index 2dfb312..92db8b3 100644 --- a/man/PikHol.rd +++ b/man/PikHol.rd @@ -1,103 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PikHol.r \name{PikHol} \alias{PikHol} -\title{Optimal Inclusion Probabilities Under Multi-purpose Sampling} -\description{Computes the population vector of optimal inclusion probabilities under the Holmbergs's Approach} +\title{Optimal Inclusion Probabilities for Multiple Surveys (Holmberg)} \usage{ -PikHol(n, sigma, e, Pi) +PikHol(n, sigma, e, Pi = NULL) } \arguments{ -\item{n}{Vector of optimal sample sizes for each of the characteristics of interest.} -\item{sigma}{A matrix containing the size measures for each characteristics of interest.} -\item{e}{Maximum allowed error under the ANOREL approach.} -\item{Pi}{Matrix of first order inclusion probabilities. By default, this probabilites are -proportional to each sigma.} +\item{n}{Integer vector of length \code{p} with the desired sample size +for each of the \code{p} surveys.} + +\item{sigma}{Matrix of dimension \code{N x p} where column \eqn{k} contains +the auxiliary size variable for survey \eqn{k}.} + +\item{e}{Scalar. Relative tolerance parameter controlling the precision +target across surveys.} + +\item{Pi}{Optional matrix of dimension \code{N x p} with initial inclusion +probabilities for each survey. 
If omitted, \code{\link{PikPPS}} is used.} } -\details{Assuming that all of the characteristic of interest are equally important, the Holmberg's sampling design -yields the following inclusion probabilities -\deqn{\pi_{(opt)k}=\frac{n^*\sqrt{a_{qk}}}{\sum_{k\in U}\sqrt{a_{qk}}}} -where -\deqn{n^*\geq \frac{(\sum_{k\in U}\sqrt{a_{qk}})^2}{(1+c)Q+\sum_{k\in U}a_{qk}}} -and -\deqn{a_{qk}= \sum_{q=1}^Q \frac{\sigma^2_{qk}}{\sum_{k\in U}\left( \frac{1}{\pi_{qk}}-1\right)\sigma^2_{qk}}} -Note that \eqn{\sigma^2_{qk}} is a size measure associated with the k-th element in the q-th characteristic of interest. +\value{ +A numeric vector of length \code{N} with the optimal inclusion probability +for each unit in the population. } -\value{The function returns a vector of inclusion probabilities.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Holmberg, A. (2002), On the Choice of Sampling Design under GREG Estimation in Multiparameter Surveys. -\emph{RD Department, Statistics Sweden}.\cr -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas +\description{ +Computes optimal first-order inclusion probabilities for a population that +is surveyed on multiple occasions, minimising a measure of total variance +across surveys. This implements the approach of Holmberg (2002) for +coordinated sampling over time. +} +\details{ +For each survey \eqn{k}, the initial inclusion probabilities are computed +via \code{\link{PikPPS}}. An optimal composite size measure is then derived +by combining the per-survey auxiliary variables through a weighted sum, and +the final inclusion probabilities are computed proportional to the square +root of this composite. 
The resulting sample size \code{n.st} is chosen to +minimise total variance subject to a relative precision target \code{e}. } \examples{ - -####################### -#### First example #### -####################### - -# Uses the Lucy data to draw an otpimal sample -# in a multipurpose survey context +############ +## Example 1 +############ data(Lucy) attach(Lucy) -# Different sample sizes for two characteristics of interest: Employees and Taxes N <- dim(Lucy)[1] -n <- c(350,400) -# The size measure is the same for both characteristics of interest, -# but the relationship in between is different +n <- c(350, 400) sigy1 <- sqrt(Income^(1)) sigy2 <- sqrt(Income^(2)) -# The matrix containign the size measures for each characteristics of interest -sigma<-cbind(sigy1,sigy2) -# The vector of optimal inclusion probabilities under the Holmberg's approach -Piks<-PikHol(n,sigma,0.03) -# The optimal sample size is given by the sum of piks -n=round(sum(Piks)) -# Performing the S.piPS function in order to select the optimal sample of size n -res<-S.piPS(n,Piks) -sam <- res[,1] -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -attach(data) -names(data) -# Pik.s is the vector of inclusion probability of every single unit -# in the selected sample -Pik.s <- res[,2] -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima -estima <- data.frame(Income, Employees, Taxes) -E.piPS(estima,Pik.s) - -######################## -#### Second example #### -######################## - -# We can define our own first inclusion probabilities +sigma <- cbind(sigy1, sigy2) +Piks <- PikHol(n, sigma, 0.03) +n.opt <- round(sum(Piks)) +res <- S.piPS(n.opt, Piks) +sam <- res[, 1] +Pik.s <- res[, 2] +estima <- data.frame(Lucy$Income[sam], Lucy$Employees[sam]) +E.piPS(estima, Pik.s) +############ +## Example 2 - with custom inclusion probabilities +############ data(Lucy) attach(Lucy) - N 
<- dim(Lucy)[1] -n <- c(350,400) - +n <- c(350, 400) sigy1 <- sqrt(Income^(1)) sigy2 <- sqrt(Income^(2)) -sigma<-cbind(sigy1,sigy2) +sigma <- cbind(sigy1, sigy2) pikas <- cbind(rep(400/N, N), rep(400/N, N)) - -Piks<-PikHol(n,sigma,0.03, pikas) - -n=round(sum(Piks)) -n - -res<-S.piPS(n,Piks) -sam <- res[,1] - -data <- Lucy[sam,] -attach(data) -names(data) - -Pik.s <- res[,2] -estima <- data.frame(Income, Employees, Taxes) -E.piPS(estima,Pik.s) +Piks <- PikHol(n, sigma, 0.03, pikas) +round(sum(Piks)) +} +\references{ +Holmberg, A. (2002). A multiparameter perspective on the choice of sampling +design in surveys. \emph{Statistics in Transition}, 5(6), 969-994.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{PikPPS}}, \code{\link{PikSTPPS}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} \ No newline at end of file diff --git a/man/PikPPS.rd b/man/PikPPS.rd index 4072ec9..3d16b0e 100644 --- a/man/PikPPS.rd +++ b/man/PikPPS.rd @@ -1,30 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PikPPS.r \name{PikPPS} \alias{PikPPS} -\title{Inclusion Probabilities in Proportional to Size Sampling Designs} -\description{For a given sample size, this function returns a vector of first order -inclusion probabilities for a sampling design proportional to an auxiliary variable} +\title{Inclusion Probabilities Proportional to Size} \usage{ -PikPPS(n,x) +PikPPS(n, x) } \arguments{ -\item{n}{Integer indicating the sample size} -\item{x}{Vector of auxiliary information for each unit in the population} +\item{n}{Desired sample size.} + +\item{x}{Vector of length \code{N} with positive auxiliary size values +for each unit in the population.} } -\seealso{ -\code{\link{PikHol}, \link{E.piPS}, \link{S.piPS}} +\value{ +A numeric vector of length \code{N} with the first-order inclusion +probability for each unit in the 
population. Values are in \code{(0, 1]}. } -\details{For a given vector of auxiliary information with value \eqn{x_k} for the \eqn{k}-th unit and -population total \eqn{t_x}, the following expression -\deqn{\pi_k=n\times \frac{x_k}{t_x}} -is not always less than unity. A sequential algorithm must be used in order to ensure that for every -unit in the population the inclusion probability gives less or equal to unity.} -\value{The function returns a vector of inclusion probabilities of size \eqn{N}. -Every element of this vector is a value between zero and one.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Computes first-order inclusion probabilities proportional to an auxiliary +size variable \code{x} for a without-replacement sample of size \code{n}. +A sequential truncation algorithm ensures all probabilities are at most 1. +} +\details{ +The initial probabilities \eqn{\pi_k = n x_k / \sum x} may exceed 1 for +large units. The algorithm iteratively sets those probabilities to 1 and +redistributes the remaining sample size among the other units until all +probabilities are valid. The result satisfies \eqn{\sum \pi_k = n}. } \examples{ ############ @@ -72,4 +73,15 @@ sum(pik) # They are called forced inclusion units which(pik==1) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{S.piPS}}, \code{\link{PikSTPPS}}, \code{\link{PikHol}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/Pikl.rd b/man/Pikl.rd index 8d94cc2..1e7fc49 100644 --- a/man/Pikl.rd +++ b/man/Pikl.rd @@ -1,42 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Pikl.r \name{Pikl} \alias{Pikl} -\title{Second Order Inclusion Probabilities for Fixed Size Without Replacement Sampling Designs} -\description{Computes the second-order inclusion probabilities of each par of units in the population given a -fixed sample size design} +\title{Second-Order Inclusion Probabilities} \usage{ Pikl(N, n, p) } \arguments{ -\item{N}{Population size} -\item{n}{Sample size} -\item{p}{A vector containing the selection probabilities of a fixed size without replacement sampling design. The sum of the values of this vector must be one} +\item{N}{Population size. Keep small (recommended \code{N <= 15}) due to +the combinatorial enumeration of all possible samples.} + +\item{n}{Sample size.} + +\item{p}{Vector of probabilities for each possible sample in the support. +Must sum to 1.} } -\seealso{ -\code{\link{VarHT}, \link{Deltakl}, \link{Pik}} +\value{ +An \code{N x N} matrix where entry \eqn{(k, l)} is the probability that +both units \eqn{k} and \eqn{l} are included in the same sample. Diagonal +entries \eqn{(k,k)} equal the first-order inclusion probability \eqn{\pi_k}. } -\details{The second-order inclusion probability of the \eqn{kl}th units is defined as the probability that unit \eqn{k} and unit -\eqn{l} will be both included in a sample; it is denoted by \eqn{\pi_{kl}} and obtained from a given sampling design as follows: -\deqn{\pi_{kl}=\sum_{s\ni k,l}p(s)} +\description{ +Computes the matrix of second-order inclusion probabilities +\eqn{\pi_{kl} = P(k \in s \text{ and } l \in s)} for all pairs of units +in a finite population of size \code{N} under a fixed-size sampling design. 
} -\value{The function returns a symmetric matrix of size \eqn{N \times N} containing the second-order inclusion probabilities -for each pair of units in the finite population.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\details{ +The second-order inclusion probabilities are needed to compute the exact +Horvitz-Thompson variance estimator and the Sen-Yates-Grundy variance +estimator. This function enumerates the full sampling support via +\code{\link{Ik}} and is therefore only feasible for small populations +(\code{N <= 15}). } \examples{ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) -# The sample size is n=2 n <- 2 -# p is the probability of selection of every sample. p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) -# Note that the sum of the elements of this vector is one sum(p) -# Computation of the second-order inclusion probabilities +# Second-order inclusion probabilities Pikl(N, n, p) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{Pik}}, \code{\link{Deltakl}}, \code{\link{VarHT}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.BE.rd b/man/S.BE.rd index ed7f20f..b3ac584 100644 --- a/man/S.BE.rd +++ b/man/S.BE.rd @@ -1,28 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.BE.r \name{S.BE} \alias{S.BE} -\title{Bernoulli Sampling Without Replacement} -\description{Draws a Bernoulli sample without replacement of expected size $n$ from a population of size $N$} +\title{Bernoulli Sampling} \usage{ S.BE(N, prob) } \arguments{ -\item{N}{Population size} -\item{prob}{Inclusion probability for each unit in the population} +\item{N}{Population size.} + +\item{prob}{Scalar. Inclusion probability, must satisfy \code{0 < prob <= 1}.} } -\seealso{ -\code{\link{E.BE}} -} -\details{The selected sample is drawn according to a sequential procedure algorithm based on an uniform distribution. The Bernoulli -sampling design is not a fixed sample size one.} -\value{The function returns a vector of size \eqn{N}. Each element of this vector indicates if the unit was selected. Then, -if the value of this vector for unit \eqn{k} is zero, the unit \eqn{k} was not selected in the sample; otherwise, the unit was -selected in the sample.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas.\cr -Tille, Y. (2006), \emph{Sampling Algorithms}. Springer. +\value{ +A vector of length \code{N} where selected units contain their population +index and non-selected units contain \code{0}. +} +\description{ +Draws a Bernoulli sample from a finite population of size \code{N}. +Each unit is independently selected with the same inclusion probability +\code{prob}. 
+} +\details{ +The sample size under Bernoulli sampling is random, following a +Binomial(\code{N}, \code{prob}) distribution. To extract the selected +indices, use \code{sam[sam != 0]}. } \examples{ ############ @@ -53,4 +54,15 @@ data <- Lucy[sam,] data dim(data) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{E.BE}}, \code{\link{S.PO}}, \code{\link{S.SI}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.PO.rd b/man/S.PO.rd index f3934af..6fbe0e6 100644 --- a/man/S.PO.rd +++ b/man/S.PO.rd @@ -1,28 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.PO.r \name{S.PO} \alias{S.PO} \title{Poisson Sampling} -\description{Draws a Poisson sample of expected size $n$ from a population of size $N$} \usage{ S.PO(N, Pik) } \arguments{ -\item{N}{Population size} -\item{Pik}{Vector of inclusion probabilities for each unit in the population} +\item{N}{Population size.} + +\item{Pik}{Vector of length \code{N} containing the first-order inclusion +probability for each unit in the population. Values must be in \code{(0, 1]}.} } -\seealso{ -\code{\link{E.PO}} -} -\details{The selected sample is drawn according to a sequential procedure algorithm based on a uniform distribution. The Poisson -sampling design is not a fixed sample size one.} -\value{The function returns a vector of size \eqn{N}. Each element of this vector indicates if the unit was selected. Then, -if the value of this vector for unit \eqn{k} is zero, the unit \eqn{k} was not selected in the sample; otherwise, the unit was -selected in the sample.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. 
(1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas.\cr -Tille, Y. (2006), \emph{Sampling Algorithms}. Springer. +\value{ +A vector of length \code{N} where selected units contain their population +index and non-selected units contain \code{0}. +} +\description{ +Draws a Poisson sample from a finite population of size \code{N}. +Each unit \eqn{k} is independently selected with its own inclusion +probability \eqn{\pi_k}. +} +\details{ +Poisson sampling is a generalisation of Bernoulli sampling that allows +unequal inclusion probabilities. The sample size is random. To extract +the selected indices, use \code{sam[sam != 0]}. } \examples{ ############ @@ -56,4 +58,15 @@ data <- Lucy[sam,] data dim(data) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{E.PO}}, \code{\link{PikPPS}}, \code{\link{S.piPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.PPS.rd b/man/S.PPS.rd index e3922f2..e451992 100644 --- a/man/S.PPS.rd +++ b/man/S.PPS.rd @@ -1,25 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.PPS.r \name{S.PPS} \alias{S.PPS} -\title{Probability Proportional to Size Sampling With Replacement} -\description{Draws a probability proportional to size sample with replacement of size \eqn{m} from a population of size \eqn{N}} +\title{Probability Proportional to Size With-Replacement Sampling} \usage{ -S.PPS(m,x) +S.PPS(m, x) } \arguments{ -\item{m}{Sample size} -\item{x}{Vector of auxiliary information for each unit in the population} +\item{m}{Number of draws (sample size with replacement).} + +\item{x}{Vector of length \code{N} containing positive auxiliary size +values for each unit in the population.} } -\seealso{ -\code{\link{E.PPS}} +\value{ +A matrix with \code{m} rows and two columns: +\itemize{ + \item Column 1 (\code{sam}): population indices of the selected units. + \item Column 2 (\code{pk}): selection probability of each draw. } -\details{The selected sample is drawn according to the cumulative total method (sequential-list procedure)} -\value{The function returns a matrix of \eqn{m} rows and two columns. Each element of the first column indicates the unit that -was selected. Each element of the second column indicates the selection probability of this unit} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. 
+} +\description{ +Draws a with-replacement sample of size \code{m} from a finite population +using probabilities proportional to an auxiliary size variable \code{x}. +} +\details{ +At each draw, unit \eqn{k} is selected with probability +\eqn{p_k = x_k / \sum x}. Since sampling is with replacement, the same +unit may appear more than once. Use \code{\link{E.PPS}} or \code{\link{HH}} +to estimate population totals from this sample. } \examples{ ############ @@ -52,4 +60,15 @@ data <- Lucy[sam,] data dim(data) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{E.PPS}}, \code{\link{HH}}, \code{\link{S.piPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.SI.rd b/man/S.SI.rd index 9d25897..529e83c 100644 --- a/man/S.SI.rd +++ b/man/S.SI.rd @@ -1,29 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.SI.r \name{S.SI} \alias{S.SI} \title{Simple Random Sampling Without Replacement} -\description{Draws a simple random sample without replacement of size \eqn{n} from a population of size \eqn{N}} \usage{ -S.SI(N, n, e=runif(N)) +S.SI(N, n, e = runif(N)) } \arguments{ -\item{N}{Population size} -\item{n}{Sample size} -\item{e}{By default, a vector of size \eqn{N} of independent random numbers drawn from the \eqn{Uniform(0,1)}} +\item{N}{Population size.} + +\item{n}{Sample size. Must satisfy \code{n <= N}.} + +\item{e}{Optional vector of \code{N} uniform random variates in \code{(0,1)}. +If omitted, \code{runif(N)} is used. 
Useful for reproducibility or +coordinated sampling.} } -\seealso{ -\code{\link{E.SI}} -} -\details{The selected sample is drawn according to a selection-rejection (list-sequential) algorithm} -\value{The function returns a vector of size \eqn{N}. Each element of this vector indicates if the unit was selected. Then, -if the value of this vector for unit \eqn{k} is zero, the unit \eqn{k} was not selected in the sample; otherwise, the unit was -selected in the sample.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Fan, C.T., Muller, M.E., Rezucha, I. (1962), Development of sampling plans by using sequential -(item by item) selection techniques and digital computer, \emph{Journal of the American Statistical Association}, 57, 387-402.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\value{ +A vector of length \code{N} where selected units contain their population +index and non-selected units contain \code{0}. +} +\description{ +Draws a simple random sample of size \code{n} without replacement from a +finite population of size \code{N} using the sequential algorithm of +Fan, Muller and Rezucha (1962). +} +\details{ +The sequential algorithm selects units one at a time by comparing a uniform +random variate with the conditional inclusion probability at each step, +ensuring exactly \code{n} units are selected. To extract the selected +indices, filter out the zeros: \code{sam[sam != 0]}. } \examples{ ############ @@ -32,27 +37,36 @@ Editorial Universidad Santo Tomas. 
# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") # Fixes the random numbers in order to select a sample -# Ideal for teaching purposes in the blackboard e <- c(0.4938, 0.7044, 0.4585, 0.6747, 0.0640) # Draws a simple random sample without replacement of size n=3 -sam <- S.SI(5,3,e) +sam <- S.SI(5, 3, e) sam # The selected sample is U[sam] - ############ ## Example 2 ############ -# Uses the Marco and Lucy data to draw a random sample according to a SI design -data(Marco) +# Uses the Lucy data to draw a random sample according to a SI design data(Lucy) - +attach(Lucy) N <- dim(Lucy)[1] n <- 400 -sam<-S.SI(N,n) -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -data +sam <- S.SI(N, n) +# The information about the units in the sample +data <- Lucy[sam, ] dim(data) } -\keyword{survey} +\references{ +Fan, C.T., Muller, M.E. and Rezucha, I. (1962). Development of sampling +plans by using sequential (item by item) selection techniques and digital +computers. \emph{Journal of the American Statistical Association}, +57(298), 387-402.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{E.SI}}, \code{\link{S.STSI}}, \code{\link{S.SY}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.STPPS.rd b/man/S.STPPS.rd index e08c754..2ab154f 100644 --- a/man/S.STPPS.rd +++ b/man/S.STPPS.rd @@ -1,27 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.STPPS.r \name{S.STPPS} \alias{S.STPPS} -\title{Stratified Sampling Applying PPS Design in all Strata} -\description{Draws a probability proportional to size simple random sample with -replacement of size \eqn{m_h} in stratum \eqn{h} of size \eqn{N_h}} +\title{Stratified Probability Proportional to Size With-Replacement Sampling} \usage{ -S.STPPS(S,x,mh) +S.STPPS(S, x, mh) } \arguments{ -\item{S}{Vector identifying the membership to the strata of each unit in the population} -\item{x}{Vector of auxiliary information for each unit in the population} -\item{mh}{Vector of sample size in each stratum} +\item{S}{Vector of length \code{N} identifying the stratum membership of +each unit in the population.} + +\item{x}{Vector of length \code{N} containing positive auxiliary size +values for each unit in the population.} + +\item{mh}{Integer vector of length \code{H} specifying the number of +draws within each stratum.} } -\seealso{ -\code{\link{E.STPPS}} +\value{ +A data frame with \code{sum(mh)} rows and two columns: +\itemize{ + \item \code{sam}: population indices of the selected units. + \item \code{pk}: within-stratum selection probabilities of each draw. } -\details{The selected sample is drawn according to the cumulative total method (sequential-list procedure) in each stratum} -\value{The function returns a matrix of \eqn{m=m_1+\cdots+m_h} rows and two columns. Each element of the first column indicates the unit that -was selected. Each element of the second column indicates the selection probability of this unit} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. 
and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Draws a stratified with-replacement sample where within each stratum units +are selected using probability proportional to size (PPS-WR). +} +\details{ +Within each stratum \eqn{h}, \code{mh[h]} draws are made with +probabilities \eqn{p_k = x_k / \sum_{k \in h} x_k}. The same unit may +appear more than once within a stratum. Use \code{\link{E.STPPS}} to +estimate population totals from this sample. } \examples{ ############ @@ -71,4 +81,15 @@ dim(data) pk <- res[,2] pk } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{S.PPS}}, \code{\link{S.STpiPS}}, \code{\link{E.STPPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.STSI.rd b/man/S.STSI.rd index cc47eea..53bee36 100644 --- a/man/S.STSI.rd +++ b/man/S.STSI.rd @@ -1,70 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.STSI.r \name{S.STSI} \alias{S.STSI} -\title{Stratified sampling applying SI design in all strata} -\description{Draws a simple random sample without replacement of size \eqn{n_h} in stratum \eqn{h} of size \eqn{N_h}} +\title{Stratified Simple Random Sampling Without Replacement} \usage{ S.STSI(S, Nh, nh) } \arguments{ -\item{S}{Vector identifying the membership to the strata of each unit in the population} -\item{Nh}{Vector of stratum sizes} -\item{nh}{Vector of sample size in each stratum} +\item{S}{Vector of length \code{N} identifying the stratum membership of +each unit in the population.} + +\item{Nh}{Integer vector of length \code{H} with the 
population size of +each stratum.} + +\item{nh}{Integer vector of length \code{H} with the sample size of each +stratum. Must satisfy \code{nh[h] <= Nh[h]} for all \code{h}.} } -\seealso{ -\code{\link{E.STSI}} +\value{ +A sorted vector of population indices of the selected units, of length +\code{sum(nh)}. } -\details{The selected sample is drawn according to a selection-rejection (list-sequential) algorithm in each stratum} -\value{The function returns a vector of size \eqn{n=n_1+\cdots+n_H}. Each element of this vector indicates the unit that was selected.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Draws a stratified simple random sample without replacement from a finite +population. Within each stratum, units are selected by simple random +sampling without replacement. +} +\details{ +The function selects \code{nh[h]} units from stratum \eqn{h} using +\code{base::sample}, and returns all selected indices sorted in ascending +order. Use \code{\link{E.STSI}} to estimate population totals from this +sample. 
} \examples{ ############ ## Example 1 ############ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") -# Vector Strata contains an indicator variable of stratum membership Strata <- c("A", "A", "A", "B", "B") -Strata -# The stratum sizes -Nh <- c(3,2) -# Then sample size in each stratum -nh <- c(2,1) -# Draws a stratified simple random sample without replacement of size n=3 +Nh <- c(3, 2) +nh <- c(2, 1) sam <- S.STSI(Strata, Nh, nh) sam -# The selected sample is U[sam] - ############ ## Example 2 ############ -# Uses the Lucy data to draw a stratified random sample -# accordind to a SI design in each stratum data(Lucy) attach(Lucy) -# Level is the stratifying variable -summary(Level) -# Defines the size of each stratum -N1<-summary(Level)[[1]] -N2<-summary(Level)[[2]] -N3<-summary(Level)[[3]] -N1;N2;N3 -Nh <- c(N1,N2,N3) -# Defines the sample size at each stratum -n1<-70 -n2<-100 -n3<-200 -nh<-c(n1,n2,n3) -# Draws a stratified sample +N1 <- summary(Level)[[1]] +N2 <- summary(Level)[[2]] +N3 <- summary(Level)[[3]] +Nh <- c(N1, N2, N3) +nh <- c(70, 100, 200) sam <- S.STSI(Level, Nh, nh) -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -data +data <- Lucy[sam, ] dim(data) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{E.STSI}}, \code{\link{S.SI}}, \code{\link{S.STpiPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.STpiPS.Rd b/man/S.STpiPS.Rd index b387324..dd3d41c 100644 --- a/man/S.STpiPS.Rd +++ b/man/S.STpiPS.Rd @@ -1,84 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.STpiPS.R \name{S.STpiPS} \alias{S.STpiPS} -\title{Stratified Sampling Applying Without Replacement piPS Design in all Strata} -\description{Draws a probability proportional to size simple random sample without -replacement of size \eqn{n_h} in stratum \eqn{h} of size \eqn{N_h}} +\title{Stratified Probability Proportional to Size Without-Replacement Sampling} \usage{ -S.STpiPS(S,x,nh) +S.STpiPS(S, x, nh) } \arguments{ -\item{S}{Vector identifying the membership to the strata of each unit in the population} -\item{x}{Vector of auxiliary information for each unit in the population} -\item{nh}{Vector of sample size in each stratum} +\item{S}{Vector of length \code{N} identifying the stratum membership of +each unit in the population.} + +\item{x}{Vector of length \code{N} containing positive auxiliary size +values for each unit in the population.} + +\item{nh}{Integer vector of length \code{H} specifying the sample size +within each stratum.} } -\seealso{ -\code{\link{E.STpiPS}} +\value{ +A matrix with \code{sum(nh)} rows and two columns, sorted by population +index: +\itemize{ + \item Column 1: population indices of the selected units. + \item Column 2: first-order inclusion probabilities of the selected units. } -\details{The selected sample is drawn according to the Sunter method (sequential-list procedure) in each stratum} -\value{The function returns a matrix of \eqn{n=n_1+\cdots+n_h} rows and two columns. Each element of the first column indicates the unit that -was selected. 
Each element of the second column indicates the inclusion probability of this unit} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +} +\description{ +Draws a stratified sample where within each stratum units are selected +using a probability proportional to size without-replacement (piPS) design. +} +\details{ +Within each stratum \eqn{h}, the function calls \code{\link{S.piPS}} to +draw \code{nh[h]} units with probabilities proportional to \code{x}. +The global population indices are preserved in the output. } \examples{ ############ ## Example 1 ############ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") -# The auxiliary information x <- c(52, 60, 75, 100, 50) -# Vector Strata contains an indicator variable of stratum membership Strata <- c("A", "A", "A", "B", "B") -# Then sample size in each stratum -mh <- c(2,2) -# Draws a stratified PPS sample with replacement of size n=4 -res <- S.STPPS(Strata, x, mh) -# The selected sample -sam <- res[,1] +nh <- c(2, 2) +res <- S.STpiPS(Strata, x, nh) +sam <- res[, 1] U[sam] -# The selection probability of each unit selected to be in the sample -pk <- res[,2] -pk - +pik <- res[, 2] +pik ############ ## Example 2 ############ -# Uses the Lucy data to draw a stratified random sample -# according to a piPS design in each stratum - data(Lucy) attach(Lucy) -# Level is the stratifying variable -summary(Level) - -# Defines the size of each stratum -N1<-summary(Level)[[1]] -N2<-summary(Level)[[2]] -N3<-summary(Level)[[3]] -N1;N2;N3 - -# Defines the sample size at each stratum -n1<-70 -n2<-100 -n3<-200 -nh<-c(n1,n2,n3) -nh -# Draws a stratified sample -S <- Level -x <- Employees - -res <- 
S.STpiPS(S, x, nh) -sam<-res[,1] -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -data +N1 <- summary(Level)[[1]] +N2 <- summary(Level)[[2]] +N3 <- summary(Level)[[3]] +nh <- c(70, 100, 200) +res <- S.STpiPS(Level, Employees, nh) +sam <- res[, 1] +data <- Lucy[sam, ] dim(data) -# The selection probability of each unit selected in the sample -pik <- res[,2] -pik +pik <- res[, 2] +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{S.piPS}}, \code{\link{S.STSI}}, \code{\link{E.STpiPS}}, + \code{\link{PikSTPPS}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} diff --git a/man/S.SY.rd b/man/S.SY.rd index 46e6e13..7ab7c58 100644 --- a/man/S.SY.rd +++ b/man/S.SY.rd @@ -1,26 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.SY.r \name{S.SY} \alias{S.SY} \title{Systematic Sampling} -\description{Draws a Systematic sample of size $n$ from a population of size $N$} \usage{ S.SY(N, a) } \arguments{ -\item{N}{Population size} -\item{a}{Number of groups dividing the population} +\item{N}{Population size.} + +\item{a}{Sampling interval (skip). The expected sample size is +approximately \code{N/a}.} } -\seealso{ -\code{\link{E.SY}} +\value{ +A vector containing the population indices of the selected units. } -\details{The selected sample is drawn according to a random start.} -\value{The function returns a vector of size \eqn{n}. Each element of this vector indicates the unit that was selected.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}. The author acknowledges to -Kristina Stodolova \email{Kristyna.Stodolova@seznam.cz} for valuable suggestions.} -\references{ -Madow, L.H. and Madow, W.G. 
(1944), On the theory of systematic sampling. \emph{Annals of Mathematical Statistics}. 15, 1-24.\cr -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling. Springer}.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Draws a systematic sample from a finite population of size \code{N} using +a fixed sampling interval \code{a}. A random start \code{r} is chosen +uniformly from \code{1} to \code{a}, and every \code{a}-th unit thereafter +is selected. +} +\details{ +The random start \code{r} is drawn from \code{sample(a, 1)}, and then +units \eqn{r, r+a, r+2a, \ldots} are selected. If \code{N} is not a +multiple of \code{a}, the sample size varies by one unit depending on the +random start. Use \code{\link{E.SY}} to estimate population totals. } \examples{ ############ @@ -52,4 +57,15 @@ data <- Lucy[sam,] data dim(data) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{E.SY}}, \code{\link{S.SI}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.WR.rd b/man/S.WR.rd index 4a8dafa..4d12723 100644 --- a/man/S.WR.rd +++ b/man/S.WR.rd @@ -1,24 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.WR.r \name{S.WR} \alias{S.WR} \title{Simple Random Sampling With Replacement} -\description{Draws a simple random sample witht replacement of size \eqn{m} from a population of size \eqn{N}} \usage{ S.WR(N, m) } \arguments{ -\item{N}{Population size} -\item{m}{Sample size} +\item{N}{Population size.} + +\item{m}{Number of draws (sample size with replacement).} } -\seealso{ -\code{\link{E.WR}} +\value{ +A vector of population indices of length \code{m}, where each element is +the index of a selected unit. Units may appear more than once. } -\details{The selected sample is drawn according to a sequential procedure algorithm based on a binomial distribution} -\value{The function returns a vector of size \eqn{m}. Each element of this vector indicates the unit that was selected.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Tille, Y. (2006), \emph{Sampling Algorithms}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Draws a simple random sample of size \code{m} with replacement from a +finite population of size \code{N}. Returns the index of the unit selected +at each draw, so a unit may appear more than once. +} +\details{ +The number of times each unit is selected follows a multinomial +distribution with equal probabilities \eqn{1/N}. The function uses a +sequential binomial draw approach. Use \code{\link{E.WR}} to estimate +population totals. } \examples{ ############ @@ -48,4 +54,15 @@ data <- Lucy[sam,] data dim(data) } -\keyword{survey} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J.
(1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{E.WR}}, \code{\link{S.SI}}, \code{\link{S.PPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.piPS.Rd b/man/S.piPS.Rd new file mode 100644 index 0000000..7bef94b --- /dev/null +++ b/man/S.piPS.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/S.piPS.r +\name{S.piPS} +\alias{S.piPS} +\title{Probability Proportional to Size Without-Replacement Sampling (piPS)} +\usage{ +S.piPS(n, x, e = runif(length(x))) +} +\arguments{ +\item{n}{Sample size.} + +\item{x}{Vector of length \code{N} with positive auxiliary size values.} + +\item{e}{Optional vector of \code{N} uniform random variates in \code{(0,1)}. +If omitted, \code{runif(N)} is used.} +} +\value{ +A matrix with \code{n} rows and two columns: +\itemize{ + \item Column 1: population indices of the selected units. + \item Column 2: first-order inclusion probabilities of the selected units. +} +} +\description{ +Draws a without-replacement sample of size \code{n} using a sequential +algorithm that produces inclusion probabilities proportional to an +auxiliary size variable \code{x}. +} +\examples{ +############ +## Example 1 +############ +U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") +x <- c(52, 60, 75, 100, 50) +# Draws a piPS sample without replacement of size n=3 +res <- S.piPS(3, x) +res +sam <- res[, 1] +U[sam] +############ +## Example 2 +############ +# Uses the Lucy data +data(Lucy) +attach(Lucy) +res <- S.piPS(400, Income) +sam <- res[, 1] +Pik.s <- res[, 2] +data <- Lucy[sam, ] +dim(data) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. 
(2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{E.piPS}}, \code{\link{PikPPS}}, \code{\link{S.STPPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/S.piPS.rd b/man/S.piPS.rd deleted file mode 100644 index 2ef16df..0000000 --- a/man/S.piPS.rd +++ /dev/null @@ -1,60 +0,0 @@ -\name{S.piPS} -\alias{S.piPS} -\title{Probability Proportional to Size Sampling Without Replacement} -\description{Draws a probability proportional to size sample without replacement of size \eqn{n} from a population of size \eqn{N}.} -\usage{ -S.piPS(n, x, e) -} -\arguments{ -\item{x}{Vector of auxiliary information for each unit in the population} -\item{n}{Sample size} -\item{e}{By default, a vector of size \eqn{N} of independent random numbers drawn from the \eqn{Uniform(0,1)}} -} -\seealso{ -\code{\link{E.piPS}} -} -\details{The selected sample is drawn according to the Sunter method (sequential-list procedure)} -\value{The function returns a matrix of \eqn{m} rows and two columns. Each element of the first column indicates the unit that -was selected. Each element of the second column indicates the selection probability of this unit} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. 
-} -\examples{ -############ -## Example 1 -############ -# Vector U contains the label of a population of size N=5 -U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") -# The auxiliary information -x <- c(52, 60, 75, 100, 50) -# Draws a piPS sample without replacement of size n=3 -res <- S.piPS(3,x) -res -sam <- res[,1] -sam -# The selected sample is -U[sam] - -############ -## Example 2 -############ -# Uses the Lucy data to draw a random sample of units accordind to a -# piPS without replacement design - -data(Lucy) -attach(Lucy) -# The selection probability of each unit is proportional to the variable Income -res <- S.piPS(400,Income) -# The selected sample -sam <- res[,1] -# The inclusion probabilities of the units in the sample -Pik.s <- res[,2] -# The information about the units in the sample is stored in an object called data -data <- Lucy[sam,] -data -dim(data) -} -\keyword{survey} diff --git a/man/Support.rd b/man/Support.rd index d2a1914..67497cb 100644 --- a/man/Support.rd +++ b/man/Support.rd @@ -1,42 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Support.r \name{Support} \alias{Support} -\title{Sampling Support for Fixed Size Without Replacement Sampling Designs} -\description{Creates a matrix containing every possible sample under fixed sample size designs} +\title{Sampling Support for Fixed-Size Without-Replacement Designs} \usage{ -Support(N, n, ID=FALSE) +Support(N, n, ID = FALSE) } \arguments{ -\item{N}{Population size} -\item{n}{Sample size} -\item{ID}{By default FALSE, a vector of values (numeric or string) identifying each unit in the population} +\item{N}{Population size. Recommended \code{N <= 15}.} + +\item{n}{Sample size.} + +\item{ID}{Optional vector of population labels of length \code{N}. +If provided, labels replace integer indices in the output. 
+If \code{FALSE} (default), integer indices are returned.} } -\seealso{ -\code{\link{Ik}} +\value{ +A matrix with \code{choose(N, n)} rows and \code{n} columns. Each row +contains the indices (or labels if \code{ID} is provided) of the units +in one possible sample. Samples are listed in lexicographic order. } -\details{A support is defined as the set of samples such that for any sample in the support, all the permutations -of the coordinates of the sample are also in the support} -\value{The function returns a matrix of \eqn{binom(N)(n)} rows and \eqn{n} columns. Each row of this matrix -corresponds to a possible sample} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Tille, Y. (2006), \emph{Sampling Algorithms}. Springer\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas +\description{ +Enumerates all possible samples of size \code{n} from a population of +size \code{N}, returning the complete sampling support as a matrix. +} +\details{ +This function uses a combinatorial algorithm to enumerate all +\code{choose(N, n)} subsets of size \code{n} from \eqn{\{1, \ldots, N\}}. +It is intended for small populations only. For \code{N > 15} it becomes +very slow. 
} \examples{ -# Vector U contains the label of a population U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) n <- 2 -# The support for fixed size without replacement sampling designs -# Under this context, there are ten (10) possibles samples -Support(N,n) -# The same support, but labeled -Support(N,n,ID=U) -# y is the variable of interest -y<-c(32,34,46,89,35) -# The following output is very useful when checking -# the design-unbiasedness of an estimator -Support(N,n,ID=y) -} -\keyword{survey} +# Ten possible samples of size n=2 +Support(N, n) +# Labeled support +Support(N, n, ID = U) +# Support showing values of y +y <- c(32, 34, 46, 89, 35) +Support(N, n, ID = y) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{Ik}}, \code{\link{SupportWR}}, \code{\link{SupportRS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/SupportRS.rd b/man/SupportRS.rd index 5ae8231..8e61d9c 100644 --- a/man/SupportRS.rd +++ b/man/SupportRS.rd @@ -1,38 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SupportRS.r \name{SupportRS} \alias{SupportRS} -\title{Sampling Support for Random Size Without Replacement Sampling Designs} -\description{Creates a matrix containing every possible sample under random sample size designs} +\title{Complete Sampling Support for All Sample Sizes} \usage{ -SupportRS(N, ID=FALSE) +SupportRS(N, ID = FALSE) } \arguments{ -\item{N}{Population size} -\item{ID}{By default FALSE, a vector of values (numeric or string) identifying each unit in the population} +\item{N}{Population size. Recommended \code{N <= 10}.} + +\item{ID}{Optional vector of population labels of length \code{N}. 
+If provided, labels replace integer indices in the output.} } -\seealso{ -\code{\link{IkRS}} +\value{ +A matrix with \eqn{2^N} rows and \code{N} columns. Each row is one subset, +with \code{NA} used as padding for subsets smaller than \code{N}. The first +row represents the empty set (all zeros). } -\details{A support is defined as the set of samples such that for any sample in the support, all the permutations of the coordinates of the sample are also in the support} -\value{The function returns a matrix of \eqn{2^N} rows and \eqn{N} columns. Each row of this matrix corresponds to a possible sample} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Tille, Y. (2006), \emph{Sampling Algorithms}. Springer\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas +\description{ +Enumerates all possible subsets of a population of size \code{N}: the +empty set as the first row, followed by every non-empty subset for sample +sizes 1 to \code{N}. +} +\details{ +This function stacks the outputs of \code{\link{Support}} for all sample +sizes \eqn{n = 1, \ldots, N}. It is only feasible for small populations +(\code{N <= 10}) due to exponential growth. } \examples{ -# Vector U contains the label of a population U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) -# The support for fixed size without replacement sampling designs -# Under this context, there are ten (10) possibles samples +# Complete support for all sample sizes SupportRS(N) -# The same support, but labeled -SupportRS(N, ID=U) -# y is the variable of interest -y<-c(32,34,46,89,35) -# The following output is very useful when checking -# the design-unbiasedness of an estimator -SupportRS(N, ID=y) -} -\keyword{survey} +# Labeled support +SupportRS(N, ID = U) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}.
Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{Support}}, \code{\link{IkRS}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/SupportWR.rd b/man/SupportWR.rd index e101a1a..2540389 100644 --- a/man/SupportWR.rd +++ b/man/SupportWR.rd @@ -1,43 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SupportWR.r \name{SupportWR} \alias{SupportWR} -\title{Sampling Support for Fixed Size With Replacement Sampling Designs} -\description{Creates a matrix containing every possible sample under fixed sample size with replacement designs} +\title{Sampling Support for With-Replacement Designs} \usage{ -SupportWR(N, m, ID=FALSE) +SupportWR(N, m, ID = FALSE) } \arguments{ -\item{N}{Population size} -\item{m}{Sample size} -\item{ID}{By default FALSE, a vector of values (numeric or string) identifying each unit in the population} +\item{N}{Population size.} + +\item{m}{Number of draws (sample size with replacement).} + +\item{ID}{Optional vector of population labels of length \code{N}. +If \code{FALSE} (default), integer indices are returned.} } -\seealso{ -\code{\link{Support}} +\value{ +A matrix with \code{choose(N+m-1, m)} rows and \code{m} columns. Each +row contains the (sorted) indices of one possible unordered outcome. +If \code{ID} is provided, population labels replace indices. } -\details{A support is defined as the set of samples such that, for any sample in the support, all the permutations -of the coordinates of the sample are also in the support} -\value{The function returns a matrix of \eqn{binom(N+m-1)(m)} rows and \eqn{m} columns. Each row of this matrix -corresponds to a possible sample} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Ortiz, J. E. (2009), \emph{Simulacion y metodos estadisticos}. Editorial Universidad Santo Tomas. \cr -Tille, Y. 
(2006), \emph{Sampling Algorithms}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Enumerates all distinct unordered outcomes (multisets) of size \code{m} +drawn with replacement from a population of size \code{N}. +} +\details{ +The number of distinct unordered with-replacement outcomes of size \code{m} +from \code{N} units is \eqn{\binom{N+m-1}{m}}. This is much smaller than +the \eqn{N^m} ordered outcomes. The algorithm uses a nested loop to +generate all non-decreasing sequences of length \code{m} from +\eqn{\{1, \ldots, N\}}. } \examples{ -# Vector U contains the label of a population U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) m <- 2 -# The support for fixed size without replacement sampling designs -# Under this context, there are ten (10) possibles samples +# With-replacement support SupportWR(N, m) -# The same support, but labeled -SupportWR(N, m, ID=U) -# y is the variable of interest -y<-c(32,34,46,89,35) -# The following output is very useful when checking -# the design-unbiasedness of an estimator -SupportWR(N, m, ID=y) -} -\keyword{survey} +SupportWR(N, m, ID = U) +y <- c(32, 34, 46, 89, 35) +SupportWR(N, m, ID = y) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{IkWR}}, \code{\link{nk}}, \code{\link{p.WR}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/T.SIC.rd b/man/T.SIC.rd index 66168b9..e13e9dc 100644 --- a/man/T.SIC.rd +++ b/man/T.SIC.rd @@ -1,74 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/T.SIC.r \name{T.SIC} -\alias{T.SIC} -\title{Computation of Population Totals for Clusters} -\description{Computes the population total of the characteristics of interest in clusters. This function is used in order to estimate totals when doing a Pure Cluster Sample.} +\alias{T.SIC} +\title{Cluster Totals for Single-Stage Cluster Sampling} \usage{ -T.SIC(y,Cluster) +T.SIC(y, Cluster) } \arguments{ -\item{y}{Vector, matrix or data frame containing the recollected information of the variables of interest for every -unit in the selected sample} -\item{Cluster}{Vector identifying the membership to the cluster of each unit in the selected sample of clusters} +\item{y}{Vector, matrix or data frame containing the values of the +variables of interest for every unit in the sample.} + +\item{Cluster}{Vector identifying the cluster (PSU) membership of each +unit in the sample.} } -\seealso{ -\code{\link{S.SI}, \link{E.SI}} +\value{ +A matrix with one row per cluster and one column per variable of interest +(plus a first column \code{Ni} with the cluster size). Row names are the +cluster labels. } -\value{The function returns a matrix of clusters totals. The columns of each matrix -correspond to the totals of the variables of interest in each cluster} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. 
+\description{ +Computes the total of each variable of interest within each cluster +(Primary Sampling Unit) in a single-stage cluster sample. +} +\details{ +This function aggregates the sample data by cluster, producing the cluster- +level totals needed for estimation under single-stage cluster sampling. +The output can be passed directly to \code{\link{E.1SI}} or \code{\link{E.SI}} +treating each cluster total as an observation. } \examples{ ############ ## Example 1 ############ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") -# Vector y1 and y2 are the values of the variables of interest -y1<-c(32, 34, 46, 89, 35) -y2<-c(1,1,1,0,0) -y3<-cbind(y1,y2) -# Vector Cluster contains a indicator variable of cluster membership +y1 <- c(32, 34, 46, 89, 35) +y2 <- c(1, 1, 1, 0, 0) +y3 <- cbind(y1, y2) Cluster <- c("C1", "C2", "C1", "C2", "C1") -Cluster -# Draws a stratified simple random sample without replacement of size n=3 -T.SIC(y1,Cluster) -T.SIC(y2,Cluster) -T.SIC(y3,Cluster) - -######################################################## -## Example 2 Sampling and estimation in Cluster smapling -######################################################## -# Uses Lucy data to draw a clusters sample according to a SI design -# Zone is the clustering variable +T.SIC(y1, Cluster) +T.SIC(y3, Cluster) +############ +## Example 2 - Cluster sampling with Lucy data +############ data(Lucy) attach(Lucy) -summary(Zone) -# The population of clusters -UI<-c("A","B","C","D","E") -NI=length(UI) -# The sample size -nI=2 -# Draws a simple random sample of two clusters -samI<-S.SI(NI,nI) -dataI<-UI[samI] -dataI -# The information about each unit in the cluster is saved in Lucy1 and Lucy2 -data(Lucy) -Lucy1<-Lucy[which(Zone==dataI[1]),] -Lucy2<-Lucy[which(Zone==dataI[2]),] -LucyI<-rbind(Lucy1,Lucy2) +UI <- c("A", "B", "C", "D", "E") +NI <- length(UI) +nI <- 2 +samI <- S.SI(NI, nI) +dataI <- UI[samI] +Lucy1 <- Lucy[which(Zone == 
dataI[1]), ] +Lucy2 <- Lucy[which(Zone == dataI[2]), ] +LucyI <- rbind(Lucy1, Lucy2) attach(LucyI) -# The clustering variable is Zone Cluster <- as.factor(as.integer(Zone)) -# The variables of interest are: Income, Employees and Taxes -# This information is stored in a data frame called estima estima <- data.frame(Income, Employees, Taxes) -Ty<-T.SIC(estima,Cluster) -# Estimation of the Population total -E.SI(NI,nI,Ty) +Ty <- T.SIC(estima, Cluster) +E.SI(NI, nI, Ty) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{E.1SI}}, \code{\link{E.2SI}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} \ No newline at end of file diff --git a/man/VarHT.rd b/man/VarHT.rd index aa4f18c..20dda7d 100644 --- a/man/VarHT.rd +++ b/man/VarHT.rd @@ -1,45 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VarHT.r \name{VarHT} \alias{VarHT} -\title{Variance of the Horvitz-Thompson Estimator} -\description{Computes the theoretical variance of the Horvitz-Thompson estimator given a without replacement fixed sample size design} +\title{Exact Variance of the Horvitz-Thompson Estimator} \usage{ VarHT(y, N, n, p) } \arguments{ -\item{y}{Vector containing the recollected information of the characteristic of interest for every unit in the population} -\item{N}{Population size} -\item{n}{Sample size} -\item{p}{A vector containing the selection probabilities of a fixed size without replacement sampling design. The sum of the values of this vector must be one} +\item{y}{Vector of length \code{N} with the population values of the +variable of interest.} + +\item{N}{Population size. 
Recommended \code{N <= 15}.} + +\item{n}{Sample size.} + +\item{p}{Vector of probabilities for each possible sample in the support. +Must sum to 1.} } -\seealso{ -\code{\link{HT}, \link{Deltakl}, \link{Pikl}, \link{Pik}} +\value{ +A scalar: the exact variance of the Horvitz-Thompson estimator +\eqn{V(\hat{t}_{y,\pi})}. } -\details{The variance of the Horvitz-Thompson estimator, under a given sampling design \eqn{p}, is given by -\deqn{Var_p(\hat{t}_{y,\pi})=\sum_{k\in U}\sum_{l \in U}\Delta_{kl}\frac{y_k}{\pi_k}\frac{y_l}{\pi_l}} +\description{ +Computes the exact variance of the Horvitz-Thompson estimator of the +population total for a given fixed-size without-replacement sampling design, +using the full sampling support. } -\value{The function returns the value of the theoretical variances of the Horviz-Thompson estimator.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\details{ +The exact Horvitz-Thompson variance is: +\deqn{V(\hat{t}_{y,\pi}) = \sum_{k=1}^N \sum_{l=1}^N \Delta_{kl} +\frac{y_k}{\pi_k} \frac{y_l}{\pi_l}} +where \eqn{\Delta_{kl} = \pi_{kl} - \pi_k \pi_l}. This requires +enumerating the full support and is only feasible for small populations +(\code{N <= 15}). 
} \examples{ -# Without replacement sampling -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") -# Vector y1 and y2 are the values of the variables of interest -y1<-c(32, 34, 46, 89, 35) -y2<-c(1,1,1,0,0) -# The population size is N=5 +y1 <- c(32, 34, 46, 89, 35) +y2 <- c(1, 1, 1, 0, 0) N <- length(U) -# The sample size is n=2 n <- 2 -# p is the probability of selection of every possible sample p <- c(0.13, 0.2, 0.15, 0.1, 0.15, 0.04, 0.02, 0.06, 0.07, 0.08) - -# Calculates the theoretical variance of the HT estimator +# Theoretical variance of the HT estimator VarHT(y1, N, n, p) VarHT(y2, N, n, p) } -\keyword{survey} +\references{ +Horvitz, D.G. and Thompson, D.J. (1952). A generalization of sampling +without replacement from a finite universe. +\emph{Journal of the American Statistical Association}, 47, 663-685.\cr +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer. +} +\seealso{ +\code{\link{Deltakl}}, \code{\link{VarSYGHT}}, \code{\link{HT}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/Wk.rd b/man/Wk.rd index d70cc58..25d53d3 100644 --- a/man/Wk.rd +++ b/man/Wk.rd @@ -1,26 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Wk.r \name{Wk} \alias{Wk} -\title{The Calibration Weights} -\description{Computes the calibration weights (Chi-squared distance) for the estimation of the population total of several variables of interest.} +\title{GREG Generalised Weights} \usage{ -Wk(x,tx,Pik,ck,b0) +Wk(x, tx, Pik, ck, b0 = FALSE) } \arguments{ -\item{x}{Vector, matrix or data frame containing the recollected auxiliary information for every unit in the selected sample} -\item{tx}{Vector containing the populations totals of the auxiliary information} -\item{Pik}{A vector containing inclusion probabilities for each unit in the sample} -\item{ck}{A vector of weights induced by the structure of variance of the 
supposed model} -\item{b0}{By default FALSE. The intercept of the regression model} +\item{x}{Vector or matrix of auxiliary variables observed in the sample.} + +\item{tx}{Vector of known population totals of the auxiliary variables.} + +\item{Pik}{Vector of first-order inclusion probabilities for each unit +in the sample.} + +\item{ck}{Vector of variance-stabilising constants. Typically \code{ck = 1} +(homoscedastic) or \code{ck = x} (heteroscedastic).} + +\item{b0}{Logical. If \code{TRUE}, an intercept column is prepended to +\code{x}. Default is \code{FALSE}.} } -\details{The calibration weights satisfy the following expression -\deqn{\sum_{k\in S}w_kx_k=\sum_{k\in U}x_k} +\value{ +A numeric vector of length \code{n} with the GREG weight for each unit +in the sample. } -\value{The function returns a vector of calibrated weights.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Computes the generalised regression (GREG) weights for each unit in the +sample. These weights incorporate both the sampling design weights and a +calibration adjustment based on known population totals of auxiliary +variables. +} +\details{ +The GREG weight for unit \eqn{k} is: +\deqn{w_k = \frac{1}{\pi_k} + \mathbf{x}_k^T +\left(\sum_s \frac{v_k \mathbf{x}_k \mathbf{x}_k^T}{\pi_k}\right)^{-1} +(\mathbf{t}_x - \hat{\mathbf{t}}_{x,\pi})} +where \eqn{v_k = 1/(\pi_k c_k)} and \eqn{c_k} is a variance-stabilising +constant. The GREG estimator is then \eqn{\hat{t}_{GREG} = \sum_s w_k y_k}. } \examples{ ############ @@ -180,6 +196,16 @@ sum(x[,3]*w) tx # The calibration estimation colSums(estima*w) - } -\keyword{survey} \ No newline at end of file +\references{ +Sarndal, C-E. and Swensson, B. 
and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{GREG.SI}}, \code{\link{E.Beta}} +} +\author{ +Hugo Andres Gutierrez Rojas +} diff --git a/man/kish_allocation.Rd b/man/kish_allocation.Rd new file mode 100644 index 0000000..6134330 --- /dev/null +++ b/man/kish_allocation.Rd @@ -0,0 +1,74 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kish_allocation.R +\name{kish_allocation} +\alias{kish_allocation} +\title{Kish Allocation for Stratified Sampling} +\usage{ +kish_allocation(n, N_h, I = 0.5) +} +\arguments{ +\item{n}{Integer. Total desired sample size.} + +\item{N_h}{Named numeric vector. Population sizes for each stratum +\eqn{h = 1, \ldots, H}.} + +\item{I}{Non-negative numeric. Relative importance given to overall +versus stratum-level estimates (Kish's index), controlling the allocation: +\itemize{ + \item \code{I = 0} → Uniform allocation (equal sample per stratum). + \item \code{I = Inf} → Proportional allocation (proportional to \eqn{N_h}). + \item \code{0 < I < Inf} → Compromise between uniform and proportional. + \item Recommended value: \code{I = 0.5} (Kish, 1992). +}} +} +\value{ +A named integer vector of length \eqn{H} with the allocated sample + sizes per stratum. The values sum to approximately \code{n} (rounding may + cause a difference of ±1). +} +\description{ +Computes the sample size allocation across strata using the +Kish (1992) compromise allocation method, which interpolates between +uniform and proportional allocation through the relative-importance index \code{I}. +} +\details{ +The Kish compromise allocation assigns sample sizes as: +\deqn{ + n_h = n \cdot \frac{\sqrt{I \, W_h^2 + H^{-2}}} + {\sum_{h=1}^{H} \sqrt{I \, W_h^2 + H^{-2}}} +} +where \eqn{W_h = N_h / N} is the stratum weight and \eqn{H} is the number +of strata.
This formulation nests two classical allocations as limiting +cases: when \eqn{I = 0} the numerator reduces to \eqn{1/H} (uniform), +and as \eqn{I \to \infty} it is dominated by \eqn{W_h} (proportional). +} +\examples{ +N_h <- c( + Corozal = 41847, + Orange_Walk = 48175, + Belize = 57658, + Cayo = 78473, + Stann_Creek = 31347, + Toledo = 31711 +) + +# Uniform allocation (I = 0) +kish_allocation(n = 3096, N_h = N_h, I = 0) + +# Proportional allocation (I -> Inf) +kish_allocation(n = 3096, N_h = N_h, I = 1e6) + +# Kish recommended compromise (I = 0.5) +kish_allocation(n = 3096, N_h = N_h, I = 0.5) +} +\references{ +Kish, L. (1992). Weighting for unequal \eqn{P_i}. +\emph{Journal of Official Statistics}, 8(2), 183–200. +} +\seealso{ +\code{\link{E.STSI}} for estimation under stratified sampling, +\code{\link{S.STSI}} for stratified simple random sampling. +} +\author{ +Yury Vanessa Ochoa Montes +} diff --git a/man/nk.rd b/man/nk.rd index 240e823..d5f451a 100644 --- a/man/nk.rd +++ b/man/nk.rd @@ -1,31 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nk.r \name{nk} \alias{nk} -\title{Sample Selection Indicator for With Replacement Sampling Designs} -\description{The function returns a matrix of \eqn{binom(N+m-1)(m)} rows and \eqn{N} columns. Creates a matrix of values (0, if the unit does not belongs to a specified sample, 1, if the unit is selected once in the sample; 2, if the unit is selected twice in the sample, etc.) for every possible sample under fixed sample size designs with replacement} +\title{Frequency Matrix for With-Replacement Sampling} \usage{ nk(N, m) } \arguments{ -\item{N}{Population size} -\item{m}{Sample size} +\item{N}{Population size. 
Keep small due to combinatorial growth.} + +\item{m}{Number of draws (sample size with replacement).} } -\seealso{ -\code{\link{SupportWR}, \link{Pik}} +\value{ +An integer matrix of dimension \code{choose(N+m-1, m) x N}, where entry +\eqn{(s, k)} is the frequency of unit \eqn{k} in outcome \eqn{s}. } -\value{The function returns a matrix of \eqn{binom(N+m-1)(m)} rows and \eqn{N} columns. The \eqn{k}th column corresponds to the sample -selection indicator, of the \eqn{k}th unit, to a possible sample.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Constructs the frequency matrix of the with-replacement sampling support +for a population of size \code{N} and \code{m} draws. Each row corresponds +to one possible outcome and each column to one population unit, with entry +\eqn{(s, k)} equal to the number of times unit \eqn{k} was selected in +outcome \eqn{s}. +} +\details{ +Unlike \code{\link{IkWR}}, which records only whether a unit was selected, +this function records how many times each unit was selected. This is needed +for with-replacement estimators based on selection frequencies. } \examples{ -# Vector U contains the label of a population of size N=5 U <- c("Yves", "Ken", "Erik", "Sharon", "Leslie") N <- length(U) m <- 2 -# The sample membership matrix for fixed size without replacement sampling designs -nk(N,m) +# Frequency matrix for with-replacement sampling +nk(N, m) +} +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. 
+} +\seealso{ +\code{\link{IkWR}}, \code{\link{SupportWR}}, \code{\link{p.WR}} +} +\author{ +Hugo Andres Gutierrez Rojas } -\keyword{survey} diff --git a/man/p.WR.rd b/man/p.WR.rd index fc0639d..d3bd638 100644 --- a/man/p.WR.rd +++ b/man/p.WR.rd @@ -1,26 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/p.WR.r \name{p.WR} \alias{p.WR} -\title{Generalization of every with replacement sampling design} -\description{Computes the selection probability (sampling design) of each with replacement sample} +\title{Sample Probabilities under With-Replacement Sampling} \usage{ p.WR(N, m, pk) } \arguments{ -\item{N}{Population size} -\item{m}{Sample size} -\item{pk}{A vector containing selection probabilities for each unit in the population} +\item{N}{Population size.} + +\item{m}{Number of draws (sample size with replacement).} + +\item{pk}{Vector of length \code{N} with selection probabilities for each +unit. Must sum to 1.} } -\details{Every with replacement sampling design is a particular case of a multinomial -distribution. -\deqn{p(\mathbf{S}=\mathbf{s})=\frac{m!}{n_1!n_2!\cdots n_N!}\prod_{i=1}^N p_k^{n_k}} -where \eqn{n_k} is the number of times that the \eqn{k}-th unit is selected in a sample. +\value{ +A numeric vector of length \code{choose(N+m-1, m)} with the probability +of each distinct unordered outcome in the with-replacement support. } -\value{The function returns a vector of selection probabilities for every with-replacement sample.} -\author{Hugo Andres Gutierrez Rojas \email{hagutierrezro@gmail.com}} -\references{ -Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), \emph{Model Assisted Survey Sampling}. Springer.\cr -Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas y estimacion de parametros}. -Editorial Universidad Santo Tomas. +\description{ +Computes the probability of each possible outcome in the with-replacement +sampling support, given unit selection probabilities \code{pk}. 
+} +\details{ +For each distinct unordered outcome (multiset) in the support enumerated +by \code{\link{nk}}, the probability is computed as a multinomial +probability: +\deqn{p(s) = \frac{m!}{\prod_k n_k!} \prod_k p_k^{n_k}} +where \eqn{n_k} is the number of times unit \eqn{k} appears in outcome +\eqn{s} and \eqn{p_k} is the selection probability of unit \eqn{k}. } \examples{ ############ @@ -58,4 +66,15 @@ p <- p.WR(N, m, pk) p sum(p) } -\keyword{survey} \ No newline at end of file +\references{ +Sarndal, C-E. and Swensson, B. and Wretman, J. (1992), +\emph{Model Assisted Survey Sampling}. Springer.\cr +Gutierrez, H. A. (2009), \emph{Estrategias de muestreo: Diseno de encuestas +y estimacion de parametros}. Editorial Universidad Santo Tomas. +} +\seealso{ +\code{\link{nk}}, \code{\link{SupportWR}}, \code{\link{S.PPS}} +} +\author{ +Hugo Andres Gutierrez Rojas +}