\documentclass[article, 11pt, oneside]{memoir} \usepackage{Sweave} \usepackage[OT1]{fontenc} \usepackage[utf8]{inputenc} \begin{document} % \VignetteIndexEntry{SDaA} \title{Reproduction of Analyses in Lohr (1999)\\ using the \texttt{survey} package} \author{Tobias Verbeke} \date{2011-03-21} \maketitle \tableofcontents <>= options(width=65) @ \chapter{Introduction} % Chapter 1 The Introduction chapter does not contain any numerical examples demonstrating survey methodology. Before reproducing the analyses of the following chapters, we load the \texttt{SDaA} package as well as the \texttt{survey} <>= library(SDaA) library(survey) @ The \texttt{survey} package is loaded as well as it was specified as a dependency of the \texttt{SDaA} package. \chapter{Simple Probability Samples} % Chapter 2 \chapter{Ratio and Regression Estimation} % Chapter 3 \section{Ratio Estimation} <<>>= ### Example 3.2, p. 63 agsrsDesign <- svydesign(ids=~1, weights = ~1, data = agsrs) svyratio(numerator = ~acres92, denominator = ~acres87, design = agsrsDesign) # proportion B hat @ \begin{figure} <>= ### part of Example 3.2, p. 64 plot(I(acres92/10^6) ~ I(acres87/10^6), xlab = "Millions of Acres Devoted to Farms (1987)", ylab = "Millions of Acres Devoted to Farms (1992)", data = agsrs) abline(lm(I(acres92/10^6) ~ 0 + I(acres87/10^6), # through the origin data = agsrs), col = "red", lwd = 2) @ \includegraphics{SDaA_using_survey-acreagetwoyears} \caption{Figure 3.1, p. 64} \end{figure} <>= ### Example 3.5, p. 72, table 3.3 seedlings <- data.frame(tree = 1:10, x = c(1, 0, 8, 2, 76, 60, 25, 2, 1, 31), y = c(0, 0, 1, 2, 10, 15, 3, 2, 1, 27)) names(seedlings) <- c("tree", "x", "y") @ % TODO cast into a appropriate data frame to % specify a cluster design (tree IDs *and* seedling IDs) \begin{figure} <>= plot(y ~ x, data = seedlings, xlab = "Seedlings Alive (March 1992)", ylab = "Seedlings That Survived (February 1994)") # abline(lm(y ~ 0 + x, data = seedlings), lwd = 2, col = "red") # TODO: add proper abline @ \includegraphics{SDaA_using_survey-acresboxplot} \caption{Figure 3.4, p. 73} \end{figure} \section{Regression Estimation} <<>>= ### Example 3.6, p. 75 pf <- data.frame(photo = c(10, 12, 7, 13, 13, 6, 17, 16, 15, 10, 14, 12, 10, 5, 12, 10, 10, 9, 6, 11, 7, 9, 11, 10, 10), field = c(15, 14, 9, 14, 8, 5, 18, 15, 13, 15, 11, 15, 12, 8, 13, 9, 11, 12, 9, 12, 13, 11, 10, 9, 8)) @ \section{Estimation in Domains} \section{Models for Ratio and Regression Estimation} <>= ### Example 3.9, p. 83 recacr87 <- agsrs$acres87 recacr87[recacr87 > 0] <- 1/recacr87[recacr87 > 0] # cf. p. 450 model1 <- lm(acres92 ~ 0 + acres87, weights = recacr87, data = agsrs) summary(model1) @ \begin{figure} <>= ### Figure 3.6, p. 85 wtresid <- resid(model1) / sqrt(agsrs$acres87) plot(wtresid ~ I(agsrs$acres87/10^6), xlab = "Millions of Acres Devoted to Farms (1987)", ylab = "Weighted Residuals") @ \includegraphics{SDaA_using_survey-modelregplot} \caption{Figure 3.6, p. 85} \end{figure} \chapter{Stratified Sampling} % Chapter 4 \begin{figure} <>= boxplot(acres92/10^6 ~ region, xlab = "Region", ylab = "Millions of Acres", data = agstrat) @ \includegraphics{SDaA_using_survey-acresboxplot} \caption{Figure 4.1, p. 97} \end{figure} \chapter{Cluster Sampling with Equal Probabilities} % Chapter 5 \section{Notation for Cluster Sampling} No analyses contained in this section. \section{One-Stage Cluster Sampling} <>= ### Example 5.2, p. 137 middle GPA <- cbind(expand.grid(1:4, 1:5), gpa = c(3.08, 2.60, 3.44, 3.04, 2.36, 3.04, 3.28, 2.68, 2.00, 2.56, 2.52, 1.88, 3.00, 2.88, 3.44, 3.64, 2.68, 1.92, 3.28, 3.20)) names(GPA)[1:2] <- c("person_num", "cluster") GPA$pwt <- 100/5 clusterDesign <- svydesign(ids = ~ cluster, weights = ~ pwt, data = GPA) svytotal(~ gpa, design = clusterDesign) # total SE # gpa 1130.4 67.167 # Stata results: 1130.4 67.16666 ---> corresponds perfectly @ \section{Two-Stage Cluster Sampling} <>= ### Figure 5.3 plot(volume ~ clutch, xlim = c(0,200), pch=19, data = coots, xlab = "Clutch Number", ylab = "Egg Volume") @ <>= ### Figure 5.3 plot(volume ~ clutch, xlim = c(0,200), pch=19, data = coots, xlab = "Clutch Number", ylab = "Egg Volume") @ % Figure 5.4 seems a good plot to test the grammar of graphics % (sorting statistic, mean statistic etc.) \chapter{Sampling with Unequal Probabilities} % Chapter 6 % cf. % http://statistics.ats.ucla.edu/stat/stata/examples/lohr/lohrstata6.htm % <>= data(statepop) statepop$psi <- statepop$popn / 255077536 @ <>= ### page 191, figure 6.1 plot(phys ~ psi, data = statepop, xlab = expression(paste(Psi[i], " for County")), ylab = "Physicians in County (in thousands)") @ % TODO aanvullen met rest van gegevens op pagina \chapter{Complex Surveys} % Chapter 7 \section{Estimating a Distribution Function} <>= ### Figure 7.1 data(htpop) popecdf <- ecdf(htpop$height) plot(popecdf, do.points = FALSE, ylab = "F(y)", xlab = "Height Value, y") @ <>= ### Figure 7.2 minht <- min(htpop$height) breaks <- c(minht-1, seq(from = minht, to = max(htpop$height), by = 1)) hist(htpop$height, ylab = "f(y)", breaks = breaks, xlab = "Height Value, y", freq = FALSE) @ <>= ### Figure 7.3 data(htsrs) hist(htsrs$height, ylab = "Relative Frequency", xlab = "Height (cm)", freq = FALSE) @ % TODO Figure 7.3 can be improved (change number of breaks) <>= ### Figure 7.4 data(htstrat) hist(htstrat$height, ylab = "Relative Frequency", xlab = "Height (cm)", freq = FALSE) @ <>= ### Figure 7.5 (a) minht <- min(htstrat$height) breaks <- c(minht-1, seq(from = minht, to = max(htstrat$height), by = 1)) hist(htstrat$height, ylab = expression(hat(f)(y)), breaks = breaks, xlab = "Height Value, y", freq = FALSE) @ <>= ### Figure 7.5 (b) stratecdf <- ecdf(htstrat$height) plot(stratecdf, do.points = FALSE, ylab = expression(hat(F)(y)), xlab = "Height Value, y") @ \section{Plotting Data from a Complex Survey} <>= ### Figure 7.6 data(syc) hist(syc$age, freq = FALSE, xlab = "Age") @ % TODO add Figure 7.7 (?) Note that in its current implementation, \texttt{svyboxplot} will only plot minimum and maximum as outliers if they are situated outside the whiskers. Other outliers are not plotted (see \texttt{?svyboxplot}). This explains the minor difference with Figure 7.8 on p. 237 of Lohr (1999). <>= ### Figure 7.8 sycdesign <- svydesign(ids= ~ psu, strata = ~ stratum, data = syc, weights=~finalwt) # p. 235: "Each of the 11 facilities with 360 or more youth # formed its own stratum (strata 6-16)", so in order # to avoid a lonely.psu error message # Error in switch(lonely.psu, certainty = scale * crossprod(x), remove = scale * : # Stratum (6) has only one PSU at stage 1 # we set the option to "certainty" for this example # to see the problem, use: by(syc$psu, syc$stratum, unique) oo <- options(survey.lonely.psu = "certainty") svyboxplot(age ~ factor(stratum), design = sycdesign) # mind the factor options(oo) @ This kind of plot is particularly easy to formulate in the grammar of graphics, i.e. using the \texttt{ggplot2} package~: <>= ### Figure 7.9 library(ggplot2) p <- ggplot(syc, aes(x = factor(stratum), y = factor(age))) g <- p + stat_sum(aes(group=1, weight = finalwt, size = ..n..)) print(g) @ % TODO: check that it is really the sum of finalwt that is displayed Note that in its current implementation, \texttt{svyboxplot} will only plot minimum and maximum as outliers if they are situated outside the whiskers. Other outliers are not plotted (see \texttt{?svyboxplot}). This explains the minor difference with Figure 7.10 on p. 238 of Lohr (1999). <>= ### Figure 7.10 oo <- options(survey.lonely.psu = "certainty") sycstrat5 <- subset(sycdesign, stratum == 5) svyboxplot(age ~ factor(psu), design = sycstrat5) options(oo) @ <>= ### Figure 7.11 sycstrat5df <- subset(syc, stratum == 5) p <- ggplot(sycstrat5df, aes(x = factor(psu), y = factor(age))) g <- p + stat_sum(aes(group=1, weight = finalwt, size = ..n..)) print(g) @ \chapter{Nonresponse} % Chapter 8 \chapter{Variance Estimation in Complex Surveys} % Chapter 9 \section{Linearization (Taylor Series) Methods} \section{Random Group Methods} \section{Resampling and Replication Methods} \section{Generalized Variance Functions} \section{Confidence Intervals} \chapter{Categorical Data Analysis in Complex Surveys} % Chapter 10 \section{Chi-Square Tests with Multinomial Sampling} <<>>= ### Example 10.1 hh <- rbind(c(119, 188), c(88, 105)) rownames(hh) <- c("cableYes", "cableNo") colnames(hh) <- c("computerYes", "computerNo") addmargins(hh) chisq.test(hh, correct = FALSE) # OK @ % TODO: add G^2 <<>>= ### Example 10.2 (nursing students and tutors) nst <- rbind(c(46, 222), c(41, 109), c(17, 40), c(8, 26)) colnames(nst) <- c("NR", "R") rownames(nst) <- c("generalStudent", "generalTutor", "psychiatricStudent", "psychiatricTutor") addmargins(nst) chisq.test(nst, correct = FALSE) # OK @ <<>>= ### Example 10.3 (Air Force Pilots) afp <- data.frame(nAccidents = 0:7, nPilots = c(12475, 4117, 1016, 269, 53, 14, 6, 2)) # estimate lambda lambdahat <- sum(afp$nAccidents * afp$nPilots / sum(afp$nPilots)) # expected counts observed <- afp$nPilots expected <- dpois(0:7, lambda = lambdahat) * sum(afp$nPilots) sum((observed - expected)^2 / expected) # NOT OK @ % TODO check what is wrong here: different Chi-Square value ?! \section{Effects of Survey Design on Chi-Square Tests} <<>>= ### Example 10.4 hh2 <- rbind(c(238, 376), c(176, 210)) rownames(hh2) <- c("cableYes", "cableNo") colnames(hh2) <- c("computerYes", "computerNo") addmargins(hh2) chisq.test(hh2, correct = FALSE) # OK @ \section{Corrections to Chi-Square Tests} % from ?svychisq % 'svychisq' computes first and second-order Rao-Scott corrections % to the Pearson chisquared test, and two Wald-type tests. <<>>= ### example 10.5 @ \chapter{Regression with Complex Survey Data} % Chapter 11 \section{Model-Based Regression in Simple Random Samples} \section{Regression in Complex Surveys} \chapter{Other Topics in Sampling} % Chapter 12 \end{document}