#' ---
#' title: "Example R code to replicate NCHS Data Brief No. 303, Figure 1"                                         
#' author: " "
#' date: " "
#' --- 
#' # Figure 1.  Percentage of persons aged 20 and over with depression, by age and sex: United States, 2013-2016
#'
#' Brody DJ, Pratt LA, Hughes J. Prevalence of depression among adults aged 20 and over: United States, 2013-2016. NCHS Data Brief, no 303. Hyattsville, MD: National Center for Health Statistics. 2018.

# Load survey and dplyr packages
#+ message = FALSE, warning=FALSE
library(dplyr)
library(survey)
#'
options(survey.lonely.psu='adjust')

# Display Version Information
cat("R package versions:\n")
for (p in c("base", "survey","dplyr")) { 
  cat(p, ": ", as.character(packageVersion(p)), "\n")
}

#' # Data preparation
# Download & Read SAS Transport Files
# Demographic (DEMO)
download.file("https://wwwn.cdc.gov/nchs/nhanes/2013-2014/DEMO_H.XPT", tf <- tempfile(), mode="wb")
DEMO_H <- foreign::read.xport(tf)[,c("SEQN","RIAGENDR","RIDAGEYR","SDMVSTRA","SDMVPSU","WTMEC2YR")]
download.file("https://wwwn.cdc.gov/nchs/nhanes/2015-2016/DEMO_I.XPT", tf <- tempfile(), mode="wb")
DEMO_I <- foreign::read.xport(tf)[,c("SEQN","RIAGENDR","RIDAGEYR","SDMVSTRA","SDMVPSU","WTMEC2YR")]

# Mental Health - Depression Screener (DPQ) 
download.file("http://wwwn.cdc.gov/nchs/nhanes/2013-2014/DPQ_H.XPT", tf <- tempfile(), mode="wb")
DPQ_H <- foreign::read.xport(tf)
download.file("http://wwwn.cdc.gov/nchs/nhanes/2015-2016/DPQ_I.XPT", tf <- tempfile(), mode="wb")
DPQ_I <- foreign::read.xport(tf)

# Append Files
DEMO <- bind_rows(DEMO_H, DEMO_I)
DPQ <- bind_rows(DPQ_H, DPQ_I)

# Merge DEMO and DPQ files and create derived variables

One <- left_join(DEMO, DPQ, by="SEQN") %>%
  # Set 7=Refused and 9=Don't Know To Missing for variables DPQ010 thru DPQ090 ##
  mutate_at(vars(DPQ010:DPQ090), ~ifelse(. >=7, NA, .)) %>%
  mutate(. , 
         # create indicator for overall summary
         one = 1,
         # Create depression score as sum of variables DPQ010 -- DPQ090
         Depression.Score = rowSums(select(. , DPQ010:DPQ090)),
         # Create depression indicator as binary 0/100 variable. (is missing if Depression.Score is missing)
         Depression= ifelse(Depression.Score >=10, 100, 0), 
         # Create factor variables
         Gender = factor(RIAGENDR, labels=c("Men", "Women")),
         Age.Group = cut(RIDAGEYR, breaks=c(-Inf,19,39,59,Inf),labels=c("Under 20", "20-39","40-59","60 and over")),
         # Generate 4-year MEC weight (Divide weight by 2 because we are appending 2 survey cycles)                                                             
         # Note: using the MEC Exam Weights (WTMEC2YR), per the analytic notes on the               
         #       Mental Health - Depression Screener (DPQ_H) documentation                          
         WTMEC4YR = WTMEC2YR/2 ,
         # Define indicator for analysis population of interest: adults aged 20 and over with a valid depression score
         inAnalysis= (RIDAGEYR >= 20 & !is.na(Depression.Score))
         ) %>% 
  # drop DPQ variables
  select(., -starts_with("DPQ"))

#' ## Define survey design 
# Define survey design for overall dataset
NHANES_all <- svydesign(data=One, id=~SDMVPSU, strata=~SDMVSTRA, weights=~WTMEC4YR, nest=TRUE)
                    
# Create a survey design object for the subset of interest: adults aged 20 and over with a valid depression score 
# Subsetting the original survey design object ensures we keep the design information about the number of clusters and strata
NHANES <- subset(NHANES_all, inAnalysis)

#' ## Analysis
# Define a function to call svymean and unweighted count
getSummary <- function(varformula, byformula, design){
  # Get mean, stderr, and unweighted sample size
  c <- svyby(varformula, byformula, design, unwtd.count ) 
  p <- svyby(varformula, byformula, design, svymean ) 
  outSum <- left_join(select(c,-se), p) 
  outSum
}

#' ### Calculate prevalence of depression overall, by gender, by age group, and by age and gender
#' Adults
getSummary(~Depression, ~one, NHANES)
#' By sex
getSummary(~Depression, ~Gender, NHANES)
#' By age
getSummary(~Depression, ~Age.Group, NHANES)
#' By sex and age
getSummary(~Depression, ~Gender + Age.Group, NHANES)

#' ### Compare Prevalence Between Men And Women
svyttest(Depression~Gender, NHANES)$p.value %>% as.numeric 
svyttest(Depression~Gender, subset(NHANES, Age.Group=="20-39"))$p.value %>% as.numeric
svyttest(Depression~Gender, subset(NHANES, Age.Group=="40-59"))$p.value %>% as.numeric
svyttest(Depression~Gender, subset(NHANES, Age.Group=="60 and over"))$p.value %>% as.numeric


#' ### Pairwise t-testing by age groups for total, men, and women
#' Differences by age group, among all adults
# Full output from svyttest command
svyttest(Depression~Age.Group, subset(NHANES, Age.Group=="20-39" | Age.Group=="40-59"))
# Displaying p-values only
svyttest(Depression~Age.Group, subset(NHANES, Age.Group=="20-39" | Age.Group=="40-59"))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Age.Group=="20-39" | Age.Group=="60 and over"))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Age.Group=="40-59" | Age.Group=="60 and over"))$p.value %>% as.numeric
#' Differences by age group, among men
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Men" & (Age.Group=="20-39" | Age.Group=="40-59")))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Men" & (Age.Group=="20-39" | Age.Group=="60 and over")))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Men" & (Age.Group=="40-59" | Age.Group=="60 and over")))$p.value %>% as.numeric
#' Differences by age group, among women
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Women" & (Age.Group=="20-39" | Age.Group=="40-59")))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Women" & (Age.Group=="20-39" | Age.Group=="60 and over")))$p.value %>% as.numeric
svyttest(Depression~Age.Group, subset(NHANES, Gender=="Women" & (Age.Group=="40-59" | Age.Group=="60 and over")))$p.value %>% as.numeric