****************************************************************************************************************;
* Example SAS code to replicate NCHS Data Brief No. 303, Figures 1                                             *;
* Prevalence of Depression Among Adults Aged 20 and Over: United States, 2013–2016                             *;
*                                                                                                              *;
* Brody DJ, Pratt LA, Hughes JP. Prevalence of Depression Among Adults Aged 20 and Over: United                *;
* States, 2013–2016. NCHS Data Brief. No 303. Hyattsville, MD: National Center for Health Statistics. 2018.    *;                *;
*                                                                                                              *;
* Available at: https://www.cdc.gov/nchs/products/databriefs/db303.htm                                         *;
****************************************************************************************************************;

options nocenter nodate nonumber pagesize=max linesize=150;
options FORMCHAR="|----|+|---+=|-/\<>*";

** print SAS version to log **;
%put NOTE: Run in SAS &sysver (maintenance release and release year: &sysvlong4);

* Define paths to Demographic (DEMO) and Mental Health - Depression Screener (DPQ) data *;
filename demo_h url 'https://wwwn.cdc.gov/nchs/nhanes/2013-2014/demo_h.xpt'; libname demo_h xport;
filename demo_i url 'https://wwwn.cdc.gov/nchs/nhanes/2015-2016/demo_i.xpt'; libname demo_i xport;
filename dpq_h url 'https://wwwn.cdc.gov/nchs/nhanes/2013-2014/dpq_h.xpt'; libname dpq_h xport;
filename dpq_i url 'https://wwwn.cdc.gov/nchs/nhanes/2015-2016/dpq_i.xpt'; libname dpq_i xport;

* Read in SAS transport files using a data step and append across survey cycles - Demographic files *;
data demo;
  set demo_h.demo_h(keep=seqn riagendr ridageyr sdmvstra sdmvpsu wtmec2yr)
      demo_i.demo_i(keep=seqn riagendr ridageyr sdmvstra sdmvpsu wtmec2yr);
run;

* Read in SAS transport files and append  across survey cycles - Mental Health - Depression Screener files *;
data dpq;
  set dpq_h.dpq_h
      dpq_i.dpq_i;

  ** Set Refused/Don't Know To Missing (for all variable names starting with "dpq") **;
  array _dpq dpq:;
  do over _dpq;
    if (_dpq >= 7) then call missing(_dpq);
  end;

  ** Create Depression Score (score will be missing if any of the items are missing) **;
  Depression_Score = dpq010+dpq020+dpq030+dpq040+dpq050+dpq060+dpq070+dpq080+dpq090;

  ** Create binary depression indicator as 0/100 variable, to calculate the prevalence of depression **; 
  if (0 <= Depression_Score < 10) then Depression_Indicator = 0;
  else if (Depression_Score >= 10) then Depression_Indicator = 100;

  keep seqn Depression_Score Depression_Indicator;
run;

* Merge component files to produce analysis dataset *;
data one;
  merge demo
        dpq;
  by seqn;

  ** Create Selection Variable For Subpopulation Of Interest **;
  if (ridageyr >= 20) then Select = 1;

  ** Calculate MEC weight for 4-year data *;
  ** Use the MEC exam weights, per the analytic notes in the DPQ documentation file **;
  ** Although the outcome of interest is derived from a questionnaire, these questions were asked at the MEC and so only MEC participants were eligible *; 
  WTMEC4YR = 1/2 * WTMEC2YR;
run;

*******************************************;
* Labels for categorized variables *;
*******************************************;

proc format;
  value genf
    .='Both Sexes' 1='Men' 2='Women';
  value agef
    .='20 and over' 0-19='<20' 20-39='20-39' 40-59='40-59' 60-high='60 or more';
run;


************************************************************;
** CALCULATE PROPORTIONS                                  **;
************************************************************;

** Use Proc Surveymeans To Calculate Prevalences **;
* to get correct variance estimates, you MUST specify option nomcar -- treat missing values as not missing completely at random (NOMCAR) for Taylor series variance estimation *;
proc surveymeans data=one nomcar nobs mean stderr;
  * specify survey design variables in the strata, cluster, and weight statements *;
  strata sdmvstra;
  cluster sdmvpsu;
  weight WTMEC4YR;
  * specify your subpopulation(s) of interest in the domain statement *;
  domain Select Select*riagendr Select*ridageyr Select*riagendr*ridageyr;
  var Depression_Indicator;
  * ODS SELECT statement chooses which output to write to results window *;
  * ODS OUTPUT statement writes specified output to an output dataset *;
  ods select Domain ;
  ods output domain=work.fig1_domain;
  format riagendr genf. ridageyr agef. ;
  title "Percentage of persons aged 20 and over with depression, by age and sex: United States, 2013–2016";
run;


**********************************;
** Prepare and print data table **;
**********************************;
proc sort data = fig1_domain;
  by riagendr ridageyr;
run;

proc print data = fig1_domain noobs;
  var riagendr ridageyr n mean stderr ;
  format n comma8. mean 5.1 stderr 5.1;
  title "Data table: Percentage of persons aged 20 and over with depression, by age and sex: United States, 2013–2016";
  footnote "NOTES: Depression was defined as a score greater than or equal to 10 on the Patient Health Questionnaire.";
  footnote2 "SOURCE: NCHS, National Health and Nutrition Examination Survey, 2013–2016.";
run;

* clear footnote statements *;
footnote;


************************************************************;
** T-tests of Sex                                         **;
************************************************************;

** Use proc surveyreg to test for differences between men and women, overall and by age group **;
** can request the test using either an ESTIMATE statement or an LSMEANS statement **;

* option 1: use estimate statement to conduct the hypothesis test *;
proc surveyreg data=one nomcar;
  strata sdmvstra;
  cluster sdmvpsu;
  weight WTMEC4YR;
  * DOMAIN statement: request comparisons for the overall analysis population (i.e. where Select=1) and by agewithin the analysis population (Select*ridageyr) *;
  * Note that ridageyr is a continuous variable but is formatted to create age groups 20-39, 40-59, and 60 and over *;
  domain Select Select*ridageyr;
  * CLASS statement: indicate that riagendr should be treated as a categorical variable instead of a continous variable *;
  class riagendr;
  * MODEL statement: noint request no intercept in the model (so the parameter estimates are the age-specific means) *;
  *          solution requests the parameter estimates be printed (not printed by default if a class statement is used) *;
  *          vadjust specifies whether to use an adjustment for degrees of freedom in the variance estimation. *;
  *          vadjust=none produces variance estimates that match the default options in proc surveymeans *;
  model Depression_Indicator = riagendr /noint solution vadjust=none;
  * ESTIMATE statement: produce the contrast as the mean value of Depression_Indicator for men minus the mean value of Depression_Indicator for women *;
  estimate 'Men vs Women' riagendr 1 -1;
  ods select Estimates ParameterEstimates;
  ods output estimates = estimates_gender;
  * FORMAT statement: apply a format to create categories from the continuous age variable ridageyr and to apply meaningful labels to the values of riagendr *;
  format riagendr genf. ridageyr agef.;
  title "Test for differences between men and women: Percentage of persons aged 20 and over with depression, by age and sex: United States, 2013–2016";
  title2 "Using an ESTIMATE statement";
run;

* option 2: use lsmeans statement to conduct the hypothesis test *;
proc surveyreg data=one nomcar;
  strata sdmvstra;
  cluster sdmvpsu;
  weight WTMEC4YR;
  domain Select Select*ridageyr;
  class riagendr;
  model Depression_Indicator = riagendr /noint solution vadjust=none;
  lsmeans riagendr /diff;
  ods select Diffs ;
  ods output Diffs=diffs_gender;
  format riagendr genf. ridageyr agef.;
  title "Test for differences between men and women: Percentage of persons aged 20 and over with depression, by age and sex: United States, 2013–2016";
  title2 "Using an LSMEANS statement";
run;

***********************************************************************;
** T-tests of Age Groups, overall and for each sex **;
***********************************************************************;

proc surveyreg data=one nomcar;
  strata sdmvstra;
  cluster sdmvpsu;
  weight WTMEC4YR;
  domain Select Select*riagendr;
  class ridageyr;
  model Depression_Indicator = ridageyr /noint solution vadjust=none;
  estimate '20-39 vs 40-59' ridageyr 1 -1 0,
           '20-39 vs 60 or more' ridageyr 1 0 -1,
           '40-59 vs 60 or more' ridageyr 0 1 -1;
  ods select Estimates;
  format riagendr genf. ridageyr agef.;
  title "Test for differences between age groups: Percentage of persons aged 20 and over with depression, by age and sex: United States, 2013–2016";
run;