****************************************************************************************************************;
* Example SAS code to replicate NCHS Data Brief No. 52, Figure 2                                               *;
* Mean blood mercury and lead levels in pregnant women aged 18-49 years, by age group: United States 2003-2008 *;
*                                                                                                              *;
* Jones L, Parker JD, Mendola P. Blood lead and mercury levels in pregnant women in the United States,         *;
* NCHS Data Brief. No 52. Hyattsville, MD: National Center for Health Statistics. 2010.                        *;
*                                                                                                              *;
* Available at: https://www.cdc.gov/nchs/products/databriefs/db52.htm                                          *;
****************************************************************************************************************;

options nocenter nodate nonumber pagesize=100 linesize=150;
OPTIONS FORMCHAR="|----|+|---+=|-/\<>*";

%put Run in SAS &sysver (maintenance release and release year: &sysvlong4);

** Macro To Download Data from NHANES website **;
%macro CreateDS(myDS);
  %let i = 1;
  %let DS = %scan(&myDS, &i);

  %do %until(&DS = %nrstr());
    %let Suffix = %lowcase(%substr(&DS, %eval(%length(&DS)-1)));
    %if       (&Suffix = _j) %then %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/2017-2018/&DS..xpt"; %end;
    %else %if (&Suffix = _i) %then %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/2015-2016/&DS..xpt"; %end;
    %else %if (&Suffix = _h) %then %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/2013-2014/&DS..xpt"; %end;
    %else %if (&Suffix = _g) %then %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/2011-2012/&DS..xpt"; %end;
    %else %if (&Suffix = _f) %then %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/2009-2010/&DS..xpt"; %end;
    %else %if (&Suffix = _e) %then %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/2007-2008/&DS..xpt"; %end;
    %else %if (&Suffix = _d) %then %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/2005-2006/&DS..xpt"; %end;
    %else %if (&Suffix = _c) %then %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/2003-2004/&DS..xpt"; %end;
    %else %if (&Suffix = _b) %then %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/2001-2002/&DS..xpt"; %end;
    %else %do; filename xptIn url "https://wwwn.cdc.gov/nchs/nhanes/1999-2000/&DS..xpt"; %end;
    libname xptIn xport;

    data &DS;
      set xptIn.&DS;
    run;

    libname xptIn clear;
    filename xptIn clear;

    %let i = %eval(&i+1);
    %let DS = %scan(&myDS, &i);
  %end;
%mend CreateDS;

** Call macro to import datafiles for multiple individual components **;

* Demographic (DEMO) *;
* Cadmium, Lead, & Total Mercury - Blood (named L06BMT_C for 2003-2004, PbCd_x for 2005-2006 and 2007-2008) *;

%CreateDS(demo_c demo_d demo_e 
         L06BMT_C PbCd_D PbCd_E );


** Append data files from multiple survey cycles **;
* Demographic (DEMO) *;
data demo;
  set demo_c
      demo_d
      demo_e;
run;

proc contents data=demo; run;

* Cadmium, Lead, & Total Mercury - Blood (note file name change between 2003-2004 and 2005-2006 cycles) *;
*LBXTHG - Mercury, total (ug/L) ;
* LBXBPB - Lead (ug/dL) ;
data PbCd (keep = seqn LBXTHG LBXBPB);
  set L06BMT_C 
      PbCd_D 
      PbCd_E ;
run;

data baseDS;
  merge demo (in = a)
        PbCd;
  by seqn;
  if a;

  * Create variable to define analysis population: pregnant adults age 18 - 49 (inclusive) with positive exam weight *;
  inAnalysis= (18<=ridageyr<=49 and RIDEXPRG=1 and wtmec2yr>0);
  
  * Create 6-year MEC exam weight *;
  wtmec6yr=wtmec2yr/3;

  * Create age categories *;
  if 18<=ridageyr<25 then ageCat=1;
  else if 25<=ridageyr<30 then ageCat=2;
  else if 30<=ridageyr<35 then ageCat=3;
  else if 35<=ridageyr<50 then ageCat=4;

  * Log-transform the outcome variables (for method 2 of calculating the geometric mean) *;
  LBXTHG_log=log(LBXTHG);
  LBXBPB_log=log(LBXBPB);
run;

* Define formats for labeling output *;
proc format;
  value ageLabelF
  1="18-24 years"
  2="25-29 years"
  3="30-34 years"
  4="35 years or over"
  0,-2=" "
  ;
  value $ varlabel
  "LBXTHG"="Mercury (ug/L)"
  "LBXBPB"="Lead (ug/dL)"
  ;
run;


*****************************************************;
** Descriptive statistics in SAS - check normality **;
*****************************************************;

* Distribution of total mercury and total lead (not normally distributed) *;

* Proc Univariate options: PLOT option generates histogram, box, and normal probability plots ;
*  NORMAL option generates statistics to test normality *;
proc univariate data = baseDS plot normal;
  * WHERE statement: specify population of interest *;
  where inAnalysis=1;    
  * VAR statement: specify variable(s) for descriptive statistics *;
  var LBXTHG LBXBPB;       
  * FREQ statement: specify appropriate sample weight (i.e. the exam weight, for this example) *;
  freq wtmec6yr; 
  title "Distribution of total mercury and total lead among pregnant women age 18-49: NHANES 2003-2008";           
run;

* A note on the use of the FREQ statement vs. the WEIGHT statement and the estimated standard deviation 
  The FREQ statement with appropriate sample weight yields an estimate of the standard deviation 
     whose denominator is an estimate of the POPULATION SIZE, i.e., the sum of the the sample weights
  The WEIGHT statement with the sample weight would instead yield an estimate of the standard deviation 
     whose denominator is the SAMPLE SIZE.*;

** Distribution of total mercury and total lead by age group **;

proc sort data = baseDS;
  by ageCat;
run;

proc univariate data = baseDS plot normal;
  where inAnalysis=1;
  by ageCat;    
  VAR LBXTHG;       
  freq wtmec6yr; 
  FORMAT ageCat ageLabelF.; 
  title "Distribution of total mercury, by age, among pregnant women age 18-49: NHANES 2003-2008";           
run;

* Note that the log-transformed variables are more normally distributed *;
proc univariate data = baseDS plot normal;
  where inAnalysis=1;    
  VAR LBXTHG_log LBXBPB_log;       
  freq wtmec6yr; 
  * output statement saves the 5th and 95th percentile values to a dataset *;
  output out=stats p5=p5_loghg p5_logpb p95=p95_loghg p95_logpb ;
  title "Distribution of log(total mercury) and log(total lead) among pregnant women age 18-49: NHANES 2003-2008";           
run;

***********************************************************************************;
** Check for outlier values with extreme weights - possible influential points   **;
***********************************************************************************;
* save 5th and 95th percentile values as macro variables, to use as reference line in scatterplot *;
proc transpose data = stats out=stats2;
run;

data _null_;
  set stats2;
  call symputx(_name_, col1);
run;

* Plot values of total mercury against the sample weight, with dashed reference lines at 5th and 95th percentile values *;
proc gplot data = baseDS (where = (inAnalysis=1));
  plot LBXTHG_log*wtmec6yr /vref=&p5_loghg &p95_loghg lvref=2;
  title "Plot of log(total mercury) vs weight";
run;

* Plot values of total lead against the sample weight, with dashed reference lines at 5th and 95th percentile values *;
proc gplot data = baseDS (where = (inAnalysis=1));
  plot LBXBPB_log*wtmec6yr /vref=&p5_logpb &p95_logpb lvref=2;
  title "Plot of log(total lead) vs weight";
run;
quit;

****************************************************************************************;

*********************************************************;
** Calculate geometric mean of mercury and lead levels **;
** Data brief 52, Figure 2                             **;
*********************************************************;

** Method 1: Request the geometric mean and standard error in Proc Surveymeans **;
* Note that the confidence intervals for the geometric mean in Proc Surveymeans (statistic-keyword GMCLM) do NOT account for the reduction in degrees of freedom *;
*    for subpopulations that are not found in all PSU/Strata *;

* to get correct variance estimates, you MUST specify option nomcar -- treat missing values as not missing completely at random (NOMCAR) for Taylor series variance estimation *;
proc surveymeans data=baseDS nomcar nobs geomean gmstderr;
  * specify survey design variables in the strata, cluster, and weight statements *;
  strata sdmvstra;
  cluster sdmvpsu;
  weight wtmec6yr;
  * specify your subpopulation(s) of interest in the domain statement *;
  domain inAnalysis*ageCat;
  * specify your analysis variable(s) in the var statement *;
  var LBXTHG LBXBPB;
  * ODS SELECT statement chooses which output to write to results window *;
  * ODS OUTPUT statement writes specified output to an output dataset *;
  ods select  DomainGeoMeans  ;
  ods output DomainGeoMeans =fig2_domain domain=fig2_n;
  format ageCat ageLabelf.;
  title "Mean blood mercury and lead levels in pregnant women aged 18-49 years, by age group: United States 2003-2008";
  title "Geometric mean from Method 1: Proc Surveymeans";
run;

* Combine domain and domain geometric means output datasets *;
data fig2;
  merge fig2_domain
        fig2_n;
  * only keep rows that contain domains of interest *;
  if inAnalysis=1;
run;

proc sort data = fig2;
  by varname agecat;
run; 

* Display results in a table *;
proc print data = fig2 noobs label;
  var varlabel ageCat n geomean gmstderr ;
  format  geomean gmstderr  5.2;
  title "Mean blood mercury and lead levels in pregnant women aged 18-49 years, by age group: United States 2003-2008";
  title2 "Geometric mean from Method 1: Proc Surveymeans";
run;


*********************************************************;

** Method 2: Request the arithmetic mean of the log-transformed variables and then transform back to the original scale **;

* Request the (arithmetic) mean and standard error of the LOG-TRANSFORMED variables in Proc Surveymeans **;
proc surveymeans data=baseDS nomcar nobs mean stderr ;
  * specify survey design variables in the strata, cluster, and weight statements *;
  strata sdmvstra;
  cluster sdmvpsu;
  weight wtmec6yr;
  * specify your subpopulation(s) of interest in the domain statement *;
  domain inAnalysis*ageCat;
  * specify your analysis variable(s) in the var statement *;
  var LBXTHG_log LBXBPB_log;
  * ODS SELECT statement chooses which output to write to results window *;
  * ODS OUTPUT statement writes specified output to an output dataset *;
  ods select  Domain  ;
  ods output domain=fig2_logtmp;
  format ageCat ageLabelf.;
  title "Arithmetic mean of log(mercury) and log(lead) in pregnant women aged 18-49 years, by age group: United States 2003-2008";
run;

* Transform back into original scale (exponentiate the mean of the log-transformed variables) *;
data fig2_logMethod;
  set fig2_logtmp  (where=(inAnalysis=1)) ;
  * exponentiate the mean of the log-transformed variables *;
  geomean=exp(mean);
  * approximate the standard error of the geometric mean (see Alf and Grossberg paper for details) *;
  segeomean= exp(mean) * stderr; 
  * remove log from variable name *;
  varname = translate(varname, "", "_log");
  label ageCat="Age category"
        geomean = "Geometric mean"
        segeomean="SE geometric mean"
        ;
run;

proc sort data = fig2_logMethod;
  by varname;
run;

* Display results in a table *;
proc print data = fig2_logMethod noobs label;
  var varname ageCat n geomean segeomean ;
  format varname $varlabel. geomean segeomean  5.2;
  title "Mean blood mercury and lead levels in pregnant women aged 18-49 years, by age group: United States 2003-2008";
  title2 "Geometric mean from Method 2: calculated as the mean of the log-transformed values, then exponentiated";
run;

footnote;


********************************;
** References **;
** Alf EF, Grossberg JMJP, Psychophysics. The geometric mean: Confidence limits and significance tests. 1979 26(5):419-421. *;