// -----------------------------------------------------------------------------
// NHANES blood pressure / cholesterol: clean and recode.
//
// Reads   C:\Nhanes\Data\demo_bp
// Writes  C:\Nhanes\Data\demo_bp1, demo_bp2a, demo_bp2b, exclu_3sps, demo_bp3
//         C:\nhanes\graph\histogram_discriptive.gph, box_plot.gph
//         C:\Nhanes\clean_recode.log
//
// Every check is restricted to (ridageyr >=20 & ridageyr <.) & ridstatr==2.
// Reminder: Stata sorts numeric missing values ABOVE every number, so
// "x >= 7" is true for missing x and "x < 7" is false for missing x.
// -----------------------------------------------------------------------------

// Close any log left open by an earlier (possibly failed) run; otherwise
// -log using- stops with "log file already open".
capture log close
log using "C:\Nhanes\clean_recode.log", replace

use C:\Nhanes\Data\demo_bp, clear

// ---- Identify missing and unavailable values --------------------------------
// Check missing, min, max values for numeric variables
tabstat bpq* mcq* if (ridageyr >=20 & ridageyr <.) & ridstatr==2, stat(n min max)

// The nmissing command can be installed from package dm67_3,
// url: http://www.stata-journal.com/software/sj5-4
nmissing bpq* mcq* if (ridageyr >=20 & ridageyr <.) & ridstatr==2

// Check frequency distribution for categorical variables
tabulate bpq010 if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing

// The loop below runs more than one variable at a time
foreach i in bpq020 bpq030 bpq050a bpq100d bpq070 bpq080 mcq160b mcq160c mcq160d mcq160e mcq160f {
    tabulate `i' if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing
}

// ---- Recode unavailable values as missing -----------------------------------
// Option 1: recode missing values one variable at a time
replace bpq010=. if bpq010==7 | bpq010==9
// or use the recode command: recode bpq010 (7 9 =.)

// Option 2: assign missing values by group.
// Values already missing also satisfy ">=7" and simply stay missing.
// bpq030 is not in this list; its codes are handled in the recode step below.
foreach i in bpq020 bpq050a bpq100d bpq070 bpq080 mcq160b mcq160c mcq160d mcq160e mcq160f {
    replace `i' =. if `i' >=7
}

save C:\Nhanes\Data\demo_bp1, replace

// ---- Evaluate extent of missing data ----------------------------------------
// Percent of missing values for categorical variables
tabulate bpq010 if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing

// The loop below runs more than one variable at a time
foreach i in bpq020 bpq070 bpq080 mcq160b mcq160c mcq160d mcq160e mcq160f {
    tabulate `i' if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing
}

// ---- Check data for skip patterns -------------------------------------------
// Check skip pattern for BP questionnaire
tabulate bpq020 if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing
tabulate bpq030 if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing
tabulate bpq050a if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing
tabulate bpq020 bpq030 if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing
tabulate bpq020 bpq050a if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing

// ---- Recode data as necessary -----------------------------------------------
// Check recode BPQ030
// Option 1: directly recode the variable and check after the recode.
// bpq030 stays 1 where bpq030==1.
// Rows with bpq020 in (1,2) and bpq030 either below 7 or missing become 2.
// missing() is required here: "bpq030 < 7" alone is false for missing values,
// which would leave the skipped rows unrecoded.
replace bpq030=2 if bpq030!=1 & (bpq020==1 | bpq020==2) & (bpq030 <7 | missing(bpq030))
// Anything that is still neither 1 nor 2 becomes missing.
// The two tests must be joined with "&": "!=1 | !=2" is true for every value
// and would blank the whole variable.
replace bpq030=. if bpq030!=1 & bpq030!=2
tabulate bpq020 bpq030 if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing

save C:\Nhanes\Data\demo_bp2a, replace

// Option 2: check derived variable (diaghtn) 1-yes, 2-no
// Check derived variable diaghtn
// Restart from demo_bp1 so the option 1 recode above is not carried over.
use C:\Nhanes\Data\demo_bp1, clear
gen diaghtn=.
replace diaghtn=1 if bpq030==1
replace diaghtn=2 if diaghtn !=1 & (bpq020==1 | bpq020==2) & bpq030 !=9
bysort diaghtn: tab bpq020 bpq030 if (ridageyr >=20 & ridageyr <.) & ridstatr==2, row missing

save C:\Nhanes\Data\demo_bp2b, replace

// ---- Check distributions for serum total cholesterol ------------------------
summarize lbxtc [w=wtmec4yr] if (ridageyr >=20 & ridageyr <.) & ridstatr==2, detail

histogram lbxtc if (ridageyr >=20 & ridageyr <.) & ridstatr==2, normal
graph save "C:\nhanes\graph\histogram_discriptive.gph", replace

// The if qualifier has to come before the comma that starts the options.
graph box lbxtc [w=wtmec4yr] if (ridageyr >=20 & ridageyr <.) & ridstatr==2, medtype(line)
graph save "C:\nhanes\graph\box_plot.gph", replace

// MEC weight plotted against cholesterol variable
graph twoway scatter wtmec4yr lbxtc if (ridageyr >=20 & ridageyr <.) & ridstatr==2, mlabel(seqn) ///
    title(NHANES 1999-2002: adults age 20 years and older)

// ---- Identify outliers and compare estimates with and without outliers ------
// Mean of serum total cholesterol - excluding three outliers
label define race 1 "Mex American"
label define race 2 "Other Hispanic", add
label define race 3 "NH White ", add
label define race 4 "NH Black", add
label define race 5 "Other Race - Including Multi-Racial", add
label values ridreth1 race

// delete extreme values
drop if seqn==10494 | seqn==13866 | seqn==17821
save C:\Nhanes\Data\exclu_3sps, replace

// mean total cholesterol without extreme values
mean lbxtc if (ridageyr >=20 & ridageyr <.) & ridstatr==2 [pweight=wtmec4yr], over(ridreth1)

// Mean of serum total cholesterol - including outliers
// NOTE(review): demo_bp2b was saved before the race value label was defined,
// so this table shows raw ridreth1 codes - confirm that is acceptable.
use C:\Nhanes\Data\demo_bp2b, clear

// mean total cholesterol with extreme values included
mean lbxtc if (ridageyr >=20 & ridageyr <.) & ridstatr==2 [pweight=wtmec4yr], over(ridreth1)

// ---- Recode based on alternate definitions ----------------------------------
// Check regroup/recode/definitions of categorical variables
gen raceth=1 if ridreth1==3
replace raceth=2 if ridreth1==4
replace raceth=3 if ridreth1==1
replace raceth=4 if ridreth1==2 | ridreth1==5
// or, use recode ridreth1 (3=1)(4=2)(1=3)(2 5 =4), generate(race)

recode ridageyr (min/19=.) (20/39 = 1) (40/59 = 2) (60/85 = 3), generate(age3cat)

// Count systolic and diastolic blood pressure readings, then change
// diastolic "0" to missing.
// NOTE(review): n_dbp is computed BEFORE the zeros are blanked, so a zero
// diastolic reading still counts as a reading. The order is kept as written;
// confirm it is intended.
gen n_sbp= !missing(bpxsy1)+ !missing(bpxsy2)+ !missing(bpxsy3)+ !missing(bpxsy4)
gen n_dbp= !missing(bpxdi1)+ !missing(bpxdi2)+ !missing(bpxdi3)+ !missing(bpxdi4)

foreach i in bpxdi1 bpxdi2 bpxdi3 bpxdi4 {
    replace `i' =. if `i'==0
}
// equivalent code that will change "0" to missing
// replace bpxdi1=. if bpxdi1==0
// replace bpxdi2=. if bpxdi2==0
// replace bpxdi3=. if bpxdi3==0
// replace bpxdi4=. if bpxdi4==0

// use egen rowmean() to calculate mean systolic and diastolic blood pressures
egen mean_sbp = rowmean(bpxsy1 bpxsy2 bpxsy3 bpxsy4)
egen mean_dbp = rowmean(bpxdi1 bpxdi2 bpxdi3 bpxdi4)

// Define a new variable hbp (high blood pressure = 1 or 0)
gen hbp_trt=1 if bpq050a==1
// "&" (not "|") is needed for the 7/9 exclusion to exclude anything.
replace hbp_trt=0 if hbp_trt !=1 & (bpq020==1 | bpq020==2) & (bpq050a !=7 & bpq050a !=9)

gen sbp140=1 if mean_sbp>=140 & mean_sbp<. & ((n_sbp >0 & n_sbp <.) & (n_dbp >0 & n_dbp <.))
replace sbp140=0 if sbp140 !=1 & ((n_sbp >0 & n_sbp <.) & (n_dbp >0 & n_dbp <.))

gen dbp90=1 if mean_dbp>=90 & mean_dbp<. & ((n_sbp >0 & n_sbp <.) & (n_dbp >0 & n_dbp <.))
replace dbp90=0 if dbp90 !=1 & ((n_sbp >0 & n_sbp <.) & (n_dbp >0 & n_dbp <.))

// hbp is set only when all three components are non-missing
gen hbp=1 if (hbp_trt==1 | sbp140==1 | dbp90==1) & ((hbp_trt>=0 & hbp_trt<.) & (sbp140>=0 & sbp140<.) & (dbp90>=0 & dbp90<.))
replace hbp=0 if hbp !=1 & ((hbp_trt>=0 & hbp_trt<.) & (sbp140>=0 & sbp140<.) & (dbp90>=0 & dbp90<.))

// Define a new variable hlp (hyperlipidemia = 1 or 0)
gen hlp_trt=1 if bpq100d==1
// "&" (not "|") is needed for the 7/9 exclusion to exclude anything.
replace hlp_trt=0 if hlp_trt !=1 & (bpq080==1 | bpq080==2) & (bpq100d !=7 & bpq100d !=9)

gen hlp_lab=1 if lbxtc>=240 & lbxtc <.
replace hlp_lab=0 if hlp_lab !=1 & (lbxtc>=0 & lbxtc <.)

// hlp is set only when both components are non-missing
gen hlp=1 if ((hlp_lab >=0 & hlp_lab <.) & (hlp_trt >=0 & hlp_trt <.)) & (hlp_lab==1 | hlp_trt==1)
replace hlp=0 if hlp !=1 & ((hlp_lab >=0 & hlp_lab <.) & (hlp_trt >=0 & hlp_trt <.))

save C:\Nhanes\Data\demo_bp3, replace

// ---- Cross-check the derived variables against their sources ----------------
tab raceth ridreth1 if (ridageyr >=20 & ridageyr <.) & ridstatr==2, missing
bysort hbp_trt: tab bpq020 bpq050a if (ridageyr >=20 & ridageyr <.) & ridstatr==2, row missing
bysort hbp hbp_trt: tab sbp140 dbp90 if (ridageyr >=20 & ridageyr <.) & ridstatr==2, row missing
//table hbp_trt sbp140 dbp90, by(hbp) row col missing
bysort hlp_trt: tab bpq080 bpq100d if (ridageyr >=20 & ridageyr <.) & ridstatr==2, row missing
bysort hlp: tab hlp_trt hlp_lab if (ridageyr >=20 & ridageyr <.) & ridstatr==2, row missing

tabstat ridageyr if (ridageyr >=20 & ridageyr <.) & ridstatr==2, by(age3cat) stat(n min max)
tabstat mean_sbp if (ridageyr >=20 & ridageyr <.) & ridstatr==2, by(sbp140) stat(n min max)
tabstat mean_dbp if (ridageyr >=20 & ridageyr <.) & ridstatr==2, by(dbp90) stat(n min max)
tabstat lbxtc if (ridageyr >=20 & ridageyr <.) & ridstatr==2, by(hlp_lab) stat(n min max)

log close