NHANES R data parser

NHANES is a great ressource for doing epidemiological research. As the NIH website provides only data import for commercial software here is my rewrite in R. First load from their site

adult.exe
youth.exe
lab.exe
lab2.exe
exam.exe

put everything in one directory and expand the self-extracting archives. Then create from each SAS file a new variable content file that will only contain variable name and tab separated start position in the .dat file. Adult.var for example would read like this:

SEQN 1
DMPFSEQ 6
DMPSTAT 11
DMARETHN 12
DMARACER 13
...
HAZNOK5R 3345

Then start the following R job with the datasets and variables that you are interested in

 1:
 2:
 3:
 4:
 5:
 6:
 7:
 8:
 9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
70:
    separ  <- "--------------------------------------------------------------------------"
    ti     <- c("NHANES R data parser")
    au     <- c("WJST")
    # 07Feb06 revised data
    direct <- c("D:/data/0702work/")
    dt     <- paste("run",date())
    head   <- cat(separ,"\n",ti,"\n",au,"\n",dt,"\n",separ,"\n",sep="")

    #------------------------------------|        |-----------------------------------

    read.nhanes<-function(option1,option2) {
        # read variable positions
        var <- read.table(paste(direct,option1,".var",sep=""),header=FALSE,sep="\t",dec=",",na.strings="")

        # the variables we are interested in
        var.list <- option2

        # i holds the rownames where line positions may be found
        i <- as.numeric(row.names(var[var[,1] %in% var.list,]))

        # k holds the picklist where to find the variable content
        k <- NA
        for (j in 1:(length(i)-1)) {
            k[(j*2)-1] <- var[i[j]+1,2] - var[i[j],2]
            if ( i[j+1] == i[j]+1 ) {
                k[j*2] <- NA
            }
            else {
                k[j*2] <- -(var[i[j+1],2] - var[i[j],2] - k[(j*2)-1])
            }
        }
        j <- j+1
        k[(j*2)-1] <- var[i[j]+1,2] - var[i[j],2]
        k[(j*2)] <- -(var[dim(var)[1],2] - var[i[j],2])
        k <- k[complete.cases(k)]

        # now read data, label and save
        ds <- read.fwf(paste(direct,option1,".dat",sep=""), k, header = FALSE, sep = "\t", as.is = FALSE, skip = 0)
        colnames(ds) <- var[var[,1] %in% var.list,1]
        ds
    }

    #------------------------------------|        |-----------------------------------

    youth <- read.nhanes("youth",c("SEQN","HFC6D","HFC6D1","DMARACER","HSSEX","HSAITMOR","HYE1G","HYE1I","HYG8","HYG13A","HYG9","HSDOIMO"))
    adult <- read.nhanes("adult",c("SEQN","HFC6D","HFC6D1","DMARACER","HSSEX","HSAITMOR","HAC1E","HAC1H","HAL6","HAL11A","HAD1","HAD5R","HAL7","HSDOIMO"))

    nc <- c(colnames(adult),colnames(youth)[!colnames(youth) %in% colnames(adult)])
    both <- data.frame(matrix( nrow=nrow(adult)+nrow(youth), ncol=length(nc), byrow=TRUE ,NA))
    colnames(both) <- c(nc)

    for (i in nc) {
        if (i %in% colnames(adult)) {
            both[1:nrow(adult),i] <- adult[,i]
        }
        if (i %in% colnames(youth)) {
            both[(nrow(adult)+1):nrow(both),i] <- youth[,i]
        }
    }
    exam <- read.nhanes("exam",c("SEQN","ALPCATWL","ALPCATWW","ALPMITWL","ALPMITWW","ALPRYEWL","ALPRYEWW","ALPPEAWL","ALPPEAWW","ALPRAGWL","ALPRAGWW","MPPB2","BMPHT","BMPWT","BMPBMI"))
    nhanes <- merge( both, exam, by=c("SEQN"), all=TRUE)

    lab <- read.nhanes("lab",c("SEQN","WCPSI","LMPPCNT","MOPPCNT","GRPPCNT","LMP","MOP","LMPDIF","MOPDIF","EOP","FRP","SCP","SCPSI","PSP","PSPSI","DMPCREGN","WTPFQX6","TEP","AHP","HBP","SSP","SAP","HCP","DHP","H1P","H2P","RUP","RUPUNIT","VRP","TOP"))
    nhanes <- merge( nhanes, lab, by=c("SEQN"), all=TRUE)

    lab2 <- read.nhanes("lab2",c("SEQN","VDPSI"))
    nhanes <- merge( nhanes, lab2, by=c("SEQN"), all=TRUE)

    save(nhanes, file=paste(direct,"nhanes.Rdata",sep=""), compress=TRUE)