Created
September 15, 2015 18:26
-
-
Save izahn/db16d52ed6b4289425bb to your computer and use it in GitHub Desktop.
Haven package frustrations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(haven)
library(dplyr)
library(foreign)

## Both readers are pointed at the same Stata file so that their results
## can be compared side by side.
dta.url <- "http://izahn.crabdance.com/owncloud/index.php/s/PlXcLrNFyKoIDk3/download"

## Read Stata dataset with foreign
xf <- read.dta(dta.url)
## Read Stata dataset with haven
xh <- read_dta(dta.url)

str(xf) ## structure of foreign result is comprehensible
str(xh) ## structure of haven result is a cluttered mess

glimpse(xf) # works on foreign result
## glimpse() failed on the haven result when this was written (message
## below). try() reports the error but lets the script continue when it is
## run with source() or Rscript instead of line by line.
try(glimpse(xh)) # doesn't work on haven result
## Error: `x` and `labels` must be same type
## Extracting metadata from foreign result is easy: the variable names,
## variable labels and formats are attributes of the data frame itself,
## and the value labels live in its "label.table" attribute.
xf.info <- list(
  variables = data.frame(
    attributes(xf)[c("names", "var.labels", "formats")]
  ),
  values = attr(xf, "label.table", exact = TRUE)
)

## Extracting metadata from haven result is harder but not too bad: the
## variable label ("label") and the value labels ("labels") are attributes
## of each column. exact = TRUE matters here, because attr() partial-matches
## by default and would return the "labels" attribute for a column that has
## value labels but no variable label.
xh.info <- list(
  variables = data.frame(
    names = names(xh),
    labels = vapply(
      xh,
      function(col) {
        lbl <- attr(col, "label", exact = TRUE)
        ## Columns without a variable label get NA so that the result is
        ## always a character vector with one entry per column.
        if (is.null(lbl)) {
          NA_character_
        } else {
          as.character(lbl)
        }
      },
      character(1)
    )
  ),
  ## lapply() always gives one list element per column (NULL when the
  ## column has no value labels), mirroring the list kept in xf.info.
  values = lapply(xh, attr, which = "labels", exact = TRUE)
)
## The foreign package mostly distinguishes numeric
## from non-numeric columns correctly; the haven package
## doesn't even try, it just stores everything as doubles
## and slaps labels on it.
## A column's class() can have more than one element, so each class vector
## is collapsed into a single "a/b/c" string. That keeps every column of the
## comparison table a character vector with exactly one entry per variable.
collapse_class <- function(col) {
  paste(class(col), collapse = "/")
}

## The outer parentheses print the table as well as assigning it.
(data.comparison <- data.frame(
  havenclass = vapply(xh, collapse_class, character(1)),
  foreignclass = vapply(xf, collapse_class, character(1)),
  haventype = vapply(xh, typeof, character(1)),
  foreigntype = vapply(xf, typeof, character(1)),
  description = xf.info[["variables"]][["var.labels"]]
))
## Cleaning up the haven result is difficult. As far as I can tell getting
## these data into a usable form (e.g., age should be a number,
## zodiac should be a factor or a character) requires examining
## each variable and manually converting it:
factor.vars <- c(
  "sex", "usecomp", "usemail", "useweb", "hapmar", "happy", "rincome",
  "income", "marital", "postlife", "pres96", "richwork", "satjob",
  "spdeg", "spwrksta", "vote96", "wrkstat", "zodiac", "husbft",
  "wifeft"
)

## The list above is maintained by hand, so fail with a clear message if
## any of its names is not actually a column of the haven result.
missing.vars <- setdiff(factor.vars, names(xh))
if (length(missing.vars) > 0) {
  stop(
    "factor variables not found in haven data: ",
    paste(missing.vars, collapse = ", "),
    call. = FALSE
  )
}

## Every column that is not listed as a factor is treated as numeric.
num.vars <- setdiff(names(xh), factor.vars)
xh[num.vars] <- lapply(xh[num.vars], as.numeric)
xh[factor.vars] <- lapply(xh[factor.vars], as_factor)
## Well, I guess that wasn't so bad, but mostly that's because I cheated
## and looked at the output from the foreign package to make the list of
## factor variables.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment