R build status GitHub release CRAN_Status_Badge

Overview

This package offers a pair of functions, renamefrom() and encodefrom(), for renaming and encoding data frames using external crosswalk files. It is especially useful when constructing master data sets from multiple smaller data sets that do not name or encode variables consistently across files. Based on the renamefrom and encodefrom Stata commands written by Sally Hudson and team.

Installation

Install the latest release version from CRAN with

install.packages('crosswalkr')

Install the latest development version from GitHub with

devtools::install_github('btskinner/crosswalkr')

Usage

library(crosswalkr)
library(dplyr)
library(haven)

## starting data frame
df <- data.frame(state = c('Kentucky','Tennessee','Virginia'),
                 fips = c(21,47,51),
                 region = c('South','South','South'))
df

##       state fips region
## 1  Kentucky   21  South
## 2 Tennessee   47  South
## 3  Virginia   51  South

## crosswalk with which to convert old names to new names with labels
cw <- data.frame(old_name = c('state','fips'),
                 new_name = c('stname','stfips'),
                 label = c('Full state name', 'FIPS code'))
cw

##   old_name new_name           label
## 1    state   stname Full state name
## 2     fips   stfips       FIPS code

Renaming

Convert old variable names to new names and add labels from crosswalk.

df1 <- renamefrom(df, cw_file = cw, raw = old_name, clean = new_name, label = label)
df1

##      stname stfips
## 1  Kentucky     21
## 2 Tennessee     47
## 3  Virginia     51

Convert old variable names to new names using old names as labels (ignoring labels in crosswalk).

df2 <- renamefrom(df, cw_file = cw, raw = old_name, clean = new_name, name_label = TRUE)
df2

##      stname stfips
## 1  Kentucky     21
## 2 Tennessee     47
## 3  Virginia     51

Convert old variable names to new names, but keep unmatched old names in the data frame.

df3 <- renamefrom(df, cw_file = cw, raw = old_name, clean = new_name, drop_extra = FALSE)
df3 

##      stname stfips region
## 1  Kentucky     21  South
## 2 Tennessee     47  South
## 3  Virginia     51  South

Encoding

## starting data frame
df <- data.frame(state = c('Kentucky','Tennessee','Virginia'),
                 stfips = c(21,47,51),
                 cenregnm = c('South','South','South'))
df

##       state stfips cenregnm
## 1  Kentucky     21    South
## 2 Tennessee     47    South
## 3  Virginia     51    South

## use state crosswalk data file from package
cw <- get(data(stcrosswalk))
cw

## # A tibble: 51 x 7
##    stfips stabbr stname               cenreg cenregnm  cendiv cendivnm          
##     <int> <chr>  <chr>                 <int> <chr>      <int> <chr>             
##  1      1 AL     Alabama                   3 South          6 East South Central
##  2      2 AK     Alaska                    4 West           9 Pacific           
##  3      4 AZ     Arizona                   4 West           8 Mountain          
##  4      5 AR     Arkansas                  3 South          7 West South Central
##  5      6 CA     California                4 West           9 Pacific           
##  6      8 CO     Colorado                  4 West           8 Mountain          
##  7      9 CT     Connecticut               1 Northeast      1 New England       
##  8     10 DE     Delaware                  3 South          5 South Atlantic    
##  9     11 DC     District of Columbia      3 South          5 South Atlantic    
## 10     12 FL     Florida                   3 South          5 South Atlantic    
## # … with 41 more rows

Create a new column with factor-encoded values.

df$state2 <- encodefrom(df, var = state, cw_file = cw, raw = stname, clean = stfips, label = stabbr)
df

##       state stfips cenregnm state2
## 1  Kentucky     21    South     KY
## 2 Tennessee     47    South     TN
## 3  Virginia     51    South     VA

Create a new column with labelled values.

## convert to tbl_df
df <- tibble::as_tibble(df)
df$state3 <- encodefrom(df, var = state, cw_file = cw, raw = stname, clean = stfips, label = stabbr)

Create a new column with factor-encoded values (ignoring the fact that df is a tibble).

df$state4 <- encodefrom(df, var = state, cw_file = cw, raw = stname, clean = stfips, label = stabbr, ignore_tibble = TRUE)

Show factors with labels:

as_factor(df)

## # A tibble: 3 x 6
##   state     stfips cenregnm state2 state3 state4
##   <chr>      <dbl> <chr>    <fct>  <fct>  <fct> 
## 1 Kentucky      21 South    KY     KY     KY    
## 2 Tennessee     47 South    TN     TN     TN    
## 3 Virginia      51 South    VA     VA     VA

Show factors without labels:

zap_labels(df)

## # A tibble: 3 x 6
##   state     stfips cenregnm state2 state3 state4
##   <chr>      <dbl> <chr>    <fct>   <int> <fct> 
## 1 Kentucky      21 South    KY         21 KY    
## 2 Tennessee     47 South    TN         47 TN    
## 3 Virginia      51 South    VA         51 VA