Chapter 39 9. Create date variables and calculate age

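We will use lubridate to create date of birth (DOB) and date of death (DOD) from their year, month, and day components. We then calculate age as the difference between the year of death and the year of birth. This is an approximation: it overstates completed years of age by one whenever death occurs before the birthday in that year. Any negative age is recoded to 0.5.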

# Step 1: glue the year/month/day components into "Y-M-D" strings and parse
# them into the DOB and DOD date columns.
# Step 2: derive Age as (year of death - year of birth), stored as a double,
# and recode any negative result to 0.5.
rates <- rates %>%
  mutate(
    DOB = as_date(str_c(B_year, B_month, B_day, sep = "-")),
    DOD = as_date(str_c(D_year, D_month, D_day, sep = "-"))
  ) %>%
  mutate(
    Age = as.numeric(year(DOD) - year(DOB)),
    Age = if_else(Age < 0, 0.5, Age)
  )

# Smallest and largest computed Age, ignoring missing values.
rates %>%
  pull(Age) %>%
  range(na.rm = TRUE)
## [1]   0 111

39.1 9.1 Check remaining missing values

# Number of missing values in each column, returned as a named integer vector.
rates %>%
  map_int(function(column) sum(is.na(column)))
##             ID            Sex Cause_of_death      dbuid2016     Postalcode         B_year 
##              0             13              0              4              6             24 
##        B_month          B_day         D_year        D_month          D_day            DOB 
##              0              0              0              0              0             30 
##            DOD            Age 
##              0             30
# Show every record that has a missing value in at least one column.
filter(rates, if_any(everything(), ~ is.na(.x)))
## # A tibble: 53 x 14
##         ID Sex   Cause_of_death   dbuid2016 Postalcode B_year B_month B_day D_year D_month D_day
##      <dbl> <chr> <chr>                <dbl> <chr>       <dbl>   <dbl> <dbl>  <dbl>   <dbl> <dbl>
##  1 1000185 F     C85            59350259032 V4V2G1         NA       6    15   2016      11     2
##  2 1000345 <NA>  C19            59152106002 V3W1S7       1964       4    22   2016       9    23
##  3 1001139 F     C50            59510179006 V0J0B5         NA       6    15   2016       6     6
##  4 1001262 M     C25            59150514001 <NA>         1952       1    25   2016       5     2
##  5 1001402 F     C34            59390163010 V1E1V2         NA       7    11   2016       8    18
##  6 1001696 <NA>  C34            59170383013 V8T4P6       1931       8     6   2016      10    16
##  7 1001964 M     C61            59150945002 <NA>         1948       1    22   2016       3    17
##  8 1002150 M     C26            59090838002 V2P6S3         NA       6    15   2016       2    22
##  9 1003446 F     C34            59170152001 V8T4E5         NA       6    15   2016      10    10
## 10 1004221 M     C34            59170164004 V8V4V6         NA       6    15   2016      10    28
## # i 43 more rows
## # i 3 more variables: DOB <date>, DOD <date>, Age <dbl>

39.2 9.2 Drop records missing essential identifiers

For this exercise, records missing B_year or dbuid2016 are removed because these values are required for age calculation and geographic linkage.

# Keep only the records where both B_year and dbuid2016 are present.
rates <- rates %>%
  filter(!is.na(B_year), !is.na(dbuid2016))

39.3 9.3 Impute missing sex values

For practice purposes, missing Sex values are randomly assigned as M or F. The set.seed() function makes the random assignment reproducible.

# Fix the RNG seed so the random M/F assignment below is reproducible.
set.seed(123)

# Flag the rows with no recorded Sex and tally them.
missing_sex_index <- is.na(rates$Sex)
missing_sex_count <- sum(missing_sex_index)

# Draw exactly one value per missing row, so the replacement vector always
# matches the number of positions being filled (no recycling involved).
imputed_sex <- sample(c("M", "F"), size = missing_sex_count, replace = TRUE)

# Fill only the flagged positions, then store Sex as a factor.
rates$Sex <- as.factor(
  replace(as.character(rates$Sex), missing_sex_index, imputed_sex)
)

# Tabulate Sex after imputation.
count(rates, Sex)
## # A tibble: 2 x 2
##   Sex       n
##   <fct> <int>
## 1 F     28617
## 2 M     31475

39.4 9.4 Recalculate age where needed

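Some records may still have missing age values because their day and month values may have been swapped, which makes the date fail to parse. For this exercise, we rebuild DOB with day and month exchanged, but only for the rows where DOB is still missing, and then recalculate Age for all rows.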

# Where DOB is still missing, fall back to a date built with the day and month
# components exchanged; rows that already have a DOB keep it unchanged.
# Age is then recomputed for every row with the same rule as before:
# year of death minus year of birth, with negative values recoded to 0.5.
rates <- rates %>%
  mutate(
    DOB = coalesce(DOB, as_date(str_c(B_year, B_day, B_month, sep = "-")))
  ) %>%
  mutate(
    Age = as.numeric(year(DOD) - year(DOB)),
    Age = if_else(Age < 0, 0.5, Age)
  )

# List any records whose Age is still missing after the recalculation.
filter(rates, is.na(Age))
## # A tibble: 0 x 14
## # i 14 variables: ID <dbl>, Sex <fct>, Cause_of_death <chr>, dbuid2016 <dbl>, Postalcode <chr>,
## #   B_year <dbl>, B_month <dbl>, B_day <dbl>, D_year <dbl>, D_month <dbl>, D_day <dbl>,
## #   DOB <date>, DOD <date>, Age <dbl>