Chapter 37 7. Clean mortality data

37.1 7.1 Remove columns not needed for the current analysis

# Keep every column of `mort` except the two that this analysis does not use.
rates <- select(mort, -Location_of_death, -Marital_status)

rates
## # A tibble: 60,143 x 11
##         ID Sex   Cause_of_death   dbuid2016 Postalcode B_year B_month B_day D_year D_month D_day
##      <dbl> <chr> <chr>                <dbl> <chr>       <dbl>   <dbl> <dbl>  <dbl>   <dbl> <dbl>
##  1 1000001 M     D47            13070175019 E1A1E9       1943       7     8   2016       9    22
##  2 1000002 F     C34            48060924001 T2E0T2       1943       5    22   2016      12    14
##  3 1000003 M     C26            35204502002 M4J1L1       1953       8     9   2016       4    21
##  4 1000004 M     C16            24663048013 H9H1B4       1923       6    14   2016      11    21
##  5 1000005 M     C22            24560249013 J2W2B6       1931       8     8   2016       4     4
##  6 1000006 F     C50            35250845002 L8H2K1       1933      11     4   2016       2     6
##  7 1000007 F     C85            35370763005 N9Y3X5       1918       5    20   2016       2     9
##  8 1000008 F     C56            35250788003 L8H7G2       1972       8    18   2016       6    13
##  9 1000009 F     C64            24662429005 H9P2B3       1922       7    23   2016       5    25
## 10 1000010 M     C22            48111094001 T5M1G2       1953       2     7   2016       1    16
## # i 60,133 more rows

37.2 7.2 Check and remove duplicate records

# Tally the rows per ID and keep only the IDs that occur more than once.
rates_dup <- filter(count(rates, ID), n > 1)

rates_dup
## # A tibble: 23 x 2
##         ID     n
##      <dbl> <int>
##  1 1000170     2
##  2 1000662     2
##  3 1001349     2
##  4 1001352     2
##  5 1004287     2
##  6 1004618     2
##  7 1004869     2
##  8 1005245     2
##  9 1008199     2
## 10 1012433     2
## # i 13 more rows
# Show the full records behind every repeated ID so they can be inspected
# before anything is removed (filtering join: rows of `rates` are kept as-is).
semi_join(rates, rates_dup, by = "ID")
## # A tibble: 47 x 11
##         ID Sex   Cause_of_death   dbuid2016 Postalcode B_year B_month B_day D_year D_month D_day
##      <dbl> <chr> <chr>                <dbl> <chr>       <dbl>   <dbl> <dbl>  <dbl>   <dbl> <dbl>
##  1 1000170 F     C91            12060095005 B4V1T1       1922       7    25   2016       7    22
##  2 1000170 F     C91            12060095005 B4V1T1       1922       7    25   2016       7    22
##  3 1000662 M     C34            24720038001 J7P4H7       1965       3    29   2016       5     2
##  4 1000662 M     C34            24720038001 J7P4H7       1965       3    29   2016       5     2
##  5 1001349 M     C16            24370129009 G9A5H9       1961       4     6   2016       5    24
##  6 1001349 M     C16            24370129009 G9A5H9       1961       4     6   2016       5    24
##  7 1001352 M     C24            48010141007 T1B3E3       1956      12    27   2016       4    28
##  8 1001352 M     C24            48010141007 T1B3E3       1956      12    27   2016       4    28
##  9 1004287 M     C34            35320194004 N4S2E1       1949       2    18   2016       1    19
## 10 1004287 M     C34            35320194004 N4S2E1       1949       2    18   2016       1    19
## # i 37 more rows
# Collapse repeated records. With no columns named, distinct() compares whole
# rows, so only exact copies are dropped; rows that share an ID but differ in
# any other column would both be retained.
rates <- distinct(rates)