Chapter 33 Initial data review

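Before cleaning the data, it is helpful to review the dimensions, column names, missing values, and structure of each dataset. The four datasets are first collected into a single named list so that the same summary can be applied to all of them at once.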

# Collect the four loaded datasets into one named list; the review steps
# that follow iterate over this list instead of repeating code per dataset.
data_list <- setNames(
  list(mort, pop, corr, env),
  c("mort", "pop", "corr", "env")
)

33.1 Dataset dimensions

# One row per dataset: its name (taken from the list names) together with
# its row and column counts.
dplyr::bind_rows(
  map(
    data_list,
    function(df) tibble(rows = nrow(df), columns = ncol(df))
  ),
  .id = "dataset"
)
## # A tibble: 4 x 3
##   dataset   rows columns
##   <chr>    <int>   <int>
## 1 mort     60143      13
## 2 pop         51      25
## 3 corr    420963       6
## 4 env     116011       8

33.2 Column names

# List the column names of every dataset, keyed by dataset name.
# NOTE(review): `pop` names such as "04-Jan", "09-May" and "14-Oct" look like
# spreadsheet-mangled age bands (1-4, 5-9, 10-14) -- confirm before cleaning.
lapply(data_list, names)
## $mort
##  [1] "ID"                "Sex"               "Cause_of_death"    "dbuid2016"        
##  [5] "Location_of_death" "Marital_status"    "Postalcode"        "B_year"           
##  [9] "B_month"           "B_day"             "D_year"            "D_month"          
## [13] "D_day"            
## 
## $pop
##  [1] "...1"                         "Health Service Delivery Area" "Year"                        
##  [4] "Gender"                       "<1"                           "04-Jan"                      
##  [7] "09-May"                       "14-Oct"                       "15-19"                       
## [10] "20-24"                        "25-29"                        "30-34"                       
## [13] "35-39"                        "40-44"                        "45-49"                       
## [16] "50-54"                        "55-59"                        "60-64"                       
## [19] "65-69"                        "70-74"                        "75-79"                       
## [22] "80-84"                        "85-89"                        "90+"                         
## [25] "Total"                       
## 
## $corr
## [1] "dbuid2016"      "csduid2016"     "hruid2017"      "hrname_english" "hrname_french" 
## [6] "dbpop2016"     
## 
## $env
## [1] "POSTALCODE12" "WTHNRC12_01"  "WTHNRC12_02"  "WTHNRC12_03"  "WTHNRC12_04"  "WTHNRC12_05" 
## [7] "WTHNRC12_06"  "WTHNRC12_07"

33.3 Missing values by dataset

# For each dataset, count the NA entries in every column; the result is a
# list of named integer vectors (one vector per dataset, one count per column).
map(
  data_list,
  function(df) {
    map_int(df, function(column) sum(is.na(column)))
  }
)
## $mort
##                ID               Sex    Cause_of_death         dbuid2016 Location_of_death 
##                 0                13                 0                 4                 0 
##    Marital_status        Postalcode            B_year           B_month             B_day 
##                 0                 6                24                26                26 
##            D_year           D_month             D_day 
##                 0                 0                 0 
## 
## $pop
##                         ...1 Health Service Delivery Area                         Year 
##                            0                            0                            0 
##                       Gender                           <1                       04-Jan 
##                            0                            0                            0 
##                       09-May                       14-Oct                        15-19 
##                            0                            0                            0 
##                        20-24                        25-29                        30-34 
##                            0                            0                            0 
##                        35-39                        40-44                        45-49 
##                            0                            0                            0 
##                        50-54                        55-59                        60-64 
##                            0                            0                            0 
##                        65-69                        70-74                        75-79 
##                            0                            0                            0 
##                        80-84                        85-89                          90+ 
##                            0                            0                            0 
##                        Total 
##                            0 
## 
## $corr
##      dbuid2016     csduid2016      hruid2017 hrname_english  hrname_french      dbpop2016 
##              0              0              0              0              0             35 
## 
## $env
## POSTALCODE12  WTHNRC12_01  WTHNRC12_02  WTHNRC12_03  WTHNRC12_04  WTHNRC12_05  WTHNRC12_06 
##            0            0            0            0            0            0            0 
##  WTHNRC12_07 
##            0