Chapter 46 16. Postal code quality check activity

Postal codes often require cleaning before they can be used for joins. In this activity, we examine whether postal codes match the expected Canadian postal code format.

46.1 16.1 Standardize postal codes

rates <- rates %>%
  mutate(
    Postalcode_clean = Postalcode %>%
      str_to_upper() %>%
      str_replace_all("\\s+", "")
  )

46.2 16.2 Identify postal codes that do not match the expected format

postal_pattern <- "^(?!.*[DFIOQU])[A-VXY][0-9][A-Z][0-9][A-Z][0-9]$"

postal_issues <- rates %>%
  filter(is.na(str_match(Postalcode_clean, postal_pattern))) %>%
  select(ID, Postalcode, Postalcode_clean)

postal_issues
## # A tibble: 38 x 3
##         ID Postalcode Postalcode_clean
##      <dbl> <chr>      <chr>           
##  1 1000103 V2A6X      V2A6X           
##  2 1000695 V5S2HY     V5S2HY          
##  3 1000760 V6Y1ML     V6Y1ML          
##  4 1000896 VOC1HO     VOC1HO          
##  5 1001210 V0ROB8     V0ROB8          
##  6 1001262 <NA>       <NA>            
##  7 1001964 <NA>       <NA>            
##  8 1003432 V2SOA4     V2SOA4          
##  9 1003907 VOB1HO     VOB1HO          
## 10 1004334 VOB1G8     VOB1G8          
## # i 28 more rows

46.3 16.3 Check each postal code position

rates %>%
  transmute(
    position_1 = str_sub(Postalcode_clean, 1, 1),
    position_2 = str_sub(Postalcode_clean, 2, 2),
    position_3 = str_sub(Postalcode_clean, 3, 3),
    position_4 = str_sub(Postalcode_clean, 4, 4),
    position_5 = str_sub(Postalcode_clean, 5, 5),
    position_6 = str_sub(Postalcode_clean, 6, 6)
  ) %>%
  map(unique)
## $position_1
##  [1] "T" "M" "H" "J" "L" "N" "B" "S" "P" "K" "V" "G" "E" "R" "C" "A" "Y" "X" NA 
## 
## $position_2
##  [1] "2" "4" "9" "8" "5" "3" "6" "0" "1" "7" "O" NA  "L" "S"
## 
## $position_3
##  [1] "E" "J" "H" "W" "Y" "P" "M" "A" "G" "L" "B" "N" "S" "K" "V" "T" "X" "Z" "R" "C" NA 
## 
## $position_4
##  [1] "0" "1" "2" "3" "7" "4" "5" "6" "8" "9" "O" NA  "L" "S"
## 
## $position_5
##  [1] "T" "L" "B" "K" "X" "G" "J" "C" "E" "M" "A" "Y" "N" "Z" "V" "H" "P" "S" "R" "W" NA  "1" "7"
## [24] "2"
## 
## $position_6
##  [1] "2" "1" "4" "6" "5" "3" "0" "8" "7" "9" ""  "Y" "L" "O" NA  "R" "S" "V"

46.4 16.4 Check postal code length

rates %>%
  mutate(postal_length = str_length(Postalcode_clean)) %>%
  count(postal_length)
## # A tibble: 4 x 2
##   postal_length     n
##           <int> <int>
## 1             5     1
## 2             6 58905
## 3             7     4
## 4            NA     6