## assertive has some important changes. Read ?changes for details.
The customer_data dataset contains (made-up) personal data from some customers. Import the dataset.
# Import the customer records. Text columns are kept as character vectors
# (not factors) so the string checks can be applied to them directly.
# Adjust the path to wherever your copy of customer_data.csv lives.
customer_data <- read.csv(
  file = "C:/Users/rjc2003/Dropbox/useR2015_workshop/for distribution/answers/assertive/customer_data.csv",
  stringsAsFactors = FALSE
)
# Print a compact summary of the columns that were read in.
str(customer_data)
## 'data.frame': 20 obs. of 7 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "MR" "MRS" "" "Prof" ...
## $ FirstName : chr "Vaughn" "Shi" "Nithya" "Mukesh" ...
## $ LastName : chr "F" "Ma" "Tamboli" "Kulkarni" ...
## $ DateOfBirth: chr "1927-01-09" "1957-10-12" "1925-04-14" "1964-03-04" ...
## $ Telephone : chr "(01294) 907358 " "(0114) 229 71370" "+44 1202 101080 " "(01352) 794762 " ...
## $ Postcode : chr "KA12 8SE " "S7 1FF " "" "CH64 4DU " ...
Use the assertive package to find bad data points that may require cleaning.
Check the `Id` column.
# Check that the `Id` column is numeric.
assert_is_numeric(customer_data$Id)
# Check that the `Id` column contains natural numbers. This takes two
# assertions: this one requires every value to be positive; the
# whole-number requirement is asserted separately.
assert_all_are_positive(customer_data$Id)
## Error in eval(expr, envir, enclos): customer_data$Id contains non-positive values.
## There was 1 failure:
## Position Value Cause
## 1 14 -14 too low
# Whole-number part of the natural-number check on the `Id` column.
assert_all_are_whole_numbers(customer_data$Id)
# Check that the `Id` column has no duplicates, i.e. each customer
# identifier appears only once.
assert_has_no_duplicates(customer_data$Id)
## Error in eval(expr, envir, enclos): customer_data$Id has duplicates.
Check the `Title` column.
# Check that the `Title` column is character.
assert_is_character(customer_data$Title)
# Check that the `Title` column contains valid honorifics. Which titles
# count as valid is decided by the assertive package, not by this script;
# empty strings are reported as failures too.
assert_all_are_honorifics(customer_data$Title)
## Error in eval(expr, envir, enclos): customer_data$Title are not all honorifics.
## There were 7 failures:
## Position Value Cause
## 1 3 bad format
## 2 4 Prof bad format
## 3 7 Ninja bad format
## 4 12 Mrr bad format
## 5 14 Brigadier bad format
## 6 19 Rt Hon bad format
## 7 20 Darth bad format
Check the `FirstName` column.
# Check that the `FirstName` column is character.
assert_is_character(customer_data$FirstName)
# Check that every value in the `FirstName` column is present, i.e.
# neither missing nor an empty string.
assert_all_are_not_missing_nor_empty_characters(customer_data$FirstName)
## Error in eval(expr, envir, enclos): customer_data$FirstName contains missing or empty strings.
## There was 1 failure:
## Position Value Cause
## 1 20 empty
Check the `LastName` column.
# Check that the `LastName` column is character.
assert_is_character(customer_data$LastName)
# Check that the `LastName` column contains strings of a sensible
# number of characters: count the characters in each name, then require
# every count to lie in the closed range 2 to 30 (both ends included).
nch_last_name <- nchar(customer_data$LastName)
assert_all_are_in_closed_range(nch_last_name, 2, 30)
## Error in eval(expr, envir, enclos): nch_last_name are not all in the range [2,30].
## There was 1 failure:
## Position Value Cause
## 1 1 1 too low
Check the `DateOfBirth` column.
Parsing dates and times can give unexpected results. Notice that the badly formatted date “06-05-1957” still parses to a real date (though not the right one!). This is why the extra check for a plausible age is necessary.
# Check that the `DateOfBirth` column contains dates in ISO 8601
# form, `%Y-%m-%d`. Passing this check only means the string could be
# parsed with that format, not that the resulting date is plausible.
assert_all_are_date_strings(customer_data$DateOfBirth, "%Y-%m-%d")
## Error in eval(expr, envir, enclos): customer_data$DateOfBirth is not a character vector of dates.
## There was 1 failure:
## Position Value Cause
## 1 10 1964-13-15 bad format
# Check that the `DateOfBirth` column indicates that the customers
# are aged between 18 and 120.
# Once we are happy with the format, we can convert to Dates. Strings
# that do not match the format are converted to NA.
customer_data$DateOfBirth <- as.Date(customer_data$DateOfBirth, "%Y-%m-%d")
# Request days explicitly: the default `units = "auto"` picks the unit
# from the smallest difference, so the result is not guaranteed to be
# in days.
age <- difftime(Sys.Date(), customer_data$DateOfBirth, units = "days")
# Convert to approximate years (365.25 days per year allows for leap
# years) and require every age to lie in the closed range 18 to 120.
age_in_years <- as.numeric(age, units = "days") / 365.25
assert_all_are_in_closed_range(age_in_years, 18, 120)
Check the `Telephone` column.
# Check that the `Telephone` column is character.
assert_is_character(customer_data$Telephone)
# Check that the `Telephone` column contains valid UK telephone
# numbers. What counts as valid is defined by the assertive package.
assert_all_are_uk_telephone_numbers(customer_data$Telephone)
## Error in eval(expr, envir, enclos): customer_data$Telephone are not all UK telephone numbers.
## There were 11 failures (showing the first 10):
## Position Value Cause
## 1 2 011422971370 bad format
## 2 7 bad format
## 3 8 02093078556 bad format
## 4 10 bad format
## 5 12 0125255354 bad format
## 6 13 bad format
## 7 15 bad format
## 8 16 1617090166 bad format
## 9 17 01135676514 bad format
## 10 19 9802575 bad format
Check the `Postcode` column.
# Check that the `Postcode` column is character.
assert_is_character(customer_data$Postcode)
# Check that the `Postcode` column contains valid UK postcodes.
# NOTE(review): the imported values appear to carry trailing spaces
# (see the str() output), which presumably is why even well-formed
# postcodes are reported as failures -- confirm, and consider trimming
# whitespace as a cleaning step.
assert_all_are_uk_postcodes(customer_data$Postcode)
## Error in eval(expr, envir, enclos): customer_data$Postcode are not all UK postcodes.
## There were 19 failures (showing the first 10):
## Position Value Cause
## 1 1 KA12 8SE bad format
## 2 2 S7 1FF bad format
## 3 3 bad format
## 4 4 CH64 4DU bad format
## 5 5 bad format
## 6 6 N10 1QX bad format
## 7 7 GU12 6DD bad format
## 8 8 W4 3QYQ bad format
## 9 9 B650HE bad format
## 10 11 CH100 9AL bad format