## ----setup, echo = FALSE------------------------------------------------------
# knitr options for the whole document: cache chunk results and hide
# package start-up / informational messages.
library(knitr)
opts_chunk$set(cache = TRUE, message = FALSE)

## -----------------------------------------------------------------------------
# Load the misspellings dataset shipped with fuzzyjoin; the code below uses
# its `misspelling` and `correct` columns.
library(dplyr)
library(fuzzyjoin)
data(misspellings)

misspellings

## ----words--------------------------------------------------------------------
# use the dictionary of words from the qdapDictionaries package,
# which is based on the Nettalk corpus.
library(qdapDictionaries)
# as_tibble() replaces the deprecated tbl_df().
words <- as_tibble(DICTIONARY)

words

## ----sub_misspellings---------------------------------------------------------
# Work on a reproducible random sample of 1000 misspellings.
set.seed(2016)
sub_misspellings <- misspellings %>%
  sample_n(1000)

## ----joined, dependson = c("words", "sub_misspellings")-----------------------
# Pair each sampled misspelling with every dictionary word within a string
# distance of 1.
joined <- sub_misspellings %>%
  stringdist_inner_join(words, by = c(misspelling = "word"), max_dist = 1)

## ----dependson = "joined"-----------------------------------------------------
joined

## ----dependson = "joined"-----------------------------------------------------
# Number of candidate dictionary words matched per misspelling.
joined %>%
  count(misspelling, correct)

## ----dependson = "joined"-----------------------------------------------------
# For each (misspelling, correct) pair: how many dictionary words were
# matched, and whether any of them equals the known correct spelling.
# ungroup() so the result does not stay grouped by `misspelling`.
which_correct <- joined %>%
  group_by(misspelling, correct) %>%
  summarize(guesses = n(), one_correct = any(correct == word)) %>%
  ungroup()

which_correct

# proportion of matched misspellings with at least one correct guess
mean(which_correct$one_correct)

# number uniquely correct (out of the original 1000)
sum(which_correct$guesses == 1 & which_correct$one_correct)

## ----left_joined, dependson = "sub_misspellings"------------------------------
# A left join keeps misspellings that matched no dictionary word; those rows
# have NA in `word`.
left_joined <- sub_misspellings %>%
  stringdist_left_join(words, by = c(misspelling = "word"), max_dist = 1)

left_joined

left_joined %>%
  filter(is.na(word))

## ----left_joined2, dependson = "sub_misspellings"-----------------------------
# Same left join, but allowing a string distance of up to 2.
left_joined2 <- sub_misspellings %>%
  stringdist_left_join(words, by = c(misspelling = "word"), max_dist = 2)

left_joined2

left_joined2 %>%
  filter(is.na(word))