I am now using the lines from all three documents to predict the next word for a quiz.
This data will eventually be used to predict the next word in an arbitrary statement.
See the Appendix for code.
[1] "SETUP COMPLETE"
[1] "REPLACEMENT COMPLETE"
Possible Next Word Frequency
1 beer 4
2 Boxer 1
3 Four 1
4 Moose 1
5 mountain 1
6 Pabst 1
7 Sly 1
8 Surge 1
9 wine 1
Possible Next Word Frequency
1 world 41
2 absolute 1
3 entire 1
4 owner 1
5 world3 1
Possible Next Word Frequency
1 happiest 1
Possible Next Word Frequency
1 are 1
Possible Next Word Frequency
1 end 5
2 time 5
3 App 1
4 art 1
5 Beverly 1
6 bottom 1
7 braves 1
8 Cake 1
9 cheese 1
10 chip 1
Possible Next Word Frequency
1 own 3
2 linked 1
3 mouth 1
4 phone 1
5 way 1
Possible Next Word Frequency
1 time 1
Possible Next Word Frequency
1 brother 5
2 sister 3
3 bedroom 1
4 bitty 1
5 brothers 1
6 claws 1
7 company 1
8 cousins 1
9 daughter 1
10 dog 1
Possible Next Word Frequency
1 last 1
2 worship 1
Possible Next Word Frequency
1 a 1
2 able 1
3 amazingly 1
4 blind 1
5 cheating 1
6 crrrraaaaaazzzzyyyyy 1
7 doing 1
8 familiar 1
9 in 1
10 prepared 1
# Normalise raw text before it is used for matching.
#   * The curly apostrophe "’" is replaced with "'" and contractions are
#     expanded by replace_contraction().
#   * "U.S." is replaced with "US".
#   * "a.m." is replaced with "am".
#   * "p.m." is replaced with "pm".
#
# data: text to clean (passed straight to gsub(), so a character vector).
# Returns the cleaned text.
replacement <- function(data) {
  temp <- replace_contraction(
    gsub("’", "'", data),
    contraction.key = lexicon::key_contractions
  )
  # fixed = TRUE makes the dots literal periods. As regular expressions they
  # match any character, which silently rewrote ordinary words
  # (e.g. "army" -> "am", "pump" -> "pm").
  temp <- gsub("U.S.", "US", temp, fixed = TRUE)
  temp <- gsub("a.m.", "am", temp, fixed = TRUE)
  temp <- gsub("p.m.", "pm", temp, fixed = TRUE)
  temp
}
# Return the word immediately after a phrase, for every line that contains it.
#
# input_string: a single phrase, words separated by single spaces. It is
#   matched literally, so punctuation such as "$", "(" or "?" is safe.
# Reads the corpus from the variable `bnts`, which must be defined in an
#   enclosing environment (presumably the combined blogs/news/tweets lines;
#   confirm against the setup code).
# Returns a character vector with one cleaned word per matching line, or NULL
#   when no line yields a word.
retrieve_next_words <- function(input_string) {
  if (length(input_string) != 1 || is.na(input_string) ||
      !nzchar(input_string)) {
    return(NULL)
  }
  # Word length of input string.
  n <- length(strsplit(input_string, " ", fixed = TRUE)[[1]])
  # Position of the first literal occurrence in every line (-1 when absent).
  starts <- regexpr(input_string, bnts, fixed = TRUE)
  hits <- which(starts > 0)
  if (length(hits) == 0) {
    return(NULL)
  }
  # Start from where the phrase is, split by space, grab the next word.
  tails <- substring(bnts[hits], starts[hits])
  tokens <- strsplit(tails, " ", fixed = TRUE)
  next_words <- vapply(tokens, function(tok) tok[n + 1], character(1))
  # Get rid of non-alphanumeric characters.
  next_words <- str_replace_all(next_words, regex("\\W+"), "")
  # A phrase at the very end of a line gives NA and a punctuation-only token
  # gives ""; neither is a usable prediction.
  next_words <- next_words[!is.na(next_words) & nzchar(next_words)]
  if (length(next_words) == 0) {
    return(NULL)
  }
  next_words
}
# Predict the next word for a statement using back-off matching.
#
# The statement is cleaned with replacement(), trimmed to its last 10 words,
# and looked up with retrieve_next_words(). When nothing is found the left-most
# word is dropped and the lookup is repeated until a match is found or no
# words remain.
#
# q_string: a single statement whose next word should be predicted.
# Returns the first (at most 10) rows of the frequency data frame built by
#   df_pred_creator(), or the string "NO MATCHES" when every suffix fails.
possibilities <- function(q_string) {
  # Cleanup.
  q_string <- replacement(q_string)
  # Word length of input string.
  temp_str <- str_split(q_string, " ")[[1]]
  n <- length(temp_str)
  # Start with the last 10 words.
  if (n > 10) {
    q_string <- paste(temp_str[(n - 9):n], collapse = " ")
  }
  while (!is.na(q_string) && nzchar(q_string)) {
    # Get next words from the corpus, ignoring missing or empty candidates.
    q_words <- retrieve_next_words(q_string)
    q_words <- q_words[!is.na(q_words) & nzchar(q_words)]
    if (length(q_words) > 0) {
      # Return at most 10 entries of the frequency table.
      return(head(df_pred_creator(q_words), 10))
    }
    # No words: retry with the left-most word removed. Once there is no space
    # left the single remaining word has already been tried, so stop instead
    # of producing an NA string (which made the loop condition fail).
    space_at <- as.integer(regexpr(" ", q_string, fixed = TRUE))
    if (space_at < 0) {
      break
    }
    q_string <- substring(q_string, space_at + 1)
  }
  # All string lengths failed to match.
  "NO MATCHES"
}
# Create a frequency data frame from a vector of candidate next words.
#
# next_words: character vector of candidate words; NA entries are not counted
#   (table() ignores them by default).
# Returns a data frame sorted by descending count with two columns:
#   "Possible Next Word" (factor) and "Frequency" (integer count).
df_pred_creator <- function(next_words) {
  counts <- sort(table(next_words), decreasing = TRUE)
  # Build the columns explicitly. sort() turns a table with a single distinct
  # word into a plain named vector, for which as.data.frame() yields only one
  # column, so relying on it breaks whenever all candidates are the same word.
  words <- names(counts)
  if (is.null(words)) {
    words <- character(0)
  }
  df <- data.frame(
    factor(words, levels = words),
    as.vector(counts)
  )
  # Change column names.
  colnames(df) <- c("Possible Next Word", "Frequency")
  df
}
# Progress marker printed after the definitions above have been evaluated.
print("SETUP COMPLETE")