I. Background

I am using the lines from all three documents (blogs, news, and tweets) to predict the next word for each quiz question.

This data will eventually be used to predict the next word of an arbitrary statement.

See the Appendix for code.


[1] "SETUP COMPLETE"

II. Predictions

[1] "REPLACEMENT COMPLETE"

1. “The guy in front of me just bought a pound of bacon, a bouquet, and a case of…”

  Possible Next Word Frequency
1               beer         4
2              Boxer         1
3               Four         1
4              Moose         1
5           mountain         1
6              Pabst         1
7                Sly         1
8              Surge         1
9               wine         1

2. “You’re the reason why I smile everyday. Can you follow me please? It would mean the…”

  Possible Next Word Frequency
1              world        41
2           absolute         1
3             entire         1
4              owner         1
5             world3         1

3. “Hey sunshine, can you follow me and make me the…”

  Possible Next Word Frequency
1           happiest         1

4. “Very early observations on the Bills game: Offense still struggling but the…”

  Possible Next Word Frequency
1                are         1

5. “Go on a romantic date at the…”

   Possible Next Word Frequency
1                 end         5
2                time         5
3                 App         1
4                 art         1
5             Beverly         1
6              bottom         1
7              braves         1
8                Cake         1
9              cheese         1
10               chip         1

6. “Well I’m pretty sure my granny has some old bagpipes in her garage I’ll dust them off and be on my…”

  Possible Next Word Frequency
1                own         3
2             linked         1
3              mouth         1
4              phone         1
5                way         1

7. “Ohhhhh #PointBreak is on tomorrow. Love that film and haven’t seen it in quite some…”

  Possible Next Word Frequency
1               time         1

8. “After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little…”

   Possible Next Word Frequency
1             brother         5
2              sister         3
3             bedroom         1
4               bitty         1
5            brothers         1
6               claws         1
7             company         1
8             cousins         1
9            daughter         1
10                dog         1

9. “Be grateful for the good times and keep the faith during the…”

  Possible Next Word Frequency
1               last         1
2            worship         1

10. “If this isn’t the cutest thing you’ve ever seen, then you must be…”

     Possible Next Word Frequency
1                     a         1
2                  able         1
3             amazingly         1
4                 blind         1
5              cheating         1
6  crrrraaaaaazzzzyyyyy         1
7                 doing         1
8              familiar         1
9                    in         1
10             prepared         1

Appendix


Setup

knitr::opts_chunk$set(comment = NA)
options("scipen" = 100)
options(java.parameters = "-Xmx4g")

Packages

library(dplyr)
library(readr)
library(RWeka)
library(stringr)
library(textclean)
library(tm)

Read Files

#Read each corpus file line by line.

blogs <- read_lines(file = "en_US.blogs.txt")

news <- read_lines(file = "en_US.news.txt")

tweets <- read_lines(file = "en_US.twitter.txt")

Helper Functions

#Cleanup function:
#"’" is replaced with "'", then contractions are expanded.
#"U.S." is replaced with "US".
#"a.m." is replaced with "am".
#"p.m." is replaced with "pm".
replacement <- function(data) {
  
  temp <- replace_contraction(gsub("’", "'", data),
                              contraction.key = lexicon::key_contractions)
  #fixed = TRUE so the periods are matched literally rather than as regex wildcards.
  temp <- gsub("U.S.", "US", temp, fixed = TRUE)
  temp <- gsub("a.m.", "am", temp, fixed = TRUE)
  temp <- gsub("p.m.", "pm", temp, fixed = TRUE)
  return(temp)
}
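
As a quick illustration of the cleanup on a made-up sentence (not from the corpus):

#Hypothetical check: the contraction should be expanded and the
#abbreviations simplified, e.g. "a.m." becomes "am" and "U.S." becomes "US".
replacement("I can't be there at 9 a.m. in the U.S.")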



#Function returns the word immediately after a string for all lines that contain the string.
retrieve_next_words <- function(input_string) {

  #Word length of the input string.
  n <- length(strsplit(input_string, " ")[[1]])
  #Final list.
  next_words <- NULL
  
  for (bnt in bnts) {
    #Check if the string is in the line (fixed = TRUE: match it literally, not as a regex).
    if (grepl(input_string, bnt, fixed = TRUE)) {
      #Find where in the line it is located.
      beginning <- str_locate(bnt, fixed(input_string))[1]
      #Start from where the string is, split by spaces, and grab the next word.
      next_word <- strsplit(substring(bnt, beginning), " ")[[1]][n + 1]
      #Strip non-alphanumeric characters.
      next_word <- str_replace_all(next_word, regex("\\W+"), "")
      #Update the final list.
      next_words <- c(next_words, next_word)
    }
  }
  return(next_words)
}
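
A small, made-up two-line corpus shows how the function pulls out the word that follows the query (bnts is reassigned to the full corpus in section II below):

#Hypothetical mini-corpus standing in for bnts.
bnts <- c("go on a romantic date at the beach",
          "we had a romantic date at the movies last night")

#Should return "beach" and "movies".
retrieve_next_words("romantic date at the")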



possibilities <- function(q_string){
  
  #Cleanup.
  q_string <- replacement(q_string)
  #Word length of the input string.
  temp_str <- str_split(q_string, " ")[[1]]
  n <- length(temp_str)
  
  #Start with the last 10 words.
  if (n > 10) {
    q_string <- paste(temp_str[(n - 9):n], collapse = ' ')    
  }

  while (nchar(q_string) > 0) {

    #Get next words from blogs, news, and tweets.
    q_words <- retrieve_next_words(q_string)
    
    #If there is at least one word...
    if (length(q_words) > 0) {
      
      #Return at most 10 entries of the frequency table.
      df_freq <- df_pred_creator(q_words)
      
      if (nrow(df_freq) > 10){
        return(df_freq[1:10, ])
      } else {
        return(df_freq)
      }
      
    #No words means try again, with a shortened string.  
    } else {
      #Shorten the string by removing its left-most word.
      space_at <- str_locate(q_string, " ")[1]
      #If only one word remains, there is nothing left to remove.
      if (is.na(space_at)) break
      q_string <- substring(q_string, space_at + 1)
    }
  }
  #All string lengths failed to match.
  return("NO MATCHES")
}



#This function creates a frequency dataframe from next_words.
df_pred_creator <- function(next_words) {

  #Convert the sorted table to a dataframe.
  df <- as.data.frame(sort(table(next_words), decreasing = TRUE))
  
  #With only one distinct word, as.data.frame returns a single column with the
  #word as the row name; rebuild it as two columns.
  if (length(unique(next_words)) == 1) {
    df <- cbind(newColName = rownames(df), df)
    rownames(df) <- 1:nrow(df)
  }
  
  #Change the column names.
  df <- df %>%
    `colnames<-`(c("Possible Next Word", "Frequency"))
  
  return(df)
}
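
To see the back-off in action, here is a made-up corpus in which no line contains the full query: possibilities() keeps dropping the left-most word until a suffix of the query matches, then tabulates the next words with df_pred_creator().

#Hypothetical mini-corpus standing in for bnts (reassigned to the full corpus below).
bnts <- c("stay calm during the storm",
          "we kept the faith during the hard times",
          "she kept the faith during the long winter")

#No line contains "keep the faith during the", so the query is shortened
#until "the faith during the" matches; expected next words: "hard", "long".
possibilities("keep the faith during the")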

print("SETUP COMPLETE")

II. Predictions

blogs <- replacement(blogs)

news <- replacement(news)

tweets <- replacement(tweets)

bnts <- c(blogs, news, tweets)

print("REPLACEMENT COMPLETE")

1.

q1_string <- "The guy in front of me just bought a pound of bacon, a bouquet, and a case of"

possibilities(q1_string)

2.

q2_string <- "You're the reason why I smile everyday. Can you follow me please? It would mean the"

possibilities(q2_string)

3.

q3_string <- "Hey sunshine, can you follow me and make me the"

possibilities(q3_string)

4.

q4_string <- "Very early observations on the Bills game: Offense still struggling but the"

possibilities(q4_string)

5.

q5_string <- "Go on a romantic date at the"

possibilities(q5_string)

6.

q6_string <- "Well I'm pretty sure my granny has some old bagpipes in her garage I'll dust them off and be on my"

possibilities(q6_string)

7.

q7_string <- "Ohhhhh #PointBreak is on tomorrow. Love that film and haven't seen it in quite some"

possibilities(q7_string)

8.

q8_string <- "After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little"

possibilities(q8_string)

9.

q9_string <- "Be grateful for the good times and keep the faith during the"

possibilities(q9_string)

10.

q10_string <- "If this isn't the cutest thing you've ever seen, then you must be"

possibilities(q10_string)