This is an exploratory data analysis of the blog, news, and tweet data provided by the course.
The data will eventually be used to predict the next word in a statement.
See the Appendix for code.
For each of Blogs, News, and Tweets, the following information is provided:
There are 899,288 blogs in the file.
There are a total of 37,334,131 words and 206,824,505 characters in the file.
On average, each blog in the file has 41.5 words and 230 characters.
Remaining word counts in blogs are:
[1] 1001 1003 1009 1014 1015 1024 1106 1127 1133 1157 1219 1295 1362 1380 1656
[16] 1716 2031 2079 2379 3283 6170 6630
Remaining character counts in blogs are:
[1] 4513 4527 4596 4606 4615 4702 4702 4714 4736 4772 4883 4919
[13] 4974 5054 5057 5385 5496 5557 5739 5883 6142 6179 6613 6886
[25] 7012 7375 8190 9189 9810 10785 12409 14201 19795 37191 40833
29.9% of the total words are from the top 20.
Word Frequency
1 the 1,860,156
2 and 1,094,401
3 to 1,069,440
4 a 900,362
5 of 876,799
6 i 775,032
7 in 598,532
8 that 460,782
9 is 432,712
10 it 403,902
11 for 363,838
12 you 298,702
13 with 286,733
14 was 278,347
15 on 276,511
16 my 270,855
17 this 259,008
18 as 223,949
19 have 218,930
20 be 209,061
There are 1,010,242 news texts in the file.
There are a total of 34,372,530 words and 203,223,159 characters in the file.
On average, each news text in the file has 34 words and 201.2 characters.
Remaining word counts in news texts are:
[1] 451 458 460 463 480 485 507 517 521 563 691 708 870 898 1031
[16] 1370 1792
Remaining character counts in news texts are:
[1] 2333 2363 2377 2386 2428 2442 2559 2581 2592 2625 2626 2644
[13] 2756 2785 2900 3008 3264 3555 3885 4198 5120 5236 5760 8949
[25] 11384
27.4% of the total words are from the top 20.
Word Frequency
1 the 1,974,366
2 to 906,145
3 and 889,511
4 a 878,035
5 of 774,502
6 in 679,065
7 for 353,901
8 that 347,079
9 is 284,240
10 on 269,881
11 with 254,813
12 said 250,418
13 he 228,996
14 was 228,970
15 it 220,092
16 at 214,177
17 as 187,560
18 i 158,856
19 his 157,671
20 be 152,860
There are 2,360,148 tweets in the file.
There are a total of 30,373,543 words and 162,096,031 characters in the file.
On average, each tweet in the file has 12.9 words and 68.7 characters.
Remaining word counts in tweets are:
[1] 47
25.4% of the total words are from the top 20.
Word Frequency
1 the 937,405
2 to 788,645
3 i 723,447
4 a 611,358
5 you 548,089
6 and 438,538
7 for 385,348
8 in 380,376
9 of 359,635
10 is 358,775
11 it 295,087
12 my 291,906
13 on 278,022
14 that 234,661
15 me 202,547
16 be 187,850
17 at 186,753
18 with 173,477
19 your 171,221
20 have 168,670
All three data files have similar right-skewed distributions of word and character counts. The tweet distribution is slightly different because tweets were capped at 140 characters.
The top 20 words are very similar across the three files and account for a similar proportion of the total words in each.
To predict the next word for an input statement, the statement will be matched against each line of the three data files; wherever it matches, the word that immediately follows will be saved. A frequency table of the saved words will then rank the candidate predictions, as in the sketch below.
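A minimal sketch of that lookup, assuming the three files are loaded into character vectors named blogs, news, and tweets (these names, and the example phrase, are illustrative, not part of the original code):

#Sketch only: returns the word that follows `statement` on each matching line.
library(stringr)
next_word_candidates <- function(statement, lines) {
  #Match the statement followed by one more word, capturing that word.
  #(A real implementation should regex-escape the statement first.)
  pattern <- paste0("\\b", str_to_lower(statement), "\\s+(\\w+)")
  str_match(str_to_lower(lines), pattern)[, 2]
}
#Pool the candidates from all three files and rank them by frequency.
candidates <- na.omit(c(next_word_candidates("thank you for the", blogs),
                        next_word_candidates("thank you for the", news),
                        next_word_candidates("thank you for the", tweets)))
head(sort(table(candidates), decreasing = TRUE))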
#Libraries used throughout the Appendix.
library(ngram)      #wordcount()
library(tokenizers) #tokenize_words()
library(dplyr)      #mutate() and the pipe operator
library(ggplot2)    #plotting
library(scales)     #pretty_breaks(), comma
library(gridExtra)  #grid.arrange()
#This function will determine the total number of lines and the number of words and characters in each line.
words_chars <- function(many_lines){
  #Total number of lines.
  total_lines <- length(many_lines)
  #Number of words in each line (wordcount() takes one string at a time).
  words <- sapply(many_lines, wordcount, USE.NAMES = FALSE)
  #Number of characters in each line (nchar() is already vectorized).
  chars <- nchar(many_lines)
  return(list(total_lines, words, chars))
}
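For reference, the raw files can be read in and passed to words_chars as below; the file names are assumed to follow the course dataset and are not part of the original code. The info_t object used in the tweet histogram at the end of the Appendix would come from this step.

#Example usage (file names assumed; skipNul avoids embedded-NUL warnings).
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
tweets <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
info_b <- words_chars(blogs)
info_n <- words_chars(news)
info_t <- words_chars(tweets)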
#This function will determine the total and average number of words and characters from all lines.
summary_stats <- function(info) {
  lines <- info[[1]]
  words <- info[[2]]
  chars <- info[[3]]
  num_words <- sum(words)
  avg_words <- round(num_words / lines, 1)
  num_chars <- sum(chars)
  avg_chars <- round(num_chars / lines, 1)
  return(list(lines, num_words, avg_words, num_chars, avg_chars))
}
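Continuing the example above, the list returned by words_chars feeds directly into summary_stats:

#Summary lists for each file (total lines, total/average words and characters).
summary_b <- summary_stats(info_b)
summary_n <- summary_stats(info_n)
summary_t <- summary_stats(info_t)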
#This function will create a summary statement.
summary_statement <- function(label1, label2, summary_info) {
  lines <- summary_info[[1]]
  total_words <- summary_info[[2]]
  avg_words <- summary_info[[3]]
  total_chars <- summary_info[[4]]
  avg_chars <- summary_info[[5]]
  statement <- paste("There are",
                     prettyNum(lines, big.mark = ","),
                     label1,
                     "in the file.")
  total_statement <- paste("There are a total of",
                           prettyNum(total_words, big.mark = ","),
                           "words and",
                           prettyNum(total_chars, big.mark = ","),
                           "characters in the file.")
  avg_statement <- paste("On average, each",
                         label2,
                         "in the file has",
                         prettyNum(avg_words, big.mark = ","),
                         "words and",
                         prettyNum(avg_chars, big.mark = ","),
                         "characters.")
  cat(paste(statement,
            total_statement,
            avg_statement,
            sep = "\n"))
}
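For example, the blog summary near the top of this report would be produced by a call along these lines:

#Prints the three-line blog summary shown at the top of the report.
summary_statement("blogs", "blog", summary_b)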
#This function returns a table of the frequency of all words in a file, in descending order.
word_frequency <- function(many_lines, summary_info) {
  total_words <- summary_info[[2]]
  #Tokenize every line, then tabulate; unlist() flattens the list returned
  #by tokenize_words() so that table() counts individual words.
  df <- data.frame(sort(table(unlist(tokenize_words(many_lines))), decreasing = TRUE)) %>%
    `colnames<-`(c("Word", "Frequency"))
  top_twenty <- sum(df[1:20, "Frequency"])
  word_percent <- paste(round(top_twenty * 100 / total_words, 1), "%", sep = "")
  cat(paste(word_percent,
            "of the total words are from the top 20."))
  df <- df[1:20,] %>%
    mutate(Frequency = prettyNum(Frequency, big.mark = ","))
  return(df)
}
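Likewise, the percentage line and top-20 table for blogs would come from:

#Prints the top-20 percentage and returns the formatted top-20 table.
word_frequency(blogs, summary_b)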
#This function splits the data at limit_1 and draws two side-by-side histograms:
#the left half holds the vast majority of the data, the right half most of the
#right skew. Values beyond limit_2 are printed out instead of plotted.
histogram_helper <- function(data_label, df, limit_1, limit_2, bins_1, bins_2, wc, title_shift){
  #Plural.
  label_plural <- paste(data_label, "s", sep = "")
  #Main title.
  title <- paste("Distribution of", wc, "Count per", data_label)
  #The left half is where the vast majority of the data is located.
  g1 <- ggplot(data.frame(Item = df$Item[df$Item <= limit_1]), aes(x = Item))
  g1 <- g1 + geom_histogram(bins = bins_1, color = "white", fill = "grey31")
  #Titles and axes.
  g1 <- g1 + ggtitle("")
  g1 <- g1 + scale_x_continuous(name = paste(wc, "Count"), breaks = pretty_breaks(), labels = comma)
  g1 <- g1 + scale_y_continuous(name = paste("Number of", label_plural), labels = comma)
  #Modify labels and text.
  g1 <- g1 + theme(plot.title = element_text(size = 14, face = "bold"),
                   axis.text.x = element_text(size = 10),
                   axis.title.x = element_text(size = 12, face = "bold"),
                   axis.text.y = element_text(size = 10),
                   axis.title.y = element_text(size = 12, face = "bold"))
  #The right half contains most of the right skew.
  g2 <- ggplot(data.frame(Item = df$Item[df$Item > limit_1 & df$Item <= limit_2]), aes(x = Item))
  g2 <- g2 + geom_histogram(bins = bins_2, color = "white", fill = "grey31")
  #Titles and axes. The shifted title is centered over the combined plot.
  g2 <- g2 + ggtitle(title)
  g2 <- g2 + scale_x_continuous(name = paste(wc, "Count"),
                                breaks = c(limit_1, (2 * limit_1 + limit_2) / 3,
                                           (limit_1 + 2 * limit_2) / 3, limit_2),
                                labels = comma)
  g2 <- g2 + scale_y_continuous(name = "", labels = comma)
  #Modify labels and text.
  g2 <- g2 + theme(plot.title = element_text(hjust = title_shift, size = 14, face = "bold"),
                   axis.text.x = element_text(size = 10),
                   axis.title.x = element_text(size = 12, face = "bold"),
                   axis.text.y = element_text(size = 10),
                   axis.title.y = element_text(size = 12, face = "bold"))
  #Combine the two halves.
  grid.arrange(g1, g2, layout_matrix = matrix(c(1, 2), ncol = 2))
  #The remaining extreme right skew is printed out.
  cat(paste("Remaining", tolower(wc), "counts in", tolower(label_plural), "are: "))
  sort(df$Item[df$Item > limit_2])
}
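A hypothetical call for the blog word counts is shown below. limit_2 = 1000 matches the printed remaining counts earlier in the report, but limit_1, the bin counts, and the title shift are assumptions, not the values used to render the figures.

#Hypothetical usage for blog word counts; limit_1, bins, and shift are guesses.
df_words_b <- data.frame("Item" = info_b[[2]])
histogram_helper("Blog", df_words_b, 150, 1000, 30, 20, "Word", 0.5)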
df <- data.frame("Item" = info_t[[3]])
#Main Title.
title = paste("Distribution of Character Count per Tweet")
g <- ggplot(mapping = aes(x = df$Item))
g <- g + geom_histogram(bins = 35, color = "white", fill = "grey31")
#Titles and Axes.
g <- g + ggtitle(title)
g <- g + scale_x_continuous(name = paste("Character Count"), labels = comma)
g <- g + scale_y_continuous(name = paste("Number of Tweets"), labels = comma)
#Modify labels and text.
g <- g + theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.text.x = element_text(size = 10),
axis.title.x = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 10),
axis.title.y = element_text(size = 12, face = "bold"))
g