For this word cloud analysis, I studied Great Expectations by Charles Dickens. The novel charts the personal development of Pip, an orphan, exploring universal themes like guilt, persistence, and social advancement, as well as historical constructs like wealth, poverty, morality, good versus evil, and Victorian social structures.
This post provides a brief word-frequency and sentiment analysis of the novel using word clouds. The other works available from Project Gutenberg can be listed with View(gutenberg_works()).


Import packages

library(gutenbergr)
library(wordcloud)
library(dplyr)
library(tidytext)
library(reshape2)
library(tidyr)


Retrieve text from Project Gutenberg

# Preview the first ten Dickens entries in the Project Gutenberg metadata.
head(gutenberg_works(author == "Dickens, Charles"), 10)
## # A tibble: 10 x 8
##    gutenberg_id title    author gutenberg_autho… language gutenberg_books…
##           <int> <chr>    <chr>             <int> <chr>    <chr>           
##  1           46 A Chris… Dicke…               37 en       Christmas/Child…
##  2           98 A Tale … Dicke…               37 en       Historical Fict…
##  3          564 The Mys… Dicke…               37 en       Mystery Fiction
##  4          580 The Pic… Dicke…               37 en       Best Books Ever…
##  5          588 Master … Dicke…               37 en       <NA>            
##  6          644 The Hau… Dicke…               37 en       Christmas       
##  7          650 Picture… Dicke…               37 en       <NA>            
##  8          653 "The Ch… Dicke…               37 en       <NA>            
##  9          675 America… Dicke…               37 en       <NA>            
## 10          678 The Cri… Dicke…               37 en       Children's Lite…
## # ... with 2 more variables: rights <chr>, has_text <lgl>
# Look up the catalogue entry for the novel to find its gutenberg_id.
gutenberg_works(title == "Great Expectations")
## # A tibble: 1 x 8
##   gutenberg_id title   author   gutenberg_autho… language gutenberg_books…
##          <int> <chr>   <chr>               <int> <chr>    <chr>           
## 1         1400 Great … Dickens…               37 en       Best Books Ever…
## # ... with 2 more variables: rights <chr>, has_text <lgl>
# Download the full text by its gutenberg_id (1400), keeping the title as an
# extra column alongside the text.
great_expectations <- gutenberg_download(1400, meta_fields = "title")


Tokenize text

# Split the `text` column into one word per row (stored in a `word` column).
tokenize <- unnest_tokens(great_expectations, word, text)
head(tokenize, n = 6)
## # A tibble: 6 x 3
##   gutenberg_id title              word        
##          <int> <chr>              <chr>       
## 1         1400 Great Expectations great       
## 2         1400 Great Expectations expectations
## 3         1400 Great Expectations 1867        
## 4         1400 Great Expectations edition     
## 5         1400 Great Expectations by          
## 6         1400 Great Expectations charles


Remove stop words

# Drop every token that appears in tidytext's `stop_words` table, then count
# how often each remaining word occurs, most frequent first.
tokenize_stop <- dplyr::count(
  anti_join(tokenize, stop_words, by = "word"),
  title, word,
  sort = TRUE
)


Plot word cloud

# Fix the RNG seed so the word layout is reproducible between runs.
set.seed(90)
# Plot up to 150 of the most frequent non-stop words, all horizontal
# (rot.per = 0), in a single colour.
# NOTE(review): scale = c(2, 2) sets the largest and smallest word size to the
# same value, so frequency only decides which words are drawn, not how big
# they are. Widen the range (e.g. c(4, 0.5)) if size should track frequency.
wordcloud(
  words = tokenize_stop$word,
  freq = tokenize_stop$n,
  scale = c(2, 2),
  max.words = 150,
  rot.per = 0,
  colors = "#0072B2" # full argument name; `color` relied on partial matching
)

"image"

Sentiment word cloud

# Contrast the most frequent negative and positive words in the novel, using
# the Bing sentiment lexicon to label each token.
tokenize %>%
  # Name the join key explicitly so the join does not depend on (or announce)
  # dplyr's automatic choice of shared columns.
  dplyr::inner_join(get_sentiments("bing"), by = "word") %>%
  dplyr::count(word, sentiment, sort = TRUE) %>%
  # Reshape to a word x sentiment count matrix; words missing for a sentiment
  # get a count of 0.
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  # One colour per sentiment column, in the matrix's column order.
  comparison.cloud(
    colors = c("#D55E00", "#009E73"),
    scale = c(2, 2),
    rot.per = 0
  )

"image"