### Load standard packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2, etc.
library(magrittr) # For extra piping operators (e.g. %<>%)
In this session we will mainly rely on the tidytext package for tidy text-mining functionality, and later on spaCy, a high-level deep-learning based NLP library that will help us do complex stuff with not too much code. Let's raise the bar with some Fyodor Dostoevsky.
library(tidytext)
# We first need to get the book text. It can be conveniently retrieved via the gutenbergr library, which links R to Project Gutenberg
library(gutenbergr)
# check the id of crime and punishment
gutenberg_metadata %>%
filter(title == "Crime and Punishment")
text_raw <- gutenberg_download(2554)
text_raw %>% glimpse()
Rows: 22,061
Columns: 2
$ gutenberg_id <int> 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554,…
$ text <chr> "CRIME AND PUNISHMENT", "", "By Fyodor Dostoevsky", "", "", "", "Translated By Constance Garnett", "", "", "", "", "TRANSLATOR’S PREFACE", "", "A few words about Dosto…
# Let's take a look
text_raw %>% head(200)
text <- text_raw %>%
  select(-gutenberg_id) %>%
  slice(-c(1:102)) %>% # drop the front matter (title page, translator's preface)
  mutate(line_num = row_number(), # create new variable line_num
         part = cumsum(str_detect(text, regex("^PART [\\divxlc]",
                                              ignore_case = TRUE)))) %>% # create variable part: Crime and Punishment has 6 parts plus an epilogue
  group_by(part) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^CHAPTER [\\divxlc]",
                                                 ignore_case = TRUE)))) %>% # create new variable: chapter number within each part
  ungroup() %>%
  filter(text != "" & !str_detect(text, regex("^(PART|CHAPTER)"))) %>% # drop empty lines and the PART/CHAPTER headings
  mutate(index = 1:n()) %>%
  relocate(index, line_num, part, chapter, text)
text %>% glimpse()
Rows: 16,577
Columns: 5
$ index <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, …
$ line_num <int> 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 35, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59…
$ part <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ chapter <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ text <chr> "On an exceptionally hot evening early in July a young man came out of", "the garret in which he lodged in S. Place and walked slowly, as though", "in hesitation, towards …
Cool!
text_tidy <- text %>%
  unnest_tokens(word, text, token = 'words') %>% # tokenize: one word per row
  anti_join(stop_words, by = 'word') # remove common stop words
text_tidy %>% head(10)
library(SnowballC) # Includes stemming algos
text_tidy %>%
mutate(stem = wordStem(word)) %>%
head(10)
text_tidy %>%
mutate(stem = wordStem(word)) %>%
count(stem, sort = TRUE) %>%
head(50)
# top 10 words used in Crime and Punishment
text_tidy %>%
count(word, sort = TRUE) %>%
slice(1:10) %>%
ggplot(aes(x = fct_reorder(word, n), y = n, fill = word)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(title = "Crime and Punishment: Top 10 words used", x = NULL)
# People love wordclouds
library(wordcloud)
text_tidy %>%
count(word) %>%
with(wordcloud(word, n,
               max.words = 50,
               colors = "blue"))
While interesting, word frequency does not tell us much about the emotions and states of mind present in the novel.
For this reason, we will go ahead with a sentiment analysis of “Crime and Punishment”.
While it sounds very complex, sentiment analysis is usually done in a rather simple way.
There are already predefined sentiment lexica around, linking words with certain sentiments.
So we only have to join our word tokens with the corresponding sentiments.
The most popular lexica available are bing, afinn, and nrc.
# You might need to first install the 'textdata' package for some of the lexica
get_sentiments("bing") %>%
head(20)
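The other two lexica can be inspected the same way (both nrc and afinn are used further below; they are downloaded via the textdata package on first use):
get_sentiments("afinn") %>%
  head(10)
get_sentiments("nrc") %>%
  head(10)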
Let's calculate them all.
sentiment_bing <- text_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(chapter, index = index %/% 100, sentiment) %>% # index of 100 lines of text
mutate(lexicon = 'Bing')
sentiment_nrc <- text_tidy %>%
inner_join(get_sentiments("nrc")) %>%
count(chapter, index = index %/% 100, sentiment) %>% # index of 100 lines of text
mutate(lexicon = 'NRC')
sentiment_afinn <- text_tidy %>%
select(-line_num, -part) %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = index %/% 100) %>% # index of 100 lines of text
summarise(sentiment = sum(value, na.rm = TRUE)) %>%
mutate(lexicon = 'AFINN')
# Lets join them all together for plotting
sentiment_all <- sentiment_afinn %>%
bind_rows(sentiment_bing %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative) %>%
select(index, sentiment, lexicon) ) %>%
bind_rows(sentiment_nrc %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative) %>%
select(index, sentiment, lexicon) )
# Crime and Punishment: sentiment over the course of the novel, by lexicon
sentiment_all %>%
ggplot(aes(x = index, y = sentiment, fill = lexicon)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ lexicon) +
labs(title = "Sentiment Analysis: “Crime and Punishment",
subtitle = 'Using the Bing, NRC, AFINN lexicon')
text_tidy %>%
inner_join(get_sentiments("nrc")) %>%
count(word, sentiment, sort = TRUE) %>%
filter(sentiment %in% c("joy", "sadness")) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
as.data.frame() %>%
remove_rownames() %>%
column_to_rownames("word") %>%
comparison.cloud(colors = c("darkgreen", "grey75"),
max.words = 100,
title.size = 1.5)
# Crime and Punishment: NRC sentiment counts per chapter
sentiment_nrc %>%
  ggplot(aes(x = index, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ chapter) +
  labs(title = "Sentiment Analysis: Crime and Punishment",
       subtitle = 'By chapter: Using the NRC lexicon')
To build a co-word network, we first need to count how often pairs of words appear together in the same line of text. tidytext had an amazing function, pairwise_count(), for that. However, since it is of more general use, the developers outsourced it into the not-text-specific widyr package, which they also maintain.
library(widyr)
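As a quick, self-contained illustration (a made-up toy tibble, not the novel data): pairwise_count() counts how often two items show up within the same group.
toy <- tibble(group = c(1, 1, 1, 2, 2),
              word  = c("crime", "punishment", "guilt", "crime", "guilt"))
toy %>%
  pairwise_count(word, group, sort = TRUE)
# "crime" and "guilt" co-occur in both groups (n = 2), all other pairs only once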
el_words <- text_tidy %>%
pairwise_count(word, index, sort = TRUE) %>%
rename(from = item1, to = item2, weight = n)
el_words %>% head()
library(tidygraph)
library(ggraph)
g <- el_words %>%
filter(weight >= 9) %>%
as_tbl_graph(directed = FALSE) %>%
igraph::simplify() %>% as_tbl_graph()
set.seed(1337)
g %N>%
# filter(centrality_degree(weight = weight) > 100) %>%
ggraph(layout = "fr") +
geom_edge_link(aes(width = weight, edge_alpha = weight)) +
geom_node_point(aes(size = centrality_degree(weight = weight)), color = "plum4") +
geom_node_text(aes(label = name), repel = TRUE) +
theme_graph() +
theme(legend.position = 'none') +
labs(title = 'Co-Word Network Crime and Punishment')
So far, we have used the unnest_tokens function to tokenize by word, or sometimes by sentence, which is useful for the kinds of sentiment and frequency analyses we have been doing. We can also tokenize into consecutive sequences of words (n-grams) by passing the token = 'ngrams' argument and specifying with n how many subsequent words we want to consider.
text_tidy_ngrams <- text %>%
  unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
  na.omit()
text_tidy_ngrams %>% head()
text_tidy_ngrams %>%
count(bigram, sort = TRUE) %>%
head(100)
# Separate them
text_tidy_ngrams %<>%
separate(bigram, c("word1", "word2"), sep = " ")
text_tidy_ngrams %>% head()
# Get rid of stopwords
text_tidy_ngrams %<>%
anti_join(stop_words, by = c('word1' = 'word')) %>%
anti_join(stop_words, by = c('word2' = 'word'))
# And unite again
text_tidy_ngrams %<>%
unite(bigram, word1, word2, sep = " ")
# And finally count
text_tidy_ngrams %<>%
count(index, bigram)
text_tidy_ngrams %>%
count(bigram, wt = n, sort = TRUE) %>%
head(50)
Just for fun, a little gender analysis now…
# Define our pronouns
pronouns <- c("he", "she")
# Get our bigram where first word is a pronoun
gender_bigrams <- text %>%
unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
na.omit() %>%
count(bigram) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(word1 %in% pronouns) %>%
count(word1, word2, wt = n, sort = TRUE)
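A minimal sketch of how these counts could be plotted, using only the columns in the gender_bigrams tibble created above:
gender_bigrams %>%
  group_by(word1) %>%
  slice_max(n, n = 10) %>% # 10 most frequent words following each pronoun
  ungroup() %>%
  ggplot(aes(x = fct_reorder(word2, n), y = n, fill = word1)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ word1, scales = "free_y") +
  labs(title = "Words most often following 'he' vs. 'she'", x = NULL)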
spaCy itself is a Python library, but we can use it comfortably from R via the spacyr wrapper.
library(spacyr)
# spacy_install() # creates a new conda environment called spacy_condaenv, as long as some version of conda is installed
spacy_initialize(model = "en_core_web_sm")
text_example <- c(d1 = "spaCy is great at fast natural language processing. Everybody loves it!
In Denmark and elsewhere.",
d2 = "We can also use it in R via the great spacyR wrapper. Daniel Michels does that sometimes")
# process documents and obtain a data.table
text_parsed <- spacy_parse(text_example)
text_parsed
text_example %>% spacy_parse(dependency = TRUE, lemma = FALSE, pos = FALSE)
text_chapter <- text %>%
group_by(chapter) %>%
summarise(text = paste(text, collapse = ' ')) %>%
pivot_wider(names_from = chapter, values_from = text)
text_entities <- text_chapter %>% as.character() %>% spacy_parse(entity = TRUE)
text_entities %>% head(1000)
Cool or cool?
We can now again create a network, right? This time, a real character network!
el_persons <- text_entities %>%
entity_consolidate() %>%
filter(entity_type %>% str_detect('PERSON')) %>%
unite(chap_sent_id, doc_id, sentence_id, sep = '_') %>%
pairwise_count(token, chap_sent_id, sort = TRUE) %>%
rename(from = item1, to = item2, weight = n)
el_persons %>% head(50)
g <- el_persons %>%
as_tbl_graph(directed = FALSE) %>%
igraph::simplify() %>% as_tbl_graph()
set.seed(1337)
g %N>%
ggraph(layout = "fr") +
geom_edge_link(aes(width = weight, edge_alpha = weight)) +
geom_node_point(aes(size = centrality_degree(weight = weight)), color = "plum4") +
geom_node_text(aes(label = name), repel = TRUE) +
theme_graph() +
theme(legend.position = 'none') +
labs(title = 'Character Network Crime and Punishment')
DataCamp (note: most courses there use somewhat outdated ecosystems)
sessionInfo()
R version 4.0.3 (2020-10-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
Random number generation:
RNG: L'Ecuyer-CMRG
Normal: Inversion
Sample: Rejection
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] spacyr_1.2.1 ggraph_2.0.5 tidygraph_1.2.0 widyr_0.1.3 wordcloud_2.6 RColorBrewer_1.1-2 SnowballC_0.7.0 gutenbergr_0.2.0 tidytext_0.3.0
[10] magrittr_2.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.5 purrr_0.3.4 readr_1.4.0 tidyr_1.1.3 tibble_3.1.0 ggplot2_3.3.3
[19] tidyverse_1.3.0 knitr_1.31
loaded via a namespace (and not attached):
[1] viridis_0.5.1 httr_1.4.2 viridisLite_0.3.0 jsonlite_1.7.2 modelr_0.1.8 assertthat_0.2.1 triebeard_0.3.0 urltools_1.7.3 cellranger_1.1.0
[10] yaml_2.2.1 ggrepel_0.9.1 pillar_1.5.1 backports_1.2.1 lattice_0.20-41 reticulate_1.18 glue_1.4.2 textdata_0.4.1 digest_0.6.27
[19] polyclip_1.10-0 rvest_0.3.6 colorspace_2.0-0 Matrix_1.3-2 pkgconfig_2.0.3 broom_0.7.5 haven_2.3.1 scales_1.1.1 tweenr_1.0.1
[28] ggforce_0.3.3 generics_0.1.0 farver_2.1.0 ellipsis_0.3.1 withr_2.4.1 cli_2.3.1 crayon_1.4.1 readxl_1.3.1 tokenizers_0.2.1
[37] janeaustenr_0.1.5 fs_1.5.0 fansi_0.4.2 MASS_7.3-53.1 xml2_1.3.2 data.table_1.14.0 tools_4.0.3 hms_1.0.0 lifecycle_1.0.0
[46] munsell_0.5.0 reprex_1.0.0 compiler_4.0.3 rlang_0.4.10 debugme_1.1.0 grid_4.0.3 rstudioapi_0.13 rappdirs_0.3.3 igraph_1.2.6
[55] labeling_0.4.2 gtable_0.3.0 DBI_1.1.1 curl_4.3 graphlayouts_0.7.1 R6_2.5.0 gridExtra_2.3 lubridate_1.7.10 utf8_1.1.4
[64] stringi_1.5.3 Rcpp_1.0.6 vctrs_0.3.6 dbplyr_2.1.0 tidyselect_1.1.0 xfun_0.21