Code and Scripts (N-gram and KWIC)

The code developed for text analysis is presented below, complete with comments and annotations. Alternatively, the scripts can be downloaded as R files from the following links:

Visão Panorâmica (Bird’s-Eye View — N-gram)

Reinos Animal, Vegetal e Mineral (Kingdoms of Nature — N-gram + Keywords in Context)

The code version shared here in open access, intended for inspection and reuse, corresponds to that used to generate datasets-bundle-3 and its visualizations [release 2, February 2025].


SUMMARY

N-GRAM

RSTUDIO SETTINGS

DATA PREPROCESSING

BIRD’S EYE VIEW ANALYSIS

KINGDOMS OF NATURE ANALYSIS

KWIC – KEYWORDS IN CONTEXT

RSTUDIO SETTINGS

DATA PREPROCESSING

KINGDOMS OF NATURE ANALYSIS


N-GRAM

RSTUDIO SETTINGS

## List of required packages

### tm - general use
### tidyverse (package collection) - general use
### stopwords - stopwords removal
### qdap - stopwords removal
### textstem - lemmatization process
### quanteda - lemmatization process

## Installing required packages

### Only packages that are not installed yet are downloaded, so re-running the
### script does not reinstall the whole set on every execution.
required_packages <- c("tm",
                       "tidyverse",
                       "stopwords",
                       "qdap",
                       "textstem",
                       "quanteda")
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

## Loading installed packages

library(tm)
library(tidyverse)
library(stopwords)
library(qdap)
library(textstem)
library(quanteda)

DATA PREPROCESSING

## 1st - Importing Data

### folder that holds the corpus text files
arrcorpus <- "/Users/patriciareina/Desktop/FAT/BASE DE DADOS/Obra Completa TXT/TXT-livros/OP-all"

### listing the files found in that folder
arrfiles <- list.files(path = arrcorpus)
arrfiles # printed to check the file order

### full path of every file; later on: [[ ]] file, [ ] verse (strings)
corpuslist <- paste0(arrcorpus, "/", arrfiles)
corpuslist
typeof(corpuslist) # character

### reading every file, one vector of lines per file
corpus.list <- lapply(corpuslist, readLines)
corpus.list[[1]]
typeof(corpus.list) # list

## 2nd - Data cleaning

### collapsing the lines of each file into one string
corpus.list.line <- lapply(corpus.list, function(verses) paste(verses, collapse = " "))
corpus.list.line[[78]]
typeof(corpus.list.line) # list

### lower-casing the words
corpus.list.line.lower <- tolower(corpus.list.line)
corpus.list.line.lower[[78]]
typeof(corpus.list.line.lower) # character

### tokenizing on non-word characters, keeping one element per text/file
corpus.list.line.clean <- strsplit(corpus.list.line.lower, split = "\\W")
corpus.list.line.clean[[78]]
typeof(corpus.list.line.clean) # list
typeof(corpus.list.line.clean[[78]][985]) # character

## 3rd - Corpus 

### making a Simple Corpus: lists of vectors
### NOTE(review): corpus.list.line.clean is a list of token vectors, not a
### character vector. Presumably tm coerces each element to one deparsed string
### of the form c("a", "b", ...), which would explain why "c" is added to the
### stopwords below - confirm with the inspect() call.
corpus.as.list <- Corpus(VectorSource(as.vector(corpus.list.line.clean))) 
corpus.as.list 
typeof(arrcorpus) # character
typeof(corpus.as.list) # list
inspect(corpus.as.list[[2]]) 

## 4th - Removing stopwords 

### adapted list assignment (based on "stopwords-iso"), one stopword per line
allstops_iso_alt <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/stopwords/stopwords-pt_txt_alterado_pr_24-01-24.txt", what="character", sep="\n")
allstops_iso_alt

### Additional words to be removed
allstops_iso_alt_b <- c(allstops_iso_alt, "c", "dir")

### stopwords removal (the result is a new corpus; corpus.as.list is kept as is)
corpus.as.list_b  <- tm_map(corpus.as.list, removeWords, c(allstops_iso_alt_b))
inspect(corpus.as.list_b[[1]])
inspect(corpus.as.list_b[[48]])

## 5th - More Data Cleaning

### removing numbers
corpus.as.list_b  <- tm_map(corpus.as.list_b, content_transformer(removeNumbers))
inspect(corpus.as.list_b[[48]])

### removing punctuation
corpus.as.list_b <- tm_map(corpus.as.list_b, content_transformer(removePunctuation))
### note: single brackets here inspect a one-document sub-corpus, whereas the
### calls above use [[ ]] to inspect the document itself
inspect(corpus.as.list_b[48])

### removing white spaces
corpus.as.list_b <- tm_map(corpus.as.list_b, content_transformer(stripWhitespace))
typeof(corpus.as.list_b[48]) # list

## 6th - Lemmatization

### working copy: the stopword-free corpus is kept untouched in corpus.as.list_b
corpus.as.list.lemma_b <- corpus.as.list_b

### setting a list/dictionary to proceed lemmatization (based on "lemmatization-pt" from Global Glossary Project)
lemma_dic <- read.delim(file = "/Users/patriciareina/Desktop/FAT/LISTAS/lematização/lemmatization-pt_rev_06-06-24.txt", header = FALSE, stringsAsFactors = FALSE)
names(lemma_dic) <- c("lemma", "term")
head(lemma_dic, 1000) # head() does not pad with NA rows when the file is shorter
typeof(lemma_dic) # list

### fixing duplicate terms, keeping only first occurrences
lemma_dic_unique <- lemma_dic[!duplicated(lemma_dic$term), ]

### list of control for removed duplicated terms [external output]
### max.print is raised only while the duplicates are printed/captured and the
### previous value is restored afterwards, instead of being forced to 9999.
term_doc <- lemma_dic$term
old_options <- options(max.print = 99999)
term_doc[duplicated(term_doc)]
capture.output(term_doc[duplicated(term_doc)], file = "duplicados_lemma_06-06-24.txt")
options(old_options)

### reordering dictionary disposition for term-lemma instead of lemma-term
lemma_dic_unique_term_lemma <- select(lemma_dic_unique, term, lemma)
lemma_dic_unique_term_lemma[1:10, ]

### lemmatization: the content of every document is replaced by its lemmatized
### version; seq_len() keeps the loop from running when the corpus is empty
for (i in seq_len(length(corpus.as.list.lemma_b))) {
  corpus.as.list.lemma_b[[i]][[1]] <- lemmatize_strings(
    corpus.as.list.lemma_b[[i]][[1]],
    dictionary = lemma_dic_unique_term_lemma
  )
}

corpus.as.list_b # stopwords removed
corpus.as.list.lemma_b # lemmatized + stopwords removed

N-GRAM BIRD’S EYE VIEW ANALYSIS

# N-GRAM BIRD'S EYE VIEW ANALYSIS

## Installing required packages for N-Gram

### koRpus - token transformation; tokenizers - tokenization process;
### readtext - reads different text formats.
### dplyr and tidytext are loaded below as well, so they are part of the list;
### only packages that are not installed yet are downloaded.
ngram_packages <- c("koRpus",
                    "tokenizers",
                    "readtext",
                    "dplyr",
                    "tidytext")
ngram_missing <- setdiff(ngram_packages, rownames(installed.packages()))
if (length(ngram_missing) > 0) {
  install.packages(ngram_missing)
}

## Loading installed packages for N-Gram

library(koRpus)
library(tokenizers)
library(readtext)
library(dplyr)
library(tidytext)

## setting directory for the external outputs
### the CSV files written further down use relative names, so they land here
getwd()
setwd("/Users/patriciareina/Desktop/FAT/RAWGRAPHS")

## tokenization

### quanteda Corpus for later tokens (lemmatized, stopwords removed)
typeof(corpus.as.list.lemma_b) # list
corpus.as.list.lemma_b.toks <- corpus(corpus.as.list.lemma_b)
summary(corpus.as.list.lemma_b.toks) # each publication is a sentence

### tokens for each publication in total Corpus
toks_arr_total <- quanteda::tokens(corpus.as.list.lemma_b.toks)

### fixing the name of the book within corpus
### the list file holds one book name per line, in the same order as the documents
arr.book.names <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/nomes_obras_poeticas_arr.txt", what="character", sep="\n")
docnames(toks_arr_total) # names before the replacement
names.toks_arr_total <- toks_arr_total
docnames(names.toks_arr_total) <- arr.book.names

### list of token bi-grams in Total Corpus
### built from the renamed tokens so that the book names are carried along,
### as done in the Kingdoms of Nature analysis
toks_arr_bigram <- tokens_ngrams(names.toks_arr_total, n = 2)

## Working bi-grams

### splitting every bi-gram into its two words
toks_arr_bigram_stack <- stack(toks_arr_bigram)
toks_arr_bigram_stack[1, ] # the bi-gram column is named "values"
arr_bigrams_separated <- separate(
  toks_arr_bigram_stack,
  values,
  c("word1", "word2"),
  sep = "_"
) # one column per bi-gram word
arr_bigrams_separated[1, ] # columns are "word1", "word2", and "ind"

### the 10 most frequent bi-grams (general corpus, repetition)
#### bi-gram written back as a single expression
arr_bigrams_united <- unite(arr_bigrams_separated, bigram, word1, word2, sep = " ")
#### bi-gram frequency, from the most to the least frequent
bigrams_10_most_freq_corpus <- count(arr_bigrams_united, bigram, sort = TRUE)
first_10_bigrams_most_freq_corpus <- bigrams_10_most_freq_corpus[1:10, ]
#### the two words in separate columns again
first_10_bigrams_most_freq_corpus_sep <- separate(
  first_10_bigrams_most_freq_corpus,
  bigram,
  c("word1", "word2"),
  sep = " "
)

#### data preparation [external output for RAWGraphs]
colnames(first_10_bigrams_most_freq_corpus_sep) <- c("first term", "second term", "frequency") # new column names
write.csv(first_10_bigrams_most_freq_corpus_sep, file = "bigrams.relation.freq.general10.csv", row.names = FALSE)

### search for the 10 most frequent terms in bi-grams
### the list file holds one term per line
top.10.terms <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/top10_arr_total_corpus.txt", what="character", sep="\n")
top.10.terms.df <- data.frame(top.10.terms)
bigrams_filtered_top10_word1 <- arr_bigrams_separated %>%
  filter(word1 %in% top.10.terms.df$top.10.terms) # any top10 word as the first term within the bi-gram
bigrams_filtered_top10_word2 <- arr_bigrams_separated %>%
  filter(word2 %in% top.10.terms.df$top.10.terms) # any top10 word as the second term within the bi-gram
### A single filter with OR keeps every matching occurrence exactly once.
### Stacking the two tables above with rbind() listed twice every bi-gram whose
### two words are both top10 terms, which doubled its frequency.
bigrams_filtered_top10_word1plus2 <- arr_bigrams_separated %>%
  filter(word1 %in% top.10.terms.df$top.10.terms |
           word2 %in% top.10.terms.df$top.10.terms)
bigrams_filtered_top10_word1plus2_optimazed <- bigrams_filtered_top10_word1plus2 %>%
  filter(word1 != word2) # does not consider repeated terms within the bi-gram
bigrams_united_top10 <- bigrams_filtered_top10_word1plus2_optimazed %>%
  unite(bigram, word1, word2, sep = " ") # unifies bi-grams as a single expression in table

### top 10 bigrams that have top 10 terms correlated (top10, no repetition)
bigrams_freq <- bigrams_united_top10 %>%
  count(bigram, sort = TRUE) # frequency of bigrams
bigrams_freq_top10 <- bigrams_freq[1:10, ]
bigrams_freq_top10_sep <- bigrams_freq_top10 %>%
  separate(bigram, c("word1", "word2"), sep = " ") # each bi-gram word is distinguished

#### data preparation [external output for RAWGraphs]
names(bigrams_freq_top10_sep) <- c("first term", "second term", "frequency") # new column names
write.csv(bigrams_freq_top10_sep, file = "bigrams.relation.freq.top10.csv", row.names = FALSE)

### top 50 bigrams correlated (general corpus, no repetition)
bigrams_10_most_freq_corpus[1:50, ]
#### the two words of every counted bi-gram in separate columns
arr_bigrams__most_freq_separated <- separate(
  bigrams_10_most_freq_corpus,
  bigram,
  c("word1", "word2"),
  sep = " "
)
arr_bigrams__most_freq_separated[1:50, ]
#### bi-grams made of the same word twice are left out
arr_bigrams__most_freq_separated_no_rep <- filter(
  arr_bigrams__most_freq_separated,
  word1 != word2
)
arr_bigrams_50_most_freq_separated_no_rep <- arr_bigrams__most_freq_separated_no_rep[1:50, ]

#### data preparation [external output for RAWGraphs]
colnames(arr_bigrams_50_most_freq_separated_no_rep) <- c("first term", "second term", "frequency") # new column names
write.csv(arr_bigrams_50_most_freq_separated_no_rep, file = "bigrams.relation.top50.csv", row.names = FALSE)

N-GRAM KINGDOMS OF NATURE ANALYSIS

## Installing required packages for N-Gram

### koRpus - token transformation; tokenizers - tokenization process;
### readtext - reads different text formats.
### dplyr and tidytext are loaded below as well, so they are part of the list;
### only packages that are not installed yet are downloaded.
ngram_packages <- c("koRpus",
                    "tokenizers",
                    "readtext",
                    "dplyr",
                    "tidytext")
ngram_missing <- setdiff(ngram_packages, rownames(installed.packages()))
if (length(ngram_missing) > 0) {
  install.packages(ngram_missing)
}

## Loading installed packages for N-Gram

library(koRpus)
library(tokenizers)
library(readtext)
library(dplyr)
library(tidytext)

## setting directory for the external outputs
### the CSV files written further down use relative names, so they land here
getwd()
setwd("/Users/patriciareina/Desktop/FAT/RAWGRAPHS")

## tokenization

### quanteda Corpus for later tokens (lemmatized, stopwords removed)
typeof(corpus.as.list.lemma_b) # list
corpus.as.list.lemma_b.toks <- corpus(corpus.as.list.lemma_b)
summary(corpus.as.list.lemma_b.toks) # each publication is a sentence

### tokens for each publication in total Corpus
toks_arr_total <- quanteda::tokens(corpus.as.list.lemma_b.toks)

### fixing the name of the book within corpus
### the list file holds one book name per line
arr.book.names <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/nomes_obras_poeticas_arr.txt", what = "character", sep = "\n")
docnames(toks_arr_total)
#### renamed copy, stored and then printed for checking
names.toks_arr_total <- `docnames<-`(toks_arr_total, arr.book.names)
names.toks_arr_total

### list of token bi-grams in Total Corpus
toks_arr_bigram <- tokens_ngrams(names.toks_arr_total, n = 2)

## Working bi-grams

### splitting every bi-gram into its two words
toks_arr_bigram_stack <- stack(toks_arr_bigram)
toks_arr_bigram_stack[1, ] # the bi-gram column is named "values"
arr_bigrams_separated <- separate(
  toks_arr_bigram_stack,
  values,
  c("word1", "word2"),
  sep = "_"
) # one column per bi-gram word
arr_bigrams_separated[1, ] # columns are "word1", "word2", and "ind"

## Literal terms

### List
kingdoms <- c("vegetal", "animal", "mineral")
kingdoms.df <- data.frame(kingdoms)

### search for the bi-grams that contain the literal kingdoms terms (repetition, per book)
bigrams_literal_terms_corpus_word1 <- arr_bigrams_separated %>%
  filter(word1 %in% kingdoms.df$kingdoms) # any listed literal word as the first term within the bi-gram
bigrams_literal_terms_corpus_word1b <- bigrams_literal_terms_corpus_word1
bigrams_literal_terms_corpus_word1b$kingdoms <- bigrams_literal_terms_corpus_word1b$word1 # literal term names the kingdom for viz. optimization
bigrams_literal_terms_corpus_word2 <- arr_bigrams_separated %>%
  filter(word2 %in% kingdoms.df$kingdoms) # any listed literal word as the second term within the bi-gram
bigrams_literal_terms_corpus_word2b <- bigrams_literal_terms_corpus_word2
bigrams_literal_terms_corpus_word2b$kingdoms <- bigrams_literal_terms_corpus_word2b$word2 # literal term names the kingdom for viz. optimization
### merged results without the "kingdoms" column (input of the generic/specific term searches)
bigrams_literal_terms_corpus_word1plus2 <- rbind(bigrams_literal_terms_corpus_word1, bigrams_literal_terms_corpus_word2)
### merged results with the "kingdoms" column (input of the frequency table below)
### NOTE(review): a bi-gram made of two literal terms is present in both tables
### and therefore listed twice here - confirm this is intended.
bigrams_literal_terms_corpus_word1plus2b <- rbind(bigrams_literal_terms_corpus_word1b, bigrams_literal_terms_corpus_word2b)
bigrams_literal_terms_corpus_united <- bigrams_literal_terms_corpus_word1plus2b %>%
  unite(bigram, word1, word2, sep = " ") # unifies bi-grams as a single expression in table

### frequency of literal terms in bi-grams
### The table has the columns "bigram", "ind" (the book, created by stack())
### and "kingdoms"; counting on "kingdom"/"book" failed because no such
### columns exist.
bigrams_literal_terms_corpus_freq <- bigrams_literal_terms_corpus_united %>%
  count(kingdoms, ind, sort = TRUE) # frequency of literal terms bi-grams

#### data preparation [external output for RAW Graphs]
#### max.print is restored to its previous value instead of being forced to 9999
old_options <- options(max.print = 999999)
names(bigrams_literal_terms_corpus_freq) <- c("literal term", "book", "frequency") # new column names
write.csv(bigrams_literal_terms_corpus_freq, file = "bigrams.literal.per.book.per.kingdoms.abs.freq.csv", row.names = FALSE)
options(old_options)

## Generic Terms

### Search for generic terms within the literal terms bi-grams
### Each kingdom follows the same steps: read its term list (one term per line),
### keep the literal-term bi-grams whose first or second word is in the list,
### stack both results and tag them with the kingdom of the list.

#### animal
reino.animal.tipo.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /animais_tipo.txt", what = "character", sep = "\n")
reino.animal.tipo.lexico.df <- data.frame(reino.animal.tipo.lexico) # builds data frame
bigrams_animal_generic_terms_corpus_word1 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word1 %in% reino.animal.tipo.lexico.df$reino.animal.tipo.lexico
) # generic term as the first word
bigrams_animal_generic_terms_corpus_word2 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word2 %in% reino.animal.tipo.lexico.df$reino.animal.tipo.lexico
) # generic term as the second word
bigrams_animal_generic_terms_corpus_bigram <- rbind(
  bigrams_animal_generic_terms_corpus_word1,
  bigrams_animal_generic_terms_corpus_word2
) # both results in one table
bigrams_animal_generic_terms_corpus_bigram$kingdom <- "animal" # kingdom category of the list
bigrams_animal_generic_terms_corpus_bigram
#### vegetal
reino.vegetal.tipo.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /vegetais_tipo.txt", what = "character", sep = "\n")
reino.vegetal.tipo.lexico.df <- data.frame(reino.vegetal.tipo.lexico)
bigrams_vegetal_generic_terms_corpus_word1 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word1 %in% reino.vegetal.tipo.lexico.df$reino.vegetal.tipo.lexico
)
bigrams_vegetal_generic_terms_corpus_word2 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word2 %in% reino.vegetal.tipo.lexico.df$reino.vegetal.tipo.lexico
)
bigrams_vegetal_generic_terms_corpus_bigram <- rbind(
  bigrams_vegetal_generic_terms_corpus_word1,
  bigrams_vegetal_generic_terms_corpus_word2
)
bigrams_vegetal_generic_terms_corpus_bigram$kingdom <- "vegetal"
bigrams_vegetal_generic_terms_corpus_bigram
#### mineral
reino.mineral.tipo.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /minerais_tipo.txt", what = "character", sep = "\n")
reino.mineral.tipo.lexico.df <- data.frame(reino.mineral.tipo.lexico)
bigrams_mineral_generic_terms_corpus_word1 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word1 %in% reino.mineral.tipo.lexico.df$reino.mineral.tipo.lexico
)
bigrams_mineral_generic_terms_corpus_word2 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word2 %in% reino.mineral.tipo.lexico.df$reino.mineral.tipo.lexico
)
bigrams_mineral_generic_terms_corpus_bigram <- rbind(
  bigrams_mineral_generic_terms_corpus_word1,
  bigrams_mineral_generic_terms_corpus_word2
)
bigrams_mineral_generic_terms_corpus_bigram$kingdom <- "mineral"
bigrams_mineral_generic_terms_corpus_bigram

#### data preparation [external output for RAW Graphs]
#### The three kingdom tables are merged once and a single CSV is written.
#### Before, a first three-column version of the file was written and then
#### overwritten by a second pass that merged only the animal and mineral
#### tables, so the vegetal generic terms never reached the final file.
bigrams_all_generic_terms_corpus_bigram <- rbind(
  bigrams_animal_generic_terms_corpus_bigram,
  bigrams_vegetal_generic_terms_corpus_bigram,
  bigrams_mineral_generic_terms_corpus_bigram
) # merges data
bigrams_all_generic_terms_corpus_bigram_unite <- bigrams_all_generic_terms_corpus_bigram %>%
  unite(bigram, word1, word2, sep = " ") # unifies bi-gram as a single expression
bigrams_all_generic_terms_corpus_bigram_count <- bigrams_all_generic_terms_corpus_bigram_unite %>%
  count(bigram, kingdom, sort = TRUE) # most freq. to the least freq.
bigrams_all_generic_terms_corpus_bigram_count_literal <- bigrams_all_generic_terms_corpus_bigram_count %>%
  mutate(literal = case_when(str_detect(bigram, "vegetal") ~ "vegetal",
                             str_detect(bigram, "animal") ~ "animal",
                             str_detect(bigram, "mineral") ~ "mineral")) # kingdom of the literal term found within the bi-gram expression
names(bigrams_all_generic_terms_corpus_bigram_count_literal) <- c("bigram", "generic term kingdom", "frequency", "literal term kingdom") # new column names
bigrams_all_generic_terms_corpus_bigram_right_order <- select(bigrams_all_generic_terms_corpus_bigram_count_literal, bigram, frequency, `literal term kingdom`, `generic term kingdom`) # reorders data table columns
write.csv(bigrams_all_generic_terms_corpus_bigram_right_order, file = "generic.terms.in.literal.bigram.freq.csv", row.names = FALSE)

## Specific Terms

### Search for specific terms within the literal terms bi-grams
### Each kingdom reads its species list (one term per line) and keeps the
### literal-term bi-grams whose first or second word is in that list.

#### animal
reino.animal.especie.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /animais_especie.txt", what = "character", sep = "\n")
reino.animal.especie.lexico.df <- data.frame(reino.animal.especie.lexico)
bigrams_animal_specific_terms_corpus_word1 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word1 %in% reino.animal.especie.lexico.df$reino.animal.especie.lexico
) # specific term as the first word
bigrams_animal_specific_terms_corpus_word2 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word2 %in% reino.animal.especie.lexico.df$reino.animal.especie.lexico
) # specific term as the second word
bigrams_animal_specific_terms_corpus_bigram <- rbind(
  bigrams_animal_specific_terms_corpus_word1,
  bigrams_animal_specific_terms_corpus_word2
) # both results in one table
bigrams_animal_specific_terms_corpus_bigram$kingdom <- "animal" # kingdom category of the list
bigrams_animal_specific_terms_corpus_bigram
#### vegetal (no merged table is built for this kingdom)
reino.vegetal.especie.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /vegetais_especie.txt", what = "character", sep = "\n")
reino.vegetal.especie.lexico.df <- data.frame(reino.vegetal.especie.lexico)
bigrams_vegetal_specific_terms_corpus_word1 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word1 %in% reino.vegetal.especie.lexico.df$reino.vegetal.especie.lexico
) # zero occurrences
bigrams_vegetal_specific_terms_corpus_word2 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word2 %in% reino.vegetal.especie.lexico.df$reino.vegetal.especie.lexico
) # zero occurrences
#### mineral
reino.mineral.especie.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /minerais_especie.txt", what = "character", sep = "\n")
reino.mineral.especie.lexico.df <- data.frame(reino.mineral.especie.lexico)
bigrams_mineral_specific_terms_corpus_word1 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word1 %in% reino.mineral.especie.lexico.df$reino.mineral.especie.lexico
)
bigrams_mineral_specific_terms_corpus_word2 <- filter(
  bigrams_literal_terms_corpus_word1plus2,
  word2 %in% reino.mineral.especie.lexico.df$reino.mineral.especie.lexico
)
bigrams_mineral_specific_terms_corpus_bigram <- rbind(
  bigrams_mineral_specific_terms_corpus_word1,
  bigrams_mineral_specific_terms_corpus_word2
)
bigrams_mineral_specific_terms_corpus_bigram$kingdom <- "mineral"
bigrams_mineral_specific_terms_corpus_bigram

#### data preparation [external output for RAW Graphs]
#### only the animal and mineral tables are merged here
bigrams_all_specific_terms_corpus_bigram <- rbind(
  bigrams_animal_specific_terms_corpus_bigram,
  bigrams_mineral_specific_terms_corpus_bigram
)
#### bi-gram written as a single expression
bigrams_all_specific_terms_corpus_bigram_unite <- unite(
  bigrams_all_specific_terms_corpus_bigram,
  bigram, word1, word2,
  sep = " "
)
#### frequency per bi-gram and kingdom, from the most to the least frequent
bigrams_all_specific_terms_corpus_bigram_count <- count(
  bigrams_all_specific_terms_corpus_bigram_unite,
  bigram, kingdom,
  sort = TRUE
)
#### kingdom of the literal term found within the bi-gram expression
bigrams_all_specific_terms_corpus_bigram_count_literal <- mutate(
  bigrams_all_specific_terms_corpus_bigram_count,
  literal = case_when(
    str_detect(bigram, "vegetal") ~ "vegetal",
    str_detect(bigram, "animal") ~ "animal",
    str_detect(bigram, "mineral") ~ "mineral"
  )
)
colnames(bigrams_all_specific_terms_corpus_bigram_count_literal) <- c("bigram", "specific term kingdom", "frequency", "literal term kingdom") # new column names
#### columns put in the order used by the output file
bigrams_all_specific_terms_corpus_bigram_right_order <- select(
  bigrams_all_specific_terms_corpus_bigram_count_literal,
  bigram, frequency, `literal term kingdom`, `specific term kingdom`
)
write.csv(bigrams_all_specific_terms_corpus_bigram_right_order, file = "specific.terms.in.literal.bigram.freq.csv", row.names = FALSE)

KWIC – KEYWORDS IN CONTEXT

RSTUDIO SETTINGS

## List of required packages

### tm - general use
### tidyverse (package collection) - general use
### stopwords - stopwords removal
### qdap - stopwords removal
### textstem - lemmatization process
### quanteda - lemmatization process

## Installing required packages

### Only packages that are not installed yet are downloaded, so re-running the
### script does not reinstall the whole set on every execution.
required_packages <- c("tm",
                       "tidyverse",
                       "stopwords",
                       "qdap",
                       "textstem",
                       "quanteda")
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

## Loading installed packages

library(tm)
library(tidyverse)
library(stopwords)
library(qdap)
library(textstem)
library(quanteda)

DATA PREPROCESSING

## 1st - Importing Data

### folder that holds the corpus text files
arrcorpus <- "/Users/patriciareina/Desktop/FAT/BASE DE DADOS/Obra Completa TXT/TXT-livros/OP-all"

### listing the files found in that folder
arrfiles <- list.files(path = arrcorpus)
arrfiles # printed to check the file order

### full path of every file; later on: [[ ]] file, [ ] verse (strings)
corpuslist <- paste0(arrcorpus, "/", arrfiles)
corpuslist
typeof(corpuslist) # character

### reading every file, one vector of lines per file
corpus.list <- lapply(corpuslist, readLines)
corpus.list[[1]]
typeof(corpus.list) # list

## 2nd - Data cleaning

### collapsing the lines of each file into one string
corpus.list.line <- lapply(corpus.list, function(verses) paste(verses, collapse = " "))
corpus.list.line[[78]]
typeof(corpus.list.line) # list

### lower-casing the words
corpus.list.line.lower <- tolower(corpus.list.line)
corpus.list.line.lower[[78]]
typeof(corpus.list.line.lower) # character

### tokenizing on non-word characters, keeping one element per text/file
corpus.list.line.clean <- strsplit(corpus.list.line.lower, split = "\\W")
corpus.list.line.clean[[78]]
typeof(corpus.list.line.clean) # list
typeof(corpus.list.line.clean[[78]][985]) # character

## 3rd - Corpus 

### making a Simple Corpus: lists of vectors
### NOTE(review): corpus.list.line.clean is a list of token vectors, not a
### character vector. Presumably tm coerces each element to one deparsed string
### of the form c("a", "b", ...); since no stopwords are removed in this
### section, a leading "c" may then survive as a token - confirm with inspect().
corpus.as.list <- Corpus(VectorSource(as.vector(corpus.list.line.clean))) 
corpus.as.list 
typeof(arrcorpus) # character
typeof(corpus.as.list) # list
inspect(corpus.as.list[[2]]) 
### working copy for the KWIC analysis; corpus.as.list itself is left untouched
corpus.as.list.integral <- corpus.as.list

## 5th - More Data Cleaning
### (no 4th step is applied here: the stopwords are kept)

### removing numbers
corpus.as.list.integral <- tm_map(corpus.as.list.integral, content_transformer(removeNumbers))
inspect(corpus.as.list.integral[[48]])

### removing punctuation
corpus.as.list.integral <- tm_map(corpus.as.list.integral, content_transformer(removePunctuation))
### note: single brackets here inspect a one-document sub-corpus, whereas the
### call above uses [[ ]] to inspect the document itself
inspect(corpus.as.list.integral[48])

### removing white spaces
corpus.as.list.integral <- tm_map(corpus.as.list.integral, content_transformer(stripWhitespace))
typeof(corpus.as.list.integral[48]) # list

corpus.as.list.integral # corpus with stopwords kept and without lemmatization

KWIC – KEYWORDS IN CONTEXT KINGDOMS OF NATURE ANALYSIS

## tokenization

### quanteda Corpus for kwic (non lemmatized, stopwords not removed)
corpus.as.list.toks <- corpus(corpus.as.list.integral)
summary(corpus.as.list.toks)

### tokens
toks_arr_integral <- quanteda::tokens(corpus.as.list.toks)

### fixing the name of the book within corpus
### the list file holds one book name per line
arr.book.names <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/nomes_obras_poeticas_arr.txt", what="character", sep="\n")
docnames(toks_arr_integral) # names before the replacement
names.toks_arr_integral <- toks_arr_integral
docnames(names.toks_arr_integral) <- arr.book.names

### Used lists

#### Literal terms (stemming process included)
kingdoms.literal.terms <- c("vegetal", "animal", "mineral")
kingdoms.literal.terms.df <- data.frame(kingdoms.literal.terms)
kingdoms.literal.terms.stem <- stemDocument(kingdoms.literal.terms)
#### a trailing "*" turns every stem into a glob pattern
kingdoms.literal.terms.stem <- paste0(kingdoms.literal.terms.stem, "*")
kingdoms.literal.terms.stem # "veget*" "anim*"  "miner*"

### All the corpus, by book
### max.print is raised only while the KWIC table is captured; the previous
### value is restored afterwards instead of being forced to 9999.
old_options <- options(max.print = 999999)

#### Literal terms: every match with a window of 5 tokens on each side
toks_arr_literal_terms_kwic <- kwic(names.toks_arr_integral, pattern = phrase(kingdoms.literal.terms.stem), valuetype = "glob", window = 5)

#### Data output (relative file name: written to the current working directory)
capture.output(toks_arr_literal_terms_kwic, file = "toks_arr_literal_terms_kwic.txt")
options(old_options)