O código desenvolvido e aplicado às análises de texto pode ser consultado abaixo, sob forma comentada. Em alternativa, os scripts podem também ser diretamente descarregados em ficheiro R a partir dos seguintes links:
↓ Visão Panorâmica (Bird’s-Eye View — N-gram)
↓ Reinos Animal, Vegetal e Mineral (Kingdoms of Nature — N-gram + Keywords in Context)
A versão do código que aqui se publica em acesso aberto, para inspeção e reuso, corresponde àquela utilizada para gerar o datasets-bundle-3 e respetivas visualizações [release 2, fevereiro 2025].
SUMMARY
N-GRAM
KWIC – KEYWORDS IN CONTEXT
N-GRAM
RSTUDIO SETTINGS
## Required packages
### tm - general use
### tidyverse (package collection) - general use
### stopwords - stopwords removal
### qdap - stopwords removal
### textstem - lemmatization process
### quanteda - lemmatization process
required_pkgs <- c(
  "tm",
  "tidyverse",
  "stopwords",
  "qdap",
  "textstem",
  "quanteda"
)
## Installing required packages
install.packages(required_pkgs)
## Loading installed packages (same order as the vector above)
invisible(lapply(required_pkgs, library, character.only = TRUE))
DATA PREPROCESSING
## 1st - Importing Data
### directory assignment
arrcorpus <- "/Users/patriciareina/Desktop/FAT/BASE DE DADOS/Obra Completa TXT/TXT-livros/OP-all"
### verifying the files (list.files() returns them alphabetically sorted)
arrfiles <- list.files(path = arrcorpus)
arrfiles # files correctly ordered
### building corpus arrangement: [[ ]] file, [ ] verse (strings)
### file.path() is the idiomatic path join, equivalent to paste(..., sep = "")
corpuslist <- file.path(arrcorpus, arrfiles)
corpuslist
typeof(corpuslist) # character
### one readLines() result per file: a list with one character vector per book
corpus.list <- lapply(corpuslist, FUN = readLines)
corpus.list[[1]]
typeof(corpus.list) # list
## 2nd - Data cleaning
### converting each file (vector of verses) into one single string
corpus.list.line <- lapply(corpus.list, FUN=paste, collapse=" ")
corpus.list.line [[78]]
typeof(corpus.list.line) #list
### de-capitalizing the words
### NOTE: tolower() coerces the list into a character vector (one string per file),
### hence the type change verified below
corpus.list.line.lower <- tolower(corpus.list.line)
corpus.list.line.lower [[78]]
typeof(corpus.list.line.lower) #character
### tokenizing with text/file separation
### "\\W" splits at every non-word character; runs of separators produce empty
### string tokens in the result
corpus.list.line.clean <- strsplit(corpus.list.line.lower, "\\W")
corpus.list.line.clean [[78]]
typeof(corpus.list.line.clean) #list
typeof(corpus.list.line.clean[[78]][985]) #character
## 3rd - Corpus
### making a Simple Corpus: lists of vectors
### as.vector() on a plain list leaves it unchanged; VectorSource() then treats
### each list element (one token vector per file) as one document of the Corpus
corpus.as.list <- Corpus(VectorSource(as.vector(corpus.list.line.clean)))
corpus.as.list
typeof(arrcorpus) #character
typeof(corpus.as.list) #list
inspect(corpus.as.list[[2]])
## 4th - Removing stopwords
### adapted list assignment (based on "stopwords-iso")
allstops_iso_alt <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/stopwords/stopwords-pt_txt_alterado_pr_24-01-24.txt", what="character", sep="\n")
allstops_iso_alt
### Additional words to be removed
allstops_iso_alt_b <- c(allstops_iso_alt, "c", "dir")
### stopwords removal
### (allstops_iso_alt_b is already a character vector; the extra c() wrapper
### in the original was redundant)
corpus.as.list_b <- tm_map(corpus.as.list, removeWords, allstops_iso_alt_b)
inspect(corpus.as.list_b[[1]])
inspect(corpus.as.list_b[[48]])
## 5th - More Data Cleaning
### removing numbers
corpus.as.list_b <- tm_map(corpus.as.list_b, content_transformer(removeNumbers))
inspect(corpus.as.list_b[[48]])
### removing punctuation
corpus.as.list_b <- tm_map(corpus.as.list_b, content_transformer(removePunctuation))
### NOTE(review): single brackets inspect a 1-document sub-corpus here, while the
### check above used [[48]] for the document content itself — confirm intended
inspect(corpus.as.list_b[48])
### removing white spaces
corpus.as.list_b <- tm_map(corpus.as.list_b, content_transformer(stripWhitespace))
typeof(corpus.as.list_b[48]) #list
## 6th - Lemmatization
corpus.as.list.lemma_b <- corpus.as.list_b
### dictionary used for lemmatization (based on "lemmatization-pt" from the
### Global Glossary Project); file has no header, columns are lemma then term
lemma_dic <- read.delim(file = "/Users/patriciareina/Desktop/FAT/LISTAS/lematização/lemmatization-pt_rev_06-06-24.txt", header = FALSE, stringsAsFactors = FALSE)
names(lemma_dic) <- c("lemma", "term")
lemma_dic[1:1000,]
typeof(lemma_dic) #list
### fixing duplicate terms, keeping only first occurrences
lemma_dic_unique <- lemma_dic[!duplicated(lemma_dic$term),]
### list of control for removed duplicated terms [external output]
### (plain assignment suffices; the original wrapped it in a redundant print())
term_doc <- lemma_dic$term
term_doc[duplicated(term_doc)]
options(max.print=99999)
capture.output(term_doc[duplicated(term_doc)], file = "duplicados_lemma_06-06-24.txt")
options(max.print=9999)
### reordering dictionary disposition for term-lemma instead of lemma-term
lemma_dic_unique_term_lemma <- select(lemma_dic_unique, term, lemma)
lemma_dic_unique_term_lemma[1:10,]
### lemmatization of each document's content
### seq_along() is safe for an empty corpus, unlike 1:length()
for (i in seq_along(corpus.as.list.lemma_b)) {
  corpus.as.list.lemma_b[[i]][[1]] <- lemmatize_strings(corpus.as.list.lemma_b[[i]][[1]], dictionary = lemma_dic_unique_term_lemma)
}
corpus.as.list_b #stopwords removed
corpus.as.list.lemma_b #lemmatized + stopwords removed
N-GRAM BIRD’S EYE VIEW ANALYSIS
# N-GRAM BIRD'S EYE VIEW ANALYSIS
## Installing required packages for N-Gram
### koRpus - token transformation; tokenizers - tokenization process;
### readtext - reads different text formats
ngram_pkgs <- c("koRpus", "tokenizers", "readtext")
install.packages(ngram_pkgs)
## Loading installed packages for N-Gram
library(koRpus)
library(tokenizers)
library(readtext)
library(dplyr)
library(tidytext)
## setting directory for the external outputs
getwd()
setwd("/Users/patriciareina/Desktop/FAT/RAWGRAPHS")
## tokenization
### quanteda Corpus for later tokens (lemmatized, stopwords removed)
typeof(corpus.as.list.lemma_b) #list
corpus.as.list.lemma_b.toks <- corpus(corpus.as.list.lemma_b)
summary(corpus.as.list.lemma_b.toks) # each publication is a sentence
### tokens for each publication in total Corpus
toks_arr_total <- quanteda::tokens(corpus.as.list.lemma_b.toks)
### fixing the name of the book within corpus
arr.book.names <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/nomes_obras_poeticas_arr.txt", what="character", sep="\n")
docnames(toks_arr_total) # default names before renaming
### the original also called `docnames<-`(...) bare, which discards its result;
### only the assignment below has an effect
names.toks_arr_total <- `docnames<-`(toks_arr_total, arr.book.names)
### list of token bi-grams in Total Corpus — built from the RENAMED tokens so
### document names carry the book titles (the original used the un-renamed
### object here, inconsistently with the Kingdoms script below)
toks_arr_bigram <- tokens_ngrams(names.toks_arr_total, n = 2)
## Working bi-grams
### separating bi-grams in 2 words
toks_arr_bigram_stack <- stack(toks_arr_bigram)
toks_arr_bigram_stack [1,] # column names are "values" (bi-gram) and "ind" (document)
arr_bigrams_separated <- toks_arr_bigram_stack %>%
separate(values, c("word1", "word2"), sep = "_") # each bi-gram word is distinguished
arr_bigrams_separated[1,] # columns are "word1", "word2", and "ind"
### search for the 10 most frequent bigrams (general corpus, repetition)
arr_bigrams_united <- arr_bigrams_separated %>%
unite(bigram, word1, word2, sep = " ") # unifies bi-grams as a single expression in table
### NOTE: despite its name, this table holds the frequencies of ALL bigrams,
### sorted descending; only the next step keeps the top 10 (the full table is
### reused for the top-50 output further below)
bigrams_10_most_freq_corpus <- arr_bigrams_united %>%
count(bigram, sort = TRUE) # frequency of bigrams: the most freq. to the least freq.
first_10_bigrams_most_freq_corpus <- bigrams_10_most_freq_corpus[1:10,]
first_10_bigrams_most_freq_corpus_sep <- first_10_bigrams_most_freq_corpus %>%
separate(bigram, c("word1", "word2"), sep = " ") # each bigram word is distinguished
#### data preparation [external output for RAWGraphs]
names(first_10_bigrams_most_freq_corpus_sep) <- c("first term","second term","frequency") # new column names
write.csv(first_10_bigrams_most_freq_corpus_sep, file = "bigrams.relation.freq.general10.csv", row.names = FALSE)
### search for the 10 most frequent terms in bi-grams
top.10.terms <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/top10_arr_total_corpus.txt", what="character", sep="\n")
top10_df <- data.frame(top.10.terms)
### occurrences of any top-10 word as the first term within a bi-gram
top10_as_word1 <- arr_bigrams_separated %>%
  filter(word1 %in% top10_df$top.10.terms)
### occurrences of any top-10 word as the second term within a bi-gram
top10_as_word2 <- arr_bigrams_separated %>%
  filter(word2 %in% top10_df$top.10.terms)
### merge both match sets, then drop bi-grams repeating the same term twice
top10_matches <- rbind(top10_as_word1, top10_as_word2) %>%
  filter(word1 != word2)
### unify each bi-gram back into a single "word1 word2" expression
bigrams_united_top10 <- top10_matches %>%
  unite(bigram, word1, word2, sep = " ")
### top 10 bigrams that have top 10 terms correlated (top10, no repetition)
bigrams_freq <- bigrams_united_top10 %>%
  count(bigram, sort = TRUE) # bi-gram frequencies, most to least frequent
bigrams_freq_top10 <- bigrams_freq[1:10,]
bigrams_freq_top10_sep <- bigrams_freq_top10 %>%
  separate(bigram, c("word1", "word2"), sep = " ") # each bi-gram word is distinguished
#### data preparation [external output for RAWGraphs]
names(bigrams_freq_top10_sep) <- c("first term","second term","frequency") # new column names
write.csv(bigrams_freq_top10_sep, file = "bigrams.relation.freq.top10.csv", row.names = FALSE)
### top 50 bigrams correlated (general corpus, no repetition)
bigrams_10_most_freq_corpus [1:50,]
top50_separated <- bigrams_10_most_freq_corpus %>%
  separate(bigram, c("word1", "word2"), sep = " ") # each bigram word is distinguished
top50_separated[1:50,]
### drop bi-grams whose two words are identical before taking the top 50
top50_no_repetition <- top50_separated %>%
  filter(word1 != word2)
top50_final <- top50_no_repetition[1:50,]
#### data preparation [external output for RAWGraphs]
names(top50_final) <- c("first term","second term","frequency") # new column names
write.csv(top50_final, file = "bigrams.relation.top50.csv", row.names = FALSE)
N-GRAM KINGDOMS OF NATURE ANALYSIS
## Installing required packages for N-Gram
### koRpus - token transformation; tokenizers - tokenization process;
### readtext - reads different text formats
kingdoms_ngram_pkgs <- c("koRpus", "tokenizers", "readtext")
install.packages(kingdoms_ngram_pkgs)
## Loading installed packages for N-Gram
library(koRpus)
library(tokenizers)
library(readtext)
library(dplyr)
library(tidytext)
## setting directory for the external outputs
getwd()
setwd("/Users/patriciareina/Desktop/FAT/RAWGRAPHS")
## tokenization
### quanteda Corpus for later tokens (lemmatized, stopwords removed)
typeof(corpus.as.list.lemma_b) #list
corpus.as.list.lemma_b.toks <- corpus(corpus.as.list.lemma_b)
summary(corpus.as.list.lemma_b.toks) # each publication is a sentence
### tokens for each publication in total Corpus
toks_arr_total <- quanteda::tokens(corpus.as.list.lemma_b.toks)
### fixing the name of the book within corpus
arr.book.names <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/nomes_obras_poeticas_arr.txt", what="character", sep="\n")
docnames(toks_arr_total) # default names before renaming
### the original also called `docnames<-`(...) bare, which discards its result;
### only the assignment below has an effect
names.toks_arr_total <- `docnames<-`(toks_arr_total, arr.book.names)
### list of token bi-grams in Total Corpus (renamed tokens, so "ind" carries book titles)
toks_arr_bigram <- tokens_ngrams(names.toks_arr_total, n = 2)
## Working bi-grams
### separating bi-grams in 2 words
toks_arr_bigram_stack <- stack(toks_arr_bigram)
toks_arr_bigram_stack [1,] # column names are "values" (bi-gram) and "ind" (book)
arr_bigrams_separated <- toks_arr_bigram_stack %>%
separate(values, c("word1", "word2"), sep = "_") # each bi-gram word is distinguished
arr_bigrams_separated[1,] # columns are "word1", "word2", and "ind"
## Literal terms
### List of the three literal kingdom terms
kingdoms <- c("vegetal", "animal", "mineral")
kingdoms.df <- data.frame(kingdoms)
### search for the bi-grams that contain the literal kingdoms terms (repetition, per book)
### occurrences of any literal term as the first word within a bi-gram
bigrams_literal_terms_corpus_word1 <- arr_bigrams_separated %>%
filter(word1 %in% kingdoms.df$kingdoms)
bigrams_literal_terms_corpus_word1b <- bigrams_literal_terms_corpus_word1
bigrams_literal_terms_corpus_word1b$kingdoms <- bigrams_literal_terms_corpus_word1b$word1 # literal term names the kingdom for viz. optimization
### occurrences of any literal term as the second word within a bi-gram
bigrams_literal_terms_corpus_word2 <- arr_bigrams_separated %>%
filter(word2 %in% kingdoms.df$kingdoms)
bigrams_literal_terms_corpus_word2b <- bigrams_literal_terms_corpus_word2
bigrams_literal_terms_corpus_word2b$kingdoms <- bigrams_literal_terms_corpus_word2b$word2 # literal term names the kingdom for viz. optimization
bigrams_literal_terms_corpus_word1plus2 <- rbind(bigrams_literal_terms_corpus_word1, bigrams_literal_terms_corpus_word2) # merges the results (not used here; reused below)
bigrams_literal_terms_corpus_word1plus2b <- rbind(bigrams_literal_terms_corpus_word1b, bigrams_literal_terms_corpus_word2b) # merges the results (used)
bigrams_literal_terms_corpus_united <- bigrams_literal_terms_corpus_word1plus2b %>%
unite(bigram, word1, word2, sep = " ") # unifies bi-grams as a single expression in table
### frequency of literal terms in bi-grams
### FIX: the united table's columns are "kingdoms" (assigned above) and "ind"
### (the book, from stack()); the original count(kingdom, book, ...) referenced
### columns that do not exist and errored at run time
bigrams_literal_terms_corpus_freq <- bigrams_literal_terms_corpus_united %>%
count(kingdoms, ind, sort = TRUE) # frequency of literal-term bi-grams per kingdom and book
#### data preparation [external output for RAW Graphs]
options(max.print=999999)
names(bigrams_literal_terms_corpus_freq) <- c("literal term","book", "frequency") # new column names
write.csv(bigrams_literal_terms_corpus_freq, file = "bigrams.literal.per.book.per.kingdoms.abs.freq.csv", row.names = FALSE)
options(max.print=9999)
## Generic Terms
### Search for generic terms within the literal terms bi-grams
#### animal
reino.animal.tipo.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /animais_tipo.txt", what="character", sep="\n")
reino.animal.tipo.lexico.df <- data.frame(reino.animal.tipo.lexico) # builds data frame
### generic terms appearing as the first word within a bi-gram
bigrams_animal_generic_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word1 %in% reino.animal.tipo.lexico.df$reino.animal.tipo.lexico)
### generic terms appearing as the second word within a bi-gram
bigrams_animal_generic_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word2 %in% reino.animal.tipo.lexico.df$reino.animal.tipo.lexico)
bigrams_animal_generic_terms_corpus_bigram <- rbind(bigrams_animal_generic_terms_corpus_word1,bigrams_animal_generic_terms_corpus_word2) # merges the results
### a scalar recycles over all rows; rep() around it was redundant
bigrams_animal_generic_terms_corpus_bigram$kingdom <- "animal" # inserts kingdom category in results
bigrams_animal_generic_terms_corpus_bigram
#### vegetal
reino.vegetal.tipo.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /vegetais_tipo.txt", what="character", sep="\n")
reino.vegetal.tipo.lexico.df <- data.frame(reino.vegetal.tipo.lexico)
### generic terms appearing as the first word within a bi-gram
bigrams_vegetal_generic_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word1 %in% reino.vegetal.tipo.lexico.df$reino.vegetal.tipo.lexico)
### generic terms appearing as the second word within a bi-gram
bigrams_vegetal_generic_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word2 %in% reino.vegetal.tipo.lexico.df$reino.vegetal.tipo.lexico)
bigrams_vegetal_generic_terms_corpus_bigram <- rbind(bigrams_vegetal_generic_terms_corpus_word1,bigrams_vegetal_generic_terms_corpus_word2)
### a scalar recycles over all rows; rep() around it was redundant
bigrams_vegetal_generic_terms_corpus_bigram$kingdom <- "vegetal"
bigrams_vegetal_generic_terms_corpus_bigram
#### mineral
reino.mineral.tipo.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /minerais_tipo.txt", what="character", sep="\n")
reino.mineral.tipo.lexico.df <- data.frame(reino.mineral.tipo.lexico)
### generic terms appearing as the first word within a bi-gram
bigrams_mineral_generic_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word1 %in% reino.mineral.tipo.lexico.df$reino.mineral.tipo.lexico)
### generic terms appearing as the second word within a bi-gram
bigrams_mineral_generic_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word2 %in% reino.mineral.tipo.lexico.df$reino.mineral.tipo.lexico)
bigrams_mineral_generic_terms_corpus_bigram <- rbind(bigrams_mineral_generic_terms_corpus_word1,bigrams_mineral_generic_terms_corpus_word2)
### a scalar recycles over all rows; rep() around it was redundant
bigrams_mineral_generic_terms_corpus_bigram$kingdom <- "mineral"
bigrams_mineral_generic_terms_corpus_bigram
#### data preparation [external output for RAW Graphs]
bigrams_all_generic_terms_corpus_bigram <- rbind (bigrams_animal_generic_terms_corpus_bigram,bigrams_vegetal_generic_terms_corpus_bigram,bigrams_mineral_generic_terms_corpus_bigram) # merges data
bigrams_all_generic_terms_corpus_bigram_unite <- bigrams_all_generic_terms_corpus_bigram %>%
unite(bigrams, word1, word2, sep = " ") # unifies bi-gram as a single expression
bigrams_all_generic_terms_corpus_bigram_count <- bigrams_all_generic_terms_corpus_bigram_unite %>%
count(bigrams, kingdom, sort = TRUE) # most freq. to the least freq.
names(bigrams_all_generic_terms_corpus_bigram_count) <- c("bigram","generic term kingdom", "frequency") # new column names
write.csv(bigrams_all_generic_terms_corpus_bigram_count, file = "generic.terms.in.literal.bigram.freq.csv", row.names = FALSE)
### NOTE(review): the block below rebuilds the same objects WITHOUT the vegetal
### results (animal + mineral only) and overwrites the CSV just written above
### under the SAME filename — confirm the first write is intentionally discarded
bigrams_all_generic_terms_corpus_bigram <- rbind (bigrams_animal_generic_terms_corpus_bigram,bigrams_mineral_generic_terms_corpus_bigram)
bigrams_all_generic_terms_corpus_bigram_unite <- bigrams_all_generic_terms_corpus_bigram %>%
unite(bigram, word1, word2, sep = " ")
bigrams_all_generic_terms_corpus_bigram_count <- bigrams_all_generic_terms_corpus_bigram_unite %>%
count(bigram, kingdom, sort = TRUE)
### literal kingdom detected by substring match on the united bi-gram expression
### NOTE(review): str_detect matches substrings, so longer words containing
### "animal"/"mineral"/"vegetal" would also match — confirm the corpus makes this safe
bigrams_all_generic_terms_corpus_bigram_count_literal <- bigrams_all_generic_terms_corpus_bigram_count %>%
mutate(literal = case_when(str_detect(bigram, 'vegetal')~ "vegetal",
str_detect(bigram, 'animal')~ "animal",
str_detect (bigram, 'mineral') ~ "mineral"))
names(bigrams_all_generic_terms_corpus_bigram_count_literal) <- c("bigram","generic term kingdom", "frequency", "literal term kingdom") # new column names
bigrams_all_generic_terms_corpus_bigram_right_order <- select(bigrams_all_generic_terms_corpus_bigram_count_literal, bigram, frequency,`literal term kingdom`, `generic term kingdom`) # reorders data table columns
write.csv(bigrams_all_generic_terms_corpus_bigram_right_order, file = "generic.terms.in.literal.bigram.freq.csv", row.names = FALSE)
## Specific Terms
### Search for specific terms within the literal terms bi-grams
#### animal
reino.animal.especie.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /animais_especie.txt", what="character", sep="\n")
reino.animal.especie.lexico.df <- data.frame(reino.animal.especie.lexico)
### specific terms appearing as the first word within a bi-gram
bigrams_animal_specific_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word1 %in% reino.animal.especie.lexico.df$reino.animal.especie.lexico)
### specific terms appearing as the second word within a bi-gram
bigrams_animal_specific_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word2 %in% reino.animal.especie.lexico.df$reino.animal.especie.lexico)
bigrams_animal_specific_terms_corpus_bigram <- rbind(bigrams_animal_specific_terms_corpus_word1,bigrams_animal_specific_terms_corpus_word2)
### a scalar recycles over all rows; rep() around it was redundant
bigrams_animal_specific_terms_corpus_bigram$kingdom <- "animal"
bigrams_animal_specific_terms_corpus_bigram
#### vegetal
reino.vegetal.especie.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /vegetais_especie.txt", what="character", sep="\n")
reino.vegetal.especie.lexico.df <- data.frame(reino.vegetal.especie.lexico)
### both filters return zero rows, so no *_bigram object is built for vegetal
### and it is deliberately left out of the merged output further below
bigrams_vegetal_specific_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word1 %in% reino.vegetal.especie.lexico.df$reino.vegetal.especie.lexico) #zero occurrences
bigrams_vegetal_specific_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word2 %in% reino.vegetal.especie.lexico.df$reino.vegetal.especie.lexico) #zero occurrences
#### mineral
reino.mineral.especie.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /minerais_especie.txt", what="character", sep="\n")
reino.mineral.especie.lexico.df <- data.frame(reino.mineral.especie.lexico)
### specific terms appearing as the first word within a bi-gram
bigrams_mineral_specific_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word1 %in% reino.mineral.especie.lexico.df$reino.mineral.especie.lexico)
### specific terms appearing as the second word within a bi-gram
bigrams_mineral_specific_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>%
filter(word2 %in% reino.mineral.especie.lexico.df$reino.mineral.especie.lexico)
bigrams_mineral_specific_terms_corpus_bigram <- rbind(bigrams_mineral_specific_terms_corpus_word1,bigrams_mineral_specific_terms_corpus_word2)
### a scalar recycles over all rows; rep() around it was redundant
bigrams_mineral_specific_terms_corpus_bigram$kingdom <- "mineral"
bigrams_mineral_specific_terms_corpus_bigram
#### data preparation [external output for RAW Graphs]
### vegetal is excluded here: its filters above returned zero occurrences,
### so only animal and mineral result tables exist to merge
bigrams_all_specific_terms_corpus_bigram <- rbind (bigrams_animal_specific_terms_corpus_bigram,bigrams_mineral_specific_terms_corpus_bigram)
bigrams_all_specific_terms_corpus_bigram_unite <- bigrams_all_specific_terms_corpus_bigram %>%
unite(bigram, word1, word2, sep = " ")
bigrams_all_specific_terms_corpus_bigram_count <- bigrams_all_specific_terms_corpus_bigram_unite %>%
count(bigram, kingdom, sort = TRUE)
### NOTE(review): str_detect matches substrings, so longer words containing a
### kingdom term would also match — confirm the corpus makes this safe
bigrams_all_specific_terms_corpus_bigram_count_literal <- bigrams_all_specific_terms_corpus_bigram_count %>%
mutate(literal = case_when(str_detect(bigram, 'vegetal')~ "vegetal",
str_detect(bigram, 'animal')~ "animal",
str_detect (bigram, 'mineral') ~ "mineral")) # detects the kingdoms by identifying the literal term within the bi-gram expression
names(bigrams_all_specific_terms_corpus_bigram_count_literal) <- c("bigram","specific term kingdom", "frequency","literal term kingdom") # new column names
bigrams_all_specific_terms_corpus_bigram_right_order <- select(bigrams_all_specific_terms_corpus_bigram_count_literal, bigram, frequency,`literal term kingdom`, `specific term kingdom`) # reorders data table columns
write.csv(bigrams_all_specific_terms_corpus_bigram_right_order, file = "specific.terms.in.literal.bigram.freq.csv", row.names = FALSE)
KWIC – KEYWORDS IN CONTEXT
RSTUDIO SETTINGS
## Required packages (same stack as the N-gram scripts)
### tm - general use
### tidyverse (package collection) - general use
### stopwords - stopwords removal
### qdap - stopwords removal
### textstem - lemmatization process
### quanteda - lemmatization process
kwic_pkgs <- c("tm", "tidyverse", "stopwords", "qdap", "textstem", "quanteda")
## Installing required packages
install.packages(kwic_pkgs)
## Loading installed packages (same order as the vector above)
for (pkg in kwic_pkgs) {
  library(pkg, character.only = TRUE)
}
DATA PREPROCESSING
## FIX: this whole preprocessing script had been collapsed onto a single line
## starting with "##", which made ALL of its code part of a comment in R
## (corpus.as.list.integral, required by the KWIC section, was never created).
## Restored as a proper multi-line script.
## 1st - Importing Data
### directory assignment
arrcorpus <- "/Users/patriciareina/Desktop/FAT/BASE DE DADOS/Obra Completa TXT/TXT-livros/OP-all"
### verifying the files
arrfiles <- list.files(path=arrcorpus)
arrfiles #files correctly ordered
### building corpus arrangement: [[ ]] file, [ ] verse (strings)
corpuslist <- paste(arrcorpus,"/", arrfiles, sep="")
corpuslist
typeof(corpuslist) #character
corpus.list <- lapply(corpuslist, FUN=readLines)
corpus.list [[1]]
typeof(corpus.list) #list
## 2nd - Data cleaning
### converting into one string
corpus.list.line <- lapply(corpus.list, FUN=paste, collapse=" ")
corpus.list.line [[78]]
typeof(corpus.list.line) #list
### de-capitalizing the words
corpus.list.line.lower <- tolower(corpus.list.line)
corpus.list.line.lower [[78]]
typeof(corpus.list.line.lower) #character
### tokenizing with text/file separation
corpus.list.line.clean <- strsplit(corpus.list.line.lower, "\\W")
corpus.list.line.clean [[78]]
typeof(corpus.list.line.clean) #list
typeof(corpus.list.line.clean[[78]][985]) #character
## 3rd - Corpus
### making a Simple Corpus: lists of vectors
corpus.as.list <- Corpus(VectorSource(as.vector(corpus.list.line.clean)))
corpus.as.list
typeof(arrcorpus) #character
typeof(corpus.as.list) #list
inspect(corpus.as.list[[2]])
corpus.as.list.integral <- corpus.as.list
## 5th - More Data Cleaning
### removing numbers
corpus.as.list.integral <- tm_map(corpus.as.list.integral, content_transformer(removeNumbers))
inspect(corpus.as.list.integral[[48]])
### removing punctuation
corpus.as.list.integral <- tm_map(corpus.as.list.integral, content_transformer(removePunctuation))
inspect(corpus.as.list.integral[48])
### removing white spaces
corpus.as.list.integral <- tm_map(corpus.as.list.integral, content_transformer(stripWhitespace))
typeof(corpus.as.list.integral[48]) #list
corpus.as.list.integral #corpus without removing stopwords as well as lemmatizing
KWIC – KEYWORDS IN CONTEXT KINGDOMS OF NATURE ANALYSIS
## tokenization
### quanteda Corpus for kwic (non lemmatized, stopwords not removed)
corpus.as.list.toks <- corpus(corpus.as.list.integral)
summary(corpus.as.list.toks)
### tokens
toks_arr_integral <- quanteda::tokens(corpus.as.list.toks)
### fixing the name of the book within corpus
arr.book.names <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/nomes_obras_poeticas_arr.txt", what="character", sep="\n")
docnames(toks_arr_integral) # default names before renaming
### the original also called `docnames<-`(...) bare, which discards its result;
### only the assignment below has an effect
names.toks_arr_integral <- `docnames<-`(toks_arr_integral,arr.book.names)
### Used lists
#### Literal terms (stemming process included)
kingdoms.literal.terms <- c("vegetal", "animal", "mineral")
kingdoms.literal.terms.df <- data.frame(kingdoms.literal.terms) # NOTE(review): apparently unused below — kept for interactive inspection
### stem each literal term, then append "*" to build glob patterns for kwic()
kingdoms.literal.terms.stem <- stemDocument(kingdoms.literal.terms)
kingdoms.literal.terms.stem <- paste0(kingdoms.literal.terms.stem, "*") # clearer than gsub('$','*',...), same result
kingdoms.literal.terms.stem #"veget*" "anim*" "miner*"
### All the corpus, by book
options(max.print=999999)
#### Literal terms: keywords in context, 5-token window, glob matching on stems
toks_arr_literal_terms_kwic <- kwic(names.toks_arr_integral, pattern = phrase(kingdoms.literal.terms.stem), valuetype = "glob", window = 5)
#### Data output
capture.output(toks_arr_literal_terms_kwic, file="toks_arr_literal_terms_kwic.txt")
options(max.print=9999)