The code developed and applied to the text analyses can be consulted below, in commented form. Alternatively, the scripts can also be downloaded directly as R files from the following links:
↓ Bird's-Eye View (N-gram)
↓ Kingdoms of Nature: Animal, Vegetable, and Mineral (N-gram + Keywords in Context)
The version of the code published here in open access, for inspection and reuse, is the one used to generate datasets-bundle-3 and its visualizations [release 2, February 2025].
SUMMARY
N-GRAM
KWIC – KEYWORDS IN CONTEXT
N-GRAM
RSTUDIO SETTINGS
## List of required packages
### tm - general use
### tidyverse (package collection) - general use
### stopwords - stopwords removal
### qdap - stopwords removal
### textstem - lemmatization process
### quanteda - lemmatization process

## Installing required packages
install.packages(c("tm", "tidyverse", "stopwords", "qdap", "textstem", "quanteda"))

## Loading installed packages
library(tm)
library(tidyverse)
library(stopwords)
library(qdap)
library(textstem)
library(quanteda)
DATA PREPROCESSING
## 1st - Importing Data
### directory assignment
arrcorpus <- "/Users/patriciareina/Desktop/FAT/BASE DE DADOS/Obra Completa TXT/TXT-livros/OP-all"
### verifying the files
arrfiles <- list.files(path = arrcorpus)
arrfiles # files correctly ordered
### building corpus arrangement: [[ ]] file, [ ] verse (strings)
corpuslist <- paste(arrcorpus, "/", arrfiles, sep = "")
corpuslist
typeof(corpuslist) # character
corpus.list <- lapply(corpuslist, FUN = readLines)
corpus.list[[1]]
typeof(corpus.list) # list

## 2nd - Data cleaning
### converting each file into one string
corpus.list.line <- lapply(corpus.list, FUN = paste, collapse = " ")
corpus.list.line[[78]]
typeof(corpus.list.line) # list
### lowercasing the words
corpus.list.line.lower <- tolower(corpus.list.line)
corpus.list.line.lower[[78]]
typeof(corpus.list.line.lower) # character
### tokenizing with text/file separation
corpus.list.line.clean <- strsplit(corpus.list.line.lower, "\\W")
corpus.list.line.clean[[78]]
typeof(corpus.list.line.clean) # list
typeof(corpus.list.line.clean[[78]][985]) # character

## 3rd - Corpus
### making a Simple Corpus: lists of vectors
corpus.as.list <- Corpus(VectorSource(as.vector(corpus.list.line.clean)))
corpus.as.list
typeof(arrcorpus) # character
typeof(corpus.as.list) # list
inspect(corpus.as.list[[2]])

## 4th - Removing stopwords
### adapted list assignment (based on "stopwords-iso")
allstops_iso_alt <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/stopwords/stopwords-pt_txt_alterado_pr_24-01-24.txt", what = "character", sep = "\n")
allstops_iso_alt
### additional words to be removed
allstops_iso_alt_b <- c(allstops_iso_alt, "c", "dir")
### stopwords removal
corpus.as.list_b <- tm_map(corpus.as.list, removeWords, c(allstops_iso_alt_b))
inspect(corpus.as.list_b[[1]])
inspect(corpus.as.list_b[[48]])

## 5th - More Data Cleaning
### removing numbers
corpus.as.list_b <- tm_map(corpus.as.list_b, content_transformer(removeNumbers))
inspect(corpus.as.list_b[[48]])
### removing punctuation
corpus.as.list_b <- tm_map(corpus.as.list_b, content_transformer(removePunctuation))
inspect(corpus.as.list_b[[48]])
### removing white spaces
corpus.as.list_b <- tm_map(corpus.as.list_b, content_transformer(stripWhitespace))
typeof(corpus.as.list_b[48]) # list

## 6th - Lemmatization
corpus.as.list.lemma_b <- corpus.as.list_b
### setting a list/dictionary for lemmatization (based on "lemmatization-pt" from the Global Glossary Project)
lemma_dic <- read.delim(file = "/Users/patriciareina/Desktop/FAT/LISTAS/lematização/lemmatization-pt_rev_06-06-24.txt", header = FALSE, stringsAsFactors = FALSE)
names(lemma_dic) <- c("lemma", "term")
lemma_dic[1:1000, ]
typeof(lemma_dic) # list
### fixing duplicate terms, keeping only first occurrences
lemma_dic_unique <- lemma_dic[!duplicated(lemma_dic$term), ]
### control list of the removed duplicate terms [external output]
term_doc <- print(lemma_dic$term)
term_doc[duplicated(term_doc)]
options(max.print = 99999)
capture.output(term_doc[duplicated(term_doc)], file = "duplicados_lemma_06-06-24.txt")
options(max.print = 9999)
### reordering the dictionary as term-lemma instead of lemma-term
lemma_dic_unique_term_lemma <- select(lemma_dic_unique, term, lemma)
lemma_dic_unique_term_lemma[1:10, ]
### lemmatization
for (i in 1:length(corpus.as.list.lemma_b)) {
  corpus.as.list.lemma_b[[i]][[1]] <- lemmatize_strings(corpus.as.list.lemma_b[[i]][[1]], dictionary = lemma_dic_unique_term_lemma)
}
corpus.as.list_b # stopwords removed
corpus.as.list.lemma_b # lemmatized + stopwords removed
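Before running the lemmatization loop over the whole corpus, it can help to confirm that the term-lemma dictionary behaves as expected on a toy input. A minimal sanity check, not part of the original pipeline; the sample words are hypothetical and the actual output depends on what the lemmatization-pt dictionary contains:

## [illustrative sketch, not part of the original pipeline]
exemplo <- "flores cantavam pedras" # toy string; hypothetical dictionary entries
lemmatize_strings(exemplo, dictionary = lemma_dic_unique_term_lemma)
## if the dictionary holds term-lemma pairs for these words, the expected
## shape of the output is a single string such as "flor cantar pedra"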
N-GRAM BIRD’S EYE VIEW ANALYSIS
# N-GRAM BIRD'S EYE VIEW ANALYSIS
## Installing required packages for N-Gram
install.packages(c("koRpus", "tokenizers", "readtext"))
### koRpus - token transformation; tokenizers - tokenization process; readtext - reads different text formats
## Loading installed packages for N-Gram
library(koRpus)
library(tokenizers)
library(readtext)
library(dplyr)
library(tidytext)

## setting directory for the external outputs
getwd()
setwd("/Users/patriciareina/Desktop/FAT/RAWGRAPHS")

## tokenization
### quanteda Corpus for later tokens (lemmatized, stopwords removed)
typeof(corpus.as.list.lemma_b) # list
corpus.as.list.lemma_b.toks <- corpus(corpus.as.list.lemma_b)
summary(corpus.as.list.lemma_b.toks) # each publication is a sentence
### tokens for each publication in the total Corpus
toks_arr_total <- quanteda::tokens(corpus.as.list.lemma_b.toks)
### fixing the name of each book within the corpus
arr.book.names <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/nomes_obras_poeticas_arr.txt", what = "character", sep = "\n")
docnames(toks_arr_total)
names.toks_arr_total <- `docnames<-`(toks_arr_total, arr.book.names)
### list of token bi-grams in the total Corpus (the renamed tokens object is used so the book names are kept)
toks_arr_bigram <- tokens_ngrams(names.toks_arr_total, n = 2)

## Working bi-grams
### separating bi-grams into 2 words
toks_arr_bigram_stack <- stack(toks_arr_bigram)
toks_arr_bigram_stack[1, ] # column name is "values" for bi-grams
arr_bigrams_separated <- toks_arr_bigram_stack %>% separate(values, c("word1", "word2"), sep = "_") # each bi-gram word is distinguished
arr_bigrams_separated[1, ] # column names are "word1", "word2", and "ind"

### search for the 10 most frequent bi-grams (general corpus, repetition)
arr_bigrams_united <- arr_bigrams_separated %>% unite(bigram, word1, word2, sep = " ") # unifies bi-grams as a single expression in the table
bigrams_10_most_freq_corpus <- arr_bigrams_united %>% count(bigram, sort = TRUE) # frequency of bi-grams: from the most freq. to the least freq.
first_10_bigrams_most_freq_corpus <- bigrams_10_most_freq_corpus[1:10, ]
first_10_bigrams_most_freq_corpus_sep <- first_10_bigrams_most_freq_corpus %>% separate(bigram, c("word1", "word2"), sep = " ") # each bi-gram word is distinguished
#### data preparation [external output for RAWGraphs]
names(first_10_bigrams_most_freq_corpus_sep) <- c("first term", "second term", "frequency") # new column names
write.csv(first_10_bigrams_most_freq_corpus_sep, file = "bigrams.relation.freq.general10.csv", row.names = FALSE)

### search for the 10 most frequent terms in bi-grams
top.10.terms <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/top10_arr_total_corpus.txt", what = "character", sep = "\n")
top.10.terms.df <- data.frame(top.10.terms)
bigrams_filtered_top10_word1 <- arr_bigrams_separated %>% filter(word1 %in% top.10.terms.df$top.10.terms) # occurrences of any top-10 word as the first term within the bi-gram
bigrams_filtered_top10_word2 <- arr_bigrams_separated %>% filter(word2 %in% top.10.terms.df$top.10.terms) # occurrences of any top-10 word as the second term within the bi-gram
bigrams_filtered_top10_word1plus2 <- rbind(bigrams_filtered_top10_word1, bigrams_filtered_top10_word2) # merges the results
bigrams_filtered_top10_word1plus2_optimazed <- bigrams_filtered_top10_word1plus2 %>% filter(!word1 == word2) # does not consider repeated terms within the bi-gram
bigrams_united_top10 <- bigrams_filtered_top10_word1plus2_optimazed %>% unite(bigram, word1, word2, sep = " ") # unifies bi-grams as a single expression in the table

### top 10 bi-grams that have top-10 terms correlated (top 10, no repetition)
bigrams_freq <- bigrams_united_top10 %>% count(bigram, sort = TRUE) # frequency of bi-grams
bigrams_freq_top10 <- bigrams_freq[1:10, ]
bigrams_freq_top10_sep <- bigrams_freq_top10 %>% separate(bigram, c("word1", "word2"), sep = " ") # each bi-gram word is distinguished
#### data preparation [external output for RAWGraphs]
names(bigrams_freq_top10_sep) <- c("first term", "second term", "frequency") # new column names
write.csv(bigrams_freq_top10_sep, file = "bigrams.relation.freq.top10.csv", row.names = FALSE)

### top 50 bi-grams correlated (general corpus, no repetition)
bigrams_10_most_freq_corpus[1:50, ]
arr_bigrams__most_freq_separated <- bigrams_10_most_freq_corpus %>% separate(bigram, c("word1", "word2"), sep = " ") # each bi-gram word is distinguished
arr_bigrams__most_freq_separated[1:50, ]
arr_bigrams__most_freq_separated_no_rep <- arr_bigrams__most_freq_separated %>% filter(!word1 == word2) # does not consider repeated terms within the bi-gram
arr_bigrams_50_most_freq_separated_no_rep <- arr_bigrams__most_freq_separated_no_rep[1:50, ]
#### data preparation [external output for RAWGraphs]
names(arr_bigrams_50_most_freq_separated_no_rep) <- c("first term", "second term", "frequency") # new column names
write.csv(arr_bigrams_50_most_freq_separated_no_rep, file = "bigrams.relation.top50.csv", row.names = FALSE)
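The CSV outputs above are prepared for RAWGraphs, but the same frequency tables can be previewed directly in R before exporting. A minimal sketch, assuming ggplot2 is available (it is attached with the tidyverse) and using the bigrams_freq_top10 object built above; this plot is not part of the original workflow:

## [illustrative sketch, not part of the original pipeline]
ggplot(bigrams_freq_top10, aes(x = reorder(bigram, n), y = n)) +
  geom_col() +
  coord_flip() + # horizontal bars: most frequent bi-gram on top
  labs(x = "bi-gram", y = "frequency")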
N-GRAM KINGDOMS OF NATURE ANALYSIS
## Installing required packages for N-Gram
install.packages(c("koRpus", "tokenizers", "readtext"))
### koRpus - token transformation; tokenizers - tokenization process; readtext - reads different text formats
## Loading installed packages for N-Gram
library(koRpus)
library(tokenizers)
library(readtext)
library(dplyr)
library(tidytext)

## setting directory for the external outputs
getwd()
setwd("/Users/patriciareina/Desktop/FAT/RAWGRAPHS")

## tokenization
### quanteda Corpus for later tokens (lemmatized, stopwords removed)
typeof(corpus.as.list.lemma_b) # list
corpus.as.list.lemma_b.toks <- corpus(corpus.as.list.lemma_b)
summary(corpus.as.list.lemma_b.toks) # each publication is a sentence
### tokens for each publication in the total Corpus
toks_arr_total <- quanteda::tokens(corpus.as.list.lemma_b.toks)
### fixing the name of each book within the corpus
arr.book.names <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/nomes_obras_poeticas_arr.txt", what = "character", sep = "\n")
docnames(toks_arr_total)
names.toks_arr_total <- `docnames<-`(toks_arr_total, arr.book.names)
### list of token bi-grams in the total Corpus
toks_arr_bigram <- tokens_ngrams(names.toks_arr_total, n = 2)

## Working bi-grams
### separating bi-grams into 2 words
toks_arr_bigram_stack <- stack(toks_arr_bigram)
toks_arr_bigram_stack[1, ] # column name is "values" for bi-grams
arr_bigrams_separated <- toks_arr_bigram_stack %>% separate(values, c("word1", "word2"), sep = "_") # each bi-gram word is distinguished
arr_bigrams_separated[1, ] # column names are "word1", "word2", and "ind"

## Literal terms
### list
kingdoms <- c("vegetal", "animal", "mineral")
kingdoms.df <- data.frame(kingdoms)
### search for the bi-grams that contain the literal kingdom terms (repetition, per book)
bigrams_literal_terms_corpus_word1 <- arr_bigrams_separated %>% filter(word1 %in% kingdoms.df$kingdoms) # occurrences of any listed literal word as the first term within the bi-gram
bigrams_literal_terms_corpus_word1b <- bigrams_literal_terms_corpus_word1
bigrams_literal_terms_corpus_word1b$kingdoms = bigrams_literal_terms_corpus_word1b$word1 # literal term names the kingdom, for viz. optimization
bigrams_literal_terms_corpus_word2 <- arr_bigrams_separated %>% filter(word2 %in% kingdoms.df$kingdoms) # occurrences of any listed literal word as the second term within the bi-gram
bigrams_literal_terms_corpus_word2b <- bigrams_literal_terms_corpus_word2
bigrams_literal_terms_corpus_word2b$kingdoms = bigrams_literal_terms_corpus_word2b$word2 # literal term names the kingdom, for viz. optimization
bigrams_literal_terms_corpus_word1plus2 <- rbind(bigrams_literal_terms_corpus_word1, bigrams_literal_terms_corpus_word2) # merges the results (not used)
bigrams_literal_terms_corpus_word1plus2b <- rbind(bigrams_literal_terms_corpus_word1b, bigrams_literal_terms_corpus_word2b) # merges the results (used)
bigrams_literal_terms_corpus_united <- bigrams_literal_terms_corpus_word1plus2b %>% unite(bigram, word1, word2, sep = " ") # unifies bi-grams as a single expression in the table

### frequency of literal terms in bi-grams
bigrams_literal_terms_corpus_freq <- bigrams_literal_terms_corpus_united %>% count(kingdoms, ind, sort = TRUE) # frequency of literal-term bi-grams per kingdom ("kingdoms") and per book ("ind")
#### data preparation [external output for RAWGraphs]
options(max.print = 999999)
names(bigrams_literal_terms_corpus_freq) <- c("literal term", "book", "frequency") # new column names
write.csv(bigrams_literal_terms_corpus_freq, file = "bigrams.literal.per.book.per.kingdoms.abs.freq.csv", row.names = FALSE)
options(max.print = 9999)

## Generic Terms
### search for generic terms within the literal-term bi-grams
#### animal
reino.animal.tipo.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /animais_tipo.txt", what = "character", sep = "\n")
reino.animal.tipo.lexico.df <- data.frame(reino.animal.tipo.lexico) # builds data frame
bigrams_animal_generic_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word1 %in% reino.animal.tipo.lexico.df$reino.animal.tipo.lexico) # finds generic terms as the first term within the bi-gram
bigrams_animal_generic_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word2 %in% reino.animal.tipo.lexico.df$reino.animal.tipo.lexico) # finds generic terms as the second term within the bi-gram
bigrams_animal_generic_terms_corpus_bigram <- rbind(bigrams_animal_generic_terms_corpus_word1, bigrams_animal_generic_terms_corpus_word2) # merges the results
bigrams_animal_generic_terms_corpus_bigram$kingdom <- rep(c("animal")) # inserts the kingdom category in the results
bigrams_animal_generic_terms_corpus_bigram
#### vegetal
reino.vegetal.tipo.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /vegetais_tipo.txt", what = "character", sep = "\n")
reino.vegetal.tipo.lexico.df <- data.frame(reino.vegetal.tipo.lexico)
bigrams_vegetal_generic_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word1 %in% reino.vegetal.tipo.lexico.df$reino.vegetal.tipo.lexico)
bigrams_vegetal_generic_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word2 %in% reino.vegetal.tipo.lexico.df$reino.vegetal.tipo.lexico)
bigrams_vegetal_generic_terms_corpus_bigram <- rbind(bigrams_vegetal_generic_terms_corpus_word1, bigrams_vegetal_generic_terms_corpus_word2)
bigrams_vegetal_generic_terms_corpus_bigram$kingdom <- rep(c("vegetal"))
bigrams_vegetal_generic_terms_corpus_bigram
#### mineral
reino.mineral.tipo.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /minerais_tipo.txt", what = "character", sep = "\n")
reino.mineral.tipo.lexico.df <- data.frame(reino.mineral.tipo.lexico)
bigrams_mineral_generic_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word1 %in% reino.mineral.tipo.lexico.df$reino.mineral.tipo.lexico)
bigrams_mineral_generic_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word2 %in% reino.mineral.tipo.lexico.df$reino.mineral.tipo.lexico)
bigrams_mineral_generic_terms_corpus_bigram <- rbind(bigrams_mineral_generic_terms_corpus_word1, bigrams_mineral_generic_terms_corpus_word2)
bigrams_mineral_generic_terms_corpus_bigram$kingdom <- rep(c("mineral"))
bigrams_mineral_generic_terms_corpus_bigram
#### data preparation [external output for RAWGraphs]
bigrams_all_generic_terms_corpus_bigram <- rbind(bigrams_animal_generic_terms_corpus_bigram, bigrams_vegetal_generic_terms_corpus_bigram, bigrams_mineral_generic_terms_corpus_bigram) # merges the data
bigrams_all_generic_terms_corpus_bigram_unite <- bigrams_all_generic_terms_corpus_bigram %>% unite(bigrams, word1, word2, sep = " ") # unifies each bi-gram as a single expression
bigrams_all_generic_terms_corpus_bigram_count <- bigrams_all_generic_terms_corpus_bigram_unite %>% count(bigrams, kingdom, sort = TRUE) # from the most freq. to the least freq.
names(bigrams_all_generic_terms_corpus_bigram_count) <- c("bigram", "generic term kingdom", "frequency") # new column names
write.csv(bigrams_all_generic_terms_corpus_bigram_count, file = "generic.terms.in.literal.bigram.freq.csv", row.names = FALSE)
#### second pass (animal + mineral only); this version overwrites the CSV written just above
bigrams_all_generic_terms_corpus_bigram <- rbind(bigrams_animal_generic_terms_corpus_bigram, bigrams_mineral_generic_terms_corpus_bigram)
bigrams_all_generic_terms_corpus_bigram_unite <- bigrams_all_generic_terms_corpus_bigram %>% unite(bigram, word1, word2, sep = " ")
bigrams_all_generic_terms_corpus_bigram_count <- bigrams_all_generic_terms_corpus_bigram_unite %>% count(bigram, kingdom, sort = TRUE)
bigrams_all_generic_terms_corpus_bigram_count_literal <- bigrams_all_generic_terms_corpus_bigram_count %>% mutate(literal = case_when(str_detect(bigram, 'vegetal') ~ "vegetal", str_detect(bigram, 'animal') ~ "animal", str_detect(bigram, 'mineral') ~ "mineral")) # detects the kingdom by identifying the literal term within the bi-gram expression
names(bigrams_all_generic_terms_corpus_bigram_count_literal) <- c("bigram", "generic term kingdom", "frequency", "literal term kingdom") # new column names
bigrams_all_generic_terms_corpus_bigram_right_order <- select(bigrams_all_generic_terms_corpus_bigram_count_literal, bigram, frequency, `literal term kingdom`, `generic term kingdom`) # reorders the data table columns
write.csv(bigrams_all_generic_terms_corpus_bigram_right_order, file = "generic.terms.in.literal.bigram.freq.csv", row.names = FALSE)

## Specific Terms
### search for specific terms within the literal-term bi-grams
#### animal
reino.animal.especie.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /animais_especie.txt", what = "character", sep = "\n")
reino.animal.especie.lexico.df <- data.frame(reino.animal.especie.lexico)
bigrams_animal_specific_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word1 %in% reino.animal.especie.lexico.df$reino.animal.especie.lexico)
bigrams_animal_specific_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word2 %in% reino.animal.especie.lexico.df$reino.animal.especie.lexico)
bigrams_animal_specific_terms_corpus_bigram <- rbind(bigrams_animal_specific_terms_corpus_word1, bigrams_animal_specific_terms_corpus_word2)
bigrams_animal_specific_terms_corpus_bigram$kingdom <- rep(c("animal"))
bigrams_animal_specific_terms_corpus_bigram
#### vegetal
reino.vegetal.especie.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /vegetais_especie.txt", what = "character", sep = "\n")
reino.vegetal.especie.lexico.df <- data.frame(reino.vegetal.especie.lexico)
bigrams_vegetal_specific_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word1 %in% reino.vegetal.especie.lexico.df$reino.vegetal.especie.lexico) # zero occurrences
bigrams_vegetal_specific_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word2 %in% reino.vegetal.especie.lexico.df$reino.vegetal.especie.lexico) # zero occurrences
#### mineral
reino.mineral.especie.lexico <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/Listas Bruno 12-04 /minerais_especie.txt", what = "character", sep = "\n")
reino.mineral.especie.lexico.df <- data.frame(reino.mineral.especie.lexico)
bigrams_mineral_specific_terms_corpus_word1 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word1 %in% reino.mineral.especie.lexico.df$reino.mineral.especie.lexico)
bigrams_mineral_specific_terms_corpus_word2 <- bigrams_literal_terms_corpus_word1plus2 %>% filter(word2 %in% reino.mineral.especie.lexico.df$reino.mineral.especie.lexico)
bigrams_mineral_specific_terms_corpus_bigram <- rbind(bigrams_mineral_specific_terms_corpus_word1, bigrams_mineral_specific_terms_corpus_word2)
bigrams_mineral_specific_terms_corpus_bigram$kingdom <- rep(c("mineral"))
bigrams_mineral_specific_terms_corpus_bigram
#### data preparation [external output for RAWGraphs]
bigrams_all_specific_terms_corpus_bigram <- rbind(bigrams_animal_specific_terms_corpus_bigram, bigrams_mineral_specific_terms_corpus_bigram) # vegetal omitted: zero occurrences
bigrams_all_specific_terms_corpus_bigram_unite <- bigrams_all_specific_terms_corpus_bigram %>% unite(bigram, word1, word2, sep = " ")
bigrams_all_specific_terms_corpus_bigram_count <- bigrams_all_specific_terms_corpus_bigram_unite %>% count(bigram, kingdom, sort = TRUE)
bigrams_all_specific_terms_corpus_bigram_count_literal <- bigrams_all_specific_terms_corpus_bigram_count %>% mutate(literal = case_when(str_detect(bigram, 'vegetal') ~ "vegetal", str_detect(bigram, 'animal') ~ "animal", str_detect(bigram, 'mineral') ~ "mineral")) # detects the kingdom by identifying the literal term within the bi-gram expression
names(bigrams_all_specific_terms_corpus_bigram_count_literal) <- c("bigram", "specific term kingdom", "frequency", "literal term kingdom") # new column names
bigrams_all_specific_terms_corpus_bigram_right_order <- select(bigrams_all_specific_terms_corpus_bigram_count_literal, bigram, frequency, `literal term kingdom`, `specific term kingdom`) # reorders the data table columns
write.csv(bigrams_all_specific_terms_corpus_bigram_right_order, file = "specific.terms.in.literal.bigram.freq.csv", row.names = FALSE)
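The animal, vegetal, and mineral blocks above repeat the same filter-and-tag pattern. As a sketch of a possible refactoring, a hypothetical helper (filter_kingdom_bigrams is not defined anywhere in the original scripts) could fold each pair of filters into a single call:

## [illustrative sketch, not part of the original pipeline]
filter_kingdom_bigrams <- function(bigrams, lexicon, kingdom_label) {
  hits <- rbind(
    bigrams %>% filter(word1 %in% lexicon), # lexicon term as the first word
    bigrams %>% filter(word2 %in% lexicon)  # lexicon term as the second word
  )
  if (nrow(hits) > 0) hits$kingdom <- kingdom_label # tags the kingdom
  hits
}
## usage, e.g.:
## filter_kingdom_bigrams(bigrams_literal_terms_corpus_word1plus2, reino.animal.tipo.lexico, "animal")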
KWIC – KEYWORDS IN CONTEXT
RSTUDIO SETTINGS
## List of required packages
### tm - general use
### tidyverse (package collection) - general use
### stopwords - stopwords removal
### qdap - stopwords removal
### textstem - lemmatization process
### quanteda - lemmatization process

## Installing required packages
install.packages(c("tm", "tidyverse", "stopwords", "qdap", "textstem", "quanteda"))

## Loading installed packages
library(tm)
library(tidyverse)
library(stopwords)
library(qdap)
library(textstem)
library(quanteda)
DATA PREPROCESSING
## 1st - Importing Data
### directory assignment
arrcorpus <- "/Users/patriciareina/Desktop/FAT/BASE DE DADOS/Obra Completa TXT/TXT-livros/OP-all"
### verifying the files
arrfiles <- list.files(path = arrcorpus)
arrfiles # files correctly ordered
### building corpus arrangement: [[ ]] file, [ ] verse (strings)
corpuslist <- paste(arrcorpus, "/", arrfiles, sep = "")
corpuslist
typeof(corpuslist) # character
corpus.list <- lapply(corpuslist, FUN = readLines)
corpus.list[[1]]
typeof(corpus.list) # list

## 2nd - Data cleaning
### converting each file into one string
corpus.list.line <- lapply(corpus.list, FUN = paste, collapse = " ")
corpus.list.line[[78]]
typeof(corpus.list.line) # list
### lowercasing the words
corpus.list.line.lower <- tolower(corpus.list.line)
corpus.list.line.lower[[78]]
typeof(corpus.list.line.lower) # character
### tokenizing with text/file separation
corpus.list.line.clean <- strsplit(corpus.list.line.lower, "\\W")
corpus.list.line.clean[[78]]
typeof(corpus.list.line.clean) # list
typeof(corpus.list.line.clean[[78]][985]) # character

## 3rd - Corpus
### making a Simple Corpus: lists of vectors
corpus.as.list <- Corpus(VectorSource(as.vector(corpus.list.line.clean)))
corpus.as.list
typeof(arrcorpus) # character
typeof(corpus.as.list) # list
inspect(corpus.as.list[[2]])
corpus.as.list.integral <- corpus.as.list

## 5th - More Data Cleaning (the 4th step, stopwords removal, is intentionally skipped here)
### removing numbers
corpus.as.list.integral <- tm_map(corpus.as.list.integral, content_transformer(removeNumbers))
inspect(corpus.as.list.integral[[48]])
### removing punctuation
corpus.as.list.integral <- tm_map(corpus.as.list.integral, content_transformer(removePunctuation))
inspect(corpus.as.list.integral[[48]])
### removing white spaces
corpus.as.list.integral <- tm_map(corpus.as.list.integral, content_transformer(stripWhitespace))
typeof(corpus.as.list.integral[48]) # list
corpus.as.list.integral # corpus with neither stopwords removed nor lemmatization applied
KWIC – KEYWORDS IN CONTEXT KINGDOMS OF NATURE ANALYSIS
## tokenization
### quanteda Corpus for kwic (non-lemmatized, stopwords not removed)
corpus.as.list.toks <- corpus(corpus.as.list.integral)
summary(corpus.as.list.toks)
### tokens
toks_arr_integral <- quanteda::tokens(corpus.as.list.toks)
### fixing the name of each book within the corpus
arr.book.names <- scan("/Users/patriciareina/Desktop/FAT/LISTAS/nomes_obras_poeticas_arr.txt", what = "character", sep = "\n")
docnames(toks_arr_integral)
names.toks_arr_integral <- `docnames<-`(toks_arr_integral, arr.book.names)

### Used lists
#### literal terms (stemming process included)
kingdoms.literal.terms <- c("vegetal", "animal", "mineral")
kingdoms.literal.terms.df <- data.frame(kingdoms.literal.terms)
kingdoms.literal.terms.stem <- stemDocument(kingdoms.literal.terms)
kingdoms.literal.terms.stem <- gsub('$', '*', kingdoms.literal.terms.stem) # appends a wildcard to each stem
kingdoms.literal.terms.stem # "veget*" "anim*" "miner*"

### All the corpus, by book
options(max.print = 999999)
#### literal terms
toks_arr_literal_terms_kwic <- kwic(names.toks_arr_integral, pattern = phrase(kingdoms.literal.terms.stem), valuetype = "glob", window = 5)
#### data output
capture.output(toks_arr_literal_terms_kwic, file = "toks_arr_literal_terms_kwic.txt")
options(max.print = 9999)
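capture.output() stores the concordance as printed plain text. If a tabular version is preferred for downstream tools, a kwic object can also be coerced to a data frame and written as CSV; a minimal sketch, not part of the original workflow (the CSV file name is hypothetical):

## [illustrative sketch, not part of the original pipeline]
toks_arr_literal_terms_kwic_df <- as.data.frame(toks_arr_literal_terms_kwic) # docname, positions, pre-context, keyword, post-context
write.csv(toks_arr_literal_terms_kwic_df, file = "toks_arr_literal_terms_kwic.csv", row.names = FALSE)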