webscrapped_training_labels.Rmd
# Input locations ----
# englishCorpus_file is kept as a variable because it is also passed to
# make_webscrapped_trainingData() further down.
englishCorpus_file <- "./data/english_corpus.Rds"
in_corpus_file <- "in_corpus.Rds"

# Load the serialized inputs ----
englishCorpus <- readRDS(file = englishCorpus_file)
in_corpus <- readRDS(file = in_corpus_file)
boolean_AuthKeywords <- readRDS(file = "boolean_AuthKeywords.Rds")
The following code chunks perform a cross-walk between the topic model and EndNote databases using the document ids `EndNoteIdcorpus` and `EndNoteIdLDA`. This ensures that the two databases can communicate and are aligned.
# Cross-walk between the EndNote-side data (in_corpus, boolean_AuthKeywords)
# and the topic-model-side data (englishCorpus) via their document ids.

# Extract the document ids from each side.
EndNoteIdcorpus <- get_EndNoteIdcorpus(in_corpus)
EndNoteIdLDA <- get_EndNoteIdLDA(englishCorpus)
# QA check on the two id sets before alignment (repeated after alignment below).
QA_EndNoteIdCorpusLDA(EndNoteIdLDA, EndNoteIdcorpus)
# Align every data object first, while both id objects still hold their
# original (unaligned) values.
in_corpus <- align_dataWithEndNoteIdcorpus(in_corpus, EndNoteIdcorpus, EndNoteIdLDA)
boolean_AuthKeywords <- align_dataWithEndNoteIdcorpus(boolean_AuthKeywords, EndNoteIdcorpus, EndNoteIdLDA)
englishCorpus <- align_dataWithEndNoteIdLDA(englishCorpus, EndNoteIdLDA, EndNoteIdcorpus)
# Only now overwrite the id objects themselves; doing this earlier would change
# the ids the calls above are aligned against.
EndNoteIdLDA <- align_dataWithEndNoteIdLDA(EndNoteIdLDA, EndNoteIdLDA, EndNoteIdcorpus)
# NOTE(review): this call receives the EndNoteIdLDA that was already reassigned
# on the previous line, not the original ids — confirm the helper gives the
# same result either way.
EndNoteIdcorpus <- align_dataWithEndNoteIdcorpus(EndNoteIdcorpus, EndNoteIdcorpus, EndNoteIdLDA)
# Re-run the QA check on the aligned ids.
QA_EndNoteIdCorpusLDA(EndNoteIdLDA, EndNoteIdcorpus)
# Reorder the EndNote-side objects using both aligned id objects.
boolean_AuthKeywords <- order_data(boolean_AuthKeywords, EndNoteIdLDA, EndNoteIdcorpus)
in_corpus <- order_data(in_corpus, EndNoteIdLDA, EndNoteIdcorpus)
# Attach the abstracts from in_corpus to englishCorpus.
# NOTE(review): englishCorpus is not passed through order_data(); this
# assignment presumably relies on in_corpus now being in englishCorpus order
# and of the same length — confirm.
englishCorpus$abstract <- in_corpus$abstract
# Build the web-scraped training data ----
ind_hasCountryTag <- get_ind_hasCountryTag(boolean_AuthKeywords)

webscrapped_trainingData <- make_webscrapped_trainingData(
  boolean_AuthKeywords,
  ind_hasCountryTag,
  englishCorpus,
  englishCorpus_file
)

# Unpack each component and write it to its own .Rds file ----
# The output paths are relative, so they resolve against the current working
# directory.
country_tokens <- webscrapped_trainingData$country_tokens
saveRDS(object = country_tokens, file = "country_tokens.Rds")

webscrapped_validationDTM <- webscrapped_trainingData$webscrapped_validationDTM
saveRDS(
  object = webscrapped_validationDTM,
  file = "webscrapped_validationDTM.Rds"
)

webscrapped_trainingLabels <- webscrapped_trainingData$webscrapped_trainingLabels
saveRDS(
  object = webscrapped_trainingLabels,
  file = "webscrapped_trainingLabels.Rds"
)