This vignette documents the collection of additional article metadata. Because this vignette retrieves some licensed information, it is not run.


We define here the languages of the corpus, an output directory out.dir, and an API key for rscopus.

# Language labels.
languages <- c(
  "english",
  "portuguese",
  "spanish"
)

# Output directory for this run.
out.dir <- "exploitation/out/run79"

Data loading

# Load the data shipped with the wateReview package; the statements below
# expect this to make `language_dfs` available.
# NOTE(review): system.file() treats its unnamed arguments as successive path
# components, so this looks up "extdata/language_dfs_updated_3/.rda" --
# confirm whether "language_dfs_updated_3.rda" was intended.
load(
  file = system.file(
    "extdata", "language_dfs_updated_3", ".rda",
    package = "wateReview"
  )
)

# Keep the English records whose `collected` field is "in corpus".
in_corpus <- language_dfs$english[
  which(language_dfs$english$collected == "in corpus"),
]

# Records whose ArticleURL mentions "scopus" are identified by that URL;
# every other record is identified by its DOI.
scopusID <- in_corpus$ArticleURL[grepl("scopus", in_corpus$ArticleURL)]
wosID <- in_corpus$DOI[!grepl("scopus", in_corpus$ArticleURL)]

Abstract retrieval

# Abstracts: one request set for the Scopus identifiers, one for the
# remaining (DOI-based) identifiers, each with its own retrieval function.
scopusAbstracts <- get_allMetadata(
  scopusID,
  fun = get_scopusAbstract
)
wosAbstracts <- get_allMetadata(
  wosID,
  fun = get_wosAbstract
)

# Attach both sets of abstracts to the corpus.
in_corpus <- add_abstractsToCorpus(
  in_corpus,
  scopusAbstracts,
  wosAbstracts
)

# Full results, requested for every DOI in the corpus.
wosFullResult <- get_allMetadata(
  in_corpus$DOI,
  fun = get_wosFullResult
)


We now extract the author-keywords from the full results.

# Country list passed to the keyword QA step below.
relevant_countries <- get_relevantCountries()

# Author keywords taken from the full results.
wosAuthKeywords <- get_allAuthKeywords(wosFullResult)

# QA of the keywords against the country list; the result is stored as the
# indices of entries to retry (presumably those with no author keywords --
# confirm against QA.AuthKeywords()).
ind_nullAuthKeywords <- QA.AuthKeywords(
  wosAuthKeywords,
  relevant_countries
)

Iterate (optional)

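At this point, it is possible to iterate and retry some of the failed entries by calling get_allMetadata() again with newpass = TRUE, passing the previous results and the indices of the entries to retry.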

# Second pass over the same DOIs: hand back the previous results together
# with the indices flagged by the QA step.
wosFullResult <- get_allMetadata(
  in_corpus$DOI,
  fun = get_wosFullResult,
  newpass = TRUE,
  metadata = wosFullResult,
  ind_null = ind_nullAuthKeywords
)

Transform results to label

Using get_boolean_AuthKeywords() we transform the keywords into a boolean vector to use as input to the machine learning predictions.

# Boolean labels built from the author keywords and the country list.
boolean_AuthKeywords <- get_boolean_AuthKeywords(
  wosAuthKeywords,
  relevant_countries
)

Identify entries with any label

Using get_ind_hasCountryTag(), we retrieve the indices of the documents of the corpus that have been identified to have author-keywords.

# Indices computed from the boolean keyword labels by get_ind_hasCountryTag().
ind_hasCountryTag <- get_ind_hasCountryTag(boolean_AuthKeywords)