web_scrapping.Rmd
This vignette documents the collection of additional article metadata. Because this vignette retrieves some licensed information, it is not run.
library(wateReview)
if(!require("rvest")) install.packages("rvest")
library("rvest")
if(!require("rscopus")) install.packages("rscopus")
library("rscopus")
We define here an output directory, `out.dir`, and an API key for `rscopus`.
# Language labels; presumably the names of the elements of `language_dfs`
# (only `english` is used in the visible code) -- verify.
languages <- c("english", "portuguese", "spanish")
# Output directory for this run.
out.dir <- "exploitation/out/run79"
# Placeholder: substitute a real API key before running.
rscopus::set_api_key("your/API/key")
# Load the data file shipped in the package's extdata folder. The file name,
# extension included, must be a single path component: system.file() joins its
# arguments with "/", so passing ".rda" separately would look for
# "language_dfs_updated_3/.rda". mustWork = TRUE turns a missing file into an
# immediate, explicit error instead of load("") failing later.
load(
  file = system.file(
    "extdata", "language_dfs_updated_3.rda",
    package = "wateReview", mustWork = TRUE
  )
)
# Keep only the rows of the English table whose `collected` field is
# "in corpus" (which() drops rows where the comparison is NA).
english_records <- language_dfs$english
in_corpus <- english_records[which(english_records$collected == "in corpus"), ]

# Split by source: rows whose ArticleURL mentions "scopus" are identified by
# that URL, every other row by its DOI.
from_scopus <- grepl("scopus", in_corpus$ArticleURL)
scopusID <- in_corpus$ArticleURL[from_scopus]
wosID <- in_corpus$DOI[!from_scopus]

# Retrieve the abstracts for each group, then attach them to the corpus table.
scopusAbstracts <- get_allMetadata(scopusID, fun = get_scopusAbstract)
wosAbstracts <- get_allMetadata(wosID, fun = get_wosAbstract)
in_corpus <- add_abstractsToCorpus(in_corpus, scopusAbstracts, wosAbstracts)
We now extract the author-keywords from the full results.
# NOTE(review): `wosFullResult` is not created anywhere in the visible part of
# this vignette; presumably it comes from an initial
# get_allMetadata(..., fun = get_wosFullResult) pass -- confirm and add it.
relevant_countries <- get_relevantCountries()
# Extract the author keywords from the full results.
wosAuthKeywords <- get_allAuthKeywords(wosFullResult)
# Quality check of the extracted keywords; the result is passed as `ind_null`
# when the retrieval is retried.
ind_nullAuthKeywords <- QA.AuthKeywords(wosAuthKeywords, relevant_countries)
At this point, it is possible to iterate and retry some of the failed entries.
# New pass over the full results: the previous results are handed back through
# `metadata` together with the entries flagged by QA.AuthKeywords()
# (`ind_null`); presumably only those entries are retried -- see
# get_allMetadata().
wosFullResult <- get_allMetadata(
  in_corpus$DOI,
  fun = get_wosFullResult,
  newpass = TRUE,
  metadata = wosFullResult,
  ind_null = ind_nullAuthKeywords
)