# Load necessary packages
# pacman::p_load() is called with its namespace prefix, so pacman itself does
# not have to be attached beforehand.
pacman::p_load(
  here, qs,
  magrittr, janitor,
  easystats, sjmisc,
  ggpubr,
  openalexR, bibliometrix,
  tidyverse
)
# Load custom functions
# See https://github.com/christopherBelter/scopusAPI for details
source(here("R/scopusAPI.R"))
API mining: Scopus
Projektseminar
Preparation
Mining Scopus API
See scopusAPI for information about the custom functions.
# Load API credentials
# The key is read from the environment variable "Elsevier_API" rather than
# being written into the script. Sys.getenv() returns "" when it is unset.
scopus_api_key <- Sys.getenv("Elsevier_API")
Create search query
# Scopus advanced-search string: (AI / chatbot / assistant terms) AND
# (anthropomorphism / UX / trust / safety / privacy terms) in title, abstract
# or keywords, restricted to the subject areas SOCI and PSYC and to English.
# Single quotes are used because the query itself contains double quotes.
scopus_query <- 'TITLE-ABS-KEY ( ( "artificial intelligence" OR ai OR "chatbot" OR "AI-based chatbot" OR "artificial intelligence-based chatbot" OR "chat agent" OR "voice bot" OR "voice assistant" OR "voice-based assistant" OR "conversational agent" OR "conversational assistant" OR "conversational AI" OR "AI-based assistant" OR "artificial intelligence-based assistant" OR "virtual assistant" OR "intelligent assistant" OR "digital assistant" OR "smart speaker" OR chatgpt OR "google gemini" OR "google bard" OR "bing chat" OR "microsoft copilot" OR "claude ai" OR "perplexity ai" ) AND ( anthropomorphism OR humanlike OR humanness OR humanized OR "user experience" OR ux OR usability OR trust* OR "conversational experience" OR cux OR "conversation design" OR safety OR privacy ) ) AND (SUBJAREA(SOCI) OR SUBJAREA(PSYC)) AND LANGUAGE("English")'
Search and fetch data
# Run the search via the custom helper searchByString() (sourced from
# R/scopusAPI.R) and keep its result for the conversion step below.
# NOTE(review): the helper is expected to also write the raw response to
# `outfile` -- confirm against R/scopusAPI.R.
scopus_xml <- searchByString(
  string = scopus_query,
  outfile = here("local_data/scopus_API_export.xml")
)
Convert .xml to data frame
# Collect both representations of the search result in one list:
#   xml - the search result exactly as returned by searchByString()
#   api - the same records converted by the custom helper extractXML()
scopus <- list(
  xml = scopus_xml,
  api = extractXML(scopus_xml)
)
Quality control
# Per-column summary (missing values, completeness, unique values) of the
# converted records; skimr is called via its namespace, so it need not be
# attached.
scopus$api %>%
  skimr::skim()
Name | Piped data |
Number of rows | 7721 |
Number of columns | 16 |
_______________________ | |
Column type frequency: | |
character | 16 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
scopusID | 0 | 1.00 | 17 | 18 | 0 | 7721 | 0 |
doi | 701 | 0.91 | 13 | 69 | 0 | 7015 | 0 |
pmid | 7256 | 0.06 | 7 | 10 | 0 | 465 | 0 |
authors | 0 | 1.00 | 5 | 812 | 0 | 7436 | 0 |
affiliations | 0 | 1.00 | 2 | 2364 | 0 | 5882 | 0 |
countries | 0 | 1.00 | 0 | 180 | 5 | 1256 | 0 |
year | 0 | 1.00 | 4 | 4 | 0 | 40 | 0 |
articletitle | 0 | 1.00 | 9 | 614 | 0 | 7700 | 0 |
journal | 0 | 1.00 | 4 | 432 | 0 | 2497 | 0 |
volume | 2624 | 0.66 | 1 | 15 | 0 | 381 | 0 |
issue | 4268 | 0.45 | 1 | 20 | 0 | 130 | 0 |
pages | 0 | 1.00 | 0 | 11 | 2967 | 4176 | 0 |
keywords | 1449 | 0.81 | 11 | 939 | 0 | 6259 | 0 |
abstract | 86 | 0.99 | 161 | 13294 | 0 | 7616 | 0 |
ptype | 0 | 1.00 | 4 | 17 | 0 | 13 | 0 |
timescited | 0 | 1.00 | 1 | 4 | 0 | 247 | 0 |
Quick overview
- Nearly all references have an abstract (approx. 99 percent) and a DOI (approx. 91 percent), which are the critical pieces of information for the analysis.
Check duplicates
# Check for duplicates based on DOI
# Counts how often each DOI occurs and tabulates those counts, so any value
# above 1 in the frequency table indicates duplicated DOIs.
scopus$api %>%
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>%
  summarise(n = n()) %>%
  frq(n, sort.frq = "desc")
n <integer>
# total N=7015 valid N=7015 mean=1.00 sd=0.03
Value | N | Raw % | Valid % | Cum. %
---------------------------------------
1 | 7010 | 99.93 | 99.93 | 99.93
2 | 5 | 0.07 | 0.07 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
# Container for the duplicate diagnostics; nested elements are added below.
duplicates <- list()
# Extract duplicated IDs
# Result: the DOIs that occur more than once among records with a DOI.
duplicates$api$doi$string <- scopus$api %>%
  filter(!is.na(doi)) %>%
  group_by(doi) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>%
  pull(doi)
# Extract cases with duplicated IDs
# Keeps every row (all copies) whose DOI is in the duplicated-DOI vector, so
# the copies can be compared side by side.
duplicates$api$doi$data <- scopus$api %>%
  filter(doi %in% duplicates$api$doi$string)
Summary
- The duplicates based on the DOI do not seem to follow a systematic pattern. Therefore, DOI duplicates are eliminated with `distinct()`.
Transformation
# Build the working data set:
#  1. Drop repeated DOIs, keeping the first occurrence of each. Records
#     without a DOI are kept as they are: distinct(doi, .keep_all = TRUE)
#     would treat NA as one ordinary value and collapse all DOI-less records
#     into a single row, although they are not duplicates of each other.
#  2. Keep only journal articles and conference papers.
scopus$raw <- scopus$api %>%
  filter(is.na(doi) | !duplicated(doi)) %>%
  filter(ptype %in% c("Article", "Conference Paper"))
Check transformation
# Re-run the DOI frequency check on the transformed data; every remaining DOI
# should now occur exactly once.
scopus$raw %>%
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>%
  summarise(n = n()) %>%
  frq(n, sort.frq = "desc")
n <integer>
# total N=5667 valid N=5667 mean=1.00 sd=0.00
Value | N | Raw % | Valid % | Cum. %
---------------------------------------
1 | 5667 | 100 | 100 | 100
<NA> | 0 | 0 | <NA> | <NA>