# Load necessary packages
pacman::p_load(
  here, qs, 
  magrittr, janitor,
  easystats, sjmisc,
  ggpubr, 
  openalexR, bibliometrix, 
  tidyverse
)
# Load custom functions
# See https://github.com/christopherBelter/scopusAPI for details.
# NOTE(review): presumably this file provides searchByString() and extractXML()
# used further down; confirm against R/scopusAPI.R.
source(here("R/scopusAPI.R"))
API mining: Scopus
Projektseminar
Preparation
Mining Scopus API
See scopusAPI for information about the custom functions.
# Load API credentials
# The key is read from the `Elsevier_API` environment variable so that it never
# has to be written into the script. Sys.getenv() returns "" when the variable
# is not set.
# NOTE(review): scopus_api_key is not passed explicitly to searchByString()
# below; presumably the custom function picks the key up from the global
# environment. Confirm against R/scopusAPI.R.
scopus_api_key <- Sys.getenv("Elsevier_API")
Create search query
# Scopus advanced-search string: (AI / chatbot / assistant terms) AND
# (anthropomorphism / UX / trust / safety / privacy terms) in title, abstract or
# keywords, restricted to the subject areas SOCI and PSYC and to
# English-language records. Single quotes are used because the query itself
# contains double-quoted phrases.
scopus_query <- 'TITLE-ABS-KEY ( ( "artificial intelligence" OR ai OR "chatbot" OR "AI-based chatbot" OR "artificial intelligence-based chatbot" OR "chat agent" OR "voice bot" OR "voice assistant" OR "voice-based assistant" OR "conversational agent" OR "conversational assistant" OR "conversational AI" OR "AI-based assistant" OR "artificial intelligence-based assistant" OR "virtual assistant" OR "intelligent assistant" OR "digital assistant" OR "smart speaker" OR chatgpt OR "google gemini" OR "google bard" OR "bing chat" OR "microsoft copilot" OR "claude ai" OR "perplexity ai" ) AND ( anthropomorphism OR humanlike OR humanness OR humanized OR "user experience" OR ux OR usability OR trust* OR "conversational experience" OR cux OR "conversation design" OR safety OR privacy ) ) AND (SUBJAREA(SOCI) OR SUBJAREA(PSYC)) AND LANGUAGE("English")'
Search and fetch data
# Run the query through the custom searchByString() helper and keep its return
# value for the XML-to-data-frame conversion below.
# NOTE(review): `outfile` presumably is where the helper stores the raw XML
# export; confirm against R/scopusAPI.R.
scopus_xml <- searchByString(
  string = scopus_query,
  outfile = here("local_data/scopus_API_export.xml")
)
Convert .xml to data frame
# Bundle all Scopus objects in one list:
#   scopus$xml - the search result as returned by searchByString()
#   scopus$api - the same result converted by the custom extractXML() helper
scopus <- list(
  xml = scopus_xml,
  api = extractXML(scopus_xml)
)
Quality control
# Summarise every column of the converted data (missing values, completeness,
# string lengths, number of unique values).
scopus$api %>%
  skimr::skim()
| Name | Piped data |
| Number of rows | 7721 | 
| Number of columns | 16 | 
| _______________________ | |
| Column type frequency: | |
| character | 16 | 
| ________________________ | |
| Group variables | None | 
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace | 
|---|---|---|---|---|---|---|---|
| scopusID | 0 | 1.00 | 17 | 18 | 0 | 7721 | 0 | 
| doi | 701 | 0.91 | 13 | 69 | 0 | 7015 | 0 | 
| pmid | 7256 | 0.06 | 7 | 10 | 0 | 465 | 0 | 
| authors | 0 | 1.00 | 5 | 812 | 0 | 7436 | 0 | 
| affiliations | 0 | 1.00 | 2 | 2364 | 0 | 5882 | 0 | 
| countries | 0 | 1.00 | 0 | 180 | 5 | 1256 | 0 | 
| year | 0 | 1.00 | 4 | 4 | 0 | 40 | 0 | 
| articletitle | 0 | 1.00 | 9 | 614 | 0 | 7700 | 0 | 
| journal | 0 | 1.00 | 4 | 432 | 0 | 2497 | 0 | 
| volume | 2624 | 0.66 | 1 | 15 | 0 | 381 | 0 | 
| issue | 4268 | 0.45 | 1 | 20 | 0 | 130 | 0 | 
| pages | 0 | 1.00 | 0 | 11 | 2967 | 4176 | 0 | 
| keywords | 1449 | 0.81 | 11 | 939 | 0 | 6259 | 0 | 
| abstract | 86 | 0.99 | 161 | 13294 | 0 | 7616 | 0 | 
| ptype | 0 | 1.00 | 4 | 17 | 0 | 13 | 0 | 
| timescited | 0 | 1.00 | 1 | 4 | 0 | 247 | 0 | 
Quick overview
- Nearly all references have an abstract (about 99 percent) and a DOI (about 91 percent), which are the critical pieces of information for the analysis.
 
Check duplicates
# Check for duplicates based on DOI
# Count how many records share each DOI, then tabulate those counts: a value
# of 1 means the DOI is unique, anything above 1 marks a duplicated DOI.
scopus$api %>%
  filter(!is.na(doi)) %>% # exclude cases without DOI
  count(doi) %>% # one row per DOI, column `n` = number of records
  frq(n, sort.frq = "desc")
n <integer>
# total N=7015 valid N=7015 mean=1.00 sd=0.03
Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 7010 | 99.93 |   99.93 |  99.93
    2 |    5 |  0.07 |    0.07 | 100.00
 <NA> |    0 |  0.00 |    <NA> |   <NA>
# Container for everything related to duplicated records:
#   duplicates$api$doi$string - DOIs that occur on more than one record
#   duplicates$api$doi$data   - all records carrying one of those DOIs
duplicates <- list(api = list(doi = list()))

# DOIs shared by at least two records (records without a DOI are ignored)
duplicates$api$doi$string <- scopus$api %>%
  filter(!is.na(doi)) %>%
  count(doi, name = "n_records") %>%
  filter(n_records > 1) %>%
  pull(doi)

# Full records for those DOIs, kept for manual inspection
duplicates$api$doi$data <- scopus$api %>%
  filter(doi %in% duplicates$api$doi$string)
Summary
- The duplicates based on the DOI do not seem to follow a systematic pattern. Therefore, DOI duplicates are eliminated with `distinct()`.
Transformation
# Remove DOI duplicates, then keep only articles and conference papers.
#
# distinct() treats NA like any other value, so running it on the whole table
# would collapse every record without a DOI into one arbitrary row. Such
# records cannot be judged duplicates by DOI, so only the records that have a
# DOI are deduplicated (first occurrence kept) and the DOI-less records are
# appended unchanged.
# Note: the DOI-less records end up after the records with a DOI.
scopus$raw <- bind_rows(
  scopus$api %>%
    filter(!is.na(doi)) %>%
    distinct(doi, .keep_all = TRUE),
  scopus$api %>%
    filter(is.na(doi))
) %>%
  filter(ptype %in% c("Article", "Conference Paper"))
Check transformation
# Verify the deduplication: after the transformation every DOI should occur
# exactly once, i.e. the frequency table should only contain the value 1.
scopus$raw %>%
  filter(!is.na(doi)) %>% # exclude cases without DOI
  count(doi) %>% # one row per DOI, column `n` = number of records
  frq(n, sort.frq = "desc")
n <integer>
# total N=5667 valid N=5667 mean=1.00 sd=0.00
Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 5667 |   100 |     100 |    100
 <NA> |    0 |     0 |    <NA> |   <NA>