API mining: Scopus

Projektseminar

Published

26.11.2024

Preparation

# Load necessary packages
pacman::p_load(
  here, qs, 
  magrittr, janitor,
  easystats, sjmisc,
  ggpubr, 
  openalexR, bibliometrix, 
  tidyverse
)

# Load custom functions (defines searchByString() and extractXML(), used below)
# See https://github.com/christopherBelter/scopusAPI for details
source(here("R/scopusAPI.R"))

Mining Scopus API

See scopusAPI (https://github.com/christopherBelter/scopusAPI) for information about the custom functions.

# Read the Elsevier/Scopus API key from the environment
# (Sys.getenv() yields "" when the variable is not set)
scopus_api_key <- Sys.getenv(x = "Elsevier_API")

Create search query

# Scopus advanced-search string: (AI / chatbot / voice-assistant terms) AND
# (anthropomorphism, UX, usability, trust, safety, privacy terms) matched via
# TITLE-ABS-KEY, limited to the SOCI and PSYC subject areas and to records
# with LANGUAGE("English"). Single quotes delimit the R string because the
# query itself contains double-quoted phrases.
scopus_query <- 'TITLE-ABS-KEY ( ( "artificial intelligence" OR ai OR "chatbot" OR "AI-based chatbot" OR "artificial intelligence-based chatbot" OR "chat agent" OR "voice bot" OR "voice assistant" OR "voice-based assistant" OR "conversational agent" OR "conversational assistant" OR "conversational AI" OR "AI-based assistant" OR "artificial intelligence-based assistant" OR "virtual assistant" OR "intelligent assistant" OR "digital assistant" OR "smart speaker" OR chatgpt OR "google gemini" OR "google bard" OR "bing chat" OR "microsoft copilot" OR "claude ai" OR "perplexity ai" ) AND ( anthropomorphism OR humanlike OR humanness OR humanized OR "user experience" OR ux OR usability OR trust* OR "conversational experience" OR cux OR "conversation design" OR safety OR privacy ) ) AND (SUBJAREA(SOCI) OR SUBJAREA(PSYC)) AND LANGUAGE("English")'

Search and fetch data

# Send the query to Scopus through the custom helper and keep its return value.
# `outfile` is the path handed to searchByString() for the XML export.
# NOTE(review): this call does not pass `scopus_api_key`; confirm how
# R/scopusAPI.R obtains the key.
scopus_xml <- searchByString(
  outfile = here("local_data/scopus_API_export.xml"),
  string = scopus_query
)

Convert .xml to data frame

# Keep the search result and its parsed form side by side:
#   $xml - object returned by searchByString()
#   $api - result of extractXML() applied to that object
scopus <- list(
  xml = scopus_xml,
  api = scopus_xml %>% extractXML()
)

Quality control

# Column-wise overview of the parsed export (missing values, string lengths,
# number of unique values per variable)
scopus[["api"]] %>%
  skimr::skim()
Data summary
Name Piped data
Number of rows 7721
Number of columns 16
_______________________
Column type frequency:
character 16
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
scopusID 0 1.00 17 18 0 7721 0
doi 701 0.91 13 69 0 7015 0
pmid 7256 0.06 7 10 0 465 0
authors 0 1.00 5 812 0 7436 0
affiliations 0 1.00 2 2364 0 5882 0
countries 0 1.00 0 180 5 1256 0
year 0 1.00 4 4 0 40 0
articletitle 0 1.00 9 614 0 7700 0
journal 0 1.00 4 432 0 2497 0
volume 2624 0.66 1 15 0 381 0
issue 4268 0.45 1 20 0 130 0
pages 0 1.00 0 11 2967 4176 0
keywords 1449 0.81 11 939 0 6259 0
abstract 86 0.99 161 13294 0 7616 0
ptype 0 1.00 4 17 0 13 0
timescited 0 1.00 1 4 0 247 0
Quick overview
  • Nearly all references have an abstract (≈ 99 percent) and a DOI (≈ 91 percent), which are the critical pieces of information for the analysis.

Check duplicates

# Frequency table of how many records share each DOI
# (records without a DOI are left out of the tabulation)
scopus$api %>%
  drop_na(doi) %>%
  count(doi) %>%
  frq(n, sort.frq = "desc")
n <integer> 
# total N=7015 valid N=7015 mean=1.00 sd=0.03

Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 7010 | 99.93 |   99.93 |  99.93
    2 |    5 |  0.07 |    0.07 | 100.00
 <NA> |    0 |  0.00 |    <NA> |   <NA>
# Container for everything related to duplicate detection
duplicates <- list()

# DOIs that occur on more than one record (missing DOIs are ignored)
duplicates$api$doi$string <- scopus$api %>%
  drop_na(doi) %>%
  count(doi) %>%
  filter(n > 1) %>%
  pull(doi)

# Full records carrying one of those repeated DOIs, for manual inspection
duplicates$api$doi$data <- scopus$api %>%
  filter(doi %in% duplicates$api$doi$string)
Summary
  • The duplicates based on the DOI do not seem to follow a systematic pattern. Therefore, DOI duplicates are eliminated with distinct().

Transformation

# Build the working data set:
#   1. drop repeated DOIs, keeping the first record of each DOI
#   2. keep only journal articles and conference papers
#
# distinct() treats NA as an ordinary value, so deduplicating directly on
# `doi` would collapse every record without a DOI into a single row.
# Records with a missing DOI therefore fall back to their `scopusID` as
# deduplication key, which keeps them as separate rows.
scopus$raw <- scopus$api %>%
  mutate(dedup_key = coalesce(doi, scopusID)) %>%
  distinct(dedup_key, .keep_all = TRUE) %>%
  select(-dedup_key) %>%
  filter(ptype %in% c("Article", "Conference Paper"))

Check transformation

# Re-tabulate records per DOI after the transformation; every remaining DOI
# should now occur exactly once (records without a DOI are left out)
scopus$raw %>%
  drop_na(doi) %>%
  count(doi) %>%
  frq(n, sort.frq = "desc")
n <integer> 
# total N=5667 valid N=5667 mean=1.00 sd=0.00

Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 5667 |   100 |     100 |    100
 <NA> |    0 |     0 |    <NA> |   <NA>