# Load necessary packages
# pacman::p_load() attaches every listed package and installs any that are
# missing first.
pacman::p_load(
  here, qs,
  magrittr, janitor,
  naniar, visdat,
  easystats, sjmisc,
  ggpubr,
  gt, gtExtras, gtsummary,
  openalexR, bibliometrix,
  tidyverse
)
API mining: OpenAlex
Projektseminar
Preparation
Mining OpenAlex API
Set credentials
# Register a contact e-mail with openalexR so that requests are routed to the
# polite pool, which gives faster response times
options(
  openalexR.mailto = "christoph.adrian@fau.de"
)
Initial OpenAlex API query
# Container that holds all reference data used in this document
references <- list()

# Download data via API
# The query combines AI / conversational-agent terms with user-experience,
# trust, safety and privacy terms, searched in title and abstract.
# NOTE(review): the quality-control summary of this download reports only two
# distinct `type` values; confirm that "conference-paper" is a work type that
# OpenAlex actually returns.
references$openalex$api <- openalexR::oa_fetch(
  entity = "works",
  title_and_abstract.search = '("artificial intelligence" OR AI OR "chatbot" OR "AI-based chatbot" OR "artificial intelligence-based chatbot" OR "chat agent" OR "voice bot" OR "voice assistant" OR "voice-based assistant" OR "conversational agent" OR "conversational assistant" OR "conversational AI" OR "AI-based assistant" OR "artificial intelligence-based assistant" OR "virtual assistant" OR "intelligent assistant" OR "digital assistant" OR "smart speaker" OR chatgpt OR "google gemini" OR "google bard" OR "bing chat" OR "microsoft copilot" OR "claude ai" OR "perplexity ai") AND (anthropomorphism OR humanlike OR humanness OR humanized OR "user experience" OR UX OR usability OR trust* OR "conversational experience" OR CUX OR "conversation design" OR safety OR privacy)',
  publication_year = "2016-2025",
  primary_topic.field.id = c(
    "fields/33", # Social Science
    "fields/32" # Psychology
  ),
  language = "en",
  type = c("article", "conference-paper", "preprint"),
  verbose = TRUE
)
Quality control
# Summarise every column of the downloaded data (type, missingness, basic
# distribution) as a first quality check
references$openalex$api %>%
  skimr::skim()
Name | Piped data |
Number of rows | 30646 |
Number of columns | 39 |
_______________________ | |
Column type frequency: | |
character | 23 |
list | 8 |
logical | 5 |
numeric | 3 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
id | 0 | 1.00 | 30 | 32 | 0 | 28528 | 0 |
title | 1 | 1.00 | 4 | 500 | 0 | 27743 | 0 |
display_name | 1 | 1.00 | 4 | 500 | 0 | 27743 | 0 |
ab | 1188 | 0.96 | 0 | 51771 | 10 | 26719 | 0 |
publication_date | 0 | 1.00 | 10 | 10 | 0 | 2672 | 0 |
so | 5735 | 0.81 | 1 | 244 | 0 | 7969 | 0 |
so_id | 5735 | 0.81 | 27 | 32 | 0 | 7994 | 0 |
host_organization | 13032 | 0.57 | 3 | 155 | 0 | 1274 | 0 |
issn_l | 10525 | 0.66 | 9 | 9 | 0 | 7295 | 0 |
url | 137 | 1.00 | 21 | 273 | 0 | 28390 | 0 |
pdf_url | 18826 | 0.39 | 29 | 359 | 0 | 10860 | 0 |
license | 20216 | 0.34 | 3 | 21 | 0 | 11 | 0 |
version | 12190 | 0.60 | 15 | 16 | 0 | 3 | 0 |
first_page | 13186 | 0.57 | 1 | 15 | 0 | 4030 | 0 |
last_page | 13299 | 0.57 | 1 | 15 | 0 | 4038 | 0 |
volume | 13362 | 0.56 | 1 | 25 | 0 | 604 | 0 |
issue | 15657 | 0.49 | 1 | 30 | 0 | 629 | 0 |
oa_status | 0 | 1.00 | 4 | 7 | 0 | 6 | 0 |
oa_url | 11799 | 0.61 | 20 | 359 | 0 | 17197 | 0 |
language | 0 | 1.00 | 2 | 2 | 0 | 1 | 0 |
cited_by_api_url | 0 | 1.00 | 53 | 55 | 0 | 28528 | 0 |
doi | 2133 | 0.93 | 26 | 96 | 0 | 26496 | 0 |
type | 0 | 1.00 | 7 | 8 | 0 | 2 | 0 |
Variable type: list
skim_variable | n_missing | complete_rate | n_unique | min_length | max_length |
---|---|---|---|---|---|
author | 249 | 0.99 | 27647 | 1 | 12 |
grants | 28143 | 0.08 | 2046 | 1 | 33 |
counts_by_year | 15039 | 0.51 | 4415 | 1 | 2 |
ids | 0 | 1.00 | 28528 | 1 | 5 |
referenced_works | 10836 | 0.65 | 18263 | 1 | 442 |
related_works | 423 | 0.99 | 20233 | 1 | 20 |
concepts | 0 | 1.00 | 28523 | 5 | 5 |
topics | 0 | 1.00 | 27999 | 5 | 5 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
is_oa | 134 | 1 | 0.56 | TRU: 17130, FAL: 13382 |
is_oa_anywhere | 0 | 1 | 0.61 | TRU: 18797, FAL: 11849 |
any_repository_has_fulltext | 0 | 1 | 0.30 | FAL: 21399, TRU: 9247 |
is_paratext | 0 | 1 | 0.00 | FAL: 30646 |
is_retracted | 0 | 1 | 0.00 | FAL: 30595, TRU: 51 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
relevance_score | 0 | 1 | 9.48 | 26.05 | 0.04 | 0.84 | 3.09 | 7.01 | 986.22 | ▇▁▁▁▁ |
cited_by_count | 0 | 1 | 7.47 | 32.62 | 0.00 | 0.00 | 1.00 | 4.00 | 1878.00 | ▇▁▁▁▁ |
publication_year | 0 | 1 | 2021.95 | 1.99 | 2016.00 | 2021.00 | 2023.00 | 2024.00 | 2025.00 | ▁▂▅▇▆ |
Quick overview
- Nearly all references have an abstract (approx. 96 percent) and a DOI (approx. 93 percent), which are the critical pieces of information for the analysis.
- The difference in the number of cases and the number of unique IDs indicates that there are duplicates in the data.
Check duplicates
based on OpenAlex ID
# Check for duplicates based on OpenAlex ID
# Count the rows per ID, then tabulate how often each count occurs; any count
# above 1 means the ID appears more than once.
references$openalex$api %>%
  group_by(id) %>%
  summarise(n = n()) %>%
  frq(n, sort.frq = "desc")
n <integer>
# total N=28528 valid N=28528 mean=1.07 sd=0.26
Value | N | Raw % | Valid % | Cum. %
----------------------------------------
1 | 26410 | 92.58 | 92.58 | 92.58
2 | 2118 | 7.42 | 7.42 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
# Container that holds everything related to duplicate handling
duplicates <- list()

# Extract duplicated IDs
duplicates$openalex$api$id$string <- references$openalex$api %>%
  group_by(id) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>%
  pull(id)

# Extract cases with duplicated IDs
# Sorting by ID places the rows that share an ID next to each other.
duplicates$openalex$api$id$data <- references$openalex$api %>%
  filter(id %in% duplicates$openalex$api$id$string) %>%
  arrange(id)

# Extract uneven (odd) rows
# Splitting by row position pairs the two copies of each record. This relies
# on every duplicated ID occurring exactly twice, as the frequency table of
# the ID check shows for this download.
df1 <- duplicates$openalex$api$id$data[
  seq(1, nrow(duplicates$openalex$api$id$data), by = 2),
]
# Extract even rows
df2 <- duplicates$openalex$api$id$data[
  seq(2, nrow(duplicates$openalex$api$id$data), by = 2),
]

# Compare the two data frames
summary(arsenal::comparedf(df1, df2))
Table: Summary of data.frames
version arg ncol nrow
-------- ---- ----- -----
x df1 39 2118
y df2 39 2118
Table: Summary of overall comparison
statistic value
------------------------------------------------------------ ------
Number of by-variables 0
Number of non-by variables in common 39
Number of variables compared 39
Number of variables in x but not y 0
Number of variables in y but not x 0
Number of variables compared with some values unequal 1
Number of variables compared with all values equal 38
Number of observations in common 2118
Number of observations in x but not y 0
Number of observations in y but not x 0
Number of observations with some compared variables unequal 2118
Number of observations with all compared variables equal 0
Number of values unequal 2118
Table: Variables not shared
------------------------
No variables not shared
------------------------
Table: Other variables not compared
--------------------------------
No other variables not compared
--------------------------------
Table: Observations not shared
---------------------------
No observations not shared
---------------------------
Table: Differences detected by variable
var.x var.y n NAs
---------------------------- ---------------------------- ----- ----
id id 0 0
title title 0 0
display_name display_name 0 0
author author 0 0
ab ab 0 0
publication_date publication_date 0 0
relevance_score relevance_score 2118 0
so so 0 0
so_id so_id 0 0
host_organization host_organization 0 0
issn_l issn_l 0 0
url url 0 0
pdf_url pdf_url 0 0
license license 0 0
version version 0 0
first_page first_page 0 0
last_page last_page 0 0
volume volume 0 0
issue issue 0 0
is_oa is_oa 0 0
is_oa_anywhere is_oa_anywhere 0 0
oa_status oa_status 0 0
oa_url oa_url 0 0
any_repository_has_fulltext any_repository_has_fulltext 0 0
language language 0 0
grants grants 0 0
cited_by_count cited_by_count 0 0
counts_by_year counts_by_year 0 0
publication_year publication_year 0 0
cited_by_api_url cited_by_api_url 0 0
ids ids 0 0
doi doi 0 0
type type 0 0
referenced_works referenced_works 0 0
related_works related_works 0 0
is_paratext is_paratext 0 0
is_retracted is_retracted 0 0
concepts concepts 0 0
topics topics 0 0
Table: Differences detected (2108 not shown)
var.x var.y ..row.names.. values.x values.y row.x row.y
---------------- ---------------- -------------- ---------- ---------- ------ ------
relevance_score relevance_score 1 5.394589 5.377582 1 1
relevance_score relevance_score 2 2.179161 2.139825 2 2
relevance_score relevance_score 3 0.8233411 0.8151723 3 3
relevance_score relevance_score 4 1.192189 1.161013 4 4
relevance_score relevance_score 5 2.006378 1.954127 5 5
relevance_score relevance_score 6 1.195497 1.183922 6 6
relevance_score relevance_score 7 0.6909628 0.6844353 7 7
relevance_score relevance_score 8 0.8900583 0.879919 8 8
relevance_score relevance_score 9 8.128157 8.087495 9 9
relevance_score relevance_score 10 0.7075946 0.7026421 10 10
Table: Non-identical attributes
----------------------------
No non-identical attributes
----------------------------
based on DOI
# Check for duplicates based on DOI
# Count the rows per DOI, then tabulate how often each count occurs.
references$openalex$api %>%
  distinct(id, .keep_all = TRUE) %>% # exclude ID duplicates
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>%
  summarise(n = n()) %>%
  frq(n, sort.frq = "desc")
n <integer>
# total N=26496 valid N=26496 mean=1.00 sd=0.02
Value | N | Raw % | Valid % | Cum. %
----------------------------------------
1 | 26488 | 99.97 | 99.97 | 99.97
2 | 8 | 0.03 | 0.03 | 100.00
<NA> | 0 | 0.00 | <NA> | <NA>
# Extract duplicated DOIs
duplicates$openalex$api$doi$string <- references$openalex$api %>%
  distinct(id, .keep_all = TRUE) %>% # exclude ID duplicates
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>%
  pull(doi)

# Extract cases with duplicated DOIs
duplicates$openalex$api$doi$data <- references$openalex$api %>%
  filter(doi %in% duplicates$openalex$api$doi$string)

# Extract cases to be deleted
# For each duplicated DOI, the record with the smallest numeric part of its
# OpenAlex ID is marked for deletion.
# NOTE(review): this treats a larger OpenAlex work number as the more recent
# entry -- confirm that assumption holds.
duplicates$openalex$api$doi$delete <- duplicates$openalex$api$doi$data %>%
  # numeric part of the ID, i.e. everything after the last "W"
  mutate(id_number = as.numeric(sub(".*W", "", id))) %>%
  group_by(doi) %>% # Group by `doi`
  slice_min(id_number, n = 1, with_ties = FALSE) %>%
  ungroup() %>% # do not leave the stored table grouped
  select(-id_number)
Summary
- The duplicates based on the OpenAlex ID differ only in `relevance_score`, a column that is less relevant for the analysis. These duplicates are therefore eliminated with the `distinct()` function.
- The duplicates based on the DOI are a result of pre-prints being published. Therefore, only the most recent entry for each duplicated DOI will be kept.
Transformation
# Build the de-duplicated data set
references$openalex$raw <- references$openalex$api %>%
  distinct(id, .keep_all = TRUE) %>% # delete duplicates based on ID
  # delete the one case per duplicated DOI that was marked for deletion
  anti_join(duplicates$openalex$api$doi$delete, by = "id")
Check transformation
# Verify that every OpenAlex ID now occurs exactly once
references$openalex$raw %>%
  group_by(id) %>%
  summarise(n = n()) %>%
  frq(n, sort.frq = "desc")
n <integer>
# total N=28520 valid N=28520 mean=1.00 sd=0.00
Value | N | Raw % | Valid % | Cum. %
----------------------------------------
1 | 28520 | 100 | 100 | 100
<NA> | 0 | 0 | <NA> | <NA>
# Verify that every DOI now occurs exactly once
references$openalex$raw %>%
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>%
  summarise(n = n()) %>%
  frq(n, sort.frq = "desc")
n <integer>
# total N=26496 valid N=26496 mean=1.00 sd=0.00
Value | N | Raw % | Valid % | Cum. %
----------------------------------------
1 | 26496 | 100 | 100 | 100
<NA> | 0 | 0 | <NA> | <NA>