API mining: OpenAlex

Projektseminar

Published

17.12.2024

Preparation

# Load necessary packages
pacman::p_load(
  here, qs, 
  magrittr, janitor,
  naniar, visdat,
  easystats, sjmisc,
  ggpubr, 
  gt, gtExtras, gtsummary,
  openalexR, bibliometrix, 
  tidyverse
)

Mining OpenAlex API

Set credentials

# Register a contact e-mail with openalexR so that API requests are routed to
# the OpenAlex "polite pool", which gives faster response times
options(
  openalexR.mailto = "christoph.adrian@fau.de"
)

Initial OpenAlex API query

# Download works from the OpenAlex API and store them as
# references$openalex$api (the nested list is built in one expression)
references <- list(
  openalex = list(
    api = openalexR::oa_fetch(
      entity = "works",
      # Title/abstract search: (AI / conversational-agent terms) AND
      # (user-perception terms such as anthropomorphism, UX, trust, privacy)
      title_and_abstract.search = '("artificial intelligence" OR AI OR "chatbot" OR "AI-based chatbot" OR "artificial intelligence-based chatbot" OR "chat agent" OR "voice bot" OR "voice assistant" OR "voice-based assistant" OR "conversational agent" OR "conversational assistant" OR "conversational AI" OR "AI-based assistant" OR "artificial intelligence-based assistant" OR "virtual assistant" OR "intelligent assistant" OR "digital assistant" OR "smart speaker" OR chatgpt OR "google gemini" OR "google bard" OR "bing chat" OR "microsoft copilot" OR "claude ai" OR "perplexity ai") AND (anthropomorphism OR humanlike OR humanness OR humanized OR "user experience" OR UX OR usability OR trust* OR "conversational experience" OR CUX OR "conversation design" OR safety OR privacy)',
      # Filters: publication period, primary-topic field, language, work type
      publication_year = "2016-2025",
      primary_topic.field.id = c(
        "fields/33", # Social Science
        "fields/32" # Psychology
      ),
      language = "en",
      type = c("article", "conference-paper", "preprint"),
      verbose = TRUE
    )
  )
)

Quality control

# Summarise every column of the downloaded data (type, missing values,
# number of unique values, distribution) as a first quality check
references$openalex$api %>% 
  skimr::skim()

Data summary
Name	Piped data
Number of rows	30646
Number of columns	39
_______________________
Column type frequency:
character	23
list	8
logical	5
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique
id	0	1.00	30	32	0	28528
title	1	1.00	4	500	0	27743
display_name	1	1.00	4	500	0	27743
ab	1188	0.96	0	51771	10	26719
publication_date	0	1.00	10	10	0	2672
so	5735	0.81	1	244	0	7969
so_id	5735	0.81	27	32	0	7994
host_organization	13032	0.57	3	155	0	1274
issn_l	10525	0.66	9	9	0	7295
url	137	1.00	21	273	0	28390
pdf_url	18826	0.39	29	359	0	10860
license	20216	0.34	3	21	0	11
version	12190	0.60	15	16	0	3
first_page	13186	0.57	1	15	0	4030
last_page	13299	0.57	1	15	0	4038
volume	13362	0.56	1	25	0	604
issue	15657	0.49	1	30	0	629
oa_status	0	1.00	4	7	0	6
oa_url	11799	0.61	20	359	0	17197
language	0	1.00	2	2	0	1
cited_by_api_url	0	1.00	53	55	0	28528
doi	2133	0.93	26	96	0	26496
type	0	1.00	7	8	0	2

Variable type: list

skim_variable	n_missing	complete_rate	n_unique	min_length	max_length
author	249	0.99	27647	1	12
grants	28143	0.08	2046	1	33
counts_by_year	15039	0.51	4415	1	2
ids	0	1.00	28528	1	5
referenced_works	10836	0.65	18263	1	442
related_works	423	0.99	20233	1	20
concepts	0	1.00	28523	5	5
topics	0	1.00	27999	5	5

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
is_oa	134	1	0.56	TRU: 17130, FAL: 13382
is_oa_anywhere	0	1	0.61	TRU: 18797, FAL: 11849
any_repository_has_fulltext	0	1	0.30	FAL: 21399, TRU: 9247
is_paratext	0	1	0.00	FAL: 30646
is_retracted	0	1	0.00	FAL: 30595, TRU: 51

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
relevance_score	1	9.48	26.05	0.04	0.84	3.09	7.01	986.22	▇▁▁▁▁
cited_by_count	1	7.47	32.62	0.00	0.00	1.00	4.00	1878.00	▇▁▁▁▁
publication_year	1	2021.95	1.99	2016.00	2021.00	2023.00	2024.00	2025.00	▁▂▅▇▆

Quick overview

Nearly all references have an abstract (approx. 96 percent) and a DOI (approx. 93 percent), which are the critical pieces of information for the analysis.
The gap between the number of rows (30,646) and the number of unique OpenAlex IDs (28,528) indicates that the data contain duplicates.

Check duplicates

based on OpenAlex ID

# Tabulate how often each OpenAlex ID occurs; any frequency above 1 marks a
# duplicated record
references$openalex$api %>%
  count(id) %>%
  frq(n, sort.frq = "desc")

n <integer> 
# total N=28528 valid N=28528 mean=1.07 sd=0.26

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 26410 | 92.58 |   92.58 |  92.58
    2 |  2118 |  7.42 |    7.42 | 100.00
 <NA> |     0 |  0.00 |    <NA> |   <NA>

duplicates <- list()

# OpenAlex IDs that occur more than once in the API result
duplicates$openalex$api$id$string <- references$openalex$api %>%
  count(id) %>%
  filter(n > 1) %>%
  pull(id)

# All rows that belong to a duplicated ID, sorted so that copies are adjacent
duplicates$openalex$api$id$data <- references$openalex$api %>%
  filter(id %in% duplicates$openalex$api$id$string) %>%
  arrange(id)

# Split the duplicates into first and second copy per ID.
# Selecting by position within each ID (instead of odd/even row numbers) keeps
# df1 and df2 aligned row by row even if an ID occurs more than twice, and it
# does not fail when there are no duplicates at all.
df1 <- duplicates$openalex$api$id$data %>%
  group_by(id) %>%
  slice(1) %>%
  ungroup()
df2 <- duplicates$openalex$api$id$data %>%
  group_by(id) %>%
  slice(2) %>%
  ungroup()

# Compare first and second copies row by row to see which columns differ
summary(arsenal::comparedf(df1, df2))



Table: Summary of data.frames

version   arg    ncol   nrow
--------  ----  -----  -----
x         df1      39   2118
y         df2      39   2118



Table: Summary of overall comparison

statistic                                                      value
------------------------------------------------------------  ------
Number of by-variables                                             0
Number of non-by variables in common                              39
Number of variables compared                                      39
Number of variables in x but not y                                 0
Number of variables in y but not x                                 0
Number of variables compared with some values unequal              1
Number of variables compared with all values equal                38
Number of observations in common                                2118
Number of observations in x but not y                              0
Number of observations in y but not x                              0
Number of observations with some compared variables unequal     2118
Number of observations with all compared variables equal           0
Number of values unequal                                        2118



Table: Variables not shared

                         
 ------------------------
 No variables not shared 
 ------------------------



Table: Other variables not compared

                                 
 --------------------------------
 No other variables not compared 
 --------------------------------



Table: Observations not shared

                            
 ---------------------------
 No observations not shared 
 ---------------------------



Table: Differences detected by variable

var.x                         var.y                             n   NAs
----------------------------  ----------------------------  -----  ----
id                            id                                0     0
title                         title                             0     0
display_name                  display_name                      0     0
author                        author                            0     0
ab                            ab                                0     0
publication_date              publication_date                  0     0
relevance_score               relevance_score                2118     0
so                            so                                0     0
so_id                         so_id                             0     0
host_organization             host_organization                 0     0
issn_l                        issn_l                            0     0
url                           url                               0     0
pdf_url                       pdf_url                           0     0
license                       license                           0     0
version                       version                           0     0
first_page                    first_page                        0     0
last_page                     last_page                         0     0
volume                        volume                            0     0
issue                         issue                             0     0
is_oa                         is_oa                             0     0
is_oa_anywhere                is_oa_anywhere                    0     0
oa_status                     oa_status                         0     0
oa_url                        oa_url                            0     0
any_repository_has_fulltext   any_repository_has_fulltext       0     0
language                      language                          0     0
grants                        grants                            0     0
cited_by_count                cited_by_count                    0     0
counts_by_year                counts_by_year                    0     0
publication_year              publication_year                  0     0
cited_by_api_url              cited_by_api_url                  0     0
ids                           ids                               0     0
doi                           doi                               0     0
type                          type                              0     0
referenced_works              referenced_works                  0     0
related_works                 related_works                     0     0
is_paratext                   is_paratext                       0     0
is_retracted                  is_retracted                      0     0
concepts                      concepts                          0     0
topics                        topics                            0     0



Table: Differences detected (2108 not shown)

var.x             var.y              ..row.names..  values.x    values.y     row.x   row.y
----------------  ----------------  --------------  ----------  ----------  ------  ------
relevance_score   relevance_score                1  5.394589    5.377582         1       1
relevance_score   relevance_score                2  2.179161    2.139825         2       2
relevance_score   relevance_score                3  0.8233411   0.8151723        3       3
relevance_score   relevance_score                4  1.192189    1.161013         4       4
relevance_score   relevance_score                5  2.006378    1.954127         5       5
relevance_score   relevance_score                6  1.195497    1.183922         6       6
relevance_score   relevance_score                7  0.6909628   0.6844353        7       7
relevance_score   relevance_score                8  0.8900583   0.879919         8       8
relevance_score   relevance_score                9  8.128157    8.087495         9       9
relevance_score   relevance_score               10  0.7075946   0.7026421       10      10



Table: Non-identical attributes

                             
 ----------------------------
 No non-identical attributes 
 ----------------------------

based on DOI

# Tabulate how often each DOI occurs among records with a unique OpenAlex ID
references$openalex$api %>%
  distinct(id, .keep_all = TRUE) %>% # keep one row per OpenAlex ID
  drop_na(doi) %>% # records without a DOI cannot be matched
  count(doi) %>%
  frq(n, sort.frq = "desc")

n <integer> 
# total N=26496 valid N=26496 mean=1.00 sd=0.02

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 26488 | 99.97 |   99.97 |  99.97
    2 |     8 |  0.03 |    0.03 | 100.00
 <NA> |     0 |  0.00 |    <NA> |   <NA>

# Extract duplicated DOIs (ID duplicates and records without DOI are excluded
# first, so only DOIs shared by different OpenAlex IDs remain)
duplicates$openalex$api$doi$string <- references$openalex$api %>%
  distinct(id, .keep_all = TRUE) %>%
  filter(!is.na(doi)) %>%
  count(doi) %>%
  filter(n > 1) %>%
  pull(doi)

# Extract cases with duplicated DOIs
duplicates$openalex$api$doi$data <- references$openalex$api %>%
  filter(doi %in% duplicates$openalex$api$doi$string)

# Extract cases to be deleted: for every duplicated DOI, all records except the
# one with the highest OpenAlex ID number. Ranking within the DOI (instead of
# picking a single minimum) also covers DOIs shared by more than two records.
duplicates$openalex$api$doi$delete <- duplicates$openalex$api$doi$data %>%
  distinct(id, .keep_all = TRUE) %>% # one row per OpenAlex ID
  mutate(id_number = as.numeric(sub(".*W", "", id))) %>% # numeric part after "W"
  group_by(doi) %>%
  filter(min_rank(desc(id_number)) > 1) %>% # rank 1 = highest ID number = kept
  ungroup() %>%
  select(-id_number)

Summary

The duplicates based on the OpenAlex ID differ only in the relevance_score column, which is not relevant for the analysis. These duplicates are therefore removed with the distinct() function.
The duplicates based on the DOI result from preprints that were subsequently published. Therefore, only the most recent entry for each duplicated DOI (the one with the higher OpenAlex ID number) is kept.

Transformation

# Build the cleaned data set: one row per OpenAlex ID, without the records
# that were flagged for deletion as DOI duplicates
references$openalex$raw <- references$openalex$api %>%
  distinct(id, .keep_all = TRUE) %>%
  filter(!id %in% duplicates$openalex$api$doi$delete$id)

Check transformation

# Verify that every OpenAlex ID now occurs exactly once
references$openalex$raw %>%
  count(id) %>%
  frq(n, sort.frq = "desc")

n <integer> 
# total N=28520 valid N=28520 mean=1.00 sd=0.00

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 28520 |   100 |     100 |    100
 <NA> |     0 |     0 |    <NA> |   <NA>

# Verify that every DOI now occurs exactly once
references$openalex$raw %>%
  drop_na(doi) %>% # records without a DOI are not part of this check
  count(doi) %>%
  frq(n, sort.frq = "desc")

n <integer> 
# total N=26496 valid N=26496 mean=1.00 sd=0.00

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 26496 |   100 |     100 |    100
 <NA> |     0 |     0 |    <NA> |   <NA>