API mining: OpenAlex

Projektseminar

Published

17.12.2024

Preparation

# Load necessary packages
pacman::p_load(
  here, qs, 
  magrittr, janitor,
  naniar, visdat,
  easystats, sjmisc,
  ggpubr, 
  gt, gtExtras, gtsummary,
  openalexR, bibliometrix, 
  tidyverse
)

Mining OpenAlex API

Set credentials

# Set the openalexR.mailto option so that requests go to the polite pool for
# faster response times.
# NOTE(review): replace the address with your own when re-running this document.
options(openalexR.mailto = "christoph.adrian@fau.de")

Initial OpenAlex API query

# Top-level container for all reference data collected in this document
references <- list()

# Download data via API: fetch OpenAlex records of entity type "works" whose
# title or abstract matches the search string below, i.e.
# (AI / chatbot / assistant terms) AND (anthropomorphism / UX / usability /
# trust / safety / privacy terms).
# NOTE(review): the query contains `trust*`; confirm that the OpenAlex search
# actually honours wildcard syntax -- TODO verify.
# NOTE(review): the skim output of this result reports only two distinct
# `type` values; confirm that "conference-paper" is a valid OpenAlex work type.
references$openalex$api <- openalexR::oa_fetch(
  entity = "works",
  title_and_abstract.search = '("artificial intelligence" OR AI OR "chatbot" OR "AI-based chatbot" OR "artificial intelligence-based chatbot" OR "chat agent" OR "voice bot" OR "voice assistant" OR "voice-based assistant" OR "conversational agent" OR "conversational assistant" OR "conversational AI" OR "AI-based assistant" OR "artificial intelligence-based assistant" OR "virtual assistant" OR "intelligent assistant" OR "digital assistant" OR "smart speaker" OR chatgpt OR "google gemini" OR "google bard" OR "bing chat" OR "microsoft copilot" OR "claude ai" OR "perplexity ai") AND (anthropomorphism OR humanlike OR humanness OR humanized OR "user experience" OR UX OR usability OR trust* OR "conversational experience" OR CUX OR "conversation design" OR safety OR privacy)',
  publication_year = "2016-2025",
  primary_topic.field.id = c(
    "fields/33", # Social Science
    "fields/32" # Psychology
  ),
  language = "en",
  type = c("article", "conference-paper", "preprint"),
  verbose = TRUE
)

Quality control

# Quality control: per-column overview (missing values, completeness, basic
# statistics) of the downloaded data
references$openalex$api %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 30646
Number of columns 39
_______________________
Column type frequency:
character 23
list 8
logical 5
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
id 0 1.00 30 32 0 28528 0
title 1 1.00 4 500 0 27743 0
display_name 1 1.00 4 500 0 27743 0
ab 1188 0.96 0 51771 10 26719 0
publication_date 0 1.00 10 10 0 2672 0
so 5735 0.81 1 244 0 7969 0
so_id 5735 0.81 27 32 0 7994 0
host_organization 13032 0.57 3 155 0 1274 0
issn_l 10525 0.66 9 9 0 7295 0
url 137 1.00 21 273 0 28390 0
pdf_url 18826 0.39 29 359 0 10860 0
license 20216 0.34 3 21 0 11 0
version 12190 0.60 15 16 0 3 0
first_page 13186 0.57 1 15 0 4030 0
last_page 13299 0.57 1 15 0 4038 0
volume 13362 0.56 1 25 0 604 0
issue 15657 0.49 1 30 0 629 0
oa_status 0 1.00 4 7 0 6 0
oa_url 11799 0.61 20 359 0 17197 0
language 0 1.00 2 2 0 1 0
cited_by_api_url 0 1.00 53 55 0 28528 0
doi 2133 0.93 26 96 0 26496 0
type 0 1.00 7 8 0 2 0

Variable type: list

skim_variable n_missing complete_rate n_unique min_length max_length
author 249 0.99 27647 1 12
grants 28143 0.08 2046 1 33
counts_by_year 15039 0.51 4415 1 2
ids 0 1.00 28528 1 5
referenced_works 10836 0.65 18263 1 442
related_works 423 0.99 20233 1 20
concepts 0 1.00 28523 5 5
topics 0 1.00 27999 5 5

Variable type: logical

skim_variable n_missing complete_rate mean count
is_oa 134 1 0.56 TRU: 17130, FAL: 13382
is_oa_anywhere 0 1 0.61 TRU: 18797, FAL: 11849
any_repository_has_fulltext 0 1 0.30 FAL: 21399, TRU: 9247
is_paratext 0 1 0.00 FAL: 30646
is_retracted 0 1 0.00 FAL: 30595, TRU: 51

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
relevance_score 0 1 9.48 26.05 0.04 0.84 3.09 7.01 986.22 ▇▁▁▁▁
cited_by_count 0 1 7.47 32.62 0.00 0.00 1.00 4.00 1878.00 ▇▁▁▁▁
publication_year 0 1 2021.95 1.99 2016.00 2021.00 2023.00 2024.00 2025.00 ▁▂▅▇▆
Quick overview
  • Nearly all references have an abstract (approx. 96 percent) and a DOI (approx. 93 percent), which are the critical pieces of information for the analysis.
  • The difference between the number of rows (30,646) and the number of unique OpenAlex IDs (28,528) indicates that there are duplicates in the data.

Check duplicates

based on OpenAlex ID

# Tabulate how often each OpenAlex ID occurs (1 = unique, > 1 = duplicated)
references$openalex$api %>% 
  dplyr::count(id, name = "n") %>% 
  frq(n, sort.frq = "desc")
n <integer> 
# total N=28528 valid N=28528 mean=1.07 sd=0.26

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 26410 | 92.58 |   92.58 |  92.58
    2 |  2118 |  7.42 |    7.42 | 100.00
 <NA> |     0 |  0.00 |    <NA> |   <NA>
# Container for everything related to duplicate handling
duplicates <- list()

# OpenAlex IDs that occur more than once in the API result
duplicates$openalex$api$id$string <- references$openalex$api %>% 
  dplyr::count(id, name = "n") %>% 
  filter(n > 1) %>% 
  pull(id)

# All rows carrying one of those IDs, sorted so that rows with the same ID
# are adjacent
duplicates$openalex$api$id$data <- references$openalex$api %>% 
  filter(id %in% duplicates$openalex$api$id$string) %>% 
  arrange(id)
# Split the ID duplicates into the first (df1) and the second (df2) occurrence
# of each ID. Splitting by occurrence within each ID (instead of by odd/even
# row position) keeps the two data frames aligned row by row even if an ID
# occurs more than twice, and does not fail when there are no duplicates.
# Any third or later occurrence of an ID is not part of this pairwise check.
dup_by_id <- duplicates$openalex$api$id$data %>% 
  arrange(id) %>% # rows of df1 and df2 are matched by position, so sort by ID
  group_by(id)
df1 <- dup_by_id %>% slice(1) %>% ungroup() # first occurrence of each ID
df2 <- dup_by_id %>% slice(2) %>% ungroup() # second occurrence of each ID

# Compare the two data frames column by column to see where duplicates differ
summary(arsenal::comparedf(df1, df2))


Table: Summary of data.frames

version   arg    ncol   nrow
--------  ----  -----  -----
x         df1      39   2118
y         df2      39   2118



Table: Summary of overall comparison

statistic                                                      value
------------------------------------------------------------  ------
Number of by-variables                                             0
Number of non-by variables in common                              39
Number of variables compared                                      39
Number of variables in x but not y                                 0
Number of variables in y but not x                                 0
Number of variables compared with some values unequal              1
Number of variables compared with all values equal                38
Number of observations in common                                2118
Number of observations in x but not y                              0
Number of observations in y but not x                              0
Number of observations with some compared variables unequal     2118
Number of observations with all compared variables equal           0
Number of values unequal                                        2118



Table: Variables not shared

                         
 ------------------------
 No variables not shared 
 ------------------------



Table: Other variables not compared

                                 
 --------------------------------
 No other variables not compared 
 --------------------------------



Table: Observations not shared

                            
 ---------------------------
 No observations not shared 
 ---------------------------



Table: Differences detected by variable

var.x                         var.y                             n   NAs
----------------------------  ----------------------------  -----  ----
id                            id                                0     0
title                         title                             0     0
display_name                  display_name                      0     0
author                        author                            0     0
ab                            ab                                0     0
publication_date              publication_date                  0     0
relevance_score               relevance_score                2118     0
so                            so                                0     0
so_id                         so_id                             0     0
host_organization             host_organization                 0     0
issn_l                        issn_l                            0     0
url                           url                               0     0
pdf_url                       pdf_url                           0     0
license                       license                           0     0
version                       version                           0     0
first_page                    first_page                        0     0
last_page                     last_page                         0     0
volume                        volume                            0     0
issue                         issue                             0     0
is_oa                         is_oa                             0     0
is_oa_anywhere                is_oa_anywhere                    0     0
oa_status                     oa_status                         0     0
oa_url                        oa_url                            0     0
any_repository_has_fulltext   any_repository_has_fulltext       0     0
language                      language                          0     0
grants                        grants                            0     0
cited_by_count                cited_by_count                    0     0
counts_by_year                counts_by_year                    0     0
publication_year              publication_year                  0     0
cited_by_api_url              cited_by_api_url                  0     0
ids                           ids                               0     0
doi                           doi                               0     0
type                          type                              0     0
referenced_works              referenced_works                  0     0
related_works                 related_works                     0     0
is_paratext                   is_paratext                       0     0
is_retracted                  is_retracted                      0     0
concepts                      concepts                          0     0
topics                        topics                            0     0



Table: Differences detected (2108 not shown)

var.x             var.y              ..row.names..  values.x    values.y     row.x   row.y
----------------  ----------------  --------------  ----------  ----------  ------  ------
relevance_score   relevance_score                1  5.394589    5.377582         1       1
relevance_score   relevance_score                2  2.179161    2.139825         2       2
relevance_score   relevance_score                3  0.8233411   0.8151723        3       3
relevance_score   relevance_score                4  1.192189    1.161013         4       4
relevance_score   relevance_score                5  2.006378    1.954127         5       5
relevance_score   relevance_score                6  1.195497    1.183922         6       6
relevance_score   relevance_score                7  0.6909628   0.6844353        7       7
relevance_score   relevance_score                8  0.8900583   0.879919         8       8
relevance_score   relevance_score                9  8.128157    8.087495         9       9
relevance_score   relevance_score               10  0.7075946   0.7026421       10      10



Table: Non-identical attributes

                             
 ----------------------------
 No non-identical attributes 
 ----------------------------

based on DOI

# Tabulate how often each DOI occurs among ID-unique works that have a DOI
references$openalex$api %>% 
  distinct(id, .keep_all = TRUE) %>% # one row per OpenAlex ID
  tidyr::drop_na(doi) %>% # works without a DOI are left out of the DOI check
  dplyr::count(doi, name = "n") %>% 
  frq(n, sort.frq = "desc")
n <integer> 
# total N=26496 valid N=26496 mean=1.00 sd=0.02

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 26488 | 99.97 |   99.97 |  99.97
    2 |     8 |  0.03 |    0.03 | 100.00
 <NA> |     0 |  0.00 |    <NA> |   <NA>
# Extract duplicated DOIs (among ID-unique works that have a DOI)
duplicates$openalex$api$doi$string <- references$openalex$api %>%  
  distinct(id, .keep_all = TRUE) %>% 
  filter(!is.na(doi)) %>%
  group_by(doi) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>% 
  pull(doi)

# Extract cases with duplicated DOIs
# (taken from the raw API result, so ID duplicates may still be included here)
duplicates$openalex$api$doi$data <- references$openalex$api %>% 
  filter(doi %in% duplicates$openalex$api$doi$string)

# Extract cases to be deleted: for each duplicated DOI, every record except the
# one with the highest numeric part of the OpenAlex ID. This also resolves DOIs
# that occur more than twice, where flagging a single record would not suffice.
# NOTE(review): this treats the highest ID number as the most recent entry --
# confirm that OpenAlex work IDs increase with record creation.
duplicates$openalex$api$doi$delete <- duplicates$openalex$api$doi$data %>%
  distinct(id, .keep_all = TRUE) %>% # one row per OpenAlex ID
  mutate(id_number = as.numeric(sub(".*W", "", id))) %>% # numeric part after "W"
  group_by(doi) %>% # Group by `doi`
  filter(id_number < max(id_number)) %>% # keep everything but the highest ID
  ungroup() %>% # do not leave the stored result grouped
  arrange(doi) %>% 
  select(-id_number) 
Summary
  • The duplicates based on the OpenAlex ID differ only in the relevance_score column, which is not relevant for the analysis. These duplicates are therefore removed with the distinct() function.
  • The duplicates based on the DOI are the result of preprints that were subsequently published. Therefore, only the most recent entry for each duplicated DOI (the one with the higher OpenAlex ID number) is kept.

Transformation

# Build the cleaned data set: keep one row per OpenAlex ID, then drop the
# records flagged for deletion in the DOI duplicate check
references$openalex$raw <- references$openalex$api %>% 
  distinct(id, .keep_all = TRUE) %>% 
  filter(!id %in% duplicates$openalex$api$doi$delete$id)

Check transformation

# After the transformation every OpenAlex ID should occur exactly once
references$openalex$raw %>% 
  dplyr::count(id, name = "n") %>% 
  frq(n, sort.frq = "desc")
n <integer> 
# total N=28520 valid N=28520 mean=1.00 sd=0.00

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 28520 |   100 |     100 |    100
 <NA> |     0 |     0 |    <NA> |   <NA>
# After the transformation every DOI should occur exactly once
references$openalex$raw %>% 
  tidyr::drop_na(doi) %>% # works without a DOI are left out of the DOI check
  dplyr::count(doi, name = "n") %>% 
  frq(n, sort.frq = "desc")
n <integer> 
# total N=26496 valid N=26496 mean=1.00 sd=0.00

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 26496 |   100 |     100 |    100
 <NA> |     0 |     0 |    <NA> |   <NA>