Combine OpenAlex & Scopus

Projektseminar

Published

26.11.2024

Preparation

# Load necessary packages
pacman::p_load(
  here, qs, 
  magrittr, janitor,
  naniar, visdat,
  easystats, sjmisc,
  ggpubr, 
  gt, gtExtras, gtsummary,
  openalexR, bibliometrix, 
  tidyverse
)

Comparison with Scopus

references <- qs::qread(here("local_data/references_openalex.qs"))
references$scopus$raw <- qs::qread(here("local_data/references_scopus.qs"))$raw

Identification of “missing” references

# Identify the number of scopus references missing in the openalex data
references$scopus$raw %>% 
  filter(!is.na(doi)) %>% 
  mutate(doi_full = paste0("https://doi.org/", doi)) %>% 
  filter(!(doi_full %in% references$openalex$raw$doi)) %>% 
  glimpse()
Rows: 4,819
Columns: 17
$ scopusID     <chr> "2-s2.0-85208654899", "2-s2.0-85201379794", "2-s2.0-85209…
$ doi          <chr> "10.1016/j.csi.2024.103940", "10.1016/j.csi.2024.103903",…
$ pmid         <chr> NA, NA, NA, NA, NA, NA, NA, NA, "39389117", NA, NA, NA, N…
$ authors      <chr> "Alier M.|Pereira J.|García-Peñalvo F.J.|Casañ M.J.|Cabré…
$ affiliations <chr> "Universidad de Salamanca|Universitat Politécnica de Cata…
$ countries    <chr> "Spain", "Russian Federation|Uzbekistan", "South Korea|Ho…
$ year         <chr> "2025", "2025", "2025", "2025", "2025", "2025", "2025", "…
$ articletitle <chr> "LAMB: An open-source software framework to create artifi…
$ journal      <chr> "Computer Standards and Interfaces", "Computer Standards …
$ volume       <chr> "92", "92", "210", "194", "163", "149", "95", "106", "369…
$ issue        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "1", NA, "1", "1", NA…
$ pages        <chr> "", "", "", "", "", "", "", "", "625-632", "", "", "", "2…
$ keywords     <chr> "Education domain|Generative artificial intelligence|IMS …
$ abstract     <chr> "This paper presents LAMB (Learning Assistant Manager and…
$ ptype        <chr> "Article", "Article", "Article", "Article", "Article", "A…
$ timescited   <chr> "0", "0", "0", "0", "0", "0", "2", "1", "0", "0", "0", "0…
$ doi_full     <chr> "https://doi.org/10.1016/j.csi.2024.103940", "https://doi…

Extract DOIs for missing references

missing_references <- list()

# Format DOIs 
missing_references$scopus_dois <- references$scopus$raw %>%
  filter(!is.na(doi)) %>% 
  mutate(doi = paste0("https://doi.org/", doi)) %>% 
  filter(!(doi %in% references$openalex$raw$doi)) %>%
  pull(doi)

# Split DOIs into chunks of 25
chunk_size <- 10
missing_references$scopus_dois_chunks <- split(missing_references$scopus_dois, ceiling(seq_along(missing_references$scopus_dois) / chunk_size))

Completion of OpenAlex data

Mining missing references via OpenAlex API

# Download missing references via API
missing_references$data$chunks <- map(
  missing_references$scopus_dois_chunks, function(chunk) {
  Sys.sleep(2)  # Pause for 1 second
  
  tryCatch(
    {
      # Attempt the API call
      openalexR::oa_fetch(
        entity = "works",
        doi = chunk,
        verbose = TRUE
      )
    },
    error = function(e) {
      # Handle the error
      message("Error with chunk: ", paste(chunk, collapse = ", "))
      message("Error message: ", e$message)
      NULL  # Return NULL for failed chunks
    }
  )
})
# Combine rows
missing_references$data$combined <- bind_rows(missing_references$data$chunks) %>% 
  mutate(mining_source = "openalex_rerun_doi")
qs::qsave(missing_references, file = here("local_data/missing_references.qs"))

Quality control

# Check for duplicates based on OpenAlex ID
missing_references$data$combined %>% 
  group_by(id) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")
n <integer> 
# total N=4736 valid N=4736 mean=1.00 sd=0.00

Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 4736 |   100 |     100 |    100
 <NA> |    0 |     0 |    <NA> |   <NA>
# Check for duplicates based on DOI
missing_references$data$combined %>% 
  distinct(id, .keep_all = TRUE) %>% # exclude ID duplicates
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")
n <integer> 
# total N=4734 valid N=4734 mean=1.00 sd=0.02

Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 4732 | 99.96 |   99.96 |  99.96
    2 |    2 |  0.04 |    0.04 | 100.00
 <NA> |    0 |  0.00 |    <NA> |   <NA>
duplicates <- list()

# Extract duplicated IDs
duplicates$missing_references$combined$doi$string  <- missing_references$data$combined %>%  
  distinct(id, .keep_all = TRUE) %>% 
  filter(!is.na(doi)) %>%
  group_by(doi) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>% 
  pull(doi)

# Extract cases with duplicated IDs
duplicates$missing_references$combined$doi$data <- missing_references$data$combined %>% 
  filter(doi %in% duplicates$missing_references$combined$doi$string)

# Extract cases to be deleted
duplicates$missing_references$combined$doi$delete <- duplicates$missing_references$combined$doi$data %>%
  mutate(id_number = as.numeric(sub(".*W", "", id))) %>% 
  group_by(doi) %>% # Group by `doi`
  slice_min(id_number, n = 1, with_ties = FALSE) %>% 
  select(-id_number) 
missing_references$data$raw <- missing_references$data$combined %>%
  distinct(id, .keep_all = TRUE) %>%  # delete duplicates based on ID 
  anti_join(duplicates$missing_references$combined$doi$delete, by = "id") 

Merging OpenAlex data

# Combine the missing references with the existing data
references$openalex$combined$api <- references$openalex$raw %>%
  mutate(mining_source = "openalex_initial") %>% 
  bind_rows(., missing_references$data$raw)

Quality control

references$openalex$combined$api %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 33254
Number of columns 40
_______________________
Column type frequency:
character 24
list 8
logical 5
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
id 0 1.00 30 32 0 33079 0
title 6 1.00 3 500 0 32287 0
display_name 6 1.00 3 500 0 32287 0
ab 2325 0.93 0 51771 11 30110 0
publication_date 0 1.00 10 10 0 3052 0
so 6257 0.81 1 244 0 8410 0
so_id 6257 0.81 27 32 0 8436 0
host_organization 13463 0.60 3 155 0 1308 0
issn_l 11008 0.67 9 9 0 7690 0
url 127 1.00 21 273 0 32942 0
pdf_url 21198 0.36 29 359 0 11940 0
license 22241 0.33 3 21 0 11 0
version 14375 0.57 15 16 0 3 0
first_page 13455 0.60 1 15 0 5079 0
last_page 13563 0.59 1 15 0 5092 0
volume 13865 0.58 1 25 0 684 0
issue 17210 0.48 1 30 0 646 0
oa_status 0 1.00 4 7 0 6 0
oa_url 13664 0.59 20 359 0 19241 0
language 2 1.00 2 2 0 8 0
cited_by_api_url 0 1.00 53 55 0 33079 0
doi 2024 0.94 26 96 0 31055 0
type 0 1.00 6 12 0 6 0
mining_source 0 1.00 16 18 0 2 0

Variable type: list

skim_variable n_missing complete_rate n_unique min_length max_length
author 253 0.99 32161 1 12
grants 28882 0.13 2780 0 33
counts_by_year 15199 0.54 5709 0 2
ids 0 1.00 33079 1 5
referenced_works 10700 0.68 22293 0 447
related_works 403 0.99 23309 1 20
concepts 0 1.00 33074 5 5
topics 0 1.00 32528 0 5

Variable type: logical

skim_variable n_missing complete_rate mean count
is_oa 124 1 0.53 TRU: 17652, FAL: 15478
is_oa_anywhere 0 1 0.59 TRU: 19524, FAL: 13730
any_repository_has_fulltext 0 1 0.29 FAL: 23747, TRU: 9507
is_paratext 0 1 0.00 FAL: 33254
is_retracted 0 1 0.00 FAL: 33209, TRU: 45

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
relevance_score 4734 0.86 9.91 26.91 0.04 0.84 3.13 7.26 986.22 ▇▁▁▁▁
cited_by_count 0 1.00 9.42 38.52 0.00 0.00 1.00 5.00 1878.00 ▇▁▁▁▁
publication_year 0 1.00 2021.81 2.57 1983.00 2021.00 2023.00 2024.00 2025.00 ▁▁▁▁▇
Quick overview
  • The number of missing abstracts has risen. Therefore, the Scopus data will be checked for the possibility of filling in the missing abstracts.
  • The difference in the number of cases and the number of unique IDs indicates that there are duplicates in the data.
# Check for duplicates based on OpenAlex ID
references$openalex$combined$api %>% 
  group_by(id) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")
n <integer> 
# total N=33079 valid N=33079 mean=1.01 sd=0.07

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 32904 | 99.47 |   99.47 |  99.47
    2 |   175 |  0.53 |    0.53 | 100.00
 <NA> |     0 |  0.00 |    <NA> |   <NA>
# Extract duplicated IDs
duplicates$openalex$combined$id$string <- references$openalex$combined$api %>% 
  group_by(id) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>% 
  pull(id)

# Extract cases with duplicated IDs
duplicates$openalex$combined$id$data <- references$openalex$combined$api %>% 
  filter(id %in% duplicates$openalex$combined$id$string ) %>% 
  arrange(id)
# Extract uneven (odd) rows
df1 <- duplicates$openalex$combined$id$data[seq(1, nrow(duplicates$openalex$combined$id$data), by = 2), ]
df2 <- duplicates$openalex$combined$id$data[seq(2, nrow(duplicates$openalex$combined$id$data), by = 2), ]

# Compare the two data frames
summary(arsenal::comparedf(df1, df2))


Table: Summary of data.frames

version   arg    ncol   nrow
--------  ----  -----  -----
x         df1      40    175
y         df2      40    175



Table: Summary of overall comparison

statistic                                                      value
------------------------------------------------------------  ------
Number of by-variables                                             0
Number of non-by variables in common                              40
Number of variables compared                                      40
Number of variables in x but not y                                 0
Number of variables in y but not x                                 0
Number of variables compared with some values unequal              4
Number of variables compared with all values equal                36
Number of observations in common                                 175
Number of observations in x but not y                              0
Number of observations in y but not x                              0
Number of observations with some compared variables unequal      175
Number of observations with all compared variables equal           0
Number of values unequal                                         404



Table: Variables not shared

                         
 ------------------------
 No variables not shared 
 ------------------------



Table: Other variables not compared

                                 
 --------------------------------
 No other variables not compared 
 --------------------------------



Table: Observations not shared

                            
 ---------------------------
 No observations not shared 
 ---------------------------



Table: Differences detected by variable

var.x                         var.y                            n   NAs
----------------------------  ----------------------------  ----  ----
id                            id                               0     0
title                         title                            0     0
display_name                  display_name                     0     0
author                        author                           0     0
ab                            ab                               0     0
publication_date              publication_date                 0     0
relevance_score               relevance_score                175   175
so                            so                               0     0
so_id                         so_id                            0     0
host_organization             host_organization                0     0
issn_l                        issn_l                           0     0
url                           url                              0     0
pdf_url                       pdf_url                          0     0
license                       license                          0     0
version                       version                          0     0
first_page                    first_page                       0     0
last_page                     last_page                        0     0
volume                        volume                           0     0
issue                         issue                            0     0
is_oa                         is_oa                            0     0
is_oa_anywhere                is_oa_anywhere                   0     0
oa_status                     oa_status                        0     0
oa_url                        oa_url                           0     0
any_repository_has_fulltext   any_repository_has_fulltext      0     0
language                      language                         0     0
grants                        grants                          50    50
cited_by_count                cited_by_count                   0     0
counts_by_year                counts_by_year                   4     4
publication_year              publication_year                 0     0
cited_by_api_url              cited_by_api_url                 0     0
ids                           ids                              0     0
doi                           doi                              0     0
type                          type                             0     0
referenced_works              referenced_works                 0     0
related_works                 related_works                    0     0
is_paratext                   is_paratext                      0     0
is_retracted                  is_retracted                     0     0
concepts                      concepts                         0     0
topics                        topics                           0     0
mining_source                 mining_source                  175     0



Table: Differences detected (370 not shown)

var.x             var.y              ..row.names..  values.x           values.y              row.x   row.y
----------------  ----------------  --------------  -----------------  -------------------  ------  ------
relevance_score   relevance_score                1  8.167233           NA                        1       1
relevance_score   relevance_score                2  8.486296           NA                        2       2
relevance_score   relevance_score                3  280.2531           NA                        3       3
relevance_score   relevance_score                4  7.800102           NA                        4       4
relevance_score   relevance_score                5  7.819922           NA                        5       5
relevance_score   relevance_score                6  17.53957           NA                        6       6
relevance_score   relevance_score                7  1.024666           NA                        7       7
relevance_score   relevance_score                8  28.30165           NA                        8       8
relevance_score   relevance_score                9  30.97463           NA                        9       9
relevance_score   relevance_score               10  15.04404           NA                       10      10
grants            grants                         7  NA                 NULL                      7       7
grants            grants                         8  NA                 NULL                      8       8
grants            grants                        10  NA                 NULL                     10      10
grants            grants                        11  NA                 NULL                     11      11
grants            grants                        14  NA                 NULL                     14      14
grants            grants                        15  NA                 NULL                     15      15
grants            grants                        18  NA                 NULL                     18      18
grants            grants                        20  NA                 NULL                     20      20
grants            grants                        21  NA                 NULL                     21      21
grants            grants                        24  NA                 NULL                     24      24
counts_by_year    counts_by_year               166  NA                 NULL                    166     166
counts_by_year    counts_by_year               171  NA                 NULL                    171     171
counts_by_year    counts_by_year               172  NA                 NULL                    172     172
counts_by_year    counts_by_year               174  NA                 NULL                    174     174
mining_source     mining_source                  1  openalex_initial   openalex_rerun_doi        1       1
mining_source     mining_source                  2  openalex_initial   openalex_rerun_doi        2       2
mining_source     mining_source                  3  openalex_initial   openalex_rerun_doi        3       3
mining_source     mining_source                  4  openalex_initial   openalex_rerun_doi        4       4
mining_source     mining_source                  5  openalex_initial   openalex_rerun_doi        5       5
mining_source     mining_source                  6  openalex_initial   openalex_rerun_doi        6       6
mining_source     mining_source                  7  openalex_initial   openalex_rerun_doi        7       7
mining_source     mining_source                  8  openalex_initial   openalex_rerun_doi        8       8
mining_source     mining_source                  9  openalex_initial   openalex_rerun_doi        9       9
mining_source     mining_source                 10  openalex_initial   openalex_rerun_doi       10      10



Table: Non-identical attributes

                             
 ----------------------------
 No non-identical attributes 
 ----------------------------
# Check for duplicates based on DOI
references$openalex$combined$api %>% 
  distinct(id, .keep_all = TRUE) %>% # exclude ID duplicates
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")
n <integer> 
# total N=31055 valid N=31055 mean=1.00 sd=0.00

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 31055 |   100 |     100 |    100
 <NA> |     0 |     0 |    <NA> |   <NA>
references$openalex$combined$raw <- references$openalex$combined$api %>%
  distinct(id, .keep_all = TRUE) 

Missing abstracts

# Identify cases with NA values in the variable ab
na_abstracts <- references$openalex$combined$raw %>%
  filter(is.na(ab)) 


# Check if Scopus data provides an abstract for those references
na_abstracts_with_scopus <- na_abstracts %>%
  mutate(doi_short = str_remove(doi, "https://doi.org/")) %>% 
  left_join(scopus$raw %>% 
              select(doi, abstract),
              by = join_by(doi_short == doi)) %>%
  mutate(ab = ifelse(is.na(ab), abstract, ab)) %>%
  select(-abstract)

# Update the combined references with the new abstracts from Scopus
references$openalex$combined$raw_updated <- references$openalex$combined$raw %>%
  left_join(na_abstracts_with_scopus %>% select(id, ab), by = "id", suffix = c("", "_updated")) %>%
  mutate(ab = ifelse(is.na(ab), ab_updated, ab)) %>%
  select(-ab_updated)

Create correct data

# Overview
references$openalex$combined$raw_updated %>% 
  skimr::skim()
Data summary
Name Piped data
Number of rows 33079
Number of columns 40
_______________________
Column type frequency:
character 24
list 8
logical 5
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
id 0 1.00 30 32 0 33079 0
title 6 1.00 3 500 0 32287 0
display_name 6 1.00 3 500 0 32287 0
ab 1429 0.96 0 51771 11 30999 0
publication_date 0 1.00 10 10 0 3052 0
so 6193 0.81 1 244 0 8410 0
so_id 6193 0.81 27 32 0 8436 0
host_organization 13382 0.60 3 155 0 1308 0
issn_l 10936 0.67 9 9 0 7690 0
url 127 1.00 21 273 0 32942 0
pdf_url 21052 0.36 29 359 0 11940 0
license 22097 0.33 3 21 0 11 0
version 14238 0.57 15 16 0 3 0
first_page 13387 0.60 1 15 0 5079 0
last_page 13495 0.59 1 15 0 5092 0
volume 13788 0.58 1 25 0 684 0
issue 17112 0.48 1 30 0 646 0
oa_status 0 1.00 4 7 0 6 0
oa_url 13547 0.59 20 359 0 19241 0
language 2 1.00 2 2 0 8 0
cited_by_api_url 0 1.00 53 55 0 33079 0
doi 2024 0.94 26 96 0 31055 0
type 0 1.00 6 12 0 6 0
mining_source 0 1.00 16 18 0 2 0

Variable type: list

skim_variable n_missing complete_rate n_unique min_length max_length
author 253 0.99 32161 1 12
grants 28778 0.13 2780 0 33
counts_by_year 15151 0.54 5709 0 2
ids 0 1.00 33079 1 5
referenced_works 10682 0.68 22293 0 447
related_works 403 0.99 23309 1 20
concepts 0 1.00 33074 5 5
topics 0 1.00 32528 0 5

Variable type: logical

skim_variable n_missing complete_rate mean count
is_oa 124 1 0.53 TRU: 17614, FAL: 15341
is_oa_anywhere 0 1 0.59 TRU: 19465, FAL: 13614
any_repository_has_fulltext 0 1 0.29 FAL: 23606, TRU: 9473
is_paratext 0 1 0.00 FAL: 33079
is_retracted 0 1 0.00 FAL: 33034, TRU: 45

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
relevance_score 4559 0.86 9.91 26.91 0.04 0.84 3.13 7.26 986.22 ▇▁▁▁▁
cited_by_count 0 1.00 9.41 38.59 0.00 0.00 1.00 5.00 1878.00 ▇▁▁▁▁
publication_year 0 1.00 2021.81 2.58 1983.00 2021.00 2023.00 2024.00 2025.00 ▁▁▁▁▇
references$openalex$combined$raw_updated %>% 
  frq(type, language)
type <character> 
# total N=33079 valid N=33079 mean=1.55 sd=1.38

Value        |     N | Raw % | Valid % | Cum. %
-----------------------------------------------
article      | 28539 | 86.28 |   86.28 |  86.28
book-chapter |    40 |  0.12 |    0.12 |  86.40
editorial    |     2 |  0.01 |    0.01 |  86.40
letter       |     1 |  0.00 |    0.00 |  86.41
preprint     |  4369 | 13.21 |   13.21 |  99.61
review       |   128 |  0.39 |    0.39 | 100.00
<NA>         |     0 |  0.00 |    <NA> |   <NA>

language <character> 
# total N=33079 valid N=33077 mean=2.00 sd=0.06

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
de    |     1 |  0.00 |    0.00 |   0.00
en    | 33052 | 99.92 |   99.92 |  99.93
es    |    14 |  0.04 |    0.04 |  99.97
fr    |     4 |  0.01 |    0.01 |  99.98
it    |     3 |  0.01 |    0.01 |  99.99
nl    |     1 |  0.00 |    0.00 |  99.99
pt    |     1 |  0.00 |    0.00 | 100.00
sv    |     1 |  0.00 |    0.00 | 100.00
<NA>  |     2 |  0.01 |    <NA> |   <NA>
references$openalex$correct <- references$openalex$combined$raw_updated %>% 
  filter(type %in% c("article", "conference-paper", "preprint")) %>% 
  filter(language == "en") %>% 
  filter(publication_year >= 2016) %>%
  mutate(
  # Create additional factor variables
    publication_year_fct = as.factor(publication_year), 
    type_fct = as.factor(type), 
  # Clean abstracts
    ab = ab %>%
      str_replace_all("\ufffe", "") %>%    # Remove invalid U+FFFE characters
      str_replace_all("[^\x20-\x7E\n]", "") %>% # Optional: Remove other non-ASCII chars
      iconv(from = "UTF-8", to = "UTF-8", sub = ""), # Ensure UTF-8 encoding
  ) 

Export data

qs::qsave(references$openalex$correct, file = here("local_data/references.qs"))
qs::qsave(references, file = here("local_data/references_full.qs"))
references_bibliometrix <- oa2bibliometrix(references$openalex$correct)
saveRDS(references_bibliometrix, file = here("local_data/references_import_bibliometrix.RDS"))