Combine OpenAlex & Scopus

Projektseminar

Published

26.11.2024

Preparation

# Load necessary packages
pacman::p_load(
  here, qs, 
  magrittr, janitor,
  naniar, visdat,
  easystats, sjmisc,
  ggpubr, 
  gt, gtExtras, gtsummary,
  openalexR, bibliometrix, 
  tidyverse
)

Comparison with Scopus

references <- qs::qread(here("local_data/references_openalex.qs"))
references$scopus$raw <- qs::qread(here("local_data/references_scopus.qs"))$raw

Identification of “missing” references

# Identify the number of scopus references missing in the openalex data
references$scopus$raw %>% 
  filter(!is.na(doi)) %>% 
  mutate(doi_full = paste0("https://doi.org/", doi)) %>% 
  filter(!(doi_full %in% references$openalex$raw$doi)) %>% 
  glimpse()

Rows: 4,819
Columns: 17
$ scopusID     <chr> "2-s2.0-85208654899", "2-s2.0-85201379794", "2-s2.0-85209…
$ doi          <chr> "10.1016/j.csi.2024.103940", "10.1016/j.csi.2024.103903",…
$ pmid         <chr> NA, NA, NA, NA, NA, NA, NA, NA, "39389117", NA, NA, NA, N…
$ authors      <chr> "Alier M.|Pereira J.|García-Peñalvo F.J.|Casañ M.J.|Cabré…
$ affiliations <chr> "Universidad de Salamanca|Universitat Politécnica de Cata…
$ countries    <chr> "Spain", "Russian Federation|Uzbekistan", "South Korea|Ho…
$ year         <chr> "2025", "2025", "2025", "2025", "2025", "2025", "2025", "…
$ articletitle <chr> "LAMB: An open-source software framework to create artifi…
$ journal      <chr> "Computer Standards and Interfaces", "Computer Standards …
$ volume       <chr> "92", "92", "210", "194", "163", "149", "95", "106", "369…
$ issue        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "1", NA, "1", "1", NA…
$ pages        <chr> "", "", "", "", "", "", "", "", "625-632", "", "", "", "2…
$ keywords     <chr> "Education domain|Generative artificial intelligence|IMS …
$ abstract     <chr> "This paper presents LAMB (Learning Assistant Manager and…
$ ptype        <chr> "Article", "Article", "Article", "Article", "Article", "A…
$ timescited   <chr> "0", "0", "0", "0", "0", "0", "2", "1", "0", "0", "0", "0…
$ doi_full     <chr> "https://doi.org/10.1016/j.csi.2024.103940", "https://doi…

Extract DOIs for missing references

missing_references <- list()

# Format DOIs 
missing_references$scopus_dois <- references$scopus$raw %>%
  filter(!is.na(doi)) %>% 
  mutate(doi = paste0("https://doi.org/", doi)) %>% 
  filter(!(doi %in% references$openalex$raw$doi)) %>%
  pull(doi)

# Split DOIs into chunks of 25
chunk_size <- 10
missing_references$scopus_dois_chunks <- split(missing_references$scopus_dois, ceiling(seq_along(missing_references$scopus_dois) / chunk_size))

Completion of OpenAlex data

Mining missing references via OpenAlex API

# Download missing references via API
missing_references$data$chunks <- map(
  missing_references$scopus_dois_chunks, function(chunk) {
  Sys.sleep(2)  # Pause for 1 second
  
  tryCatch(
    {
      # Attempt the API call
      openalexR::oa_fetch(
        entity = "works",
        doi = chunk,
        verbose = TRUE
      )
    },
    error = function(e) {
      # Handle the error
      message("Error with chunk: ", paste(chunk, collapse = ", "))
      message("Error message: ", e$message)
      NULL  # Return NULL for failed chunks
    }
  )
})

# Combine rows
missing_references$data$combined <- bind_rows(missing_references$data$chunks) %>% 
  mutate(mining_source = "openalex_rerun_doi")

qs::qsave(missing_references, file = here("local_data/missing_references.qs"))

Quality control

# Check for duplicates based on OpenAlex ID
missing_references$data$combined %>% 
  group_by(id) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")

n <integer> 
# total N=4736 valid N=4736 mean=1.00 sd=0.00

Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 4736 |   100 |     100 |    100
 <NA> |    0 |     0 |    <NA> |   <NA>

# Check for duplicates based on DOI
missing_references$data$combined %>% 
  distinct(id, .keep_all = TRUE) %>% # exclude ID duplicates
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")

n <integer> 
# total N=4734 valid N=4734 mean=1.00 sd=0.02

Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 4732 | 99.96 |   99.96 |  99.96
    2 |    2 |  0.04 |    0.04 | 100.00
 <NA> |    0 |  0.00 |    <NA> |   <NA>

duplicates <- list()

# Extract duplicated IDs
duplicates$missing_references$combined$doi$string  <- missing_references$data$combined %>%  
  distinct(id, .keep_all = TRUE) %>% 
  filter(!is.na(doi)) %>%
  group_by(doi) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>% 
  pull(doi)

# Extract cases with duplicated IDs
duplicates$missing_references$combined$doi$data <- missing_references$data$combined %>% 
  filter(doi %in% duplicates$missing_references$combined$doi$string)

# Extract cases to be deleted
duplicates$missing_references$combined$doi$delete <- duplicates$missing_references$combined$doi$data %>%
  mutate(id_number = as.numeric(sub(".*W", "", id))) %>% 
  group_by(doi) %>% # Group by `doi`
  slice_min(id_number, n = 1, with_ties = FALSE) %>% 
  select(-id_number)

missing_references$data$raw <- missing_references$data$combined %>%
  distinct(id, .keep_all = TRUE) %>%  # delete duplicates based on ID 
  anti_join(duplicates$missing_references$combined$doi$delete, by = "id")

Merging OpenAlex data

# Combine the missing references with the existing data
references$openalex$combined$api <- references$openalex$raw %>%
  mutate(mining_source = "openalex_initial") %>% 
  bind_rows(., missing_references$data$raw)

Quality control

references$openalex$combined$api %>% 
  skimr::skim()

Data summary
Name	Piped data
Number of rows	33254
Number of columns	40
_______________________
Column type frequency:
character	24
list	8
logical	5
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique
id	0	1.00	30	32	0	33079
title	6	1.00	3	500	0	32287
display_name	6	1.00	3	500	0	32287
ab	2325	0.93	0	51771	11	30110
publication_date	0	1.00	10	10	0	3052
so	6257	0.81	1	244	0	8410
so_id	6257	0.81	27	32	0	8436
host_organization	13463	0.60	3	155	0	1308
issn_l	11008	0.67	9	9	0	7690
url	127	1.00	21	273	0	32942
pdf_url	21198	0.36	29	359	0	11940
license	22241	0.33	3	21	0	11
version	14375	0.57	15	16	0	3
first_page	13455	0.60	1	15	0	5079
last_page	13563	0.59	1	15	0	5092
volume	13865	0.58	1	25	0	684
issue	17210	0.48	1	30	0	646
oa_status	0	1.00	4	7	0	6
oa_url	13664	0.59	20	359	0	19241
language	2	1.00	2	2	0	8
cited_by_api_url	0	1.00	53	55	0	33079
doi	2024	0.94	26	96	0	31055
type	0	1.00	6	12	0	6
mining_source	0	1.00	16	18	0	2

Variable type: list

skim_variable	n_missing	complete_rate	n_unique	min_length	max_length
author	253	0.99	32161	1	12
grants	28882	0.13	2780	0	33
counts_by_year	15199	0.54	5709	0	2
ids	0	1.00	33079	1	5
referenced_works	10700	0.68	22293	0	447
related_works	403	0.99	23309	1	20
concepts	0	1.00	33074	5	5
topics	0	1.00	32528	0	5

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
is_oa	124	1	0.53	TRU: 17652, FAL: 15478
is_oa_anywhere	0	1	0.59	TRU: 19524, FAL: 13730
any_repository_has_fulltext	0	1	0.29	FAL: 23747, TRU: 9507
is_paratext	0	1	0.00	FAL: 33254
is_retracted	0	1	0.00	FAL: 33209, TRU: 45

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
relevance_score	4734	0.86	9.91	26.91	0.04	0.84	3.13	7.26	986.22	▇▁▁▁▁
cited_by_count	0	1.00	9.42	38.52	0.00	0.00	1.00	5.00	1878.00	▇▁▁▁▁
publication_year	0	1.00	2021.81	2.57	1983.00	2021.00	2023.00	2024.00	2025.00	▁▁▁▁▇

Quick overview

The number of missing abstracts has risen. Therefore, the Scopus data will be checked for the possibility of filling in the missing abstracts.
The difference in the number of cases and the number of unique IDs indicates that there are duplicates in the data.

# Check for duplicates based on OpenAlex ID
references$openalex$combined$api %>% 
  group_by(id) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")

n <integer> 
# total N=33079 valid N=33079 mean=1.01 sd=0.07

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 32904 | 99.47 |   99.47 |  99.47
    2 |   175 |  0.53 |    0.53 | 100.00
 <NA> |     0 |  0.00 |    <NA> |   <NA>

# Extract duplicated IDs
duplicates$openalex$combined$id$string <- references$openalex$combined$api %>% 
  group_by(id) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>% 
  pull(id)

# Extract cases with duplicated IDs
duplicates$openalex$combined$id$data <- references$openalex$combined$api %>% 
  filter(id %in% duplicates$openalex$combined$id$string ) %>% 
  arrange(id)

# Extract uneven (odd) rows
df1 <- duplicates$openalex$combined$id$data[seq(1, nrow(duplicates$openalex$combined$id$data), by = 2), ]
df2 <- duplicates$openalex$combined$id$data[seq(2, nrow(duplicates$openalex$combined$id$data), by = 2), ]

# Compare the two data frames
summary(arsenal::comparedf(df1, df2))



Table: Summary of data.frames

version   arg    ncol   nrow
--------  ----  -----  -----
x         df1      40    175
y         df2      40    175



Table: Summary of overall comparison

statistic                                                      value
------------------------------------------------------------  ------
Number of by-variables                                             0
Number of non-by variables in common                              40
Number of variables compared                                      40
Number of variables in x but not y                                 0
Number of variables in y but not x                                 0
Number of variables compared with some values unequal              4
Number of variables compared with all values equal                36
Number of observations in common                                 175
Number of observations in x but not y                              0
Number of observations in y but not x                              0
Number of observations with some compared variables unequal      175
Number of observations with all compared variables equal           0
Number of values unequal                                         404



Table: Variables not shared

                         
 ------------------------
 No variables not shared 
 ------------------------



Table: Other variables not compared

                                 
 --------------------------------
 No other variables not compared 
 --------------------------------



Table: Observations not shared

                            
 ---------------------------
 No observations not shared 
 ---------------------------



Table: Differences detected by variable

var.x                         var.y                            n   NAs
----------------------------  ----------------------------  ----  ----
id                            id                               0     0
title                         title                            0     0
display_name                  display_name                     0     0
author                        author                           0     0
ab                            ab                               0     0
publication_date              publication_date                 0     0
relevance_score               relevance_score                175   175
so                            so                               0     0
so_id                         so_id                            0     0
host_organization             host_organization                0     0
issn_l                        issn_l                           0     0
url                           url                              0     0
pdf_url                       pdf_url                          0     0
license                       license                          0     0
version                       version                          0     0
first_page                    first_page                       0     0
last_page                     last_page                        0     0
volume                        volume                           0     0
issue                         issue                            0     0
is_oa                         is_oa                            0     0
is_oa_anywhere                is_oa_anywhere                   0     0
oa_status                     oa_status                        0     0
oa_url                        oa_url                           0     0
any_repository_has_fulltext   any_repository_has_fulltext      0     0
language                      language                         0     0
grants                        grants                          50    50
cited_by_count                cited_by_count                   0     0
counts_by_year                counts_by_year                   4     4
publication_year              publication_year                 0     0
cited_by_api_url              cited_by_api_url                 0     0
ids                           ids                              0     0
doi                           doi                              0     0
type                          type                             0     0
referenced_works              referenced_works                 0     0
related_works                 related_works                    0     0
is_paratext                   is_paratext                      0     0
is_retracted                  is_retracted                     0     0
concepts                      concepts                         0     0
topics                        topics                           0     0
mining_source                 mining_source                  175     0



Table: Differences detected (370 not shown)

var.x             var.y              ..row.names..  values.x           values.y              row.x   row.y
----------------  ----------------  --------------  -----------------  -------------------  ------  ------
relevance_score   relevance_score                1  8.167233           NA                        1       1
relevance_score   relevance_score                2  8.486296           NA                        2       2
relevance_score   relevance_score                3  280.2531           NA                        3       3
relevance_score   relevance_score                4  7.800102           NA                        4       4
relevance_score   relevance_score                5  7.819922           NA                        5       5
relevance_score   relevance_score                6  17.53957           NA                        6       6
relevance_score   relevance_score                7  1.024666           NA                        7       7
relevance_score   relevance_score                8  28.30165           NA                        8       8
relevance_score   relevance_score                9  30.97463           NA                        9       9
relevance_score   relevance_score               10  15.04404           NA                       10      10
grants            grants                         7  NA                 NULL                      7       7
grants            grants                         8  NA                 NULL                      8       8
grants            grants                        10  NA                 NULL                     10      10
grants            grants                        11  NA                 NULL                     11      11
grants            grants                        14  NA                 NULL                     14      14
grants            grants                        15  NA                 NULL                     15      15
grants            grants                        18  NA                 NULL                     18      18
grants            grants                        20  NA                 NULL                     20      20
grants            grants                        21  NA                 NULL                     21      21
grants            grants                        24  NA                 NULL                     24      24
counts_by_year    counts_by_year               166  NA                 NULL                    166     166
counts_by_year    counts_by_year               171  NA                 NULL                    171     171
counts_by_year    counts_by_year               172  NA                 NULL                    172     172
counts_by_year    counts_by_year               174  NA                 NULL                    174     174
mining_source     mining_source                  1  openalex_initial   openalex_rerun_doi        1       1
mining_source     mining_source                  2  openalex_initial   openalex_rerun_doi        2       2
mining_source     mining_source                  3  openalex_initial   openalex_rerun_doi        3       3
mining_source     mining_source                  4  openalex_initial   openalex_rerun_doi        4       4
mining_source     mining_source                  5  openalex_initial   openalex_rerun_doi        5       5
mining_source     mining_source                  6  openalex_initial   openalex_rerun_doi        6       6
mining_source     mining_source                  7  openalex_initial   openalex_rerun_doi        7       7
mining_source     mining_source                  8  openalex_initial   openalex_rerun_doi        8       8
mining_source     mining_source                  9  openalex_initial   openalex_rerun_doi        9       9
mining_source     mining_source                 10  openalex_initial   openalex_rerun_doi       10      10



Table: Non-identical attributes

                             
 ----------------------------
 No non-identical attributes 
 ----------------------------

# Check for duplicates based on DOI
references$openalex$combined$api %>% 
  distinct(id, .keep_all = TRUE) %>% # exclude ID duplicates
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")

n <integer> 
# total N=31055 valid N=31055 mean=1.00 sd=0.00

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 31055 |   100 |     100 |    100
 <NA> |     0 |     0 |    <NA> |   <NA>

references$openalex$combined$raw <- references$openalex$combined$api %>%
  distinct(id, .keep_all = TRUE)

Missing abstracts

# Identify cases with NA values in the variable ab
na_abstracts <- references$openalex$combined$raw %>%
  filter(is.na(ab)) 


# Check if Scopus data provides an abstract for those references
na_abstracts_with_scopus <- na_abstracts %>%
  mutate(doi_short = str_remove(doi, "https://doi.org/")) %>% 
  left_join(scopus$raw %>% 
              select(doi, abstract),
              by = join_by(doi_short == doi)) %>%
  mutate(ab = ifelse(is.na(ab), abstract, ab)) %>%
  select(-abstract)

# Update the combined references with the new abstracts from Scopus
references$openalex$combined$raw_updated <- references$openalex$combined$raw %>%
  left_join(na_abstracts_with_scopus %>% select(id, ab), by = "id", suffix = c("", "_updated")) %>%
  mutate(ab = ifelse(is.na(ab), ab_updated, ab)) %>%
  select(-ab_updated)

Create correct data

# Overview
references$openalex$combined$raw_updated %>% 
  skimr::skim()

Data summary
Name	Piped data
Number of rows	33079
Number of columns	40
_______________________
Column type frequency:
character	24
list	8
logical	5
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique
id	0	1.00	30	32	0	33079
title	6	1.00	3	500	0	32287
display_name	6	1.00	3	500	0	32287
ab	1429	0.96	0	51771	11	30999
publication_date	0	1.00	10	10	0	3052
so	6193	0.81	1	244	0	8410
so_id	6193	0.81	27	32	0	8436
host_organization	13382	0.60	3	155	0	1308
issn_l	10936	0.67	9	9	0	7690
url	127	1.00	21	273	0	32942
pdf_url	21052	0.36	29	359	0	11940
license	22097	0.33	3	21	0	11
version	14238	0.57	15	16	0	3
first_page	13387	0.60	1	15	0	5079
last_page	13495	0.59	1	15	0	5092
volume	13788	0.58	1	25	0	684
issue	17112	0.48	1	30	0	646
oa_status	0	1.00	4	7	0	6
oa_url	13547	0.59	20	359	0	19241
language	2	1.00	2	2	0	8
cited_by_api_url	0	1.00	53	55	0	33079
doi	2024	0.94	26	96	0	31055
type	0	1.00	6	12	0	6
mining_source	0	1.00	16	18	0	2

Variable type: list

skim_variable	n_missing	complete_rate	n_unique	min_length	max_length
author	253	0.99	32161	1	12
grants	28778	0.13	2780	0	33
counts_by_year	15151	0.54	5709	0	2
ids	0	1.00	33079	1	5
referenced_works	10682	0.68	22293	0	447
related_works	403	0.99	23309	1	20
concepts	0	1.00	33074	5	5
topics	0	1.00	32528	0	5

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
is_oa	124	1	0.53	TRU: 17614, FAL: 15341
is_oa_anywhere	0	1	0.59	TRU: 19465, FAL: 13614
any_repository_has_fulltext	0	1	0.29	FAL: 23606, TRU: 9473
is_paratext	0	1	0.00	FAL: 33079
is_retracted	0	1	0.00	FAL: 33034, TRU: 45

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
relevance_score	4559	0.86	9.91	26.91	0.04	0.84	3.13	7.26	986.22	▇▁▁▁▁
cited_by_count	0	1.00	9.41	38.59	0.00	0.00	1.00	5.00	1878.00	▇▁▁▁▁
publication_year	0	1.00	2021.81	2.58	1983.00	2021.00	2023.00	2024.00	2025.00	▁▁▁▁▇

references$openalex$combined$raw_updated %>% 
  frq(type, language)

type <character> 
# total N=33079 valid N=33079 mean=1.55 sd=1.38

Value        |     N | Raw % | Valid % | Cum. %
-----------------------------------------------
article      | 28539 | 86.28 |   86.28 |  86.28
book-chapter |    40 |  0.12 |    0.12 |  86.40
editorial    |     2 |  0.01 |    0.01 |  86.40
letter       |     1 |  0.00 |    0.00 |  86.41
preprint     |  4369 | 13.21 |   13.21 |  99.61
review       |   128 |  0.39 |    0.39 | 100.00
<NA>         |     0 |  0.00 |    <NA> |   <NA>

language <character> 
# total N=33079 valid N=33077 mean=2.00 sd=0.06

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
de    |     1 |  0.00 |    0.00 |   0.00
en    | 33052 | 99.92 |   99.92 |  99.93
es    |    14 |  0.04 |    0.04 |  99.97
fr    |     4 |  0.01 |    0.01 |  99.98
it    |     3 |  0.01 |    0.01 |  99.99
nl    |     1 |  0.00 |    0.00 |  99.99
pt    |     1 |  0.00 |    0.00 | 100.00
sv    |     1 |  0.00 |    0.00 | 100.00
<NA>  |     2 |  0.01 |    <NA> |   <NA>

references$openalex$correct <- references$openalex$combined$raw_updated %>% 
  filter(type %in% c("article", "conference-paper", "preprint")) %>% 
  filter(language == "en") %>% 
  filter(publication_year >= 2016) %>%
  mutate(
  # Create additional factor variables
    publication_year_fct = as.factor(publication_year), 
    type_fct = as.factor(type), 
  # Clean abstracts
    ab = ab %>%
      str_replace_all("\ufffe", "") %>%    # Remove invalid U+FFFE characters
      str_replace_all("[^\x20-\x7E\n]", "") %>% # Optional: Remove other non-ASCII chars
      iconv(from = "UTF-8", to = "UTF-8", sub = ""), # Ensure UTF-8 encoding
  )

Export data

qs::qsave(references$openalex$correct, file = here("local_data/references.qs"))
qs::qsave(references, file = here("local_data/references_full.qs"))

references_bibliometrix <- oa2bibliometrix(references$openalex$correct)
saveRDS(references_bibliometrix, file = here("local_data/references_import_bibliometrix.RDS"))