# Load necessary packages
pacman::p_load(
  here, qs, 
  magrittr, janitor,
  naniar, visdat,
  easystats, sjmisc,
  ggpubr, 
  gt, gtExtras, gtsummary,
  openalexR, bibliometrix, 
  tidyverse
)Combine OpenAlex & Scopus
Projektseminar
Preparation
Comparison with Scopus
references <- qs::qread(here("local_data/references_openalex.qs"))
references$scopus$raw <- qs::qread(here("local_data/references_scopus.qs"))$rawIdentification of “missing” references
# Identify the number of scopus references missing in the openalex data
references$scopus$raw %>% 
  filter(!is.na(doi)) %>% 
  mutate(doi_full = paste0("https://doi.org/", doi)) %>% 
  filter(!(doi_full %in% references$openalex$raw$doi)) %>% 
  glimpse()Rows: 4,819
Columns: 17
$ scopusID     <chr> "2-s2.0-85208654899", "2-s2.0-85201379794", "2-s2.0-85209…
$ doi          <chr> "10.1016/j.csi.2024.103940", "10.1016/j.csi.2024.103903",…
$ pmid         <chr> NA, NA, NA, NA, NA, NA, NA, NA, "39389117", NA, NA, NA, N…
$ authors      <chr> "Alier M.|Pereira J.|García-Peñalvo F.J.|Casañ M.J.|Cabré…
$ affiliations <chr> "Universidad de Salamanca|Universitat Politécnica de Cata…
$ countries    <chr> "Spain", "Russian Federation|Uzbekistan", "South Korea|Ho…
$ year         <chr> "2025", "2025", "2025", "2025", "2025", "2025", "2025", "…
$ articletitle <chr> "LAMB: An open-source software framework to create artifi…
$ journal      <chr> "Computer Standards and Interfaces", "Computer Standards …
$ volume       <chr> "92", "92", "210", "194", "163", "149", "95", "106", "369…
$ issue        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "1", NA, "1", "1", NA…
$ pages        <chr> "", "", "", "", "", "", "", "", "625-632", "", "", "", "2…
$ keywords     <chr> "Education domain|Generative artificial intelligence|IMS …
$ abstract     <chr> "This paper presents LAMB (Learning Assistant Manager and…
$ ptype        <chr> "Article", "Article", "Article", "Article", "Article", "A…
$ timescited   <chr> "0", "0", "0", "0", "0", "0", "2", "1", "0", "0", "0", "0…
$ doi_full     <chr> "https://doi.org/10.1016/j.csi.2024.103940", "https://doi…
Extract DOIs for missing references
missing_references <- list()
# Format DOIs 
missing_references$scopus_dois <- references$scopus$raw %>%
  filter(!is.na(doi)) %>% 
  mutate(doi = paste0("https://doi.org/", doi)) %>% 
  filter(!(doi %in% references$openalex$raw$doi)) %>%
  pull(doi)
# Split DOIs into chunks of 25
chunk_size <- 10
missing_references$scopus_dois_chunks <- split(missing_references$scopus_dois, ceiling(seq_along(missing_references$scopus_dois) / chunk_size))Completion of OpenAlex data
Mining missing references via OpenAlex API
# Download missing references via API
missing_references$data$chunks <- map(
  missing_references$scopus_dois_chunks, function(chunk) {
  Sys.sleep(2)  # Pause for 1 second
  
  tryCatch(
    {
      # Attempt the API call
      openalexR::oa_fetch(
        entity = "works",
        doi = chunk,
        verbose = TRUE
      )
    },
    error = function(e) {
      # Handle the error
      message("Error with chunk: ", paste(chunk, collapse = ", "))
      message("Error message: ", e$message)
      NULL  # Return NULL for failed chunks
    }
  )
})# Combine rows
missing_references$data$combined <- bind_rows(missing_references$data$chunks) %>% 
  mutate(mining_source = "openalex_rerun_doi")qs::qsave(missing_references, file = here("local_data/missing_references.qs"))Quality control
# Check for duplicates based on OpenAlex ID
missing_references$data$combined %>% 
  group_by(id) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")n <integer> 
# total N=4736 valid N=4736 mean=1.00 sd=0.00
Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 4736 |   100 |     100 |    100
 <NA> |    0 |     0 |    <NA> |   <NA>
# Check for duplicates based on DOI
missing_references$data$combined %>% 
  distinct(id, .keep_all = TRUE) %>% # exclude ID duplicates
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")n <integer> 
# total N=4734 valid N=4734 mean=1.00 sd=0.02
Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 4732 | 99.96 |   99.96 |  99.96
    2 |    2 |  0.04 |    0.04 | 100.00
 <NA> |    0 |  0.00 |    <NA> |   <NA>
duplicates <- list()
# Extract duplicated IDs
duplicates$missing_references$combined$doi$string  <- missing_references$data$combined %>%  
  distinct(id, .keep_all = TRUE) %>% 
  filter(!is.na(doi)) %>%
  group_by(doi) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>% 
  pull(doi)
# Extract cases with duplicated IDs
duplicates$missing_references$combined$doi$data <- missing_references$data$combined %>% 
  filter(doi %in% duplicates$missing_references$combined$doi$string)
# Extract cases to be deleted
duplicates$missing_references$combined$doi$delete <- duplicates$missing_references$combined$doi$data %>%
  mutate(id_number = as.numeric(sub(".*W", "", id))) %>% 
  group_by(doi) %>% # Group by `doi`
  slice_min(id_number, n = 1, with_ties = FALSE) %>% 
  select(-id_number) missing_references$data$raw <- missing_references$data$combined %>%
  distinct(id, .keep_all = TRUE) %>%  # delete duplicates based on ID 
  anti_join(duplicates$missing_references$combined$doi$delete, by = "id") Merging OpenAlex data
# Combine the missing references with the existing data
references$openalex$combined$api <- references$openalex$raw %>%
  mutate(mining_source = "openalex_initial") %>% 
  bind_rows(., missing_references$data$raw)Quality control
references$openalex$combined$api %>% 
  skimr::skim()| Name | Piped data | 
| Number of rows | 33254 | 
| Number of columns | 40 | 
| _______________________ | |
| Column type frequency: | |
| character | 24 | 
| list | 8 | 
| logical | 5 | 
| numeric | 3 | 
| ________________________ | |
| Group variables | None | 
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace | 
|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 30 | 32 | 0 | 33079 | 0 | 
| title | 6 | 1.00 | 3 | 500 | 0 | 32287 | 0 | 
| display_name | 6 | 1.00 | 3 | 500 | 0 | 32287 | 0 | 
| ab | 2325 | 0.93 | 0 | 51771 | 11 | 30110 | 0 | 
| publication_date | 0 | 1.00 | 10 | 10 | 0 | 3052 | 0 | 
| so | 6257 | 0.81 | 1 | 244 | 0 | 8410 | 0 | 
| so_id | 6257 | 0.81 | 27 | 32 | 0 | 8436 | 0 | 
| host_organization | 13463 | 0.60 | 3 | 155 | 0 | 1308 | 0 | 
| issn_l | 11008 | 0.67 | 9 | 9 | 0 | 7690 | 0 | 
| url | 127 | 1.00 | 21 | 273 | 0 | 32942 | 0 | 
| pdf_url | 21198 | 0.36 | 29 | 359 | 0 | 11940 | 0 | 
| license | 22241 | 0.33 | 3 | 21 | 0 | 11 | 0 | 
| version | 14375 | 0.57 | 15 | 16 | 0 | 3 | 0 | 
| first_page | 13455 | 0.60 | 1 | 15 | 0 | 5079 | 0 | 
| last_page | 13563 | 0.59 | 1 | 15 | 0 | 5092 | 0 | 
| volume | 13865 | 0.58 | 1 | 25 | 0 | 684 | 0 | 
| issue | 17210 | 0.48 | 1 | 30 | 0 | 646 | 0 | 
| oa_status | 0 | 1.00 | 4 | 7 | 0 | 6 | 0 | 
| oa_url | 13664 | 0.59 | 20 | 359 | 0 | 19241 | 0 | 
| language | 2 | 1.00 | 2 | 2 | 0 | 8 | 0 | 
| cited_by_api_url | 0 | 1.00 | 53 | 55 | 0 | 33079 | 0 | 
| doi | 2024 | 0.94 | 26 | 96 | 0 | 31055 | 0 | 
| type | 0 | 1.00 | 6 | 12 | 0 | 6 | 0 | 
| mining_source | 0 | 1.00 | 16 | 18 | 0 | 2 | 0 | 
Variable type: list
| skim_variable | n_missing | complete_rate | n_unique | min_length | max_length | 
|---|---|---|---|---|---|
| author | 253 | 0.99 | 32161 | 1 | 12 | 
| grants | 28882 | 0.13 | 2780 | 0 | 33 | 
| counts_by_year | 15199 | 0.54 | 5709 | 0 | 2 | 
| ids | 0 | 1.00 | 33079 | 1 | 5 | 
| referenced_works | 10700 | 0.68 | 22293 | 0 | 447 | 
| related_works | 403 | 0.99 | 23309 | 1 | 20 | 
| concepts | 0 | 1.00 | 33074 | 5 | 5 | 
| topics | 0 | 1.00 | 32528 | 0 | 5 | 
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count | 
|---|---|---|---|---|
| is_oa | 124 | 1 | 0.53 | TRU: 17652, FAL: 15478 | 
| is_oa_anywhere | 0 | 1 | 0.59 | TRU: 19524, FAL: 13730 | 
| any_repository_has_fulltext | 0 | 1 | 0.29 | FAL: 23747, TRU: 9507 | 
| is_paratext | 0 | 1 | 0.00 | FAL: 33254 | 
| is_retracted | 0 | 1 | 0.00 | FAL: 33209, TRU: 45 | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| relevance_score | 4734 | 0.86 | 9.91 | 26.91 | 0.04 | 0.84 | 3.13 | 7.26 | 986.22 | ▇▁▁▁▁ | 
| cited_by_count | 0 | 1.00 | 9.42 | 38.52 | 0.00 | 0.00 | 1.00 | 5.00 | 1878.00 | ▇▁▁▁▁ | 
| publication_year | 0 | 1.00 | 2021.81 | 2.57 | 1983.00 | 2021.00 | 2023.00 | 2024.00 | 2025.00 | ▁▁▁▁▇ | 
Quick overview
- The number of missing abstracts has risen. Therefore, the Scopus data will be checked for the possibility of filling in the missing abstracts.
 - The difference in the number of cases and the number of unique IDs indicates that there are duplicates in the data.
 
# Check for duplicates based on OpenAlex ID
references$openalex$combined$api %>% 
  group_by(id) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")n <integer> 
# total N=33079 valid N=33079 mean=1.01 sd=0.07
Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 32904 | 99.47 |   99.47 |  99.47
    2 |   175 |  0.53 |    0.53 | 100.00
 <NA> |     0 |  0.00 |    <NA> |   <NA>
# Extract duplicated IDs
duplicates$openalex$combined$id$string <- references$openalex$combined$api %>% 
  group_by(id) %>%
  summarise(n = n()) %>%
  filter(n > 1) %>% 
  pull(id)
# Extract cases with duplicated IDs
duplicates$openalex$combined$id$data <- references$openalex$combined$api %>% 
  filter(id %in% duplicates$openalex$combined$id$string ) %>% 
  arrange(id)# Extract uneven (odd) rows
df1 <- duplicates$openalex$combined$id$data[seq(1, nrow(duplicates$openalex$combined$id$data), by = 2), ]
df2 <- duplicates$openalex$combined$id$data[seq(2, nrow(duplicates$openalex$combined$id$data), by = 2), ]
# Compare the two data frames
summary(arsenal::comparedf(df1, df2))
Table: Summary of data.frames
version   arg    ncol   nrow
--------  ----  -----  -----
x         df1      40    175
y         df2      40    175
Table: Summary of overall comparison
statistic                                                      value
------------------------------------------------------------  ------
Number of by-variables                                             0
Number of non-by variables in common                              40
Number of variables compared                                      40
Number of variables in x but not y                                 0
Number of variables in y but not x                                 0
Number of variables compared with some values unequal              4
Number of variables compared with all values equal                36
Number of observations in common                                 175
Number of observations in x but not y                              0
Number of observations in y but not x                              0
Number of observations with some compared variables unequal      175
Number of observations with all compared variables equal           0
Number of values unequal                                         404
Table: Variables not shared
                         
 ------------------------
 No variables not shared 
 ------------------------
Table: Other variables not compared
                                 
 --------------------------------
 No other variables not compared 
 --------------------------------
Table: Observations not shared
                            
 ---------------------------
 No observations not shared 
 ---------------------------
Table: Differences detected by variable
var.x                         var.y                            n   NAs
----------------------------  ----------------------------  ----  ----
id                            id                               0     0
title                         title                            0     0
display_name                  display_name                     0     0
author                        author                           0     0
ab                            ab                               0     0
publication_date              publication_date                 0     0
relevance_score               relevance_score                175   175
so                            so                               0     0
so_id                         so_id                            0     0
host_organization             host_organization                0     0
issn_l                        issn_l                           0     0
url                           url                              0     0
pdf_url                       pdf_url                          0     0
license                       license                          0     0
version                       version                          0     0
first_page                    first_page                       0     0
last_page                     last_page                        0     0
volume                        volume                           0     0
issue                         issue                            0     0
is_oa                         is_oa                            0     0
is_oa_anywhere                is_oa_anywhere                   0     0
oa_status                     oa_status                        0     0
oa_url                        oa_url                           0     0
any_repository_has_fulltext   any_repository_has_fulltext      0     0
language                      language                         0     0
grants                        grants                          50    50
cited_by_count                cited_by_count                   0     0
counts_by_year                counts_by_year                   4     4
publication_year              publication_year                 0     0
cited_by_api_url              cited_by_api_url                 0     0
ids                           ids                              0     0
doi                           doi                              0     0
type                          type                             0     0
referenced_works              referenced_works                 0     0
related_works                 related_works                    0     0
is_paratext                   is_paratext                      0     0
is_retracted                  is_retracted                     0     0
concepts                      concepts                         0     0
topics                        topics                           0     0
mining_source                 mining_source                  175     0
Table: Differences detected (370 not shown)
var.x             var.y              ..row.names..  values.x           values.y              row.x   row.y
----------------  ----------------  --------------  -----------------  -------------------  ------  ------
relevance_score   relevance_score                1  8.167233           NA                        1       1
relevance_score   relevance_score                2  8.486296           NA                        2       2
relevance_score   relevance_score                3  280.2531           NA                        3       3
relevance_score   relevance_score                4  7.800102           NA                        4       4
relevance_score   relevance_score                5  7.819922           NA                        5       5
relevance_score   relevance_score                6  17.53957           NA                        6       6
relevance_score   relevance_score                7  1.024666           NA                        7       7
relevance_score   relevance_score                8  28.30165           NA                        8       8
relevance_score   relevance_score                9  30.97463           NA                        9       9
relevance_score   relevance_score               10  15.04404           NA                       10      10
grants            grants                         7  NA                 NULL                      7       7
grants            grants                         8  NA                 NULL                      8       8
grants            grants                        10  NA                 NULL                     10      10
grants            grants                        11  NA                 NULL                     11      11
grants            grants                        14  NA                 NULL                     14      14
grants            grants                        15  NA                 NULL                     15      15
grants            grants                        18  NA                 NULL                     18      18
grants            grants                        20  NA                 NULL                     20      20
grants            grants                        21  NA                 NULL                     21      21
grants            grants                        24  NA                 NULL                     24      24
counts_by_year    counts_by_year               166  NA                 NULL                    166     166
counts_by_year    counts_by_year               171  NA                 NULL                    171     171
counts_by_year    counts_by_year               172  NA                 NULL                    172     172
counts_by_year    counts_by_year               174  NA                 NULL                    174     174
mining_source     mining_source                  1  openalex_initial   openalex_rerun_doi        1       1
mining_source     mining_source                  2  openalex_initial   openalex_rerun_doi        2       2
mining_source     mining_source                  3  openalex_initial   openalex_rerun_doi        3       3
mining_source     mining_source                  4  openalex_initial   openalex_rerun_doi        4       4
mining_source     mining_source                  5  openalex_initial   openalex_rerun_doi        5       5
mining_source     mining_source                  6  openalex_initial   openalex_rerun_doi        6       6
mining_source     mining_source                  7  openalex_initial   openalex_rerun_doi        7       7
mining_source     mining_source                  8  openalex_initial   openalex_rerun_doi        8       8
mining_source     mining_source                  9  openalex_initial   openalex_rerun_doi        9       9
mining_source     mining_source                 10  openalex_initial   openalex_rerun_doi       10      10
Table: Non-identical attributes
                             
 ----------------------------
 No non-identical attributes 
 ----------------------------
# Check for duplicates based on DOI
references$openalex$combined$api %>% 
  distinct(id, .keep_all = TRUE) %>% # exclude ID duplicates
  filter(!is.na(doi)) %>% # exclude cases without DOI
  group_by(doi) %>% 
  summarise(n = n()) %>% 
  frq(n, sort.frq = "desc")n <integer> 
# total N=31055 valid N=31055 mean=1.00 sd=0.00
Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    1 | 31055 |   100 |     100 |    100
 <NA> |     0 |     0 |    <NA> |   <NA>
references$openalex$combined$raw <- references$openalex$combined$api %>%
  distinct(id, .keep_all = TRUE) Missing abstracts
# Identify cases with NA values in the variable ab
na_abstracts <- references$openalex$combined$raw %>%
  filter(is.na(ab)) 
# Check if Scopus data provides an abstract for those references
na_abstracts_with_scopus <- na_abstracts %>%
  mutate(doi_short = str_remove(doi, "https://doi.org/")) %>% 
  left_join(scopus$raw %>% 
              select(doi, abstract),
              by = join_by(doi_short == doi)) %>%
  mutate(ab = ifelse(is.na(ab), abstract, ab)) %>%
  select(-abstract)
# Update the combined references with the new abstracts from Scopus
references$openalex$combined$raw_updated <- references$openalex$combined$raw %>%
  left_join(na_abstracts_with_scopus %>% select(id, ab), by = "id", suffix = c("", "_updated")) %>%
  mutate(ab = ifelse(is.na(ab), ab_updated, ab)) %>%
  select(-ab_updated)Create correct data
# Overview
references$openalex$combined$raw_updated %>% 
  skimr::skim()| Name | Piped data | 
| Number of rows | 33079 | 
| Number of columns | 40 | 
| _______________________ | |
| Column type frequency: | |
| character | 24 | 
| list | 8 | 
| logical | 5 | 
| numeric | 3 | 
| ________________________ | |
| Group variables | None | 
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace | 
|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 30 | 32 | 0 | 33079 | 0 | 
| title | 6 | 1.00 | 3 | 500 | 0 | 32287 | 0 | 
| display_name | 6 | 1.00 | 3 | 500 | 0 | 32287 | 0 | 
| ab | 1429 | 0.96 | 0 | 51771 | 11 | 30999 | 0 | 
| publication_date | 0 | 1.00 | 10 | 10 | 0 | 3052 | 0 | 
| so | 6193 | 0.81 | 1 | 244 | 0 | 8410 | 0 | 
| so_id | 6193 | 0.81 | 27 | 32 | 0 | 8436 | 0 | 
| host_organization | 13382 | 0.60 | 3 | 155 | 0 | 1308 | 0 | 
| issn_l | 10936 | 0.67 | 9 | 9 | 0 | 7690 | 0 | 
| url | 127 | 1.00 | 21 | 273 | 0 | 32942 | 0 | 
| pdf_url | 21052 | 0.36 | 29 | 359 | 0 | 11940 | 0 | 
| license | 22097 | 0.33 | 3 | 21 | 0 | 11 | 0 | 
| version | 14238 | 0.57 | 15 | 16 | 0 | 3 | 0 | 
| first_page | 13387 | 0.60 | 1 | 15 | 0 | 5079 | 0 | 
| last_page | 13495 | 0.59 | 1 | 15 | 0 | 5092 | 0 | 
| volume | 13788 | 0.58 | 1 | 25 | 0 | 684 | 0 | 
| issue | 17112 | 0.48 | 1 | 30 | 0 | 646 | 0 | 
| oa_status | 0 | 1.00 | 4 | 7 | 0 | 6 | 0 | 
| oa_url | 13547 | 0.59 | 20 | 359 | 0 | 19241 | 0 | 
| language | 2 | 1.00 | 2 | 2 | 0 | 8 | 0 | 
| cited_by_api_url | 0 | 1.00 | 53 | 55 | 0 | 33079 | 0 | 
| doi | 2024 | 0.94 | 26 | 96 | 0 | 31055 | 0 | 
| type | 0 | 1.00 | 6 | 12 | 0 | 6 | 0 | 
| mining_source | 0 | 1.00 | 16 | 18 | 0 | 2 | 0 | 
Variable type: list
| skim_variable | n_missing | complete_rate | n_unique | min_length | max_length | 
|---|---|---|---|---|---|
| author | 253 | 0.99 | 32161 | 1 | 12 | 
| grants | 28778 | 0.13 | 2780 | 0 | 33 | 
| counts_by_year | 15151 | 0.54 | 5709 | 0 | 2 | 
| ids | 0 | 1.00 | 33079 | 1 | 5 | 
| referenced_works | 10682 | 0.68 | 22293 | 0 | 447 | 
| related_works | 403 | 0.99 | 23309 | 1 | 20 | 
| concepts | 0 | 1.00 | 33074 | 5 | 5 | 
| topics | 0 | 1.00 | 32528 | 0 | 5 | 
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count | 
|---|---|---|---|---|
| is_oa | 124 | 1 | 0.53 | TRU: 17614, FAL: 15341 | 
| is_oa_anywhere | 0 | 1 | 0.59 | TRU: 19465, FAL: 13614 | 
| any_repository_has_fulltext | 0 | 1 | 0.29 | FAL: 23606, TRU: 9473 | 
| is_paratext | 0 | 1 | 0.00 | FAL: 33079 | 
| is_retracted | 0 | 1 | 0.00 | FAL: 33034, TRU: 45 | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| relevance_score | 4559 | 0.86 | 9.91 | 26.91 | 0.04 | 0.84 | 3.13 | 7.26 | 986.22 | ▇▁▁▁▁ | 
| cited_by_count | 0 | 1.00 | 9.41 | 38.59 | 0.00 | 0.00 | 1.00 | 5.00 | 1878.00 | ▇▁▁▁▁ | 
| publication_year | 0 | 1.00 | 2021.81 | 2.58 | 1983.00 | 2021.00 | 2023.00 | 2024.00 | 2025.00 | ▁▁▁▁▇ | 
references$openalex$combined$raw_updated %>% 
  frq(type, language)type <character> 
# total N=33079 valid N=33079 mean=1.55 sd=1.38
Value        |     N | Raw % | Valid % | Cum. %
-----------------------------------------------
article      | 28539 | 86.28 |   86.28 |  86.28
book-chapter |    40 |  0.12 |    0.12 |  86.40
editorial    |     2 |  0.01 |    0.01 |  86.40
letter       |     1 |  0.00 |    0.00 |  86.41
preprint     |  4369 | 13.21 |   13.21 |  99.61
review       |   128 |  0.39 |    0.39 | 100.00
<NA>         |     0 |  0.00 |    <NA> |   <NA>
language <character> 
# total N=33079 valid N=33077 mean=2.00 sd=0.06
Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
de    |     1 |  0.00 |    0.00 |   0.00
en    | 33052 | 99.92 |   99.92 |  99.93
es    |    14 |  0.04 |    0.04 |  99.97
fr    |     4 |  0.01 |    0.01 |  99.98
it    |     3 |  0.01 |    0.01 |  99.99
nl    |     1 |  0.00 |    0.00 |  99.99
pt    |     1 |  0.00 |    0.00 | 100.00
sv    |     1 |  0.00 |    0.00 | 100.00
<NA>  |     2 |  0.01 |    <NA> |   <NA>
references$openalex$correct <- references$openalex$combined$raw_updated %>% 
  filter(type %in% c("article", "conference-paper", "preprint")) %>% 
  filter(language == "en") %>% 
  filter(publication_year >= 2016) %>%
  mutate(
  # Create additional factor variables
    publication_year_fct = as.factor(publication_year), 
    type_fct = as.factor(type), 
  # Clean abstracts
    ab = ab %>%
      str_replace_all("\ufffe", "") %>%    # Remove invalid U+FFFE characters
      str_replace_all("[^\x20-\x7E\n]", "") %>% # Optional: Remove other non-ASCII chars
      iconv(from = "UTF-8", to = "UTF-8", sub = ""), # Ensure UTF-8 encoding
  ) Export data
qs::qsave(references$openalex$correct, file = here("local_data/references.qs"))
qs::qsave(references, file = here("local_data/references_full.qs"))references_bibliometrix <- oa2bibliometrix(references$openalex$correct)
saveRDS(references_bibliometrix, file = here("local_data/references_import_bibliometrix.RDS"))