Unsupervised Machine Learning I

Session 09 - Showcase

Published

24.06.2024

Quarto Slide Link to slides

Preparation

if (!require("pacman")) install.packages("pacman")
pacman::p_load(
    here, qs, # file management
    magrittr, janitor, # data wrangling
    easystats, sjmisc, # data analysis
    gt, gtExtras, # table visualization
    ggpubr, ggwordcloud, # visualization
    # text analysis    
    tidytext, widyr, # based on tidytext
    quanteda, # based on quanteda
    quanteda.textmodels, quanteda.textplots, quanteda.textstats, 
    stm, # structural topic modeling
    openalexR, 
    tidyverse # load last to avoid masking issues
  )

Codechunks aus der Sitzung

Erstellung und Bearbeitung der Subsample

review_subsample <- review_works_correct %>% 
  # Eingrenzung: Sprache und Typ
  filter(language == "en") %>% 
  filter(type == "article") %>%
  # Datentranformation
  unnest(topics, names_sep = "_") %>%
  filter(topics_name == "field") %>% 
  filter(topics_i == "1") %>% 
  # Eingrenzung: Forschungsfeldes
  filter(
   topics_display_name == "Social Sciences"|
   topics_display_name == "Psychology"
    )

# Überblick 
review_subsample %>% glimpse
Rows: 45,221
Columns: 45
$ id                          <chr> "https://openalex.org/W4293003987", "https…
$ title                       <chr> "The WHO-5 Well-Being Index: A Systematic …
$ display_name                <chr> "The WHO-5 Well-Being Index: A Systematic …
$ author                      <list> [<data.frame[4 x 12]>], [<data.frame[2 x …
$ ab                          <chr> "The 5-item World Health Organization Well…
$ publication_date            <chr> "2015-01-01", "2017-08-28", "2014-01-01", …
$ relevance_score             <dbl> 938.7603, 752.3500, 591.2553, 576.1210, 56…
$ so                          <chr> "Psychotherapy and psychosomatics", "Journ…
$ so_id                       <chr> "https://openalex.org/S184803288", "https:…
$ host_organization           <chr> "Karger Publishers", "SAGE Publishing", NA…
$ issn_l                      <chr> "0033-3190", "0739-456X", NA, "2214-7829",…
$ url                         <chr> "https://doi.org/10.1159/000376585", "http…
$ pdf_url                     <chr> "https://www.karger.com/Article/Pdf/376585…
$ license                     <chr> "cc-by-nc", NA, NA, "cc-by", NA, NA, "cc-b…
$ version                     <chr> "publishedVersion", NA, "publishedVersion"…
$ first_page                  <chr> "167", "93", NA, "89", "55", "2150", "e356…
$ last_page                   <chr> "176", "112", NA, "106", "64", "2159", "e3…
$ volume                      <chr> "84", "39", NA, "6", "277", "32", "2", "24…
$ issue                       <chr> "3", "1", NA, NA, NA, "19", "8", NA, "9", …
$ is_oa                       <lgl> TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE…
$ is_oa_anywhere              <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE,…
$ oa_status                   <chr> "hybrid", "green", "bronze", "gold", "bron…
$ oa_url                      <chr> "https://www.karger.com/Article/Pdf/376585…
$ any_repository_has_fulltext <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE…
$ language                    <chr> "en", "en", "en", "en", "en", "en", "en", …
$ grants                      <list> NA, NA, NA, <"https://openalex.org/F43203…
$ cited_by_count              <int> 2657, 1375, 2568, 803, 3664, 1553, 2895, 9…
$ counts_by_year              <list> [<data.frame[11 x 2]>], [<data.frame[7 x …
$ publication_year            <int> 2015, 2017, 2014, 2016, 2020, 2014, 2017, …
$ cited_by_api_url            <chr> "https://api.openalex.org/works?filter=cit…
$ ids                         <list> <"https://openalex.org/W4293003987", "htt…
$ doi                         <chr> "https://doi.org/10.1159/000376585", "http…
$ type                        <chr> "article", "article", "article", "article"…
$ referenced_works            <list> <"https://openalex.org/W1492518593", "htt…
$ related_works               <list> <"https://openalex.org/W3020194755", "htt…
$ is_paratext                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ is_retracted                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ concepts                    <list> [<data.frame[7 x 5]>], [<data.frame[18 x …
$ topics_i                    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ topics_score                <dbl> 0.9926, 0.9050, 0.9995, 0.9987, 0.9999, 1.…
$ topics_name                 <chr> "field", "field", "field", "field", "field…
$ topics_id                   <chr> "https://openalex.org/fields/32", "https:/…
$ topics_display_name         <chr> "Psychology", "Social Sciences", "Psycholo…
$ publication_year_fct        <fct> 2015, 2017, 2014, 2016, 2020, 2014, 2017, …
$ type_fct                    <fct> article, article, article, article, articl…

Exkurs: Identfikation von fehlenden Werten

visdat::vis_miss(review_subsample, warn_large_data = FALSE)
Figure 1

Anpassung der Subsample

review_subsample <- review_works_correct %>%
  # Eingrenzung: Sprache und Typ
  filter(language == "en") %>% 
  filter(type == "article") %>%
  # Datentranformation
  unnest(topics, names_sep = "_") %>%
  filter(topics_name == "field") %>% 
  filter(topics_i == "1") %>% 
  # Eingrenzung: Forschungsfeldes
  filter(
   topics_display_name == "Social Sciences"|
   topics_display_name == "Psychology"
   ) %>% 
  # Eingrenzung: Keine Einträge ohne Abstract
  filter(!is.na(ab))

# Überblick 
review_subsample %>% glimpse
Rows: 36,680
Columns: 45
$ id                          <chr> "https://openalex.org/W4293003987", "https…
$ title                       <chr> "The WHO-5 Well-Being Index: A Systematic …
$ display_name                <chr> "The WHO-5 Well-Being Index: A Systematic …
$ author                      <list> [<data.frame[4 x 12]>], [<data.frame[2 x …
$ ab                          <chr> "The 5-item World Health Organization Well…
$ publication_date            <chr> "2015-01-01", "2017-08-28", "2014-01-01", …
$ relevance_score             <dbl> 938.7603, 752.3500, 591.2553, 576.1210, 56…
$ so                          <chr> "Psychotherapy and psychosomatics", "Journ…
$ so_id                       <chr> "https://openalex.org/S184803288", "https:…
$ host_organization           <chr> "Karger Publishers", "SAGE Publishing", NA…
$ issn_l                      <chr> "0033-3190", "0739-456X", NA, "2214-7829",…
$ url                         <chr> "https://doi.org/10.1159/000376585", "http…
$ pdf_url                     <chr> "https://www.karger.com/Article/Pdf/376585…
$ license                     <chr> "cc-by-nc", NA, NA, "cc-by", NA, NA, "cc-b…
$ version                     <chr> "publishedVersion", NA, "publishedVersion"…
$ first_page                  <chr> "167", "93", NA, "89", "55", "2150", "e356…
$ last_page                   <chr> "176", "112", NA, "106", "64", "2159", "e3…
$ volume                      <chr> "84", "39", NA, "6", "277", "32", "2", "24…
$ issue                       <chr> "3", "1", NA, NA, NA, "19", "8", NA, "9", …
$ is_oa                       <lgl> TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE…
$ is_oa_anywhere              <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE,…
$ oa_status                   <chr> "hybrid", "green", "bronze", "gold", "bron…
$ oa_url                      <chr> "https://www.karger.com/Article/Pdf/376585…
$ any_repository_has_fulltext <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE…
$ language                    <chr> "en", "en", "en", "en", "en", "en", "en", …
$ grants                      <list> NA, NA, NA, <"https://openalex.org/F43203…
$ cited_by_count              <int> 2657, 1375, 2568, 803, 3664, 1553, 2895, 9…
$ counts_by_year              <list> [<data.frame[11 x 2]>], [<data.frame[7 x …
$ publication_year            <int> 2015, 2017, 2014, 2016, 2020, 2014, 2017, …
$ cited_by_api_url            <chr> "https://api.openalex.org/works?filter=cit…
$ ids                         <list> <"https://openalex.org/W4293003987", "htt…
$ doi                         <chr> "https://doi.org/10.1159/000376585", "http…
$ type                        <chr> "article", "article", "article", "article"…
$ referenced_works            <list> <"https://openalex.org/W1492518593", "htt…
$ related_works               <list> <"https://openalex.org/W3020194755", "htt…
$ is_paratext                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ is_retracted                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ concepts                    <list> [<data.frame[7 x 5]>], [<data.frame[18 x …
$ topics_i                    <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ topics_score                <dbl> 0.9926, 0.9050, 0.9995, 0.9987, 0.9999, 1.…
$ topics_name                 <chr> "field", "field", "field", "field", "field…
$ topics_id                   <chr> "https://openalex.org/fields/32", "https:/…
$ topics_display_name         <chr> "Psychology", "Social Sciences", "Psycholo…
$ publication_year_fct        <fct> 2015, 2017, 2014, 2016, 2020, 2014, 2017, …
$ type_fct                    <fct> article, article, article, article, articl…

DTM/DFM Erstellung

`tidytext``

# Create tidy data
subsample_tidy <- review_subsample %>% 
    tidytext::unnest_tokens("text", ab) %>% 
    filter(!text %in% tidytext::stop_words$word)

# Create summarized data
subsample_summarized <- subsample_tidy %>% 
  count(id, text) 

# Create DTM
subsample_dtm <- subsample_summarized %>% 
  cast_dtm(id, text, n)

# Preview
subsample_dtm
<<DocumentTermMatrix (documents: 36654, terms: 122147)>>
Non-/sparse entries: 3280664/4473895474
Sparsity           : 100%
Maximal term length: 188
Weighting          : term frequency (tf)

quanteda

# Create corpus
quanteda_corpus <- review_subsample %>% 
  quanteda::corpus(
    docid_field = "id", 
    text_field = "ab"
  )

# Tokenize
quanteda_token <- quanteda_corpus %>% 
  quanteda::tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE, 
    remove_numbers = TRUE, 
    remove_url = TRUE, 
    split_tags = FALSE # keep hashtags and mentions
  ) %>% 
  quanteda::tokens_tolower() %>% 
  quanteda::tokens_remove(
    pattern = stopwords("en")
    )

# Convert to Document-Feature-Matrix (DFM)
quanteda_dfm <- quanteda_token %>% 
  quanteda::dfm()

# Preview
quanteda_dfm

Netzwerk der Top-Begriffe

tidytext

# Extract most common hashtags
top50_features_tidy <- subsample_tidy %>% 
  count(text, sort = TRUE) %>%
  slice_head(n = 50) %>% 
  pull(text)

# Visualize
subsample_tidy %>% 
  count(id, text, sort = TRUE) %>% 
  filter(!is.na(text)) %>% 
  cast_dfm(id, text, n) %>% 
  quanteda::fcm() %>% 
  quanteda::fcm_select(
    pattern = top50_features_tidy,
    case_insensitive = FALSE
  ) %>%  
  quanteda.textplots::textplot_network(
    edge_color = "#04316A"
  )

quanteda

# Extract most common features 
top50_features_quanteda <- quanteda_dfm %>% 
  topfeatures(50) %>% 
  names()

# Construct feature-occurrence matrix of features
quanteda_dfm %>% 
  fcm() %>% 
  fcm_select(pattern = top50_features_quanteda) %>% 
  textplot_network(
    edge_color = "#C50F3C"
  ) 

Preparation for STM

# Pruning
quanteda_dfm_trim <- quanteda_dfm %>% 
  dfm_trim( 
    min_docfreq = 10/nrow(review_subsample),
    max_docfreq = 0.99, 
    docfreq_type = "prop")

# Convert for stm topic modeling
quanteda_stm <- quanteda_dfm_trim %>% 
   convert(to = "stm")

Structural Topic Model

Schätzung

tictoc::tic()
stm_mdl <- stm::stm(
  documents = quanteda_stm$documents,
  vocab = quanteda_stm$vocab, 
  K = 20, 
  seed = 42,
  max.em.its = 10,
  init.type = "Spectral",
  verbose = TRUE)
tictoc::toc(log = TRUE)

Modelinformationen

# Überblick über STM
stm_mdl
A topic model with 20 topics, 36650 documents and a 14322 word dictionary.

Überblick über die Themen

# Simple
plot(stm_mdl, type = "summary")

# Komplex
top_gamma <- stm_mdl %>%
  tidy(matrix = "gamma") %>% 
  dplyr::group_by(topic) %>%
  dplyr::summarise(gamma = mean(gamma), .groups = "drop") %>%
  dplyr::arrange(desc(gamma))

top_beta <- stm_mdl %>%
  tidytext::tidy(.) %>% 
  dplyr::group_by(topic) %>%
  dplyr::arrange(-beta) %>%
  dplyr::top_n(10, wt = beta) %>% 
  dplyr::select(topic, term) %>%
  dplyr::summarise(terms_beta = toString(term), .groups = "drop")

top_topics_terms <- top_beta %>% 
  dplyr::left_join(top_gamma, by = "topic") %>%
  dplyr::mutate(
          topic = reorder(topic, gamma)
      )

# Preview
top_topics_terms %>%
  mutate(across(gamma, ~round(.,3))) %>% 
  dplyr::arrange(-gamma) %>% 
  gt() %>% 
  cols_label(
    topic = "Topic", 
    terms_beta = "Top Terms (based on beta)",
    gamma = "Gamma"
  ) %>% 
  gtExtras::gt_theme_538()
Topic Top Terms (based on beta) Gamma
16 research, literature, review, paper, systematic, future, study, analysis, findings, knowledge 0.142
19 studies, interventions, review, systematic, evidence, outcomes, included, quality, intervention, health 0.096
13 health, mental, care, review, support, family, children, social, factors, studies 0.079
9 learning, students, education, school, review, skills, educational, teachers, teaching, study 0.072
11 study, literature, research, can, work, development, review, also, human, economic 0.068
15 studies, ci, effect, meta-analysis, depression, p, trials, anxiety, effects, interventions 0.061
14 social, media, information, use, digital, data, technology, communication, review, research 0.060
17 studies, children, relationship, review, language, variables, research, factors, results, effects 0.052
6 prevalence, studies, covid-19, suicide, among, risk, pandemic, countries, ci, vaccine 0.050
1 sleep, studies, eating, cognitive, review, associated, weight, body, may, association 0.044
2 health, studies, women, gender, review, care, social, services, cultural, access 0.043
18 studies, health, used, measures, review, tools, instruments, assessment, training, n 0.039
3 treatment, disorder, disorders, patients, ptsd, symptoms, clinical, studies, therapy, anxiety 0.036
7 articles, review, adolescents, studies, literature, search, use, results, systematic, databases 0.036
4 patients, articles, review, music, therapy, cancer, can, study, pain, life 0.032
12 violence, studies, use, sexual, risk, h3, ipv, substance, alcohol, review 0.031
5 et, al, university, review, gt, literature, lt, author, p, search 0.030
8 physical, training, studies, disability, exercise, disabilities, employment, ed, review, strength 0.015
20 attachment, studies, scholar, google, science, review, social, styles, welfare, research 0.009
10 de, la, y, en, los, e, 的, el, se, que 0.004

Themenkorrelation

stm_corr <- stm::topicCorr(stm_mdl)
plot(stm_corr)

Fokus auf einzele Themen

Prominente Wörter
# Fokus auf Themas 16
stm::labelTopics(stm_mdl, topic=16)
Topic 16 Top Words:
     Highest Prob: research, literature, review, paper, systematic, future, study 
     FREX: tourism, sustainable, sustainability, originality, innovation, conceptual, agenda 
     Lift: paradigmatic, hrd, edlm, positivist, internationalisation, tccm, wom 
     Score: tourism, research, literature, paper, leadership, themes, sustainable 
# Fokus auf Thema 10
stm::labelTopics(stm_mdl, topic=10)
Topic 10 Top Words:
     Highest Prob: de, la, y, en, los, e, 的 
     FREX: resultados, foram, que, sobre, intervenciones, riesgo, uma 
     Lift: criterios, efecto, así, comparación, cumplieron, debido, depresión 
     Score: de, la, 的, en, los, y, que