if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  # file management
  here, qs,
  # data wrangling
  magrittr, janitor,
  # data analysis
  easystats, sjmisc,
  # visualization
  ggpubr, ggwordcloud,
  # fancy tables
  gt, gtExtras,
  # tidy text processing
  tidytext, textdata, widyr,
  # quanteda text processing
  quanteda,
  quanteda.textplots,
  topicmodels, stm,
  # load last to avoid masking issues
  tidyverse
)
Showcase 10: 🔨 Automatic analysis of text in R
Digital disconnection on Twitter
Background
Increasing trend towards more conscious use of digital media (devices), including (deliberate) non-use with the aim of restoring or improving psychological well-being (among other outcomes)
But how do “we” talk about digital detox/disconnection: 💊 drug, 👹 demon or 🍩 donut?
Today’s data basis: Twitter dataset
- Collection of all tweets up to the beginning of 2023 that mention or discuss digital detox (and similar terms) on Twitter (not 𝕏)
- Initial query searched for “digital detox”, “#digitaldetox” and “digital_detox”
- Access via the official Academic Twitter API using academictwitteR (Barrie and Ho 2021) at the beginning of last year (a sketch of such a query follows below)
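For context, a minimal sketch of what such a query could have looked like with academictwitteR; the date range, token handling, and result cap are illustrative assumptions, and the Academic Research API itself has since been retired:

# Illustrative sketch only, not the exact collection script
library(academictwitteR)

tweets_raw <- get_all_tweets(
  query = c("digital detox", "#digitaldetox", "digital_detox"), # terms are OR-combined
  start_tweets = "2006-03-21T00:00:00Z", # placeholder start date
  end_tweets = "2023-01-01T00:00:00Z", # "up to the beginning of 2023"
  bearer_token = get_bearer(), # assumes TWITTER_BEARER is set in .Renviron
  n = Inf # no upper bound on the number of tweets
)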
Preparation
Import and process the data
# Import raw data from local
tweets <- qs::qread(here("local_data/tweets-digital_detox.qs"))$raw %>%
  janitor::clean_names()
# Initial data processing
tweets_correct <- tweets %>%
  mutate(
    # reformat and create datetime variables
    across(created_at, ~ymd_hms(.)), # convert to dttm format
    year = year(created_at),
    month = month(created_at),
    day = day(created_at),
    hour = hour(created_at),
    minute = minute(created_at),
    # create additional variables
    retweet_dy = str_detect(text, "^RT"), # identify retweets
    detox_dy = str_detect(text, "#digitaldetox")
  ) %>%
  distinct(tweet_id, .keep_all = TRUE)
# Filter relevant tweets
tweets_detox <- tweets_correct %>%
  filter(
    detox_dy == TRUE, # only tweets with #digitaldetox
    retweet_dy == FALSE, # no retweets
    lang == "en" # only English tweets
  )
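A quick cross-tabulation shows how many tweets each filter criterion removes; this is an optional, illustrative check:

# Optional sanity check (illustrative)
tweets_correct %>%
  count(detox_dy, retweet_dy, english = lang == "en")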
# Topic models
stm_k0 <- qs::qread(here("local_data/stm_k0.qs"))
stm_exploration <- qs::qread(here("local_data/stm_exploration.qs"))
stm_results <- qs::qread(here("local_data/stm_results.qs"))
Text as data in R (Part II)
Step-by-step DTM creation
# Common HTML entities
remove_reg <- "&amp;|&lt;|&gt;"

# Create tidy data
tweets_tidy <- tweets_detox %>%
  mutate(text = str_remove_all(text, remove_reg)) %>%
  tidytext::unnest_tokens("text", text) %>%
  filter(!text %in% tidytext::stop_words$word)
# Create summarized data
tweets_summarized <- tweets_tidy %>%
  count(tweet_id, text)

# Create DTM
tweets_dfm <- tweets_summarized %>%
  cast_dfm(tweet_id, text, n)
# Preview
tweets_dfm
Document-feature matrix of: 46,670 documents, 87,172 features (99.99% sparse) and 0 docvars.
features
docs bite detox digital digitaldetox enjoy fly happitizer
1000009901563838465 1 1 1 1 2 1 1
1000038819520008193 0 0 0 1 0 0 0
1000042717492187136 0 0 0 1 0 0 0
1000043574673715203 0 0 0 1 0 0 0
1000053895035531264 0 1 1 1 0 0 0
1000075155891281925 0 0 0 1 0 0 0
features
docs happitizers https inspiration
1000009901563838465 1 1 1
1000038819520008193 0 1 0
1000042717492187136 0 1 0
1000043574673715203 0 2 0
1000053895035531264 0 2 0
1000075155891281925 0 1 0
[ reached max_ndoc ... 46,664 more documents, reached max_nfeat ... 87,162 more features ]
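A quick, optional check on the resulting DFM: quanteda::topfeatures() lists the most frequent features. As the preview above suggests (note the https feature), URL fragments survive this tidytext route; the quanteda pipeline below removes them with remove_url = TRUE.

# Illustrative check: most frequent features in the tidytext-built DFM
quanteda::topfeatures(tweets_dfm, n = 10)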
# Create corpus
quanteda_corpus <- tweets_detox %>%
  quanteda::corpus(
    docid_field = "tweet_id",
    text_field = "text"
  )
# Tokenize
<- quanteda_corpus %>%
quanteda_token ::tokens(
quantedaremove_punct = TRUE,
remove_symbols = TRUE,
remove_numbers = TRUE,
remove_url = TRUE,
split_tags = FALSE # keep hashtags and mentions
%>%
) ::tokens_tolower() %>%
quanteda::tokens_remove(
quantedapattern = stopwords("en")
)
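Before reducing the tweets to a bag of words, an optional keyword-in-context view helps to eyeball how a term is actually used; a minimal sketch:

# Optional (illustrative): keyword-in-context for the central hashtag
quanteda::kwic(quanteda_token, pattern = "#digitaldetox", window = 5) %>%
  head()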
# Convert to Document-Feature-Matrix (DFM)
quanteda_dfm <- quanteda_token %>%
  quanteda::dfm()
# Preview
quanteda_dfm
Document-feature matrix of: 46,670 documents, 49,123 features (99.98% sparse) and 35 docvars.
features
docs put blackberry iphone read pew report teens distracted driving
5777201122 1 1 1 1 1 1 1 1 1
4814687834 0 0 0 0 0 0 0 0 0
4813781509 0 0 0 0 0 0 0 0 0
3351604894 0 0 0 0 0 0 0 0 0
3350930292 0 0 0 0 0 0 0 0 0
3349372574 0 0 0 0 0 0 0 0 0
features
docs #digitaldetox
5777201122 1
4814687834 1
4813781509 1
3351604894 1
3350930292 1
3349372574 1
[ reached max_ndoc ... 46,664 more documents, reached max_nfeat ... 49,113 more features ]
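Because split_tags = FALSE kept hashtags intact, they can be isolated with a glob pattern; an optional, illustrative check:

# Which hashtags dominate the corpus?
quanteda_dfm %>%
  quanteda::dfm_select(pattern = "#*") %>%
  quanteda::topfeatures(n = 10)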
Topic modeling
Preparation
Pruning
quanteda_dfm_trim <- quanteda_dfm %>%
  dfm_trim(
    min_docfreq = 0.0001, # keep features occurring in at least 0.01% of documents
    max_docfreq = 0.099, # drop features occurring in more than 9.9% of documents (e.g. #digitaldetox itself)
    docfreq_type = "prop"
  )

# Convert for stm topic modeling
quanteda_stm <- quanteda_dfm_trim %>%
  convert(to = "stm")
Converting the trimmed DFM to an stm object will produce a warning message:

Warning message:
In dfm2stm(x, docvars, omit_empty = TRUE) : Dropped 96 empty document(s)
This is because some tweets are “empty”, i.e. they no longer match any feature after the pruning. To successfully match the stm results with the original data, these “empty” tweets need to be dropped. To identify the empty cases, run:
# Check if tweet contains feature
empty_docs <- Matrix::rowSums(as(quanteda_dfm_trim, "Matrix")) == 0

# Create vector for empty tweet identification
empty_docs_ids <- quanteda_dfm_trim@docvars$docname[empty_docs]

# Optional: Print indices of empty documents
if (any(empty_docs)) {
  cat("Indices of empty documents:", which(empty_docs), "\n")
  # Print corresponding docnames
  cat("Docnames of empty documents:", quanteda_dfm_trim@docvars$docname[empty_docs], "\n")
}
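An alternative sketch (with a hypothetical object name) drops the empty documents from the DFM itself before converting, so that the DFM and the stm input stay aligned from the start:

# Keep only documents with at least one remaining feature
quanteda_dfm_nonempty <- quanteda::dfm_subset(
  quanteda_dfm_trim,
  quanteda::ntoken(quanteda_dfm_trim) > 0
)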
Select model
based on k = 0
# Estimate model
stm_k0 <- stm(
  quanteda_stm$documents,
  quanteda_stm$vocab,
  K = 0,
  max.em.its = 50,
  init.type = "Spectral",
  seed = 42
)
# Preview
stm_k0
A topic model with 72 topics, 46574 documents and a 8268 word dictionary.
based on different model diagnostics
# Set up parallel processing using furrr
future::plan(future::multisession()) # use multiple sessions

# Estimate multiple models
stm_exploration <- tibble(k = seq(from = 5, to = 85, by = 5)) %>%
  mutate(mdl = furrr::future_map(k, ~stm::stm(
    documents = quanteda_stm$documents,
    vocab = quanteda_stm$vocab,
    K = .,
    seed = 42,
    max.em.its = 1000,
    init.type = "Spectral",
    verbose = FALSE),
    .options = furrr::furrr_options(seed = 42))
  )

stm_exploration$mdl
[[1]]
A topic model with 5 topics, 46574 documents and a 8268 word dictionary.
[[2]]
A topic model with 10 topics, 46574 documents and a 8268 word dictionary.
[[3]]
A topic model with 15 topics, 46574 documents and a 8268 word dictionary.
[[4]]
A topic model with 20 topics, 46574 documents and a 8268 word dictionary.
[[5]]
A topic model with 25 topics, 46574 documents and a 8268 word dictionary.
[[6]]
A topic model with 30 topics, 46574 documents and a 8268 word dictionary.
[[7]]
A topic model with 35 topics, 46574 documents and a 8268 word dictionary.
[[8]]
A topic model with 40 topics, 46574 documents and a 8268 word dictionary.
[[9]]
A topic model with 45 topics, 46574 documents and a 8268 word dictionary.
[[10]]
A topic model with 50 topics, 46574 documents and a 8268 word dictionary.
[[11]]
A topic model with 55 topics, 46574 documents and a 8268 word dictionary.
[[12]]
A topic model with 60 topics, 46574 documents and a 8268 word dictionary.
[[13]]
A topic model with 65 topics, 46574 documents and a 8268 word dictionary.
[[14]]
A topic model with 70 topics, 46574 documents and a 8268 word dictionary.
[[15]]
A topic model with 75 topics, 46574 documents and a 8268 word dictionary.
[[16]]
A topic model with 80 topics, 46574 documents and a 8268 word dictionary.
[[17]]
A topic model with 85 topics, 46574 documents and a 8268 word dictionary.
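As an aside, stm ships its own convenience wrapper, searchK(), which computes similar diagnostics (held-out likelihood, residuals, semantic coherence) in a single call; a minimal sketch, assuming the same candidate values of K and a hypothetical object name:

# Alternative via stm's built-in wrapper (illustrative)
stm_search <- stm::searchK(
  documents = quanteda_stm$documents,
  vocab = quanteda_stm$vocab,
  K = seq(from = 5, to = 85, by = 5),
  init.type = "Spectral",
  verbose = FALSE
)
plot(stm_search)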
Exploration
# Create heldout
heldout <- make.heldout(
  quanteda_stm$documents,
  quanteda_stm$vocab,
  seed = 42)

# Create model diagnostics
stm_results <- stm_exploration %>%
  mutate(exclusivity = map(mdl, exclusivity),
         semantic_coherence = map(mdl, semanticCoherence, quanteda_stm$documents),
         eval_heldout = map(mdl, eval.heldout, heldout$missing),
         residual = map(mdl, checkResiduals, quanteda_stm$documents),
         bound = map_dbl(mdl, function(x) max(x$convergence$bound)),
         lfact = map_dbl(mdl, function(x) lfactorial(x$settings$dim$K)),
         lbound = bound + lfact,
         iterations = map_dbl(mdl, function(x) length(x$convergence$bound)))
# Preview
stm_results
# A tibble: 17 × 10
k mdl exclusivity semantic_coherence eval_heldout residual bound
<dbl> <list> <list> <list> <list> <list> <dbl>
1 5 <STM> <dbl [5]> <dbl [5]> <named list> <named list> -3.25e6
2 10 <STM> <dbl [10]> <dbl [10]> <named list> <named list> -3.21e6
3 15 <STM> <dbl [15]> <dbl [15]> <named list> <named list> -3.17e6
4 20 <STM> <dbl [20]> <dbl [20]> <named list> <named list> -3.12e6
5 25 <STM> <dbl [25]> <dbl [25]> <named list> <named list> -3.09e6
6 30 <STM> <dbl [30]> <dbl [30]> <named list> <named list> -3.09e6
7 35 <STM> <dbl [35]> <dbl [35]> <named list> <named list> -3.08e6
8 40 <STM> <dbl [40]> <dbl [40]> <named list> <named list> -3.07e6
9 45 <STM> <dbl [45]> <dbl [45]> <named list> <named list> -3.07e6
10 50 <STM> <dbl [50]> <dbl [50]> <named list> <named list> -3.06e6
11 55 <STM> <dbl [55]> <dbl [55]> <named list> <named list> -3.05e6
12 60 <STM> <dbl [60]> <dbl [60]> <named list> <named list> -3.05e6
13 65 <STM> <dbl [65]> <dbl [65]> <named list> <named list> -3.05e6
14 70 <STM> <dbl [70]> <dbl [70]> <named list> <named list> -3.07e6
15 75 <STM> <dbl [75]> <dbl [75]> <named list> <named list> -3.06e6
16 80 <STM> <dbl [80]> <dbl [80]> <named list> <named list> -3.03e6
17 85 <STM> <dbl [85]> <dbl [85]> <named list> <named list> -3.06e6
# ℹ 3 more variables: lfact <dbl>, lbound <dbl>, iterations <dbl>
# Visualize
stm_results %>%
  transmute(
    k,
    `Lower bound` = lbound,
    Residuals = map_dbl(residual, "dispersion"),
    `Semantic coherence` = map_dbl(semantic_coherence, mean),
    `Held-out likelihood` = map_dbl(eval_heldout, "expected.heldout")) %>%
  gather(Metric, Value, -k) %>%
  ggplot(aes(k, Value, color = Metric)) +
  geom_line(linewidth = 1.5, alpha = 0.7, show.legend = FALSE) +
  geom_point(size = 3) +
  scale_x_continuous(breaks = seq(from = 5, to = 85, by = 5)) +
  facet_wrap(~Metric, scales = "free_y") +
  labs(x = "K (number of topics)",
       y = NULL,
       title = "Model diagnostics by number of topics") +
  theme_pubr() +
  # add highlights
  geom_vline(aes(xintercept = 5), color = "#00BFC4", alpha = .5) +
  geom_vline(aes(xintercept = 10), color = "#C77CFF", alpha = .5) +
  geom_vline(aes(xintercept = 40), color = "#C77CFF", alpha = .5)
# Models for comparison
models_for_comparison <- c(5, 10, 40)

# Create figures
fig_excl <- stm_results %>%
  # Edit data
  select(k, exclusivity, semantic_coherence) %>%
  filter(k %in% models_for_comparison) %>%
  unnest(cols = c(exclusivity, semantic_coherence)) %>%
  mutate(k = as.factor(k)) %>%
  # Build graph
  ggplot(aes(semantic_coherence, exclusivity, color = k)) +
  geom_point(size = 2, alpha = 0.7) +
  labs(
    x = "Semantic coherence",
    y = "Exclusivity"
    # title = "Comparing exclusivity and semantic coherence",
    # subtitle = "Models with fewer topics have higher semantic coherence, but lower exclusivity"
  ) +
  theme_pubr()

# Create plotly
fig_excl %>% plotly::ggplotly()
Understanding
Select model for further analysis
n_topics <- 10

tpm <- stm_results %>%
  filter(k == n_topics) %>%
  pull(mdl) %>%
  .[[1]]
Overview
tpm %>% plot(type = "summary")
tpm %>% plot(type = "hist")
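stm also offers direct, text-based summaries of the fitted model; a minimal sketch, assuming the text vector is aligned with the stm input (i.e. the empty tweets are already removed):

# Word-based topic labels straight from stm
stm::labelTopics(tpm, n = 10)

# Most representative tweets for one topic (alignment assumption, see above)
stm::findThoughts(
  tpm,
  texts = tweets_detox %>%
    filter(!(tweet_id %in% empty_docs_ids)) %>%
    pull(text),
  topics = 10,
  n = 3
)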
Document/Word-topic relations
# Create data
top_gamma <- tpm %>%
  tidy(matrix = "gamma") %>%
  dplyr::group_by(topic) %>%
  dplyr::summarise(gamma = mean(gamma), .groups = "drop") %>%
  dplyr::arrange(desc(gamma))

top_beta <- tpm %>%
  tidytext::tidy(.) %>%
  dplyr::group_by(topic) %>%
  dplyr::arrange(-beta) %>%
  dplyr::top_n(10, wt = beta) %>%
  dplyr::select(topic, term) %>%
  dplyr::summarise(terms_beta = toString(term), .groups = "drop")

top_topics_terms <- top_beta %>%
  dplyr::left_join(top_gamma, by = "topic") %>%
  dplyr::mutate(
    topic = paste0("Topic ", topic),
    topic = reorder(topic, gamma)
  )

# Preview
top_topics_terms %>%
  mutate(across(gamma, ~round(., 3))) %>%
  dplyr::arrange(-gamma) %>%
  gt()
| topic | terms_beta | gamma |
|---|---|---|
| Topic 10 | phone, via, much, #screentime, know, check, people, phones, #parenting, online | 0.122 |
| Topic 9 | help, read, devices, use, world, tech, family, taking, kids, sleep | 0.114 |
| Topic 3 | need, great, week, going, listen, #unplugging, discuss, @icphenomenallyu, away, well | 0.111 |
| Topic 2 | #unplug, just, weekend, #travel, unplug, enjoy, #nature, really, join, nature | 0.103 |
| Topic 8 | new, technology, smartphone, retreat, work, addiction, year, health, internet, without | 0.099 |
| Topic 7 | can, #mindfulness, feel, #switchoff, #digitalwellbeing, #wellness, things, #phonefree, #disconnecttoreconnect, #digitalminimalism | 0.098 |
| Topic 4 | amp, day, take, #mentalhealth, go, try, #wellbeing, now, give, every | 0.092 |
| Topic 1 | social, media, get, life, back, like, good, #socialmedia, find, see | 0.091 |
| Topic 5 | us, today, days, next, put, happy, may, facebook, share, hour | 0.089 |
| Topic 6 | one, break, tips, make, screen, love, #technology, free, looking, getting | 0.080 |
Document-topic relations
# Prepare for merging
topic_gammas <- tpm %>%
  tidy(matrix = "gamma") %>%
  dplyr::group_by(document) %>%
  tidyr::pivot_wider(
    id_cols = document,
    names_from = "topic",
    names_prefix = "gamma_topic_",
    values_from = "gamma")

gammas <- tpm %>%
  tidytext::tidy(matrix = "gamma") %>%
  dplyr::group_by(document) %>%
  dplyr::slice_max(gamma) %>%
  dplyr::mutate(main_topic = ifelse(gamma > 0.5, topic, NA)) %>%
  rename(
    top_topic = topic,
    top_gamma = gamma) %>%
  ungroup() %>%
  left_join(., topic_gammas, by = join_by(document))

# Merge with original data
tweets_detox_topics <- tweets_detox %>%
  filter(!(tweet_id %in% empty_docs_ids)) %>%
  bind_cols(gammas) %>%
  select(-document)
# Preview
tweets_detox_topics
# A tibble: 46,574 × 50
tweet_id user_username text created_at in_reply_to_user_id
<chr> <chr> <chr> <dttm> <chr>
1 5777201122 pblackshaw Put down yo… 2009-11-16 22:03:12 <NA>
2 4814687834 andrewgerrard @Dawn_Wylie… 2009-10-12 18:48:30 18003723
3 4813781509 pblackshaw Is Social M… 2009-10-12 17:55:57 <NA>
4 3351604894 KurtyD Google gear… 2009-08-16 23:20:53 <NA>
5 3350930292 KurtyD Wifi on the… 2009-08-16 22:31:22 <NA>
6 3349372574 KurtyD Adios amigo… 2009-08-16 20:37:03 <NA>
7 2722418665 evelynanne Ok, I'm hea… 2009-07-19 14:31:16 <NA>
8 1959962952 halriley I'm thinkin… 2009-05-29 14:11:36 <NA>
9 1638612063 deconst Interesting… 2009-04-28 13:03:14 <NA>
10 1626172818 deconst Back from #… 2009-04-27 03:55:25 <NA>
# ℹ 46,564 more rows
# ℹ 45 more variables: author_id <chr>, lang <chr>, possibly_sensitive <lgl>,
# conversation_id <chr>, user_created_at <chr>, user_protected <lgl>,
# user_name <chr>, user_verified <lgl>, user_description <chr>,
# user_location <chr>, user_url <chr>, user_profile_image_url <chr>,
# user_pinned_tweet_id <chr>, retweet_count <int>, like_count <int>,
# quote_count <int>, user_tweet_count <int>, user_list_count <int>, …
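Note that bind_cols() above relies on the rows of gammas lining up exactly with the filtered tweets. A more defensive variant (a sketch with hypothetical object names, assuming convert(to = "stm") carries the docnames as names of the documents list) joins explicitly on the document IDs:

# Carry docnames into the tidy gamma output, then join on tweet_id
gammas_named <- tpm %>%
  tidytext::tidy(
    matrix = "gamma",
    document_names = names(quanteda_stm$documents)
  ) %>%
  dplyr::group_by(document) %>%
  dplyr::slice_max(gamma) %>%
  dplyr::ungroup() %>%
  dplyr::rename(top_topic = topic, top_gamma = gamma)

tweets_detox_topics_joined <- tweets_detox %>%
  dplyr::inner_join(gammas_named, by = c("tweet_id" = "document"))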
Topic distribution
tweets_detox_topics %>%
  mutate(across(top_topic, as.factor)) %>%
  ggplot(aes(top_topic)) +
  geom_bar() +
  theme_pubr()
Topic distribution over time
tweets_detox_topics %>%
  mutate(across(top_topic, as.factor)) %>%
  ggplot(aes(year, fill = top_topic)) +
  geom_bar() +
  theme_pubr()

tweets_detox_topics %>%
  mutate(across(top_topic, as.factor)) %>%
  ggplot(aes(year, fill = top_topic)) +
  geom_bar(position = "fill") +
  theme_pubr()
Focus on specific topics
tweets_detox_topics %>%
  filter(top_topic == 10) %>%
  arrange(-top_gamma) %>%
  slice_head(n = 10) %>%
  select(tweet_id, user_username, created_at, text, top_gamma) %>%
  gt()
| tweet_id | user_username | created_at | text | top_gamma |
|---|---|---|---|---|
| 1496707135794827266 | beckygrantstr | 2022-02-24 04:42:43 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/EcV7yKn4WX | 0.8192142 |
| 1496343978672898048 | beckygrantstr | 2022-02-23 04:39:39 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/o9e6WQIpm5 | 0.8192142 |
| 1495981434665947137 | beckygrantstr | 2022-02-22 04:39:02 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/rQgcPltrMt | 0.8192142 |
| 1499612427503247360 | beckygrantstr | 2022-03-04 05:07:18 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/DQRQlN1iyq | 0.8192142 |
| 1499249315381977089 | beckygrantstr | 2022-03-03 05:04:26 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/L7ly66J2fq | 0.8192142 |
| 1498886117394980865 | beckygrantstr | 2022-03-02 05:01:13 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/uJPIMYY1Id | 0.8192142 |
| 1498522796611321864 | beckygrantstr | 2022-03-01 04:57:30 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/z4Fv2bmeag | 0.8192142 |
| 1498159674008510465 | beckygrantstr | 2022-02-28 04:54:35 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/SdZFAJy6kZ | 0.8192142 |
| 1497796525929472003 | beckygrantstr | 2022-02-27 04:51:34 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/OKLMzwfQUA | 0.8192142 |
| 1497433419512524802 | beckygrantstr | 2022-02-26 04:48:42 | Do you have a blind spot when it comes to what your kids are doing online? #screentime #parenting #digitaldetox Parents Have a Blind Spot When it Comes to Kids and Screens \| by Becky Grant \| A Parent Is Born \| Feb, 2022 \| Medium - via @pensignal https://t.co/yTnqhTPV8h | 0.8192142 |
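The top documents are near-identical daily reposts from a single account. An illustrative variation keeps only each user's highest-gamma tweet before ranking, to surface more varied examples:

# One tweet per user (illustrative variation)
tweets_detox_topics %>%
  filter(top_topic == 10) %>%
  arrange(-top_gamma) %>%
  distinct(user_username, .keep_all = TRUE) %>%
  slice_head(n = 10) %>%
  select(tweet_id, user_username, created_at, text, top_gamma) %>%
  gt()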
Top Users
tweets_detox_topics %>%
  filter(top_topic == 10) %>%
  count(user_username, sort = TRUE) %>%
  mutate(prop = round(n / sum(n) * 100, 2)) %>%
  slice_head(n = 15)
# A tibble: 15 × 3
user_username n prop
<chr> <int> <dbl>
1 TimeToLogOff 2384 28.8
2 punkt 390 4.72
3 petitstvincent 175 2.12
4 tanyagoodin 135 1.63
5 ditox_unplug 111 1.34
6 OurHourOff 99 1.2
7 beckygrantstr 63 0.76
8 CreeEscape 56 0.68
9 ConsciDigital 51 0.62
10 phubboo 50 0.6
11 digitaldetoxing 47 0.57
12 smudgedlippy 40 0.48
13 detox_india 32 0.39
14 winiepuh 27 0.33
15 iamscentered 23 0.28
References
Barrie, Christopher, and Justin Ho. 2021. “academictwitteR: An R Package to Access the Twitter Academic Research Product Track V2 API Endpoint.” Journal of Open Source Software 6 (62): 3272. https://doi.org/10.21105/joss.03272.
Nassen, Lise-Marie, Heidi Vandebosch, Karolien Poels, and Kathrin Karsay. 2023. “Opt-Out, Abstain, Unplug. A Systematic Review of the Voluntary Digital Disconnection Literature.” Telematics and Informatics 81 (June): 101980. https://doi.org/10.1016/j.tele.2023.101980.