if (!require("pacman")) install.packages("pacman")
pacman::p_load(
here, qs, # file management
magrittr, janitor, # data wrangling
easystats, sjmisc, # data analysis
ggpubr, ggwordcloud, # visualization
gt, gtExtras, # fancy tables
tidytext, textdata, widyr, # tidy text processing
quanteda, # quanteda text processing
quanteda.textplots,
topicmodels, stm,
tidyverse # load last to avoid masking issues
)
Exercise 10: 🔨 Automatic analysis of text in R
Digital disconnection on Twitter
Background
Increasing trend towards more conscious use of digital media (devices), including (deliberate) non-use with the aim to restore or improve psychological well-being (among other factors)
But how do “we” talk about digital detox/disconnection: 💊 drug, 👹 demon or 🍩 donut?
Today’s data basis: Twitter dataset
- Collection of all tweets up to the beginning of 2023 that mention or discuss digital detox (and similar terms) on Twitter (not 𝕏)
- Initial query is searching for “digital detox”, “#digitaldetox”, “digital_detox”
- Access via official Academic-Twitter-API via academictwitteR (Barrie and Ho 2021) at the beginning of last year
Preparation
Import and process the data
Get twitter data
# Read the locally stored tweet object, take its `raw` element and
# standardise the column names
tweets <- janitor::clean_names(
  qs::qread(here("local_data/tweets-digital_detox.qs"))$raw
)
# Initial data processing: parse the timestamp, derive its components,
# flag retweets and #digitaldetox tweets, and keep one row per tweet
tweets_correct <- tweets |>
  mutate(
    # parse created_at into a date-time (dttm) column
    created_at = ymd_hms(created_at),
    # calendar and clock components of the parsed timestamp
    year = year(created_at),
    month = month(created_at),
    day = day(created_at),
    hour = hour(created_at),
    minute = minute(created_at),
    # helper flags used for filtering later on
    retweet_dy = str_detect(text, "^RT"), # text starts with "RT"
    detox_dy = str_detect(text, "#digitaldetox") # text contains the hashtag
  ) |>
  # drop repeated tweet ids while keeping all other columns
  distinct(tweet_id, .keep_all = TRUE)
# Keep only the tweets relevant for the analysis
tweets_detox <- tweets_correct |>
  filter(
    detox_dy, # tweet contains #digitaldetox
    !retweet_dy, # tweet is not a retweet
    lang == "en" # tweet is tagged as English
  )
DTM/DFM creation
# Regular expression matching the HTML-escaped forms of the ampersand,
# less-than and greater-than characters (entity name between "&" and ";").
# Matching the whole entity strips it completely from the tweet text; a
# pattern of only the bare "&", "<" and ">" characters would leave residue
# such as "amp;" behind, which would then be tokenised as a word.
remove_reg <- "&(amp|lt|gt);"
# Build the tidy one-token-per-row data set:
# strip the matches of remove_reg, split the text into tokens (the token
# column reuses the name `text`) and drop tokens listed as stop words
tweets_tidy <- tweets_detox |>
  mutate(text = str_remove_all(text, remove_reg)) |>
  tidytext::unnest_tokens(output = "text", input = text) |>
  filter(!(text %in% tidytext::stop_words$word))
# Count how often each token occurs within each tweet
tweets_summarized <- count(tweets_tidy, tweet_id, text)

# Cast the counts into a document-feature matrix
# (documents = tweet_id, features = text, values = n)
tweets_dfm <- cast_dfm(tweets_summarized, tweet_id, text, n)

# Print the resulting matrix
tweets_dfm
# Create corpus
# Build the quanteda corpus from the filtered tweets:
# remove the query hashtag from the text, drop the helper flags and
# use tweet_id as document id and text as document text
quanteda_corpus <- tweets_detox |>
  mutate(text = str_remove_all(text, "#digitaldetox")) |>
  select(-detox_dy, -retweet_dy) |>
  quanteda::corpus(docid_field = "tweet_id", text_field = "text")
# Tokenise the corpus, lower-case the tokens and drop English stop words
quanteda_token <- quanteda_corpus |>
  quanteda::tokens(
    # discard punctuation, symbols, numbers and URLs
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    # do not split tags, so hashtags and mentions stay intact
    split_tags = FALSE
  ) |>
  quanteda::tokens_tolower() |>
  quanteda::tokens_remove(pattern = stopwords("en"))
# Build the document-feature matrix (DFM) from the processed tokens
quanteda_dfm <- quanteda::dfm(quanteda_token)

# Print the resulting matrix
quanteda_dfm
Get topic model (data)
# Load the stored topic model results
stm_results <- here("local_data/stm_results.qs") |>
  qs::qread()

# Load the stored tweet data that already carries the topic information
tweets_detox_topics <- here("local_data/tweets-digital-detox-topics.qs") |>
  qs::qread()
📋 Exercises
Objective of this exercise
- Brief review of the contents of the last session
- Teaching the basic steps for creating and analyzing document feature matrices and stm topic models
Attention
- Before you start working on the exercise, please make sure to render all the chunks of the section Preparation. You can do this by using the “Run all chunks above” button of the next chunk.
- You can choose to solve some exercises using either the tidytext or quanteda functions. As the work steps differ, please select the tab with your preferred package.
- When in doubt, use the showcase (.qmd or .html) to look at the code chunks used to produce the output of the slides.
📋 Exercise 1: Hashtag co-occurrence
Objective(s)
Recreate the network plot from the session without the token #digitaldetox
- Create new dataset
tweets_dfm_cleaned- Based on the dataset
tweets_detox,- Use
`mutate()` and create the variables `text`, which removes the hashtag "#digitaldetox" using `str_remove_all()`, and `hashtags`, which extracts hashtags from the variable `text` with the help of `str_extract_all()` (and the pattern "#\\S+").
- Tokenize the data by using only `unnest()` on the variable `hashtags`.
- Summarize occurrences using `count(tweet_id, hashtags)`.
- Convert to DFM using `cast_dfm(tweet_id, hashtags, n)`.
- Use
- Save this transformation by creating a new dataset with the name
tweets_dfm_cleaned.
- Based on the dataset
- Create new dataset
top_hashtags_tidy- Based on the dataset
tweets_detox,- Repeat steps 1. & 2. from before.
- Summarize occurrences using
count(hashtags, sort = TRUE). - Extract top 50 hashtags by using
slice_head(n = 50). - Convert to string vector by using
pull().
- Save this transformation by creating a new dataset with the name
top_hashtags_tidy.
- Based on the dataset
- Visualize co-occurrence
- Based on the dataset
tweets_dfm_cleaned,- Transform data to feature co-occurrence matrix [FCM] using
quanteda::fcm() - Select relevant hashtags using
quanteda::fcm_select(pattern = top_hashtags_tidy, case_insensitive = FALSE). - Visualize using
quanteda.textplots::textplot_network()
- Transform data to feature co-occurrence matrix [FCM] using
- Based on the dataset
- Interpret and compare results
- Analyze patterns or connections among top hashtags.
- Discuss insights from the visualization, especially in comparison to the visualization of the slides/showcase.
- Create new dataset
quanteda_dfm_cleaned:- Based on the dataset
quanteda_dfm,- Use
quanteda::dfm_select(pattern = "#*")to create a DFM containing only hashtags. - Use
`quanteda::dfm_remove(pattern = "#digitaldetox")` to remove the hashtag "#digitaldetox".
- Use
- Save this transformation by creating a new dataset with the name
quanteda_dfm_cleaned.
- Based on the dataset
- Create new dataset
top_hashtags_quanteda- Based on the dataset
quanteda_dfm_cleaned,- Use
`topfeatures(50)` to extract the top 50 most common hashtags.
- Use `names()` to only store the names (not the values).
- Use
- Save this transformation by creating a new dataset with the name
top_hashtags_quanteda.
- Based on the dataset
- Visualize co-occurrence
- Based on the dataset
quanteda_dfm_cleaned,- Transform data to feature co-occurrence matrix [FCM] using
`quanteda::fcm()`.
- Select relevant hashtags using `quanteda::fcm_select(pattern = top_hashtags_quanteda, case_insensitive = FALSE)`.
- Visualize using `quanteda.textplots::textplot_network()`.
- Transform data to feature co-occurrence matrix [FCM] using
- Based on the dataset
- Interpret and compare results
- Analyze patterns or connections among top hashtags.
- Discuss insights from the visualization, especially in comparison to the visualization of the slides/showcase.
# Create new dataset tweets_dfm_cleaned | quanteda_dfm_cleaned
# Create new dataset top_hashtags_tidy | top_hashtags_quanteda
# Visualize co-occurrence
📋 Exercise 2: Understanding Topic 9
Objective(s)
Take a closer look at another topic from the topic model used in the session by examining the most representative tweets and the users who post the most tweets on that topic.
2.1: Explore top tweets
- Based on the dataset
tweets_detox_topics- Use
filter()to only analyze tweets that belong to topic 9. Use the variabletop_topicfor filtering. - Arrange the selected tweets in descending order based on
top_gammavalues usingarrange(-top_gamma). - Extract the top 10 tweets using
slice_head(n = 10). - Select only relevant columns (
tweet_id, user_username, created_at, text, top_gamma) usingselect(). - Create a tabular presentation of the selected tweets using
gt()from the gt package.
- Use
- Interpret and compare results (with a partner)
2.2 Explore top users
- Based on the dataset
tweets_detox_topics- Use
filter()to only analyze tweets that belong to topic 9. Use the variabletop_topicfor filtering. - Count the number of tweets per user using
count(user_username, sort = TRUE). - Calculate the proportion of tweets for each user by creating a new column
`prop` using `mutate(prop = round(n/sum(n)*100, 2))`.
- Extract the top 15 users with the highest engagement using `slice_head(n = 15)`.
- Create a tabular presentation of the selected users using `gt()` from the gt package.
- Use
- Interpret and compare results (with a partner)
📋 Exercise 3: Explore a different topic model
Objective(s)
In the session, three models were considered for closer examination. Choose one of the other models and recreate all the steps for the initial exploration of the topic model (as in the session).
3.1 Initial exploration
- Explore a different topic model (estimation)
- Based on the dataset
stm_results,- Use the
`filter(k == X)` to select a different topic model (estimation) of your choice (by defining `X`).
Tip: Use `stm_results$k` to see the available options for `X`.
- Use `pull(mdl) %>% .[[1]]` to select the topic model (estimation) with the specified number of topics.
- Use the
- Save this transformation by creating a new dataset with the name
tpm_new.
- Based on the dataset
- Visual exploration of
tpm_new- Visualize the summary of the selected topic model using
plot(type = "summary").
- Visualize the summary of the selected topic model using
# Explore a different topic model (estimation)
# NOTE: `filter(k == )` is left incomplete on purpose as part of the
# exercise. Insert the number of topics of the model you want to inspect
# (see `stm_results$k` for the available values) before running this chunk;
# as written, the code does not parse.
tpm_new <- stm_results |>
filter(k == ) |>
# take the `mdl` column of the remaining row(s) and extract its first element
pull(mdl) %>% .[[1]]
# Visual exploration of tpm_new
3.2 Document/Word-topic relations
- Calculate mean gamma values
- based on the dataset
tpm_new- Use
tidy(matrix = "gamma")ontpm_newto get the gamma values. - Calculate the mean gamma values for each topic using
group_by()andsummarise(). - Use
arrange()to sort topics (descending) bygamma
- Use
- Save this transformation by creating a new dataset with the name
top_gamma_new.
- based on the dataset
- Identify top terms of each topic
- based on the dataset
tpm_new- Use
tidy(matrix = "beta")ontpm_newto get the beta values. - Arrange terms within each topic in descending order based on beta values using
group_by(),arrange(-beta), andtop_n(10, wt = beta). - Select relevant columns (
topic, term) usingselect(). - Summarize the top 10 terms for each topic using
summarise(terms_beta = toString(term), .groups = "drop").
- Use
- Save this transformation by creating a new dataset with the name
top_beta_new.
- based on the dataset
- Combine top topics and top terms
- Join the dataframes
`top_gamma_new` and `top_beta_new` based on the “topic” column using `left_join()`.
- Within `mutate()`,
  - adjust the topic names with `topic = paste0("Topic ", topic)` and
  - reorder the dataset with `topic = reorder(topic, gamma)`.
- Save this transformation by creating a new dataset with the name `top_topics_terms_new`.
- Join the dataframes
- Preview the results
- Display a table preview of the top topics and terms with rounded gamma values using gt()
# Calculate mean gamma values
# Identify top terms of each topic
# Combine top topics and top terms
# Preview the results
References
Barrie, Christopher, and Justin Ho. 2021. “academictwitteR: An R Package to Access the Twitter Academic Research Product Track V2 API Endpoint.” Journal of Open Source Software 6 (62): 3272. https://doi.org/10.21105/joss.03272.
Nassen, Lise-Marie, Heidi Vandebosch, Karolien Poels, and Kathrin Karsay. 2023. “Opt-Out, Abstain, Unplug. A Systematic Review of the Voluntary Digital Disconnection Literature.” Telematics and Informatics 81 (June): 101980. https://doi.org/10.1016/j.tele.2023.101980.