--- title: "Psychological Methods: the Text Tutorial" description: " " author: "" opengraph: image: src: "http://r-text.org/articles/text_files/figure-html/unnamed-chunk-5-1.png" twitter: card: summary_large_image creator: "@oscarkjell" output: github_document #rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{psychological_methods} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) evaluate = FALSE ``` ```{r, eval = evaluate, warning=FALSE, message=FALSE, dpi=300} # For text-version => 0.9.99 # Install text from CRAN install.packages("text") library(text) # Set-up en environment with text-required python packages textrpp_install() # Initialize the environment – and save the settings for next time textrpp_initialize(save_profile = TRUE) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Example text texts <- c("I am feeling relatedness with others", "That's great!") # Defaults embeddings <- textEmbed(texts) # Output embeddings$tokens # Output embeddings$texts # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Look at example data included in the text- package comprising both text and numerical variables (note that there are only 40 participants in this example). Language_based_assessment_data_8 # Transform the text/word data to word embeddings (see help(textEmbed) to see the default settings). word_embeddings <- textEmbed( Language_based_assessment_data_8, model = "bert-base-uncased", aggregation_from_layers_to_tokens = "concatenate", aggregation_from_tokens_to_texts = "mean", keep_token_embeddings = FALSE ) # See how the word embeddings are structured word_embeddings # Save the word embeddings to avoid having to embed the text again. It is good practice to save output from analyses that take a lot of time to compute, which is often the case when analyzing text data. saveRDS(word_embeddings, "word_embeddings.rds") # Get the saved word embeddings (again) word_embeddings <- readRDS("word_embeddings.rds") # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Get hidden states for "I am fine" imf_embeddings_11_12 <- textEmbedRawLayers( "I am fine", layers = 11:12 ) imf_embeddings_11_12 #OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 1. Concatenate layers(results in 1,536 dimensions). textEmbedLayerAggregation( imf_embeddings_11_12$context_tokens, layers = 11:12, aggregation_from_layers_to_tokens = "concatenate", aggregation_from_tokens_to_texts = "mean" ) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 2. Aggregate layers using mean (results in 768). textEmbedLayerAggregation( imf_embeddings_11_12$context_tokens, layers = 11, aggregation_from_tokens_to_texts = "mean" ) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Examine the relationship between satisfactiontext and the corresponding rating scale model_satisfactiontext_swls <- textTrain( x = word_embeddings$texts$satisfactiontexts, # the predictor variables (i.e., the word embeddings) y = Language_based_assessment_data_8$swlstotal, # the criterion variable (i.e.,the rating scale score. 
model_description = "author(s): Kjell, Giorgi, & Schwartz; data: N=40, population = Online, Mechanical Turk; publication: title = Example for demo; description: swls = the satisfaction with life scale" ) # Examine the correlation between predicted and observed Harmony in life scale scores model_satisfactiontext_swls$results # OUTPUT: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Save the mode saveRDS( model_satisfactiontext_swls, "model_satisfactiontext_swls.rds" ) # Read the model model_satisfactiontext_swls <- readRDS( "model_satisfactiontext_swls.rds" ) # Examine the names in the object returned from training names(model_satisfactiontext_swls) #OUTPUT: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Predicting several outcomes from several word embeddings models_words_ratings <- textTrainLists( word_embeddings$texts[1:2], Language_based_assessment_data_8[5:6] ) # See results models_words_ratings$results # OUTPUT # Save model saveRDS(models_words_ratings, "models_words_ratings.rds") # Read model models_words_ratings <- readRDS( "models_words_ratings.rds" ) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Read a valence trained prediction model (download it from https://osf.io/dgczt/) valence_Warriner_L11 <- readRDS( "valence_Warriner_L11.rds" ) # Examine the model valence_Warriner_L11 # PART OF THE OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Apply the model to the satisfaction text satisfaction_text_valence <- textPredict( valence_Warriner_L11, word_embeddings$texts$satisfactiontexts, dim_names = FALSE ) # Examine the correlation between the predicted valence and the Satisfaction with life scale score psych::corr.test( satisfaction_text_valence$word_embeddings__ypred, Language_based_assessment_data_8$swlstotal ) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Compute semantic similarity scores between two text columns, using the previously created word_embeddings. semantic_similarity_scores <- textSimilarity( word_embeddings$texts$harmonytexts, word_embeddings$texts$satisfactiontexts ) # Look at the first scores head(semantic_similarity_scores) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Read word norms text (later we will use these for the semantic centrality plot) word_norms <- read.csv( "Word_Norms_Mental_Health_Kjell2018_text.csv" ) # Read the word embeddings for the word norms word_norms_embeddings <- readRDS( "Word_Norms_Mental_Health_Kjell2018_text_embedding_L11.rds" ) # Examine which word norms there are. names(word_norms_embeddings$texts) # OUTPUT # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Compute semantic similarity score between the harmony answers and the harmony norm # Note that the descriptive word answers are used instead of text answers to correspond with how the word norm was created. 
# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Read the word norms text (later we will use these for the semantic centrality plot)
word_norms <- read.csv(
  "Word_Norms_Mental_Health_Kjell2018_text.csv"
)

# Read the word embeddings for the word norms
word_norms_embeddings <- readRDS(
  "Word_Norms_Mental_Health_Kjell2018_text_embedding_L11.rds"
)

# Examine which word norms are available
names(word_norms_embeddings$texts)
# OUTPUT

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Compute semantic similarity scores between the harmony answers and the harmony norm.
# Note that the descriptive word answers are used instead of the text answers to correspond
# with how the word norm was created.
norm_similarity_scores_harmony <- textSimilarityNorm(
  word_embeddings$texts$harmonywords,
  word_norms_embeddings$texts$harmonynorm
)

# Correlate the semantic measure with the corresponding rating scale
psych::corr.test(
  norm_similarity_scores_harmony,
  Language_based_assessment_data_8$hilstotal
)
# OUTPUT

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Extract word type embeddings and text embeddings for the harmony words
harmony_words_embeddings <- textEmbed(
  texts = Language_based_assessment_data_8["harmonywords"],
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean",
  aggregation_from_tokens_to_word_types = "mean",
  keep_token_embeddings = FALSE
)

# Pre-process the data for plotting
projection_results <- textProjection(
  words = Language_based_assessment_data_8$harmonywords,
  word_embeddings = harmony_words_embeddings$texts,
  word_types_embeddings = harmony_words_embeddings$word_types,
  x = Language_based_assessment_data_8$hilstotal,
  y = Language_based_assessment_data_8$age
)
projection_results$word_data

# To avoid warnings about words not being plotted, first increase max.overlaps for the entire session:
options(ggrepel.max.overlaps = 1000)

# Plot
plot_projection <- textPlot(
  projection_results,
  min_freq_words_plot = 1,
  plot_n_word_extreme = 10,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5,
  y_axes = FALSE,
  p_alpha = 0.05,
  p_adjust_method = "fdr",
  title_top = "Harmony Words Responses (Supervised Dimension Projection)",
  x_axes_label = "Low to High Harmony in Life Scale Score",
  y_axes_label = "",
  bivariate_color_codes = c(
    "#FFFFFF", "#FFFFFF", "#FFFFFF",
    "#E07f6a", "#EAEAEA", "#85DB8E",
    "#FFFFFF", "#FFFFFF", "#FFFFFF"
  )
)

# View the plot
plot_projection$final_plot
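# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# An added sketch (not part of the original tutorial): assuming final_plot is a ggplot object,
# it can be written to file with ggplot2::ggsave(); the file name and dimensions below are
# arbitrary examples.
ggplot2::ggsave(
  "plot_projection.png",
  plot = plot_projection$final_plot,
  width = 10,
  height = 8,
  dpi = 300
)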
# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Plot
plot_projection_2D <- textPlot(
  projection_results,
  min_freq_words_plot = 1,
  plot_n_word_extreme = 10,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5,
  y_axes = TRUE, # Change to TRUE/FALSE
  p_alpha = 0.05,
  p_adjust_method = "fdr",
  title_top = "Harmony Words Responses (Supervised Dimension Projection)",
  x_axes_label = "Low vs. High Harmony in Life Scale Score",
  y_axes_label = "Low vs. High Age",
  bivariate_color_codes = c(
    "#E07f6b", "#60A1F7", "#85DB8D",
    "#FF0000", "#EAEAEA", "#5dc688",
    "#E07f6a", "#60A1F7", "#85DB8E"
  )
)

# View the plot
plot_projection_2D$final_plot

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Compute each word's centrality (semantic similarity) score to the aggregated embedding of all words
centrality_results <- textCentrality(
  words = word_norms$satisfactionnorm,
  word_embeddings = word_norms_embeddings$texts$satisfactionnorm,
  word_types_embeddings = word_norms_embeddings$word_types
)

options(ggrepel.max.overlaps = 1000)

centrality_plot <- textCentralityPlot(
  word_data = centrality_results,
  min_freq_words_test = 2,
  plot_n_word_extreme = 10,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5,
  title_top = "Satisfaction with life word norm: Semantic Centrality Plot",
  x_axes_label = "Satisfaction with Life Semantic Centrality"
)
centrality_plot$final_plot
# OUTPUT

# # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Supplementary

# PCA results to be plotted
help(textPCA)
# Note: harmonywords is used here so that the words match the word type embeddings computed above.
textPCA_results <- textPCA(
  words = Language_based_assessment_data_8$harmonywords,
  word_types_embeddings = harmony_words_embeddings$word_types
)

# Plot the PCA results
plot_PCA <- textPCAPlot(
  word_data = textPCA_results,
  min_freq_words_test = 2,
  plot_n_word_extreme = 5,
  plot_n_word_frequency = 5,
  plot_n_words_middle = 5
)
plot_PCA$final_plot
```
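As a final, optional sketch (an addition to this tutorial rather than part of it), a trained model can also be applied to new responses: embed the new text with the same settings used for the training data and pass the embedding to `textPredict()`. The example response below and the use of `dim_names = FALSE` are illustrative assumptions.

```{r, eval = evaluate, warning=FALSE, message=FALSE}
# Embed a new (hypothetical) response with the same settings as the training data
new_response_embedding <- textEmbed(
  "I am very satisfied with most parts of my life",
  model = "bert-base-uncased",
  aggregation_from_layers_to_tokens = "concatenate",
  aggregation_from_tokens_to_texts = "mean",
  keep_token_embeddings = FALSE
)

# Predict a Satisfaction with Life score for the new response;
# dim_names = FALSE is used here (as with the valence model above) because the new
# embedding's dimension names differ from those the model was trained on.
textPredict(
  model_satisfactiontext_swls,
  new_response_embedding$texts[[1]],
  dim_names = FALSE
)
```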