## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
evaluate = FALSE

## ----eval = evaluate, warning=FALSE, message=FALSE, dpi=300-------------------
# # For text version 0.9.99 or higher
# # Install text from CRAN
# install.packages("text")
# library(text)

# # Set up an environment with the python packages required by text
# textrpp_install()

# # Initialize the environment and save the settings for next time
# textrpp_initialize(save_profile = TRUE)


# # Example text
# texts <- c("I am feeling relatedness with others", "That's great!")

# # Defaults
# embeddings <- textEmbed(texts)

# # Output
# embeddings$tokens

# # Output
# embeddings$texts


# # Look at the example data included in the text package, comprising both text and
# # numerical variables (note that there are only 40 participants in this example).
# Language_based_assessment_data_8

# # Transform the text/word data to word embeddings (see help(textEmbed) for the default settings).
# word_embeddings <- textEmbed(
#   Language_based_assessment_data_8,
#   model = "bert-base-uncased",
#   aggregation_from_layers_to_tokens = "concatenate",
#   aggregation_from_tokens_to_texts = "mean",
#   keep_token_embeddings = FALSE
# )

# # See how the word embeddings are structured
# word_embeddings

# # Save the word embeddings to avoid having to embed the text again. It is good practice
# # to save output from analyses that take a lot of time to compute, which is often the
# # case when analyzing text data.
# saveRDS(word_embeddings, "word_embeddings.rds")

# # Get the saved word embeddings (again)
# word_embeddings <- readRDS("word_embeddings.rds")


# # Get hidden states for "I am fine"
# imf_embeddings_11_12 <- textEmbedRawLayers(
#   "I am fine",
#   layers = 11:12
# )
# imf_embeddings_11_12
# # OUTPUT


# # 1. Concatenate layers (results in 1,536 dimensions).
# textEmbedLayerAggregation(
#   imf_embeddings_11_12$context_tokens,
#   layers = 11:12,
#   aggregation_from_layers_to_tokens = "concatenate",
#   aggregation_from_tokens_to_texts = "mean"
# )
# # OUTPUT


# # 2. Aggregate layer 11 using the mean (results in 768 dimensions).
# textEmbedLayerAggregation(
#   imf_embeddings_11_12$context_tokens,
#   layers = 11,
#   aggregation_from_tokens_to_texts = "mean"
# )
# # OUTPUT
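
# # A minimal sketch of a sanity check on the embedding dimensionality. It assumes that
# # each element of word_embeddings$texts is a tibble with one column per embedding
# # dimension, so ncol() reflects the dimensionality produced by the aggregation settings
# # (e.g., concatenating two 768-dimensional layers should give 1,536 columns).
# sapply(word_embeddings$texts, ncol)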

# # Examine the relationship between the satisfaction text and the corresponding rating scale
# model_satisfactiontext_swls <- textTrain(
#   x = word_embeddings$texts$satisfactiontexts, # the predictor variables (i.e., the word embeddings)
#   y = Language_based_assessment_data_8$swlstotal, # the criterion variable (i.e., the rating scale score)
#   model_description = "author(s): Kjell, Giorgi, & Schwartz; data: N=40, population = Online, Mechanical Turk; publication: title = Example for demo; description: swls = the satisfaction with life scale"
# )

# # Examine the correlation between predicted and observed Satisfaction with Life Scale scores
# model_satisfactiontext_swls$results
# # OUTPUT


# # Save the model
# saveRDS(
#   model_satisfactiontext_swls,
#   "model_satisfactiontext_swls.rds"
# )

# # Read the model
# model_satisfactiontext_swls <- readRDS(
#   "model_satisfactiontext_swls.rds"
# )

# # Examine the names in the object returned from training
# names(model_satisfactiontext_swls)
# # OUTPUT


# # Predict several outcomes from several word embeddings
# models_words_ratings <- textTrainLists(
#   word_embeddings$texts[1:2],
#   Language_based_assessment_data_8[5:6]
# )

# # See results
# models_words_ratings$results
# # OUTPUT

# # Save the models
# saveRDS(models_words_ratings, "models_words_ratings.rds")

# # Read the models
# models_words_ratings <- readRDS(
#   "models_words_ratings.rds"
# )


# # Read a valence-trained prediction model (download it from https://osf.io/dgczt/)
# valence_Warriner_L11 <- readRDS(
#   "valence_Warriner_L11.rds"
# )

# # Examine the model
# valence_Warriner_L11
# # PART OF THE OUTPUT


# # Apply the model to the satisfaction text
# satisfaction_text_valence <- textPredict(
#   valence_Warriner_L11,
#   word_embeddings$texts$satisfactiontexts,
#   dim_names = FALSE
# )

# # Examine the correlation between the predicted valence and the Satisfaction with Life Scale score
# psych::corr.test(
#   satisfaction_text_valence$word_embeddings__ypred,
#   Language_based_assessment_data_8$swlstotal
# )
# # OUTPUT


# # Compute semantic similarity scores between two text columns, using the previously created word_embeddings.
# semantic_similarity_scores <- textSimilarity(
#   word_embeddings$texts$harmonytexts,
#   word_embeddings$texts$satisfactiontexts
# )

# # Look at the first scores
# head(semantic_similarity_scores)
# # OUTPUT


# # Read the word norms text (later we will use these for the semantic centrality plot)
# word_norms <- read.csv(
#   "Word_Norms_Mental_Health_Kjell2018_text.csv"
# )

# # Read the word embeddings for the word norms
# word_norms_embeddings <- readRDS(
#   "Word_Norms_Mental_Health_Kjell2018_text_embedding_L11.rds"
# )

# # Examine which word norms there are.
# names(word_norms_embeddings$texts)
# # OUTPUT
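
# # A minimal sketch for inspecting the distribution of the semantic similarity scores
# # computed with textSimilarity() above; it assumes the scores form a numeric vector
# # (one score per participant), so base R summaries apply directly.
# summary(semantic_similarity_scores)
# hist(
#   semantic_similarity_scores,
#   main = "Harmony vs. satisfaction text similarity",
#   xlab = "Semantic similarity"
# )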

# # Compute the semantic similarity score between the harmony answers and the harmony norm.
# # Note that the descriptive word answers are used instead of the text answers to
# # correspond with how the word norm was created.
# norm_similarity_scores_harmony <- textSimilarityNorm(
#   word_embeddings$texts$harmonywords,
#   word_norms_embeddings$texts$harmonynorm
# )

# # Correlate the semantic measure with the corresponding rating scale
# psych::corr.test(
#   norm_similarity_scores_harmony,
#   Language_based_assessment_data_8$hilstotal
# )
# # OUTPUT


# # Extract word type embeddings and text embeddings for the harmony words
# harmony_words_embeddings <- textEmbed(
#   texts = Language_based_assessment_data_8["harmonywords"],
#   aggregation_from_layers_to_tokens = "concatenate",
#   aggregation_from_tokens_to_texts = "mean",
#   aggregation_from_tokens_to_word_types = "mean",
#   keep_token_embeddings = FALSE
# )

# # Pre-process the data for plotting
# projection_results <- textProjection(
#   words = Language_based_assessment_data_8$harmonywords,
#   word_embeddings = harmony_words_embeddings$texts,
#   word_types_embeddings = harmony_words_embeddings$word_types,
#   x = Language_based_assessment_data_8$hilstotal,
#   y = Language_based_assessment_data_8$age
# )
# projection_results$word_data

# # To avoid warnings, and to keep words from being dropped from the plot, first increase
# # max.overlaps for the entire session:
# options(ggrepel.max.overlaps = 1000)

# # Plot
# plot_projection <- textPlot(
#   projection_results,
#   min_freq_words_plot = 1,
#   plot_n_word_extreme = 10,
#   plot_n_word_frequency = 5,
#   plot_n_words_middle = 5,
#   y_axes = FALSE,
#   p_alpha = 0.05,
#   p_adjust_method = "fdr",
#   title_top = "Harmony Words Responses (Supervised Dimension Projection)",
#   x_axes_label = "Low to High Harmony in Life Scale Score",
#   y_axes_label = "",
#   bivariate_color_codes = c("#FFFFFF", "#FFFFFF", "#FFFFFF",
#                             "#E07f6a", "#EAEAEA", "#85DB8E",
#                             "#FFFFFF", "#FFFFFF", "#FFFFFF"
#   )
# )
# # View plot
# # plot_projection$final_plot
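
# # A minimal sketch for saving the projection plot to file; it assumes that
# # plot_projection$final_plot is a ggplot object, so ggplot2::ggsave() can write it
# # directly (the file name and size are illustrative only).
# ggplot2::ggsave(
#   "plot_projection_harmony.png",
#   plot = plot_projection$final_plot,
#   width = 20, height = 20, units = "cm", dpi = 300
# )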

# # Plot (two-dimensional projection)
# plot_projection_2D <- textPlot(
#   projection_results,
#   min_freq_words_plot = 1,
#   plot_n_word_extreme = 10,
#   plot_n_word_frequency = 5,
#   plot_n_words_middle = 5,
#   y_axes = TRUE, # Change to TRUE/FALSE
#   p_alpha = 0.05,
#   p_adjust_method = "fdr",
#   title_top = "Harmony Words Responses (Supervised Dimension Projection)",
#   x_axes_label = "Low vs. High Harmony in Life Scale Score",
#   y_axes_label = "Low vs. High Age",
#   bivariate_color_codes = c("#E07f6b", "#60A1F7", "#85DB8D",
#                             "#FF0000", "#EAEAEA", "#5dc688",
#                             "#E07f6a", "#60A1F7", "#85DB8E"
#   )
# )
# # View plot
# plot_projection_2D$final_plot


# # Compute words' centrality (semantic similarity) scores relative to the aggregated
# # embedding of all words
# centrality_results <- textCentrality(
#   words = word_norms$satisfactionnorm,
#   word_embeddings = word_norms_embeddings$texts$satisfactionnorm,
#   word_types_embeddings = word_norms_embeddings$word_types
# )

# options(ggrepel.max.overlaps = 1000)
# centrality_plot <- textCentralityPlot(
#   word_data = centrality_results,
#   min_freq_words_test = 2,
#   plot_n_word_extreme = 10,
#   plot_n_word_frequency = 5,
#   plot_n_words_middle = 5,
#   title_top = "Satisfaction with life word norm: Semantic Centrality Plot",
#   x_axes_label = "Satisfaction with Life Semantic Centrality"
# )
# centrality_plot$final_plot
# # OUTPUT


# # Supplementary

# # PCA results to be plotted; see help(textPCA)
# textPCA_results <- textPCA(
#   words = Language_based_assessment_data_8$satisfactionwords,
#   word_types_embeddings = harmony_words_embeddings$word_types
# )

# # Plot the PCA results
# plot_PCA <- textPCAPlot(
#   word_data = textPCA_results,
#   min_freq_words_test = 2,
#   plot_n_word_extreme = 5,
#   plot_n_word_frequency = 5,
#   plot_n_words_middle = 5
# )
# plot_PCA$final_plot
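
# # As with the trained models above, results that take time to compute can be saved and
# # re-loaded rather than recomputed (a minimal sketch; the file name is illustrative only).
# saveRDS(textPCA_results, "textPCA_results.rds")
# textPCA_results <- readRDS("textPCA_results.rds")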