--- title: "Utility Functions for Further Exploration and Visualization" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Utility Functions for Further Exploration and Visualization} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(conversim) ``` ## Introduction This vignette demonstrates the usage of utility functions for speech similarity analysis. These functions are designed to complement the main similarity calculation functions and provide additional tools for visualization and analysis. ```{r echo=FALSE} library(ggplot2) combine_sims <- function(similarities, weights = NULL) { if (is.null(weights)) { weights <- rep(1, length(similarities)) } else { weights <- unlist(weights) } weighted_sum <- sum(unlist(similarities) * weights) total_weight <- sum(weights) return(weighted_sum / total_weight) } plot_sims <- function(similarities) { df <- data.frame( measure = names(similarities), score = unlist(similarities) ) ggplot2::ggplot(df, ggplot2::aes(x = .data$measure, y = .data$score)) + ggplot2::geom_bar(stat = "identity", fill = "steelblue") + ggplot2::coord_flip() + ggplot2::labs(title = "Similarity Scores", x = "Measure", y = "Score") + ggplot2::theme_minimal() + ggplot2::scale_y_continuous(limits = c(0, 1)) } compare_style <- function(stylistic_result) { features <- names(stylistic_result$text1_features) speech1_values <- unlist(stylistic_result$text1_features) speech2_values <- unlist(stylistic_result$text2_features) df <- data.frame( feature = rep(features, 2), value = c(speech1_values, speech2_values), speech = rep(c("speech1", "speech2"), each = length(features)) ) ggplot2::ggplot(df, ggplot2::aes(x = .data$feature, y = .data$value, fill = .data$speech)) + ggplot2::geom_bar(stat = "identity", position = "dodge") + ggplot2::labs(title = "Comparison of Stylistic Features", x = "Feature", y = "Value") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) } gen_sim_report <- function(speech1, speech2, topic_method = "lda", semantic_method = "tfidf", glove_path = NULL) { topic_sim <- topic_similarity(speech1, speech2, method = topic_method) lexical_sim <- lexical_similarity(speech1, speech2) semantic_sim <- semantic_similarity(speech1, speech2, method = semantic_method, model_path = glove_path) structural_sim <- structural_similarity(speech1, speech2) stylistic_result <- stylistic_similarity(speech1, speech2) sentiment_sim <- sentiment_similarity(speech1, speech2) similarities <- list( topic = topic_sim, lexical = lexical_sim, semantic = semantic_sim, structural = structural_sim, stylistic = stylistic_result$overall_similarity, sentiment = sentiment_sim ) combined_sim <- combine_sims(similarities) report <- list( similarities = similarities, combined_similarity = combined_sim, similarity_plot = plot_sims(similarities), stylistic_plot = compare_style(stylistic_result) ) return(report) } print_sim_report <- function(report) { cat("Similarity Report\n") cat("=================\n\n") cat("Individual Similarity Scores:\n") for (measure in names(report$similarities)) { cat(sprintf(" %s: %.4f\n", measure, report$similarities[[measure]])) } cat("\nCombined Similarity Score: %.4f\n", report$combined_similarity) cat("\nPlots have been generated for overall similarities and stylistic features.\n") cat("Use 'report$similarity_plot' and 'report$stylistic_plot' to view them.\n") } agg_seq <- function(sequence, 
num_segments) { segment_size <- ceiling(length(sequence) / num_segments) aggregated <- numeric(num_segments) for (i in 1:num_segments) { start_idx <- (i - 1) * segment_size + 1 end_idx <- min(i * segment_size, length(sequence)) aggregated[i] <- mean(sequence[start_idx:end_idx], na.rm = TRUE) } return(aggregated) } combine_sim_seq <- function(similarities, weights = NULL) { if (length(similarities) == 0) { stop("At least one similarity measure is required") } if (is.null(weights)) { weights <- rep(1 / length(similarities), length(similarities)) } sequence_lengths <- sapply(similarities, function(x) length(x$sequence)) if (length(unique(sequence_lengths)) > 1) { warning("Sequence lengths do not match. Results may be unexpected.") } max_length <- max(sequence_lengths) combined_sequence <- numeric(max_length) for (i in seq_along(similarities)) { seq <- c(similarities[[i]]$sequence, rep(NA, max_length - length(similarities[[i]]$sequence))) combined_sequence <- combined_sequence + seq * weights[i] } combined_average <- sum(sapply(seq_along(similarities), function(i) { similarities[[i]]$average * weights[i] })) return(list(sequence = combined_sequence, average = combined_average)) } norm_sim <- function(similarities) { min_sim <- min(similarities, na.rm = TRUE) max_sim <- max(similarities, na.rm = TRUE) if (min_sim == max_sim) { return(rep(0, length(similarities))) } return((similarities - min_sim) / (max_sim - min_sim)) } cor_sim_seq <- function(similarities, method = "pearson") { sequences <- lapply(similarities, function(x) x$sequence) cor_matrix <- cor(do.call(cbind, sequences), use = "pairwise.complete.obs", method = method) return(cor_matrix) } plot_cor_heatmap <- function(cor_matrix, titles) { # Create a data frame from the correlation matrix df <- expand.grid(Measure1 = titles, Measure2 = titles) df$Correlation <- as.vector(cor_matrix) ggplot2::ggplot(df, ggplot2::aes(x = .data$Measure1, y = .data$Measure2, fill = .data$Correlation)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) + ggplot2::labs(x = "", y = "", fill = "Correlation") + ggplot2::theme_minimal() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) } plot_sim_time <- function(similarities, title, y_label) { df <- data.frame( dyad = rep(names(similarities), sapply(similarities, length)), time = unlist(lapply(similarities, seq_along)), similarity = unlist(similarities) ) ggplot2::ggplot(df, ggplot2::aes(x = .data$time, y = .data$similarity, color = .data$dyad)) + ggplot2::geom_line() + ggplot2::geom_point() + ggplot2::labs(title = title, x = "Time", y = y_label) + ggplot2::theme_minimal() } calc_sum_stats <- function(similarities) { if (length(similarities) == 0) { stop("No data to calculate summary statistics") } na_present <- any(sapply(similarities, function(x) any(is.na(x)))) if (na_present) { warning("NAs present in the data") } summary_stats <- lapply(similarities, function(x) { c(mean = mean(x, na.rm = TRUE), sd = sd(x, na.rm = TRUE), min = min(x, na.rm = TRUE), max = max(x, na.rm = TRUE)) }) do.call(rbind, summary_stats) } plot_sum_stats <- function(summary_stats, title) { df <- as.data.frame(summary_stats) df$dyad <- rownames(df) df_long <- data.frame( dyad = rep(df$dyad, each = 4), statistic = rep(c("mean", "sd", "min", "max"), nrow(df)), value = c(t(as.matrix(df[, c("mean", "sd", "min", "max")]))) ) ggplot2::ggplot(df_long, ggplot2::aes(x = .data$dyad, y = .data$value, fill = .data$statistic)) + ggplot2::geom_bar(stat = 
"identity", position = "dodge") + ggplot2::labs(title = title, x = "Dyad", y = "Value") + ggplot2::theme_minimal() + ggplot2::scale_fill_brewer(palette = "Set2") } compare_sim_meas <- function(similarity_list, measure_names) { if (length(similarity_list) != length(measure_names)) { stop("The number of similarity lists must match the number of measure names.") } result <- data.frame(dyad = rep(names(similarity_list[[1]]), sapply(similarity_list[[1]], length))) for (i in seq_along(similarity_list)) { measure <- measure_names[i] similarities <- unlist(similarity_list[[i]]) result[[measure]] <- similarities } result } plot_sim_comp <- function(comparison_df, title) { measures <- setdiff(names(comparison_df), "dyad") df_long <- data.frame( dyad = rep(comparison_df$dyad, length(measures)), measure = rep(measures, each = nrow(comparison_df)), similarity = unlist(comparison_df[, measures]) ) ggplot2::ggplot(df_long, ggplot2::aes(x = .data$dyad, y = .data$similarity, fill = .data$measure)) + ggplot2::geom_boxplot() + ggplot2::labs(title = title, x = "Dyad", y = "Similarity") + ggplot2::theme_minimal() + ggplot2::scale_fill_brewer(palette = "Set3") } calc_sim_cor <- function(comparison_df) { cor(comparison_df[, -1], use = "pairwise.complete.obs") } plot_sim_cor_heatmap <- function(cor_matrix, title) { cor_df <- as.data.frame(as.table(cor_matrix)) names(cor_df) <- c("Var1", "Var2", "Correlation") ggplot2::ggplot(cor_df, ggplot2::aes(x = .data$Var1, y = .data$Var2, fill = .data$Correlation)) + ggplot2::geom_tile() + ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1)) + ggplot2::theme_minimal() + ggplot2::labs(title = title) + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) } ``` ## Combining Similarity Measures The `combine_sims` function allows you to combine multiple similarity measures into a single score: ```{r} sims <- list(topic = 0.8, lexical = 0.6, semantic = 0.7, structural = 0.9) combined_score <- combine_sims(sims) print(combined_score) # With custom weights weighted_score <- combine_sims(sims, weights = list(topic = 2, lexical = 1, semantic = 1.5, structural = 1)) print(weighted_score) ``` ## Visualizing Similarity Scores The `plot_sims` function creates a bar plot of similarity scores: ```{r} sims <- list(topic = 0.8, lexical = 0.6, semantic = 0.7, structural = 0.9) plot_sims(sims) ``` ## Comparing Stylistic Features The `compare_style` function visualizes the comparison of stylistic features between two speeches: ```{r} # Simulating the result of stylistic_similarity function stylistic_result <- list( text1_features = list(sentence_length = 15, word_length = 5, unique_words = 100), text2_features = list(sentence_length = 12, word_length = 4, unique_words = 80), overall_similarity = 0.85 ) compare_style(stylistic_result) ``` ## Generating a Comprehensive Similarity Report The `gen_sim_report` function generates a comprehensive report of all similarity measures: ```{r} speech1 <- "This is the first speech. It talks about important topics." speech2 <- "This is the second speech. It covers similar subjects." # Note: This function call might not work as-is because it depends on other functions # that are not defined in the utility files. For demonstration purposes, we'll create # a mock report. 
mock_report <- list(
  similarities = list(
    topic = 0.8,
    lexical = 0.6,
    semantic = 0.7,
    structural = 0.9,
    stylistic = 0.85,
    sentiment = 0.75
  ),
  combined_similarity = 0.75,
  similarity_plot = plot_sims(list(topic = 0.8, lexical = 0.6,
                                   semantic = 0.7, structural = 0.9)),
  stylistic_plot = compare_style(stylistic_result)
)

# Print the mock report
print_sim_report(mock_report)
```

## Working with Conversation Sequences

The `conversation_sequence_utilities.R` file provides functions for analyzing similarity sequences in conversations:

```{r}
# Combine similarity measures for a single dyad
sim1 <- list(sequence = c(0.8, 0.7, 0.9), average = 0.8)
sim2 <- list(sequence = c(0.6, 0.8, 0.7), average = 0.7)
combined <- combine_sim_seq(list(sim1, sim2))
print(combined)

# Normalize similarity scores
scores <- c(0.2, 0.5, 0.8, 1.0, 0.3)
normalized <- norm_sim(scores)
print(normalized)

# Aggregate a similarity sequence into three segments
sim_seq <- c(0.5, 0.6, 0.7, 0.6, 0.8, 0.7, 0.9, 0.8, 0.7, 0.8)
aggregated <- agg_seq(sim_seq, 3)
print(aggregated)

# Calculate correlation between similarity measures
cor_matrix <- cor_sim_seq(list(sim1, sim2))
print(cor_matrix)
```

## Analyzing Multiple Dyads

The `conversation_multidyads_utilities.R` file provides functions for analyzing similarities across multiple dyads:

```{r}
# Create mock data for multiple dyads
similarities <- list(
  "1" = c(0.5, 0.6, 0.7),
  "2" = c(0.4, 0.5, 0.6)
)

# Plot similarity over time for multiple dyads
plot_sim_time(similarities, "Topic Similarity", "Similarity Score")

# Calculate summary statistics
stats <- calc_sum_stats(similarities)
print(stats)

# Plot summary statistics
plot_sum_stats(stats, "Summary Statistics of Similarities")

# Compare multiple similarity measures
topic_similarities <- list("1" = c(0.5, 0.6, 0.7), "2" = c(0.4, 0.5, 0.6))
lexical_similarities <- list("1" = c(0.6, 0.7, 0.8), "2" = c(0.5, 0.6, 0.7))

comparison_df <- compare_sim_meas(
  list(topic_similarities, lexical_similarities),
  c("Topic", "Lexical")
)
print(head(comparison_df))

# Plot comparison of multiple similarity measures
plot_sim_comp(comparison_df, "Comparison of Similarity Measures")
```

## Conclusion

This vignette has demonstrated the use of various utility functions for speech similarity analysis. These functions provide tools for combining, visualizing, and analyzing similarity measures across different aspects of speech and conversation dynamics.
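As a closing illustration, the correlation helpers defined in the hidden setup chunk (`plot_cor_heatmap`, `calc_sim_cor`, and `plot_sim_cor_heatmap`) can also be applied to the objects created above. The sketch below reuses `cor_matrix` from the conversation-sequence example and `comparison_df` from the multi-dyad example; the measure labels passed to `plot_cor_heatmap` are arbitrary placeholders for the two unnamed sequences.

```{r}
# Heatmap of the correlation matrix computed earlier with cor_sim_seq()
plot_cor_heatmap(cor_matrix, titles = c("Measure 1", "Measure 2"))

# Correlation between the Topic and Lexical measures across all dyads
# (with this mock data the two measures are perfectly correlated)
sim_cor <- calc_sim_cor(comparison_df)
print(sim_cor)

# Heatmap of that correlation matrix
plot_sim_cor_heatmap(sim_cor, "Correlation between Similarity Measures")
```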