## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE) library(tm) library(topicmodels) library(lsa) library(word2vec) library(sentimentr) ## ----load-data---------------------------------------------------------------- data_path <- system.file("extdata", "speeches_data.Rdata", package = "conversim") load(data_path) # Print a summary of the speeches data print(summary(speeches_data)) ## ----include=FALSE------------------------------------------------------------ preprocess_text <- function(text) { text <- tolower(text) text <- gsub("[[:punct:]]", "", text) text <- gsub("[[:digit:]]", "", text) text <- gsub("\\s+", " ", trimws(text)) return(text) } ## ----------------------------------------------------------------------------- # Example usage with our data original_text <- substr(speeches_data$text[1], 1, 200) # First 200 characters of speech A preprocessed_text <- preprocess_text(original_text) print(paste("Original:", original_text)) print(paste("Preprocessed:", preprocessed_text)) ## ----include=FALSE------------------------------------------------------------ topic_similarity <- function(conv1, conv2, method = "lda", num_topics = 2) { corpus <- c(conv1, conv2) dtm <- DocumentTermMatrix(Corpus(VectorSource(corpus))) dtm_matrix <- as.matrix(dtm) if (method == "lda") { lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234)) topic_dist <- posterior(lda_model)$topics js_divergence <- function(p, q) { m <- 0.5 * (p + q) 0.5 * sum(p * log(p / m)) + 0.5 * sum(q * log(q / m)) } similarity <- 1 - sqrt(js_divergence(topic_dist[1,], topic_dist[2,])) } else if (method == "lsa") { if (nrow(dtm_matrix) < num_topics) { num_topics <- nrow(dtm_matrix) } lsa_space <- lsa(dtm_matrix, dims = num_topics) doc_lsa <- lsa_space$dk similarity <- cosine(doc_lsa[1,], doc_lsa[2,]) similarity <- (similarity + 1) / 2 } else { stop("Invalid method. Choose 'lda' or 'lsa'.") } return(similarity) } ## ----------------------------------------------------------------------------- # Example usage with our speeches data lda_similarity <- topic_similarity(speeches_data$text[1], speeches_data$text[2], method = "lda", num_topics = 5) lsa_similarity <- topic_similarity(speeches_data$text[1], speeches_data$text[2], method = "lsa", num_topics = 5) print(paste("LDA Similarity:", lda_similarity)) print(paste("LSA Similarity:", lsa_similarity)) ## ----include=FALSE------------------------------------------------------------ lexical_similarity <- function(conv1, conv2) { words1 <- unique(unlist(strsplit(conv1, " "))) words2 <- unique(unlist(strsplit(conv2, " "))) intersection <- length(intersect(words1, words2)) union <- length(union(words1, words2)) return(intersection / union) } ## ----------------------------------------------------------------------------- # Example usage with our speeches data lex_similarity <- lexical_similarity(speeches_data$text[1], speeches_data$text[2]) print(paste("Lexical Similarity:", lex_similarity)) ## ----include=FALSE------------------------------------------------------------ semantic_similarity <- function(conversation1, conversation2, method = "tfidf", model_path = NULL, dim = 100, window = 5, iter = 5) { # Internal function to calculate cosine similarity cosine_similarity <- function(a, b) { if (length(a) == 0 || length(b) == 0) return(0) sim <- sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2))) # Ensure the result is between 0 and 1 return((sim + 1) / 2) } # Internal function to load pre-trained GloVe embeddings load_glove <- function(file_path) { tryCatch({ conn <- file(file_path, "r") lines <- readLines(conn) close(conn) split_lines <- strsplit(lines, " ") words <- sapply(split_lines, `[`, 1) vectors <- t(sapply(split_lines, function(x) as.numeric(x[-1]))) rownames(vectors) <- words return(vectors) }, error = function(e) { stop(paste("Error loading GloVe file:", e$message)) }) } # Internal function to calculate sentence embedding sentence_embedding <- function(sentence, word_vectors) { tokens <- unlist(strsplit(sentence, "\\s+")) valid_tokens <- tokens[tokens %in% rownames(word_vectors)] if (length(valid_tokens) == 0) { return(rep(0, ncol(word_vectors))) } embeddings <- word_vectors[valid_tokens, , drop = FALSE] if (nrow(embeddings) == 0) return(rep(0, ncol(word_vectors))) return(colMeans(embeddings)) } if (method == "tfidf") { # TF-IDF approach corpus <- c(conversation1, conversation2) dtm <- DocumentTermMatrix(Corpus(VectorSource(corpus))) tfidf <- weightTfIdf(dtm) m <- as.matrix(tfidf) # Issue a warning for short conversations or little vocabulary overlap if (nchar(conversation1) < 50 || nchar(conversation2) < 50 || ncol(m) < 5) { warning("The 'tfidf' method may not provide highly meaningful results for short conversations or those with little vocabulary overlap. Consider using 'word2vec' or 'glove' methods for more robust results.") } # If the conversations are identical, return 1 if (identical(conversation1, conversation2)) { return(1) } # Ensure we have at least one term in common if (ncol(m) == 0) { return(0) } # Calculate cosine similarity similarity <- cosine_similarity(m[1,], m[2,]) } else if (method == "word2vec" || method == "glove") { # Word2Vec or GloVe approach if (method == "word2vec") { # Train Word2Vec model all_text <- c(conversation1, conversation2) model <- word2vec(x = all_text, dim = dim, iter = iter, window = window, min_count = 1) word_vectors <- as.matrix(model) } else { # method == "glove" if (is.null(model_path)) { stop("Please provide a path to the pre-trained GloVe file.") } # Load pre-trained GloVe vectors word_vectors <- load_glove(model_path) } # Calculate embeddings for each conversation embedding1 <- sentence_embedding(conversation1, word_vectors) embedding2 <- sentence_embedding(conversation2, word_vectors) # Calculate cosine similarity similarity <- cosine_similarity(embedding1, embedding2) } else { stop("Invalid method. Choose 'tfidf', 'word2vec', or 'glove'.") } return(similarity) } ## ----------------------------------------------------------------------------- # Example usage with our speeches data tfidf_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2], method = "tfidf") word2vec_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2], method = "word2vec") print(paste("TF-IDF Similarity:", tfidf_similarity)) print(paste("Word2Vec Similarity:", word2vec_similarity)) # Note: For GloVe method, you need to provide a path to pre-trained GloVe vectors # glove_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2], method = "glove", model_path = "path/to/glove/vectors.txt") ## ----include=FALSE------------------------------------------------------------ structural_similarity <- function(conv1, conv2) { length_sim <- 1 - abs(length(conv1) - length(conv2)) / max(length(conv1), length(conv2)) avg_turn_length1 <- mean(nchar(conv1)) avg_turn_length2 <- mean(nchar(conv2)) turn_length_sim <- 1 - abs(avg_turn_length1 - avg_turn_length2) / max(avg_turn_length1, avg_turn_length2) return(mean(c(length_sim, turn_length_sim))) } ## ----------------------------------------------------------------------------- # Example usage with our speeches data struct_similarity <- structural_similarity(strsplit(speeches_data$text[1], "\n")[[1]], strsplit(speeches_data$text[2], "\n")[[1]]) print(paste("Structural Similarity:", struct_similarity)) ## ----include=FALSE------------------------------------------------------------ stylistic_similarity <- function(text1, text2) { # Helper function to calculate features for a single text calculate_features <- function(text) { words <- strsplit(text, " ")[[1]] sentences <- strsplit(text, "\\. ")[[1]] ttr <- length(unique(words)) / length(words) avg_sentence_length <- mean(sapply(sentences, function(s) length(strsplit(s, " ")[[1]]))) syllables <- sum(sapply(words, function(w) max(1, nchar(gsub("[^aeiouAEIOU]", "", w))))) fk_grade <- 0.39 * (length(words) / length(sentences)) + 11.8 * (syllables / length(words)) - 15.59 c(ttr = ttr, avg_sentence_length = avg_sentence_length, fk_grade = fk_grade) } features1 <- calculate_features(text1) features2 <- calculate_features(text2) feature_diff <- abs(features1 - features2) overall_similarity <- 1 - mean(feature_diff / pmax(features1, features2)) normalized1 <- (features1 - mean(features1)) / sd(features1) normalized2 <- (features2 - mean(features2)) / sd(features2) cosine_similarity <- sum(normalized1 * normalized2) / (sqrt(sum(normalized1^2)) * sqrt(sum(normalized2^2))) list( text1_features = features1, text2_features = features2, feature_differences = feature_diff, overall_similarity = overall_similarity, cosine_similarity = cosine_similarity ) } ## ----------------------------------------------------------------------------- # Example usage with our speeches data style_similarity <- stylistic_similarity(speeches_data$text[1], speeches_data$text[2]) print("Stylistic Similarity Results:") print(style_similarity) ## ----include=FALSE------------------------------------------------------------ sentiment_similarity <- function(conv1, conv2) { sent1 <- sentiment_by(conv1)$ave_sentiment sent2 <- sentiment_by(conv2)$ave_sentiment return(1 - abs(sent1 - sent2) / 2) } ## ----------------------------------------------------------------------------- # Example usage with our speeches data sent_similarity <- sentiment_similarity(speeches_data$text[1], speeches_data$text[2]) print(paste("Sentiment Similarity:", sent_similarity))