--- title: "Introduction to quickSentiment" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to quickSentiment} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- # --- 1. SETUP: LOAD LIBRARIES --- # ------------------------------------------------------------------- ```{r, include = FALSE} library(quickSentiment) ``` ```{r setup} library(doParallel) # CRAN limits the number of cores used during package checks cores <- min(2, parallel::detectCores()) registerDoParallel(cores = cores) ``` # --- 2. LOAD AND PREPARE TRAINING DATA --- ```{r} # Look for the file in the installed package first csv_path <- system.file("extdata", "tweets.csv", package = "quickSentiment") # Fallback for when you are building the package locally if (csv_path == "") { csv_path <- "../inst/extdata/tweets.csv" } tweets <- read.csv(csv_path) set.seed(123) ``` # --- 3. PREPROCESS THE TEXT --- # ------------------------------------------------------------------- # Use the pre_process() function from our package to clean the raw text. # This step is done externally to the main pipeline, allowing you to reuse # the same cleaned text for multiple different models or analyses in the future. ```{r} tweets$cleaned_text <- pre_process(tweets$Tweet) tweets$sentiment = ifelse(tweets$Avg>0,'P','N') ``` # --- 4. RUN THE MAIN TRAINING PIPELINE --- # ------------------------------------------------------------------- # This is the core of the package. We call the main pipeline() function # to handle the train/test split, vectorization, model training, and evaluation. ```{r} result <- pipeline( # --- Define the vectorization method --- # Options: "bow" (raw counts), "tf" (term frequency), "tfidf" vect_method = "tf", # --- Define the model to train --- # Options: "logit", "rf", "xgb" model_name = "rf", # --- Specify the data and column names --- df = tweets, text_column_name = "cleaned_text", # The column with our preprocessed text sentiment_column_name = "sentiment", # The column with the target variable # --- Set vectorization options --- # Use n_gram = 2 for unigrams + bigrams, or 1 for just unigrams n_gram = 1 ) ``` # =================================================================== # --- 5. PREDICTION ON NEW, UNSEEN DATA --- # =================================================================== # The training is complete. The 'result' object now contains our trained # model and all the necessary "artifacts" for prediction. ```{r} tweets$sentimentPredict <- prediction( pipeline_object = result, df = tweets, text_column = "cleaned_text" ) ```