% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nlp_flow.R
\name{dtm_chisq}
\alias{dtm_chisq}
\title{Compare term usage across 2 document groups using the Chi-square Test for Count Data}
\usage{
dtm_chisq(dtm, groups, correct = TRUE, ...)
}
\arguments{
\item{dtm}{a document term matrix: an object returned by \code{\link{document_term_matrix}}}

\item{groups}{a logical vector with 2 groups (TRUE / FALSE) where the size of the \code{groups} vector 
is the same as the number of rows of \code{dtm} and where element i corresponds to row i of \code{dtm}}

\item{correct}{passed on to \code{\link{chisq.test}}}

\item{...}{further arguments passed on to \code{\link{chisq.test}}}
}
\value{
a data.frame with columns term, chisq, p.value, freq, freq_true, freq_false indicating for each term in the \code{dtm},
how frequently it occurs in each group, the Chi-Square value and its corresponding p-value.
}
\description{
Perform a \code{\link{chisq.test}} to check whether specific terms are more prevalent in one group of documents than in the other.\cr
The function looks at each term in the document term matrix and applies a \code{\link{chisq.test}} comparing the frequency 
of occurrence of that term to the frequency of the other terms in the document group.
}
\examples{
data(brussels_reviews_anno)

## ---------------------------------------------------------------
## Example 1: nouns which stand out in text containing 'centre'
## ---------------------------------------------------------------
## Keep the French nouns and reduce them to doc_id / lemma pairs
nouns <- subset(brussels_reviews_anno, xpos == "NN" & language == "fr")
nouns <- nouns[, c("doc_id", "lemma")]
## Build the document term matrix from the term frequencies
dtm <- document_term_matrix(document_term_frequencies(nouns))
## TRUE for each row of the dtm where the count of 'centre' is above zero
has_centre <- dtm[, "centre"] > 0
relevant <- dtm_chisq(dtm, groups = has_centre)
head(relevant, n = 10)

## ---------------------------------------------------------------
## Example 2: adjectives which stand out in text containing 'hote'
## ---------------------------------------------------------------
adjectives <- subset(brussels_reviews_anno, xpos == "JJ" & language == "fr")
adjectives <- adjectives[, c("doc_id", "lemma")]
dtm <- document_term_matrix(document_term_frequencies(adjectives))

## Identifiers of the documents with the lemma 'hote', matched
## against the rows of the dtm to get one TRUE / FALSE per document
hote_docs <- subset(brussels_reviews_anno, lemma \%in\% "hote")$doc_id
in_group <- rownames(dtm) \%in\% hote_docs
relevant <- dtm_chisq(dtm, groups = in_group)
head(relevant, n = 10)


\dontrun{
# switch off scientific notation so the p-values are printed in full
options(scipen = 100)
head(relevant, n = 10)
}
}
