# Attach the naivebayes package: multinomial_naive_bayes(), %class%, and %prob%
# used below are all provided by it. Without this call the script errors at the
# first model fit when run as a standalone file.
library(naivebayes)
### Simulate the data:
set.seed(1)
cols <- 3        # vocabulary size (number of distinct words)
rows <- 10000    # total number of documents
rows_spam <- 100 # number of spam documents (the rest are non-spam)
# True per-class word-occurrence probabilities, normalised to sum to 1
prob_word_non_spam <- prop.table(runif(cols))
prob_word_spam <- prop.table(runif(cols))
# Draw word-count vectors: every document contains `cols` word occurrences,
# distributed across the vocabulary according to its class distribution
spam_counts <- t(rmultinom(rows_spam, size = cols, prob = prob_word_spam))
non_spam_counts <- t(rmultinom(rows - rows_spam, size = cols, prob = prob_word_non_spam))
M <- rbind(spam_counts, non_spam_counts)
dimnames(M) <- list(paste0("doc", seq_len(rows)), paste0("word", seq_len(cols)))
head(M)
# Class labels aligned with the row order of M (spam rows come first)
y <- rep(c("spam", "non-spam"), times = c(rows_spam, rows - rows_spam))
### Train the Multinomial Naive Bayes classifier
laplace <- 1 # additive (Laplace) smoothing constant
mnb <- multinomial_naive_bayes(x = M, y = y, laplace = laplace)
summary(mnb)
# Predicted class labels (shorthand: head(mnb %class% M))
pred_class <- predict(mnb, newdata = M, type = "class")
head(pred_class)
# Posterior class probabilities (shorthand: head(mnb %prob% M))
pred_prob <- predict(mnb, newdata = M, type = "prob")
head(pred_prob)
# Estimated per-class word probabilities
coef(mnb)
# Compare the estimates against the true simulation parameters
round(cbind(non_spam = prob_word_non_spam, spam = prob_word_spam), 3)
### Sparse data: train the Multinomial Naive Bayes
library(Matrix)
# Convert the dense count matrix to a sparse representation
M_sparse <- Matrix(M, sparse = TRUE)
class(M_sparse) # dgCMatrix
# The same fitting call accepts the sparse matrix directly
mnb_sparse <- multinomial_naive_bayes(M_sparse, y, laplace = laplace)
# Predicted class labels
sparse_class <- predict(mnb_sparse, newdata = M_sparse, type = "class")
head(sparse_class)
# Posterior class probabilities
sparse_prob <- predict(mnb_sparse, newdata = M_sparse, type = "prob")
head(sparse_prob)
# Estimated per-class word probabilities
coef(mnb_sparse)
# Run the code above in your browser using DataLab