Learn R Programming

udpipe (version 0.8.11)

dtm_bind: Combine 2 or more document term matrices either by rows or by columns

Description

These 2 methods provide cbind and rbind functionality for sparse matrix objects which are returned by document_term_matrix.

In case of dtm_cbind, if the rows are not ordered in the same way in x and y, it will order them based on the rownames. If there are missing rows these will be filled with NA values.
In case of dtm_rbind, if the columns are not ordered in the same way in x and y, it will order them based on the colnames. If there are missing columns these will be filled with NA values.

Usage

dtm_cbind(x, y, ...)

dtm_rbind(x, y, ...)

Value

a sparse matrix in which either the rows are put below each other (in case of dtm_rbind) or the columns are put next to each other (in case of dtm_cbind)

Arguments

x

a sparse matrix such as a "dgCMatrix" object which is returned by document_term_matrix

y

a sparse matrix such as a "dgCMatrix" object which is returned by document_term_matrix

...

more sparse matrices

See Also

document_term_matrix

Examples

Run this code
# \dontshow{
# Keep data.table on a single thread while this example runs.
data.table::setDTthreads(1)
# }
# Load the example annotations and work on them under a short alias.
data(brussels_reviews_anno)
x <- brussels_reviews_anno

## rbind
# Build one document term matrix per group of documents: term frequencies of
# "token" per "doc_id", restricted to the documents picked by subset().
dtm1 <- document_term_matrix(
  document_term_frequencies(
    x = subset(x, doc_id %in% c("10049756", "10284782")),
    document = "doc_id",
    term = "token"
  )
)
dtm2 <- document_term_matrix(
  document_term_frequencies(
    x = subset(x, doc_id %in% c("10789408", "12285061", "35509091")),
    document = "doc_id",
    term = "token"
  )
)
dtm3 <- document_term_matrix(
  document_term_frequencies(
    x = subset(x, doc_id %in% c("31133394", "36224131")),
    document = "doc_id",
    term = "token"
  )
)

# Stack two matrices first, then all three, checking the dimensions each time.
m <- dtm_rbind(dtm1, dtm2)
dim(m)
m <- dtm_rbind(dtm1, dtm2, dtm3)
dim(m)

## cbind
library(data.table)

# Keep only the "nl" and "fr" rows and switch to a data.table so that the
# derived term columns below can be added with `:=`.
x <- as.data.table(
  subset(brussels_reviews_anno, language %in% c("nl", "fr"))
)
# token_bigram is computed within each sentence of each document.
x <- x[,
  token_bigram := txt_nextgram(token, n = 2),
  by = list(doc_id, sentence_id)
]
# lemma_upos pastes lemma and upos together, separated by "//".
x <- x[, lemma_upos := sprintf("%s//%s", lemma, upos)]

# One document term matrix per term column, all keyed on doc_id.
dtm1 <- document_term_matrix(
  document_term_frequencies(x = x, document = "doc_id", term = "token")
)
dtm2 <- document_term_matrix(
  document_term_frequencies(x = x, document = "doc_id", term = "token_bigram")
)
dtm3 <- document_term_matrix(
  document_term_frequencies(x = x, document = "doc_id", term = "upos")
)
dtm4 <- document_term_matrix(
  document_term_frequencies(x = x, document = "doc_id", term = "lemma_upos")
)

# Put two matrices side by side, then all four.
m <- dtm_cbind(dtm1, dtm2)
dim(m)
m <- dtm_cbind(dtm1, dtm2, dtm3, dtm4)
dim(m)
# Drop different rows from each input so the two matrices no longer have the
# same rows before they are combined.
m <- dtm_cbind(dtm1[-c(100, 999), ], dtm2[-1000, ])
dim(m)

Run the code above in your browser using DataLab