## this helper function displays the cooccurrence matrix together with marginals
with.marginals <- function (x) {
y <- x$M
rownames(y) <- with(x$rows, sprintf("%-8s | %6d", term, f))
colnames(y) <- with(x$cols, sprintf(" %s | %d", term, f))
y
}
## we will read this term-context DSM from a triplet file included in the package
with.marginals(DSM_TermContext)
## the triplet file with term-document frequencies
triplet.file <- system.file("extdata", "term_context_triplets.gz", package="wordspace")
cat(readLines(triplet.file), sep="\n") # file format
## marginals incorrect because matrix covers only subset of targets & features
TC1 <- read.dsm.triplet(triplet.file, freq=TRUE)
with.marginals(TC1) # marginal frequencies far too small
## TAB-delimited file with marginal frequencies and other information
marg.file <- system.file("extdata", "term_context_marginals.txt.gz", package="wordspace")
cat(readLines(marg.file), sep="\n") # notice the header row with "term" and "f"
## single table with marginals for rows and columns, but has to be specified twice
TC2 <- read.dsm.triplet(triplet.file, freq=TRUE,
rowinfo=marg.file, colinfo=marg.file, N=108771103)
with.marginals(TC2) # correct marginal frequencies
## marginals table without header: specify column lables separately
no.hdr <- system.file("extdata", "term_context_marginals_noheader.txt",
package="wordspace")
hdr.names <- c("term", "f", "df", "type")
TC3 <- read.dsm.triplet(triplet.file, freq=TRUE,
rowinfo=no.hdr, rowinfo.header=hdr.names,
colinfo=no.hdr, colinfo.header=hdr.names, N=108771103)
all.equal(TC2, TC3, check.attributes=FALSE) # same result
Run the code above in your browser using DataLab