# NOT RUN {
# ----- reasonably fast with large very sparse matrices -----
# Build a random 1e6 x 1e6 sparse matrix with 1e6 non-zero entries
# (density 1e-6); arguments are presumably (nrow, ncol, nnz, rfunc) per
# qlcMatrix::rSparseMatrix -- NULL rfunc gives a pattern/1 matrix; confirm.
X <- rSparseMatrix(1e6, 1e6, 1e6, NULL)
# Time the column-association computation; 'poi' is a method object
# exported by the package (poisson-based association scores).
system.time(M <- assocSparse(X, method = poi))
# Density of the result: with input this sparse, the output stays sparse too.
length(M@x) / prod(dim(M)) # only one in 1e6 cells non-zero
# }
# NOT RUN {
# ----- reaching limits of sparsity -----
# watch out:
# with slightly less sparse matrices the result will not be very sparse,
# so this will easily fill up your RAM during computation!
# Same nnz (1e6) but a much smaller 1e4 x 1e4 matrix => density 1e-2,
# which makes the association result largely dense.
X <- rSparseMatrix(1e4, 1e4, 1e6, NULL)
system.time(M <- assocSparse(X, method = poi))
print(object.size(M), units = "auto") # about 350 Mb
length(M@x) / prod(dim(M)) # 30% filled
# most values are low, so it often makes sense
# to remove low values to keep results sparse
# drop0 (Matrix package) zeroes out entries with |value| <= tol and
# drops them from the sparse storage, shrinking the object.
M <- drop0(M, tol = 2)
print(object.size(M), units = "auto") # reduces to 10 Mb
length(M@x) / prod(dim(M)) # down to less than 1% filled
# }
# NOT RUN {
# ----- defining new methods -----
# Using the following simple 'div' method (observed divided by expected)
# is the same as using a cosine similarity with a 1-norm, up to a factor nrow(X)
# Custom association method: elementwise ratio of observed (o) to
# expected (e) co-occurrence counts, as passed in by assocSparse().
# Interface unchanged: div(o, e) returns o/e, vectorized over both args.
div <- function(o, e) {
  o / e
}
# Small sanity check: the custom 'div' method agrees with 1-norm cosine
# similarity scaled by nrow(X).
X <- rSparseMatrix(10, 10, 30, NULL)
all.equal(
assocSparse(X, method = div),
cosSparse(X, norm = norm1) * nrow(X)
)
# ----- comparing methods -----
# Compare various methods on random data
# ignore values on diagonal, because different methods differ strongly here
# Note the different behaviour of pointwise mutual information (and division)
X <- rSparseMatrix(1e2, 1e2, 1e3, NULL)
# One association matrix per built-in method; zero the self-association
# diagonal in each so the scatterplots below compare only off-diagonal cells.
p <- assocSparse(X, method = poi); diag(p) <- 0
r <- assocSparse(X, method = res); diag(r) <- 0
m <- assocSparse(X, method = pmi); diag(m) <- 0
w <- assocSparse(X, method = wpmi); diag(w) <- 0
d <- assocSparse(X, method = div); diag(d) <- 0
# Pairwise scatterplots of the non-zero entries (@x slots) of each result;
# label order matches the formula order (w, p, r, d, m).
pairs(~w@x+p@x+r@x+d@x+m@x,
labels=c("weighted pointwise\nmutual information","poisson","residuals","division",
"pointwise\nmutual\ninformation"), cex = 0.7)
# }
# NOT RUN {
# The following is only true for non-zero/one matrices!
#
# pmi behaves radically differently with very small observed values:
# because log(0) = -Inf we get asymptotic behaviour around Observed = zero
#
# par(mfrow = c(1,3))
# O <- crossprod(X)@x
# plot(O, p, xlab = "observed", ylab = "poisson")
# plot(O, r, xlab = "observed", ylab = "residuals")
# plot(O, m, xlab = "observed", ylab = "pointwise mutual information")
# par(mfrow = c(1,1))
# }
# Run the code above in your browser using DataLab