# Reasonably fast on modern hardware.
# \donttest{
# Vary the dimensions and the number of entries to find the limits of the
# local machine.
system.time({
  X <- rSparseMatrix(1e8, 1e8, 1e6)
})
system.time({
  M <- cosSparse(X)
})
# }
# The similarity matrix can take more memory than the input; consider
# removing small values from the result to improve sparsity.
# (Sizes in the trailing comments are indicative; the data are random.)
X <- rSparseMatrix(1e5, 1e5, 1e6)
print(object.size(X), units = "auto")  # 12 Mb
system.time({
  M <- cosSparse(X)
})
print(object.size(M), units = "auto")  # 59 Mb
M <- drop0(M, tol = 0.1)               # remove small values
print(object.size(M), units = "auto")  # 14 Mb
# Compare various weightings.
#
# With random data from a normal distribution there is almost no difference
# between the weightings.
X <- rSparseMatrix(1e2, 1e2, 1e3)

# Non-zero similarity values under each weighting scheme.
w0 <- cosSparse(X, norm = norm2, weight = NULL)@x
wi <- cosSparse(X, norm = norm2, weight = idf)@x
ws <- cosSparse(X, norm = norm2, weight = isqrt)@x

# Scatterplot matrix of the three sets of similarities.
pairs(
  ~ w0 + wi + ws,
  labels = c(
    "no weighting",
    "inverse\ndocument\nfrequency",
    "inverse\nsquare root"
  )
)
# With heavily skewed data there is a strong difference!
#
# The values are Poisson counts (lambda = 10). The generator uses its
# argument `n` (the number of values to generate) instead of a hard-coded
# count, so it keeps returning the right number of values if the number of
# entries requested above is changed.
X <- rSparseMatrix(
  1e2, 1e2, 1e3,
  rand.x = function(n) {
    round(rpois(n, 10), 2)
  }
)

# Non-zero similarity values under each weighting scheme.
w0 <- cosSparse(X, norm = norm2, weight = NULL)@x
wi <- cosSparse(X, norm = norm2, weight = idf)@x
ws <- cosSparse(X, norm = norm2, weight = isqrt)@x

# Scatterplot matrix of the three sets of similarities.
pairs(
  ~ w0 + wi + ws,
  labels = c(
    "no weighting",
    "inverse\ndocument\nfrequency",
    "inverse\nsquare root"
  )
)
# Run the code above in your browser using DataLab