# NOT RUN {
## get the help for a method
## help("crossprod,db.obj-method")
## set up the database connection
## Assume that .port is port number and .dbname is the database name
cid <- db.connect(port = .port, dbname = .dbname, verbose = FALSE)
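## Optional: list the active connections to confirm the setup succeeded.
## (Commented-out sketch, not part of the original example; it assumes
## PivotalR's db.list() is available in this session.)
## db.list()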
## create a table from the example data.frame "abalone"
delete("abalone", conn.id = cid)
x <- as.db.data.frame(abalone, "abalone", conn.id = cid, verbose = FALSE)
lookat(crossprod(x[,-c(1,2)]))
x$arr <- db.array(1, x$length, x$diameter)
lookat(crossprod(x$arr))
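## Optional sanity check (commented-out sketch, not part of the original
## example): compare the in-database cross product with R's in-memory
## crossprod, assuming the local data.frame "abalone" matches the table
## created above and that the lookat() result can be coerced with as.matrix().
## m.db <- as.matrix(lookat(crossprod(x[,-c(1,2)])))
## m.r  <- crossprod(as.matrix(abalone[,-c(1,2)]))
## max(abs(m.db - m.r)) # should be near 0, up to floating-point error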
## -----------------------------------------------------
## Create a function that does Principal Component Analysis in parallel.
## As long as the number of features of the data table is fewer than
## ~ 5000, the matrix t(x) %*% x can be loaded into memory to compute
## the eigenvalues and eigenvectors. However, the step t(x) %*% x must
## be done in-database in parallel, because x can be very big.
pca <- function (x, center = TRUE, scale = FALSE)
{
    y <- scale(x, center = center, scale = scale) # center and scale the columns
    z <- as.db.data.frame(y, verbose = FALSE) # materialize an intermediate table so the scaled data is computed only once
    m <- lookat(crossprod(z)) # one scan of the table to compute t(Z) %*% Z in parallel
    d <- delete(z) # drop the intermediate table
    res <- eigen(m) # only this small eigen-decomposition runs in R
    n <- attr(y, "row.number") # row count, stored as an attribute by scale() so no extra table scan is needed
    ## return the result
    list(val = sqrt(res$values/(n-1)), # standard deviations of the principal components (sqrt of the covariance eigenvalues)
         vec = res$vectors, # columns of this matrix are the eigenvectors (loadings)
         center = attr(y, "scaled:center"),
         scale = attr(y, "scaled:scale"))
}
## create a db.data.frame object that points to the existing table "abalone"
dat <- db.data.frame("abalone", conn.id = cid, verbose = FALSE)
## exclude id and sex columns
p <- pca(dat[,-c(1,2)])
p$val # standard deviations of the principal components
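## Optional cross-check (commented-out sketch, not part of the original
## example): p$val should be close to the standard deviations reported by an
## in-memory prcomp, assuming the local data.frame "abalone" matches the
## database table.
## q <- prcomp(abalone[,-c(1,2)], center = TRUE, scale. = FALSE)
## q$sdev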
db.disconnect(cid, verbose = FALSE)
# }