if (FALSE) {
library(sparklyr)
# Parameters of the synthetic data set consumed by gen_pic_data(): two
# circles centered at the origin, described by a radius and a point count.
n1 <- 80L # number of points sampled on the first (inner) circle
r1 <- 1 # radius of the first circle
n2 <- 80L # number of points sampled on the second (outer) circle
r2 <- 4 # radius of the second circle

# Connection to a Spark instance using the "local" master.
sc <- spark_connect(master = "local")
gen_circle <- function(radius, num_pts) {
  # Generate evenly distributed points on a circle centered at the origin.
  #
  # Args:
  #   radius:  radius of the circle.
  #   num_pts: number of points to generate.
  #
  # Returns:
  #   A list of length `num_pts`. Element k + 1 (k = 0, ..., num_pts - 1) is
  #   the numeric vector c(x, y) of the point at angle 2 * pi * k / num_pts.
  #
  # seq_len() is used rather than seq(0, num_pts - 1) because the latter
  # counts down to c(0, -1) when num_pts is 0, producing two NaN points
  # instead of an empty list.
  lapply(
    seq_len(num_pts) - 1L,
    function(pt) {
      theta <- 2 * pi * pt / num_pts
      radius * c(cos(theta), sin(theta))
    }
  )
}
guassian_similarity <- function(pt1, pt2) {
  # Gaussian similarity of two points given as numeric coordinate vectors:
  # exp(-d^2 / 2), where d^2 is the squared Euclidean distance between them.
  # Identical points score 1; the score decays towards 0 as they move apart.
  offset <- pt2 - pt1
  sq_dist <- sum(offset^2)
  exp(-sq_dist / 2)
}
gen_pic_data <- function() {
  # Generate points on 2 concentric circles centered at the origin (radius r1
  # with n1 points, radius r2 with n2 points) and compute the pairwise
  # Gaussian similarity of every unordered pair of points.
  #
  # Returns:
  #   A tibble with one row per unordered pair of points and columns
  #     src - zero-based index of the higher-numbered point (integer)
  #     dst - zero-based index of the lower-numbered point (integer)
  #     sim - guassian_similarity() of the two points (double)
  #   Rows are ordered by src, then by dst. With fewer than two points the
  #   result has zero rows (the previous seq(2, n) loop counted downwards in
  #   that case and indexed out of bounds).
  pts <- append(gen_circle(r1, n1), gen_circle(r2, n2))
  n <- length(pts)

  # Zero-based index of the higher-numbered point of each pair: 1, ..., n - 1.
  # max() guards seq_len() against a negative length when there are no points.
  later <- seq_len(max(n - 1L, 0L))

  # Point `v` is paired with each of the `v` points that precede it, giving
  # src = 1, 2, 2, 3, 3, 3, ... and dst = 0, 0, 1, 0, 1, 2, ...
  src <- rep(later, times = later)
  dst <- sequence(later) - 1L

  # pts is one-based, so shift the zero-based ids back when looking up points.
  sim <- vapply(
    seq_along(src),
    function(k) {
      guassian_similarity(pts[[src[[k]] + 1L]], pts[[dst[[k]] + 1L]])
    },
    numeric(1)
  )

  tibble::tibble(src = src, dst = dst, sim = sim)
}
# Copy the pairwise-similarity table produced by gen_pic_data() to the Spark
# connection `sc`.
# NOTE(review): copy_to() may derive the remote table name from the argument
# expression -- confirm before hoisting gen_pic_data() into a variable.
pic_data <- copy_to(sc, gen_pic_data())
# Run power iteration clustering on the copied table: `src` and `dst` are the
# two endpoint columns of each pair, `sim` is used as the weight, and the
# call asks for k = 2 clusters with max_iter = 40.
clusters <- ml_power_iteration(
pic_data,
src_col = "src", dst_col = "dst", weight_col = "sim", k = 2, max_iter = 40
)
# Show the clustering result.
print(clusters)
}
# Run the code above in your browser using DataLab