Learn R Programming

bit64 (version 4.6.0-1)

hashmap: Hashing for 64bit integers

Description

This is an explicit implementation of the hash functionality that underlies matching and other functions in R. Explicit means that you can create, store and use hash functionality directly. One advantage is that you can re-use hashmaps, which avoids re-building them again and again.

Usage

hashfun(x, ...)

# S3 method for integer64
hashfun(x, minfac = 1.41, hashbits = NULL, ...)

hashmap(x, ...)

# S3 method for integer64
hashmap(x, nunique = NULL, minfac = 1.41, hashbits = NULL, cache = NULL, ...)

hashpos(cache, ...)

# S3 method for cache_integer64
hashpos(cache, x, nomatch = NA_integer_, ...)

hashrev(cache, ...)

# S3 method for cache_integer64
hashrev(cache, x, nomatch = NA_integer_, ...)

hashfin(cache, ...)

# S3 method for cache_integer64
hashfin(cache, x, ...)

hashrin(cache, ...)

# S3 method for cache_integer64
hashrin(cache, x, ...)

hashdup(cache, ...)

# S3 method for cache_integer64
hashdup(cache, ...)

hashuni(cache, ...)

# S3 method for cache_integer64
hashuni(cache, keep.order = FALSE, ...)

hashupo(cache, ...)

# S3 method for cache_integer64
hashupo(cache, keep.order = FALSE, ...)

hashtab(cache, ...)

# S3 method for cache_integer64
hashtab(cache, ...)

hashmaptab(x, ...)

# S3 method for integer64
hashmaptab(x, nunique = NULL, minfac = 1.5, hashbits = NULL, ...)

hashmapuni(x, ...)

# S3 method for integer64
hashmapuni(x, nunique = NULL, minfac = 1.5, hashbits = NULL, ...)

hashmapupo(x, ...)

# S3 method for integer64
hashmapupo(x, nunique = NULL, minfac = 1.5, hashbits = NULL, ...)

Value

See Details

Arguments

x

an integer64 vector

...

further arguments, passed from generics, ignored in methods

minfac

minimum factor by which the hashmap has more elements compared to the data x, ignored if hashbits is given directly

hashbits

length of hashmap is 2^hashbits

nunique

giving the correct number of unique elements can help reduce the size of the hashmap

cache

an optional cache() object into which to put the hashmap (by default a new cache is created)

nomatch

the value to be returned if an element is not found in the hashmap

keep.order

determines order of results and speed: FALSE (the default) is faster and returns in the (pseudo)random order of the hash function, TRUE returns in the order of first appearance in the original data, but this requires extra work

Details

| function | see also | description |
|---|---|---|
| hashfun | digest | export of the hash function used in hashmap |
| hashmap | match() | return hashmap |
| hashpos | match() | return positions of x in hashmap |
| hashrev | match() | return positions of hashmap in x |
| hashfin | %in%.integer64 | return logical whether x is in hashmap |
| hashrin | %in%.integer64 | return logical whether hashmap is in x |
| hashdup | duplicated() | return logical whether hashdat is duplicated using hashmap |
| hashuni | unique() | return unique values of hashmap |
| hashmapuni | unique() | return unique values of x |
| hashupo | unique() | return positions of unique values in hashdat |
| hashmapupo | unique() | return positions of unique values in x |
| hashtab | table() | tabulate values of hashdat using hashmap in keep.order=FALSE |
| hashmaptab | table() | tabulate values of x building hashmap on the fly in keep.order=FALSE |

See Also

match(), runif64()

Examples

Run this code
# Two small integer64 test vectors: x is a random permutation of NA and 0:9,
# y is ten draws with replacement from NA and 1:9 (so y may contain duplicates).
x <- as.integer64(sample(c(NA, 0:9)))
y <- as.integer64(sample(c(NA, 1:9), 10, TRUE))
# Apply the hash function that hashmap uses to y.
hashfun(y)
# Build one hashmap per vector; hx is built from x, hy from y.
hx <- hashmap(x)
hy <- hashmap(y)
# Inspect what the returned cache object hy contains.
ls(hy)
# Positions of x in the hashmap hy (compare match()).
hashpos(hy, x)
# Positions of the hashmap hx in y (compare match()).
hashrev(hx, y)
# Logical: is x in the hashmap hy / is the hashmap hx in y (compare %in%).
hashfin(hy, x)
hashrin(hx, y)
# Duplicate flags for the data behind hy (compare duplicated()).
hashdup(hy)
# Unique values: from the hashmap in hash order, from the hashmap in order of
# first appearance, and directly from y without a pre-built hashmap.
hashuni(hy)
hashuni(hy, keep.order=TRUE)
hashmapuni(y)
# Positions of the unique values, in the same three variants.
hashupo(hy)
hashupo(hy, keep.order=TRUE)
hashmapupo(y)
# Tabulation of values: from the hashmap, and directly from y.
hashtab(hy)
hashmaptab(y)

# Consistency checks: each hash-based result must be identical to the
# corresponding base-R result computed on plain integers.

# Matching: hashpos/hashrev against match().
stopifnot(identical(match(as.integer(x),as.integer(y)),hashpos(hy, x)))
stopifnot(identical(match(as.integer(x),as.integer(y)),hashrev(hx, y)))
# Membership: hashfin/hashrin against %in%.
stopifnot(identical(as.integer(x) %in% as.integer(y), hashfin(hy, x)))
stopifnot(identical(as.integer(x) %in% as.integer(y), hashrin(hx, y)))
# Duplicates and unique values against duplicated()/unique().
stopifnot(identical(duplicated(as.integer(y)), hashdup(hy)))
stopifnot(identical(as.integer64(unique(as.integer(y))), hashuni(hy, keep.order=TRUE)))
# keep.order only changes the order of the unique values, not the set itself.
stopifnot(identical(sort(hashuni(hy, keep.order=FALSE)), sort(hashuni(hy, keep.order=TRUE))))
# hashupo returns the positions in y of the values returned by hashuni.
stopifnot(identical(y[hashupo(hy, keep.order=FALSE)], hashuni(hy, keep.order=FALSE)))
stopifnot(identical(y[hashupo(hy, keep.order=TRUE)], hashuni(hy, keep.order=TRUE)))
stopifnot(identical(hashpos(hy, hashuni(hy, keep.order=TRUE)), hashupo(hy, keep.order=TRUE)))
stopifnot(identical(hashpos(hy, hashuni(hy, keep.order=FALSE)), hashupo(hy, keep.order=FALSE)))
# hashtab reports its values in the same order as hashuni with keep.order=FALSE.
stopifnot(identical(hashuni(hy, keep.order=FALSE), hashtab(hy)$values))
# Counts agree with table() once hashtab's counts are reordered by sorted value.
stopifnot(identical(as.vector(table(as.integer(y), useNA="ifany"))
, hashtab(hy)$counts[order.integer64(hashtab(hy)$values)]))
# The on-the-fly variants agree with the results from the pre-built hashmap.
stopifnot(identical(hashuni(hy, keep.order=TRUE), hashmapuni(y)))
stopifnot(identical(hashupo(hy, keep.order=TRUE), hashmapupo(y)))
stopifnot(identical(hashtab(hy), hashmaptab(y)))

    # Optional timing experiment. It is wrapped in if (FALSE) so it never runs
    # automatically; execute the body by hand to reproduce it.
    if (FALSE) {
    message("explore speed given size of the hasmap in 2^hashbits and size of the data")
    message("more hashbits means more random access and less collisions")
    message("i.e. more data means less random access and more collisions")
    bits <- 24
    # Offsets added to the exponent: data sizes run from 2^(bits-1) up to 2^bits.
    b <- seq(-1, 0, 0.1)
    # One row per data size, one column per hashmap size (hashbits = bits, bits+1).
    tim <- matrix(NA, length(b), 2, dimnames=list(b, c("bits","bits+1")))
    # seq_along() rather than 1:length(): yields zero iterations for an empty b.
    for (i in seq_along(b)){
      n <- as.integer(2^(bits+b[i]))
      x <- as.integer64(sample(n))
      # [3] picks the third element of the timing result -- presumably the
      # elapsed time, as with system.time(); confirm against repeat.time().
      tim[i,1] <- repeat.time(hashmap(x, hashbits=bits))[3]
      tim[i,2] <- repeat.time(hashmap(x, hashbits=bits+1))[3]
      # Print and plot after every data size so progress is visible while running.
      print(tim)
      matplot(b, tim)
    }
    message("we conclude that n*sqrt(2) is enough to avoid collisions")
    }

Run the code above in your browser using DataLab