library(dplyr)
# Connect to a temporary in-memory SQLite database
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
# Add some data
copy_to(con, mtcars)
DBI::dbListTables(con)
# To retrieve a single table from a source, use `tbl()`
con %>% tbl("mtcars")
# Use `in_schema()` for fully qualified table names
con %>% tbl(in_schema("temp", "mtcars")) %>% head(1)
# You can also use pass raw SQL if you want a more sophisticated query
con %>% tbl(sql("SELECT * FROM mtcars WHERE cyl = 8"))
# If you just want a temporary in-memory database, use src_memdb()
src2 <- src_memdb()
# To show off the full features of dplyr's database integration,
# we'll use the Lahman database. lahman_sqlite() takes care of
# creating the database.
if (requireNamespace("Lahman", quietly = TRUE)) {
batting <- copy_to(con, Lahman::Batting)
batting
# Basic data manipulation verbs work in the same way as with a tibble
batting %>% filter(yearID > 2005, G > 130)
batting %>% select(playerID:lgID)
batting %>% arrange(playerID, desc(yearID))
batting %>% summarise(G = mean(G), n = n())
# There are a few exceptions. For example, databases give integer results
# when dividing one integer by another. Multiply by 1 to fix the problem
batting %>%
select(playerID:lgID, AB, R, G) %>%
mutate(
R_per_game1 = R / G,
R_per_game2 = R * 1.0 / G
)
# All operations are lazy: they don't do anything until you request the
# data, either by `print()`ing it (which shows the first ten rows),
# or by `collect()`ing the results locally.
system.time(recent <- filter(batting, yearID > 2010))
system.time(collect(recent))
# You can see the query that dplyr creates with show_query()
batting %>%
filter(G > 0) %>%
group_by(playerID) %>%
summarise(n = n()) %>%
show_query()
}
Run the code above in your browser using DataLab