forked from tidyverse/dplyr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsrc_sqlite.Rd
164 lines (129 loc) · 5.49 KB
/
src_sqlite.Rd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/src-sqlite.r
\name{src_sqlite}
\alias{src_sqlite}
\alias{tbl.src_sqlite}
\title{Connect to a sqlite database.}
\usage{
src_sqlite(path, create = FALSE)
\method{tbl}{src_sqlite}(src, from, ...)
}
\arguments{
\item{path}{Path to SQLite database}
\item{create}{if \code{FALSE}, \code{path} must already exist. If
\code{TRUE}, will create a new SQlite3 database at \code{path}.}
\item{src}{a sqlite src created with \code{src_sqlite}.}
\item{from}{Either a string giving the name of table in database, or
\code{\link{sql}} described a derived table or compound join.}
\item{...}{Included for compatibility with the generic, but otherwise
ignored.}
}
\description{
Use \code{src_sqlite} to connect to an existing sqlite database,
and \code{tbl} to connect to tables within that database.
If you are running a local sqliteql database, leave all parameters set as
their defaults to connect. If you're connecting to a remote database,
ask your database administrator for the values of these variables.
}
\section{Debugging}{
To see exactly what SQL is being sent to the database, you see
\code{\link{show_query}} and \code{\link{explain}}.
}
\section{Grouping}{
Typically you will create a grouped data table is to call the \code{group_by}
method on a mysql tbl: this will take care of capturing
the unevalated expressions for you.
For best performance, the database should have an index on the variables
that you are grouping by. Use \code{\link{explain}} to check that
the database is using the indexes that you expect.
}
\section{Output}{
All data manipulation on SQL tbls are lazy: they will not actually
run the query or retrieve the data unless you ask for it: they all return
a new \code{\link{tbl_sql}} object. Use \code{\link{compute}} to run the
query and save the results in a temporary in the database, or use
\code{\link{collect}} to retrieve the results to R.
Note that \code{do} is not lazy since it must pull the data into R.
It returns a \code{\link{tbl_df}} or \code{\link{grouped_df}}, with one
column for each grouping variable, and one list column that contains the
results of the operation. \code{do} never simplifies its output.
}
\section{Query principles}{
This section attempts to lay out the principles governing the generation
of SQL queries from the manipulation verbs. The basic principle is that
a sequence of operations should return the same value (modulo class)
regardless of where the data is stored.
\itemize{
\item \code{arrange(arrange(df, x), y)} should be equivalent to
\code{arrange(df, y, x)}
\item \code{select(select(df, a:x), n:o)} should be equivalent to
\code{select(df, n:o)}
\item \code{mutate(mutate(df, x2 = x * 2), y2 = y * 2)} should be
equivalent to \code{mutate(df, x2 = x * 2, y2 = y * 2)}
\item \code{filter(filter(df, x == 1), y == 2)} should be
equivalent to \code{filter(df, x == 1, y == 2)}
\item \code{summarise} should return the summarised output with
one level of grouping peeled off.
}
}
\examples{
\dontrun{
# Connection basics ---------------------------------------------------------
# To connect to a database first create a src:
my_db <- src_sqlite(path = tempfile(), create = TRUE)
# Then reference a tbl within that src
my_tbl <- tbl(my_db, "my_table")
}
# Here we'll use the Lahman database: to create your own local copy,
# run lahman_sqlite()
\dontrun{
if (requireNamespace("RSQLite") && has_lahman("sqlite")) {
lahman_s <- lahman_sqlite()
# Methods -------------------------------------------------------------------
batting <- tbl(lahman_s, "Batting")
dim(batting)
colnames(batting)
head(batting)
# Data manipulation verbs ---------------------------------------------------
filter(batting, yearID > 2005, G > 130)
select(batting, playerID:lgID)
arrange(batting, playerID, desc(yearID))
summarise(batting, G = mean(G), n = n())
mutate(batting, rbi2 = 1.0 * R / AB)
# note that all operations are lazy: they don't do anything until you
# request the data, either by `print()`ing it (which shows the first ten
# rows), by looking at the `head()`, or `collect()` the results locally.
system.time(recent <- filter(batting, yearID > 2010))
system.time(collect(recent))
# Group by operations -------------------------------------------------------
# To perform operations by group, create a grouped object with group_by
players <- group_by(batting, playerID)
group_size(players)
# sqlite doesn't support windowed functions, which means that only
# grouped summaries are really useful:
summarise(players, mean_g = mean(G), best_ab = max(AB))
# When you group by multiple level, each summarise peels off one level
per_year <- group_by(batting, playerID, yearID)
stints <- summarise(per_year, stints = max(stint))
filter(ungroup(stints), stints > 3)
summarise(stints, max(stints))
# Joins ---------------------------------------------------------------------
player_info <- select(tbl(lahman_s, "Master"), playerID, birthYear)
hof <- select(filter(tbl(lahman_s, "HallOfFame"), inducted == "Y"),
playerID, votedBy, category)
# Match players and their hall of fame data
inner_join(player_info, hof)
# Keep all players, match hof data where available
left_join(player_info, hof)
# Find only players in hof
semi_join(player_info, hof)
# Find players not in hof
anti_join(player_info, hof)
# Arbitrary SQL -------------------------------------------------------------
# You can also provide sql as is, using the sql function:
batting2008 <- tbl(lahman_s,
sql("SELECT * FROM Batting WHERE YearID = 2008"))
batting2008
}
}
}