-
Notifications
You must be signed in to change notification settings - Fork 57
/
Copy pathstep-first.R
139 lines (128 loc) · 4.19 KB
/
step-first.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#' Create a "lazy" data.table for use with dplyr verbs
#'
#' @description
#' A lazy data.table lazy captures the intent of dplyr verbs, only actually
#' performing computation when requested (with [collect()], [pull()],
#' [as.data.frame()], [data.table::as.data.table()], or [tibble::as_tibble()]).
#' This allows dtplyr to convert dplyr verbs into as few data.table expressions
#' as possible, which leads to a high performance translation.
#'
#' See `vignette("translation")` for the details of the translation.
#'
#' @param x A data table (or something can can be coerced to a data table).
#' @param immutable If `TRUE`, `x` is treated as immutable and will never
#' be modified by any code generated by dtplyr. Alternatively, you can set
#' `immutable = FALSE` to allow dtplyr to modify the input object.
#' @param name Optionally, supply a name to be used in generated expressions.
#' For expert use only.
#' @param key_by Set keys for data frame, using [select()] semantics (e.g.
#' `key_by = c(key1, key2)`.
#'
#' This uses [data.table::setkey()] to sort the table and build an index.
#' This will considerably improve performance for subsets, summaries, and
#' joins that use the keys.
#'
#' See `vignette("datatable-keys-fast-subset")` for more details.
#' @export
#' @aliases tbl_dt grouped_dt
#' @examples
#' library(dplyr, warn.conflicts = FALSE)
#'
#' # If you have a data.table, using it with any dplyr generic will
#' # automatically convert it to a lazy_dt object
#' dt <- data.table::data.table(x = 1:10, y = 10:1)
#' dt %>% filter(x == y)
#' dt %>% mutate(z = x + y)
#'
#' # Note that dtplyr will avoid mutating the input data.table, so the
#' # previous translation includes an automatic copy(). You can avoid this
#' # with a manual call to lazy_dt()
#' dt %>%
#' lazy_dt(immutable = FALSE) %>%
#' mutate(z = x + y)
#'
#' # If you have a data frame, you can use lazy_dt() to convert it to
#' # a data.table:
#' mtcars2 <- lazy_dt(mtcars)
#' mtcars2
#' mtcars2 %>% select(mpg:cyl)
#' mtcars2 %>% select(x = mpg, y = cyl)
#' mtcars2 %>% filter(cyl == 4) %>% select(mpg)
#' mtcars2 %>% select(mpg, cyl) %>% filter(cyl == 4)
#' mtcars2 %>% mutate(cyl2 = cyl * 2, cyl4 = cyl2 * 2)
#' mtcars2 %>% transmute(cyl2 = cyl * 2, vs2 = vs * 2)
#' mtcars2 %>% filter(cyl == 8) %>% mutate(cyl2 = cyl * 2)
#'
#' # Learn more about translation in vignette("translation")
#' by_cyl <- mtcars2 %>% group_by(cyl)
#' by_cyl %>% summarise(mpg = mean(mpg))
#' by_cyl %>% mutate(mpg = mean(mpg))
#' by_cyl %>%
#' filter(mpg < mean(mpg)) %>%
#' summarise(hp = mean(hp))
lazy_dt <- function(x, name = NULL, immutable = TRUE, key_by = NULL) {
# in case `x` has an `as.data.table()` method but not a `group_vars()` method
groups <- tryCatch(group_vars(x), error = function(e) character())
if (!is.data.table(x)) {
if (!immutable) {
abort("`immutable` must be `TRUE` when `x` is not already a data table.")
}
x <- as.data.table(x)
copied <- TRUE
} else {
copied <- FALSE
}
key_by <- enquo(key_by)
key_vars <- unname(tidyselect::vars_select(names(x), !!key_by))
if (length(key_vars)) {
if (immutable && !copied) {
x <- data.table::copy(x)
}
data.table::setkeyv(x, key_vars)
}
step_first(x, name = name, groups = groups, immutable = immutable, env = caller_env())
}
#' @export
dim.dtplyr_step_first <- function(x) {
dim(x$parent)
}
step_first <- function(parent, name = NULL, groups = character(),
immutable = TRUE, env = caller_env()) {
stopifnot(is.data.table(parent))
if (is.null(name)) {
name <- unique_name()
}
new_step(parent,
vars = names(parent),
groups = groups,
locals = list(),
implicit_copy = !immutable,
needs_copy = FALSE,
name = sym(name),
env = env,
class = "dtplyr_step_first"
)
}
#' @export
dt_call.dtplyr_step_first <- function(x, needs_copy = FALSE) {
if (needs_copy) {
expr(copy(!!x$name))
} else {
x$name
}
}
#' @export
dt_sources.dtplyr_step_first <- function(x) {
stats::setNames(list(x$parent), as.character(x$name))
}
#' @export
dt_has_computation.dtplyr_step_first <- function(x) {
FALSE
}
unique_name <- local({
i <- 0
function() {
i <<- i + 1
paste0("_DT", i)
}
})