Skip to content

Commit 329009b

Browse files
committed
Data transformation (first draft)
1 parent 42261e4 commit 329009b

File tree

7 files changed

+225874
-0
lines changed

7 files changed

+225874
-0
lines changed

Przetwarzanie_danych/MANIFEST

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Transformowanie_danych_(dplyr)
2+
Grupowanie_danych_(dplyr)
3+
Czyszczenie_danych_(tidyr)
4+
Obsluga_dat_(lubridate)

Przetwarzanie_danych/Transformowanie_danych_(dplyr)/2014-07-08.csv

+225,469
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# Turn off double evaluation to make things faster
2+
AUTO_DETECT_NEWVAR <- FALSE
3+
4+
match_call <- function(correct_call = NULL) {
  # `e` is fetched from the frame that invoked this test; its `expr` element
  # is treated as the expression the user typed.
  e <- get("e", parent.frame())

  # Without a reference call there is nothing to compare: accept the answer.
  if (!is.null(correct_call)) {
    # Bring both calls to the same shape (full argument names) before
    # comparing, so positional and named forms are compared alike.
    expected <- expand_call(correct_call)
    submitted <- expand_call(deparse(e$expr))
    return(identical(expected, submitted))
  }

  TRUE
}
16+
17+
# Utility function for the match_call answer test.
#
# Parses `call_string` and fills out the function call with full argument
# names, e.g. "rnorm(10, 0)" becomes rnorm(n = 10, mean = 0). For an
# assignment made with `<-`, only the right-hand side is expanded.
#
# Args:
#   call_string: character string holding R code; only the first parsed
#     expression is used.
#
# Returns:
#   The parsed expression with argument names filled in where possible.
#   The expression is returned unexpanded when it cannot be expanded: it is
#   not a call, the function is primitive, the function position is not a
#   plain name (e.g. pkg::fun(...)), the function cannot be found, or the
#   arguments do not fit the function's signature.
expand_call <- function(call_string) {
  # Function names are resolved starting from the caller's frame, which is
  # where match.fun() looked when it was called directly from this function.
  lookup_env <- parent.frame()

  # Expand one call; hand it back untouched whenever expansion is impossible.
  expand_one <- function(cl) {
    if (!is.call(cl)) return(cl)
    head <- cl[[1]]
    # A head such as pkg::fun or (function(x) x) is itself a call, not a
    # name; match.fun() would raise an error on it.
    if (!is.symbol(head)) return(cl)
    # A misspelled or missing function must not abort the answer test.
    fun <- tryCatch(
      get(as.character(head), mode = "function", envir = lookup_env),
      error = function(err) NULL
    )
    # match.call() does not support primitive functions
    if (is.null(fun) || is.primitive(fun)) return(cl)
    # Arguments that do not fit the signature make match.call() fail;
    # keep the call as written in that case.
    tryCatch(match.call(fun, cl), error = function(err) cl)
  }

  # Quote expression
  qcall <- parse(text = call_string)[[1]]
  # Names and constants have nothing to expand
  if (length(qcall) <= 1) return(qcall)

  # Checked on the call head directly, so `methods::is` is not needed.
  if (identical(qcall[[1]], as.name("<-"))) {
    # Assignment: expand the right-hand side only
    qcall[[3]] <- expand_one(qcall[[3]])
    qcall
  } else {
    # Not an assignment: expand the whole expression
    expand_one(qcall)
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
dplyr
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
### Code used to download and save data
2+
# logurl <- 'http://cran-logs.rstudio.com/2014/2014-07-08.csv.gz'
3+
# filepath <- paste0('~/', basename(logurl))
4+
# download.file(logurl, filepath)
5+
# cranlog <- read.csv(filepath)
6+
# write.csv(cranlog, 'Przetwarzanie_danych/Transformowanie_danych_(dplyr)/2014-07-08.csv')
7+
# unlink(filepath)
8+
9+
# Expose the location of the bundled CSV to the user. The path is built
# from the directory of the loaded 'swirl.pl' package.
path2csv <- file.path(
  path.package('swirl.pl'),
  'Courses',
  'Przetwarzanie_danych',
  'Transformowanie_danych_(dplyr)',
  '2014-07-08.csv'
)

# Quietly build the data sets the lesson relies on. Students who quit and
# resume later need them to exist again, and recreating them here is
# cheaper than storing them in the progress file.
cran <- tbl_df(
  read.csv(path2csv, stringsAsFactors = FALSE)
)
cran2 <- select(cran, size:ip_id)
cran3 <- select(cran, ip_id, package, size)

Przetwarzanie_danych/Transformowanie_danych_(dplyr)/lesson.yaml

+277
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
### Manipulating data with dplyr - Getting and Cleaning Data ###
2+
3+
# Code to download data
4+
# logurl <- 'http://cran-logs.rstudio.com/2014/2014-07-08.csv.gz'
5+
# filepath <- paste0('~/', basename(logurl))
6+
# download.file(logurl, filepath)
7+
# cranlog <- read.csv(filepath)
8+
# write.csv(cranlog, '~/Desktop/2014-07-08.csv')
9+
# unlink(filepath)
10+
11+
# Local copy of the CRAN download log used throughout these notes
path2csv <- file.path('~/Desktop/2014-07-08.csv')


### Intro
# Read the log with strings kept as character, then take a first look
df <- read.csv(path2csv, as.is = TRUE)
dim(df)
head(df)

# Wrap the data frame with dplyr's tbl_df() and print the result
library(dplyr)
cran <- tbl_df(df)
cran
?manip

# "The dplyr philosophy is to have small functions that each do
# one thing well."


### select()
# Pick columns by name, by range, or by excluding a range
cran %>% select(ip_id, package, country) # reorders
cran %>% select(r_arch:country)
cran %>% select(-(X:time))
cran %>% select(ip_id:size)

### filter()
# Keep rows matching one or more conditions
cran %>% filter(package == "swirl")
cran %>% filter(r_version == "3.1.1", country == "US")
cran %>% filter(country == "US" | country == "CA")
cran %>% filter(r_version >= "3.0.0", r_os == "linux-gnu")
cran %>% filter(!is.na(package))

### arrange()
# Sort rows by one or more columns; desc() reverses the order of a column
cran %>% arrange(ip_id)
cran %>% arrange(package, ip_id)
cran %>% arrange(country, desc(r_version), ip_id)
cran %>% arrange(package, desc(version), ip_id)

### mutate()
# Derive new columns; a later column may use one created just before it
cran1 <- cran %>% select(ip_id, package, size)
cran1 %>% mutate(size_mb = size / 2^20)
cran1 %>% mutate(size_mb = size / 2^20, size_gb = size_mb / 2^10)

### summarize()
# Collapse the table to a single summary row
cran %>% summarize(avg_bytes = mean(size))

0 commit comments

Comments
 (0)