Notes of R for data science_07Jul2020

缘起

整理文件时发现之前的笔记,放到这里。

目的

坐车等无聊情景下方便回顾这些技能。

# this is a note about the book [R For Data Science] 
# and  a little bitter of [R In Action]
# and [Advanced R]

# Rujiang Liang
# Created in 2018


# R FOR DATA SCIENCE ------------------------------------------------------
# DATA TRANSFORMATION -----------------------------------------------------

library(tidyverse)
library(nycflights13)

nycflights13::flights %>%
  print(n = 10, width = Inf) # width = Inf to show all variables

iris %>%
  .$Species %>% 
  table() %>%
  (function(x) x / 3) %>%
  (function(x) x + 2) %>%
  `*`(2) %>%
  round() %>%
  as.character()

diamonds2 <- diamonds %>%
  filter(between(y, 3, 20))

iris %>%
  .[["Species"]]

jan1 <- flights %>%
  filter(month == 1, day == 1)

test <- flights %>%
  filter(month == 11 | month == 12)

table(test$month)

filter(flights, month %in% c(11, 12)) %>%
  count(month)

# ***FALSE****
 filter(flights, month == 11 | 12) %>% count(month)

flights %>%
  arrange(year, month, day) 

flights %>%
  arrange(desc(dep_delay))

# SELECT helper
# starts_with("abc"): matches names that begin with “abc”.
# ends_with("xyz"): matches names that end with “xyz”.
# contains("ijk"): matches names that contain “ijk”.
# matches("(.)\\1"): selects variables that match a regular expression. 
# num_range("x", 1:3): matches x1, x2 and x3

flights %>%
  rename(YEAR = year, MONTH = month)

select(flights, time_hour, air_time, everything())

# generate variables by existing variables
flights %>%
  select(
    year:day,
    ends_with("delay"),
    distance,
    air_time
  ) %>%
  mutate(
    gain = dep_delay - arr_delay,
    hour = air_time / 60,
    gaim_per_hour = gain / hour
  )

# keep ONLY new variables
transmute(flights, 
          gain = dep_delay - arr_delay,
          hour = air_time / 60,
          gain_per_hour = gain / hour
)

transmute(flights, 
          dep_time, # so that we can keep oringal variables
          hour = dep_time %/% 100,
          minute = dep_time %% 100
)


# group_by summary
table1 <- tibble(
  a = c(1, 2),
  b = c(100, 200)
)

table1 %>%
  count(a)

table1 %>%
  count(a, wt = b)

flights %>%
  group_by(year, month, day) %>%
  summarise(count = n(), # MEAN or SD calculated by how many cases?
            delay = mean(dep_delay, na.rm = T)) # na.rm = T should not be omited 
                                                # unless the data have no NA.

flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = T),
    delay = mean(arr_delay, na.rm = T)
  ) %>%
  filter(count > 20, dest != "HNL")

not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))

not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay),
            n = n())

delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(delay = mean(arr_delay)) %>%
  ggplot(aes(x = delay)) + 
  geom_freqpoly(binwidth = 10)

not_cancelled %>% 
group_by(year, month, day) %>% 
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  )

#first(x), nth(x, 2), last(x) work similarly to x[1], x[2], x[length(x)]
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first_dep = first(dep_time), 
    last_dep = last(dep_time),
    quantile = quantile(dep_time, .75)
  )

iris %>%
  group_by(Species) %>%
  summarise(
    mean = mean(Sepal.Width),
    quantile = quantile(Sepal.Length, .75))

sapply(Filter(is.numeric, iris), function(i) list(mean = mean(i), sd = sd(i)))
funs <- list(min = min, median = median, mean = mean, max = max, sd = sd)
sapply(funs, function(f) f(1:10, na.rm = T))
sapply(Filter(is.numeric, iris), 
       function(x) sapply(funs, function(f) f(x, na.rm = T)))

iris %>%
  split(.$Species) %>%
  sapply(., function(df) 
    list(mean = mean(df$Sepal.Width), 
         quantile = quantile(df$Sepal.Length, .75)))
iris %>% 
  summarise(n())
iris %>%
 group_by(Species) %>%
  summarise(n(),
            n_distinct(Sepal.Length),
            sum(!is.na(Sepal.Length)))

not_cancelled %>% 
  group_by(year, month, day) %>% 
  mutate(r = min_rank(desc(dep_time))) %>%
  filter(r %in% range(r))

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    avg_delay1 = mean(arr_delay),
    avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay
  )

#n(), sum(!is.na(x)), n_distinct(x)
not_cancelled %>% 
  group_by(dest) %>% 
  summarise(carriers = n_distinct(carrier)) %>% 
  arrange(desc(carriers))

not_cancelled %>% 
  group_by(year, month, day) %>%
  filter(rank((arr_delay)) < 10)
iris %>%
  as.tibble %>%
  filter(rank(Sepal.Width) < 10)

iris %>%
  as.tibble %>%
  filter(min_rank(Sepal.Width) < 10)

iris %>%
  as.tibble %>%
  filter(rank(desc(Sepal.Width)) < 10)

popular_dests <- flights %>% 
  group_by(dest) %>% 
  filter(n() > 365)
popular_dests

popular_dests %>% 
  filter(arr_delay > 0) %>% 
  mutate(prop_delay = arr_delay / sum(arr_delay)) %>% 
  select(year:day, dest, arr_delay, prop_delay)

# related database (merge)
library(nycflights13)
# left_join:  by = c("dest" = "faa")
top_dest <- flights %>%
  count(dest, sort = T) %>%
  head(10)

flights %>%
  semi_join(top_dest)

## the below part of related database is not important
library(nycflights13)
airlines
airports
planes
planes %>%
  count(tailnum) %>%
  filter(n > 1)

weather %>% 
  count(year, month, day, hour, origin) %>% 
  filter(n > 1)

flights %>%
  count(year, month, day, hour, flight) %>%
  filter(n > 1)

flights <- flights %>%
  mutate(id = row_number())
flights

flights %>%
  count(id) %>%
  filter(n > 1)

flights2 <- nycflights13::flights %>%
  select(year:day, hour, origin, dest, tailnum, carrier) 
flights2 

flights2 %>%
  select(-origin, -dest) %>%
  left_join(airlines, by = "carrier")

flights2 %>%
  select(-origin, -dest) %>%
  mutate(name = airlines$name[match(carrier, airlines$carrier)])


x <- tribble(
  ~key, ~val_x,
  1, "x1",
  2, "x2",
  2, "x3",
  1, "x4"
)
y <- tribble(
  ~key, ~val_y,
  1, "y1",
  2, "y2"
)
left_join(x, y, by = "key")
left_join(y, x, by = "key")

flights2 %>%
  left_join(weather)

flights2 %>%
  left_join(planes, by = "tailnum")

flights2 %>%
  left_join(airports, by = c("dest" = "faa"))

flights2 %>%
  left_join(airports, by = c("origin" = "faa"))

airports %>%
  semi_join(flights, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
  borders("state") +
  geom_point(color = "blue") +
  coord_quickmap() + 
  theme_void()

top_dest <- flights %>%
  count(dest, sort = T) %>%
  head(10)

flights %>%
  filter(dest %in% top_dest$dest)

flights %>%
  semi_join(top_dest, by = "dest")

# spreading and gathering 
library(tidyverse)
table4a
tidy4a <- table4a %>%
  gather(`1999`, `2000`, key = "year", value = "cases")
tidy4a

table4b
tidy4b <- table4b %>%
  gather(`1999`, `2000`, key = "year", value = "population")

tidy4a %>%
  left_join(tidy4b, by = c("country", "year"))

table2
table2 %>%
  spread(key = type, value = count)

table3
table3 %>%
  separate(rate, into = c("cases", "population"))

table3 %>%
  separate(rate, into = c("cases", "population"), sep = "/", convert = T)

table3 %>%
  separate(year, into = c("century", "year"), sep = 2)

table5
table5 %>%
  unite(new, century, year)

table5 %>%
  unite(new, century, year, sep = "")
stocks <- tibble(
  year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
  qtr    = c(   1,    2,    3,    4,    2,    3,    4),
  return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)
stocks
stocks %>%
  spread(year, return)

stocks %>% 
  spread(year, return) %>% 
  gather(`2015`:`2016`, key = "year", value = "return")
stocks %>% 
  spread(year, return) %>% 
  gather(`2015`:`2016`, key = "year", value = "return", na.rm = T)

women %>% 
  mutate(id = row_number()) %>% 
  gather(-id, key = "key", value = "value")

stocks %>%
  complete(year, qtr)

treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)
treatment
treatment %>%
  fill(person)

# an example of data cleaning
library(tidyverse)
who
who1 <- who %>%
  gather(new_sp_m014:newrel_f65, key = "key", value = "case", na.rm = T)
who1
who1 %>%
  count(key)
who2 <- who1 %>%
  mutate(key = str_replace(key, "newrel", "new_rel"))
who2
who3 <- who2 %>%
  separate(key, c("new", "type", "sexage"), sep = "_")
who3 %>%
  count(new)
who4 <- who3 %>%
  select(-new, -iso2, -iso3)
who4

who5 <- who4 %>%
  separate(sexage, c("sex", "age"), sep = 1)
who5


tidyr::who %>%
  gather(key, value, new_sp_m014:newrel_f65, na.rm = T) %>%
  mutate(key = str_replace(key, "newrel", "new_rel")) %>%
  separate(key, c("new", "var", "sexage")) %>%
  select(-new, -iso2, -iso3) %>%
  separate(sexage, c("sex", "age"), sep = 1)

# INPUT / OUTPUT ----------------------------------------------------------

library(tidyverse)
ggplot(diamonds, aes(carat, price)) + 
  geom_hex()
ggsave("diamonds.pdf")

plot1 <- ggplot(diamonds, aes(clarity, price)) + 
  geom_boxplot()
ggsave("diamonds.png", plot = plot1)

write_csv(diamonds, "diamonds.csv")

read_csv("diamonds.csv")
read_csv(
  "a, b, c
  1, 2, 3
  4, 5, 6"
)

read_csv("The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3", skip = 2)

read_csv("# A comment I want to skip
  x,y,z
  1,2,3", comment = "#")

read_csv("1,2,3\n4,5,6", col_names = FALSE)
read_csv("1,2,3\n4,5,6")

read_csv("a,b,c\n1,2,.", na = ".")

write_rds(iris, "iris.rds")
read_rds("iris.rds")

# save(file, file = "name.RData)
# load("name.Rdata")

# STRINGR & REGULAR EXPRESSION --------------------------------------------

#strings
library(tidyverse)

string1 <- "this is a string"
string2 <- 'if i want to include a "quote" inside a string, i use single quotes'
x <- c("\"", "\\")
x
writeLines(x)
str_length(c("a", "r for data science", NA))

str_c("x", "y", sep = ", ")

x <- c("abc", NA)
x
str_c("|-", x, "-|")
str_c("|-", str_replace_na(x), "-|")
str_c("prefix-", c("a", "b", "c"), "-suffix")

name <- "Hadley"
time_of_day <- "morning"
birthday <- F

str_c("good ", time_of_day, " ", name, if (birthday) "   and happy birthday",
      ".")
str_c(c("x", "y", "z"), collapse = ", ")

#subsetting strings
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
str_sub(x, -3, -1)
str_sub("a", 1, 5)
str_sub(x, -3, -1) <- str_to_upper(str_sub(x, -3, -1))
x

#regular expression
x <- c("apple", "banana", "pear")
str_view(x, "an")

str_view(x, ".a.")
str_view(c("abc", "a.c", "bef"), "a\\.c")
x <- "a\\b"
writeLines(x)
str_view(x, "\\\\")

x <- c("apple", "banana", "pear")
str_view(x, "^a")
str_view(x, "a$")

x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
str_view(x, "^apple$")

#For example, I’ll search for \bsum\b to 
#avoid matching summarise, summary, rowsum and so on.

# \\d: matches any digit.
# \\s: matches any whitespace (e.g. space, tab, newline).
# [abc]: matches a, b, or c.
# [^abc]: matches anything except a, b, or c.

str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
str_view(c("abc", "a.c", "a*c", "a c"), "a[ ]c")
str_view(c("abc", "a.c", "a*c", "a c"), "a[b ]c")
# This works for most (but not all) 
# regex metacharacters: $ . | ? * + ( ) [ {. 
# Unfortunately, a few characters have special meaning 
# even inside a character class and 
# must be handled with backslash escapes: ] \ ^ and -

str_view(c("grey", "gray"), "gr(e|a)y")
str_view(c("grey", "gray"), "gr[ea]y")
str_view(c("grey", "gray", "ray", "r"), ".*r.*")

# ?: 0 or 1
# +: 1 or more
# *: 0 or more
# precedence of these operators is high

x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
str_view(x, "CC+")
str_view(x, "C[LX]+")

str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
str_view(x, "C{2,3}?")
str_view(x, "C[LX]+?")

str_view(fruit, "(..)\\1", match = T)


x <- c("apple", "banana", "pear")
str_detect(x, "e")

sum(str_detect(words, "^t"))
mean(str_detect(words, "[aeiou]$"))


no_vowels_1 <- !str_detect(words, "[aeiou]")
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)

words[str_detect(words, "x$")]
str_subset(words, "x$")

df <- tibble(
  word = words,
  i = seq_along(word)
)
df %>%
  filter(str_detect(word, "x$"))

x <- c("apple", "banana", "pear")
str_count(x, "a")
mean(str_count(x, "a"))

df %>%
  mutate(
    vowels = str_count(word, "[aeiou]"),
    consonants = str_count(word, "[^aeiou]")
  )

str_count("abababa", "aba")
str_view_all("abababa", "aba")

length(sentences)
head(sentences)
colors <- c("red", "orange", "yellow", "green", "blue", "purple")
color_match <- str_c(colors, collapse = "|")
color_match
has_color <- str_subset(sentences, color_match)
matchs <- str_extract(has_color, color_match)

more <- sentences[str_count(sentences, color_match) > 1] 
more
str_extract(more, color_match)
str_extract_all(more, color_match, simplify = T)
str_extract_all(more, color_match, simplify = F)
x <- c("a", "a b", "abc")
str_extract_all(x, "[a-z]", simplify = T)
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
  str_subset(noun) %>%
  head(10)
has_noun %>%
  str_extract(noun)
has_noun %>%
  str_match(noun)

# FACTORS -----------------------------------------------------------------

library(forcats)
x1 <- c("Dec", "Apr", "Jan", "Mar")
sort(x1)
month_levels <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun", 
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
y1 <- factor(x1, levels = month_levels)
sort(y1)
y1
x2 <- c("Dec", "Apr", "Jam", "Mar")
y2 <- factor(x2, levels = month_levels) # elements not in levels will be replaced with NA
y2
sort(y2)
levels(y2)
factor(x1)
factor(x1, levels = unique(x1)) # waht is this mean?

gss_cat %>%
  count(race)

gss_cat %>%
  ggplot(aes(race)) + 
  geom_bar()

gss_cat %>%
  ggplot(aes(race)) + 
  geom_bar() +
  scale_x_discrete(drop = F)

relig_summary <- gss_cat %>%
  group_by(relig) %>%
    summarise(
      age = mean(age, na.rm = T),
      tvhours = mean(tvhours, na.rm = T),
      n = n()
    ) 

ggplot(relig_summary, aes(tvhours, relig)) +
  geom_point()

ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) + 
  geom_point()

relig_summary %>%
  mutate(relig = fct_reorder(relig, tvhours)) # what is this mean?

rincome_summary <- gss_cat %>%
  group_by(rincome) %>%
  summarise(
    age = mean(age, na.rm = T), 
    tvhours = mean(tvhours, ra.rm = T),
    n = n()
  )
ggplot(rincome_summary, aes(age, rincome)) + geom_point()
ggplot(rincome_summary, aes(age, fct_relevel(rincome, "Not applicable"))) +  # what is this mean?
  geom_point()



# FUNCTION ----------------------------------------------------------------
library(tidyverse)
library(magrittr)

df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10), 
  d = rnorm(10)
)

x <- df$a
(x - min(x, no.rm = T)) / (max(x, na.rm = T) - min(x, na.rm = T))
rng <- range(x, na.rm = T)
(x - rng[1]) / (rng[2] - rng[1])

rescale01 <- function(x) {   # rescale a vector, use for-loop if you want rescale a dataframe
  rng <- range(x, na.rm = T)
  (x - rng[1]) / (rng[2] - rng[1])
}

rescale01(c(0, 5, 10))
rescale01(c(-10, 0, 10))
rescale01(c(1, 2, 3, NA, 5))

has_name <- function(x) {
  nms <- names(x)
  if (is.null(nms)) {
    rep(FALSE, length(x))
  } else {
    !is.na(nms) & nms != ""
  }
}

# if(length(x) == 0 || lenght(y) == 0), note that "OR AND" in *condition*
# shou be written as "|| &&"

f <- function(x, y, op) {
  switch(op, 
         plus = x + y,
         minus = x - y,
         times = x * y,
         dvide = x / y,
         stop("Unknown op")
  )
}

f(1, 2, "plus")
f(2, 2, "minus")
y <- 10
x <- if (y < 20) "Too low" else "Too high"
x

mean_ci <- function(x, conf = .95) {
  se <- sd(x) / sqrt(length(x))
  alpha = 1 - conf
  mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}

x <- runif(100)
mean_ci(x)
mean_ci(x, .99)

wt_mean <- function(x, w) {
  if (length(x) != length(w)) {
    stop("'x' and 'w' must be the same length", call. = F) # call. = F what mean?
  }
  sum(w * x) / sum(x)
}

wt_mean <- function(x, w, na.rm = F) { # how robust the function you want?
  if (!is.logical(na.rm)) {
    stop("'na.rm' must be logical")
  }
  if (length(na.rm) != 1) {
    stop("'na.rm' must be length 1")
  }
  if(length(x) != length(w)) {
    stop("'x' and 'w' must be the same length", call. = F)
  }
  
  if(na.rm) {         # impressive   if (condition) {}    nothing happened if (condition = F)
    miss <- is.na(x) | is.na(w)
    w <- w[!miss]
    x <- x[!miss]
  }
  sum(x * w) / sum(w)
}

# a shorthand of preview one, but no message to show if there are errors
wt_mean <- function(x, w, na.rm = F) { 
  stopifnot(is.logical(na.rm), length(na.rm) == 1)
  stopifnot(length(x) == length(w))  # stop if not (TURE) 
  
  if(na.rm) {
    miss <- is.na(x) | is.na(w)
    w <- w[!miss]
    x <- x[!miss]
  }
  sum(x * w) / sum(w)
}

commas <- function(...) stringr::str_c(..., collapse = ", ")
commas(letters[1:10])

rule <- function(..., pad = "-") {
  title <- paste0(...)
  width = getOption("width") - nchar(title) - 5
  cat(title, " ", stringr::str_dup(pad, width), "\n", sep = "")
}

x <- c(1, 2)
sum(x, na.mr = T) 
sum(x, na.mr = F) # be careful, some function treat T as 1 and F as 0

library(tidyverse)
typeof(letters)
typeof(1:10)
typeof(1)
typeof(1L)

x <- sqrt(2) ^ 2
x
x - 2
x == 2

c(-1, 0, 1) / 0

x <- sample(20, 100, replace = T)
sum(x > 10)
mean(x > 10)

x <- c(10, 3, NA, 5, 8, 1, NA)
x[!is.na(x)]
x[x %% 2 == 0]
x[x %% 2 == 0 & !is.na(x)]

y <- c(x = 1, y = 2, z = 3)
purrr::set_names(1:3, c("a", "b", "c"))
y["x"]
y[c("x", "y")]

x <- list(1, 2, 3)
x
str(x)
x_named <- list(a = 1, b = 2, c = 3)
str(x_named)
y <- list("a", 1L, 1.5, T)
str(y)
z <- list(list(1, 2), list(3, 4))
str(z)

a <- list(a = 1:3, b = "a string", c = pi, d = list(-1, -5))
str(a)
str(a[1:2])
str(a[[1]])
str(a[[4]])
a[[4]][1]
a[[4]][[1]]

df <- tibble(
  a = rnorm(10),
  b = rnorm(10), 
  c = rnorm(10),
  d = rnorm(10)
)

output <- vector("double", ncol(df))
for (i in seq_along(df)) {
  output[[i]] <- median(df[[i]])
}
output


# 1.The output: output <- vector("double", length(x)). Before you start the loop, you must always allocate sufficient space for the output. This is very important for efficiency: if you grow the for loop at each iteration using c() (for example), your for loop will be very slow.
# 
# A general way of creating an empty vector of given length is the vector() function. It has two arguments: the type of the vector (“logical”, “integer”, “double”, “character”, etc) and the length of the vector.
# 
# 2.The sequence: i in seq_along(df). This determines what to loop over: each run of the for loop will assign i to a different value from seq_along(df). It’s useful to think of i as a pronoun, like “it”.
# 
# You might not have seen seq_along() before. It’s a safe version of the familiar 1:length(l), with an important difference: if you have a zero-length vector, seq_along() does the right thing:
#   
#   y <- vector("double", 0)
# seq_along(y)
# #> integer(0)
# 1:length(y)
# #> [1] 1 0
# 
# You probably won’t create a zero-length vector deliberately, but it’s easy to create them accidentally. If you use 1:length(x) instead of seq_along(x), you’re likely to get a confusing error message.
# 
# 3.The body: output[[i]] <- median(df[[i]]). This is the code that does the work. It’s run repeatedly, each time with a different value for i. The first iteration will run output[[1]] <- median(df[[1]]), the second will run output[[2]] <- median(df[[2]]), and so on.

rescale01 <- function(x) {
  rng <- range(x, na.rm = T)
  (x - rng[1]) / (rng[2] - rng[1])
}

df$a <- rescale01(df$a)
df$b <- rescale01(df$b)
df$c <- rescale01(df$c)
df$d <- rescale01(df$d)

for (i in seq_along(df)) {
  df[[i]] <- rescale01(df[[i]])
}

# for (x in xs)
#   
#   for (nm in names(xs))
#     x[[nm]]

results <- vector("list", length(iris))
names(results) <- names(iris)

# “logical”, “integer”, “double”, “character”

for (i in seq_along(iris)) {  # what is mean?
  names <- names(iris)[[i]]
  value <- iris[[i]]
}

means <- c(0, 1, 2)
output <- double()
for (i in seq_along(means)) {
  n <- sample(100, 1)
  output <- c(output, rnorm(n, means[[i]]))
}
str(output)

#save time and space
out <- vector("list", length(means))
for (i in seq_along(means)) {
  n <- sample(100, 1)
  out[[i]] <- rnorm(n, means[[i]])
}
str(out)
str(unlist(out))

#You might be generating a long string. Instead of paste()ing together each iteration with the previous, save the output in a character vector and then combine that vector into a single string with paste(output, collapse = "").
#You might be generating a big data frame. Instead of sequentially rbind()ing in each iteration, save the output in a list, then use dplyr::bind_rows(output) to combine the output into a single data frame.

# for (i in seq_along(x)) {
#   # body
# }
# 
# # eauivalent to 
# i <- 1
# while (i <= length(x)) {
# # body  
# i <- i + 1
# }

flip <- function() sample(c("T", "H"), 1)
flips <- 0
nheads <- 0

while (nheads < 3) {
  if (flip() == "H") {
    nheads = nheads + 1
  } else {
    nheads <- 0
  }
  flips <- flips + 1
}

# files <- dir("data/", pattern = "\\.csv$", full.names = T)

# created by Rujiang Liang, it is not perfect
# show_mean <- function(df) {
#   vector <- logical()
#   for (i in 1:ncol(df)) {
#     vector[i] <- is.numeric(df[[i]])  
#   }
#   df_reduced <- df[, vector]
#   out_name <- names(df_reduced)
#   out_value <- double()
#   for (j in 1:ncol(df_reduced)) {
#     out_value[j] <- mean(df_reduced[[j]])
#   }
#   as.matrix(c(out_name, out_value), ncol = 2)
# }

df <- tibble(
  a = rnorm(10),
  b = rnorm(10), 
  c = rnorm(10), 
  d = rnorm(10)
)

output <- vector("double", length(df))
for (i in seq_along(df)) {
  output[i] <- mean(df[[i]])
}
output

col_mean <- function(df) {
  output <- vector("double", length(df))
  for (i in seq_along(df)){
    output[i] <- mean(df[[i]])
  }
  output
}

col_summary <- function(df, fun) {
  out <- vector("double", length(df))
  for (i in seq_along(df)) {
    out[i] <- fun(df[[i]])
  }
  out
}

col_summary(df, mean)
col_summary(df, sd)

map_dbl(df, mean)
map_dbl(df, median)
map_dbl(df, sd)
map_dbl(iris[-5], function(x) length(unique(x)))
map_dbl(iris[-5], ~length(unique(.)))

# map() makes a list.
# map_lgl() makes a logical vector.
# map_int() makes an integer vector.
# map_dbl() makes a double vector.
# map_chr() makes a character vector.

df %>% map_dbl(mean)

map_dbl(df, mean, trim = .5)

z <- list(x = 1:3, y = 4:5)
map_int(z, length)

models <- mtcars %>%
  split(.$cyl) %>%
  map(function(df) lm(mpg ~ wt, data = df)) 

models <- mtcars %>%
  split(.$cyl) %>%
  map(~lm(mpg ~ wt, data = .))

models %>%
  map(summary) %>%
  map_dbl(~.$r.squared)

models %>%
  map(summary) %>%
  map(~.$coefficients)

models %>%
  map(summary) %>%
  map(~.$coefficients[2, 1])


models %>%
  map(summary) %>%
  map(~.$coefficients["wt", ]) 

models %>%
  map(summary) %>%
  map_dbl("r.squared")

models %>%
  map(summary) %>%
  map("coefficients")

mtcars %>%
  split(.$cyl) %>%
  map(~lm(mpg ~ wt, .)) %>%
  map(summary) %>%
  map("coefficients") %>%
  map(~.["wt", c("Estimate", "t value")]) %>%
  reduce(rbind)

x <- list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9))
x %>% map_dbl(2)

threshold <- function(x, cutoff = .8) x[x > cutoff]

safe_log <- safely(log)
str(safe_log(10))
str(safe_log("a"))
x <- list(1, 10, "a")
y <- x %>% map(safely(log))
str(y)
y <- y %>% transpose()
str(y)
is_ok <- y$error %>% map_lgl(is_null)
x[!is_ok]
y$result[is_ok] %>% flatten_dbl()

x %>% map_dbl(possibly(log, NA))
x <- list(1, -1)
x %>% map(quietly(log)) %>% str()

mu <- tibble(`5` = 5, `10` = 10, `-3` = -3)
mu %>%
  map_df(rnorm, n = 5)

mu <- list(`5` = 5, `10` = 10, `-3` = -3)
mu %>% 
  map(rnorm, n = 5)  

mu %>%
  map_df(rnorm, n = 5)

sigma <- list(1, 5, 10)
seq_along(mu) %>%
  map(~rnorm(5, mu[[.]], sigma[[.]])) %>%
  str()

map2(mu, sigma, rnorm, n = 5) %>% str()

n <- list(1, 3, 5)
arg1 <- list(n, mu, sigma)
arg1 %>%
  pmap(rnorm) %>%
  str()
# better to name the arguments
arg2 <- list(mean = mu, sd = sigma, n = n)
arg2 %>%
  pmap(rnorm) %>%
  str()

# a data.frame ensures name and some lenght of each column
params <- tibble( 
  mean = c(5, 10, -3),
  sd = c(1, 5, 10),
  n = c(1, 3, 5)
)
params %>%
  pmap(rnorm)

sim <- tribble(
  ~f,      ~params,
  "runif", list(min = -1, max = 1),
  "rnorm", list(sd = 5),
  "rpois", list(lambda = 10)
)
sim %>%
  mutate(sim = invoke_map(f, params, n = 10))

plots <- mtcars %>%
  # group_by(cyl) %>% do not work
  split(.$cyl) %>%
  map(~ggplot(., aes(mpg, wt)) + geom_point())
paths <- str_c(names(plots), ".pdf")
pwalk(list(paths, plots), ggsave)

iris %>%
  keep(is.factor) %>%
  str()
iris %>%
  keep(is.numeric) %>%
  str()
iris %>%
  discard(is.factor) %>%
  str()

x <- list(1:5, letters, list(10))
x %>%
  some(is_character)
x %>%
  every(is_vector)

#detect() finds the first element where the predicate is true; 
#detect_index() returns its position.
x <- sample(10)
x %>%
  detect(~ . > 5)
x %>%
  detect_index(~ . > 5)

#head_while() and tail_while() 
#take elements from the start or end of a vector 
#while a predicate is true
x %>%
  head_while(~ . > 5)
x %>%
  tail_while(~ . > 5)

dfs <- list(
  age = tibble(name = "John", age = 30),
  sex = tibble(name = c("John", "Marry"), sex = c("m", "f")),
  trt = tibble(name = "Marry", treatment = "a")
)
dfs %>% reduce(full_join)

dfs <- list(
  c(1, 3, 5, 6, 10),
  c(1, 2, 3, 7, 8, 10),
  c(1, 2, 3, 4, 8, 9, 10)
)
dfs %>%
  reduce(intersect)
#Accumulate keeps all the interim results.
x <- sample(10)
x %>%
  accumulate(`+`)

# MODEL -------------------------------------------------------------------

library(tidyverse)
library(modelr)
options(na.action = na.warn)

ggplot(sim1, aes(x, y)) + 
  geom_point()

models <- tibble(
  a1 = runif(250, -20, 40),
  a2 = runif(250, -5, 5)
)

ggplot(sim1, aes(x, y)) + 
  geom_abline(aes(intercept = a1, slope = a2), 
              data = models, alpha = 1/4) + 
  geom_point()

model1 <- function(a, data) {
  a[1] + data$x * a[2]
}

model1(c(7, 1.5), sim1)

measure_distance <- function(mod, data) {
  diff <- data$y - model1(mod, data)
  sqrt(mean(diff ^ 2))
}

measure_distance(c(7, 1.5), sim1)

sim1_dist <- function(a1, a2) {
  measure_distance(c(a1, a2), sim1)
}

models <- models %>%
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))

models

ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, color = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, color = -dist), 
    data = filter(models, rank(dist) <= 10)
  )

ggplot(models, aes(a1, a2)) + 
  geom_point(data = filter(models, rank(dist) <= 10), 
             size = 4, color = "red") + 
  geom_point(aes(color = -dist))

grid <- expand.grid(
  a1 = seq(-5, 20, length = 25),
  a2 = seq(1, 3, length = 25)
) %>%
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))

grid %>%
  ggplot(aes(a1, a2))  + 
  geom_point(data = filter(grid, rank(dist) <= 10), 
             size = 4, color = "red") + 
  geom_point(aes(color = -dist))

ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, color = "grey30") + 
  geom_abline(
    aes(intercept = a1, slope = a2, color = -dist),
    data = filter(grid, rank(dist) <= 10)
  )

best <- optim(c(0, 0), measure_distance, data = sim1)
best$par

ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, color = "grey30") + 
  geom_abline(intercept = best$par[1], slope = best$par[2])

sim1_mod <- lm(y ~ x, data = sim1)
coef(sim1_mod)

ggplot(sim1, aes(x, y)) + 
  geom_point(size = 2, color = "grey30") + 
  geom_abline(intercept = best$par[1], slope = best$par[2],
              size = 3, alpha = 1 / 4) +
  geom_abline(intercept = coef(sim1_mod)[1], 
              slope = coef(sim1_mod)[2],
              color = "red")

# bookmark ----------------------------------------------------------------

grid <- sim1 %>%
  data_grid(x)

grid

grid <- grid %>%
  add_predictions(sim1_mod)

grid

ggplot(sim1, aes(x)) + 
  geom_point(aes(y = y)) + 
  geom_line(aes(y = pred), data = grid, color = "red", size = 1)

sim1 <- sim1 %>%
  add_residuals(sim1_mod)
sim1

ggplot(sim1, aes(resid)) + 
  geom_freqpoly(binwidth = .5)

ggplot(sim1, aes(x, resid)) + 
  geom_ref_line(h = 0) + 
  geom_point()

df <- tibble(
  y = c(4, 5), 
  x1 = c(2, 1), 
  x2 = c(5, 6)
)

model_matrix(df, y ~ x1)

ggplot(sim2) + 
  geom_point(aes(x, y))

mod2 <- lm(y ~ x, data = sim2)
grid <- sim2 %>%
  data_grid(x) %>%
  add_predictions(mod2)
grid  

ggplot(sim2, aes(x)) + 
  geom_point(aes(y = y)) +
  geom_point(data = grid, aes(y = pred), color = "red", size = 4)

ggplot(sim3, aes(x1, y)) +
  geom_point(aes(color = x2))

mod1 <- lm(y ~ x1 + x2, data = sim3)
mod2 <- lm(y ~ x1 * x2, data = sim3)

grid <- sim3 %>%
  data_grid(x1, x2) %>%
  gather_predictions(mod1, mod2)
grid

ggplot(sim3, aes(x1, y, color = x2)) + 
  geom_point() + 
  geom_line(data = grid, aes(y = pred)) +
  facet_wrap(~ model)

sim3 <- sim3 %>%  
  gather_residuals(mod1, mod2) 

ggplot(sim3, aes(x1, resid, color = x2)) +
  geom_point() + 
  facet_grid(model ~ x2)

mod1 <- lm(y ~ x1 + x2, data = sim4)
mod2 <- lm(y ~ x1 * x2, data = sim4)

grid <- sim4 %>%
  data_grid(
    x1 = seq_range(x1, 5),
    x2 = seq_range(x2, 5)
  ) %>%
  gather_predictions(mod1, mod2)
grid

ggplot(grid, aes(x1, x2)) + 
  geom_tile(aes(fill = pred)) + 
  facet_wrap(~ model)

ggplot(grid, aes(x1, pred, color = x2, group = x2)) + 
  geom_line() + 
  facet_wrap(~ model)
ggplot(grid, aes(x2, pred, color = x1, group = x1)) + 
  geom_line() + 
  facet_wrap(~ model)

library(splines)

sim5 <- tibble(
  x = seq(0, 3.5 * pi, length = 50), 
  y = 4 * sin(x) + rnorm(length(x))
)

ggplot(sim5, aes(x, y)) + 
  geom_point()

mod1 <- lm(y ~ ns(x, 1), data = sim5)
mod2 <- lm(y ~ ns(x, 2), data = sim5)
mod3 <- lm(y ~ ns(x, 3), data = sim5)
mod4 <- lm(y ~ ns(x, 4), data = sim5)
mod5 <- lm(y ~ ns(x, 5), data = sim5)
mod6 <- lm(y ~ ns(x, 6), data = sim5)
mod7 <- lm(y ~ ns(x, 7), data = sim5)

grid <- sim5 %>%
  data_grid(x = seq_range(x, n = 50, expand = .1)) %>%
  gather_predictions(mod1, mod2, mod3, mod4, mod5, mod6, mod7, .pred = "y")

ggplot(sim5, aes(x, y)) + 
  geom_point() + 
  geom_line(data = grid, color = "red") + 
  facet_wrap(~ model)

nobs(mod1)


library(tidyverse)
library(modelr)
options(na.action = na.warn)
library(nycflights13)
library(lubridate)

ggplot(diamonds, aes(cut, price)) + geom_boxplot()
ggplot(diamonds, aes(color, price)) + geom_boxplot()
ggplot(diamonds, aes(clarity, price)) + geom_boxplot()

ggplot(diamonds, aes(carat, price)) + geom_hex(bins = 100)

diamonds2 <- diamonds %>%
  filter(carat <= 2.5) %>%
  mutate(lprice = log2(price), lcarat = log2(carat))

ggplot(diamonds2, aes(lcarat, lprice)) + 
  geom_hex(bins = 50)

mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)

grid <- diamonds2 %>%
  data_grid(carat = seq_range(carat, 20)) %>%
  mutate(lcarat = log2(carat)) %>%
  add_predictions(mod_diamond, "lprice") %>%
  mutate(price = 2 ^ lprice)
grid

ggplot(diamonds2, aes(carat, price)) +
  geom_point(alpha = .1) + 
  geom_line(data = grid, color = "red", size = 1)

diamonds2 <- diamonds2 %>%
  add_residuals(mod_diamond, "lresid")

ggplot(diamonds2, aes(lcarat, lresid)) + 
  geom_hex(bins = 50)

ggplot(diamonds2, aes(cut, lresid)) + geom_boxplot()
ggplot(diamonds2, aes(color, lresid)) + geom_boxplot()
ggplot(diamonds2, aes(clarity, lresid)) + geom_boxplot()

mod_diamond2 <- lm(lprice ~ lcarat + color + cut + clarity, data = diamonds2)

grid <- diamonds2 %>%
  data_grid(cut, .model = mod_diamond2) %>%
  add_predictions(mod_diamond2)
grid

ggplot(grid, aes(cut, pred)) + 
  geom_point()
diamonds2 <- diamonds2 %>%
  add_residuals(mod_diamond2, "lresid2")
ggplot(diamonds2, aes(lcarat, lresid2)) +
  geom_hex(bins = 50)

diamonds2 %>%
  filter(abs(lresid2) > 1) %>%
  add_predictions(mod_diamond2) %>%
  mutate(pred = round(2 ^ pred)) %>%
  select(price, pred, carat:table, x:z) %>%
  arrange(price)
daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarise(n = n())
daily

ggplot(daily, aes(date, n)) + 
  geom_line()

daily <- daily %>%
  mutate(wday = wday(date, label = T))
ggplot(daily, aes(wday, n)) + 
  geom_boxplot()

mod <- lm(n ~ wday, data = daily)  
grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) + 
  geom_boxplot() + 
  geom_point(data = grid, color = "red", size = 4)

daily <- daily %>%
  add_residuals(mod)
daily %>%
  ggplot(aes(date, resid)) + 
  geom_ref_line(h = 0) + 
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) + 
  geom_ref_line(h = 0) + 
  geom_line()

daily %>%
  filter(resid < -100)

daily %>%
  ggplot(aes(date, resid)) + 
  geom_ref_line(h = 0) + 
  geom_line(color = "grey50") + 
  geom_smooth(se = F, span = .3)

daily %>%
  filter(wday == "周六") %>% 
  ggplot(aes(date, n)) + 
  geom_point() + 
  geom_line() + 
  scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b") + 
  theme_bw()
  
term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall"))
} 

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "周六") %>%
  ggplot(aes(date, n, color = term)) + 
  geom_point(alpha = 1/3) + 
  geom_line() + 
  scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b")

daily %>%
  ggplot(aes(wday, n, color = term)) + 
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) + 
  geom_line(alpha = .75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) + 
  geom_boxplot() + 
  geom_point(data = grid, color = "red") + 
  facet_wrap(~ term)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) + 
  geom_hline(yintercept = 0, color = "grey75") + 
  geom_line()

#chapter 25
#chapter 27

# R BASE ------------------------------------------------------------------

test <- 1:3
test[1] #ok
test[[1]] #ok
# object[index]  "index" can be a vector of numeric 
# or a vector of character if nameed
df <- data.frame(a = 1:5, b = 1:5, c = 1:5)
df
df$c(a, b) # not appliable
df$c("a", "b") # not appliable
df[c(a,b)] # not applyable

df[c("a","b")] #  ok

my_index <- "a"
df[my_index]

df[!names(df) %in% c("a", "b")] # ok

x <- c("one", "two", "three")
x[c(1, -1)] # it does not work, do not mix negative and positive sub_number
x[rep(1, 10)]
x[0]

x <- c(10, 3, NA, 5, 8, 1, NA)
x[x %% 2 == 0]
x[x %% 2 == 1]

attach(mtcars)
summary(mpg)
plot(mpg, disp)
plot(mpg, wt)
detach(mtcars)

data$gender <- factor(data$gender,
                      levels = c(1, 2),
                      labels = c("male", "female"))

data$age[data$age == 99] <- NA
data <- within(data, {
  agecat <- NA
  agecat[age > 75] <- "Elder" 
  agecat[age <= 75] <- "Young"
})

data <- transform(data,
                  sumx = x1 + x2,
                  meanx = (x1 + x2) / 2)

library(car)
x <- 10:100
df <- data.frame(x = x)
df
df$y <- recode(df$x, "lo:60 = 'C'; 61:80 = 'B'; 81:100 = 'A'")
df$z <- recode(df$x, "lo:60 = 'C'; 60:80 = 'B'; 81:hi='A'; else='NULL'")
df

help(recode) 

leadership <- iris
names(leadership)[2] <- "something_you_like"
names(leadership)
names(leadership)[2:4] <- c("something", "you", "like")
names(leadership)
names(leadership)[[2]] <- "something_you_like"
names(leadership)

order(4:6)
(3:1)[order(3:1)]
leadership[order(leadership$Sepal.Length),]
leadership[order(
  c(leadership$Sepal.Length, leadership$something_you_like)
  ),]
# is.na() can be applied to a dataframe, this is awesome
# is.infinite()  in.nan()
# na.omit() drops all the obs(rows) that contain NA
# something == NA do not work
# any(c(T, T, F))   all(c(T, T, F))

sum(c(1, 2, NA))

# cbind(a, b) is a relatively safe function,
# to make it work, dataframe a and b must have same variable,
# but do not have to be arranged to same order

# subset
df[6:10, ]

myvars <- c("q1", "q2", "q3", "q4", "q5")
df[myvars]

myvars <- paste("q", 1:5, sep = "")
myvars
df[myvars]

myvars <- names(df) %in% c("q3", "q4")
df[!myvars]

head(df)
df$Sepal.Length <- NULL
df$Sepal.Width <- df$Petal.Length <- NULL # interesting

leadership[leadership$male == 1, ]

df[1, "Species"] 
df[1, 2]

subset(leadership, age >= 35 | age < 24,
       select = c(q1, q2, q3, q4, q5)) # select = q1:q5

sample(1:10, 3, replace = F)

mean(x, trim = .05, na.rm = T) # drop 10% obs and na

##strings
x <- c("ab", "cde", "fghij")
nchar(x)
length(x)
substr(x, 1, 1)
substr(x, 1, 1) <- "something"
x
substr(x, 1, 2) <- "something"
x
toupper(c("aaa", "bb"))
tolower(c("someTHING", "hErE"))
seq(1, 5, 2)

x <- 1:10
cut(x, 3)
cut(x, 3, ordered_result = T)
cut(x, c(0, 3, 6, 9, 10), 
    labels = c("not_so_good", "good", "great", "awesome"),
    ordered_result = T)

pretty(x, 3)

c <- matrix(runif(12), nrow = 3)
c
mean(c)
apply(c, 1, mean)
apply(c, 2, mean)
apply(c, 2, mean, trim = .2)

apply(iris, 2, mean) # why does not work?
apply(iris[-5], 2, mean) # it works
apply(iris[-5], 1, mean) # it works

## function
mystats <- function(x, parametric = T, print = F) {
  if (parametric) {
    center = mean(x)
    spread = sd(x)
  } else {
    center = median(x)
    spread = mad(x)
  } 
  
  if (print & parametric) {
    cat("Mean =", center, "\n", "SD =", spread, "\n")
  } else if (print & !parametric){
      cat("Median =", center, "\n", "SD =", spread, "\n")
  }
  
  results <- list(center = center, spread = spread)
  return(results)
}

set.seed(1234)
x <- rnorm(500)
mystats(x)
y <- mystats(x, print = T)
x <- 1:10
x
names(x) <- c(letters[1:10])
x
attributes(x)
attr(x, "dim") <- c(2, 5)
x
attributes(x)
row.names(x) <- c("a", "b")
attributes(x)
x
attr(x, "dimnames")[[2]] <- c(LETTERS[1:5])
x
attr(x, "names") <- NULL
x
attributes(x)
attr(x, "test") <- "this is a test"
x
attr(x, "dim") <- NULL
x

attributes(iris)
iris <- unclass(iris)
str(iris)
attributes(iris)

data(iris)
set.seed(1234)
fit <- kmeans(iris[1:4], 3)
fit
str(fit)
names(fit)
summary(fit)
unclass(fit)
fit[c(2, 7)]
fit[2]
fit[[2]]
fit[["centers"]]
fit$"centers"
fit$centers
fit[[2]][1, 1:3]
fit[2][1][1][[1]][1, ]

interactive()
args(mean)

#output to file
sink("test.txt", append = T)
sink()
cat("hello")
cat("hello", file = "mytxt.txt", append = T)
#work with dirs
getwd()
list.dirs()
list.files()
dir()
dir(path = "./check")
file.info("./check.readme.txt")
file.info("communication.html")
file.exists("./check/readme.txt")
file.exists("test.txt")
file_test("-f", "check")
file_test("-d", "check")
dir.create("./something")
file.exists("something")
file.create("test.txt")
file.info("test.txt")
file.rename("test.txt", "something.txt")
file.info("something.txt")
file.remove("something.txt")
file.exists(c("something.txt", "test.txt"))
system("tree")
file.rename("something", "something2")
file.copy("diamonds.csv", "./something2/renamed.html")
unlink("something2")

readLines("./check/readme.txt", encoding = "UTF-8")
readLines("temp.R")

# SOME FUNCTION BY ME -----------------------------------------------------

my_sample <- function(x, size = 30, repeats = 100) {
  out <- vector("double", length = repeats)
  for (i in seq_len(repeats)) {
    out[[i]] <- mean(sample(x, size))
  }
  out
}

sizes <- c(`20` = 20, `30` = 30, `50` = 50, `100` = 100)

sizes %>%
  lapply(my_sample, x = 1:999, repeats = 100) %>%
  as.tibble() %>%
  gather() %>%
  qplot(x = value, data = ., 
        facets = factor(key, levels = sizes) ~ .)

library(tidyverse)
out %>%
  as.tibble() %>%
  gather() %>%
  ggplot(aes(value)) + 
  geom_histogram(aes(y = ..density..), fill = "blue", alpha = .5) +
  geom_density(color = "red", fill = "pink", alpha = .3) + 
  facet_grid(factor(key, levels = c("size_20", "size_30", "size_50", "size_100"))~.) + 
  geom_vline(aes(xintercept = 500), color = "orange", size = 1.2) + 
  theme_bw()

map_dbl(out, mean)

col_num <- function(df) {
  index <- vector("logical", ncol(df))
  for (i in seq_along(df)) {
    index[[i]] <- is.numeric(df[[i]])
  } 
  df[index]
}

col_mean <- function(df) {
  results <- vector("double", ncol(df))
  names(results) <- names(df)
  for (i in seq_along(df)) {
    results[[i]] <- mean(df[[i]])  
  }
  results
}

show_vec_mean <- function(x) {
  for (i in seq_along(x)) {
    cat(names(x)[[i]], ":", "\t", round(x[[i]], 3), "\n")
  }
}

show_mean <- function(df) {
  show_vec_mean(col_mean(col_num(df)))
}

data(iris)
show_mean(iris)
data(diamonds)
show_mean(diamonds)


chk_outlier <- function(df, min = T, number = 10) {
  out <- vector("list", length = length(df))
  names(out) <- names(df)
  if (min) {
    for (i in seq_along(df)) {
      out[[i]] <- head(sort(df[[i]]), number)
    }
  } else {
    for (i in seq_along(df)) {
      out[[i]] <- rev(tail(sort(df[[i]]), number))
    }
  }
  out
}

num <- col_num(iris)
chk_outlier(num, min = F, number = 4)


library(tidyverse)

split_chr <- function(df, sep = "#") {
  vec <- df[[1]]
  strlist <- str_split(vec, sep)
  unique_chr <- unique(flatten_chr(strlist))
  col_name <- unique_chr[order(as.numeric(unique_chr))]
  list <- vector("list", length = length(col_name))
  names(list) <- col_name
  for (i in seq_along(col_name)) {
    list[[i]] <- map_lgl(strlist, ~col_name[[i]] %in% .)
  }
  list
}

list2df <- function(list) {
  as.data.frame(do.call(cbind, list))
}

lgl2chr <- function(df, chr = c("1", "")) {
  map_df(df, ~ifelse(., chr[1], chr[2]))
}

split_col <- function(df) {
  lgl2chr(list2df(split_chr(df)), c("bingo",""))
}

test <- tibble(a = c("1#3#2", "1#4#something", "2#3#@"))
split_col(test)

keep_na <- function(df, start_num, end_num) {
  #start_num, end_num for cols that must have their value keeped,
  #like ID, name....
  right_df <- df[-(start_num:end_num)]
  na_rows <- apply(right_df, 1, function(x) {any(is.na(x))})
  na_cols <- apply(right_df, 2, function(x) {any(is.na(x))})
  left_df <- df[start_num:end_num][na_rows, ]
  right_df_na_keeped <- right_df[na_rows, na_cols]
  logical <- is.na(right_df_na_keeped)
  chr_na_keeped <- ifelse(logical, "missing", "")
  cbind(left_df, chr_na_keeped)
}
  
data(iris)
names(iris) <- c("a", "b", "c", "d", "e")
head(iris)

variable <- c("a", "b", "d")
conds <- list(c(4.5, 7.5),
              c(2.3, 4),
              c(.2, 2.4))

chk_rng <- function(vec, cond) {
  between(vec, cond[1], cond[2])
}

chk_rng(1:10, c(3,7))

map2(iris[variable], conds, chk_rng)

chk_range <- function(df, var, cond) {
  temp_list <- map2(df[var], cond, chk_rng) 
  df_lgl <- list2df(temp_list)
}

head(chk_range(iris, variable, conds))

keep_FALSE <- function(df, id) {
  right_df <- df[-(id[1]:id[2])]
  na_rows <- apply(right_df, 1, function(x) {any(!x)})
  na_cols <- apply(right_df, 2, function(x) {any(!x)})
  left_df <- df[id[1]:id[2]][na_rows, ]
  right_df_na_keeped <- right_df[na_rows, na_cols]
  cbind(left_df, right_df_na_keeped)
}

test_data <- data.frame(a = 1:5,
                        b = c(T, F, T, T, T),
                        c = c(T, T, F, T, T))
keep_FALSE(test_data, id = c(1, 1))

chk_range_id <- function(df, var, cond, id) {
  right_df <- chk_range(df, var, cond)
  full_df_interested <- cbind(df[id[1]:id[2]], right_df)
  newid <- c(1, (id[2] - id[1]) + 1)
  keep_FALSE(full_df_interested, newid)
}

results <- chk_range_id(iris, variable, conds, c(5,5))
results

lgl2chr(results[-1])
cbind(results[1], lgl2chr(results[-1]))

chk_lgl_vecout <- function(df, vec, cond, skip) {
  temp_df <- df[skip]
  temp_df <- is.na(temp_df)
  
  logical_value <- df[[vec]] == cond
  
  out <- vector("logical", length(logical_value))
  for (i in seq_along(logical_value)) {
    if (logical_value[[i]]) {
      out[[i]] <- all(unlist(temp_df[i, ]))
    } else {
      out[[i]] <- all(!unlist(temp_df[i, ]))
    }
  }
  out
}

test_data <- tibble(id = 1:10, 
                    name = c(rep(letters[1:3], 3), "a"),
                    a1 = c(rep(1:3, each = 3), 1),
                    a2 = c("A", NA, NA, "A", NA, NA, "A", NA, NA, "A"),
                    a3 = c(1, 1, NA, 1, 1, NA, 1, 1, NA, 1))
test_data 

(sub <- chk_lgl_vecout(test_data, "a1", 3, c("a2", "a3")))
test_data[!sub, ]

test_data1 <- test_data %>%
  mutate(b1 = as.character(a1),
         b2 = a2,
         b3 = rep(1, 10))

test_data1

vec <- list("a1", "b1")
cond <- list(3, "3")
skip <- list(c("a2", "a3"), c("b2", "b3"))

vec; cond; skip
chk_lgl_vecout_with_df <- function(a, b, c) {
  chk_lgl_vecout(test_data1, a, b, c)
}

chk_lgl_vecout_with_df("b1", "3", c("b2", "b3"))
pars <- list(vec, cond, skip)
pmap(pars, chk_lgl_vecout_with_df)

chk_lgl_id <- function(df, id, vec, cond, skip) {
  pars <- list(vec, cond, skip)
  right_df <- pmap(pars, function(a, b, c) chk_lgl_vecout(df, a, b, c))
  names(right_df) <- unlist(vec)
  left_df <- df[id]
  full_df <- cbind(left_df, right_df)
  keep_FALSE(full_df, id)
}

test_data1[9, "b3"] <- NA
test_data1[2, "a2"] <- "B"
test_data1[3, "b2"] <- "c"
test_data1
chk_lgl_id(test_data1, 1:2, vec, cond, skip)
chk_lgl_id(test_data1, 1:2, vec, cond, skip) %>%
  select(-(1:2)) %>%
  lgl2chr(c("", "1"))

test_data <- tibble(a = 1:5,
                    b = c("1", "2", "3", "4", "5"),
                    c = c("one", "2", "3", "4#5", "5"))

test_data

map_df(test_data, as.numeric)

chk_illegal_type <- function(df, id) {
  right_df <- df[-id]
  right_df <- map_df(right_df, as.numeric)
  na_rows <- apply(right_df, 1, function(x) {any(is.na(x))})
  na_cols <- apply(right_df, 2, function(x) {any(is.na(x))})
  left_df <- df[id][na_rows, ]
  right_df_na_keeped <- right_df[na_rows, na_cols]
  logical <- is.na(right_df_na_keeped)
  chr_na_keeped <- ifelse(logical, "wrong_type_or_missing", "")
  cbind(left_df, chr_na_keeped)
}

test_data <- test_data %>%
  mutate(id = row_number(), 
         d = lag(c)) %>%
  select(id, everything())
test_data  
chk_illegal_type(test_data, 1)

# A LITTLE BITTER OF VISUALITION ------------------------------------------

data <- map_dbl(col_num(iris), mean)
#par(mfrow = c(2, 2))
par(mfrow = c(1, 1))
barplot(data)
pie(data)
boxplot(iris$Sepal.Length)
hist(iris$Sepal.Length, col = "gray", cex = 1.5)
plot(iris$Sepal.Length, iris$Sepal.Width, cex = 1.5)
plot(women$height, women$weight, type = "b")
plot(density(iris$Sepal.Length))
boxplot(mtcars$mpg ~ mtcars$cyl)
mtcars <- mtcars[order(mtcars$mpg),]
dotchart(mtcars$mpg, labels = row.names(mtcars), pch = 19, cex = 1.3)
library(rgl)
with(mtcars, {
  plot3d(wt, disp, mpg, col = "red", size = 5)
})
library("GGally")
ggpairs(iris[iris$Species %in% c("versicolor", "virginica"), ], aes(color = Species)) 
+ theme_bw()
ggpairs(mtcars[c("wt", "disp", "mpg")])


# ADVANCED R --------------------------------------------------------------
# SECTION_1 basic ---------------------------------------------------------

a <- matrix(1:4, ncol = 2)
is.vector(a) 
is.atomic(a)
is.list(a)
typeof(a)
class(a)
attributes(a)
x <- c(1, 2, 3)
typeof(x)
x <- c(1L, 2L, 3L)
typeof(x)
x

structure(1:20, my_attribute = "this is a vecgtor")

# name a vector
x <- c(a = 1, b = 2, c = 3)
x <- 1:3; names(x) <- c("a", "b", "c")
x <- setNames(1:3, c("a", "b", "c"))
# drop name
unname(x)
names(x) <- NULL

# factor
x <- factor(c("a", "b", "b", "a"))
class(x)
levels(x)
x[2] <- "c"
x
c(factor("a"), factor("b")) # do not work
sex_char <- c("m", "m", "m")
sex_factor <- factor(sex_char, levels = c("m", "f"))
table(sex_factor)

f1 <- factor(letters)
levels(f1) <- rev(levels(f1))
f2 <- rev(factor(letters))
f3 <- factor(letters, levels = rev(letters))
f1; f2; f3

l <- list(1:3, "a", T, 1.0)
dim(l) <- c(2, 2)
l
dim(1:3)
as.data.frame(l)
as.data.frame(list(a = 1:4, b = 2:5))

df <- data.frame(x = 1:3)
df$y <- list(1:2, 1:3, 1:4)
df
data.frame(x = 1:3, y = list(1:2, 1:3, 1:4))
data.frame(x = 1:3, y = I(list(1:2, 1:3, 1:4)))

dfm <- data.frame(x = 1:3, y = I(matrix(1:9, nrow = 3)))
dim(dfm)
names(dfm)
str(dfm)

x <- c(2.1, 4.2, 3.3, 5,4)
x[c(1, 1)]
x[c(2.1, 2.9)]
x[c(-1, 2)]

x[c(T, F)]
x[c(T, F, T, F)] #same as above one
x[c(T, F, NA, F)]
x[]
x[0]

y <- setNames(x, letters[1:5])
y
y[c("a", "a", "a")]
y[c("a", "aa")] #matched precisely

a <- matrix(1:9, nrow = 3)
colnames(a) <- c("A", "B", "C")
a[c(T, F, T), c("B", "B")]
a[0, -2]

vals <- outer(1:5, 1:5, FUN = "paste", sep = ",")
vals
vals[c(4, 15)]
select <- matrix(ncol = 2, byrow = T, c(
  1, 1,
  3, 1,
  2, 4
))
select
vals[select]
vals[1]
z <- as.data.frame(vals)
z <- setNames(z, letters[1:5]); z
z[, "a"]
z[, "a", drop = F]

a <- list(a = 1, b = 2)
a[[1]]
a[["a"]]

b <- list(a = list(b = list(c = list(d = 1))))
b[[c("a", "b", "c", "d")]] #recursive, very useful
b[["a"]][["b"]][["c"]][["d"]] #same as above

x <- 1:5
x[-1] <- 4:1; x #length(left_hand = right_hand)
x[c(1, 1)] <- 2:3; x # legal
x[c(1, NA)] <- c(1:2) # illegal
x[c(T, F, NA)] <- 1 #legal, NA is treated as FALSE

data(mtcars)
mtcars <- lapply(mtcars, as.integer); mtcars
data(mtcars)
mtcars[] <- lapply(mtcars, as.integer); mtcars

x <- list(a = 1, b = 2)
x[["a"]] <- NULL; str(x)

x <- list(a = 1)
x["b"] <- list(NULL); str(x)

x <- c("m", "f", "u", "f", "f", "m", "u")
lookup <- c(m = "male", f = "female", u = NA)
lookup[x]
unname(lookup[x])
c(m = "known", f = "known", u = "unkame")[x]

grades <- c(1, 2, 2, 3, 1) 
info <- data.frame( grade = 3:1, 
                    desc = c("Excellent", "Good", "Poor"), 
                    fail = c(F, F, T) )
id <- match(grades, info$grade)
info[id, ]

rownames(info) <- info$grade
info[as.character(grades), ]

df <- data.frame(x = rep(1:3, each = 2), y = 6:1, z = letters[1:6])
df[sample(nrow(df)), ]
df[sample(nrow(df), 3), ]
df[sample(nrow(df), 6, rep = T), ]

df2 <- df[sample(nrow(df)), 3:1]
df2[order(df2$x), ]
df2[, order(names(df2))]

df <- data.frame(x = c(2, 4, 1), y = c(9, 11, 6), n = c(3, 5, 1))
df
rep(1:nrow(df), df$n)
df[rep(1:nrow(df), df$n), ]

subset(mtcars, gear == 5)
subset(mtcars, gear == 5 & cyl == 4)

x <- sample(10) < 4
which(x)
unwhich <- function(x, n) {
  out <- rep_len(F, n)
  out[x] <- T
  out
}
unwhich(which(x), 10)

(x1 <- 1:10 %% 2 == 0)
(x2 <- which(x1))
(y1 <- 1:10 %% 5 == 0)
(y2 <- which(y1))
x1 & y1
intersect(x2, y2)
x1 | y1
union(x2, y2)
x1 & !y1
setdiff(x2, y2)
xor(x1, y1)
setdiff(union(x2, y2), intersect(x2, y2))


# FUNCTION ----------------------------------------------------------------

# install.packages("pryr")
library("pryr")
library("purrr")

f <- function(x) x^2 # three parts
f
formals(f)
body(f)
environment(f)

sum
formals(sum)
body(sum)
environment(sum)

objs <- mget(ls("package:base"), inherits = T)
funs <- Filter(is.function, objs)
map(map(funs, formals), length)

x <- 1
h <- function() {
  y <- 2
  i <- function() {
    z <- 3
    c(x, y, z)
  }
i()
}
h()
rm(x, h)

j <- function(x) {
  y <- 2
  function() {
    c(x, y)
  }
}

k <- j(1)
k()
k
rm(j, k)

l <- function(x) x + 1
m <- function() {
  l <- function(x) x * 2
       l(10)
}
m()
rm(l, m)

n <- function(x) x / 2
o <- function() {
  n <- 10
  n(n)
}
o()
rm(o, n)

j <- function() {
  if(!exists("a")) {
    a <- 1
  } else {
    a <- a + 1
  }
  print(a)
}
j()

f <- function() x
x <- 15
f()
x <- 30
f()
#above function f should be avoid because of outer dependency

f <- function() x + 1
codetools::findGlobals(f)
environment(f) <- emptyenv()
f()

c <- 10
c(c = c)
add <- function(x, y) x + y 
sapply(1:10, add, 3)
sapply(1:10, `+`, 3)
sapply(1:10, "+", 3)

x <- list(1:3, 4:9, 10:12)
sapply(x, "[", 2)
sapply(x, function(x) x[2])
library(purrr)
map(x, function(x) x[2])

f <- function(abcdef, bcde1, bcde2) {
  list(a = abcdef, b1 = bcde1, b2 = bcde2)
}

str(f(1, 2, 3))
str(f(2, 3, a = 1))
str(f(1, 3, b = 1))

args <- list(1:10, na.rm = T)
do.call(mean, args) #divery the formals to the functions

g <- function(a = 1, b = a * 2) {
  c(a, b)
}
g()
g(109)

f <- function(a = 1, b = d) {
  d <- (a + 1) ^ 2
  c(a, b)
}
f()
f(10)

i <- function(a, b) {
  c(missing(a), missing(b))
}
i()
i(a = 1)
i(b = 2)
i(a = 1, b = 2)

f <- function(x) {
  10
}
f(stop("this is an error!"))
f <- function(x) {
  force(x)
  10
}
f(stop("this is an error"))

add <- function(x) {
  function(y) x + y
}
adders <- lapply(1:10, add)
adders[[1]](10)
adders[[10]](10)

add <- function(x) {
  force(x)
  function(y) x + y
}
adders2 <- lapply(1:10, add)
adders2[[1]](10)
adders2[[10]](10)

f <- function(x = ls()) {
  a <- 1
  x
}
f()
f(ls())

`&&` <- function(x, y) {
  if (!x) return(FALSE)
  if (!y) return(FALSE)
  TRUE
}
a <- NULL
!is.null(a) && a > 0
a > 0 && !is.null(a)

if(is.null(a)) stop("a is null")
!is.null(a) || stop("a is null")

barplot(1:5, col = "red", pch = 20)
plot(1:5, bty = "u")
plot(1:5, labels = F)

f <- function(...) {
  names(list(...))
}
f(a = 1, b = 2)

`%+%` <- function(a, b) paste(a, b, sep = "")
"new" %+% "string"
`second<-` <- function(x, value) {
  x[2] <- value
  x
}
x <- 1:10
second(x) <- 5
x

library(pryr)
x <- 1:10
address(x)
second(x) <- 6
address(x)

`modify<-` <- function(x, position, value) {
  x[position] <- value
  x
}
modify(x, 1) <- 10
modify(get("x"), 1) <- 10 # do not work, 
# get("x") <- `modify<-`(get("x"), 1, 10)

replace_fun <- function(fun = F) {
  objs <- mget(ls("package:base"), inherits = T)
  fun_names <- names(Filter(is.function, objs))
  replace_fun_names <- fun_names[str_detect(fun_names, "<-")]
  if (fun) objs[replace_fun_names] else replace_fun_names
}
replace_fun(fun = T)

f1 <- function() 1
f1()
f2 <- function() invisible(1)
f2()
f1() == 1
f2() == 1
(f2())
a <- 2
(a <- 2) # <- is one of the most common used function that
         # return invisible value, so, it can be used below
a <- b <- c <- d <- 2

in_dir <- function(dir, code) {
  old <- setwd(dir)
  on.exit(setwd(old))
  force(code)
}
getwd()
in_dir("~", getwd())

# OO GUIDE ----------------------------------------------------------------

f <- function() {}
typeof(f)
is.function(f)

typeof(sum)
is.primitive(sum)

df <- data.frame(x = 1:10, y = letters[1:10])
otype(df)
otype(df$x)
otype(df$y)

mean
UseMethod("mean")
ftype(mean)

ftype(t.data.frame)
ftype(t.test)
methods("mean")

methods(class = "list")


foo <- structure(list(), class = "foo")
foo
foo <- list()
class(foo) <- "foo"
foo
class(foo)
inherits(foo, "foo")

foo <- function(x) {
  if(!is.numeric(x)) stop("X must be numeric")
  structure(list(x), class = "foo")
}

mod <- lm(log(mpg) ~ log(disp), data = mtcars)
class(mod)
print(mod)
class(mod) <- "data.frame"
print(mod)
mod$coefficients

f <- function(x) UseMethod("f")
f.a <- function(x) "class a"
a <- structure(list(), class = "a")
class(a)
f(a)

mean.a <- function(x) "a"
mean(a)

f <- function(x) UseMethod("f")
f.a <- function(x) "class a"
f.default <- function(x) "unknown class"
f(structure(list(), class = "a"))
f(structure(list(), class = "b"))
f(structure(list(), class = c("b", "a")))

library(methods)
library(stats4)
y <- c(26, 17, 13, 12,20, 5, 9, 8, 5, 4, 8)
nLL <- function(lambda) -sum(dpois(y, lambda, log = T))
fit <- mle(nLL, start = list(lambda = 5), nobs = length(y))
isS4(fit)
otype(fit)
isS4(nobs)
ftype(nobs)

# s4 object ---------------------------------------------------------------

mle_nobs <- method_from_call(nobs(fit))
isS4(mle_nobs)
ftype(mle_nobs)
is(fit)
class(fit)
is(fit, "mle")
getGenerics()
getclass()

setClass("Person", 
         slots = list(name = "character", age = "numeric")) 
setClass("Employee", 
         slots = list(boss = "Person"), 
         contains = "Person")
alice <- new("Person", name = "Alice", age = 40)
john <- new("Employee", name = "John", age = 20, boss = alice)
alice@age
slot(john, "boss")

setClass("RangedNumeric", 
         contains = "numeric",
         slots = list(min = "numeric", max = "numeric"))
rn <- new("RangedNumeric", 1:10, min = 1, max = 10)
rn@min
[email protected]

setGeneric("union")
setMethod("union",
          c(x = "data.frame", y = "data.frame"),
          function(x, y) {
            unique(rbind(x, y))
          })

setGeneric("myGeneric", function(x) {
  standardGeneric("myGeneric")
})


# reference class ---------------------------------------------------------

Account <- setRefClass("Account")
Account$new()

Account <- setRefClass("Account",
                       fields = list(balance = "numeric"))
a <- Account$new(balance = 100)
a$balance
a$balance <- 200
a$balance

b <- a
b$balance
a$balance <- 0
b$balance

c <- a$copy()
c$balance
a$balance <- 100
c$balance

Account <- setRefClass("Account",
                       fields = list(balance = "numeric"),
                       methods = list(
                         withdraw = function(x) {
                           balance <<- balance - x
                         },
                         deposit = function(x) {
                           balance <<- balance + x
                         }
                       ))
a <- Account$new(balance = 100)
a$deposit(100)
a$balance

NoOverdraft <- setRefClass("NoOverdraft",
                           contains = "Account",
                           methods = list(
                             withdraw = function(x) {
                               if (balance < x) stop("Not enough money")
                               balance <<- balance - x
                             }
                           ))
accountJohn <- NoOverdraft$new(balance = 100)
accountJohn$deposit(50)
accountJohn
accountJohn$withdraw(1000)
pryr::otype(accountJohn)

# env ---------------------------------------------------------------------

e <- new.env()
e$a <- FALSE
e$b <- "a"
e$c <- 2.3
e$d <- 1:3

e$a <- e$d
e$a <- 1:3

search()
as.environment("package:stats")
ls(e)
parent.env(e)

e$.a <- 2
ls(e)
ls(e, all.names = T)

str(e)
ls.str(e)

e$c <- 3
e$c
e[["c"]]
get("c", envir = e)

e <- new.env()
e$a <- 1
e$a <- NULL
ls(e)
e$a
rm("a", envir = e)
ls(e)

x <- 10
exists("x", envir = e)
exists("x", envir = e, inherits = F)
identical(globalenv(), environment())


library(pryr)
x <- 5
where("x")
where("mean")

where <- function(name, env = parent.env()) {
  if(identical(env, emptyenv())) {
    stop("can not find", name, call. = FALSE)
  } else if (exists(name, envir = env, inherits = FALSE))
    env
} else {
  where(name,parent.env(env))
}


y <- 1
f <- function(x) x + y
environment(f)

e <- new.env()
e$g <- function() 1
environment(e$g)

environment(sd)
where("sd")

g <- function(x) {
  if(!exists("a", inherits = F)) {
    message("define a")
    a <- 1
  } else {
    a <- a + 1
  }
  a
}
g(10)
g(10)

h <- function() {
  x <- 10
  function() {
    x
  }
}
i <- h()
x <- 20
i()

#assign
x <- 0
f <- function() {
  x <<- 1
}
f()
x

library(pryr)
system.time(b%", paste0(...), "")
  }
}
tags <- c("p", "b", "i")
html <- lapply(setNames(tags, tags), simple_tag)
html$p("this is ", html$b("bold"))
with(html, p("this is", b("bold"), "text."))

attach(html)
p("this is", b("bold"), "text.")
detach()

list2env(html, environment())
p("This is ", b("bold"), " text.")
rm(list = names(html), envir = environment())

midpoint <- function(f, a, b) {
  (b - a) * f((a + b) / 2)
}

trapezoid <- function(f, a, b) {
  (b - a) / 2 * (f(a) + f(b))
}

midpoint(sin, 0, pi)
trapezoid(sin, 0, pi)

midpoint_composite <- function(f, a, b, n = 10) {
  points <- seq(a, b, length = n + 1)
  h <- (b - a) / n
  area <- 0
  for (i in seq_len(n)) {
    area <- area + h * f((points[i] + points[i + 1]) / 2)
  }
  area
}

trapezoid_composite <- function(f, a, b, n = 10) {
  points <- seq(a, b, length = n + 1)
  h <- (b - a) / n
  area <- 0
  for (i in seq_len(n)) {
    area <- area + h / 2 * (f(points[i]) + f(points[i + 1]))
  }
  area
}

midpoint_composite(sin, 0, pi, n = 10)
midpoint_composite(sin, 0, pi, n = 100)
trapezoid_composite(sin, 0, pi, n = 10)
trapezoid_composite(sin, 0, pi, n = 100)

composite <- function(f, a, b, n = 10, rule) {
  points <- seq(a, b, length = n + 1)
  area <- 0
  for (i in seq_len(n)) {
    area <- area + rule(f, points[i], points[i + 1])
  }
  area
}

composite(sin, 0, pi, n = 10, rule = midpoint)
composite(sin, 0, pi, n = 10, rule = trapezoid)

simpson <- function(f, a, b) {
  (b - a) / 6 * (f(a) + 4 * f((a + b) / 2) + f(b))
}
boole <- function(f, a, b) {
  pos <- function(i) a + i * (b - a) / 4
  fi <- function(i) f(pos(i))
  (b - a) / 90 *
    (7 * fi(0) + 32 * fi(1) + 12 * fi(2) + 32 * fi(3) + 7 * fi(4))
}

composite(sin, 0, pi, n = 10, rule = simpson)
composite(sin, 0, pi, n = 10, rule = boole)

newton_cotes <- function(coef, open = FALSE) {
  n <- length(coef) + open
  function(f, a, b) {
    pos <- function(i) a + i * (b - a) / n
    points <- pos(seq.int(0, length(coef) - 1))
    (b - a) / sum(coef) * sum(f(points) * coef)
  }
}
boole <- newton_cotes(c(7, 32, 12, 32, 7))
milne <- newton_cotes(c(2, -1, 2), open = TRUE)
composite(sin, 0, pi, n = 10, rule = milne)

randomise <- function(f) f(runif(1e3))
randomise(mean)
randomise(mean)
randomise(sum)

# functional --------------------------------------------------------------

lapply2 <- function(x, f, ...) {
  out <- vector("list", length(x))
  for (i in seq_along(x)) {
    out[[i]] <- f(x[[i]], ...)
  }
  out
}

l <- replicate(20, runif(sample(1:10, 1)), simplify = F)

out <- vector("list", length(l))
for (i in seq_along(l)) {
  out[[i]] <- length(l[[i]])
}
unlist(out)
unlist(lapply(l, length))

unlist(lapply(mtcars, class))
mtcars[] <- lapply(mtcars, function(x) x / mean(x))

trims <- c(0, .1, .2, .5)
x <- rcauchy(1000)
unlist(lapply(trims, function(trim) mean(x, trim = trim)))  # beautiful
lapply(trims, mean, x = x)

# 3 ways to for-loop 1: for (x in xs) 2: for (i in seq_along(xs))
# 3:for(nm in names(xs))
xs <- runif(1e3)
res <- c()
for (x in xs) res <- c(res, sqrt(x)) #it is slowl
res

# 3 ways to using lapply
lapply(xs, function(x) {}) # most frequently used methods
lapply(seq_along(xs), function(i) {})
lapply(names(xs), function(nm) {})

formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) + wt
)

lapply(formulas, function(formula) lm(formula = formula, data = mtcars))

bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars), rep = T)
  mtcars[rows, ]
})

out <- vector("list", 10)
for (i in 1:10) {
  rows <- sample(1:nrow(mtcars), rep = T)
  out[[i]] <- mtcars[rows, ]
}
out

sapply(mtcars, is.numeric)
vapply(mtcars, is.numeric, logical(1))

sapply(list(), is.numeric) # better in interactive
vapply(list(), is.numeric, logical(1)) #better in program

df <- data.frame(x = 1:10, y = letters[1:10])
sapply(df, class)
vapply(df, class, character(1))

df2 <- data.frame(x = 1:10, y = Sys.time() + 1:10)
sapply(df2, class)
vapply(df2, class, character(1))

xs <- replicate(5, runif(10), simplify = F)
ws <- replicate(5, rpois(10, 5) + 1, simplify = F)
unlist(lapply(xs, mean))
unlist(lapply(seq_along(xs), function(i) {
  weighted.mean(xs[[i]], ws[[i]])
}))
unlist(Map(weighted.mean, xs, ws))

mtmeans <- lapply(mtcars, mean)
mtmeans[] <- Map(`/`, mtcars, mtmeans)
Map(function(x, w) weighted.mean(x, w, na.rm = T), xs, ws)

rollmean <- function(x, n) {
  out <- rep(NA, length(x))
  offset <- trunc(n / 2)
  for (i in (offset + 1):(length(x) - n + offset - 1)) {
    out[i] <- mean(x[(i - offset):(i + offset - 1)])
  }
  out
}

x <- seq(1, 3, length = 1e2) + runif(1e2)
plot(x)
lines(rollmean(x, 5), col = "blue", lwd = 2)
lines(rollmean(x, 10), col = "red", lwd = 2)

rollapply <- function(x, n, f, ...) {
  out <- rep(NA, length(x))
  offset <- trunc(n / 2)
  for (i in (offset = 1):(length(x) - n + offset + 1)) {
    out[i] <- f(x[(i - offset):(i + offset)], ...)
  }
  out
}
x <- seq(1, 3, length = 1e2) + rt(1e2, df = 2) / 3
plot(x)
lines(rollmean(x, 5), col = "red", lwd = 2)
lines(rollapply(x, 5, median), col = "blue", lwd = 2)

# which element be computed first is not important when you using apply
lapply3 <- function(x, f, ...) {
  out <- vector("list", length(x))
  for (i in sample(seq_along(x))) { 
    out[[i]] <- f(x[[i]], ...)
  }
  out
}

unlist(lapply(1:10, sqrt))
unlist(lapply3(1:10, sqrt))

library(parallel)
boot_df <- function(x) x[sample(nrow(x), rep = T), ]
rsquared <- function(mod) summary(mod)$r.square
boot_lm <- function(i) {
  rsquared(lm(mpg ~ wt + disp, data = boot_df(mtcars)))
}
system.time(lapply(1:500, boot_lm))
system.time(mclapply(1:500, boot_lm, mc.cores = 2)) # unfortunately, it does not work on windows,try parLapply() instead.

a <- matrix(1:20, nrow = 5)
a1 <- apply(a, 1, identity)
a1
identical(a, a1)
identical(a, t(a1))

x <- matrix(rnorm(20, 0, 1), nrow = 4)
x1 <- sweep(x, 1, apply(x, 1, min), `-`)
x2 <- sweep(x1, 1, apply(x1, 1, max), `/`)

outer(1:3, 1:10, `*`)

pulse <- round(rnorm(22, 70, 10 / 3)) + rep(c(0, 5), c(10, 12))
group <- rep(c("A", "B"), c(10, 12))
tapply(pulse, group, length)
tapply(pulse, group, mean)
split(pulse, group)
tapply2 <- function(x, group, f, ..., simplify = TRUE) {
  pieces <- split(x, group)
  sapply(pieces, f, simplify = simplify)
}
tapply2(pulse, group, mean)
tapply2(pulse, group, length)

Reduce2 <- function(f, x) {
  out <- x[[1]]
  for (i in seq(2, length(x))) {
    out <- f(out, x[[i]])
  }
  out
}

l <- replicate(5, sample(1:10, 15, replace = T), simplify = F)
Reduce(intersect, l)

where <- function(f, x) {
  vapply(x, f, logical(1))
}

df <- data.frame(x = 1:3, y = c("a", "b", "c"))
where(is.factor, df)
str(Filter(is.factor, df))
Find(is.numeric, df)
Position(is.factor, df)

# integrate() uniroot() potimise() are skiped

trans <- list(
  disp = function(x) x * 0.0163871,
  am = function(x) factor(x, levels = c("auto", "manual"))
)
for (var in names(trans)) {
  mtcars[[var]] <- trans[[var]](mtcars[[var]])
}

# recursively using for-loop is skiped

#11.7

add <- function(x, y) {
  stopifnot(length(x) == 1, length(y) == 1,
            is.numeric(x), is.numeric(y))
  x + y
}
rm_na <- function(x, y, identity) {
  if(is.na(x) && is.na(y)) {
    identity
  } else if (is.na(x)) {
    y
  } else {
    x
  }
}
rm_na(NA, 10, 0)
rm_na(10, NA, 0)
rm_na(NA, NA, NA)

add <- function(x, y, na.rm = F) {
  if (na.rm && (is.na(x) || is.na(y))) rm_na(x, y, 0) else x + y
}
add(10, NA)
add(10, NA, na.rm = T)
add(NA, NA)
add(NA, NA, na.rm = T)

r_add <- function(xs, na.rm = T) {
  Reduce(function(x, y) add(x, y, na.rm = na.rm), xs)
}
r_add(1:10)
r_add(NA, na.rm = T)
r_add(numeric())
r_add <- function(xs, na.rm = T) {
  Reduce(function(x, y) add(x, y, na.rm = na.rm), xs, init = 0)
}
r_add(1:10)
r_add(NA, na.rm = T)
r_add(numeric())

v_add1 <- function(x, y, na.rm = FALSE) {
  stopifnot(length(x) == length(y), is.numeric(x), is.numeric(y))
  if (length(x) == 0) return(numeric())
  simplify2array(Map(function(x, y) add(x, y, na.rm = na.rm), x, y))
}
v_add1(1:10, 1:10)
v_add1(numeric(), numeric())
v_add1(c(1, NA), c(1, NA))
v_add1(c(1, NA), c(1, NA), na.rm = T)

c_add <- function(xs, na.rm = FALSE) {
  Reduce(function(x, y) add(x, y, na.rm = na.rm), xs, accumulate = TRUE)
}
c_add(1:10)

row_sum <- function(x, na.rm = FALSE) {
  apply(x, 1, add, na.rm = na.rm)
}
col_sum <- function(x, na.rm = FALSE) {
  apply(x, 2, add, na.rm = na.rm)
}

# 12 ----------------------------------------------------------------------

chatty <- function(f) {
  function(x, ...) {
    res <- f(x, ...)
    cat("processing", x, "\n", sep = "")
    res
  }
}
f <- function(x) x ^ 2
s <- 3:1
chatty(f)(1)
vapply(s, chatty(f), numeric(1))

delay_by <- function(delay, f) {
  function(...) {
    Sys.sleep(delay)
    f(...)
  }
}
system.time(runif(100))
system.time(delay_by(.1, runif)(100))

dot_every <- function(n, f) {
  i <- 1
  function(...) {
    if (i %% n == 0) cat(".")
    i <<- i + 1
    f(...)
  }
}
x <- lapply(1:100, runif)
x <- lapply(1:100, dot_every(10, runif)) 

library(memoise)
slow_func <- function(x) {
  Sys.sleep(1)
  10
}
system.time(slow_func())
system.time(slow_func())

fast_func <- memoise(slow_func)
system.time(fast_func())
system.time(fast_func())

fib <- function(n) {
  if(n < 2) return(1)
  fib(n -2) + fib(n - 1)
}
system.time(fib(25))
system.time(fib(26))

fib2 <- memoise(function(n) {
  if (n < 2) return(1)
  fib2(n - 2) + fib2(n - 1)
})
system.time(fib2(23))
system.time(fib2(24))

runif <- memoise(runif)
runif(5)
runif(5)

# a lot of contents skiped


# some interesting FUNCs by me --------------------------------------------
library(tidyverse)

get_coefs <- function(model) {
  summary(model)[["coefficients"]]
}

line_two <- function(matrix, n = 2) {
 matrix[n, , drop = F]
}

vstack <- function(list) {
  do.call(rbind, list)
}

do_model <- function(response, data, model = lm, ...) {
  formula <- as.formula(paste(response, "~", "."))
  model(formula, data = data, ...)
}

extend_scaler <- function(x) {
    c(x, unlist(lapply(0:99, function(i) str_c(x, i))))
}

extend_chr <- function(chr) {
  unlist(lapply(chr, extend_scaler))
}

keep_row <- function(matrix, vras, extend = T) {
  if (extend) vars = extend_chr(vars)
  vars = intersect(rownames(matrix), vars)
  matrix[vars, , drop = F]  
}

univar <- function(response, vars_interested, controls = NA, data, model = lm, ...) {
  entry_name <- c(response, vars_interested, controls)
  entry_name <- entry_name[!is.na(entry_name)]
  wrong_name <- setdiff(entry_name, names(data))
  if (length(wrong_name) != 0) stop("wrong variable name in you entry")
  
  out <- vector("list", length = length(vars_interested))
  for (i in seq_along(vars_interested)) {
    new_data <- data[c(response, vars_interested[[i]], controls[!is.na(controls)])]
    fit <- do_model(response, new_data, model, ...)
    out[[i]] <- get_coefs(fit)
  }
  out
}


data(mtcars); names(mtcars)
vars <- c("cyl", "disp", "hp")
univar("am", vars, c("qsec", "gear"), mtcars)
univar("am", vars, c("qsec", "gear"), mtcars) %>% lapply(line_two)
univar("am", vars, c("qsec", "gear"), mtcars) %>% lapply(line_two) %>% vstack

mtcars$cyl <- as.factor(mtcars$cyl)
univar("vs", vars, "gear", data = mtcars, model = glm, family = binomial())
univar("vs", vars, "gear", data = mtcars, model = glm, family = binomial()) %>%
  vstack %>% keep_row(vars)

library(reshape)
library(tidyverse)
library(tableone)
q_plot <- function(data, ...) qplot(data = data, ...)

to_draw <- function(data, group_var) {
  nms <- names(data)
  nms2 <- setdiff(nms, group_var)
  melt(data, id = group_var)
}

keep_num <- function(data, group) {
  out <- keep(data, is.numeric)
  if (is.numeric(data[[group]])) out else cbind(out, data[group])
}

data(iris); names(iris) <- letters[1:5]; iris$group <- c(1, 0); str(iris); head(iris)

iris %>% keep_num("group") %>% to_draw("group") %>% 
  q_plot(x = value, facets = variable ~ group, geom = "density")

CreateTableOne(c("a", "b", "c", "d", "e"), "group", data = iris) %>% 
  print(nonnormal = c("c", "d"))

你可能感兴趣的:(Notes of R for data science_07Jul2020)