转载来源
C-x C-e
, 爽到根本停不下来!C-x C-e
执行R的S表达式C-x C-e
执行R的S表达式el-get-install ESS
C-c C-k
打开R的Repl, C-c C-l
eval当前文件缓冲到Repl里面C-x C-e
fun r lisp!
;; 将下面的配置放到启动脚本init.el或者是`.emacs`里面
(defun ess-eval-sexp (vis)
  "Send the balanced expression before point to the ESS process.
Point is moved back over one sexp to find its start and then forward
over it to find its end; the text in between is handed to
`ess-eval-region'.  VIS is the raw prefix argument and is passed
through unchanged as the VIS argument of `ess-eval-region'.
Point is restored afterwards."
  (interactive "P")
  (save-excursion
    (backward-sexp)
    (let ((start (point)))
      (forward-sexp)
      ;; Pass the bounds in (START END) order, as `ess-eval-region' expects.
      (ess-eval-region start (point) vis "Eval sexp"))))
;; Bind C-x C-e to `ess-eval-sexp' for ESS buffers only.  `local-set-key'
;; changes the current buffer's local (major-mode) keymap, so the global
;; C-x C-e binding stays intact in every other mode.
(add-hook 'ess-mode-hook
          (lambda () (local-set-key (kbd "C-x C-e") #'ess-eval-sexp)))
# define
(function (y) (function (x) ('+' (x, y))))
# call
((function (x) x) (1)) #=> [1] 1
## 用高阶函数 和 %>% 管道来 代替let, function(a=111,b=222,c=function(...){...} ) { ... }
## function的默认参数就是一个局部变量: function(a=1, b=2) <=> let[a 1 b 2]
((function (x, y=(function (i) ('*' (i, 2))) ) (y (x))) (2)) #=> [1] 4
## 用强大的函数管道
(library (magrittr))
((c (1, 2, 3)) %>% (function (x) (Map ((function (x) ('+' (x, 100))), x)))
%>% (function (x) (Reduce ('+', x)) ) ) #=> [1] 306
## let复用前面的变量定义
((function (x, y=('*' (x, 2))) y) (100)) #=> [1] 200
## 综合例子: function里面的默认参数,当let来用,可以用前面定义的变量(x,y=x),但是不能覆盖前面定义的变量(x,x=1)
## reg: ordinary least squares coefficients from the normal equations.
##   y      - numeric response vector
##   x      - predictors; anything as.matrix() accepts (e.g. a data frame)
##   mx, cx - default arguments used as let-style locals: mx is x as a
##            matrix, cx is mx with a leading column of ones named Intercept
## Returns a one-column matrix of coefficients, one row per column of cx.
## Solves (cx' cx) b = cx' y directly instead of forming the explicit inverse.
((function (y, x, mx=(as.matrix (x)), cx=(cbind (Intercept=1, mx)))
  (solve ((crossprod (cx)), (crossprod (cx, y)))) ) -> reg)
(reg (y=(launch$distress_ct), x=(launch [3])))
## [,1]
## Intercept 4.30158730
## temperature -0.05746032
('if' (0, ('==' (1, 1)), ('==' (2, 1)))) #=> [1] FALSE
('plot' (('rnorm' (10)), ('rnorm' (10))))
# 加了额外的参数
('plot' (('rnorm' (10)), ('rnorm' (10)), type='b'))
(Reduce ('*', 1:10))
((function (x) ('if' (('%%' (x, 2)), x, 0))) (2)) #=> [1] 0
# call
(Filter ((function (x) ('if' (('%%' (x, 2)), x, 0))), 1:10)) #=> [1] 1 3 5 7 9
(Map ((function (x) ('+' (x, 100))), 1:3))
# =>
[[1]]
[1] 101
[[2]]
[1] 102
[[3]]
[1] 103
## 1d: 1维
# 如果本来是前缀的表达方式的函数,引号'c'可以省略,function除外必须加引号
(c (1, 1, 3)) #=> [1] 1 1 3
((c (1, 8, 3)) [2]) #=> [1] 8
((c ("A", "B", "C")) -> defvar) #=> [1] "A" "B" "C"
# levels是不能重复出现的
(factor ((c ("1", "1", "3", "11", "9", "8")), levels=(c ("A", "B", "C", "AA", "BB", "CC"))))
#=>
[1] <NA> <NA> <NA> <NA> <NA> <NA>
Levels: A B C AA BB CC
### 替换一下数据名称,把B替换为"良性肿块" ==>> 因子的意义: 赋予更多的标签的意义
((factor (wbcd$diagnosis, levels=(c ("B", "M")), labels=(c ("良性肿块", "恶性肿块")))) -> wbcd$diagnosis)
#=>
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
1 恶性肿块 17.990 10.38 122.80 1001.0 0.11840
2 恶性肿块 20.570 17.77 132.90 1326.0 0.08474
21 良性肿块 13.080 15.71 85.63 520.0 0.10750
## 1d: 1维
(list (11, "aa", FALSE))
#=>
[[1]]
[1] 11
[[2]]
[1] "aa"
[[3]]
[1] FALSE
## nd: N维
(1:12) ##=> [1] 1 2 3 4 5 6 7 8 9 10 11 12
##class: [1] "integer"
(array (1:12)) #=>class [1] "array"
##=> [1] 1 2 3 4 5 6 7 8 9 10 11 12
(array (1:12, (c (2, 3, 2)))) #=>class [1] "array"
## [,1] [,2] [,3]
## [1,] 7 9 11
## [2,] 8 10 12
##
## 2d: 2维
((data.frame (
ID=(c (11,12,13)),
Name=(c ("Devin","Edward","Wenli")),
Gender=(c ("M","M","F")),
Birthdate=(c ("1984-12-29","1983-5-6","1986-8-8")))) -> pt_data)
#=>
ID Name Gender Birthdate
1 11 Devin M 1984-12-29
2 12 Edward M 1983-5-6
3 13 Wenli F 1986-8-8
## get:
(pt_data [1, 2]) #=> 第一行,第二列
[1] Devin
Levels: Devin Edward Wenli
(pt_data [,3]) #=> 只是第三列
[1] M M F
Levels: F M
((pt_data [-1]) [-2])
#=> 去除第一,然后再去除第二列
Name Birthdate
1 Devin 1984-12-29
2 Edward 1983-5-6
3 Wenli 1986-8-8
(pt_data$Birthdate)
#=> 取某一列
[1] 1984-12-29 1983-5-6 1986-8-8
Levels: 1983-5-6 1984-12-29 1986-8-8
(pt_data [2:3])
# 取范围
Name Gender
1 Devin M
2 Edward M
3 Wenli F
## 2d: 2维
(matrix ((c (1, 2, 1, 3, 5, 8)), nrow=2))
#=> 2行->3列
[,1] [,2] [,3]
[1,] 1 1 5
[2,] 2 3 8
(matrix ((c (1, 2, 1, 3, 5, 8)), ncol=2))
#=>
[,1] [,2]
[1,] 1 3
[2,] 2 5
[3,] 1 8
(matrix ((c (1, 2, 4, 3)), ncol=1))
#=> 单列矩阵
[,1]
[1,] 1
[2,] 2
[3,] 4
[4,] 3
(matrix ((c (1, 2, 4, 3)), nrow=1))
#=> 单行矩阵
[,1] [,2] [,3] [,4]
[1,] 1 2 4 3
(cbind ((c (1, 1, 1)), (c (1, 0, 1)), (c (0, 1, 0))))
#=> 拼接矩阵
## [,1] [,2] [,3]
## [1,] 1 1 0
## [2,] 1 0 1
## [3,] 1 1 0
## =========== 矩阵线性代数
## 矩阵转置: 如果参数里面只有一个参数时,并且是函数调用的时候,可以省略参数标记的一对括号,如下=>
(t (matrix ((c (1, 2, 1, 3, 5, 8)), ncol=2)))
## [,1] [,2]
## [1,] 1 3
## [2,] 2 5
## [3,] 1 8
## ==>>
## [,1] [,2] [,3]
## [1,] 1 2 1
## [2,] 3 5 8
## 矩阵的标量运算
('*' (10, (matrix ((c (1, 2, 1, 3, 5, 8)), ncol=2))))
## [,1] [,2]
## [1,] 10 30
## [2,] 20 50
## [3,] 10 80
##
## 矩阵求和: 必须结构相同才能相加
('+' ((matrix ((c (9, 2, 3, 8, 1, 4)), ncol=2)),
(matrix ((c (0, 3, 5, 3, 7, 2)), ncol=2))))
# A + B
## [,1] [,2]
## [1,] 9 8
## [2,] 2 1
## [3,] 3 4
## [,1] [,2]
## [1,] 0 3
## [2,] 3 7
## [3,] 5 2
## =======>>>>>
## [,1] [,2]
## [1,] 9 11
## [2,] 5 8
## [3,] 8 6
##
## 矩阵乘法: A的列数必须等于B的行数 <=> 列的加权求和
('%*%' ((matrix ((c (1, 4, 3, 0, 1, 2)), ncol=2)),
(matrix ((c (7, 8)), ncol=1))))
# A * B
## [,1]
## [1,] 7
## [2,] 36
## [3,] 37
##
## 矩阵求逆: 必需是正方形的
(solve (matrix ((c (1, 4, 3, 0, 1, 2, 1, 6, 8)), ncol=3)))
## [,1] [,2] [,3]
## [1,] 1 0 1
## [2,] 4 1 6
## [3,] 3 2 8
## ===>>>
## [,1] [,2] [,3]
## [1,] -4 2 -1
## [2,] -14 5 -2
## [3,] 5 -2 1
##
(write.csv (pt_data, file="my-data-frame.csv"))
# cat my-data-frame.csv #=>
"","ID","Name","Gender","Birthdate"
"1",11,"Devin","M","1984-12-29"
"2",12,"Edward","M","1983-5-6"
"3",13,"Wenli","F","1986-8-8"
(read.csv ("my-data-frame.csv"))
#=>
X ID Name Gender Birthdate
1 1 11 Devin M 1984-12-29
2 2 12 Edward M 1983-5-6
3 3 13 Wenli F 1986-8-8
# => read from web:
((read.csv ("http://127.0.0.1:8003/wisc_bc_data.csv", stringsAsFactors=FALSE)) -> wbcd)
(table (wbcd$diagnosis))
# B M # B是良性肿块, M是恶性肿块
# 357 212
(round (('*' ((prop.table (table (wbcd$diagnosis))) ,100)), digits=1))
## 2.良性肿块 恶性肿块
## 62.7 37.3 # 百分比计算
## 3.总结特征, 细胞核的3种特征: 最小, 最大, 平均值,中间值等
(summary ((wbcd [(c ("radius_mean", "area_mean", "smoothness_mean"))])))
#=>
radius_mean area_mean smoothness_mean
Min. : 6.981 Min. : 143.5 Min. :0.05263
1st Qu.:11.700 1st Qu.: 420.3 1st Qu.:0.08637
Median :13.370 Median : 551.1 Median :0.09587
Mean :14.127 Mean : 654.9 Mean :0.09636
3rd Qu.:15.780 3rd Qu.: 782.7 3rd Qu.:0.10530
Max. :28.110 Max. :2501.0 Max. :0.16340
## normalize: min-max rescaling of a numeric vector onto [0, 1].
##   x     - numeric vector
##   na.rm - drop NAs when finding the range (default FALSE, so an NA in x
##           still makes the whole result NA, as before)
## Returns a vector the same length as x.  When every value is equal the
## range is zero; zeros are returned instead of the NaN that 0/0 would give.
((function (x, na.rm=FALSE)
  ((function (rng, span=(diff (rng)))
    ('if' ((isTRUE ('==' (span, 0))),
          ('*' (x, 0)),
          ('/' (('-' (x, (rng [1]))), span)))))
   (range (x, na.rm=na.rm)))) -> normalize)
(normalize ((c (10, 20, 30, 40, 50)))) #=> [1] 0.00 0.25 0.50 0.75 1.00
(lapply ((wbcd [2:31]), normalize))
#=>
$radius_mean
[1] 0.52103744 0.64314449 0.60149557 0.21009040 0.62989256 0.25883856
...
$texture_mean
[1] 0.02265810 0.27257355 0.39026040 0.36083869 0.15657761 0.20257017
...
((as.data.frame ((lapply ((wbcd [2:31]), normalize)))) -> wbcd_n)
#=> list列表(可以不同类型): 重新变成data.frame
radius_mean texture_mean perimeter_mean area_mean smoothness_mean
1 0.52103744 0.02265810 0.54598853 0.36373277 0.59375282
2 0.64314449 0.27257355 0.61578329 0.50159067 0.28987993
(1:10 -> x)
#=> [1] 1 2 3 4 5 6 7 8 9 10
(('+' (x, (rnorm (10, 0, 1)))) -> y)
#=>
# [1] 0.4150231 1.9585418 1.7173466 3.2213521 4.0119051 4.8112887 5.7995432
# [8] 7.1943800 9.3619532 9.2997215
((lm (y ~ x)) -> fit)
#=>
# Call:
# lm(formula = y ~ x)
#
# Coefficients:
# (Intercept) x
# -0.8111 1.0164
(summary (fit))
# Call:
# lm(formula = y ~ x)
#
# Residuals:
# Min 1Q Median 3Q Max
# 0.52077 -0.42176 -0.08944 0.14898 1.02546
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) -0.81107 0.38014 -2.134 0.0654 .
# x 1.01640 0.06126 16.590 1.76e-07 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 0.5565 on 8 degrees of freedom
# Multiple R-squared: 0.9718, Adjusted R-squared: 0.9682
# F-statistic: 275.2 on 1 and 8 DF, p-value: 1.76e-07
#
(library (class))
((knn (train=wbcd_train, test=wbcd_test, cl=wbcd_train_labels, k=21)) -> wbcd_test_pred)
# knn返回wbcd_test_pred因子向量,为测试数据集中的每一个案例返回一个预测标签
# 评估模型的性能
(library (gmodels))
(CrossTable (x=wbcd_test_labels, y=wbcd_test_pred, prop.chisq=FALSE))
(str (credit))
#=>
## 'data.frame': 1000 obs. of 21 variables:
## $ checking_balance : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
## $ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_history : Factor w/ 5 levels "critical","delayed",..: 1 5 1 5 2 5 5 5 5 1 ...
## $ purpose : Factor w/ 10 levels "business","car (new)",..: 8 8 5 6 2 5 6 3 8 2 ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## ... ...
## $ job : Factor w/ 4 levels "mangement self-employed",..: 2 2 4 2 2 4 2 1 4 1 ...
(summary (credit$months_loan_duration))
#=>
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.0 12.0 18.0 20.9 24.0 72.0
(head (credit_rand$amount))
#=>
[1] 2346 2030 1082 2631 3069 1333
(library (gmodels))
(CrossTable (x=wbcd_test_labels, y=wbcd_test_pred, prop.chisq=FALSE))
## | wbcd_test_pred
## wbcd_test_labels | 良性肿块 | 恶性肿块 | Row Total |
## -----------------|-----------|-----------|-----------|
## 良性肿块 | 77 | 0 | 77 |
## | 1.000 | 0.000 | 0.770 |
## | 0.975 | 0.000 | |
## | 0.770 | 0.000 | |
## -----------------|-----------|-----------|-----------|
## 恶性肿块 | 2 | 21 | 23 |
## | 0.087 | 0.913 | 0.230 |
## | 0.025 | 1.000 | |
## | 0.020 | 0.210 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 79 | 21 | 100 |
## | 0.790 | 0.210 | |
## -----------------|-----------|-----------|-----------|
(library (C50))
((C5.0 ((credit_train [-17]), credit_train$default)) -> credit_model)
((predict (credit_model, credit_test)) -> credit_pred)
(CrossTable (credit_test$default, credit_pred, prop.chisq=FALSE, prop.c=FALSE, prop.r=FALSE, dnn=(c ('actual default', 'predicted default'))))
(library (neuralnet))
## neuralnet() fits a neural network for numeric prediction: several
## ingredients => strength, using a multilayer feed-forward network.
((neuralnet (strength ~ cement + slag + ash + water + superplastic + coarseagg + fineagg + age, data=concrete_train)) -> concrete_model)
## Predict strength for the test set.  model_results has to be produced by
## compute() before its net.result component can be read.
## NOTE(review): assumes the first 8 columns of concrete_test are the eight
## predictors named in the formula above -- confirm against the data.
((neuralnet::compute (concrete_model, (concrete_test [1:8]))) -> model_results)
((model_results$net.result) -> predicted_strength)
## cor() gives the correlation between the two numeric vectors.
(cor (predicted_strength, concrete_test$strength))
## [,1]
## [1,] 0.7195218932
(library (kernlab))
## 字母分类器: 超平面分割面=>两类数据空间化(填充,拱起来)=>分割完了再降维
((ksvm (letter ~ ., data=letters_train, kernel="vanilladot")) -> letter_classifier)
## 评估模型的性能: 字母的预测
((predict (letter_classifier, letters_test)) -> letter_predictions)
## 预测的值和真实的值进行比较=>
(round (('*' ((prop.table (table ('==' (letter_predictions, letters_test$letter)))) ,100)), digits=1))
## ==>> 正确率为83.9%
## FALSE TRUE
## 16.1 83.9
## 只是取36个特征:
((teens [5:40]) -> interests)
((as.data.frame (lapply (interests, scale))) -> interests_z)
## k均值聚类:
((kmeans (interests_z, 5)) -> teen_clusters)
## 看到分出来5类,各自的数量如下
(teen_clusters$size)
# [1] 868 5089 2528 986 20529
# 分量teen_clusters$centers查看聚类质心的坐标,所有的特征
(teen_clusters$centers)
(library (magrittr))
(1 %>% (function (x) ('+' (x, 100)))
%>% (function (x) (print (x))) ) #=> [1] 101
(library (tm))
(library (magrittr))
## getTermMatrix: term frequencies for a character vector.
##   text - character vector; VectorSource() treats each element as a document
## Returns a named numeric vector of term counts (row sums of the
## term-document matrix), sorted in decreasing order.
## Steps: corpus -> lower-case -> strip punctuation -> strip digits ->
##        drop stop words -> term-document matrix -> matrix -> row sums.
((function (text)
  (text
   %>% (function (st) (Corpus ((VectorSource (st)))))
   %>% (function (corpus) (tm_map (corpus, (content_transformer (tolower)))))
   %>% (function (corpus) (tm_map (corpus, removePunctuation)))
   %>% (function (corpus) (tm_map (corpus, removeNumbers)))
   %>% (function (corpus) (tm_map (corpus, removeWords, (c ((stopwords ("SMART")), "thy", "thou", "thee", "the", "and", "but")))))
   ## wordLengths is the control option tm documents for term length limits;
   ## c(1, Inf) keeps terms of any length, which is what minWordLength=1 intended.
   %>% (function (corpus) (TermDocumentMatrix (corpus, control=(list (wordLengths=(c (1, Inf)))))))
   %>% (function (tdm) (as.matrix (tdm)))
   %>% (function (m) (sort ((rowSums (m)), decreasing=TRUE))) )) -> getTermMatrix)
(getTermMatrix ("The Clojure Programming Language. Clojure is a dynamic, general-purpose programming")) #=>
## clojure programming dynamic generalpurpose language
## 2 2 1 1 1
## 3.1 探索特征之间的关系---相关系数矩阵
(cor (insurance [(c ("age", "bmi", "children", "charges"))]))
## age bmi children charges
## age 1.0000000 0.1092719 0.04246900 0.29900819
## bmi 0.1092719 1.0000000 0.01275890 0.19834097
## children 0.0424690 0.0127589 1.00000000 0.06799823
## charges 0.2990082 0.1983410 0.06799823 1.00000000
## 3.2 可视化特征之间的关系------散点图矩阵
## (pairs (insurance [(c ("age", "bmi", "children", "charges"))])) #=> pairs_insurance.png
(library (psych)) ## pairs.panels可以显示拟合的线
## (pairs.panels (insurance [(c ("age", "bmi", "children", "charges"))])) #=> pairs_panels_insurance.png
## 3.3 基于数据训练模型 --------------
((lm (charges ~ age + children + bmi + sex + smoker + region, data=insurance)) -> ins_model)
## Call:
## lm(formula = charges ~ age + children + bmi + sex + smoker +
## region, data = insurance)
##
## Coefficients:
## (Intercept) age children bmi
## -11938.5 256.9 475.5 339.2
## sexmale smokeryes regionnorthwest regionsoutheast
## -131.3 23848.5 -353.0 -1035.0
## regionsouthwest
## -960.1
##
## 3.4 评估模型的性能
(summary (ins_model))
(library (Boruta))
((Boruta (Classes~., data=(train [,-348]))) -> Boruta.mod)
(png ("Boruta_selection.png", width=4000,height=1600))
(plot (Boruta.mod, las="2"))
(dev.off ())
## 将选出来的重要特征保存到一个rda里面
(library (magrittr))
(library (dplyr)) #select函数
(train %>%
(function (data) (select (data, zakończyć,zdjęcie,należeć,naprawdę,polski,kobieta,sierpień,zobaczyć,dotyczyć,szczęście,mężczyzna,europejski)))
-> train_Boruta)
(save (train_Boruta, file="train_Boruta.rda"))
## importance绘图
(library (caret))
(library (rpart))
(library (e1071))
((trainControl (method="repeatedcv", number=10,repeats=3)) -> control)
((train (churn~., data=trainset, method="rpart",preProcess="scale", trControl=control)) -> model)
## 2315 samples
## 16 predictor
## 2 classes: 'yes', 'no'
## Pre-processing: scaled (16)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2084, 2084, 2083, 2083, 2082, 2084, ...
## Resampling results across tuning parameters:
## cp Accuracy Kappa
## 0.05555556 0.8995112 0.5174059
## 0.07456140 0.8593389 0.2124126
## 0.07602339 0.8567440 0.1898221
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05555556.
##
((varImp (model, scale=FALSE)) -> importance)
## rpart variable importance
## Overall
## number_customer_service_calls 116.015
## total_day_minutes 106.988
## total_day_charge 100.648
## ...
(plot (importance)) ##=> fs_churn_importance_by_caret.png
(hist (insurance$charges)) #==>> charges_hist.png
(pairs (insurance [(c ("age", "bmi", "children", "charges"))])) #=> pairs_insurance.png