##################################################################
# Data set used: Titanic
# Inspect the structure of the Titanic contingency table.
str(Titanic)
# Turn the table into a data frame: one row per Class/Sex/Age/Survived
# combination, with the number of people in that combination in Freq.
df <- as.data.frame(Titanic)
head(df)
# > head(df)
#   Class  Sex   Age Survived Freq
# 1   1st Male Child       No    0
# 2   2nd Male Child       No    0
# 3   3rd Male Child       No   35
# 4  Crew Male Child       No    0

# Expand the aggregated counts into one row per person: each value of the
# first four columns is repeated Freq times, so combinations with Freq = 0
# contribute no rows. The four expanded columns are bound side by side into
# an unnamed character matrix.
titanic.raw <- do.call(
  cbind,
  lapply(seq_len(4), function(i) rep(as.character(df[, i]), df$Freq))
)
# The first 35 rows are identical; row 36 is the first one that differs.
# > titanic.raw[1:36, ]
#       [,1]  [,2]     [,3]    [,4]
#  [1,] "3rd" "Male"   "Child" "No"
#  [2,] "3rd" "Male"   "Child" "No"
#  [3,] "3rd" "Male"   "Child" "No"
#  [4,] "3rd" "Male"   "Child" "No"
#  ...
# [35,] "3rd" "Male"   "Child" "No"
# [36,] "3rd" "Female" "Child" "No"
# Convert the character matrix to a data frame. stringsAsFactors = TRUE is
# passed explicitly: since R 4.0 strings are no longer turned into factors by
# default, and summary() only reports per-level counts for factor columns.
# NOTE(review): arules is also documented to expect factor/logical columns
# when coercing a data frame to transactions -- confirm for the installed
# version.
titanic.raw <- as.data.frame(titanic.raw, stringsAsFactors = TRUE)
head(titanic.raw)
# > head(titanic.raw)
#    V1   V2    V3 V4
# 1 3rd Male Child No
# 2 3rd Male Child No
# 3 3rd Male Child No
# 4 3rd Male Child No
# 5 3rd Male Child No
# 6 3rd Male Child No

# Give the columns the attribute names of the source data frame
# (its first four columns; Freq is left out).
names(titanic.raw) <- names(df)[1:4]
dim(titanic.raw)
summary(titanic.raw)
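# After the conversion each row represents one person, which is the form
# needed for association-rule mining. What kind of data was it before the
# conversion? (Survival counts aggregated by class, sex and age.)
# With the apriori() function, the default settings are: 1) supp = 0.1, which
# is the minimum support of rules; 2) conf = 0.8, which is the minimum
# confidence of rules; and 3) maxlen = 10, which is the maximum length of
# rules.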
library(arules)
# apriori() can be given a non-transactions object directly; it is coerced
# internally.
rules <- apriori(titanic.raw)
# Printing the object reports how many rules were found with the default
# thresholds (supp = 0.1, conf = 0.8). maxlen = 10 limits the number of items
# in a single rule, not the number of rules returned.
rules
summary(rules)
inspect(rules)
# NOTE(review): this self-assignment changes nothing; presumably it was meant
# to round the quality measures, e.g. round(quality(rules), digits = 3) --
# confirm before changing.
quality(rules) <- quality(rules)

# Mine again with a lower support threshold, at least two items per rule, and
# only the survival outcome allowed on the right-hand side; every other item
# may appear on the left-hand side. Progress output is switched off.
rules.better <- apriori(
  titanic.raw,
  parameter = list(minlen = 2, supp = 0.005, conf = 0.8),
  appearance = list(rhs = c("Survived=No", "Survived=Yes"), default = "lhs"),
  control = list(verbose = FALSE)
)
# Order the rules by lift and display them.
rules.sorted <- sort(rules.better, by = "lift")
inspect(rules.sorted)
# > inspect(rules.sorted)
#    lhs             rhs                support confidence     lift
# 1  {Class=2nd,
#     Age=Child}  => {Survived=Yes} 0.010904134  1.0000000 3.095640
# 2  {Class=2nd,
#     Sex=Female,
#     Age=Child}  => {Survived=Yes} 0.005906406  1.0000000 3.095640
# 3  {Class=1st,
#     Sex=Female} => {Survived=Yes} 0.064061790  0.9724138 3.010243
# 4  {Class=1st,
#     Sex=Female,
#     Age=Adult}  => {Survived=Yes} 0.063607451  0.9722222 3.009650
# 5  {Class=2nd,
#     Sex=Female} => {Survived=Yes} 0.042253521  0.8773585 2.715986
# 6  {Class=Crew,
#     Sex=Female} => {Survived=Yes} 0.009086779  0.8695652 2.691861
# 7  {Class=Crew,
#     Sex=Female,
#     Age=Adult}  => {Survived=Yes} 0.009086779  0.8695652 2.691861
# 8  {Class=2nd,
#     Sex=Female,
#     Age=Adult}  => {Survived=Yes} 0.036347115  0.8602151 2.662916
# 9  {Class=2nd,
#     Sex=Male,
#     Age=Adult}  => {Survived=No}  0.069968196  0.9166667 1.354083
# 10 {Class=2nd,
#     Sex=Male}   => {Survived=No}  0.069968196  0.8603352 1.270871
# 11 {Class=3rd,
#     Sex=Male,
#     Age=Adult}  => {Survived=No}  0.175829169  0.8376623 1.237379
# 12 {Class=3rd,
#     Sex=Male}   => {Survived=No}  0.191731031  0.8274510 1.222295
#
# The first two rules again: rule 2 only adds Sex=Female to rule 1 and has the
# same confidence and lift, so it carries no extra information.
#    lhs             rhs                support confidence     lift
# 1  {Class=2nd,
#     Age=Child}  => {Survived=Yes} 0.010904134  1.0000000 3.095640
# 2  {Class=2nd,
#     Sex=Female,
#     Age=Child}  => {Survived=Yes} 0.005906406  1.0000000 3.095640
# Remove redundant rules.
# Build a rule-by-rule logical matrix of subset relations. sparse = FALSE asks
# for an ordinary dense matrix: current arules releases return a sparse matrix
# by default, which does not support the NA assignment below.
subset.matrix <- is.subset(rules.sorted, rules.sorted, sparse = FALSE)
# Blank out the diagonal and the lower triangle so that each rule is only
# compared with the rules that come before it in the lift-sorted order.
subset.matrix[lower.tri(subset.matrix, diag = TRUE)] <- NA
# Count the TRUE entries per column; a rule is redundant when at least one
# earlier rule is flagged against it.
redundant <- colSums(subset.matrix, na.rm = TRUE) >= 1
which(redundant) # indices of the redundant rules
# Drop the redundant rules and display what is left.
rules.pruned <- rules.sorted[!redundant]
inspect(rules.pruned)
# > inspect(rules.pruned)
#   lhs             rhs                support confidence     lift
# 1 {Class=2nd,
#    Age=Child}  => {Survived=Yes} 0.010904134  1.0000000 3.095640
# 2 {Class=1st,
#    Sex=Female} => {Survived=Yes} 0.064061790  0.9724138 3.010243
# 3 {Class=2nd,
#    Sex=Female} => {Survived=Yes} 0.042253521  0.8773585 2.715986
# 4 {Class=Crew,
#    Sex=Female} => {Survived=Yes} 0.009086779  0.8695652 2.691861
# 5 {Class=2nd,
#    Sex=Male,
#    Age=Adult}  => {Survived=No}  0.069968196  0.9166667 1.354083
# 6 {Class=2nd,
#    Sex=Male}   => {Survived=No}  0.069968196  0.8603352 1.270871
# 7 {Class=3rd,
#    Sex=Male,
#    Age=Adult}  => {Survived=No}  0.175829169  0.8376623 1.237379
# 8 {Class=3rd,
#    Sex=Male}   => {Survived=No}  0.191731031  0.8274510 1.222295
# (Row 8 was cut off in the pasted output; it is completed here from rule 12
# of the lift-sorted listing.)
# Former rule set, repeated here for comparison: survival outcome on the
# right-hand side, any other item on the left-hand side.
rules.better <- apriori(
  titanic.raw,
  parameter = list(minlen = 2, supp = 0.005, conf = 0.8),
  appearance = list(rhs = c("Survived=No", "Survived=Yes"), default = "lhs"),
  control = list(verbose = FALSE)
)
# Rule set to compare against: only Class and Age items may appear on the
# left-hand side and only Survived=Yes on the right-hand side
# (default = "none" excludes every item that is not listed). The support and
# confidence thresholds are lowered and rules need at least three items.
rules <- apriori(
  titanic.raw,
  parameter = list(minlen = 3, supp = 0.002, conf = 0.2),
  appearance = list(
    rhs = c("Survived=Yes"),
    lhs = c(
      "Class=1st", "Class=2nd", "Class=3rd",
      "Age=Child", "Age=Adult"
    ),
    default = "none"
  ),
  control = list(verbose = FALSE)
)
# Order by confidence and display the rules; the original notes showed the
# listing without the call that produces it.
rules.sorted <- sort(rules, by = "confidence")
inspect(rules.sorted)
# Output of inspect(rules.sorted):
#   lhs            rhs                support confidence      lift
# 1 {Class=2nd,
#    Age=Child} => {Survived=Yes} 0.010904134  1.0000000 3.0956399
# 2 {Class=1st,
#    Age=Child} => {Survived=Yes} 0.002726034  1.0000000 3.0956399
# 3 {Class=1st,
#    Age=Adult} => {Survived=Yes} 0.089504771  0.6175549 1.9117275
# 4 {Class=2nd,
#    Age=Adult} => {Survived=Yes} 0.042707860  0.3601533 1.1149048
# 5 {Class=3rd,
#    Age=Child} => {Survived=Yes} 0.012267151  0.3417722 1.0580035
# 6 {Class=3rd,
#    Age=Adult} => {Survived=Yes} 0.068605179  0.2408293 0.7455209
# Look up the installation directory of the rattle package, e.g.
# "/Library/Frameworks/R.framework/Versions/2.15/Resources/library/rattle".
a <- find.package("rattle")

# Build the full paths of two CSV files expected in its csv/ sub-directory.
# (Alternative kept from the original notes: file <- file.path(a, "csv"))
file <- file.path(
  a,
  "csv",
  c("weather.csv", "dvdtrans.csv")
)

# One logical flag per path: does that file exist?
logical.file <- file.exists(file)

# If at least one of the files exists, show only the existing paths.
if (any(logical.file)) {
  file[logical.file] # file[TRUE]
}
# Same exercise with another package.
# Find the installation directory of caret and show it.
packagePath <- find.package("caret")
packagePath
# Build the path of the file to look for and show it.
file <- file.path(packagePath, "html", "R.css")
file
# Logical flag: does the file exist?
logic.file <- file.exists(file)
# Show the path only when the file is present. The closing brace was missing
# in the original notes, which left the block unterminated.
if (any(logic.file)) {
  file[logic.file]
}
################ split() start ################
# split: split divides the data in the vector x into the groups defined by f.
# Each ID bought several items; split() groups the items (Item) by ID and
# returns a list with one element per ID.
# NOTE(review): `dvdtrans` is not created anywhere in these notes; it is
# presumably rattle's csv/dvdtrans.csv with columns ID and Item -- confirm.
dvdtrans <- read.csv(file.path(find.package("rattle"), "csv", "dvdtrans.csv"))
split(dvdtrans$Item, dvdtrans$ID) # look at the result yourself
# The coercion below needs arules to be attached (library(arules)); otherwise
# it presumably fails with the error recorded in the original notes:
# Error in as(split(dvdtrans$Item, dvdtrans$ID), "transactions") :
#   no method or default for coercing “list” to “transactions”
data <- as(split(dvdtrans$Item, dvdtrans$ID), "transactions")
# What does `data` look like? The 10 IDs become 10 transaction rows (the long
# table becomes a wide one) and the 10 distinct items become 10 columns.
data
# > data
# transactions in sparse format with
#  10 transactions (rows) and
#  10 items (columns)
# Mine rules with apriori(), using support 0.5 and confidence 0.8.
rules <- apriori(data, parameter = list(supp = 0.5, conf = 0.8))
# Use inspect() to display the rules.
inspect(rules)
# > inspect(rules)
#   lhs            rhs         support confidence     lift
# 1 {Patriot}   => {Gladiator}     0.6  1.0000000 1.428571
# 2 {Gladiator} => {Patriot}       0.6  0.8571429 1.428571
library(arules)
# Find files inside an installed package.
# NOTE(review): the original listing was an unnamed, brace-less fragment that
# appears to be the printed source of base R's system.file(); it is bound to
# the name `system_file` here so it can be called without masking the base
# function -- confirm the intended name.
#
# Arguments:
#   ...       path components below the package directory, passed to
#             file.path().
#   package   name of a single package (length-one character vector).
#   lib.loc   library locations passed on to find.package().
#   mustWork  if TRUE, signal an error when nothing is found.
# Returns the paths that exist, or "" when the package or the files cannot be
# found. With no arguments at all it returns the path of the base package.
system_file <- function(..., package = "base", lib.loc = NULL,
                        mustWork = FALSE) {
  if (nargs() == 0L) {
    return(file.path(.Library, "base"))
  }
  # This guard was missing from the listing, so stop() ran unconditionally.
  if (length(package) != 1L) {
    stop("'package' must be of length 1")
  }
  # Also missing from the listing: without it `packagePath` was undefined
  # inside the function. quiet = TRUE yields character(0) instead of an error
  # for an unknown package.
  packagePath <- find.package(package, lib.loc, quiet = TRUE)
  ans <- if (length(packagePath)) {
    FILES <- file.path(packagePath, ...)
    present <- file.exists(FILES)
    if (any(present)) {
      FILES[present]
    } else {
      ""
    }
  } else {
    ""
  }
  if (mustWork && identical(ans, "")) {
    stop("no file found")
  }
  ans
}