第四章 排序:智能收件箱

library(tm)
library(ggplot2)
library(dplyr)
library(reshape2)
library(tidyr)
library(lubridate)
data.path<-'D:\\ML_for_Hackers-master\\03-Classification\\data\\'
easyham.path<-paste0(data.path,'easy_ham\\')
easyham.path

‘D:\ML_for_Hackers-master\03-Classification\data\easy_ham\’

提取邮件有用信息(发送者、主题、日期、内容等)

parse.email<-function(path){
    full.msg<-msg.full(path)
    date<-get.date(full.msg)
    from<-get.from(full.msg)
    subj<-get.subject(full.msg)
    msg<-get.msg(full.msg)
    return(c(date,from,subj,msg,path))
}

读取一封邮件

msg.full<-function(path){
    con<-file(path,'r')
    msg<-readLines(con,encoding='latin1')
    close(con)
    return(msg)
}

提取邮件地址

get.from<-function(msg.vec){
     from<-msg.vec[grepl('From: ',msg.vec)]
     from<-strsplit(from,'[":<> ]')[[1]]
     from<-from[which(from != "" & from != " ")]
     return(from[grepl('@',from)][1])
}

提取邮件正文

get.msg<-function(msg.vec){
    msg <- msg.vec[seq(which(msg.vec == "")[1]+1,length(msg.vec))]
    return(paste(msg,collapse='\n'))
}

提取邮件主题

get.subject<-function(msg.vec){
    subj<-msg.vec[grepl('Subject: ',msg.vec)]
    if(length(subj)>0){
        return(strsplit(subj,'Subject: ')[[1]][2])
    }
    else{
        return('')
    }
}

邮件接收日期

get.date<-function(msg.vec){
    date.grep<-grepl('^Date: ',msg.vec)
    date.grep<-which(date.grep == TRUE)
    date<-msg.vec[date.grep[1]]
    date<-strsplit(date,'\\+|\\-|: ')[[1]][2]
    date<-gsub('^\\s+|\\s+$','',date)
    return(strtrim(date,25))
}
easyham.docs<-dir(easyham.path)
easyham.docs<-easyham.docs[which(easyham.docs!='cmds')]
easyham.parse<-lapply(easyham.docs,function(p) parse.email(paste0(easyham.path,p))) ## lapply函数分别对各个邮件进行处理
ehparse.matrix<-do.call(rbind,easyham.parse)
allparse.df<-data.frame(ehparse.matrix,stringsAsFactors=F)
names(allparse.df)<-c('Date','From.Email','Subject','Message','Path')
head(allparse.df[,1:3])
Date From.Email Subject
Thu, 22 Aug 2002 18:26:25 [email protected] Re: New Sequences Window
Thu, 22 Aug 2002 12:46:18 [email protected] [zzzzteana] RE: Alexander
Thu, 22 Aug 2002 13:52:38 [email protected] [zzzzteana] Moscow bomber
Thu, 22 Aug 2002 09:15:25 [email protected] [IRR] Klez: The Virus That Won’t Die
Thu, 22 Aug 2002 14:38:22 [email protected] Re: [zzzzteana] Nothing like mama used to make
Thu, 22 Aug 2002 14:50:31 [email protected] Re: [zzzzteana] Nothing like mama used to make

日期格式转化

date.converter<-function(dates,pattern1,pattern2){
    pattern1.convert<-strptime(dates,pattern1)
    pattern2.convert<-strptime(dates,pattern2)
    pattern1.convert[is.na(pattern1.convert)]<-pattern2.convert[is.na(pattern1.convert)]
    return(pattern1.convert)
}
Sys.setlocale("LC_TIME", "C")
pattern1 <- "%a,%d %b %Y %H:%M:%S"
pattern2 <- "%d %b %Y %H:%M:%S"

‘C’

head(priority.df[,c(1,2,3,5)])
Date From.Email Subject Path
1061 2002-01-31 22:44:14 [email protected] please help a newbie compile mplayer :-) D:\ML_for_Hackers-master\03-Classification\data\easy_ham\01061.6610124afa2a5844d41951439d1c1068
1062 2002-02-01 00:53:41 [email protected] re: please help a newbie compile mplayer :-) D:\ML_for_Hackers-master\03-Classification\data\easy_ham\01062.ef7955b391f9b161f3f2106c8cda5edb
1063 2002-02-01 02:01:44 [email protected] re: please help a newbie compile mplayer :-) D:\ML_for_Hackers-master\03-Classification\data\easy_ham\01063.ad3449bd2890a29828ac3978ca8c02ab
1064 2002-02-01 10:29:23 [email protected] re: please help a newbie compile mplayer :-) D:\ML_for_Hackers-master\03-Classification\data\easy_ham\01064.9f4fc60b4e27bba3561e322c82d5f7ff
1070 2002-02-01 12:42:02 [email protected] prob. w/ install/uninstall D:\ML_for_Hackers-master\03-Classification\data\easy_ham\01070.6e34c1053a1840779780a315fb083057
1072 2002-02-01 13:39:31 [email protected] re: prob. w/ install/uninstall D:\ML_for_Hackers-master\03-Classification\data\easy_ham\01072.81ed44b31e111f9c1e47e53f4dfbefe3
allparse.df$Date<-date.converter(allparse.df$Date,pattern1,pattern2)
allparse.df$Subject<-tolower(allparse.df$Subject)
allparse.df$From.Email<-tolower(allparse.df$From.Email)

选取前一半时间的数据(训练数据)进行测试

priority.df<-allparse.df[with(allparse.df,order(Date)),]
priority.train<-priority.df[1:(round(nrow(priority.df)/2)),]

计算各个发送者发送邮件的数量

from.weight<-select(priority.train,-Date) %>% group_by(From.Email) %>% summarise(Freq=length(Subject))
head(from.weight)
From.Email Freq
[email protected] 1
[email protected] 1
[email protected] 1
[email protected] 1
[email protected] 1
[email protected] 6
from.weight_map <-subset(from.weight,Freq>6)
from.weight_map<-from.weight_map[order(from.weight_map$Freq),]
tail(from.weight_map)
From.Email Freq
[email protected] 24
[email protected] 25
[email protected] 29
[email protected] 34
[email protected] 37
[email protected] 45

取次数大于6的数据画图

ggplot(from.weight_map,aes(x=reorder(From.Email,Freq),y=Freq))+geom_bar(stat="identity",color='green')+coord_flip()

第四章 排序:智能收件箱_第1张图片

取不同对数类型的差异

test1<-data.frame(y=from.weight[order(from.weight$Freq),]$Freq,x=1:nrow(from.weight),type=rep(1,nrow(from.weight)))
test2<-data.frame(y=log(from.weight[order(from.weight$Freq),]$Freq),x=1:nrow(from.weight),type=rep(2,nrow(from.weight)))
test3<-data.frame(y=log10(from.weight[order(from.weight$Freq),]$Freq),x=1:nrow(from.weight),type=rep(3,nrow(from.weight)))
test<-rbind(test1,test2,test3)
ggplot(test,aes(x,y,color=type,group = type))+geom_line()+ylab('接收邮件数')+xlab('')

第四章 排序:智能收件箱_第2张图片

from.weight<-transform(from.weight,Weight=log(Freq+1))

找出回复他人的邮件

find.threads<-function(email.df){
    response.threads<-strsplit(email.df$Subject,'re: ')
    is.thread<-sapply(response.threads,function(subj) ifelse(subj[1]=='',TRUE,FALSE))

        threads<-response.threads[is.thread]
        senders<-email.df$From.Email[is.thread]
        threads<-sapply(threads,function(t) paste(t[2:length(t)],collapse='re: '))
        return(cbind(senders,threads))
}
threads.matrix<-find.threads(priority.train)
head(threads.matrix)
senders threads
[email protected] please help a newbie compile mplayer :-)
[email protected] please help a newbie compile mplayer :-)
[email protected] please help a newbie compile mplayer :-)
[email protected] prob. w/ install/uninstall
[email protected] prob. w/ install/uninstall
[email protected] http://apt.nixia.no/

计算每个邮件回复他人的数量及构建权重

email.thread<-function(thread.matrix){
    sender<-threads.matrix[,1]
    senders.freq<-table(sender)
    senders.matrix<-cbind(names(senders.freq),senders.freq,log(senders.freq+1))
    senders.df<-data.frame(senders.matrix,stringsAsFactors=FALSE)
    row.names(senders.df)<-1:nrow(senders.df)
    names(senders.df)<-c('From.Email','Freq','Weight')
    senders.df$Freq<-as.numeric(senders.df$Freq)
    senders.df$Weight<-as.numeric(senders.df$Weight)
    return(senders.df)
}
senders.df<-email.thread(threads.matrix)
head(senders.df)
From.Email Freq Weight
[email protected] 1 0.6931472
[email protected] 5 1.7917595
[email protected] 1 0.6931472
[email protected] 1 0.6931472
[email protected] 1 0.6931472
[email protected] 1 0.6931472

计算同一主题的线程时间差,并赋值

get.threads<-function(threads.matrix,email.df){
    threads<-unique(threads.matrix[,2])
    thread.counts<-lapply(threads,function(t) thread.counts(t,email.df))
        thread.matrix<-do.call(rbind,thread.counts)
        return(cbind(threads,thread.matrix))
}
thread.counts<-function(thread,email.df){
    thread.times<-email.df$Date[which(email.df$Subject==thread|email.df$Subject==paste('re:',thread))]
    freq<-length(thread.times)
    min.times<-min(thread.times)
    max.times<-max(thread.times)
    time.span<-as.numeric(difftime(max.times,min.times,units='secs'))
    if(freq<2){
        return(c(NA,NA,NA))
    }
    else{
        trans.weight<-freq/time.span
        log.trans.weight<-10+log(trans.weight,base=10)
        return(c(freq,time.span,log.trans.weight))
    }
}
thread.weights<-get.threads(threads.matrix,priority.train)
thread.weights<-data.frame(thread.weights,stringsAsFactors=FALSE)
names(thread.weights)<-c('Thread','Freq','Response','Weight')
thread.weights$Freq<-as.numeric(thread.weights$Freq)
thread.weights$Response<-as.numeric(thread.weights$Response)
thread.weights$Weight<-as.numeric(thread.weights$Weight)
thread.weights<-subset(thread.weights,is.na(thread.weights$Freq)==FALSE)
head(thread.weights)
Thread Freq Response Weight
please help a newbie compile mplayer :-) 4 42309 5.975627
prob. w/ install/uninstall 4 23745 6.226488
http://apt.nixia.no/ 10 265303 5.576258
problems with ‘apt-get -f install’ 3 55960 5.729244
problems with apt update 2 6347 6.498461
about apt, kernel updates and dist-upgrade 5 240238 5.318328

计算主题的词频

term.counts<-function(term.vec,control){
    vec.corpus<-Corpus(VectorSource(term.vec))
    vec.tdm<-TermDocumentMatrix(vec.corpus,control=control)
    return(rowSums(as.matrix(vec.tdm)))
}
thread.terms<-term.counts(thread.weights$Thread,control=list(strpwords=stopwords()))
head(thread.terms)
compile
2
help
2
mplayer
2
newbie
2
please
2
install
2
thread.terms<-names(thread.terms)
term.weights<-sapply(thread.terms,function(t) mean(thread.weights$Weight[grepl(t,thread.weights$Thread,fixed=TRUE)])) ##计算存在关键词的主题的权重的均值
term.weights<-data.frame(list(Term=names(term.weights),Weight=term.weights),stringsAsFactors=FALSE,row.names=1:length(term.weights))
head(term.weights)
Term Weight
compile 5.803255
help 5.427126
mplayer 6.724644
newbie 5.444172
please 6.309005
install 5.977866

计算邮件内容的词频,并据此赋权重

msg.terms<-term.counts(priority.train$Message,control=list(stopwords=stopwords(),removePunctuation=TRUE,removeNumbers=TRUE))
msg.weights<-data.frame(list(Term=names(msg.terms),Weight=log(msg.terms,base=10)),stringsAsFactors=FALSE,row.names=1:length(msg.terms))
msg.weights<-subset(msg.weights,Weight>0)
head(msg.weights)
Term Weight
anyway 1.875061
appreciated 1.176091
apt 2.255273
can 3.077004
directory 2.056905
document 1.278754

最终的训练数据

head(from.weight)                    ##社交特征
head(senders.df)                    ##发件人在线程内的活跃度(re:)
head(thread.weights)                ##线程(一个主题存在多个邮件)的活跃度
head(term.weights)                 ##活跃线程的词项
head(msg.weights)                  ##所有邮件共有词项
From.Email Freq Weight
[email protected] 1 0.6931472
[email protected] 1 0.6931472
[email protected] 1 0.6931472
[email protected] 1 0.6931472
[email protected] 1 0.6931472
[email protected] 6 1.9459101
From.Email Freq Weight
[email protected] 1 0.6931472
[email protected] 5 1.7917595
[email protected] 1 0.6931472
[email protected] 1 0.6931472
[email protected] 1 0.6931472
[email protected] 1 0.6931472
Thread Freq Response Weight
please help a newbie compile mplayer :-) 4 42309 5.975627
prob. w/ install/uninstall 4 23745 6.226488
http://apt.nixia.no/ 10 265303 5.576258
problems with ‘apt-get -f install’ 3 55960 5.729244
problems with apt update 2 6347 6.498461
about apt, kernel updates and dist-upgrade 5 240238 5.318328
Term Weight
compile 5.803255
help 5.427126
mplayer 6.724644
newbie 5.444172
please 6.309005
install 5.977866
Term Weight
anyway 1.875061
appreciated 1.176091
apt 2.255273
can 3.077004
directory 2.056905
document 1.278754

训练和测试排序算法

get.weights<- function(search.term,weight.df,term=TRUE){
    if(length(search.term)>0){
        if(term){
            term.match<-match(names(search.term),weight.df$Term)
        }
        else{
            term.match<-match(search.term,weight.df$Thread)
        }
        match.weights<-weight.df$Weight[which(!is.na(term.match))]
        if(length(match.weights)<1){
            return(1)
        }
        else{
            return(mean(match.weights))
        }
    }
    else{
        return(1)
    }
}

计算每封邮件的的权重(相乘)

rank.message<-function(path){
    msg<-parse.email(path)
    #Weighting based on message author  return(c(date,from,subj,msg,path))

    #First  is just on the total frequency
    from <- ifelse(length(which(from.weight$From.Email==msg[2]))>0,from.weight$Weight[which(from.weight$From.Email==msg[2])],1)

    #Second is based on senders in threads ,and threads themselves
    thread.from<-ifelse(length(which(senders.df$From.Email==msg[2]))>0,senders.df$Weight[which(senders.df$From.Email==msg[2])],1)

    subj<-strsplit(tolower(msg[3]),'re: ')
    is.thread<-ifelse(subj[[1]][1]=='',TRUE,FALSE)
    if(is.thread){
        activity<-get.weights(subj[[1]][2],thread.weights,term=FALSE)
    }
    else{
        activity=1
    }
    #Next,weight based on terms
    #Weight based on terms in threads
    thread.terms <- term.counts(msg[3],control=list(stopwords=stopwords()))
    thread.terms.weights<-get.weights(thread.terms,term.weights)

    #Weight baesd term in all messages
    msg.terms<-term.counts(msg[4],control=list(stopwords=stopwords(),removePunctuation=TRUE,removeNumbers=TRUE))
    msg.weights<-get.weights(msg.terms,msg.weights)

    #Calcuate rank by interacting all weights
    rank <- prod(from,thread.from,activity,thread.terms.weights,msg.weights)##连乘

    return(c(msg[1],msg[2],msg[3],rank))
}

训练集和测试集

##拆分训练集和测试集
train.paths<-priority.df$Path[1:(round(nrow(priority.df)/2))]
test.paths<-priority.df$Path[((round(nrow(priority.df)/2))+1):nrow(priority.df)]

训练数据集结果

train.ranks<-lapply(train.paths,rank.message)##计算评分
train.ranks.matrix<-do.call(rbind,train.ranks)
train.ranks.matrix<-cbind(train.paths,train.ranks.matrix,'TRAINING')
train.ranks.df<-data.frame(train.ranks.matrix,stringsAsFactors=FALSE)
names(train.ranks.df)<-c('Message','Date','From','Subj','Rank','Type')
train.ranks.df$Rank<-as.numeric(train.ranks.df$Rank)

priority.threshold<-median(train.ranks.df$Rank)##评分的中位数

train.ranks.df$Priority<-ifelse(train.ranks.df$Rank>=priority.threshold,1,0)
train.ranks.df$Date<-date.converter(train.ranks.df$Date,pattern1,pattern2)
nrow(priority.df)

2500

train.ranks.df[15,]
Message Date From Subj Rank Type Priority
15 D:\ML_for_Hackers-master\03-Classification\data\easy_ham\01078.e83af8e93466283be2ba03e34854682e 2002-02-02 08:11:08 [email protected] Re: problems with ‘apt-get -f install’ 371.1476 TRAINING 1
priority.threshold

59.2488866334203

ggplot(train.ranks.df,aes(x=Rank))+stat_density(aes(fill="darkred"))+geom_vline(aes(xintercept=priority.threshold),color="green",linetype="dashed")+geom_text(aes(x=priority.threshold,y=0,label=round(priority.threshold,3)))

第四章 排序:智能收件箱_第3张图片

sd(train.ranks.df$Rank)

227.684723068453

测试数据集

test.ranks<-lapply(test.paths,rank.message)##计算评分
test.ranks.matrix<-do.call(rbind,test.ranks)
test.ranks.matrix<-cbind(test.paths,test.ranks.matrix,'TEST')
test.ranks.df<-data.frame(test.ranks.matrix,stringsAsFactors=FALSE)
names(test.ranks.df)<-c('Message','Date','From','Subj','Rank','Type')
test.ranks.df$Rank<-as.numeric(test.ranks.df$Rank)
test.ranks.df$Date<-date.converter(test.ranks.df$Date,pattern1,pattern2)
head(test.ranks.df)
Message Date From Subj Rank Type
D:\ML_for_Hackers-master\03-Classification\data\easy_ham\00696.767a9ee8575785978ea5174d3ad3ee26 2002-09-21 19:13:52 [email protected] Re: sed /s/United States/Roman Empire/g 573.38792 TEST
D:\ML_for_Hackers-master\03-Classification\data\easy_ham\00697.edd28212eb2b368046311fd1918aae7d 2002-09-21 20:37:43 [email protected] E-Textiles Come into Style 16.69912 TEST
D:\ML_for_Hackers-master\03-Classification\data\easy_ham\00698.09cdefd75c1242540db1183f9fc54461 2002-09-21 22:59:23 [email protected] flavor cystals 13.14839 TEST
D:\ML_for_Hackers-master\03-Classification\data\easy_ham\00702.bf064c61d1ba308535d0d8af3bcb1789 2002-09-22 03:00:48 [email protected] Re: Oh my… 97.93984 TEST
D:\ML_for_Hackers-master\03-Classification\data\easy_ham\00699.29e599983f044aee500f3a58c34acffc 2002-09-22 03:13:08 [email protected] Re: sed /s/United States/Roman Empire/g 453.67502 TEST
D:\ML_for_Hackers-master\03-Classification\data\easy_ham\00700.7eee792482a5a8cf20f1e4225f905f6b 2002-09-22 03:20:22 [email protected] [vox] Anarchist ‘Scavenger Hunt’ Raises D.C. Police Ire (fwd) 22.28577 TEST
ggplot(train.ranks.df,aes(x=Rank))+geom_density(aes(color='green'))+geom_density(data=test.ranks.df,aes(x=Rank,color='red'))

第四章 排序:智能收件箱_第4张图片

列出排序靠前的40个

reslut <-test.ranks.df[order(test.ranks.df$Rank,decreasing=TRUE),][1:40,c(2,3,4,5)]
row.names(reslut)<-1:nrow(reslut)
reslut
Date From Subj Rank
2002-09-23 18:02:05 [email protected] Re: sylpheed-claws 660.0621
2002-09-23 17:41:20 [email protected] Re: sylpheed-claws 648.4520
2002-09-21 19:13:52 [email protected] Re: sed /s/United States/Roman Empire/g 573.3879
2002-09-22 15:53:43 [email protected] RE: sed /s/United States/Roman Empire/g 484.4433
2002-09-23 15:09:46 [email protected] Re: sed /s/United States/Roman Empire/g 470.8348
2002-09-22 03:13:08 [email protected] Re: sed /s/United States/Roman Empire/g 453.6750
2002-09-22 22:11:03 [email protected] Re: bad focus/click behaviours 426.6982
2002-09-22 22:21:49 [email protected] Re: bad focus/click behaviours 423.0334
2002-09-22 22:00:37 [email protected] Re: bad focus/click behaviours 417.7473
2002-09-22 12:03:36 [email protected] Re: sed /s/United States/Roman Empire/g 407.2638
2002-09-22 22:08:16 [email protected] Re: bad focus/click behaviours 404.7715
2002-09-23 11:18:16 [email protected] Re: [VoID] a new low on the personals tip… 333.9248
2002-09-30 07:14:31 [email protected] Re: New testing packages 320.5003
2002-09-27 12:33:23 [email protected] RE: The Big Jump 282.4799
2002-09-25 23:09:39 [email protected] Re: Exmh && speed 232.7133
2002-09-23 19:22:24 [email protected] Not just like a virgin…a virgin…birth 154.8486
2002-09-30 21:46:08 [email protected] Re: New testing packages 142.9437
2002-09-24 10:04:06 [email protected] Re: 2002.06.00.00 141.7460
2002-09-28 19:37:42 [email protected] Re: EBusiness Webforms: cluetrain has left the station 130.8743
2002-10-03 21:58:55 [email protected] Re: Living Love - Another legacy of the 60’s 130.7779
2002-10-05 18:17:53 [email protected] Re: [SAtalk] razor2 auth? 127.4101
2002-09-23 10:43:01 [email protected] Re: sed /s/United States/Roman Empire/g 127.2770
2002-10-03 21:23:04 [email protected] Re: [SAtalk] 2.42: est release? 126.8429
2002-09-25 18:45:29 [email protected] Re: Digital radio playlists are prohibited?! 126.7640
2002-09-23 21:51:03 [email protected] Re: How about subsidizing SSL access to Google? 125.8781
2002-10-02 11:12:42 [email protected] Polit-spam 124.6367
2002-10-07 20:18:15 [email protected] Re: [SAtalk] Re: AWL bug in 2.42? 123.8745
2002-09-23 12:48:34 [email protected] Re: [SAtalk] Spam host? 123.7719
2002-10-09 18:23:20 [email protected] Re: [SAtalk] Re: fully-public corpus of mail available 119.9497
2002-09-28 15:54:42 [email protected] Re: EBusiness Webforms: cluetrain has left the station 117.8278
2002-09-30 22:59:46 [email protected] Re: apt 0.5.7 on RH 7.x 115.8525
2002-09-23 18:48:24 [email protected] Re: [Razor-users] razor unblessed reference 115.6458
2002-10-02 11:45:33 [email protected] freshrpms.net resources (was Re: use new apt to do null to RH8 114.8119
2002-10-02 10:53:24 [email protected] Re: use new apt to do null to RH8 upgrade? 114.5680
2002-09-30 23:28:59 [email protected] Re: Internet Archive bookmobile 114.2242
2002-10-01 12:12:44 [email protected] Re: use new apt to do null to RH8 upgrade? 114.1910
2002-09-30 22:00:02 [email protected] Re: apt 0.5.7 on RH 7.x 113.4832
2002-09-26 16:08:14 [email protected] Re: dumb question: X client behind a firewall? 112.4602
2002-09-30 21:50:46 [email protected] Re: ALSA Problem with Null kernel 111.9554
2002-10-02 18:57:28 [email protected] Re: use new apt to do null to RH8 upgrade? 110.3828

你可能感兴趣的:(优化算法,机器学习)