机器学习实战:Naive_Bayes(附数据集)

机器学习实战:Naive_Bayes

运行环境:Anaconda——Jupyter Notebook
Python版本为:3.6.6

数据集:email
提取码:vgyb

1.使用Python进行文本分类

1.1准备数据:从文本中构建词向量

程序清单:词表到向量的转换函数
import numpy as np
def loadDataSet():
    """Build the toy message-board corpus used by the examples.

    Parameters:
        none
    Returns:
        postingList - list of tokenized posts (each post is a list of str)
        classVec - list of int labels aligned with postingList
                   (1 = abusive post, 0 = normal post)
    """
    # Each sample is written as "space separated tokens", label.
    labelledPosts = [
        ('my dog has flea problems help please', 0),
        ('maybe not take him to dog park stupid', 1),
        ('my dalmation is so cute I love him', 0),
        ('stop posting stupid worthless garbage', 1),
        ('mr licks ate my steak how to stop him', 0),
        ('quit buying worthless dog food stupid', 1),
    ]
    postingList = [text.split() for text, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList, classVec


def createVocabList(dataSet):
    """Collect every distinct token that appears in any document.

    Parameters:
        dataSet - iterable of documents, each an iterable of tokens
    Returns:
        list of the unique tokens; the order is whatever the underlying
        set yields and must not be relied upon
    """
    # One n-ary union replaces the document-by-document "|" accumulation.
    return list(set().union(*dataSet))


def setOfWord2Vec(vocabList,inputSet):
    """Encode a tokenized document as a set-of-words (presence) vector.

    Parameters:
        vocabList - vocabulary list, e.g. the result of createVocabList
        inputSet - tokens of the document to encode
    Returns:
        returnVec - list of 0/1 with len(vocabList) entries; entry i is 1
                    when vocabList[i] occurs in inputSet
    Side effects:
        prints one message for every token that is not in vocabList.
    """
    # Build the word -> position map once so each token costs one dict
    # lookup instead of two linear scans ("in" followed by ".index").
    # setdefault keeps the FIRST position of a repeated vocabulary word,
    # which is what list.index would have returned.
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is None:
            print('the word: %s is not in my vocabList' % word)
        else:
            returnVec[idx] = 1
    return returnVec
# Load the toy corpus with its labels and build the vocabulary from all posts.
listPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listPosts)
# Notebook echo of the vocabulary. It is built from a set, so the order shown
# below is not guaranteed to be the same on another run.
myVocabList
['dog',
 'worthless',
 'to',
 'so',
 'flea',
 'love',
 'is',
 'I',
 'help',
 'take',
 'please',
 'mr',
 'park',
 'not',
 'has',
 'steak',
 'buying',
 'him',
 'food',
 'garbage',
 'maybe',
 'stop',
 'stupid',
 'licks',
 'ate',
 'dalmation',
 'my',
 'cute',
 'quit',
 'how',
 'posting',
 'problems']
setOfWord2Vec(myVocabList,listPosts[0])
[1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1]

1.2 训练算法:从词向量计算概率

程序清单: 朴素贝叶斯分类器训练函数
def trainNB0(trainMatrix,trainCatagory):
    """Estimate naive Bayes parameters without smoothing.

    Parameters:
        trainMatrix - document vectors, one row per training document
                      (e.g. the vectors returned by setOfWord2Vec)
        trainCatagory - class label per document (1 = abusive, anything
                        else is treated as class 0)
    Returns:
        p0Vect - per-word conditional probabilities for class 0
        p1Vect - per-word conditional probabilities for class 1
        pAbusive - fraction of training documents labelled 1

    A word never seen in a class gets probability 0 here, and a class with
    no words at all divides by zero; the smoothed variant addresses both.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCatagory)/numTrainDocs

    docVectors = np.asarray(trainMatrix, dtype=float)
    # Only the labels that pair up with a row of trainMatrix take part.
    isAbusive = np.asarray(trainCatagory[:numTrainDocs]) == 1

    # Column sums give the per-word counts of each class; their grand
    # total is the number of word occurrences seen in that class.
    p1Num = np.zeros(numWords) + docVectors[isAbusive].sum(axis=0)
    p0Num = np.zeros(numWords) + docVectors[~isAbusive].sum(axis=0)

    p0Vect = p0Num / p0Num.sum()
    p1Vect = p1Num / p1Num.sum()
    return p0Vect, p1Vect, pAbusive
# Turn every post into a set-of-words vector over myVocabList.
trainMat = []
for postinDoc in listPosts:
    trainMat.append(setOfWord2Vec(myVocabList,postinDoc))
# Fit the classifier on the vectorized posts and their labels.
p0V,p1V,pAb = trainNB0(trainMat,listClasses)
# Notebook echo: prior probability of class 1.
pAb
0.5
p0V
array([0.04166667, 0.        , 0.04166667, 0.04166667, 0.04166667,
       0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.        ,
       0.04166667, 0.04166667, 0.        , 0.        , 0.04166667,
       0.04166667, 0.        , 0.08333333, 0.        , 0.        ,
       0.        , 0.04166667, 0.        , 0.04166667, 0.04166667,
       0.04166667, 0.125     , 0.04166667, 0.        , 0.04166667,
       0.        , 0.04166667])
p1V
array([0.10526316, 0.10526316, 0.05263158, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.05263158,
       0.        , 0.        , 0.05263158, 0.05263158, 0.        ,
       0.        , 0.05263158, 0.05263158, 0.05263158, 0.05263158,
       0.05263158, 0.05263158, 0.15789474, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.05263158, 0.        ,
       0.05263158, 0.        ])

1.3测试算法:根据现实情况修改分类器

  • 问题一:利用贝叶斯分类器对文档进行分类时,要计算多个概率的乘积以获得文档属于某个类别的概率,即计算p(w0|1)p(w1|1)p(w2|1)。如果其中一个概率值为0,那么最后的乘积也为0。为降低这种影响,可以将所有词的出现数初始化为1,并将分母初始化为2。
  • 问题二:另一个遇到的问题是下溢出,这是由于太多很小的数相乘造成的。当计算乘积p(w0|ci)p(w1|ci)p(w2|ci)…p(wN|ci)时,由于大部分因子都非常小,所以程序会下溢出或者得到不正确的答案。(读者可以用Python尝试相乘许多很小的数,最后四舍五入后会得到0。)一种解决办法是对乘积取自然对数。在代数中有ln(a*b) = ln(a)+ln(b),于是通过求对数可以避免下溢出或者浮点数舍入导致的错误。同时,由于ln(x)是单调递增函数,取对数不会改变各类别概率之间的大小关系,因此采用自然对数进行处理不会对分类结果造成任何损失。
def trainNB0(trainMatrix,trainCatagory):
    """Estimate smoothed, log-space naive Bayes parameters.

    Parameters:
        trainMatrix - document vectors, one row per training document
        trainCatagory - class label per document (1 = abusive, anything
                        else is treated as class 0)
    Returns:
        p0Vect - natural log of the per-word conditional probabilities
                 for class 0
        p1Vect - the same for class 1
        pAbusive - fraction of training documents labelled 1

    Every word count starts at 1 and every denominator at 2 so that no
    probability is zero; logs are returned so the classifier can add
    terms instead of multiplying many tiny numbers.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCatagory)/numTrainDocs

    docVectors = np.asarray(trainMatrix, dtype=float)
    # Only the labels that pair up with a row of trainMatrix take part.
    abusiveRows = np.asarray(trainCatagory[:numTrainDocs]) == 1

    def logLikelihood(rows):
        # rows: the document vectors belonging to a single class.
        smoothedCounts = np.ones(numWords) + rows.sum(axis=0)
        smoothedTotal = 2 + rows.sum()
        return np.log(smoothedCounts / smoothedTotal)

    p1Vect = logLikelihood(docVectors[abusiveRows])
    p0Vect = logLikelihood(docVectors[~abusiveRows])
    return p0Vect, p1Vect, pAbusive
# Rebuild the set-of-words training matrix over myVocabList.
trainMat = []
for postinDoc in listPosts:
    trainMat.append(setOfWord2Vec(myVocabList,postinDoc))
# Refit using the trainNB0 defined immediately above (smoothed, log-space).
p0V,p1V,pAb = trainNB0(trainMat,listClasses)
# Notebook echo: prior probability of class 1.
pAb
0.5
p0V
array([-2.56494936, -3.25809654, -2.56494936, -2.56494936, -2.56494936,
       -2.56494936, -2.56494936, -2.56494936, -2.56494936, -3.25809654,
       -2.56494936, -2.56494936, -3.25809654, -3.25809654, -2.56494936,
       -2.56494936, -3.25809654, -2.15948425, -3.25809654, -3.25809654,
       -3.25809654, -2.56494936, -3.25809654, -2.56494936, -2.56494936,
       -2.56494936, -1.87180218, -2.56494936, -3.25809654, -2.56494936,
       -3.25809654, -2.56494936])
p1V
array([-1.94591015, -1.94591015, -2.35137526, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244, -3.04452244, -3.04452244, -2.35137526,
       -3.04452244, -3.04452244, -2.35137526, -2.35137526, -3.04452244,
       -3.04452244, -2.35137526, -2.35137526, -2.35137526, -2.35137526,
       -2.35137526, -2.35137526, -1.65822808, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244, -3.04452244, -2.35137526, -3.04452244,
       -2.35137526, -3.04452244])
程序清单 朴素贝叶斯分类函数
def classifyNB(vec2classify,p0Vec,p1Vec,pClass1):
    """Pick the more probable class for one document vector.

    Parameters:
        vec2classify - document vector to classify
        p0Vec - log conditional probabilities for class 0 (non-abusive)
        p1Vec - log conditional probabilities for class 1 (abusive)
        pClass1 - prior probability of class 1
    Returns:
        1 - the document is assigned to class 1 (abusive)
        0 - otherwise, including an exact tie
    """
    # Log-space scores: sum of the selected word log-probabilities plus
    # the log prior of the class.
    score0 = sum(vec2classify*p0Vec) + np.log(1-pClass1)
    score1 = sum(vec2classify*p1Vec) + np.log(pClass1)
    return int(score1 > score0)

def testingNB():
    """Train on the toy corpus and print the class of two sample posts.

    Convenience wrapper that chains loading, vectorizing, training and
    classifying so the whole pipeline can be run with one call.
    Returns nothing; the results are printed.
    """
    listPosts, listClasses = loadDataSet()
    vocabList = createVocabList(listPosts)
    trainMat = [setOfWord2Vec(vocabList, post) for post in listPosts]
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)

    # One presumably harmless entry and one presumably abusive entry.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = setOfWord2Vec(vocabList, testEntry)
        print(testEntry, 'classify as:', classifyNB(thisDoc, p0V, p1V, pAb))
testingNB()
['love', 'my', 'dalmation'] classify as: 0
['stupid', 'garbage'] classify as: 1

2.准备数据:文档词袋模型

前面我们将每个词的出现与否作为一个特征,这可以被描述为词集模型(set-of-words model)。如果一个词在文档中出现不止一次,这可能包含了"该词是否出现"所不能表达的某种信息;把每个词的出现次数(而不仅仅是出现与否)作为特征的方法被称为词袋模型(bag-of-words model)。在词袋中,每个单词可以出现多次,而在词集中,每个词只能出现一次。

程序清单 朴素贝叶斯词袋模型
def bagOfWord2VecMN(vocabList,inputSet):
    """Encode a tokenized document as a bag-of-words (count) vector.

    Parameters:
        vocabList - vocabulary list, e.g. the result of createVocabList
        inputSet - tokens of the document to encode
    Returns:
        returnVec - list with len(vocabList) entries; entry i is the
                    number of times vocabList[i] occurs in inputSet.
                    Tokens missing from vocabList are silently ignored.
    """
    # Build the word -> position map once so each token costs one dict
    # lookup instead of two linear scans ("in" followed by ".index").
    # setdefault keeps the FIRST position of a repeated vocabulary word,
    # which is what list.index would have returned.
    position = {}
    for idx, vocabWord in enumerate(vocabList):
        position.setdefault(vocabWord, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] += 1
    return returnVec

2.1 使用朴素贝叶斯过滤垃圾邮件

2.1.1准备数据:切分文本
mysend = 'We have to live art if we would be affected by art. We have to paint rather than look at paintings, to play instruments rather than go to concerts, to dance and sing and act ourselves, engaging all our senses in the ritual and discipline of the arts. Then something may begin to happen to us:to work upon our bodies and soul.'
import re
def textParse(bigString):
    """Split raw text into lowercase tokens longer than two characters.

    Parameters:
        bigString - the text to tokenize
    Returns:
        list of lowercased tokens; tokens of length 1 or 2 (for example
        "I", "to", "is") and empty strings are dropped
    """
    # Split on runs of ONE OR MORE non-word characters. The former pattern
    # r'\W*' can match the empty string, and since Python 3.7 re.split
    # also splits on empty matches, which cut the text into single
    # characters so that every token was filtered out and [] was returned.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
textParse(mysend)
[]
2.1.2 测试算法:使用朴素贝叶斯进行交叉验证
"""
函数说明:测试朴素贝叶斯分类器,使用朴素贝叶斯进行交叉验证
"""
import re
import random
def textParse(bigStrings):
    """Split raw text into lowercase tokens longer than two characters.

    Parameters:
        bigStrings - the text to tokenize
    Returns:
        list of lowercased tokens; tokens of length 1 or 2 and empty
        strings are dropped
    """
    # r'\W+' (raw string, one or more separators): the former '\W*' can
    # match the empty string, and since Python 3.7 re.split also splits on
    # empty matches, which left only single characters and therefore an
    # empty token list.
    listTokens = re.split(r'\W+', bigStrings)
    return [token.lower() for token in listTokens if len(token) > 2]


def spamTest():
    """Hold-out evaluation of the naive Bayes spam filter.

    Reads email/spam/1.txt .. 25.txt (class 1) and email/ham/1.txt ..
    25.txt (class 0), tokenizes them with textParse, holds out 10 randomly
    chosen messages for testing, trains on the remaining ones and prints
    the number of misclassified test messages and the error rate.
    Returns nothing.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        for pathTemplate, label in (('email/spam/%d.txt', 1),
                                    ('email/ham/%d.txt', 0)):
            # NOTE(review): the corpus appears to contain bytes that are
            # not valid in every platform default encoding (apostrophes
            # show up as mojibake when decoded with a locale codec).
            # ISO-8859-1 can decode any byte sequence - confirm against
            # the data set.
            with open(pathTemplate % i, 'r', encoding='ISO-8859-1') as emailFile:
                # Tokenize the message; storing the raw text would make the
                # whole e-mail a single "word".
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    # Vocabulary of THIS corpus (not a global left over from another cell).
    vocabList = createVocabList(docList)

    # Random hold-out split: 10 test documents, the rest for training.
    numDocs = len(docList)
    testSet = random.sample(range(numDocs), 10)
    heldOut = set(testSet)
    trainingSet = [idx for idx in range(numDocs) if idx not in heldOut]

    trainMat = [setOfWord2Vec(vocabList, docList[idx]) for idx in trainingSet]
    trainClasses = [classList[idx] for idx in trainingSet]
    p0V, p1V, pSpam = trainNB0(trainMat, trainClasses)

    errorCount = 0
    for docIndex in testSet:
        vec2classify = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(vec2classify, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('errorCount:', errorCount)
    print('the error rate is:%.2f' % (float(errorCount) / len(testSet)))
spamTest()
the word: --- Codeine 15mg -- 30 for $203.70 -- VISA Only!!! --

-- Codeine (Methylmorphine) is a narcotic (opioid) pain reliever
-- We have 15mg & 30mg pills -- 30/15mg for $203.70 - 60/15mg for $385.80 - 90/15mg for $562.50 -- VISA Only!!! --- is not in my vocabList
the word: Hydrocodone/Vicodin ES/Brand Watson

Vicodin ES - 7.5/750 mg: 30 - $195 / 120 $570
Brand Watson - 7.5/750 mg: 30 - $195 / 120 $570
Brand Watson - 10/325 mg: 30 - $199 / 120 - $588
NoPrescription Required
FREE Express FedEx (3-5 days Delivery) for over $200 order
Major Credit Cards + E-CHECK is not in my vocabList
the word: Yay to you both doing fine!

I'm working on an MBA in Design Strategy at CCA (top art school.)  It's a new program focusing on more of a right-brained creative and strategic approach to management.  I'm an 1/8 of the way done today! is not in my vocabList
the word: WHat is going on there?
I talked to John on email.  We talked about some computer stuff that's it.

I went bike riding in the rain, it was not that cold.

We went to the museum in SF yesterday it was $3 to get in and they had
free food.  At the same time was a SF Giants game, when we got done we
had to take the train with all the Giants fans, they are 1/2 drunk. is not in my vocabList
the word: Yo.  I've been working on my running website.  I'm using jquery and the jqplot plugin.  I'm not too far away from having a prototype to launch.  

You used jqplot right?  If not, I think you would like it. is not in my vocabList
the word: --- Codeine 15mg -- 30 for $203.70 -- VISA Only!!! --

-- Codeine (Methylmorphine) is a narcotic (opioid) pain reliever
-- We have 15mg & 30mg pills -- 30/15mg for $203.70 - 60/15mg for $385.80 - 90/15mg for $562.50 -- VISA Only!!! --- is not in my vocabList
the word: There was a guy at the gas station who told me that if I knew Mandarin
and Python I could get a job with the FBI. is not in my vocabList
the word: OEM Adobe & Microsoft softwares
Fast order and download

Microsoft Office Professional Plus 2007/2010 $129
Microsoft Windows 7 Ultimate $119
Adobe Photoshop CS5 Extended
Adobe Acrobat 9 Pro Extended
Windows XP Professional & thousand more titles is not in my vocabList
the word: Hello,

Since you are an owner of at least one Google Groups group that uses the customized welcome message, pages or files, we are writing to inform you that we will no longer be supporting these features starting February 2011. We made this decision so that we can focus on improving the core functionalities of Google Groups -- mailing lists and forum discussions.  Instead of these features, we encourage you to use products that are designed specifically for file storage and page creation, such as Google Docs and Google Sites.

For example, you can easily create your pages on Google Sites and share the site (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=174623) with the members of your group. You can also store your files on the site by attaching files to pages (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=90563) on the site. If you抮e just looking for a place to upload your files so that your group members can download them, we suggest you try Google Docs. You can upload files (http://docs.google.com/support/bin/answer.py?hl=en&answer=50092) and share access with either a group (http://docs.google.com/support/bin/answer.py?hl=en&answer=66343) or an individual (http://docs.google.com/support/bin/answer.py?hl=en&answer=86152), assigning either edit or download only access to the files.

you have received this mandatory email service announcement to update you about important changes to Google Groups. is not in my vocabList
the word: Bargains Here! Buy Phentermin 37.5 mg (K-25)

Buy Genuine Phentermin at Low Cost
VISA Accepted
30 - $130.50
60 - $219.00
90 - $292.50
120 - $366.00
180 - $513.00 is not in my vocabList
the word: Zach Hamm commented on your status.

Zach wrote:
"doggy style - enough said, thank you & good night"

 is not in my vocabList
the word: You Have Everything To Gain!

Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY

Amazing increase in thickness of yourPenis, up to 30%
BetterEjacu1ation control
Experience Rock-HardErecetions
Explosive, intenseOrgasns
Increase volume ofEjacu1ate
Doctor designed and endorsed
100% herbal, 100% Natural, 100% Safe is not in my vocabList
the word: This e-mail was sent from a notification-only address that cannot accept incoming e-mail. Please do not reply to this message.

Thank you for your online reservation. The store you selected has located the item you requested and has placed it on hold in your name. Please note that all items are held for 1 day.  Please note store prices may differ from those online.

If you have questions or need assistance with your reservation, please contact the store at the phone number listed below. You can also access store information, such as store hours and location, on the web at http://www.borders.com/online/store/StoreDetailView_98. is not in my vocabList
the word: Bargains Here! Buy Phentermin 37.5 mg (K-25)

Buy Genuine Phentermin at Low Cost
VISA Accepted
30 - $130.50
60 - $219.00
90 - $292.50
120 - $366.00
180 - $513.00 is not in my vocabList
the word: Hi Peter,

These are the only good scenic ones and it's too bad there was a girl's back in one of them. Just try to enjoy the blue sky : ))

D is not in my vocabList
the word: Ryan Whybrew commented on your status.

Ryan wrote:
"turd ferguson or butt horn."
 is not in my vocabList
the word: You Have Everything To Gain!

Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY

Amazing increase in thickness of yourPenis, up to 30%
BetterEjacu1ation control
Experience Rock-HardErecetions
Explosive, intenseOrgasns
Increase volume ofEjacu1ate
Doctor designed and endorsed
100% herbal, 100% Natural, 100% Safe
The proven NaturalPenisEnhancement that works!
100% MoneyBack Guaranteeed is not in my vocabList
the word: Buy Ambiem (Zolpidem) 5mg/10mg @ $2.39/- pill

30 pills x 5 mg - $129.00
60 pills x 5 mg - $199.20
180 pills x 5 mg - $430.20
30 pills x 10 mg - $ 138.00
120 pills x 10 mg - $ 322.80 is not in my vocabList
the word: Thanks Peter.

I'll definitely check in on this. How is your book
going? I heard chapter 1 came in and it was in 
good shape. ;-)

I hope you are doing well.

Cheers,

Troy is not in my vocabList
the word: OrderCializViagra Online & Save 75-90%

0nline Pharmacy NoPrescription required
Buy Canadian Drugs at Wholesale Prices and Save 75-90%
FDA-Approved drugs + Superb Quality Drugs only!
Accept all major credit cards
        Order Today! From $1.38
 is not in my vocabList
the word: Jay Stepp commented on your status.

Jay wrote:
""to the" ???"

Reply to this email to comment on this status.

To see the comment thread, follow the link below:

 is not in my vocabList
the word: BuyVIAGRA 25mg, 50mg, 100mg,
BrandViagra, FemaleViagra from $1.15 per pill

ViagraNoPrescription needed - from Certified Canadian Pharmacy

Buy Here... We accept VISA, AMEX, E-Check... Worldwide Delivery is not in my vocabList
the word: You Have Everything To Gain!

Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY

Amazing increase in thickness of yourPenis, up to 30%
BetterEjacu1ation control
Experience Rock-HardErecetions
Explosive, intenseOrgasns
Increase volume ofEjacu1ate
Doctor designed and endorsed
100% herbal, 100% Natural, 100% Safe is not in my vocabList
the word: Hi Peter,
 
The hotels are the ones that rent out the tent. They are all lined up on the hotel grounds : )) So much for being one with nature, more like being one with a couple dozen tour groups and nature.
I have about 100M of pictures from that trip. I can go through them and get you jpgs of my favorite scenic pictures.
 
Where are you and Jocelyn now? New York? Will you come to Tokyo for Chinese New Year? Perhaps to see the two of you then. I will go to Thailand for winter holiday to see my mom : )
 
Take care,
D
 is not in my vocabList
the word: You Have Everything To Gain!

Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY

Amazing increase in thickness of yourPenis, up to 30%
BetterEjacu1ation control
Experience Rock-HardErecetions
Explosive, intenseOrgasns
Increase volume ofEjacu1ate
Doctor designed and endorsed
100% herbal, 100% Natural, 100% Safe is not in my vocabList
the word: yeah I am ready.  I may not be here because Jar Jar has plane tickets to Germany for me.   is not in my vocabList
the word: A home based business opportunity is knocking at your door.

Don抰 be rude and let this chance go by.

You can earn a great income and find
your financial life transformed.

Learn more Here.


To Your Success.

Work From Home Finder Experts is not in my vocabList
the word: Codeine (the most competitive price on NET!)

Codeine (WILSON) 30mg x 30 $156.00
Codeine (WILSON) 30mg x 60 $291.00 (+4 FreeViagra pills)
Codeine (WILSON) 30mg x 90 $396.00 (+4 FreeViagra pills)
Codeine (WILSON) 30mg x 120 $492.00 (+10 FreeViagra pills) is not in my vocabList
the word: Get Up to 75% OFF at Online WatchesStore

Discount Watches for All Famous Brands

* Watches: aRolexBvlgari, Dior, Hermes, Oris, Cartier, AP and more brands
* Louis Vuitton Bags & Wallets
* Gucci Bags
* Tiffany & Co Jewerly

Enjoy a full 1 year WARRANTY
Shipment via reputable courier: FEDEX, UPS, DHL and EMS Speedpost
You will 100% recieve your order
Save Up to 75% OFF Quality Watches is not in my vocabList
the word: LinkedIn

Julius O requested to add you as a connection on LinkedIn:

Hi Peter.

Looking forward to the book!

Accept 	View invitation from Julius O
 is not in my vocabList
the word: Get Up to 75% OFF at Online WatchesStore

Discount Watches for All Famous Brands

* Watches: aRolexBvlgari, Dior, Hermes, Oris, Cartier, AP and more brands
* Louis Vuitton Bags & Wallets
* Gucci Bags
* Tiffany & Co Jewerly

Enjoy a full 1 year WARRANTY
Shipment via reputable courier: FEDEX, UPS, DHL and EMS Speedpost
You will 100% recieve your order is not in my vocabList
the word: I've thought about this and think it's possible. We should get another
lunch. I have a car now and could come pick you up this time. Does
this wednesday work? 11:50?

Can I have a signed copy of you book? is not in my vocabList
the word: Percocet 10/625 mg withoutPrescription 30 tabs - $225!
Percocet, a narcotic analgesic, is used to treat moderate to moderately SeverePain
Top Quality, EXPRESS Shipping, 100% Safe & Discreet & Private.
Buy Cheap Percocet Online is not in my vocabList
the word: we saw this on the way to the coast...thought u might like it

hangzhou is huge, one day wasn't enough, but we got a glimpse...

we went inside the china pavilion at expo, it is pretty interesting,
each province has an exhibit... is not in my vocabList
the word: Get Up to 75% OFF at Online WatchesStore

Discount Watches for All Famous Brands

* Watches: aRolexBvlgari, Dior, Hermes, Oris, Cartier, AP and more brands
* Louis Vuitton Bags & Wallets
* Gucci Bags
* Tiffany & Co Jewerly

Enjoy a full 1 year WARRANTY
Shipment via reputable courier: FEDEX, UPS, DHL and EMS Speedpost
You will 100% recieve your order is not in my vocabList
the word: Hi Hommies,

Just got a phone call from the roofer, they will come and spaying the foaming today. it will be dusty. pls close all the doors and windows.
Could you help me to close my bathroom window, cat window and the sliding door behind the TV?
I don't know how can those 2 cats survive......

Sorry for any inconvenience! is not in my vocabList
the word: You Have Everything To Gain!

Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY

Amazing increase in thickness of yourPenis, up to 30%
BetterEjacu1ation control
Experience Rock-HardErecetions
Explosive, intenseOrgasns
Increase volume ofEjacu1ate
Doctor designed and endorsed
100% herbal, 100% Natural, 100% Safe is not in my vocabList
the word: Ok I will be there by 10:00 at the latest. is not in my vocabList
the word: Experience with BiggerPenis Today! Grow 3-inches more

The Safest & Most Effective Methods Of_PenisEn1argement.
Save your time and money!
BetterErections with effective Ma1eEnhancement products.

#1 Ma1eEnhancement Supplement. Trusted by Millions. Buy Today! is not in my vocabList
the word: That is cold.  Is there going to be a retirement party?  
Are the leaves changing color? is not in my vocabList
the word: Benoit Mandelbrot 1924-2010

Benoit Mandelbrot 1924-2010

Wilmott Team

Benoit Mandelbrot, the mathematician, the father of fractal mathematics, and advocate of more sophisticated modelling in quantitative finance, died on 14th October 2010 aged 85.

Wilmott magazine has often featured Mandelbrot, his ideas, and the work of others inspired by his fundamental insights.

You must be logged on to view these articles from past issues of Wilmott Magazine. is not in my vocabList
the word: Hi Peter,

With Jose out of town, do you want to
meet once in a while to keep things
going and do some interesting stuff?

Let me know
Eugene is not in my vocabList
the word: You Have Everything To Gain!

Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY

Amazing increase in thickness of yourPenis, up to 30%
BetterEjacu1ation control
Experience Rock-HardErecetions
Explosive, intenseOrgasns
Increase volume ofEjacu1ate
Doctor designed and endorsed
100% herbal, 100% Natural, 100% Safe
The proven NaturalPenisEnhancement that works!
100% MoneyBack Guaranteeed is not in my vocabList
the word: 
SciFinance now automatically generates GPU-enabled pricing & risk model source code that runs up to 50-300x faster than serial code using a new NVIDIA Fermi-class Tesla 20-Series GPU.

SciFinance is a derivatives pricing and risk model development tool that automatically generates C/C++ and GPU-enabled source code from concise, high-level model specifications. No parallel computing or CUDA programming expertise is required.

SciFinance's automatic, GPU-enabled Monte Carlo pricing model source code generation capabilities have been significantly extended in the latest release. This includes:

 is not in my vocabList
the word: You Have Everything To Gain!

Incredib1e gains in length of 3-4 inches to yourPenis, PERMANANTLY

Amazing increase in thickness of yourPenis, up to 30%
BetterEjacu1ation control
Experience Rock-HardErecetions
Explosive, intenseOrgasns
Increase volume ofEjacu1ate
Doctor designed and endorsed
100% herbal, 100% Natural, 100% Safe is not in my vocabList
the word: LinkedIn

Kerry Haloney requested to add you as a connection on LinkedIn:

Peter,

I'd like to add you to my professional network on LinkedIn.

- Kerry Haloney
 
 is not in my vocabList
the word: OrderCializViagra Online & Save 75-90%

0nline Pharmacy NoPrescription required
Buy Canadian Drugs at Wholesale Prices and Save 75-90%
FDA-Approved drugs + Superb Quality Drugs only!
Accept all major credit cards is not in my vocabList
the word: Percocet 10/625 mg withoutPrescription 30 tabs - $225!
Percocet, a narcotic analgesic, is used to treat moderate to moderately SeverePain
Top Quality, EXPRESS Shipping, 100% Safe & Discreet & Private.
Buy Cheap Percocet Online is not in my vocabList
the word: Arvind Thirumalai commented on your status.

Arvind wrote:
""you know""

Reply to this email to comment on this status.

 is not in my vocabList
the word: Hi Peter,

    Sure thing.  Sounds good.  Let me know what time would be good for you.
I will come prepared with some ideas and we can go from there.

Regards,

-Vivek. is not in my vocabList
errorCount: 6
the error rate is:0.60

3.使用朴素贝叶斯分类器从个人广告中获取区域倾向

import feedparser
# Download and parse the two RSS feeds that serve as the two classes:
# "ny" is the NASA image-of-the-day feed and "sf" is a CSDN blog feed.
# NOTE(review): the names ny/sf presumably survive from a New York /
# San Francisco example and no longer describe the sources - confirm.
ny = feedparser.parse('http://www.nasa.gov/rss/dyn/image_of_the_day.rss')
sf = feedparser.parse('https://blog.csdn.net/j_shine/rss/list')
# Notebook echoes of the parsed entries and of the NASA entry count.
ny['entries']
sf['entries']
len(ny['entries'])
#len(sf['entries'])
60
print(len(sf['entries']))
20
import operator
def calcMostFreq(myVocabList,fullList,topN=30):
    """Return the most frequent vocabulary words with their counts.

    Parameters:
        myVocabList - vocabulary words to rank
        fullList - every token of the corpus, duplicates included
        topN - how many entries to return (default 30)
    Returns:
        list of (word, count) tuples sorted by descending count, at most
        topN long; words with equal counts keep their vocabulary order
    """
    from collections import Counter

    # Count the corpus once instead of calling fullList.count() for every
    # vocabulary word, which rescanned the whole corpus each time.
    occurrences = Counter(fullList)
    # A Counter yields 0 for words that never occur, matching list.count.
    freqDict = {token: occurrences[token] for token in myVocabList}
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:topN]

def localWords(feed1,feed0):
    """Train and evaluate a bag-of-words naive Bayes classifier on two feeds.

    Parameters:
        feed1 - parsed feed whose entries are labelled class 1
        feed0 - parsed feed whose entries are labelled class 0
        Both are read as feed['entries'][i]['summary'].
    Returns:
        vocabList - vocabulary with the most frequent words removed
        p0V - per-word parameters of class 0 as returned by trainNB0
        p1V - per-word parameters of class 1 as returned by trainNB0
    Raises:
        ValueError - if either feed has no entries
    Side effects:
        prints the error rate measured on a random hold-out set.
    """
    docList = []     # tokenized documents
    classList = []   # label of each document
    fullList = []    # every token of every document, duplicates included

    # Use the same number of entries from both feeds, interleaved 1, 0.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullList.extend(wordList)
            classList.append(label)

    numDocs = len(docList)
    if numDocs == 0:
        raise ValueError('both feeds must contain at least one entry')

    # Drop the most frequent words returned by calcMostFreq.
    vocabList = createVocabList(docList)
    frequentWords = {pairW[0] for pairW in calcMostFreq(vocabList, fullList)}
    vocabList = [word for word in vocabList if word not in frequentWords]

    # Hold out up to 20 documents, always leaving at least one to train on
    # (a fixed 20 fails when the feeds provide 20 documents or fewer).
    testSize = min(20, numDocs - 1)
    testSet = random.sample(range(numDocs), testSize)
    heldOut = set(testSet)
    trainingSet = [idx for idx in range(numDocs) if idx not in heldOut]

    trainMat = [bagOfWord2VecMN(vocabList, docList[idx]) for idx in trainingSet]
    trainClasses = [classList[idx] for idx in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWord2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))
    return vocabList, p0V, p1V
vocabList, pSF, pNY=localWords(ny, sf)
the error rate is:  0.5
def getTopWords(ny, sf):
    """Print the words most indicative of each feed.

    Trains a fresh classifier through localWords(ny, sf), keeps the words
    whose per-class value exceeds -6.5 and prints them per class, highest
    value first. Returns nothing.

    Parameters:
        ny - parsed feed passed to localWords as class 1
        sf - parsed feed passed to localWords as class 0
    """
    vocabList, p0V, p1V = localWords(ny, sf)
    cutoff = -6.5

    # (word, value) pairs that clear the cutoff, one list per class.
    topSF = [(word, value) for word, value in zip(vocabList, p0V) if value > cutoff]
    topNY = [(word, value) for word, value in zip(vocabList, p1V) if value > cutoff]

    sections = (
        ("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF", topSF),
        ("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY", topNY),
    )
    for banner, scoredWords in sections:
        # Each element is a (word, value) tuple; rank by the value.
        ranked = sorted(scoredWords, key=lambda pair: pair[1], reverse=True)
        print(banner)
        for word, _ in ranked:
            print(word)
getTopWords(ny, sf)

你可能感兴趣的:(机器学习)