Memories of a Coding Life --- 2018-07-12

Naive Bayes text classification from Machine Learning in Action, Chapter 4: a spam filter over the book's sample emails, and a region classifier over two Craigslist RSS feeds.

import random
import re

from numpy import array

def textParse(bigString):
    # r'\W+' splits on runs of non-word characters; the book's original
    # r'\W*' also matches empty strings and, under Python 3.7+, splits the
    # text into single characters
    listOfTokens = re.split(r'\W+', bigString)
    # Keep only tokens longer than two characters, lowercased
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
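
The routines below also call createVocabList, setOfWords2Vec, bagOfWords2VecMN, trainNB0, and classifyNB, which this post never defines. A minimal sketch of them, assuming the standard definitions from the same chapter of Machine Learning in Action (bayes.py):

from numpy import ones, log

def createVocabList(dataSet):
    # Vocabulary = union of all tokens seen in any document
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    # Set-of-words model: mark each vocabulary word present (1) or absent (0)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec

def bagOfWords2VecMN(vocabList, inputSet):
    # Bag-of-words model: count how many times each vocabulary word occurs
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    # Naive Bayes training with Laplace smoothing (counts start at 1,
    # denominators at 2); probabilities kept in log space to avoid underflow
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Compare log-posteriors log P(w|c) + log P(c); the larger wins
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0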

def spamTest():

    docList = []; classList = []; fullText = []

    for i in range(1,26):

        # latin-1 tolerates the non-UTF-8 bytes in some of the sample emails
        wordList = textParse(open('email/spam/%d.txt' % i, encoding='latin-1').read())

        docList.append(wordList)

        fullText.extend(wordList)

        classList.append(1)

        wordList = textParse(open('email/ham/%d.txt' % i, encoding='latin-1').read())

        docList.append(wordList)

        fullText.extend(wordList)

        classList.append(0)

    vocabList = createVocabList(docList)

    # Must be a list, not a range object, so entries can be deleted below
    trainingSet = list(range(50)); testSet = []

    # Randomly hold out 10 messages as the test set
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []; trainClasses = []

    for docIndex in trainingSet:

        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))

        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0

    for docIndex in testSet:

        wordVector = setOfWords2Vec(vocabList, docList[docIndex])

        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:

            errorCount += 1

    errorRate = float(errorCount) / len(testSet)
    print('the error rate is:', errorRate)
    return errorRate

spamTest()
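
Because the 10 test messages are drawn at random, the printed error rate fluctuates from run to run; averaging several runs gives a steadier estimate (this relies on spamTest() returning the error rate as well as printing it, as in the version above):

rates = [spamTest() for _ in range(10)]
print('average error rate over 10 runs:', sum(rates) / len(rates))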

def calcMostFreq(vocabList, fullText):

    import operator

    freqDict = {}

    for token in vocabList:

        freqDict[token] = fullText.count(token)

    # dict.items() replaces the Python 2-only iteritems()
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)

    return sortedFreq[:30]
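
A quick sanity check of calcMostFreq on made-up toy data (the tokens here are hypothetical, purely for illustration):

vocab = ['the', 'flea', 'dog']
text = ['the', 'dog', 'the', 'flea', 'the']
print(calcMostFreq(vocab, text))
# -> [('the', 3), ('flea', 1), ('dog', 1)]: (token, count) pairs, most frequent first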

def localWords(feed1, feed0):

    import feedparser

    docList = []; classList = []; fullText = []

    minLen = min(len(feed1['entries']),len(feed0['entries']))

    for i in range(minLen):

        wordList = textParse(feed1['entries'][i]['summary'])

        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        wordList = textParse(feed0['entries'][i]['summary'])

        docList.append(wordList)

        fullText.extend(wordList)

        classList.append(0)

    vocabList = createVocabList(docList)

    top30Words = calcMostFreq(vocabList, fullText)

    for pairW in top30Words:

        if pairW[0] in vocabList:

            vocabList.remove(pairW[0])

    # Again a mutable list, so test entries can be deleted below
    trainingSet = list(range(2*minLen)); testSet = []

    # Randomly hold out 20 entries as the test set
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat=[]; trainClasses=[]

    for docIndex in trainingSet:

        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))

        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0

    for docIndex in testSet:

        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])

        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:

            errorCount += 1

    print('the error rate is:', float(errorCount)/len(testSet))

    return vocabList, p0V, p1V
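
Two details worth noting: the 30 most frequent words are removed from the vocabulary because such high-frequency, mostly stop-word-like tokens dominate the counts and hurt accuracy, and localWords uses the bag-of-words vectors (bagOfWords2VecMN) rather than the binary set-of-words vectors of spamTest, since how often a word occurs carries useful signal here.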

import feedparser

# These Craigslist RSS endpoints may no longer be served; any two RSS feeds
# whose entries include a 'summary' field can stand in for them.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')

def getTopWords(ny,sf):


    vocabList, p0V, p1V = localWords(ny, sf)

    topNY = []; topSF = []

    for i in range(len(p0V)):
        # Keep words whose log-probability clears the -6.0 threshold
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    # Sort by log-probability, highest first
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)

    print('SF**SF**SF**SF**SF**')

    for item in sortedSF:

        print(item[0])

    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)

    print('NY**NY**NY**NY**NY**')

    for item in sortedNY:

        print(item[0])
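
The -6.0 cutoff is a threshold on the log-probabilities returned by trainNB0 (e^-6 ≈ 0.0025), so only words that are reasonably probable in each class get printed; raising the threshold shortens the lists, lowering it lengthens them.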

getTopWords(ny, sf)

localWords(ny, sf)
