- 对于python3的编码和解码问题 encode及decode
- 由于encode及decode问题带来的python3 中 'r' 和 'rb' 问题
- range(50)需要list(range(50))可以返回列表形式
- set()的 | 并集操作
- python正则表达式,re的pattern及match search的方法
import numpy as np
def loadDataSet():
    """Return the toy corpus of tokenized posts and their abuse labels.

    Returns:
        tuple: (postingList, classVec) — a list of token lists and a
        parallel list marking each post as abusive (1) or not (0).
    """
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    class_vec = [0, 1, 0, 1, 0, 1]
    return posting_list, class_vec
def createVocabList(dataSet):
    """Build the vocabulary: every unique token across all documents.

    Args:
        dataSet: iterable of documents, each a list of tokens.

    Returns:
        list: the unique tokens (order is whatever set iteration yields).
    """
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)  # same effect as vocab |= set(doc), no temp set
    return list(vocab)
# Build the vocabulary from the toy corpus and show it (a plain list).
listOfPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOfPosts)
print(type(myVocabList))
print(myVocabList)
['posting', 'so', 'my', 'take', 'ate', 'not', 'help', 'mr', 'cute', 'please', 'quit', 'I', 'licks', 'has', 'park', 'love', 'buying', 'is', 'dalmation', 'him', 'how', 'steak', 'stupid', 'stop', 'dog', 'flea', 'problems', 'worthless', 'food', 'maybe', 'garbage', 'to']
# set() drops duplicates: this yields {1, 2, 3}.
a = set([1,2,3,1])
a
{1, 2, 3}
def setOfWord2Vec(vocablist, inputSet):
    """Convert a document into a set-of-words (binary) vector.

    Args:
        vocablist: list of vocabulary tokens; vector positions follow it.
        inputSet: iterable of tokens in the document.

    Returns:
        list[int]: 1 at each position whose vocab word appears in the
        document, 0 elsewhere. Out-of-vocabulary words are reported
        and skipped.
    """
    # Precompute word -> position once (first occurrence wins, matching
    # list.index), turning each lookup from O(len(vocablist)) into O(1).
    word2index = {}
    for i, word in enumerate(vocablist):
        word2index.setdefault(word, i)
    returnVec = [0] * len(vocablist)
    for word in inputSet:
        if word in word2index:
            returnVec[word2index[word]] = 1
        else:
            print("the word: {0} is not in vocablist".format(word))
    return returnVec
# Vectorize post 3 ('stop posting stupid worthless garbage') against the vocab.
print(setOfWord2Vec(myVocabList, listOfPosts[3]))
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]
# Demo of index/find on str vs list. NOTE: 'str' shadows the builtin here.
str = 'abca'
lis = ['a','b','a']
print(str.index('a'))    # 0 — first occurrence
print(str.index('a',1))  # 3 — search starts at position 1
print(str.find('a'))     # 0
print(str.find('a',2))   # 3
print(str.rfind('a'))    # 3 — last occurrence
print(str.rindex('a'))   # 3
print(lis.find('a'))     # raises AttributeError: list has no find() method
0
3
0
3
3
3
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
in ()
8 print(str.rindex('a'))
9 #list没有find方法
---> 10 print(lis.find('a'))
AttributeError: 'list' object has no attribute 'find'
def trainNB0(trainMatrix, trainCategory):
    """Train naive Bayes conditional probabilities from word vectors.

    Args:
        trainMatrix: 2-D structure (list of lists or ndarray) of
            per-document word-count/binary vectors.
        trainCategory: per-document labels, 1 = abusive, 0 = not.

    Returns:
        tuple: (p0Vect, p1Vect, pAbusive) — smoothed per-word
        probabilities for class 0 and class 1, and the prior P(class 1).
    """
    doc_total = len(trainMatrix)
    word_total = len(trainMatrix[0])
    # Prior: fraction of documents labelled abusive.
    pAbusive = sum(trainCategory) / float(doc_total)
    # Laplace smoothing: counts start at 1 and denominators at 2 so no
    # word ever gets probability zero.
    numerators = {0: np.ones(word_total), 1: np.ones(word_total)}
    denominators = {0: 2.0, 1: 2.0}
    for row, label in zip(trainMatrix, trainCategory):
        numerators[label] += row
        denominators[label] += sum(row)
    return (numerators[0] / denominators[0],
            numerators[1] / denominators[1],
            pAbusive)
# Turn every post into a binary word vector and train the model on them.
trainMat = []
for postinDoc in listOfPosts:
    trainMat.append(setOfWord2Vec(myVocabList, postinDoc))
p0V,p1V,pAb = trainNB0(trainMat, listClasses)
print(trainMat)
print(p0V)
print(p1V)
print(pAb)
[[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0]]
[ 0.03846154 0.07692308 0.15384615 0.03846154 0.07692308 0.03846154
0.07692308 0.07692308 0.07692308 0.07692308 0.03846154 0.07692308
0.07692308 0.07692308 0.03846154 0.07692308 0.03846154 0.07692308
0.07692308 0.11538462 0.07692308 0.07692308 0.03846154 0.07692308
0.07692308 0.07692308 0.07692308 0.03846154 0.03846154 0.03846154
0.03846154 0.07692308]
[ 0.0952381 0.04761905 0.04761905 0.0952381 0.04761905 0.0952381
0.04761905 0.04761905 0.04761905 0.04761905 0.0952381 0.04761905
0.04761905 0.04761905 0.0952381 0.04761905 0.0952381 0.04761905
0.04761905 0.0952381 0.04761905 0.04761905 0.19047619 0.0952381
0.14285714 0.04761905 0.04761905 0.14285714 0.0952381 0.0952381
0.0952381 0.0952381 ]
0.5
# Prior probability of the abusive class (3 of the 6 posts): 0.5.
pAb
0.5
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    """Classify a binary word vector with naive Bayes in log space.

    Args:
        vec2Classify: binary (0/1) word vector for the document.
        p0Vect, p1Vect: per-word conditional probabilities per class.
        pClass1: prior probability of class 1.

    Returns:
        int: 1 if the class-1 score is higher, else 0.
    """
    # Log probabilities turn products of many small numbers into sums,
    # avoiding floating-point underflow.
    score1 = sum(vec2Classify * np.log(p1Vect)) + np.log(pClass1)
    score0 = sum(vec2Classify * np.log(p0Vect)) + np.log(1 - pClass1)
    return 1 if score1 > score0 else 0
def testingNB():
    """End-to-end smoke test: train on the toy corpus, classify two docs."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    train_matrix = [setOfWord2Vec(vocab, doc) for doc in posts]
    p0V, p1V, pAb = trainNB0(train_matrix, labels)
    # Expected: the first document classifies as 0, the second as 1.
    for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        doc_vec = np.array(setOfWord2Vec(vocab, entry))
        print('{0} classified as : {1}'.format(entry, classifyNB(doc_vec, p0V, p1V, pAb)))
# Run the smoke test; expected output: first doc -> 0, second doc -> 1.
testingNB()
['love', 'my', 'dalmation'] classified as : 0
['stupid', 'garbage'] classified as : 1
def textParse(bigString):
    """Tokenize raw text: split on non-word runs, lowercase, and keep
    only tokens longer than two characters.

    Args:
        bigString: raw text string, or None.

    Returns:
        list[str]: lowercased tokens of length > 2. Falsy input yields
        [] — the original implicitly returned None for None input,
        which crashed any caller that iterated the result.
    """
    import re
    if not bigString:
        return []
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Train and test the naive Bayes spam filter on the email corpus.

    Reads 25 spam and 25 ham messages (files 1.txt .. 25.txt under
    Windows-style paths email\\spam and email\\ham), holds out 10 random
    documents as a test set, trains on the other 40, and prints the
    test-set error rate.
    """
    docList = []
    classList = []
    fullText = []
    # BUG FIX: range(26) produced 52 documents (and referenced 0.txt),
    # but the training index list below is range(50); the corpus files
    # are numbered 1..25, so iterate range(1, 26) for exactly 50 docs.
    for i in range(1, 26):
        # 'with' guarantees each file handle is closed.
        with open('email\\spam\\%d.txt' % i, 'r') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open('email\\ham\\%d.txt' % i, 'r') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))  # indices into docList
    testSet = []
    # Randomly move 10 document indices into the held-out test set.
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainingMat = []
    trainingClasses = []
    for docIndex in trainingSet:
        trainingMat.append(setOfWord2Vec(vocabList, docList[docIndex]))
        trainingClasses.append(classList[docIndex])
    p0V, p1V, pAb = trainNB0(np.array(trainingMat), np.array(trainingClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWord2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pAb) != classList[docIndex]:
            errorCount += 1
    print('the error rate is : ', float(errorCount) / len(testSet))
# Run the spam-filter experiment (requires the email/spam and email/ham files).
spamTest()
the error rate is : 0.1
# encode() returns bytes; '\0xab' is the NUL escape '\0' followed by the
# literal characters 'xab', hence the b'\x00xab' result below.
'ab'.encode('gbk')
'\0xab'.encode('gbk')
b'\x00xab'
# Detect a file's encoding with chardet (third-party); open in binary
# mode ('rb') so the raw bytes are inspected, not decoded text.
# NOTE(review): the handle is never closed — a 'with' block would be safer.
import chardet
f = open('email\\ham\\6.txt','rb')
chardet.detect(f.read())
{'confidence': 1.0, 'encoding': 'ascii', 'language': ''}
# In Python 3, range() is lazy; wrap it in list() to materialize the values.
print(list(range(50)))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]