https://github.com/joelgrus/data-science-from-scratch/blob/master/code/natural_language_processing.py
#imports (note: in a real script the __future__ import must come first)
from __future__ import division
import requests, json, sys, re, csv, math, random
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from dateutil.parser import parse
from collections import Counter, defaultdict, deque
from functools import partial
from pandas import Series, DataFrame
--------------------------------------------CP1 Introduction
---------1.3.1 finding key connectors
users=[
{'id':0,'name':'zieox'},
{'id':1,'name':'jack'},
{'id':2,'name':'rose'},
{'id':3,'name':'nike'},
{'id':4,'name':'dick'},
{'id':5,'name':'lucco'},
{'id':6,'name':'zpoo'},
{'id':7,'name':'sqqpo'},
{'id':8,'name':'dv'},
{'id':9,'name':'sad'},
{'id':10,'name':'kobe'}]
friends=[(0,1),(0,2),(1,2),(1,3),(2,3),(3,4),(4,5),(5,6),(5,7),(6,8),(7,8),(8,9)]
#add an empty 'friend' list to each user
for user in users:
user["friend"]=[]
#populate the friend lists from the (i,j) pairs
for i,j in friends:
users[i]["friend"].append(users[j]['name'])
users[j]["friend"].append(users[i]['name'])
#a function returning the number of friends a user has
def num_of_friend(user):return len(user["friend"])
total_conn=sum(num_of_friend(user) for user in users)
num_users=len(users)
#average number of friends per user
avg_conn=total_conn/num_users
#build (user_id, num_friends) pairs
n_f_nyid=[(user['id'],num_of_friend(user)) for user in users]
#sorted(n_f_nyid,key= lambda ab:ab[1],reverse=True)#
#n_f_nyid=[(avg_conn,num_of_friend(user)) for user in users]
#a=pd.DataFrame(n_f_nyid,index=[user['name'] for user in users]).plot()
---1.3.2 Data scientists you may know
#the name-based friend lists above are limited; we want, for each user id, the ids of their friends' friends
def f_of_fid(user):
    #note: this assumes each entry of user['friend'] is a user dict; the lists
    #built above hold names, so see the dict-based rebuild after the commented block below
    return [foaf['id']
            for friend in user['friend']
            for foaf in friend['friend']]
##print([friend['id'] for friend in users[0]['friend']])##
#u=[{user['id']:user['fr_id']} for user in users] #'fr_id' is never defined; kept for reference
#observation: people can meet through friends of friends,
#so count mutual friends while excluding users who are already friends:
'''from collections import Counter
def not_the_same(user,other_user):
return user['id']!=other_user['id']
def not_fr(user,other_user):
return all(not_the_same(friend,other_user) for friend in user['friend'])
def f_of_fid(user):
return Counter(foaf['id']
for friend in user['friend']
for foaf in friend['friend']
if not_the_same(user,foaf)
and not_fr(user,foaf))
print(f_of_fid(users[3]))'''
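#a minimal sketch so the friends-of-friends counting above actually runs: store
#user dicts under a new key 'friends' (my own naming) instead of names
from collections import Counter
for user in users:
    user['friends']=[]
for i,j in friends:
    users[i]['friends'].append(users[j]) #append the dict, not the name
    users[j]['friends'].append(users[i])
def not_the_same(user,other_user):
    return user['id']!=other_user['id']
def not_friends(user,other_user):
    return all(not_the_same(friend,other_user) for friend in user['friends'])
def friends_of_friend_ids(user):
    return Counter(foaf['id']
        for friend in user['friends']
        for foaf in friend['friends']
        if not_the_same(user,foaf) and not_friends(user,foaf))
print(friends_of_friend_ids(users[3])) --->Counter({0: 2, 5: 1})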
interest=[(0,"hive"),(0,"spark"),(0,"python"),(0,"java"),
(0,"scala"),(0,"tableau"),(1,"python"),(1,"java"),(1,"scala"),
(2,"hadoop"),(2,"python"),(3,"sql"),(3,"impala"),(3,"go"),(4,"perl"),
(4,"java"),(4,"hive"),(5,"python"),(5,"mapreduce"),(5,"java"),(6,"C++"),
(6,"java"),(6,"vba"),(7,"tableau"),(7,"ppt"),(8,"python"),(8,"java"),
(9,"python"),(9,"hadoop"),(9,"scala")]
#attach each user's interests to the users list
for i in users:
i['interest']=[]
for i,j in interests:
users[i]['interest'].append(j)
---#find the user ids who share a given interest
def data_s_who_like(target_interest):
    return [user_id for user_id,user_interest in interests if user_interest==target_interest]
from collections import defaultdict
#keys are interests, values are lists of user_ids with that interest
user_id_by_interest=defaultdict(list)
for user_id,interest in interests:
user_id_by_interest[interest].append(user_id)
#find who shares the most interests with a user; we also need the reverse map
interests_by_user_id=defaultdict(list)
for user_id,interest in interests:
    interests_by_user_id[user_id].append(interest)
def most_common_interest_with(user):
    return Counter(interested_user_id
        for interest in interests_by_user_id[user['id']]
        for interested_user_id in user_id_by_interest[interest]
        if interested_user_id!=user['id'])
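#quick usage check of the function above
print(most_common_interest_with(users[0]))
#expected: Counter({1: 3, 4: 2, 5: 2, 8: 2, 9: 2, 2: 1, 6: 1, 7: 1})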
---------1.3.3 salaries and years of experience
sal_year=[(83000,8.7),(88000,8.1),(48000,0.7),(76000,6),
(69000,6.5),(76000,7.5),(60000,2.5),(83000,10),(48000,1.9),(63000,4.2)]
#build a dict mapping years of experience to lists of salaries
sal_byear=defaultdict(list)
for sal,year in sal_year:
sal_byear[year].append(sal)
avg_sal_byear={year:sum(sal)/len(sal)
for year,sal in sal_byear.items()}
#bucket the years of experience
def y_bucket(year):
if year <2:
return "less than 2"
elif year<5:
return"between 2 and 5"
else:
return "more than 5"
sal_byear_bucket=defaultdict(list)
for sal,year in sal_year:
bucket=y_bucket(year)
sal_byear_bucket[bucket].append(sal)
avg_sal_bucket={bucket:sum(sals)/len(sals)
    for bucket,sals in sal_byear_bucket.items()}
#count the words appearing in everyone's interests
words_count=Counter(word for user,interest in interests
    for word in interest.lower().split())
for word,count in words_count.most_common():
if count>1:
print(word,count)
##counting with a plain dict vs. defaultdict (see link below)
http://kodango.com/understand-defaultdict-in-python
strings = ('puppy', 'kitten', 'puppy', 'puppy',
'weasel', 'puppy', 'kitten', 'puppy')
counts = {}
for kw in strings:
if kw not in counts:
counts[kw] = 1
else:
counts[kw] += 1
#setdefault method
for kw in strings:
counts.setdefault(kw, 0)
counts[kw] += 1
--------------------------------------------CP2 Python basics
for i in [1,2,3,4,5]:
print(i)
for j in [1,2,3,4,5]:
print(j)
print(j+i)
print(i)
print('done looping')
#note: renamed to avoid shadowing the list builtin; the book calls this function apply_to_one
lst=[[1,2,3],[4,5,6],[7,8,9]]
def apply_to_one(f):
    return f(1)
y=apply_to_one(lambda x:x+4) --->5
#list + values
lst.extend([11,12,13])
lst+[15]
lst.append(15)
#assignment and unpacking
_,y=[1,2]
x,y=1,2
x,y=y,x
------dict
grades={'zieox':90,'jack':88}
grades['jack'] --->88
grades['kobe']=99
grades.get('zieox',0)
grades.keys() --->dict_keys(['zieox', 'jack', 'kobe'])
grades.values() --->dict_values([90, 88, 99])
grades.items() --->dict_items([('zieox', 90), ('jack', 88), ('kobe', 99)])
#word counting: plain dict, defaultdict, and Counter
strings = ('puppy', 'kitten', 'puppy', 'puppy',
'weasel', 'puppy', 'kitten', 'puppy','ppp')
counts = {}
#method-1
for word in strings:
    if word not in counts:
        counts[word] = 1
    else:
        counts[word] += 1
#method-2
for word in strings:
try:
counts[word]+=1
except KeyError:
counts[word]=1
#method-3
for word in strings:
    p_count=counts.get(word,0)
    counts[word]=p_count+1
#the methods above are clumsy; defaultdict is simpler
from collections import defaultdict
counts=defaultdict(int)
for word in strings:
counts[word]+=1
#the Counter class from collections is the simplest of all
from collections import Counter
---
c=Counter(strings)
for i,j in c.most_common(10):
print(i,j)
---this is the easiest method
------set
there are two reasons to use sets
#1. 'in' is a very fast operation on sets
s=set()
for i in range(30):
s.add(i)
sl=['a','an','at']+['yet','you']
'zip' in sl #this has to scan every element of the list
sl_set=set(sl)
'zip' in sl_set #much faster as a set membership check
#2. sets make it easy to find the distinct items in a collection
l=[1,2,4,5,6,7,8,9,9,9,8]
numl=len(l)
num_distinct_set=len(set(l))
------control flow
#conditional expression (assumes an integer x is in scope)
p='even' if x%2==0 else 'odd'
#while loop
x=0
while x<10:
print(x,'less than 10')
x+=1
#for if
for x in range(10):
if x==3:
continue
if x==5:
break
print(x)
------sorting
l=[1,2,4,5,6,7,8,9,9,9,8]
l2=sorted(l)
l.sort()
x=sorted([-1,4,99,-8],key=abs,reverse=True)
w=sorted(counts.items(),key=lambda ab:ab[1],reverse=True)
------list comprehensions
even=[x for x in range(5) if x%2==0]
square=[x*x for x in range(5)]
even_s=[x*x for x in range(5) if x%2==0]
#dict and set comprehensions
square={x:x*x for x in range(5)}
pair=[(x,y) for x in range(10) for y in range(10)]
increase_pair=[(x,y) for x in range(10) for y in range(x+1,10)]
------generators and iterators
def lazy_r(n):
i=0
while i < n:
yield i
i+=1
def natural_num():
n=1
while True:
yield n
n+=1
#way 2: a generator comprehension
lazy_evens_below20=(i for i in lazy_r(20) if i%2==0)
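#usage sketch: generators yield values on demand, so the infinite natural_num()
#must be sliced (itertools.islice) rather than materialized
from itertools import islice
for i in lazy_r(3):
    print(i) --->0 1 2
print(list(islice(natural_num(),5))) --->[1, 2, 3, 4, 5]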
------randomness
import random
f4 = [random.random() for _ in range(4)]
#set the random seed to 10
random.seed(10)
random.random()
#re-seeding with 10 makes random.random() reproduce the same value
#random choice within a range
random.randrange(10) --->picks one value from 0-9
random.randrange(3,6) --->picks one value from [3,4,5]
#random.shuffle reorders a list in place
t1=[ i for i in range(10)]
random.shuffle(t1)
print(t1)
fr=['zieox','jack','rose','nike','dick']
#random.choice picks one element from a list
random.choice(fr)
#random.sample picks a sample of elements without replacement
num=[i for i in range(60)]
win=random.sample(num,6) --->6 samples without replacement
ff=[random.choice(range(10)) for _ in range(4)] --->4 samples with replacement
------object-oriented programming
#roll our own Set class
class Set:
    def __init__(self,values=None):
        self.dict={}
        if values is not None:
            for value in values:
                self.add(value)
    def __repr__(self):
        return "Set: "+str(self.dict.keys())
    def add(self,value):
        self.dict[value]=True
    def contains(self,value):
        return value in self.dict
    def remove(self,value):
        del self.dict[value]
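#usage check for the hand-rolled Set above
s1=Set()
s2=Set([1,2,2,3])
s2.add(4)
print(s2.contains(4)) --->True
s2.remove(3)
print(s2.contains(3)) --->False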
------functional tools: functools
def exp(base,power):return base**power #the n**m function
def two_to_the(power):return exp(2,power) #the 2**m function
from functools import partial
two_to_the=partial(exp,2) --->a function of the one remaining argument
print(two_to_the(3)) ---> 2**3
square=partial(exp,power=2) --->the n**2 function
def double(x):
return 2*x
xs=[1,2,3,4]
twice_xs=[double(x) for x in xs]
#equivalent: twice_xs=map(double,xs) (wrap in list() on Python 3)
list_double=partial(map,double) --->a function that doubles a list
twice_xs=list_double(xs) --->same as above
---map <mapping>
def x2(x,y):return x*y
pro=map(x2,[1,2],[3,4]) --->[1*3,2*4]=[3,8]
---filter <filtering>
def is_even(x):
return x%2==0
x_evens=[x for x in xs if is_even(x)]
#equivalent: x_evens=filter(is_even,xs) (wrap in list() on Python 3)
list_evener=partial(filter,is_even)
x_evens=list_evener(xs)
---reduce (Python 3: from functools import reduce)
x_product=reduce(x2,xs) --->1*2*3*4=24
list_product=partial(reduce,x2) --->a function that reduces a list
x_product=list_product(xs) --->same as above, 24
------enumerate <==> (index,element)
fr=['zieox','jack','rose','nike','dick']
for i,j in enumerate(fr):
print(i,j)
------zip and argument unpacking
list1=[1,2,3,4]
list2=['a','b','c','d']
zip(list2,list1) --->[('a',1),('b',2),('c',3),('d',4)]
pairs=[('a',1),('b',2),('c',3),('d',4)]
a,b=zip(*pairs)
zip(('a',1),('b',2),('c',3),('d',4)) --->(('a','b','c','d'),(1,2,3,4))
def add(a,b):return a+b
add(1,2) --->3
add(*[1,2]) --->3
------args & kwargs
def doubler(f):
def g(x):
return 2*f(x)
return g
def f1(x):
return x+1
g=doubler(f1)
#the approach above breaks for functions that take more than one argument,
#so we use argument unpacking and "magic"
def magic(*args,**kwargs):
print('unnamed args:',args)
print('keyword args:',kwargs)
magic(1,2,key='word',key2='word2')
def other_way_magic(x,y,z):
return x+y+z
x_y_list=[1,2]
z_dict={'z':3}
print(other_way_magic(*x_y_list,**z_dict))
p33
def doubler_correct(f):
def g(*args,**kwargs):
return 2*f(*args,**kwargs)
return g
def f2(x,y):
    return x+y
g=doubler_correct(f2)
print(g(1,2)) --->6
--------------------------------------------CP3 visualization
#example-matplotlib
from matplotlib import pyplot as pl
year=[1950,1960,1970,1980,1990,2000,2010]
gdp=[300.2,543.3,1075.9,2862.5,5979.6,10289.7,14958.3]
pl.plot(year,gdp,color='green',marker='o',linestyle='solid')
pl.title('name-GDP')
pl.ylabel('billion$')
pl.show()
------bar charts
#coding:utf-8
movies=['old boy','revenages','iron-man','starwar','HelloKitty']
num_oscars=[11,65,53,26,11]
#in Python 3.6 this places each bar to the left of its center
xs=[i+0.1 for i ,_ in enumerate(movies)]
pl.bar(xs,num_oscars)
pl.ylabel('num of get_oscar')
pl.title('my favourite movie')
pl.xticks([i+0.5 for i ,_ in enumerate(movies)],movies)
pl.show()
#example3
from collections import Counter
grades=[86,45,46,79,13,49,79,65,35,46]
#// is floor division
decile=lambda grade:grade//10*10 #i.e. y=(x//10)*10
#count with Counter
histogram=Counter(decile(grade) for grade in grades)
#plot bars for the histogram keys/values, with width 8
pl.bar([x for x in histogram.keys()],histogram.values(),8)
#set the x and y axis ranges
pl.axis([-5,105,0,5])
#set the x-axis tick marks
pl.xticks([10*i for i in range(11)])
pl.xlabel('+scorelike')
pl.ylabel('num of students')
pl.title('score-pic')
pl.show()
#example4
mentions=[500,505]
years=[2013,2014]
pl.bar(years,mentions,0.8)
pl.xticks(years)
pl.ylabel('mentioned ds')
pl.ticklabel_format(useOffset=False)
pl.axis([2012.5,2014.5,499,506])
pl.title('look this is a big change')
pl.show()
#now we set x,y -value_range
pl.axis([2013,2015,0,550])
pl.show()
------line charts
'''#easy way
variance=[2**(i-1) for i in range(1,10)]
bias_squared=sorted(variance,reverse=True)'''
variance=[1, 2, 4, 8, 16, 32, 64, 128, 256]
bias_squared=[256, 128, 64, 32, 16, 8, 4, 2, 1]
total_error=[x+y for x,y in zip(variance,bias_squared)]
xs=[i for i,_ in enumerate(variance)]
pl.plot(xs,variance,'g-',label='variance')
pl.plot(xs,bias_squared,'r-.',label='bias^2')
pl.plot(xs,total_error,'b:',label='total_error')
#loc=9 puts the legend box at the top center
pl.legend(loc=9)
pl.xlabel('model-complex_value')
pl.title('bias - var picture')
pl.show()
------scatter plots
import random
'''friends=[random.randint(1,100) for _ in range(10)]
minutes=[random.randint(100,200) for _ in range(10)]'''
#use fixed values instead of random numbers here
friends=[61, 73, 80, 93, 13, 26, 57, 59, 88, 84]
minutes=[157, 184, 101, 198, 196, 158, 178, 150, 113, 154]
labels=['a','b','c','d','e','f','g','h','i','j']
pl.scatter(friends,minutes)
for label,friend_count,minute_count in zip(labels,friends,minutes):
pl.annotate(label,
xy=(friend_count,minute_count),
xytext=(-5,5),
textcoords='offset points')
pl.title('num_minutes&friends')
pl.xlabel('num of friend')
pl.ylabel('minutes of spending')
pl.show()
'''#experiment: put tags on the bars with annotate; xytext is the offset, axis sets the x/y value ranges
pl.bar([i for i in range(10)],minutes)
pl.axis([-1,10,0,200])
for label,i,j in zip(labels,[i for i in range(10)],minutes):
pl.annotate(label,
xy=(i,j),
xytext=(-5,5),
textcoords='offset points')'''
--------------------------------------------CP4 linear algebra
------4.1 vectors
height_weight_age=[70,170,40]
grades=[95,80,75,62]
def vector_add(v,w):
return[v_i+w_j for v_i,w_j in zip(v,w)]
def vector_subtract(v,w):
return[v_i-w_j for v_i,w_j in zip(v,w)]
'''def vector_sum(vectors):
result=vectors[0]
for vector in vectors[1:]:
result=vector_add(result,vector)
return result'''
def vector_sum(vectors):
    return reduce(vector_add,vectors) #Python 3: from functools import reduce
#vector_sum=partial(reduce,vector_add)
def scalar_multiply(c,v):
return [c*v_i for v_i in v]
def vector_mean(vectors):
n=len(vectors)
return scalar_multiply(1/n,vector_sum(vectors))
#dot product
def dot(v,w):
return sum(v_i*w_i for v_i,w_i in zip(v,w))
#sum of squares of a vector
def sum_of_squares(v):
return dot(v,v)
#magnitude (length) of a vector
import math
def magnitude(v):
return math.sqrt(sum_of_squares(v))
def squared_distance(v,w):
return sum_of_squares(vector_subtract(v,w))
def distance(v,w):
return magnitude(vector_subtract(v,w))
------4.2 matrices
a=[[1,2,3],[4,5,6]]
b=[[1,2],[3,4],[5,6]]
def shape(A):
    num_rows=len(A)
    num_cols=len(A[0]) if A else 0
    return num_rows,num_cols
#get_row
def get_row(A,i):
return A[i]
#get_col
def get_column(A,j):
return [A_i[j] for A_i in A]
#build an identity matrix: 1s on the diagonal, 0s elsewhere
def make_matrix(num_rows,num_cols,entry_fn):
return [[entry_fn(i,j) for j in range(num_cols)] for i in range(num_rows)]
def is_diagonal(i,j):
return 1 if i==j else 0
identity_matrix=make_matrix(5,5,is_diagonal)
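#sanity check on make_matrix
print(identity_matrix)
#--->[[1, 0, 0, 0, 0],
#     [0, 1, 0, 0, 0],
#     [0, 0, 1, 0, 0],
#     [0, 0, 0, 1, 0],
#     [0, 0, 0, 0, 1]]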
data=[[70,170,40],[65,120,26],[77,250,19]]
---------------------------------------------CP5 basic statistics
------basic_item
#histogram of friend counts
num_friends=[11,65,53,26,11]*3
daily_minutes=[10,88,76,20,10]
from collections import Counter
import matplotlib.pyplot as pl
friends=Counter(num_friends)
xs=range(101)
ys=[friends[x] for x in xs]
pl.bar(xs,ys)
pl.axis([0,101,0,25])
pl.title('friend_num')
pl.xlabel('num of friends')
pl.ylabel('num_peo')
pl.show()
num_point=len(num_friends)
largest_value=max(num_friends)
smallest_value=min(num_friends)
sorted_values=sorted(num_friends)
smallest_value=sorted_values[0] #==min(num_friends)
largest_value=sorted_values[-1] #==max(num_friends)
print('count:',num_point,'max:',largest_value,'min:',smallest_value)
------5.1.1 central tendency
#mean
def mean(x):
return sum(x)/len(x)
mean(num_friends)
#median
def median(v):
n=len(v)
sorted_v=sorted(v)
midpoint=n//2
if n%2==1:
return sorted_v[midpoint]
else:
lo=midpoint-1
hi=midpoint
return(sorted_v[lo]+sorted_v[hi])/2
median(num_friends)
#quantile
def quantile(x,p):
p_index=int(p*len(x))
return sorted(x)[p_index]
quantile(num_friends,0.1)
#mode
def mod(x):
counts=Counter(x)
max_count=max(counts.values())
    return [x_i for x_i,count in counts.items() if count==max_count]
mod(num_friends)
------5.1.2 dispersion
#range
def data_range(x):
return max(x)-min(x)
data_range(num_friends)
#variance: sum((x_i-mean(x))**2)/(n-1)
def de_mean(x):
x_bar=mean(x)
return [x_i - x_bar for x_i in x]
def sum_of_squares(x): #same result as the CP4 dot-based version
    return sum([i**2 for i in x])
def variance(x):
n=len(x)
deviations=de_mean(x)
    return sum_of_squares(deviations)/(n-1)
variance(num_friends)
#standard deviation
def standard_deviation(x):
return math.sqrt(variance(x))
standard_deviation(num_friends)
#interquartile range: 75th percentile minus 25th percentile
def interquantile_range(x):
return quantile(x,0.75)-quantile(x,0.25)
interquantile_range(num_friends)
------5.2 correlation
#covariance (COV); dot: multiply elementwise, then sum
from numpy import dot
def covariance(x,y):
n=len(x)
return dot(de_mean(x),de_mean(y))/(n-1)
covariance(num_friends,daily_minutes)
#correlation coefficient, with value range [-1,1]
def correlation(x,y):
stdev_x=standard_deviation(x)
stdev_y=standard_deviation(y)
if stdev_x>0 and stdev_y>0:
return covariance(x,y)/stdev_x/stdev_y
else:
return 0
'''
outlier=num_friends.index(100)
num_friends_good=[x for i,x in enumerate(num_friends) if i!=outlier]
num_minutes_good=[x for i,x in enumerate(daily_minutes) if i!=outlier]
correlation(num_friends_good,num_minutes_good)
'''
------------------------------------------------CP6 probability
------6.1 independence
if E and F are independent then P(E,F)=P(E)P(F)
------6.2 conditional probability
if E and F are independent then P(E,F)=P(E)P(F)
in general P(E|F)=P(E,F)/P(F) ==> P(E,F)=P(E|F)P(F); if E and F are independent ==> P(E|F)=P(E)
---example: the boy/girl probability problem
assumptions:
1. each child is equally likely to be a boy or a girl
2. the second child's sex is independent of the first child's sex
probability that both children are girls (B) given that the older child is a girl (G):
P(B|G)=P(B,G)/P(G)=P(B)/P(G)=(1/4)/(1/2)=0.5
import random
def random_kid():
return random.choice(['boy','girl'])
random.seed(0)
both_girl=0
either_girl=0
older_girl=0
for _ in range(1000):
younger=random_kid()
older=random_kid()
if older=='girl':
older_girl+=1
if older=='girl' and younger=='girl':
both_girl+=1
if older=='girl' or younger=='girl':
either_girl+=1
print('P(both|older):',both_girl/older_girl)
print('P(both|either):',both_girl/either_girl)
------6.3 Bayes' theorem
P(E|F)=P(E,F)/P(F)=P(F|E)P(E)/P(F)
P(F)=P(F,E)+P(F,-E), where -E means E did not occur
==>P(E|F)=P(F|E)P(E)/(P(F,E)+P(F,-E))=P(F|E)P(E)/(P(F|E)P(E)+P(F|-E)P(-E))
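#a worked example of the formula, the classic disease-test calculation (also used
#in the book): a disease affects 1 in 10,000 people; the test is 99% accurate either way
p_d=0.0001       #P(D)
p_t_d=0.99       #P(T|D)
p_t_not_d=0.01   #P(T|-D), the false-positive rate
p_d_t=p_t_d*p_d/(p_t_d*p_d+p_t_not_d*(1-p_d))
print(p_d_t) --->~0.0098, so a positive test still means <1% chance of disease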
------6.4 random variables
------6.5 continuous distributions
#uniform distribution pdf
def uniform_pdf(x):
return 1 if x>=0 and x<1 else 0
#cumulative distribution function (cdf)
def uniform_cdf(x):
if x<0:return 0
elif x<1:return x
else: return 1
------6.6 the normal distribution
f(x|mu,sigma)=(1/(sqrt(2*pi)*sigma))*exp(-(x-mu)^2/(2*sigma^2))
with mu=0 and sigma=1 it is the standard normal distribution
import math
import matplotlib
import matplotlib.pyplot as pl
def normal_pdf(x,mu=0,sigma=1):
sqrt_two_pi=math.sqrt(2*math.pi)
return (math.exp(-(x-mu)**2/2/sigma**2)/(sqrt_two_pi*sigma))
xs=[x/10 for x in range(-50,50)]
pl.plot(xs,[normal_pdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1')
pl.plot(xs,[normal_pdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2')
pl.plot(xs,[normal_pdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5')
pl.plot(xs,[normal_pdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1')
pl.legend()
pl.title('pdfs of several normal distributions')
pl.show()
#cdf of the normal distribution
def normal_cdf(x,mu=0,sigma=1):
return(1+math.erf((x-mu)/math.sqrt(2)/sigma))/2
#draw some normal cdfs
pl.plot(xs,[normal_cdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1')
pl.plot(xs,[normal_cdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2')
pl.plot(xs,[normal_cdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5')
pl.plot(xs,[normal_cdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1')
pl.legend(loc=4)
pl.title('cdfs of several normal distributions')
pl.show()
#invert normal_cdf to find the value for a given probability, using binary search
def inverse_normal_cdf(p,mu=0,sigma=1,tolerance=0.00001):
    if mu!=0 or sigma!=1:
        return mu+sigma*inverse_normal_cdf(p,tolerance=tolerance)
    low_z,low_p=-10.0,0
    hi_z,hi_p=10.0,1
    while hi_z-low_z>tolerance:
        mid_z=(low_z+hi_z)/2
        mid_p=normal_cdf(mid_z)
        if mid_p<p:
            low_z,low_p=mid_z,mid_p
        elif mid_p>p:
            hi_z,hi_p=mid_z,mid_p
        else:
            break
    return mid_z
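#quick sanity check on the binary search (values approximate)
print(inverse_normal_cdf(0.975)) --->~1.96, the usual 95% two-sided z value
print(inverse_normal_cdf(0.5)) --->~0.0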
------6.7 the central limit theorem
a random variable defined as the average of a large number of independent and
identically distributed random variables is itself approximately normally distributed
def bernoulli_trial(p):
return 1 if random.random() < p else 0
def binomial(n,p):
return sum(bernoulli_trial(p) for _ in range(n))
#plot the binomial distribution against its normal approximation
def make_hist(p,n,num_points):
data=[binomial(n,p) for _ in range(num_points)]
histogram=Counter(data)
pl.bar([x-0.4 for x in histogram.keys()],[v/num_points for v in histogram.values()],0.8,color='0.75')
mu=p*n
sigma=math.sqrt(n*p*(1-p))
xs=range(min(data),max(data)+1)
ys=[normal_cdf(i+0.5,mu,sigma)-normal_cdf(i-0.5,mu,sigma) for i in xs]
pl.plot(xs,ys)
pl.title('binomial distribution vs. normal approximation')
pl.show()
make_hist(0.75,100,10000)
------------------------------------------------CP7 hypothesis testing
------------Coin-example
import math
#normal approximation to a binomial(n,p)
def normal_appro_to_binomial(n,p):
mu=p*n
sigma=math.sqrt(p*(1-p)*n)
return mu,sigma
#the normal cdf is the probability a variable lies below a threshold
normal_probability_below=normal_cdf
def normal_probability_above(lo,mu=0,sigma=1):return 1-normal_cdf(lo,mu,sigma)
def normal_probability_between(lo,hi,mu=0,sigma=1):return normal_cdf(hi,mu,sigma)-normal_cdf(lo,mu,sigma)
def normal_probability_outside(lo,hi,mu=0,sigma=1):return 1-normal_probability_between(lo,hi,mu,sigma)
def normal_upper_bound(probability,mu=0,sigma=1):return inverse_normal_cdf(probability,mu,sigma)
def normal_lower_bound(probability,mu=0,sigma=1):return inverse_normal_cdf(1-probability,mu,sigma)
def normal_two_sided_bounds(probability,mu=0,sigma=1):
    tail_probability=(1-probability)/2
    upper_bound=normal_lower_bound(tail_probability,mu,sigma)
    lower_bound=normal_upper_bound(tail_probability,mu,sigma)
    return lower_bound,upper_bound
mu0,sigma0=normal_appro_to_binomial(1000,0.5)
normal_two_sided_bounds(0.95,mu0,sigma0)
#power of the test
lo,hi=normal_two_sided_bounds(0.95,mu0,sigma0)
mu1,sigma1=normal_appro_to_binomial(1000,0.55)
type_2_probability=normal_probability_between(lo,hi,mu1,sigma1)
power=1-type_2_probability
hi=normal_upper_bound(0.95,mu0,sigma0)
type_2_probability=normal_probability_below(hi,mu1,sigma1)
power=1-type_2_probability
#two-sided test: is the coin fair?
def two_sided_p_value(x,mu=0,sigma=1):
    if x >= mu:
        return 2*normal_probability_above(x,mu,sigma)
    else:
        return 2*normal_probability_below(x,mu,sigma)
two_sided_p_value(529.5,mu0,sigma0)
#simulation check
extreme_value_count=0
for _ in range(100000):
num_heads=sum(1 if random.random()<0.5 else 0 for _ in range(1000))
if num_heads>= 530 or num_heads<=470:
extreme_value_count+=1
print(extreme_value_count/100000)
# so we got
upper_p_value=normal_probability_above
lower_p_value=normal_probability_below
#for a one-sided test: if we saw 525 heads we would compute
upper_p_value(524.5,mu0,sigma0)
upper_p_value(526.5,mu0,sigma0)
------------confidence intervals
math.sqrt(p*(1-p)/1000)
#compute p_hat from the observed counts
p_hat=525/1000
mu=p_hat
sigma=math.sqrt(mu*(1-mu)/1000)
#compute the confidence interval
normal_two_sided_bounds(0.95,mu,sigma)
------------P-hacking
def run_exp():
return [random.random()<0.5 for _ in range(1000)]
def reject(exp):
num_heads=len([flip for flip in exp if flip])
return num_heads<469 or num_heads>531
random.seed(0)
exps=[run_exp() for _ in range(1000)]
num_rejections=len([exp for exp in exps if reject(exp)])
print(num_rejections)
------------ABtest example
def estimated_parameters(N,n):
p=n/N
sigma=math.sqrt(p*(1-p)/N)
return p,sigma
def a_b_test_stas(N_A,n_A,N_B,n_B):
P_A,sigma_A=estimated_parameters(N_A,n_A)
P_B,sigma_B=estimated_parameters(N_B,n_B)
    return (P_B-P_A)/math.sqrt(sigma_A**2+sigma_B**2)
z=a_b_test_stas(1000,200,1000,180)
two_sided_p_value(z)
z=a_b_test_stas(1000,200,1000,150)
two_sided_p_value(z)
------------Bayes (the Beta distribution)
def b(alpha,beta): #the Beta function, a normalizing constant
    return math.gamma(alpha)*math.gamma(beta)/math.gamma(alpha+beta)
def beta_pdf(x,alpha,beta):
if x<0 or x>1:
return 0
return x**(alpha-1)*(1-x)**(beta-1)/b(alpha,beta)
alpha/(alpha+beta) #the mean of a Beta(alpha,beta) distribution
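#quick check of beta_pdf: Beta(1,1) is uniform on [0,1], and larger alpha+beta
#concentrates the density around the mean alpha/(alpha+beta)
print(beta_pdf(0.5,1,1)) --->1.0
print(beta_pdf(0.5,10,10)) --->~3.52, tightly centered on its mean 0.5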
------------------------------------------------CP8 gradient descent
minimize y=2*(x**2)-2*x+1 by gradient descent
x = 6
step = 0.1
for i in range(1000):
x -= step * (4 * x-2)
def y(x):
return 2 * (x ** 2)-2*x + 1
print ('x:',x,'y:',y(x))
#sum-of-squares function
def sum_of_squares(v):
return sum(v_i**2 for v_i in v)
------8.2 estimating the gradient
from functools import partial
#difference quotient
def difference_quotient(f,x,h):return(f(x+h)-f(x))/h
#x^2
def square(x):return x*x
#the derivative of x^2 is 2x
def derivative(x):return 2*x
derivative_estimate=partial(difference_quotient,square,h=0.00001)
x=range(-10,10)
pl.title('actual derivatives vs. estimates')
pl.plot(x,list(map(derivative,x)),'rx',label='Actual')
pl.plot(x,list(map(derivative_estimate,x)),'b+',label='Estimate')
pl.legend(loc=9)
pl.show()
#treat f as a function of its i-th variable with the others held fixed, giving the i-th partial derivative
def partial_difference_quotient(f,v,i,h):
w=[v_j+(h if j==i else 0) for j,v_j in enumerate(v)]
return (f(w)-f(v))/h
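#stacking the partial difference quotients estimates the whole gradient
#(this is the book's estimate_gradient helper)
def estimate_gradient(f,v,h=0.00001):
    return [partial_difference_quotient(f,v,i,h) for i in range(len(v))]
print(estimate_gradient(sum_of_squares,[1.0,2.0])) --->~[2.0,4.0], matching the exact gradient 2v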
------8.3 using the gradient p90
def step(v,direction,step_size):
return [v_i+step_size*direction_i for v_i,direction_i in zip(v,direction)]
def sum_of_squares_gradient(v):
return [2*v_i for v_i in v]
v=[random.randint(-10,10) for i in range(3)]
tolerance=0.0000001
while True:
gradient=sum_of_squares_gradient(v)
next_v=step(v,gradient,-0.01)
    if distance(next_v,v)<tolerance:
        break
    v=next_v
------------------------------------------------CP9 getting data
#from the shell, pipe a file into a stdin-reading script:
type the_bible.txt|python most_common_words.py 10
------9.2 reading files
---9.2.1 the basics of text files
#reading
file_for_reading=open('reading_file.txt','r')
#writing
file_for_writing=open('writing_file.txt','w')
#appending
file_for_appending=open('appending_file.txt','a')
file_for_writing.close()
with open(filename,'r') as f:
data=function_that_gets_data_from(f)
process(data)
starts_with_hash=0
#count the lines that start with '#'
with open('input.txt','r') as f:
    for line in f:
if re.match('^#',line):
starts_with_hash+=1
def get_domain(email_address):
return email_address.lower().split('@')[-1]
with open('email_address.txt','r') as f:
domain_counts=Counter(get_domain(line.strip()) for line in f if '@' in line)
a=open('C:/Users/zhangjiajun858/Desktop/te.txt','r')
with open('C:/Users/zhangjiajun858/Desktop/te.txt','r') as f:
    for line in f:
        print(line)
------9.2.2 delimited files
import csv
with open('C:/Users/zhangjiajun858/Desktop/te.csv','r') as f: #'rb' on Python 2; text mode on Python 3
reader=csv.reader(f,delimiter='\t')
for row in reader:
date=row[0]
symbol=row[1]
closing_price=float(row[2])
process(date,symbol,closing_price)
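#if the file has a header row, csv.DictReader (also covered in the book) yields
#each row as a dict keyed by the header names; the field names below are assumptions
with open('C:/Users/zhangjiajun858/Desktop/te.csv','r') as f:
    dict_reader=csv.DictReader(f,delimiter='\t')
    for row in dict_reader:
        process(row['date'],row['symbol'],float(row['closing_price']))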
------9.3 web scraping (p99)
---9.3.2 O'Reilly book
------9.4 using APIs
---9.4.1 JSON (and XML)
---9.4.2 using an unauthenticated API
import requests,json
endpoint="https://api.github.com/users/joelgrus/repos"
repos=json.loads(requests.get(endpoint).text)
from dateutil.parser import parse
dates=[parse(repo['created_at']) for repo in repos]
month_counts=Counter(date.month for date in dates)
weekday_counts=Counter(date.weekday() for date in dates)
---9.4.3
------9.5 EX: use Twitter API
from twython import Twython
twitter=Twython(CONSUMER_KEY,CONSUMER_SECRET)
for status in twitter.search(q='"data science"')['statuses']:
user=status['user']['screen_name'].encode('utf-8')
text=status['text'].encode('utf-8')
    print(user,':',text)
    print()
------------------------------------------------CP10 working with data
------10.1 exploring one-dimensional data
import matplotlib.pyplot as pl
import math
from collections import Counter
def bucketize(point,bucket_size):return bucket_size*math.floor(point/bucket_size)
def make_histogram(points,bucket_size):return Counter(bucketize(point,bucket_size) for point in points)
def plot_histogram(points,bucket_size,title=''):
    histogram=make_histogram(points,bucket_size)
    pl.bar(list(histogram.keys()),list(histogram.values()),width=bucket_size)
    pl.title(title)
pl.show()
random.seed(0)
uniform=[200*random.random()-100 for _ in range(1000)]
normal=[57*inverse_normal_cdf(random.random()) for _ in range(1000)]
plot_histogram(uniform,10,'uniform distribution histogram')
plot_histogram(normal,10,'normal distribution histogram')
------10.1.2 two-dimensional data
def random_normal():return inverse_normal_cdf(random.random())
xs=[random_normal() for _ in range(1000)]
ys1=[x+random_normal()/2 for x in xs]
ys2=[-x+random_normal()/2 for x in xs]
pl.scatter(xs,ys1,marker='.',color='black',label='ys1')
pl.scatter(xs,ys2,marker='.',color='gray',label='ys2')
pl.xlabel('xs')
pl.ylabel('ys')
pl.legend(loc=9)
pl.title('very different joint distributions')
pl.show()
print(correlation(xs,ys1))
print(correlation(xs,ys2))
------10.1.3 many dimensions
def correlation_matrix(data):
_,num_columns=shape(data)
def matrix_entry(i,j):
return correlation(get_column(data,i),get_column(data,j))
return make_matrix(num_columns,num_columns,matrix_entry)
import matplotlib.pyplot as pl
_,num_columns=shape(data)
fig,ax=pl.subplots(num_columns,num_columns)
for i in range(num_columns):
    for j in range(num_columns):
        if i!=j:ax[i][j].scatter(get_column(data,j),get_column(data,i))
        else:ax[i][j].annotate('series'+str(i),(0.5,0.5),xycoords='axes fraction',ha='center',va='center')
        if i<num_columns-1:ax[i][j].xaxis.set_visible(False)
        if j>0:ax[i][j].yaxis.set_visible(False)
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_ylim(ax[0][1].get_ylim())
pl.show()
------10.2 cleaning and munging
def parse_row(input_row,parsers):
    return [parser(value) if parser is not None else value for value,parser in zip(input_row,parsers)]
def parse_rows_with(reader,parsers):
for row in reader:
yield parse_row(row,parsers)
def try_or_none(f):
def f_or_none(x):
try:return f(x)
        except: return None
return f_or_none
#rewrite using try_or_none
def parse_row(input_row,parsers):
    return [try_or_none(parser)(value) if parser is not None else value for value,parser in zip(input_row,parsers)]
import dateutil.parser
data=[]
with open(...,'r') as f: #the filename was omitted in the original notes
    reader=csv.reader(f)
    for line in parse_rows_with(reader,[dateutil.parser.parse,None,float]):
data.append(line)
for row in data:
if any(x is None for x in row):
print(row)
def try_parse_field(field_name,value,parser_dict):
parser=parser_dict.get(field_name)
if parser is not None:
return try_or_none(parser)(value)
else:
return value
def parse_dict(input_dict,parser_dict):
return{field_name:try_parse_field(field_name,value,parser_dict)
        for field_name,value in input_dict.items()}
------10.3 data process P121
------10.4 rescaling data
import math
def sum_of_squares(v):
return sum(v_i**2 for v_i in v)
def magnitude(v):
return math.sqrt(sum_of_squares(v))
def vector_subtract(v,w):
return[v_i-w_j for v_i,w_j in zip(v,w)]
def distance(v,w):
return magnitude(vector_subtract(v,w))
ab=distance([63,150],[67,160])
ac=distance([63,150],[70,171])
bc=distance([67,160],[70,171])
#calculate the mean and standard deviation of each column
def scale(data_matrix):
num_rows,num_cols=shape(data_matrix)
means=[mean(get_column(data_matrix,j)) for j in range(num_cols)]
    stdevs=[standard_deviation(get_column(data_matrix,j)) for j in range(num_cols)]
return means,stdevs
#then build a new, rescaled matrix
def rescale(data_matrix):
    means,stdevs=scale(data_matrix)
    def rescaled(i,j):
        if stdevs[j]>0:
            return (data_matrix[i][j]-means[j])/stdevs[j]
        else:
            return data_matrix[i][j]
    num_rows,num_cols=shape(data_matrix)
    return make_matrix(num_rows,num_cols,rescaled)
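#usage sketch with the height/weight pairs from above (assumes mean,
#standard_deviation, shape, get_column, make_matrix from earlier chapters are in scope)
data2=[[63,150],[67,160],[70,171]]
print(rescale(data2)) --->each column now has mean 0 and standard deviation 1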
------10.5 dimensionality reduction
#de-mean the data so every dimension has mean zero
def fr_mean_matrix(a):
nr,nc=shape(a)
column_means,_ = scale(a)
return make_matrix(nr,nc,lambda i,j:a[i][j]-column_means[j])
#calculate the direction of a vector (unit length)
def direction(w):
mag=magnitude(w)
return [w_i/mag for w_i in w]
#calculate the variance of the data in direction w
def directional_variance_i(x_i,w):
return dot(x_i,direction(w))**2
def directional_variance(X,w):
return sum(directional_variance_i(x_i,w) for x_i in X)
#use gradient descent to find the direction that maximizes variance
def directional_variance_gradient_i(x_i,w):
projection_length=dot(x_i,direction(w))
return [2*projection_length*x_ij for x_ij in x_i]
def directional_variance_gradient(X,w):
return vector_sum(directional_variance_gradient_i(x_i,w) for x_i in X)
#using stochastic gradient descent (maximize_stochastic comes from the book's CP8)
def first_principal_component_sgd(X):
guess=[1 for _ in X[0]]
unscaled_maximizer=maximize_stochastic(
lambda x,_, w:directional_variance_i(x,w),
lambda x,_, w:directional_variance_gradient_i(x,w),
X,
[None for _ in X],guess)
return direction(unscaled_maximizer)
#given the first principal component direction, project the data onto it to get that component's values
def project(v,w):
projection_length=dot(v,w)
return scalar_multiply(projection_length,w)
#to find further components, first remove the projection p127
def remove_projection_from_vector(v,w):
return vector_subtract(v,project(v,w))
def remove_projection(X,w):
return [remove_projection_from_vector(x_i,w) for x_i in X]
#in higher-dimensional data sets, iterate to find as many principal components as needed
def principal_component_analysis(X,num_components):
components=[]
for _ in range(num_components):
        component=first_principal_component_sgd(X) #or the batch first_principal_component from the book
        components.append(component)
        X=remove_projection(X,component)
return components
#then transform the data into the lower-dimensional space spanned by the components
def transform_vector(v,components):
return [dot(v,w) for w in components]
def transform(X,components):
return [transform_vector(x_i,components) for x_i in X ]
---p128
------------------------------------------------CP11 ML
#split the data set
def split_data(data,prob):
    results=[],[]
    for row in data:
        results[0 if random.random()<prob else 1].append(row)
    return results
#train_test_split builds on split_data (reconstructed from the book; the original line was truncated)
def train_test_split(x,y,test_pct):
    data=list(zip(x,y))
    train,test=split_data(data,1-test_pct)
    x_train,y_train=zip(*train)
    x_test,y_test=zip(*test)
    return x_train,x_test,y_train,y_test
model=SomeKindOfModel()
x_train,x_test,y_train,y_test=train_test_split(xs,ys,0.33)
model.train(x_train,y_train)
performance=model.test(x_test,y_test)
#accuracy
def accuracy(tp,fp,fn,tn):
correct=tp+tn
total=tp+fp+fn+tn
return correct/total
print(accuracy(70,4930,13930,981070))
#precision and recall
def precision(tp,fp,fn,tn):
return tp/(tp+fp)
print(precision(70,4930,13930,981070))
def recall(tp,fp,fn,tn):
return tp/(tp+fn)
print(recall(70,4930,13930,981070))
#F1 score (harmonic mean of precision and recall)
def f1_score(tp,fp,fn,tn):
p=precision(tp,fp,fn,tn)
r=recall(tp,fp,fn,tn)
return 2*p*r/(p+r)
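#quick check, combining the precision (~0.014) and recall (~0.005) printed above
print(f1_score(70,4930,13930,981070)) --->~0.00736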
import random
import numpy as np
#(scratch experiment by the note-taker, unrelated to the book; note that c is
# never appended to, so the final print(c[:2]) outputs [])
a=[]
b=[]
c=[]
for i in range(1000000):
if i%3==0 or i%4==0:
a.append(i)
a.remove(0)
for x in a:
for y in a:
for z in a:
n=x+y+z
b.append(n)
for i in b:
if i%x==0 and i%y==0 and i%z==0:
print(i)
print(c[:2])
------------------------------------------------CP12 k-Nearest Neighbors
#vote-counting functions
from collections import Counter
def raw_majority_vote(labels):
votes=Counter(labels)
winner,_=votes.most_common(1)[0]
return winner
#tie-breaking options:
#1. pick one of the winners at random
#2. weight the votes by distance and pick the weighted winner
#3. reduce k until there is a unique winner (used below)
def majority_vote(labels):
vote_counts=Counter(labels)
winner,winner_count=vote_counts.most_common(1)[0]
num_winners=len([count for count in vote_counts.values() if count==winner_count])
if num_winners==1:
return winner
else:
return majority_vote(labels[:-1])
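#quick check of the tie-breaking: 'a' and 'b' are tied, so the last label is
#dropped until a unique winner emerges
print(majority_vote(['a','b','b','a','c'])) --->'b'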
#build the classifier
def knn_classify(k,labeled_points,new_point):
    #each labeled point is a pair ([coordinates],label)
    by_distance=sorted(labeled_points,key=lambda point_label:distance(point_label[0],new_point))
    k_nearest_labels=[label for _,label in by_distance[:k]]
    return majority_vote(k_nearest_labels)
#12.2 example: favorite programming languages
cities = [(-86.75,33.5666666666667,'Python'),(-88.25,30.6833333333333,'Python'),(-112.016666666667,33.4333333333333,'Java'),(-110.933333333333,32.1166666666667,'Java'),(-92.2333333333333,34.7333333333333,'R'),(-121.95,37.7,'R'),(-118.15,33.8166666666667,'Python'),(-118.233333333333,34.05,'Java'),(-122.316666666667,37.8166666666667,'R'),(-117.6,34.05,'Python'),(-116.533333333333,33.8166666666667,'Python'),(-121.5,38.5166666666667,'R'),(-117.166666666667,32.7333333333333,'R'),(-122.383333333333,37.6166666666667,'R'),(-121.933333333333,37.3666666666667,'R'),(-122.016666666667,36.9833333333333,'Python'),(-104.716666666667,38.8166666666667,'Python'),(-104.866666666667,39.75,'Python'),(-72.65,41.7333333333333,'R'),(-75.6,39.6666666666667,'Python'),(-77.0333333333333,38.85,'Python'),(-80.2666666666667,25.8,'Java'),(-81.3833333333333,28.55,'Java'),(-82.5333333333333,27.9666666666667,'Java'),(-84.4333333333333,33.65,'Python'),(-116.216666666667,43.5666666666667,'Python'),(-87.75,41.7833333333333,'Java'),(-86.2833333333333,39.7333333333333,'Java'),(-93.65,41.5333333333333,'Java'),(-97.4166666666667,37.65,'Java'),(-85.7333333333333,38.1833333333333,'Python'),(-90.25,29.9833333333333,'Java'),(-70.3166666666667,43.65,'R'),(-76.6666666666667,39.1833333333333,'R'),(-71.0333333333333,42.3666666666667,'R'),(-72.5333333333333,42.2,'R'),(-83.0166666666667,42.4166666666667,'Python'),(-84.6,42.7833333333333,'Python'),(-93.2166666666667,44.8833333333333,'Python'),(-90.0833333333333,32.3166666666667,'Java'),(-94.5833333333333,39.1166666666667,'Java'),(-90.3833333333333,38.75,'Python'),(-108.533333333333,45.8,'Python'),(-95.9,41.3,'Python'),(-115.166666666667,36.0833333333333,'Java'),(-71.4333333333333,42.9333333333333,'R'),(-74.1666666666667,40.7,'R'),(-106.616666666667,35.05,'Python'),(-78.7333333333333,42.9333333333333,'R'),(-73.9666666666667,40.7833333333333,'R'),(-80.9333333333333,35.2166666666667,'Python'),(-78.7833333333333,35.8666666666667,'Python'),(-100.75,46.7666666666667,'Java'),(-84.5166666666667,39.15,'Java'),(-81.85,41.4,'Java'),(-82.8833333333333,40,'Java'),(-97.6,35.4,'Python'),(-122.666666666667,45.5333333333333,'Python'),(-75.25,39.8833333333333,'Python'),(-80.2166666666667,40.5,'Python'),(-71.4333333333333,41.7333333333333,'R'),(-81.1166666666667,33.95,'R'),(-96.7333333333333,43.5666666666667,'Python'),(-90,35.05,'R'),(-86.6833333333333,36.1166666666667,'R'),(-97.7,30.3,'Python'),(-96.85,32.85,'Java'),(-95.35,29.9666666666667,'Java'),(-98.4666666666667,29.5333333333333,'Java'),(-111.966666666667,40.7666666666667,'Python'),(-73.15,44.4666666666667,'R'),(-77.3333333333333,37.5,'Python'),(-122.3,47.5333333333333,'Python'),(-89.3333333333333,43.1333333333333,'R'),(-104.816666666667,41.15,'Java')]
#processing
cities = [([longitude, latitude], language) for longitude, latitude, language in cities]
# key is language, value is pair (longitudes, latitudes)
plots = { "Java" : ([], []), "Python" : ([], []), "R" : ([], []) }
# we want each language to have a different marker and color
markers = { "Java" : "o", "Python" : "s", "R" : "^" }
colors = { "Java" : "r", "Python" : "b", "R" : "g" }
for (longitude, latitude), language in cities:
plots[language][0].append(longitude)
plots[language][1].append(latitude)
# create a scatter series for each language
for language, (x, y) in plots.items():
plt.scatter(x, y, color=colors[language], marker=markers[language],
label=language, zorder=10)
#plot_state_borders(plt) # assume we have a function that does this
plt.legend(loc=0) # let matplotlib choose the location
plt.axis([-130,-60,20,55]) # set the axes
plt.title("Favorite Programming Languages")
plt.show()
'''for k in [1, 3, 5, 7]:
num_correct = 0
for city in cities:
location, actual_language=city
other_cities = [other_city for other_city in cities if other_city != city]
predicted_language = knn_classify(k, other_cities, location)
if predicted_language == actual_language:
num_correct += 1
print(k, "neighbor[s]:", num_correct, "correct out of", len(cities))
#the TypeError noted in the original ('list' - 'float') came from sorting on the raw (point,label) pairs; the key function in knn_classify above fixes it
'''
#12.3 the curse of dimensionality
#a random point is a list of dim random numbers
def random_point(dim):
return [random.random() for _ in range(dim)]
def random_distances(dim,num_pairs):
    return [distance(random_point(dim),random_point(dim)) for _ in range(num_pairs)]
dimensions=range(1,101)
avg_distances=[]
min_distances=[]
random.seed(0)
for dim in dimensions:
distances=random_distances(dim,10000)
    avg_distances.append(mean(distances))
    min_distances.append(min(distances))
min_avg_ratio=[min_dist/avg_dist for min_dist,avg_dist in zip(min_distances,avg_distances)]
------------------------------------------------CP13 Naive Bayes
def tokenize(message):
message = message.lower() # convert to lowercase
all_words = re.findall("[a-z0-9']+", message) # extract the words
return set(all_words) # remove duplicates
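#quick usage check of tokenize
print(tokenize('Data Science is the SEXIEST job'))
#--->{'data', 'science', 'is', 'the', 'sexiest', 'job'} (set order varies)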
def count_words(training_set):
"""training set consists of pairs (message, is_spam)"""
counts = defaultdict(lambda: [0, 0])
for message, is_spam in training_set:
for word in tokenize(message):
counts[word][0 if is_spam else 1] += 1
return counts
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
"""turn the word_counts into a list of triplets w, p(w | spam) and p(w | ~spam)"""
    return [(w,(spam + k) / (total_spams + 2 * k),(non_spam + k) / (total_non_spams + 2 * k)) for w, (spam, non_spam) in counts.items()]
def spam_probability(word_probs, message):
message_words = tokenize(message)
log_prob_if_spam = log_prob_if_not_spam = 0.0
for word, prob_if_spam, prob_if_not_spam in word_probs:
# for each word in the message,
# add the log probability of seeing it
if word in message_words:
log_prob_if_spam += math.log(prob_if_spam)
log_prob_if_not_spam += math.log(prob_if_not_spam)
# for each word that's not in the message
# add the log probability of _not_ seeing it
else:
log_prob_if_spam += math.log(1.0 - prob_if_spam)
log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
prob_if_spam = math.exp(log_prob_if_spam)
prob_if_not_spam = math.exp(log_prob_if_not_spam)
return prob_if_spam / (prob_if_spam + prob_if_not_spam)
class NaiveBayesClassifier:
def __init__(self, k=0.5):
self.k = k
self.word_probs = []
def train(self, training_set):
# count spam and non-spam messages
num_spams = len([is_spam for message, is_spam in training_set if is_spam])
num_non_spams = len(training_set) - num_spams
# run training data through our "pipeline"
word_counts = count_words(training_set)
self.word_probs = word_probabilities(word_counts,num_spams,num_non_spams,self.k)
def classify(self, message):
return spam_probability(self.word_probs, message)
#13.4 model test
data = []
# regex for stripping out the leading "Subject:" and any spaces after it
subject_regex = re.compile(r"^Subject:\s+")
# glob.glob returns every filename that matches the wildcarded path
import glob
for fn in glob.glob(path): #path points at the spam/ham corpus; it is not defined in these notes
is_spam = "ham" not in fn
with open(fn,'r') as file:
for line in file:
if line.startswith("Subject:"):
subject = subject_regex.sub("", line).strip()
data.append((subject, is_spam))
random.seed(0) # just so you get the same answers as me
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)
classified = [(subject, is_spam, classifier.classify(subject)) for subject, is_spam in test_data]
counts = Counter((is_spam, spam_probability > 0.5) for _, is_spam, spam_probability in classified)
classified.sort(key=lambda row: row[2])
spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
hammiest_spams = list(filter(lambda row: row[1], classified))[:5]
def p_spam_given_word(word_prob):
word, prob_if_spam, prob_if_not_spam = word_prob
return prob_if_spam / (prob_if_spam + prob_if_not_spam)
words = sorted(classifier.word_probs, key=p_spam_given_word)
spammiest_words = words[-5:]
hammiest_words = words[:5]
def drop_final_s(word):
    return re.sub("s$","",word)
------------------------------------------------*args,**kwargs parameters
*args,**kwargs
https://www.cnblogs.com/xuyuanyuan123/p/6674645.html
http://python.jobbole.com/83476/
#*args collects any number of unnamed positional arguments into a tuple
#**kwargs collects keyword arguments into a dict
#*args accepts an arbitrary tuple of positionals; **kwargs accepts a dict
use *args and **kwargs in a function definition to accept variable-length arguments:
*args for variable-length positional arguments, **kwargs for variable-length keyword arguments.
def foo(*args,**kwargs):
print('args=',args)
print('kwargs=',kwargs)
print('**********************')
if __name__=='__main__':
foo(1,2,3)
foo(a=1,b=2,c=3)
foo(1,2,3,a=1,b=2,c=3)
foo(1,'b','c',a=1,b='b',c='c')
------------------------------------------------CP14 Simple Linear Regression
#y_i = beta*x_i + alpha + error
num_friends_good = [49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
daily_minutes_good = [68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84]
def predict(alpha, beta, x_i):
return beta * x_i + alpha
#prediction error
def error(alpha,beta,x_i,y_i):
return y_i-predict(alpha,beta,x_i)
#sum of squared errors
def sum_of_squared_errors(alpha,beta,x,y):
return sum(error(alpha,beta,x_i,y_i)**2 for x_i,y_i in zip(x,y))
#least squares: find the alpha and beta that minimize the sum of squared errors
def least_squares_fit(x,y):
beta=correlation(x,y)*standard_deviation(y)/standard_deviation(x)
alpha=mean(y)-beta*mean(x)
return alpha,beta
alpha,beta=least_squares_fit(num_friends_good,daily_minutes_good)
#R^2 (coefficient of determination): measures how well the model fits
def total_sum_of_squares(y):
return sum(v**2 for v in de_mean(y))
def r_squared(alpha,beta,x,y):
return 1.0-(sum_of_squared_errors(alpha,beta,x,y)/total_sum_of_squares(y))
r_squared(alpha,beta,num_friends_good,daily_minutes_good)