Apls008

数据科学指南-学习笔记

https://github.com/joelgrus/data-science-from-scratch/blob/master/code/natural_language_processing.py
#import package
import requests,json,sys,re,csv,math,random,matplotlib,matplotlib.pyplot as plt,pandas as pd,numpy as np
from dateutil.parser import parse
from __future__ import division
from collections import Counter,defaultdict,deque
from functools import partial
from pandas import Series,DataFrame
--------------------------------------------CP1导论
---------1.3.1 find the key contacter
users=[
{'id':0,'name':'zieox'},
{'id':1,'name':'jack'},
{'id':2,'name':'rose'},
{'id':3,'name':'nike'},
{'id':4,'name':'dick'},
{'id':5,'name':'lucco'},
{'id':6,'name':'zpoo'},
{'id':7,'name':'sqqpo'},
{'id':8,'name':'dv'},
{'id':9,'name':'sad'},
{'id':10,'name':'kobe'}]

friends=[(0,1),(0,2),(1,2),(1,3),(2,3),(3,4),(4,5),(5,6),(5,7),(6,8),(7,8),(8,9)]

#对users-list加入一friend列
for user in users:
user["friend"]=[]
#将朋友加入fr列
for i,j in friends:
users[i]["friend"].append(users[j]['name'])
users[j]["friend"].append(users[i]['name'])
#定义一个求fr列长的公式
def num_of_friend(user):return len(user["friend"])
total_conn=sum(num_of_friend(user) for user in users)

from __future__ import division
num_users=len(users)
#求人均好友量
avg_conn=total_conn/num_users
#生成(user_id,num_friends)
n_f_nyid=[(user['id'],num_of_friend(user)) for user in users]
#sorted(n_f_nyid,key= lambda ab:ab[1],reverse=True)# 
#n_f_nyid=[(avg_conn,num_of_friend(user)) for user in users]
#a=pd.DataFrame(n_f_nyid,index=[user['name'] for user in users]).plot()

---1.3.2 The scientist maybe you know
这个方式太抽象 我们构造一个新列来返回ID 对应好友id
def f_of_fid(user):
return [foaf['id'] 
for fr in user['friend']
for foaf in friend['friend']] 
##print([friend['id'] for friend in users[0]['friend']])##


u=[{user['id']:user['fr_id']} for user in users]
#观察发现人们可以通过朋友的朋友认识
#所以我们设计一个共同的朋友,来计数.同时 我们需要排除已经成为好友的用户,使用以下功能实现

'''from collections import Counter
def not_the_same(user,other_user):
return user['id']!=other_user['id']
def not_fr(user,other_user):
return all(not_the_same(friend,other_user) for friend in user['friend'])
def f_of_fid(user):
return Counter(foaf['id']
for friend in user['friend']
for foaf in friend['friend']
if not_the_same(user,foaf)
and not_fr(user,foaf))
print(f_of_fid(users[3]))'''

interest=[(0,"hive"),(0,"spark"),(0,"python"),(0,"java"),
(0,"scala"),(0,"tableau"),(1,"python"),(1,"java"),(1,"scala"),
(2,"hadoop"),(2,"python"),(3,"sql"),(3,"impala"),(3,"go"),(4,"perl"),
(4,"java"),(4,"hive"),(5,"python"),(5,"mapreduce"),(5,"java"),(6,"C++"),
(6,"java"),(6,"vba"),(7,"tableau"),(7,"ppt"),(8,"python"),(8,"java"),
(9,"python"),(9,"hadoop"),(9,"scala")]

#将喜欢的interest加入数据集
for i in users:
i['interest']=[]
for i,j in interest:
    users[i]['interest'].append(j)
---#找到有共同兴趣的id
def data_s_who_like(target_interest):
return [user_id for user_id,user_interest in interest if user_interest==target_interest]

from collections import defaultdict
#键是interest 值是带有这个interest和user_id的列表
user_id_by_interest=defaultdict(list)
for user_id,interest in interest:
user_id_by_interest[interest].append(user_id)
#我们找出最受欢迎的interest的
def most_common_interest_with(user):
    return counter(interested_user_id
    for interest in interest_by_userid[user['id']]
    for interest_by_userid in user_id_by_interest[interest]
    if interested_user_id!=user['id'])

---------1.3.3 working y-experience
sal_year=[(83000,8.7),(88000,8.1),(48000,0.7),(76000,6),
(69000,6.5),(76000,7.5),(60000,2.5),(83000,10),(48000,1.9),(63000,4.2)]
#在这里我们做一个键值对的列表
sal_byear=defaultdict(list)
for sal,year in sal_year:
sal_byear[year].append(sal)
avg_sal_byear={year:sum(sal)/len(sal)
for year,sal in sal_byear.items()}

#分桶操作(qcut)
def y_bucket(year):
if year <2:
return "less than 2"
elif year<5:
return"between 2 and 5"
else:
return "more than 5"

sal_byear_bucket=defaultdict(list)
for sal,year in sal_year:
bucket=y_bucket(year)
sal_byear_bucket[bucket].append(sal)
avg_sal_bucket={y_bucket:sum(sal)/len(sal)
for y_bucket,sal in sal_byear_bucket.items()}
#我们来基于interest做一个兴趣统计
words_count=Counter(word for user,interest in interest 
for word in interest.lower().split())

for word,count in words_count.most_common():
if count>1:
print(word,count)

##defaultdict方法的count
http://kodango.com/understand-defaultdict-in-python
strings = ('puppy', 'kitten', 'puppy', 'puppy',
           'weasel', 'puppy', 'kitten', 'puppy')
counts = {}
for kw in strings:
    if kw not in counts:
        counts[kw] = 1
    else:
        counts[kw] += 1
        
#defaultdict方法
for kw in strings:
    counts.setdefault(kw, 0)
    counts[kw] += 1

--------------------------------------------CP2 PYTHON-basic
for i in [1,2,3,4,5]:
print(i)
for j in [1,2,3,4,5]:
print(j)
print(j+i)
print(i)
print('done looping')
list=[[1,2,3],[4,5,6],[7,8,9]]
def add(f):
return f[1]
y=add(lambda x:x+4)
#列表+值
list.extend([11,12,13])
list+[15]
list.append(15)
#给y赋值
_,y=[1,2]
x,y=1,2
x,y=y,x

------dict
grades={'zieox':90,'jack':88}
grades['jack']                        --->88
grades['kobe']=99                          
grades.get('zieox',0)                      
grades.keys()                         --->['zieox','jack']
grades.values()                       --->dict_values([90, 88])
grades.items()                        --->dict_items([('zieox',90), ('jack', 88)])

#使用defaultdict统计词频
strings = ('puppy', 'kitten', 'puppy', 'puppy',
           'weasel', 'puppy', 'kitten', 'puppy','ppp')
counts = {}
#method-1
for word in strings:
    if kw not in counts:
        counts[word] = 1
    else:
        counts[word] += 1
#method-2
for word in strings:
try:
counts[word]+=1
except KeyError:
counts[word]=1
#method-3
for kw in strings:
p_count=counts.get(word,0)
counts[word]=p_count+1
#以上方法太笨拙,以下代码简便
from collections import defaultdict
counts=defaultdict(int)
for word in strings:
counts[word]+=1
#使用collections包的Counter计数器是最简单的
from collections import Counter
---
c=Counter(strings)
for i,j in c.most_common(10):
print(i,j)
---this is the easist method

------set
我们使用set有2个原因
#1.set上有一种很快的操作IN
s=set()
for i in range(30):
s.add(i)
sl=['a','an','at']+['yet','you']
'zip' in sl                           #这种操作需要遍历每个元素
sl_set=set(sl)
'zip' in sl _set                      #书上说这样就快了
#2.便于在一个汇总中寻找离散项目
l=[1,2,4,5,6,7,8,9,9,9,8]
numl=len(l)
num_distinct_set=len(set(l))

------控制流
#if 流
p='even' if x%2==0 else 'odd'
#while 流
x=0
while x<10:
print(x,'less than 10')
x+=1
#for if 
for x in range(10):
if x==3:
continue
if x==5:
break
print(x)

------排序 (sort)
l=[1,2,4,5,6,7,8,9,9,9,8]
l2=sorted(l)
l.sort()
x=sorted([-1,4,99,-8],key=abs,reverse=True)
w=sorted(counts.items(),key=lambda ab:ab[1],reverse=True)

------列表解析
even=[x for x in range(5) if x%2==0]
square=[x*x for x in range(5)]
even_s=[x*x for x in range(5) if x%2==0]
#转换为dict或set
square={x:x*x for x in range(5)}
pair=[(x,y) for x in range(10) for y in range(10)]
inxrease_pair=[(x,y) for x in range(10) for y in range(x+1,10)]

------生成器和迭代器
def lazy_r(n):
i=0
while i < n:
yield i
i+=1
def natural_num():
n=1
while True:
yield n
n+=1
#way2
laz_below20=(random.random() for _ in range(4))

------随机性
import random
f4 = [random.random() for _ in range(4)]
#设置随机数种子为10
random.seed(10) 
random.random()
#当我们再次设置随机数种子为10时random.random()产出与上次一致
#范围内随机取
random.randrange(10)           --->0-9 随机取一值
random.randrange(3.6)          --->[3,4,5]
#使用random.shuffle 随机重排列表中元素
t1=[ i for i in range(10)]
random.shuffle(t1)
print(t1)

fr=['zieox','jack','rose','nike','dick']
#使用choice 在列表中随机取数
random.choice(fr)
#使用random.sample 随机选择(不替换)选择一个元素样本
num=[i for i in range(60)]
win=random.sample(num,6)                          --->不重复采6个样本
ff=[random.choice(range(10)) for _ in range(4)]   --->重复采4个样本

------面向对象的编程
#自己写一个set集合
class set:
def __init__(self,values=None)
s1=Set()
s2=Set([1,2,2,3])
self.dict={}
if values is not None:
for value in values:
self.add(value)
def __repr__(self):
return "set:"+str(self.dict.keys())
def add(self,value):
self.dict[value]=True
def contains(self,value):
return value in self.dict
def remove(self,value):
del self.dict[value]

------函数式工具functools
def exp(base,power):return base**power        n**m函数
def two_to_the(power):return exp(2,power)     2**m函数
from functools import partial
two_to_the=partial(exp,2)                 --->包含一个变量的函数
print(two_to_the(3))                      ---> 2**3
square=partial(exp,power=2)               --->n**2函数

def double(x):
return 2*x
xs=[1,2,3,4]
twice_xs=[double(x) for x in xs] 
以上等价于 twice_xs=map(double,xs)  
list_double=partial(map,double)          --->double 了一个列表的function
twice_xs=list_double(xs)                 --->同上    
---map函数 <映射>
def x2(x,y):return x*y
pro=map(x2,[1,2],[3,4])                  --->1*3+2*4=[3,8]


---filter函数 <过滤>
def is_even(x):
return x%2==0
x_evens=[x for x in xs if is_even(x)]
以上等价于 x_evens=filter(is_even,xs)
list_evener=partial(filter,is_even)
x_evens=list_evener(xs)

---reduce
x_product=reduce(x2,xs)                  --->=1*2*3*4=24
list_product=partial(reduce,multiply)    --->reduce了一个列表function
x_product=list_product(xs)               --->同上24

------枚举 enumerate<==>(index,element)
fr=['zieox','jack','rose','nike','dick']
for i,j in enumerate(fr):
print(i,j)

------压缩和参数拆分 zip&unzip
list1=[1,2,3,4]
list2=['a','b','c','d']
zip(list2,list1)                         --->[('a',1),('b',2),('c',3),('d',4)]
pairs=[('a',1),('b',2),('c',3),('d',4)]
a,b=zip(*pairs)

zip(('a',1),('b',2),('c',3),('d',4))     --->(('a','b','c','d'),(1,2,3,4))
def add(a,b):return a+b
add(1,2)                                 --->3
add(*[1,2])                              --->3

------args&kargs
def doubler(f):
def g(x):
return 2*f(x)
return g
def f1(x):
return x+1
g=doubler(f1)

#对于多个参数的函数以上写法并不适用
所以我们用到参数拆分和魔法
def magic(*args,**kwargs):
print('unnamed args:',args)
print('keyword args:',kwargs)
magic(1,2,key='word',key2='word2')
def other_way_magic(x,y,z):
return x+y+z
x_y_list=[1,2]
z_dict={'z':3}
print(other_way_magic(*x_y_list,**z_dict))

p33
def doubler_correct(f):
def g(*args,**kwargs):
return 2*f(*args,**kwargs)
return g
g=doubler_correct(f2)
print( g(1,2))                         --->6


--------------------------------------------CP3可视化
#example-matplotlib
from matplotlib import pyplot as pl
year=[1950,1960,1970,1980,1990,2000,2010]
gdp=[300.2,543.3,1075.9,2862.5,5979.6,10289.7,14958.3]
pl.plot(year,gdp,color='green',marker='o',linestyle='solid')
pl.title('name-GDP')
pl.ylabel('billion$')
pl.show()

------bar (条形图)
#coding:utf-8
movies=['old boy','revenages','iron-man','starwar','HelloKitty']
num_oscars=[11,65,53,26,11]
#在3.6中此操作将条形放置于中心左侧
xs=[i+0.1 for i ,_ in enumerate(movies)]
pl.bar(xs,num_oscars)
pl.ylabel('num of get_oscar')
pl.title('my favourite movie')
pl.xticks([i+0.5 for i ,_ in enumerate(movies)],movies)
pl.show()

#example3
from collections import Counter
grades=[86,45,46,79,13,49,79,65,35,46]
#//取整
decile=lambda grade:grade//10*10  === y=(x//10)*10
#用counter计数器计数
histgram=Counter(decile(grade) for grade in grades)
#分别按照histgrame的key和value计数值作图,其中设置宽度为8
pl.bar([x for x in histgram.keys()],histgram.values(),8)
#设置x,y轴取值范围
pl.axis([-5,105,0,5])
#设置x轴的标记点
pl.xticks([10*i for i in range(11)])
pl.xlabel('+scorelike')
pl.ylabel('num of students')
pl.title('score-pic')
pl.show()

#example4
mentions=[500,505]
years=[2013,2014]
pl.bar(years,mentions,0.8)
pl.xticks(years)
pl.ylabel('mentioned ds')
pl.ticklabel_format(useOffset=False)
pl.axis([2012.5,2014.5,499,506])
pl.title('look this is a big change')
pl.show()
#now we set x,y -value_range
pl.axis([2013,2015,0,550])
pl.show()

------line picture (线图)
'''#easy_way
varience=[2**(i-1) for i in range(1,10)]
bias_aquared=sorted(varience,reverse=True)'''
varience=[1, 2, 4, 8, 16, 32, 64, 128, 256]
bias_aquared=[256, 128, 64, 32, 16, 8, 4, 2, 1]
total_error=[x+y for x,y in zip(varience,bias_aquared)]
xs=[i for i ,_ in enumerate(varience)]
pl.plot(xs,varience,'g-',label='varience')
pl.plot(xs,bias_aquared,'r-.',label='bias^2')
pl.plot(xs,total_error,'b:',label='total_error')
#loc=9指(tag框)顶部中央
pl.legend(loc=9)
pl.xlabel('model-complex_value')
pl.title('bias - var picture')
pl.show()

------scatter 散点图
import random
'''friends=[random.randint(1,100) for _ in range(10)]
minutes=[random.randint(100,200) for _ in range(10)]'''
#这里我们不用随机数
friends=[61, 73, 80, 93, 13, 26, 57, 59, 88, 84]
minutes=[157, 184, 101, 198, 196, 158, 178, 150, 113, 154]
labels=['a','b','c','d','e','f','g','h','i','j']
pl.scatter(friends,minutes)

for label,friend_count,minute_count in zip(labels,friends,minutes):
pl.annotate(label,
xy=(friend_count,minute_count),
xytext=(-5,5),
textcoords='offset points')
pl.title('num_minutes&friends')
pl.xlabel('num of friend')
pl.ylabel('minutes of spending')
pl.show()


'''#在这里我们实验一下在轴上放置tag,利用annotate,xytext为位移,axis为xy轴的值域
pl.bar([i for i in range(10)],minutes)
pl.axis([-1,10,0,200])
for label,i,j in zip(labels,[i for i in range(10)],minutes):
pl.annotate(label,
xy=(i,j),
xytext=(-5,5),
textcoords='offset points')'''

--------------------------------------------CP4 linear algebra
------4.1 vector 向量
height_weight_age=[70,170,40]
grades=[95,80,75,62]
def vector_add(v,w):
 return[v_i+w_j for v_i,w_j in zip(v,w)]
def vector_subtract(v,w):
  return[v_i-w_j for v_i,w_j in zip(v,w)]

'''def vector_sum(vectors):
 result=vectors[0]
 for vector in vectors[1:]:
  result=vector_add(result,vector)
 return result'''

def vector_sum(vectors):
 return reduce(vector_add,vectors)
 #vector_sum=partial(reduce,vector_add)
def scalar_multiply(c,v):
 return [c*v_i for v_i in v]
def vector_mean(vectors):
 n=len(vectors)
 return scalar_multiply(1/n,vector_sum(vectors))
#点乘
def dot(v,w):
 return sum(v_i*w_i for v_i,w_i in zip(v,w))
#计算向量平方和
def sum_of_squares(v):
 return dot(v,v)
#计算向量的大小(长度)
import maths
def magnitude(v):
 return math.sqrt(sum_of_squares(v))
def squared_distance(v,w):
 return sum_of_squares(vector_subtract(v,w))
def distance(v,w):
 return magnitude(vector_subtract(v,w))

------4.2矩阵。
a=[[1,2,3],[4,5,6]]
b=[[1,2],[3,4],[5,6]]

def shape(A):
 num_rows=len(A)
 num_cols=len(A[0]) if A else 0
 return num_cols,num_rows
#get_row
def get_row(A,i):
 return A[i]
#get_col
def get_column(A,j):
 return [A_i[j] for A_i in A]
#我们创建一个 对角线为1 其余为0的单位矩阵
def make_matrix(num_rows,num_cols,entry_fn):
 return [[entry_fn(i,j) for j in range(num_cols)] for i in range(num_rows)]
def is_diagonal(i,j):
 return 1 if i==j else 0 
indentity_matrix=make_matrix(5,5,is_diagonal)

data=[[70,170,40],[65,120,26],[77,250,19]]

---------------------------------------------CP5 基础统计
------basic_item
#friend-bar 绘制朋友数直方图
num_friends=[11,65,53,26,11]*3
daily_minutes=[10,88,76,20,10]
from collections import Counter
from matplotlib.pyplot import plot as pl
friends=Counter(num_friends)
xs=range(101)
ys=[friends[x] for x in xs]
pl.bar(xs,ys)
pl.axis([0,101,0,25])
pl.title('friend_num')
pl.xlabel('num of friends')
pl.ylabel('num_peo')
pl.show()

num_point=len(num_friends)
largest_value=max(num_friends)
smallest_value=min(num_friends)
sorted_values=sorted(num_friends)
smallest_value=min(num_friends)=sorted_values[0]
largest_value=max(num_friends)=sorted_values[-1]
print('count:',num_point,'max:',largest_value,'min:',smallest_value)

------5.1.1中心倾向
#写个平均值
from __future__ import division
def mean(x):
 return sum(x)/len(x)
mean(num_friends)

#中位数
def median(v):
 n=len(v)
 sorted_v=sorted(v)
 midpoint=n//2
 if n%2==1:
  return sorted_v[midpoint]
 else:
  lo=midpoint-1
  hi=midpoint
  return(sorted_v[lo]+sorted_v[hi])/2
mdedian(num_friends)

#分位数(quantile)
def quantile(x,p):
 p_index=int(p*len(x))
 return sorted(x)[p_index]
quantile(num_friends,0.1)

#众数(mode)
def mod(x):
 counts=Counter(x)
 max_count=max(counts.values())
 return [x_i for x_i,count in counts.iteritems() if count==max_count]
mod(num_friends)

------5.1.2离散度
#极差(range)
def data_range(x):
 return max(x)-min(x)
data_range(num_friends)

#方差(variance)  sum((X-Xavg)/(n-1))
def de_mean(x):
 x_bar=mean(x)
 return [x_i - x_bar for x_i in x]
def sum_of_suqares(x):
 return sum([i**2 for i in x])
def variance(x):
 n=len(x)
 deviations=de_mean(x)
 return sum_of_suqares(deviations)/(n-1)
variance(num_friends)

#标准差(standard deviation)
def standard_deviation(x):
 return math.sqrt(variance(x))
standard_deviation(num_friends)

#分位数75% 与25%分位数之差
def interquantile_range(x):
 return quantile(x,0.75)-quantile(x,0.25)
interquantile_range(num_friends)

------5.2相关
#协方差(covariance)  COV  dot点乘：元素相乘再相加
from numpy import dot
def covariance(x,y):
 n=len(x)
 return dot(de_mean(x),de_mean(y))/(n-1)
covariance(num_friends,daily_minutes)
#相关系数 value_range [-1,1]
def correlation(x,y):
 stdev_x=standard_deviation(x)
 stdev_y=standard_deviation(y)
 if stdev_x>0 and stdev_y>0:
  return covariance(x,y)/stdev_x/stdev_y
 else:
  return 0

'''
outlier=num_friends.index(100)
num_friends_good=[x for i,x in enumerate(num_friends) if i!=outlier]
num_minutes_good=[x for i,x in enumerate(daily_minutes) if i!=outlier]


correlation(num_friends_good,num_minutes_good)
'''
------------------------------------------------CP6概率
------6.1 independence
if E&F dependent then P(E,F)=P(E)P(F)

------6.2条件概率
if E&F dependent then P(E,F)=P(E)P(F)
if not dependent then P(E|F)=P(E,F)/P(F) ==> P(E,F)=P(E,F)P(F) ==>P(E|F)=P(E)
---example生男孩女孩概率问题
假设
1.每个孩子是男孩和女孩的概率相同
2.第二个孩子性别和第一个孩子性别独立
2个孩子都是女孩概率
P(B|G)=P(B,G)/P(G)=P(B)/P(G)=0.5
import random
def random_kid():
return random.choice(['boy','girl'])
random.seed(0)
both_girl=0
either_girl=0
older_girl=0
for _ in range(1000):
younger=random_kid()
older=random_kid()
if older=='girl':
older_girl+=1
if older=='girl' and younger=='girl':
both_girl+=1
if older=='girl' or younger=='girl':
either_girl+=1
print('P(both|older):',both_girl/older_girl)
print('P(both|either):',both_girl/either_girl)

------6.3贝叶斯定理
P(E|F)=P(E,F)/P(F)=P(F|E)P(E)/P(F)
p(F)=P(F,E)+P(F,-E)     -E表示E没发生
==>P(E|F)=P(E,F)/(P(F,E)+P(F,-E))=P(E,F)/(P(F|E)P(E)+P(F|-E)P(-E))

------6.4随机变量

------6.5连续分布
#均匀分布概率密度函数
def uniform_pdf(x):
return 1 if x>=0 and x<1 else 0
#累积分布函数
def uniform_cdf(x):
if x<0:return 0
elif x<1:return x
else: return 1 

------6.6正态分布
f(x|a,b)=(1/(sqrt(2*pi)*b))exp(-(x-a)^2/(2b^2))
如果a=0 b=1则为标准正态分布
import math
import matplotlib
import matplotlib.pyplot as pl
def normal_pdf(x,mu=0,sigma=1):
sqrt_two_pi=math.sqrt(2*math.pi)
return (math.exp(-(x-mu)**2/2/sigma**2)/(sqrt_two_pi*sigma))
xs=[x/10 for x in range(-50,50)]

pl.plot(xs,[normal_pdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1')
pl.plot(xs,[normal_pdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2')
pl.plot(xs,[normal_pdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5')
pl.plot(xs,[normal_pdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1')
pl.legend()
pl.title('多个正态分布的概率密度函数')
pl.show()
#标准正态分布的累计分布函数
def normal_cdf(x,mu=0,sigma=1):
return(1+math.erf((x-mu)/math.sqrt(2)/sigma))/2
#draw some 概率累计分布函数
pl.plot(xs,[normal_cdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1')
pl.plot(xs,[normal_cdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2')
pl.plot(xs,[normal_cdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5')
pl.plot(xs,[normal_cdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1')
pl.legend(loc=4)
pl.title('多个正态分布的累计分布函数')
pl.show()
#对normal_cdf取逆，从而可以求出特定的概率的相应值,因而我们创建二分查找函数inverse_normal_cdf
def inverse_normal_cdf(p,mu=0,sigma=1,tolerance=0.00001):
if nu!=0 or sigma!=1:
return mu+sigma*inverse_normal_cdf(p,tolerance=tolerance)
low_z,low_p=-10.0,0
hi_z,hi_p=10.0,1
while hi_z-low_z>tolerance:
mid_z=(low_z+hi_z)/2
mid_p=normal_cdf(mid_z)
if mid_pp:
hi_z,hi_p=mid_z,mid_p
else:
break
return mid_z

------6.7 中心极限定理
一个定义为大量独立同分布的随机变量函数的均值的随机变量本身就是接近于正态分布的
def bernoulli_trial(p):
return 1 if random.random() < p else 0
def binomial(n,p):
return sum(bernoulli_trial(p) for _ in range(n))
#二项分布与正态分布图
def make_hist(p,n,num_points):
data=[binomial(n,p) for _ in range(num_points)]
histogram=Counter(data)
pl.bar([x-0.4 for x in histogram.keys()],[v/num_points for v in histogram.values()],0.8,color='0.75')
mu=p*n
sigma=math.sqrt(n*p*(1-p))
xs=range(min(data),max(data)+1)
ys=[normal_cdf(i+0.5,mu,sigma)-normal_cdf(i-0.5,mu,sigma) for i in xs]
pl.plot(xs,ys)
pl.title('二项分布与正态近似')
pl.show()
make_hist(0.75,100,10000)



------------------------------------------------CP7假设检验
------------Coin-example
import math
#创建正态分布曲线
def normal_appro_to_binomial(n,p):
 mu=p*n
 sigma=math.sqrt(p*(1-p)*n)
 return mu,sigma
#正态CDF是一个变量在一个阀值以下的概率
normal_ptobability_below=normal_cdf
def normal_ptobability_above(lo,mu=0,sigma=1):return 1-normal_cdf(lo,mu,sigma)
def normal_ptobability_between(lo,hi,mu=0,sigma=1):return normal_cdf(hi,mu,sigma)-normal_cdf(lo,mu,sigma)
def normal_ptobability_outside(lo,hi,mu=0,sigma=1):return 1-normal_ptobability_between(lo,hi,mu,sigma)

def normal_upper_bound(ptobability,mu=0,sigma=1):return inverse_normal_cdf(ptobability,mu,sigma)
def normal_lower_bound(ptobability,mu=0,sigma=1):return inverse_normal_cdf(1-ptobability,mu,sigma)
def normal_two_sided_bounds(ptobability,mu=0,sigma=1):
 tail_ptobability=(1-ptobability)/2
 upper_bound=normal_lower_bound(tail_ptobability,mu,sigma)
 lower_bound=normal_upper_bound(tail_ptobability,mu,sigma)
 return lower_bound,upper_bound

m0,sigma0=normal_appro_to_binomial(1000,0.5)
normal_two_sided_bounds(0.95,mu0,sigma0)

#检验势
lo,hi=normal_two_sided_bounds(0.95,mu0,sigma0)
mu1,sigma1=normal_appro_to_binomial(1000,0.55)
type_2_ptobability=normal_ptobability_between(lo,hi,mu1,sigma1)
power=1-type_2_ptobability

hi=normal_upper_bound(0.95,mu0,sigma0)
type_2_ptobability=normal_ptobability_below(lo,hi,mu1,sigma1)
power=1-type_2_ptobability

#是否均匀的双面检验
def two_sided_p_value(x,mu=0,sigma=1):
 if x >= mu:
  return 2*normal_ptobability_above(x,mu,sigma)
 else:
  return x*normal_ptobability_below(x,mu,sigma)
two_sided_p_value(529.5,mu0,sigma0)
#模拟检验
extreme_value_count=0
for _ in range(100000):
 num_heads=sum(1 if random.random()<0.5 else 0 for _ in range(1000))
 if num_heads>= 530 or num_heads<=470:
  extreme_value_count+=1
print(extreme_value_count/100000)
# so we got
upper_p_value=normal_ptobability_above
lower_p_value=normal_ptobability_below

#对于单边检验,如果我们看到525次正面朝上，那么可以计算
upper_p_value(524.5,mu0,sigma0)
upper_p_value(526.5,mu0,sigma0)

------------置信区间
math.sqrt(p*(1-p)/1000)
#根据我们跑出的次数计算p_hat
p_hat=525/1000
mu=p_hat
sigma=math.sqrt(mu*(1-mu)/1000)
#计算置信区间
normal_two_sided_bounds(0.95,mu,sigma)

------------P-hacking
def run_exp():
 return [random.random()<0.5 for _ in range(1000)]
def reject(exp):
 num_heads=len([flip for flip in exp if flip])
 return num_heads<469 or num_heads>531
random.seed(0)
exps=[run_exp() for _ in range(1000)]
num_rejectons=len([exp for exp in exps if reject(exp)])
print(num_rejectons)

------------ABtest example
def estimated_parameters(N,n):
 p=n/N
 sigma=math.sqrt(p*(1-p)/N)
 return p,sigma

def a_b_test_stas(N_A,n_A,N_B,n_B):
 P_A,sigma_A=estimated_parameters(N_A,n_A)
 P_B,sigma_B=estimated_parameters(N_B,n_B)
 return (P_B-P_A)/math.sqrt(sigma_A**2,sigma_B**2)

z=a_b_test_stas(1000,200,1000,180)
two_sided_p_value(z)
z=a_b_test_stas(1000,200,1000,150)
two_sided_p_value(z)

------------Bayes
def b(alpha,beta):
 return math.gamma(alpha)*math.gamma(beta)/math.gamma(alpha+beta)
def beta_pdf(x,alpha,beta):
 if x<0 or x>1:
  return 0
 return x**(alpha-1)*(1-x)**(beta-1)/b(alpha,beta)
alpha/(alpha+beta)



------------------------------------------------CP8梯度下降
y=2 * (x ** 2)-2*x + 1 最小值求解(梯度下降)
x = 6 
step = 0.1
for i in range(1000):
    x -= step * (4 * x-2)
def y(x):
    return 2 * (x ** 2)-2*x + 1
print ('x:',x,'y:',y(x))
#s^2函数
def sum_of_squares(v):
return sum(v_i**2 for v_i in v)

------8.2估算梯度
from functools import partial
#差商
def difference_quotient(f,x,h):return(f(x+h)-f(x))/h
#x^2
def square(x):return x*x
#x^2倒数2x
def derivative(x):return 2*x

derivative_estimate=partial(difference_quotient,square,h=0.00001)
x=range(-10,10)
pl.title('精确的倒数值与估计值')
pl.plot(x,map(derivative,x),'rx',label='Actual')
pl.plot(x,map(derivative_estimate),'b+',label='Estimate')
pl.legend(loc=9)
pl.show()
#我们把导数看成是其第i个变量的函数,其他变量保持不变,以此计算第i个偏导数
def partial_difference_quotient(f,v,i,h):
w=[v_j+(h if j==i else 0) for j,v_j in enumerate(v)]
return (f(w)-f(v))/h

------8.3使用梯度 p90
def step(v,direction,step_size):
return [v_i+step_size*direction_i for v_i,direction_i in zip(v,direction)]
def sum_of_squares_gradient(v):
    return [2*v_i for v_i in v]
v=[random.randint(-10,10) for i in range(3)]
tolerance=0.0000001
while True:
    gradient=sum_of_squares_gradient(v)
    next_v=step(v,gradient,-0.01)
    if distance(next_v,v)type the_bible.txt|python most_common_words.py 10

------9.2 read file 
---9.2.1 text_file basic
#reading
file_for_reading=open('reading_file.txt',r)
#writing
file_for_writing=open('reading_file.txt',w)
#appending
file_for_appending=open('reading_file.txt',a)
file_for_writing.close()

with open(filename,'r') as f:
 data=function_that_gets_data_from(f)
 process(data)

starts_with_hash=0
#if start with '#' then +1
with open('input.txt','r') as f:
 for line in file:
  if re.match('^#',line):
   starts_with_hash+=1

def get_domain(email_address):
 return email_address.lower().split('@')[-1]
with open('email_address.txt',r) as f:
 domain_counts=Counter(get_domain(line.strip()) for line in f if '@' in line)

a=open('C:/Users/zhangjiajun858/Desktop/te.txt','r')
with open('C:/Users/zhangjiajun858/Desktop/te.txt','r') as f:
 for line in lines:
  print line

------9.2.2 restrict file
import csv 
with open('C:/Users/zhangjiajun858/Desktop/te.csv','rb') as f:
    reader=csv.reader(f,delimiter='\t')
    for row in reader:
        date=row[0]
        symbol=row[1]
        closing_price=float(row[2])
        process(date,symbol,closing_price)

------9.3web catching (p99)
---9.3.2 O'Reilly book

------9.4 use API
---9.4.1 JSON(XML)

---9.4.2 use unverification API
import requests,json
endpoint="https://api.github.com/users/joelgrus/repos"
repos=json.loads(requests.get(endpoint).txt)

from dateutil.parser import parse
dates=[parse(repo['create_at']) for repo in repos]
month_counts=Counter(date.month for date in dates)
weekday_counts=Counter(date.weekday() for date in dates)

---9.4.3

------9.5 EX: use Twitter API
from twython import Twython
twitter=Twython(CONSUMER_KEY,CONSUMER_SECRET)
for status in twitter.search(q='"data science"')['statuses']:
 user=status['user']['screen_name'].encode('utf-8')
 text=status['text'].encode('utf-8')
 print user,':',text
 print 

------------------------------------------------CP10 data-work
------10.1 探索一维数据
import matplotlib.pyplot as pl
import math
from collections import Counter
def bucketize(point,bucket_size):return bucket_size*math.floor(point/bucket_size)
def make_histogram(points,bucket_size):return Counter(bucketize(point,bucket_size) for point in points)
def plot_histogram(points,bucket_size,titlt=''):
 histogram=make_histogram(points,bucket_size)
 pl.bar(histogram.keys(),histogram.values(),width=bucket_size)
 pl.title(title)
 pl.show()
random.seed(0)
uniform=[200*random.random()-100 for _ in range(1000)] 
normal=[57*inverse_normal_cdf(random.random()) for _ in range(1000)]
plot_histogram(uniform,10,'均匀分布的直方图')
plot_histogram(normal,10,'正态分布的直方图')

------10.1.2二维数据
df random_normal():return inverse_normal_cdf(random.random())
xs=[random_normal() for _ in range(1000)]
ys1=[x+random_normal()/2 for x in xs]
ys2=[-x+random_normal()/2 for x in xs]
pl.scatter(xs,ys1,marker='.',color='black',label='ys1')
pl.scatter(xs,ys2,marker='.',color='gray',label='ys2')
pl.xlabel('xs')
pl.ylabel('ys')
pl.legend=(loc=9)
pl.title('差别很大的联合分布')
pl.show()

print(correlation(xs,ys1))
print(correlation(xs,ys22))

------10.1.3多维数据
def correlation_matrix(data):
 _,num_columns=shape(data)
 def matrix_entry(i,j):
  return correlation(get_column(data,i),get_column(data,j))
 return make_matrix(num_columns,num_columns,matrix_entry)

import matplotlib.pyplot as pl
_,num_columns=shape(data)
fig,ax=pl.subplots(num_columns,num_columns)
for i in range(num_columns):
 for j in range(num_points):
  if i !=j :ax[i][j].scatter(get_column(datat,j),get_column(data,i))
  else:a[i][j].annotate('series'+str(i),(0.5,0.5),xycoords='axes fraction',ha='center',va='center')
  if i 0:ax[i][j].yaxis.set_visible(False)
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_xlim(ax[0][1].get_xlim())
pl.show()

------10.2清理与修改
def parse_row(input_row,parsers):
 return [parser(value) if parsers is not None else value for value,parser in zip(input_row,parsers)]
def parse_rows_with(reader,parsers):
 for row in reader:
  yield parse_row(row,parsers)
def try_or_none(f):
 def f_or_none(x):
  try:return f(x)
  except: return none
 return f_or_none
#rewrite
def parse_row():
 return [try_or_none(parser)(value) if parser is not None else value for value,parser in zip(input_row,parsers)]

import dateutil.parser
data=[]
with open(,'rb') as f:
 reader=csv.reader(f)
 for line in parse_rows_with(reader,[dateutil.parser.parse,none,float]):
  data.append(line)
for row in data:
 if any(x is None for x in row):
  print(row)
def try_parse_field(field_name,value,parser_dict):
 parser=parser_dict.get(field_name)
 if parser is not None:
  return try_or_none(parser)(value)
 else:
  return value
def parse_dict(input_dict,parser_dict):
 return{field_name:try_parse_field(field_name,value,parser_dict)
for field_name,value in input_dict.iteritems()}


------10.3 data process   P121
------10.4 data adjusting
import math
def sum_of_squares(v):
return sum(v_i**2 for v_i in v)
def magnitude(v):
return math.sqrt(sum_of_squares(v))
def vector_subtract(v,w):
return[v_i-w_j for v_i,w_j in zip(v,w)]
def distance(v,w):
return magnitude(vector_subtract(v,w))
ab=distance([63,150],[67,160])
ac=distance([63,150],[70,171])
bc=distance([67,160],[70,171])
#calculate means&STD 计算评价值和方差
def scale(data_matrix):
num_rows,num_cols=shape(data_matrix)
means=[mean(get_column(data_matrix,j)) for j in range(num_cols)]
stdevs=[standard_deviation(get_column(data_matrix,j) for j in range(num_cols))]
return means,stdevs
#then we made a new data-matrix 创建一个新矩阵
def rescale(data_matrix):
def rescaled(i,j):
if stdevs[j]>0:
return (data_matrix[i][j]-means[j])/stdevs[j]
else:
return data_matrix[i][j]
num_rows,num_cols=shape()
return make_matrix(num_rows,num_cols,rescaled)

------10.5 降维
#convert to non-zero 将数据转换成每个维度为零的形式
def fr_mean_matrix(a):
nr,nc=shape(a)
column_means,_ = scale(a)
return make_matrix(nr,nc,lambda i,j:a[i][j]-column_means[j])
#calculate the direction of vector 计算向量方向
def direction(w):
mag=magnitude(w)
return [w_i/mag for w_i in w]
#calculate direction-w 's var 计算w方向上的方差
def directional_variance_i(x_i,w):
return dot(x_i,direction(w))**2
def directional_variance(X,w):
return sum(directional_variance_i(x_i,w) for x_i in X)
#使用梯度下降计算 方差最大的那个方向
def directional_variance_gradient_i(x_i,w):
projection_length=dot(x_i,direction(w))
return [2*projection_length*x_ij for x_ij in x_i]
def directional_variance_gradient(X,w):
return vector_sum(directional_variance_gradient_i(x_i,w) for x_i in X)
#使用随机梯度下降法
def first_principal_component_sgd(X):
guess=[1 for _ in X[0]]
unscaled_maximizer=maximize_stochastic(
lambda x,_, w:directional_variance_i(x,w),
lambda x,_, w:directional_variance_gradient_i(x,w),
X,
[None for _ in X],guess)
return direction(unscaled_maximizer)
#使用以上方程得到第一主成分方向后,我们将数据在这个方向上投影得到这个成分的值
def project(v,w):
projection_length=dot(v,w)
return scalar_multiply(projection_length,w)
#if we want get anyother we remove p127
def remove_projection_from_vector(v,w):
return vector_subtract(v,project(v,w))
def remove_projection(X,w):
return [remove_projection_from_vector(x_i,w) for x_i in X]
#在更高维度的数据集中,我们可以通过迭代找到我们所需的任意数目的主成分
def principal_component_analysis(X,num_components):
components=[]
for _ in range(num_components):
component=first_principal_component(X)
components.append(component)
x=remove_projection(X,component)
return components
#然后将数据转换为由主成分生成的低维空间中的点
def transform_vector(X,components):
return [dot(v,w) for w in components]
def transform(X,components):
return [transform_vector(x_i,components) for x_i in X ]
---p128



------------------------------------------------CP11 ML
#划分数据集
def split_data(data,prob):
results=[],[]
for row in data:
results[0 if random.random()
model=SomeKindOfModel()
x_train,x_test,y_train,y_test=train_test_split(xs,ys,0.33)
model.train(x_train,y_train)
performance=model.test(x_test,y_test)
#计算准确率
def accuracy(tp,fp,fn,tn):
correct=tp+tn
total=tp+fp+fn+tn
return correct/total
print(accuracy(70,4930,13930,981070))
#计算查准率(precision) 和 查全率(recall)
def precision(tp,fp,fn,tn):
return tp/(tp+fp)
print(precision(70,4930,13930,981070))
def recall(tp,fp,fn,tn):
return tp/(tp+fn)
print(recall(70,4930,13930,981070))
#F1-score(结合查准率和查全率)
def f1_score(tp,fp,fn,tn):
p=precision(tp,fp,fn,tn)
r=recall(tp,fp,fn,tn)
return 2*p*r/(p+r)




import random 
import numpy as np


a=[]
b=[]
c=[]
for i in range(1000000):
if i%3==0 or i%4==0:
a.append(i)
a.remove(0)
for x in a:
for y in a:
for z in a:
n=x+y+z
b.append(n)
for i in b:
if i%x==0 and i%y==0 and i%z==0:
print(i)
print(c[:2])


------------------------------------------------CP12 k-Nearest Neighbors (k近邻法)
#投票统计函数
from collections import Counter
def raw_majority_vote(labels):
 votes=Counter(labels)
 winner,_=votes.most_common(1)[0]
 return winner 

#随机选择一个获胜者
#根据距离加权投票并选择加权的获胜者
#减少k值知道找到唯一获胜者
def majority_vote(labels):
 vote_counts=Counter(labels)
 winner,winner_count=vote_counts.most_common(1)[0]
 num_winners=len([count for count in vote_counts.values() if count==winner_count])
 if num_winners==1:
  return winner 
 else:
  return majority_vote(labels[:-1])

#创建分类器
def knn_classify(k,labeled_points,new_point):
 by_distance=sorted(labeled_points,key=lambda point: distance(point,new_point))
 k_nearest_labels=[label for _,label in by_distance[:k]]
 return majority_vote(k_nearest,labels)

#12.2 案例：最喜欢的编程语言
cities = [(-86.75,33.5666666666667,'Python'),(-88.25,30.6833333333333,'Python'),(-112.016666666667,33.4333333333333,'Java'),(-110.933333333333,32.1166666666667,'Java'),(-92.2333333333333,34.7333333333333,'R'),(-121.95,37.7,'R'),(-118.15,33.8166666666667,'Python'),(-118.233333333333,34.05,'Java'),(-122.316666666667,37.8166666666667,'R'),(-117.6,34.05,'Python'),(-116.533333333333,33.8166666666667,'Python'),(-121.5,38.5166666666667,'R'),(-117.166666666667,32.7333333333333,'R'),(-122.383333333333,37.6166666666667,'R'),(-121.933333333333,37.3666666666667,'R'),(-122.016666666667,36.9833333333333,'Python'),(-104.716666666667,38.8166666666667,'Python'),(-104.866666666667,39.75,'Python'),(-72.65,41.7333333333333,'R'),(-75.6,39.6666666666667,'Python'),(-77.0333333333333,38.85,'Python'),(-80.2666666666667,25.8,'Java'),(-81.3833333333333,28.55,'Java'),(-82.5333333333333,27.9666666666667,'Java'),(-84.4333333333333,33.65,'Python'),(-116.216666666667,43.5666666666667,'Python'),(-87.75,41.7833333333333,'Java'),(-86.2833333333333,39.7333333333333,'Java'),(-93.65,41.5333333333333,'Java'),(-97.4166666666667,37.65,'Java'),(-85.7333333333333,38.1833333333333,'Python'),(-90.25,29.9833333333333,'Java'),(-70.3166666666667,43.65,'R'),(-76.6666666666667,39.1833333333333,'R'),(-71.0333333333333,42.3666666666667,'R'),(-72.5333333333333,42.2,'R'),(-83.0166666666667,42.4166666666667,'Python'),(-84.6,42.7833333333333,'Python'),(-93.2166666666667,44.8833333333333,'Python'),(-90.0833333333333,32.3166666666667,'Java'),(-94.5833333333333,39.1166666666667,'Java'),(-90.3833333333333,38.75,'Python'),(-108.533333333333,45.8,'Python'),(-95.9,41.3,'Python'),(-115.166666666667,36.0833333333333,'Java'),(-71.4333333333333,42.9333333333333,'R'),(-74.1666666666667,40.7,'R'),(-106.616666666667,35.05,'Python'),(-78.7333333333333,42.9333333333333,'R'),(-73.9666666666667,40.7833333333333,'R'),(-80.9333333333333,35.2166666666667,'Python'),(-78.7833333333333,35.8666666666667,'Python'),(-100.75,46.7666666666667,'Java'),(-84.5166666666667,39.15,'Java'),(-81.85,41.4,'Java'),(-82.8833333333333,40,'Java'),(-97.6,35.4,'Python'),(-122.666666666667,45.5333333333333,'Python'),(-75.25,39.8833333333333,'Python'),(-80.2166666666667,40.5,'Python'),(-71.4333333333333,41.7333333333333,'R'),(-81.1166666666667,33.95,'R'),(-96.7333333333333,43.5666666666667,'Python'),(-90,35.05,'R'),(-86.6833333333333,36.1166666666667,'R'),(-97.7,30.3,'Python'),(-96.85,32.85,'Java'),(-95.35,29.9666666666667,'Java'),(-98.4666666666667,29.5333333333333,'Java'),(-111.966666666667,40.7666666666667,'Python'),(-73.15,44.4666666666667,'R'),(-77.3333333333333,37.5,'Python'),(-122.3,47.5333333333333,'Python'),(-89.3333333333333,43.1333333333333,'R'),(-104.816666666667,41.15,'Java')]
#processing
cities = [([longitude, latitude], language) for longitude, latitude, language in cities]
# key is language, value is pair (longitudes, latitudes)
plots = { "Java" : ([], []), "Python" : ([], []), "R" : ([], []) }    
# we want each language to have a different marker and color
markers = { "Java" : "o", "Python" : "s", "R" : "^" }
colors = { "Java" : "r", "Python" : "b", "R" : "g" }
for (longitude, latitude), language in cities:
 plots[language][0].append(longitude)
 plots[language][1].append(latitude)
# create a scatter series for each language
for language, (x, y) in plots.items():
 plt.scatter(x, y, color=colors[language], marker=markers[language],
 label=language, zorder=10)
 #plot_state_borders(plt) # assume we have a function that does this
 plt.legend(loc=0) # let matplotlib choose the location
 plt.axis([-130,-60,20,55]) # set the axes
 plt.title("Favorite Programming Languages")
 plt.show()

'''for k in [1, 3, 5, 7]:
 num_correct = 0
 for city in cities:
  location, actual_language=city
  other_cities = [other_city for other_city in cities if other_city != city]
  predicted_language = knn_classify(k, other_cities, location)
  if predicted_language == actual_language:
   num_correct += 1
 print(k, "neighbor[s]:", num_correct, "correct out of", len(cities))

#TypeError: unsupported operand type(s) for -: 'list' and 'float'
'''

#12.3维度灾难
#创建一个可以得到指定个数的随机数列表
def random_point(dim):
 return [random.random() for _ in range(dim)]

def random_distances(dim,num_pairs):
 return [distance(random_point(dim),random+point(dim)) for _ in range(num_pairs)]

dimensions=range(1,101)
avg_distances=[]
min_distances=[]
random.seed(0)
for dim in dimensions:
 distances=random_distances(dim,10000)
 avg_distances.append(mean(distance))
 min_distances.append(min(distance))

min_avg_ratio=[min_dist/avg_dist for min_dist,avg_dist in zip(min_distances,avg_distances)]



------------------------------------------------CP13 Naive Bayes(朴素贝叶斯算法)
def tokenize(message):
 message = message.lower()                      # convert to lowercase
 all_words = re.findall("[a-z0-9']+", message)  # extract the words
 return set(all_words)                          # remove duplicates

def count_words(training_set):
"""training set consists of pairs (message, is_spam)"""
 counts = defaultdict(lambda: [0, 0])
 for message, is_spam in training_set:
  for word in tokenize(message):
   counts[word][0 if is_spam else 1] += 1
 return counts

def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
"""turn the word_counts into a list of triplets w, p(w | spam) and p(w | ~spam)"""
 return [(w,(spam + k) / (total_spams + 2 * k),(non_spam + k) / (total_non_spams + 2 * k)) for w, (spam, non_spam) in counts.iteritems()]

def spam_probability(word_probs, message):
 message_words = tokenize(message)
 log_prob_if_spam = log_prob_if_not_spam = 0.0
 for word, prob_if_spam, prob_if_not_spam in word_probs:
# for each word in the message,
# add the log probability of seeing it
  if word in message_words:
   log_prob_if_spam += math.log(prob_if_spam)
   log_prob_if_not_spam += math.log(prob_if_not_spam)
# for each word that's not in the message
# add the log probability of _not_ seeing it
  else:
   log_prob_if_spam += math.log(1.0 - prob_if_spam)
   log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)    
 prob_if_spam = math.exp(log_prob_if_spam)
 prob_if_not_spam = math.exp(log_prob_if_not_spam)
 return prob_if_spam / (prob_if_spam + prob_if_not_spam)


class NaiveBayesClassifier:
 def __init__(self, k=0.5):
  self.k = k
  self.word_probs = []
 def train(self, training_set):
# count spam and non-spam messages
  num_spams = len([is_spam for message, is_spam in training_set if is_spam])
  num_non_spams = len(training_set) - num_spams
# run training data through our "pipeline"
  word_counts = count_words(training_set)
  self.word_probs = word_probabilities(word_counts,num_spams,num_non_spams,self.k)
 def classify(self, message):
  return spam_probability(self.word_probs, message)
    
#13.4 moudel_test


data = []
# regex for stripping out the leading "Subject:" and any spaces after it
subject_regex = re.compile(r"^Subject:\s+")
# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
 is_spam = "ham" not in fn
 with open(fn,'r') as file:
  for line in file:
   if line.startswith("Subject:"):
    subject = subject_regex.sub("", line).strip()
    data.append((subject, is_spam))
    
random.seed(0) # just so you get the same answers as me
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)
classified = [(subject, is_spam, classifier.classify(subject)) for subject, is_spam in test_data]
counts = Counter((is_spam, spam_probability > 0.5) for _, is_spam, spam_probability in classified)
classified.sort(key=lambda row: row[2])
spammiest_hams = filter(lambda row: not row[1], classified)[-5:]
hammiest_spams = filter(lambda row: row[1], classified)[:5]


def p_spam_given_word(word_prob):
 word, prob_if_spam, prob_if_not_spam = word_prob
 return prob_if_spam / (prob_if_spam + prob_if_not_spam)
words = sorted(classifier.word_probs, key=p_spam_given_word)    
spammiest_words = words[-5:]
hammiest_words = words[:5]

def drop_final_s(word):
 return ("s$","",word)


------------------------------------------------*args,**kwargs 参数
*args,**kwargs
https://www.cnblogs.com/xuyuanyuan123/p/6674645.html
http://python.jobbole.com/83476/

#*args表示任何多个无名参数，它是一个tuple
#**kwargs表示关键字参数，它是一个dict
#形参*args可以传入不指定tuple，**kwargs传入一个dict
在函数定义中使用*args和**kwargs传递可变长参数。
*args用作传递非命名键值可变长参数列表（位置参数）; **kwargs用作传递键值可变长参数列表。

def foo(*args,**kwargs):
 print('args=',args)
 print('kwargs=',kwargs)
 print('**********************')
if __name__=='__main__':
 foo(1,2,3)
 foo(a=1,b=2,c=3)
 foo(1,2,3,a=1,b=2,c=3)
 foo(1,'b','c',a=1,b='b',c='c')


------------------------------------------------CP14 Simple Linear Regression (简单线性回归)
#Y_i=bX_i+a+e
num_friends_good = [49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
daily_minutes_good = [68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84]
    

def predict(alpha, beta, x_i):
    return beta * x_i + alpha
#计算误差
def error(alpha,beta,x_i,y_i):
    return y_i-predict(alpha,beta,x_i)
#计算误差平方和
def sum_of_squared_errors(alpha,beta,x,y):
    return sum(error(alpha,beta,x_i,y_i)**2 for x_i,y_i in zip(x,y))
#利用最小二乘法计算误差最小时的alpha和beta
def least_squares_fit(x,y):
    beta=correlation(x,y)*standard_deviation(y)/standard_deviation(x)
    alpha=mean(y)-beta*mean(x)
    return alpha,beta

alpha,beta=least_squares_fit(num_friends_good,daily_minutes_good)

#R^2(决定系数)：用来评估模型的拟合效果
def total_sum_of_squares(y):
    return sum(v**2 for v in de_mean(y))
def r_squared(alpha,beta,x,y):
    return 1.0-(sum_of_squared_errors(alpha,beta,x,y)/total_sum_of_squares(y))
r_squared(alpha,beta,num_friends_good,daily_minutes_good)                          --->0.329

#利用梯度下降法求解
def squared_error(x_i,y_i,theta):
    alpha,beta=theta
    return error(alpha,beta,x_i,y_i)**2
def squared_error_gradient(x_i,y_i,theta):
    alpha,beta=theta
    return [-2*error(alpha,beta,x_i,y_i),-2*error(alpha,beta,x_i,y_i)*x_i]

random.seed(0)
theta=[random.random(),random.random()]
alpha,beta=minimize_stochastic(squared_error,squared_error_gradient,num_friends_good,daily_minutes_good,theta,0.0001 )
print(alpha,beta)



------------------------------------------------CP15 Multiple Regression(多重回归分析)
if __name__ == "__main__":
    
x = [[1,49,4,0],[1,41,9,0],[1,40,8,0],[1,25,6,0],[1,21,1,0],[1,21,0,0],[1,19,3,0],[1,19,0,0],[1,18,9,0],[1,18,8,0],[1,16,4,0],[1,15,3,0],[1,15,0,0],[1,15,2,0],[1,15,7,0],[1,14,0,0],[1,14,1,0],[1,13,1,0],[1,13,7,0],[1,13,4,0],[1,13,2,0],[1,12,5,0],[1,12,0,0],[1,11,9,0],[1,10,9,0],[1,10,1,0],[1,10,1,0],[1,10,7,0],[1,10,9,0],[1,10,1,0],[1,10,6,0],[1,10,6,0],[1,10,8,0],[1,10,10,0],[1,10,6,0],[1,10,0,0],[1,10,5,0],[1,10,3,0],[1,10,4,0],[1,9,9,0],[1,9,9,0],[1,9,0,0],[1,9,0,0],[1,9,6,0],[1,9,10,0],[1,9,8,0],[1,9,5,0],[1,9,2,0],[1,9,9,0],[1,9,10,0],[1,9,7,0],[1,9,2,0],[1,9,0,0],[1,9,4,0],[1,9,6,0],[1,9,4,0],[1,9,7,0],[1,8,3,0],[1,8,2,0],[1,8,4,0],[1,8,9,0],[1,8,2,0],[1,8,3,0],[1,8,5,0],[1,8,8,0],[1,8,0,0],[1,8,9,0],[1,8,10,0],[1,8,5,0],[1,8,5,0],[1,7,5,0],[1,7,5,0],[1,7,0,0],[1,7,2,0],[1,7,8,0],[1,7,10,0],[1,7,5,0],[1,7,3,0],[1,7,3,0],[1,7,6,0],[1,7,7,0],[1,7,7,0],[1,7,9,0],[1,7,3,0],[1,7,8,0],[1,6,4,0],[1,6,6,0],[1,6,4,0],[1,6,9,0],[1,6,0,0],[1,6,1,0],[1,6,4,0],[1,6,1,0],[1,6,0,0],[1,6,7,0],[1,6,0,0],[1,6,8,0],[1,6,4,0],[1,6,2,1],[1,6,1,1],[1,6,3,1],[1,6,6,1],[1,6,4,1],[1,6,4,1],[1,6,1,1],[1,6,3,1],[1,6,4,1],[1,5,1,1],[1,5,9,1],[1,5,4,1],[1,5,6,1],[1,5,4,1],[1,5,4,1],[1,5,10,1],[1,5,5,1],[1,5,2,1],[1,5,4,1],[1,5,4,1],[1,5,9,1],[1,5,3,1],[1,5,10,1],[1,5,2,1],[1,5,2,1],[1,5,9,1],[1,4,8,1],[1,4,6,1],[1,4,0,1],[1,4,10,1],[1,4,5,1],[1,4,10,1],[1,4,9,1],[1,4,1,1],[1,4,4,1],[1,4,4,1],[1,4,0,1],[1,4,3,1],[1,4,1,1],[1,4,3,1],[1,4,2,1],[1,4,4,1],[1,4,4,1],[1,4,8,1],[1,4,2,1],[1,4,4,1],[1,3,2,1],[1,3,6,1],[1,3,4,1],[1,3,7,1],[1,3,4,1],[1,3,1,1],[1,3,10,1],[1,3,3,1],[1,3,4,1],[1,3,7,1],[1,3,5,1],[1,3,6,1],[1,3,1,1],[1,3,6,1],[1,3,10,1],[1,3,2,1],[1,3,4,1],[1,3,2,1],[1,3,1,1],[1,3,5,1],[1,2,4,1],[1,2,2,1],[1,2,8,1],[1,2,3,1],[1,2,1,1],[1,2,9,1],[1,2,10,1],[1,2,9,1],[1,2,4,1],[1,2,5,1],[1,2,0,1],[1,2,9,1],[1,2,9,1],[1,2,0,1],[1,2,1,1],[1,2,1,1],[1,2,4,1],[1,1,0,1],[1,1,2,1],[1,1,2,1],[1,1,5,1],[1,1,3,1],[1,1,10,1],[1,1,6,1],[1,1,0,1],[1,1,8,1],[1,1,6,1],[1,1,4,1],[1,1,9,1],[1,1,9,1],[1,1,4,1],[1,1,2,1],[1,1,9,1],[1,1,0,1],[1,1,8,1],[1,1,6,1],[1,1,1,1],[1,1,1,1],[1,1,5,1]]
daily_minutes_good =[68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84]

#15.1 minutes=a+b_1*friends+b_2*work_hours+b3phd_+e
#对于多重回归 可以看成简单线性的列衍生（只要在数据列中加一列）
from __future__ import division
from collections import Counter
from functools import partial

def predict(x_i,beta):
    return dot(x_i,beta)

#15.2 最小二乘模拟的进一步假设 P165
#15.3拟合模型
#计算误差&平方和
def error(x_i, y_i, beta):
    return y_i - predict(x_i, beta)
def squared_error(x_i, y_i, beta):
    return error(x_i, y_i, beta) ** 2
#w2
def squared_error_gradient(x_i, y_i, beta):
    """the gradient corresponding to the ith squared error term"""
    return [-2 * x_ij * error(x_i, y_i, beta) for x_ij in x_i]

def estimate_beta(x, y):
    beta_initial = [random.random() for x_i in x[0]]
    return minimize_stochastic(squared_error,squared_error_gradient,x, y,beta_initial,0.001) 
random.seed(0)
beta=estimate_beta(x,daily_minutes_good)

#15.5拟合优度
#r^2
def multiple_r_squared(x, y, beta):
    sum_of_squared_errors = sum(error(x_i, y_i, beta) ** 2 for x_i, y_i in zip(x, y))
    return 1.0 - sum_of_squared_errors / total_sum_of_squares(y)

#15.6Bootstrap
data=get_sample(num_points=n)
def bootstrap_sample(data):
    """randomly samples len(data) elements with replacement"""
    return [random.choice(data) for _ in data]
    
def bootstrap_statistic(data, stats_fn, num_samples):
    """evaluates stats_fn on num_samples bootstrap samples from data"""
    return [stats_fn(bootstrap_sample(data))
    for _ in range(num_samples)]

close_to_100=[99.5+random.random() for _ in range(101)]
far_from_100=([99.5+random.random()]+[random.random() for _ in range(50)]+[200+random.random() for _ in range(50)])

bootstrap_statistic(close_to_100,median,100)
bootstrap_statistic(far_from_100,median,100)

#15.7 回归系数的标准误差
def estimate_sample_beta(sample):
    x_sample, y_sample = zip(*sample) # magic unzipping trick
    return estimate_beta(x_sample, y_sample)
random.seed(0)
bootstrap_betas=bootstrap_statistic(list(zip(x,daily_minutes_good)),estimate_sample_beta,100)
bootstrap_standard_errors=[standard_deviation([beta[i] for beta in bootstrap_betas]) for i in range(4)]    

#计算P值
def p_value(beta_hat_j, sigma_hat_j):
    if beta_hat_j > 0:
        return 2 * (1 - normal_cdf(beta_hat_j / sigma_hat_j))
    else:
        return 2 * normal_cdf(beta_hat_j / sigma_hat_j)

#15.8 正则化
def ridge_penalty(beta, alpha):
    return alpha * dot(beta[1:], beta[1:])
    
def squared_error_ridge(x_i, y_i, beta, alpha):
    """estimate error plus ridge penalty on beta"""
    return error(x_i, y_i, beta) ** 2 + ridge_penalty(beta, alpha)
    
def ridge_penalty_gradient(beta, alpha):
    """gradient of just the ridge penalty"""
    return [0] + [2 * alpha * beta_j for beta_j in beta[1:]]
    
def squared_error_ridge_gradient(x_i, y_i, beta, alpha):
    """the gradient corresponding to the ith squared error term
    including the ridge penalty"""
    return vector_add(squared_error_gradient(x_i, y_i, beta),ridge_penalty_gradient(beta, alpha))
    
def estimate_beta_ridge(x, y, alpha):
    """use gradient descent to fit a ridge regression
    with penalty alpha"""
    beta_initial = [random.random() for x_i in x[0]]
    return minimize_stochastic(partial(squared_error_ridge, alpha=alpha),partial(squared_error_ridge_gradient,alpha=alpha),x, y,beta_initial,0.001)
    
random.seed(0)
beta = estimate_beta_ridge(x, daily_minutes_good, alpha=0.0)

def lasso_penalty(beta, alpha):
    return alpha * sum(abs(beta_i) for beta_i in beta[1:]) 


------------------------------------------------CP16 Logistic Regression(逻辑回归)
#raw_data[experience,salary,paid_account]
data = [(0.7,48000,1),(1.9,48000,0),(2.5,60000,1),(4.2,63000,0),(6,76000,0),(6.5,69000,0),(7.5,76000,0),(8.1,88000,0),(8.7,83000,1),(10,83000,1),(0.8,43000,0),(1.8,60000,0),(10,79000,1),(6.1,76000,0),(1.4,50000,0),(9.1,92000,0),(5.8,75000,0),(5.2,69000,0),(1,56000,0),(6,67000,0),(4.9,74000,0),(6.4,63000,1),(6.2,82000,0),(3.3,58000,0),(9.3,90000,1),(5.5,57000,1),(9.1,102000,0),(2.4,54000,0),(8.2,65000,1),(5.3,82000,0),(9.8,107000,0),(1.8,64000,0),(0.6,46000,1),(0.8,48000,0),(8.6,84000,1),(0.6,45000,0),(0.5,30000,1),(7.3,89000,0),(2.5,48000,1),(5.6,76000,0),(7.4,77000,0),(2.7,56000,0),(0.7,48000,0),(1.2,42000,0),(0.2,32000,1),(4.7,56000,1),(2.8,44000,1),(7.6,78000,0),(1.1,63000,0),(8,79000,1),(2.7,56000,0),(6,52000,1),(4.6,56000,0),(2.5,51000,0),(5.7,71000,0),(2.9,65000,0),(1.1,33000,1),(3,62000,0),(4,71000,0),(2.4,61000,0),(7.5,75000,0),(9.7,81000,1),(3.2,62000,0),(7.9,88000,0),(4.7,44000,1),(2.5,55000,0),(1.6,41000,0),(6.7,64000,1),(6.9,66000,1),(7.9,78000,1),(8.1,102000,0),(5.3,48000,1),(8.5,66000,1),(0.2,56000,0),(6,69000,0),(7.5,77000,0),(8,86000,0),(4.4,68000,0),(4.9,75000,0),(1.5,60000,0),(2.2,50000,0),(3.4,49000,1),(4.2,70000,0),(7.7,98000,0),(8.2,85000,0),(5.4,88000,0),(0.1,46000,0),(1.5,37000,0),(6.3,86000,0),(3.7,57000,0),(8.4,85000,0),(2,42000,0),(5.8,69000,1),(2.7,64000,0),(3.1,63000,0),(1.9,48000,0),(10,72000,1),(0.2,45000,0),(8.6,95000,0),(1.5,64000,0),(9.8,95000,0),(5.3,65000,0),(7.5,80000,0),(9.9,91000,0),(9.7,50000,1),(2.8,68000,0),(3.6,58000,0),(3.9,74000,0),(4.4,76000,0),(2.5,49000,0),(7.2,81000,0),(5.2,60000,1),(2.4,62000,0),(8.9,94000,0),(2.4,63000,0),(6.8,69000,1),(6.5,77000,0),(7,86000,0),(9.4,94000,0),(7.8,72000,1),(0.2,53000,0),(10,97000,0),(5.5,65000,0),(7.7,71000,1),(8.1,66000,1),(9.8,91000,0),(8,84000,0),(2.7,55000,0),(2.8,62000,0),(9.4,79000,0),(2.5,57000,0),(7.4,70000,1),(2.1,47000,0),(5.3,62000,1),(6.3,79000,0),(6.8,58000,1),(5.7,80000,0),(2.2,61000,0),(4.8,62000,0),(3.7,64000,0),(4.1,85000,0),(2.3,51000,0),(3.5,58000,0),(0.9,43000,0),(0.9,54000,0),(4.5,74000,0),(6.5,55000,1),(4.1,41000,1),(7.1,73000,0),(1.1,66000,0),(9.1,81000,1),(8,69000,1),(7.3,72000,1),(3.3,50000,0),(3.9,58000,0),(2.6,49000,0),(1.6,78000,0),(0.7,56000,0),(2.1,36000,1),(7.5,90000,0),(4.8,59000,1),(8.9,95000,0),(6.2,72000,0),(6.3,63000,0),(9.1,100000,0),(7.3,61000,1),(5.6,74000,0),(0.5,66000,0),(1.1,59000,0),(5.1,61000,0),(6.2,70000,0),(6.6,56000,1),(6.3,76000,0),(6.5,78000,0),(5.1,59000,0),(9.5,74000,1),(4.5,64000,0),(2,54000,0),(1,52000,0),(4,69000,0),(6.5,76000,0),(3,60000,0),(4.5,63000,0),(7.8,70000,0),(3.9,60000,1),(0.8,51000,0),(4.2,78000,0),(1.1,54000,0),(6.2,60000,0),(2.9,59000,0),(2.1,52000,0),(8.2,87000,0),(4.8,73000,0),(2.2,42000,1),(9.1,98000,0),(6.5,84000,0),(6.9,73000,0),(5.1,72000,0),(9.1,69000,1),(9.8,79000,1)]
#取出experience,salary
x=[[1]+list(row[:2]) for row in data]
#取出是否是付费用户(1 or 0)
y=[row[2] for row in data]
#利用线性回归找出最佳模型 paid_account=b_0+b_1*experience+b_2*salary+e
rescaled_x=rescale(x)
beta=estimate_beta(rescaled_x,y)
predictions=[predict(x_i,beta) for x_i in rescaled_x]
plt.scatter(predictions,y)
plt.xlabel('predicted')
plt.ylabel('actual')
pl.show()
#logistic函数
def logistic(x):
    return 1.0 / (1 + math.exp(-x))
def logistic_prime(x):
    return logistic(x) * (1 - logistic(x))

def logistic_log_likelihood_i(x_i, y_i, beta):
    if y_i == 1:
        return math.log(logistic(dot(x_i, beta)))
    else:
        return math.log(1 - logistic(dot(x_i, beta)))

def logistic_log_partial_ij(x_i, y_i, beta, j):
    """here i is the index of the data point,
    j the index of the derivative"""

    return (y_i - logistic(dot(x_i, beta))) * x_i[j]
    
def logistic_log_gradient_i(x_i, y_i, beta):
    """the gradient of the log likelihood 
    corresponding to the i-th data point"""

    return [logistic_log_partial_ij(x_i, y_i, beta, j) for j, _ in enumerate(beta)]
            
def logistic_log_gradient(x, y, beta):
    return reduce(vector_add,[logistic_log_gradient_i(x_i, y_i, beta) for x_i, y_i in zip(x,y)]) 

data = [(0.7,48000,1),(1.9,48000,0),(2.5,60000,1),(4.2,63000,0),(6,76000,0),(6.5,69000,0),(7.5,76000,0),(8.1,88000,0),(8.7,83000,1),(10,83000,1),(0.8,43000,0),(1.8,60000,0),(10,79000,1),(6.1,76000,0),(1.4,50000,0),(9.1,92000,0),(5.8,75000,0),(5.2,69000,0),(1,56000,0),(6,67000,0),(4.9,74000,0),(6.4,63000,1),(6.2,82000,0),(3.3,58000,0),(9.3,90000,1),(5.5,57000,1),(9.1,102000,0),(2.4,54000,0),(8.2,65000,1),(5.3,82000,0),(9.8,107000,0),(1.8,64000,0),(0.6,46000,1),(0.8,48000,0),(8.6,84000,1),(0.6,45000,0),(0.5,30000,1),(7.3,89000,0),(2.5,48000,1),(5.6,76000,0),(7.4,77000,0),(2.7,56000,0),(0.7,48000,0),(1.2,42000,0),(0.2,32000,1),(4.7,56000,1),(2.8,44000,1),(7.6,78000,0),(1.1,63000,0),(8,79000,1),(2.7,56000,0),(6,52000,1),(4.6,56000,0),(2.5,51000,0),(5.7,71000,0),(2.9,65000,0),(1.1,33000,1),(3,62000,0),(4,71000,0),(2.4,61000,0),(7.5,75000,0),(9.7,81000,1),(3.2,62000,0),(7.9,88000,0),(4.7,44000,1),(2.5,55000,0),(1.6,41000,0),(6.7,64000,1),(6.9,66000,1),(7.9,78000,1),(8.1,102000,0),(5.3,48000,1),(8.5,66000,1),(0.2,56000,0),(6,69000,0),(7.5,77000,0),(8,86000,0),(4.4,68000,0),(4.9,75000,0),(1.5,60000,0),(2.2,50000,0),(3.4,49000,1),(4.2,70000,0),(7.7,98000,0),(8.2,85000,0),(5.4,88000,0),(0.1,46000,0),(1.5,37000,0),(6.3,86000,0),(3.7,57000,0),(8.4,85000,0),(2,42000,0),(5.8,69000,1),(2.7,64000,0),(3.1,63000,0),(1.9,48000,0),(10,72000,1),(0.2,45000,0),(8.6,95000,0),(1.5,64000,0),(9.8,95000,0),(5.3,65000,0),(7.5,80000,0),(9.9,91000,0),(9.7,50000,1),(2.8,68000,0),(3.6,58000,0),(3.9,74000,0),(4.4,76000,0),(2.5,49000,0),(7.2,81000,0),(5.2,60000,1),(2.4,62000,0),(8.9,94000,0),(2.4,63000,0),(6.8,69000,1),(6.5,77000,0),(7,86000,0),(9.4,94000,0),(7.8,72000,1),(0.2,53000,0),(10,97000,0),(5.5,65000,0),(7.7,71000,1),(8.1,66000,1),(9.8,91000,0),(8,84000,0),(2.7,55000,0),(2.8,62000,0),(9.4,79000,0),(2.5,57000,0),(7.4,70000,1),(2.1,47000,0),(5.3,62000,1),(6.3,79000,0),(6.8,58000,1),(5.7,80000,0),(2.2,61000,0),(4.8,62000,0),(3.7,64000,0),(4.1,85000,0),(2.3,51000,0),(3.5,58000,0),(0.9,43000,0),(0.9,54000,0),(4.5,74000,0),(6.5,55000,1),(4.1,41000,1),(7.1,73000,0),(1.1,66000,0),(9.1,81000,1),(8,69000,1),(7.3,72000,1),(3.3,50000,0),(3.9,58000,0),(2.6,49000,0),(1.6,78000,0),(0.7,56000,0),(2.1,36000,1),(7.5,90000,0),(4.8,59000,1),(8.9,95000,0),(6.2,72000,0),(6.3,63000,0),(9.1,100000,0),(7.3,61000,1),(5.6,74000,0),(0.5,66000,0),(1.1,59000,0),(5.1,61000,0),(6.2,70000,0),(6.6,56000,1),(6.3,76000,0),(6.5,78000,0),(5.1,59000,0),(9.5,74000,1),(4.5,64000,0),(2,54000,0),(1,52000,0),(4,69000,0),(6.5,76000,0),(3,60000,0),(4.5,63000,0),(7.8,70000,0),(3.9,60000,1),(0.8,51000,0),(4.2,78000,0),(1.1,54000,0),(6.2,60000,0),(2.9,59000,0),(2.1,52000,0),(8.2,87000,0),(4.8,73000,0),(2.2,42000,1),(9.1,98000,0),(6.5,84000,0),(6.9,73000,0),(5.1,72000,0),(9.1,69000,1),(9.8,79000,1),]
data = map(list, data) 

#16.3应用模型 P178
random.seed(0)
x_train, x_test, y_train, y_test = train_test_split(rescaled_x, y, 0.33)
fn = partial(logistic_log_likelihood, x_train, y_train)
gradient_fn = partial(logistic_log_gradient, x_train, y_train)
# pick a random starting point
beta_0 = [1, 1, 1]
# and maximize using gradient descent
beta_hat = maximize_batch(fn, gradient_fn, beta_0)
beta_hat = maximize_stochastic(logistic_log_likelihood_i,logistic_log_gradient_i,x_train, y_train, beta_0)


#16.4拟合优度
rue_positives = false_positives = true_negatives = false_negatives = 0

for x_i, y_i in zip(x_test, y_test):
        predict = logistic(dot(beta_hat, x_i))

        if y_i == 1 and predict >= 0.5:  # TP: paid and we predict paid
            true_positives += 1
        elif y_i == 1:                   # FN: paid and we predict unpaid
            false_negatives += 1
        elif predict >= 0.5:             # FP: unpaid and we predict paid
            false_positives += 1
        else:                            # TN: unpaid and we predict unpaid
            true_negatives += 1

precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
predictions=[logistic(dot(beta_hat,x_i)) for x_i in x_test]
plt.scatter(predictions,y_test)
plt.xlabel('predicted probability')
plt.ylabel('actual outcome')
plt.title('logistic regression predicted vs. actual')
pl.show()


------------------------------------------------CP17 Decision Trees(决策树)
#data
inputs = [
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},  False),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},  True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),
        ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),
        ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
        ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
        ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),
        ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)]
#17.2熵
"""given a list of class probabilities, compute the entropy"""
def entropy(class_probabilities):
    return sum(-p * math.log(p, 2) for p in class_probabilities if p)
#计算类别概率
def class_probabilities(labels):
    total_count = len(labels)
    return [count / total_count for count in Counter(labels).values()]
def data_entropy(labeled_data):        
    labels = [label for _, label in labeled_data]
    probabilities = class_probabilities(labels)
    return entropy(probabilities)

#17.3分割之熵
#在数学上我们把数据集S划分成若干子集s1,s2,s3...，各个子集数量上所占比为q1,q2,q3... 我们通过加权形式计算分割熵
def partition_entropy(subsets):
    """find the entropy from this partition of data into subsets"""
    total_count = sum(len(subset) for subset in subsets)
    return sum( data_entropy(subset) * len(subset) / total_count for subset in subsets )

def group_by(items, key_fn):
    """returns a defaultdict(list), where each input item is in the list whose key is key_fn(item)"""
    groups = defaultdict(list)
    for item in items:
        key = key_fn(item)

        groups[key].append(item)
    return groups
#17.4创建决策树
def partition_by(inputs, attribute):
    """returns a dict of inputs partitioned by the attribute each input is a pair (attribute_dict, label)"""
    return group_by(inputs, lambda x: x[0][attribute]) 
#分割熵
def partition_entropy_by(inputs,attribute):
    """computes the entropy corresponding to the given partition"""        
    partitions = partition_by(inputs, attribute)
    return partition_entropy(partitions.values()) 

def partition_entropy_by(inputs,attribute):
    """computes the entropy corresponding to the given partition"""        
    partitions = partition_by(inputs, attribute)
    return partition_entropy(partitions.values()) 
#找出最小分割的熵
senior_inputs = [(input, label) for input, label in inputs if input["level"] == "Senior"]
for key in ['level','lang','tweets','phd']:
    print(key,partition_entropy_by(senior_inputs,key))

#17.5综合应用（招聘决策树）
def classify(tree,input):
    if tree in [True,False]:
        return tree
    attribute,subtree_dict=tree
    subtree_key=input.get(attribute)
    if subtree_key not in subtree_dict:
        subtree_key=None
    subtree=subtree_dict[subtree_key]
    return classify(subtree,input)
def build_tree_id3(inputs,split_candidates=None):
    if split_candidates is None:
        split_candidates=inputs[0][0].keys()
    num_inputs=len(inputs)
    num_trues=len([label for item,label in inputs if label])
    num_falses=num_inputs-num_trues
    if num_trues==0:return False
    if num_falses==0:return True
    if not split_candidates:
        return num_trues>=num_falses
    best_attribute=min(split_candidates,key=partial(partitions_entropy_by,inputs))
    partitions=partition_by(inputs,best_attributes)
    new_candidates=[a for a in split_candidates if a!=best_attribute]
    subtrees={attribute_valueLbuild_tree_id3(subset,new_candidates) for attribute_value,subset in partitions.iteritems()}
    subtrees[None]=num_trues>num_falses
    return (best_attribute,subtrees)
#随机森林
def forest_classify(trees,input):
    votes=[classify(tree,input) for tree in trees]
    vote_counts=Counter(votes)
    return vote_counts.most_common(1)[0][0]
if len(split_candidates)<=self.num_split_candidates:
    sampled_split_candidates=split_candidates
else:
    sampled_split_candidates=random.sample(split_candidates,self.num_split_candidates)
best_attribute=min(sampled_split_candidates,key=partial(partition_entropy_by,inputs))
partitions=partition_by(inputs,best_attribute)

------------------------------------------------CP18 Neural Networks
#18.1 perception 感知器
def step_function(x):
    return 1 if x >= 0 else 0

def perceptron_output(weights, bias, x):
    """returns 1 if the perceptron 'fires', 0 if not"""
    return step_function(dot(weights, x) + bias)

def sigmoid(t):
    return 1 / (1 + math.exp(-t))
    
def neuron_output(weights, inputs):
    return sigmoid(dot(weights, inputs))

#18.2 前馈神经网络
def feed_forward(neural_network, input_vector):
    """takes in a neural network (represented as a list of lists of lists of weights)
    and returns the output from forward-propagating the input"""

    outputs = []

    for layer in neural_network:

        input_with_bias = input_vector + [1]             # add a bias input
        output = [neuron_output(neuron, input_with_bias) # compute the output
                  for neuron in layer]                   # for this layer
        outputs.append(output)                           # and remember it

        # the input to the next layer is the output of this one
        input_vector = output

    return outputs
#18.3反向传播
def backpropagate(network, input_vector, target):

    hidden_outputs, outputs = feed_forward(network, input_vector)
    
    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target[i]) for i, output in enumerate(outputs)]
                     
    # adjust weights for output layer (network[-1])
    for i, output_neuron in enumerate(network[-1]):
        for j, hidden_output in enumerate(hidden_outputs + [1]):output_neuron[j] -= output_deltas[i] * hidden_output

    # back-propagate errors to hidden layer
    hidden_deltas = [hidden_output * (1 - hidden_output) * dot(output_deltas, [n[i] for n in network[-1]]) for i, hidden_output in enumerate(hidden_outputs)]

    # adjust weights for hidden layer (network[0])
    for i, hidden_neuron in enumerate(network[0]):
        for j, input in enumerate(input_vector + [1]):
            hidden_neuron[j] -= hidden_deltas[i] * input

def patch(x, y, hatch, color):
    """return a matplotlib 'patch' object with the specified
    location, crosshatch pattern, and color"""
    return matplotlib.patches.Rectangle((x - 0.5, y - 0.5), 1, 1,
                                        hatch=hatch, fill=False, color=color)

#
def show_weights(neuron_idx):
    weights = network[0][neuron_idx]
    abs_weights = map(abs, weights)
# turn the weights into a 5x5 grid
# [weights[0:5], ..., weights[20:25]]
    grid = [abs_weights[row:(row+5)] for row in range(0,25,5)] 
    ax = plt.gca() # to use hatching, we'll need the axis
    ax.imshow(grid, # here same as plt.imshow
              cmap=matplotlib.cm.binary, # use white-black color scale
              interpolation='none') # plot blocks as blocks

# cross-hatch the negative weights
    for i in range(5): # row
        for j in range(5): # column
            if weights[5*i + j] < 0: # row i, column j = weights[5*i + j]
                # add black and white hatches, so visible whether dark or light
                ax.add_patch(patch(j, i, '/', "white"))
                ax.add_patch(patch(j, i, '\\', "black"))
    plt.show()

def make_digit(raw_digit):
        return [1 if c == '1' else 0 for row in raw_digit.split("\n") for c in row.strip()]
              
inputs = map(make_digit, raw_digits)
targets = [[1 if i == j else 0 for i in range(10)] for j in range(10)]
random.seed(0)   # to get repeatable results
input_size = 25  # each input is a vector of length 25
num_hidden = 5   # we'll have 5 neurons in the hidden layer
output_size = 10 # we need 10 outputs for each input
# each hidden neuron has one weight per input, plus a bias weight
hidden_layer = [[random.random() for __ in range(input_size + 1)] for __ in range(num_hidden)]
# each output neuron has one weight per hidden neuron, plus a bias weight
output_layer = [[random.random() for __ in range(num_hidden + 1)] for __ in range(output_size)]
# the network starts out with random weights
network = [hidden_layer, output_layer]
# 10,000 iterations seems enough to converge
for __ in range(10000):
    for input_vector, target_vector in zip(inputs, targets):
            backpropagate(network, input_vector, target_vector)
def predict(input):
    return feed_forward(network, input)[-1]

for i, input in enumerate(inputs):
    outputs = predict(input)
    print i, [round(p,2) for p in outputs]

------------------------------------------------CP19 clustering 聚类分析
inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]]

#19.2模型
class KMeans:
    """performs k-means clustering"""
    def __init__(self, k):
        self.k = k          # number of clusters
        self.means = None   # means of clusters
    def classify(self, input):
        """return the index of the cluster closest to the input"""
        return min(range(self.k),key=lambda i: squared_distance(input, self.means[i]))
    def train(self, inputs):
        self.means = random.sample(inputs, self.k)
        assignments = None
        while True:
            # Find new assignments
            new_assignments = map(self.classify, inputs)
            # If no assignments have changed, we're done.
            if assignments == new_assignments:                
                return
            # Otherwise keep the new assignments,
            assignments = new_assignments    
            for i in range(self.k):
                i_points = [p for p, a in zip(inputs, assignments) if a == i]
                # avoid divide-by-zero if i_points is empty
                if i_points:
                    self.means[i] = vector_mean(i_points)
random.seed(0)
cluster=KMeans(3)
KMeans.train(inputs)
print(clusterer.means)

#19.4选择聚类数目K
def squared_clustering_errors(inputs, k):
    """finds the total squared error from k-means clustering the inputs"""
    clusterer = KMeans(k)
    clusterer.train(inputs)
    means = clusterer.means
    assignments = map(clusterer.classify, inputs)
    return sum(squared_distance(input,means[cluster]) for input, cluster in zip(inputs, assignments))

def plot_squared_clustering_errors(plt):
    ks = range(1, len(inputs) + 1)
    errors = [squared_clustering_errors(inputs, k) for k in ks]
    plt.plot(ks, errors)
    plt.xticks(ks)
    plt.xlabel("k")
    plt.ylabel("total squared error")
    plt.show()

import matplotlib.image as mpimg
path_to_png_file="/home/lufax/桌面/py3-into/img.jpg"
img=mpimg.imread(path_to_png_file)
top_row=img[0]
top_left_pixel=top_row[0]
red,green,blue=top_left_pixel
pixels=[pixel for row in img for pixel in row]
clusterer=KMeans(5)

#19.5 示例：对色彩聚类
def recolor_image(input_file, k=5):
    img = mpimg.imread(path_to_png_file)
    pixels = [pixel for row in img for pixel in row]
    clusterer = KMeans(k)
    clusterer.train(pixels) # this might take a while    
    def recolor(pixel):
        cluster = clusterer.classify(pixel) # index of the closest cluster
        return clusterer.means[cluster]     # mean of the closest cluster
    new_img = [[recolor(pixel) for pixel in row] for row in img]
    plt.imshow(new_img)
    plt.axis('off')
    plt.show()

#19.6自上而下的分层聚类
def is_leaf(cluster):
    """a cluster is a leaf if it has length 1"""
    return len(cluster) == 1
def get_children(cluster):
    """returns the two children of this cluster if it's a merged cluster;raises an exception if this is a leaf cluster"""
    if is_leaf(cluster):
        raise TypeError("a leaf cluster has no children")
    else:
        return cluster[1]
def get_values(cluster):
    """returns the value in this cluster (if it's a leaf cluster)
    or all the values in the leaf clusters below it (if it's not)"""
    if is_leaf(cluster):
        return cluster # is already a 1-tuple containing value
    else:
        return [value
                for child in get_children(cluster)
                for value in get_values(child)]
def cluster_distance(cluster1, cluster2, distance_agg=min):
    """finds the aggregate distance between elements of cluster1
    and elements of cluster2"""
    return distance_agg([distance(input1, input2)
                        for input1 in get_values(cluster1) for input2 in get_values(cluster2)])
def get_merge_order(cluster):
    if is_leaf(cluster):
        return float('inf')
    else:
        return cluster[0] # merge_order is first element of 2-tuple
#创建聚类算法
def bottom_up_cluster(inputs, distance_agg=min):
    # start with every input a leaf cluster / 1-tuple
    clusters = [(input,) for input in inputs]
    
    # as long as we have more than one cluster left...
    while len(clusters) > 1:
        # find the two closest clusters
        c1, c2 = min([(cluster1, cluster2)
                     for i, cluster1 in enumerate(clusters)
                     for cluster2 in clusters[:i]],
                     key=lambda (x, y): cluster_distance(x, y, distance_agg))

        # remove them from the list of clusters
        clusters = [c for c in clusters if c != c1 and c != c2]

        # merge them, using merge_order = # of clusters left
        merged_cluster = (len(clusters), [c1, c2])

        # and add their merge
        clusters.append(merged_cluster)

    # when there's only one cluster left, return it
    return clusters[0]

def generate_clusters(base_cluster, num_clusters):
    # start with a list with just the base cluster
    clusters = [base_cluster]
    
    # as long as we don't have enough clusters yet...
    while len(clusters) < num_clusters:
        # choose the last-merged of our clusters
        next_cluster = min(clusters, key=get_merge_order)
        # remove it from the list
        clusters = [c for c in clusters if c != next_cluster]
        # and add its children to the list (i.e., unmerge it)
        clusters.extend(get_children(next_cluster))

    # once we have enough clusters...
    return clusters

------------------------------------------------CP20 Natural Language Processing 自然语言
(word,job_popularity,resume_popularity)
data = [ ("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50),("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60),("data science", 60, 70), ("analytics", 90, 3),("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0),("actionable insights", 40, 30), ("think out of the box", 45, 10),("self-starter", 30, 50), ("customer focus", 65, 15),
("thought leadership", 35, 35)]
#20.1词云
"""equals 8 if total is 0, 28 if total is 200"""
def text_size(total):
   return 8 + (total/200*20)
#生成data 词云图
for word, job_popularity, resume_popularity in data:
        plt.text(job_popularity, resume_popularity, word,ha='center',va='center',size=text_size(job_popularity + resume_popularity))
plt.xlabel("Popularity on Job Postings")
plt.ylabel("Popularity on Resumes")
plt.axis([0, 100, 0, 100])
plt.show()

#20.2 n-grams 模型
def fix_unicode(text):
    return text.replace(u"\u2019", "'")

def get_document():
    url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html5lib')
    content = soup.find("div", "article-body")        # find article-body div
    regex = r"[\w']+|[\.]"                            # matches a word or a period
    document = []

for paragraph in content("p"):
        words = re.findall(regex, fix_unicode(paragraph.text))
        document.extend(words)

bigrams=zip(document,document[1:])
transitions=defaultdict(list)
for prev,current in bigrams:
    transitions[prev].append(current)
#生成句子
def generate_using_bigrams(transitions):
    current = "."   # this means the next word will start a sentence
    result = []
    while True:
        next_word_candidates = transitions[current]    # bigrams (current, _)
        current = random.choice(next_word_candidates)  # choose one at random
        result.append(current)                         # append it to results
    if current == ".": return " ".join(result) # if "." we're done

#3元 n-gram
trigrams=zip(document,document[1:],document[2:])
trigram_transitions=defaultdict(list)
starts=[]
for prev,current,next in trigrams:
    if prev=='.':
        starts.append(current)
    trigram_transitions[(prev,current)].append(next)

def generate_using_trigrams(starts, trigram_transitions):
    current = random.choice(starts)   # choose a random starting word
    prev = "."                        # and precede it with a '.'
    result = [current]
    while True:
        next_word_candidates = trigram_transitions[(prev, current)]
        next = random.choice(next_word_candidates)
        prev, current = current, next
        result.append(current)
        if current == ".":
            return " ".join(result)

#20.3 语法
grammar = {
        "_S"  : ["_NP _VP"],
        "_NP" : ["_N",
                 "_A _NP _P _A _N"],
        "_VP" : ["_V",
                 "_V _NP"],
        "_N"  : ["data science", "Python", "regression"],
        "_A"  : ["big", "linear", "logistic"],
        "_P"  : ["about", "near"],
        "_V"  : ["learns", "trains", "tests", "is"]
}
def is_terminal(token):
    return token[0] != "_"

def expand(grammar, tokens):
    for i, token in enumerate(tokens):
        # ignore terminals
        if is_terminal(token): continue
        # choose a replacement at random
        replacement = random.choice(grammar[token])
        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
        return expand(grammar, tokens)
    return tokens
def generate_sentence(grammar):
    return expand(grammar, ["_S"])

#20.4 Gibbs Sampling（吉布斯采样）
def roll_a_die():
    return random.choice([1,2,3,4,5,6])

def direct_sample():
    d1 = roll_a_die()
    d2 = roll_a_die()
    return d1, d1 + d2

def random_y_given_x(x):
    """equally likely to be x + 1, x + 2, ... , x + 6"""
    return x + roll_a_die()

def random_x_given_y(y):
    if y <= 7:
        # if the total is 7 or less, the first die is equally likely to be
        # 1, 2, ..., (total - 1)
        return random.randrange(1, y)
    else:
        # if the total is 7 or more, the first die is equally likely to be
        # (total - 6), (total - 5), ..., 6
        return random.randrange(y - 6, 7)

def gibbs_sample(num_iters=100):
    x, y = 1, 2 # doesn't really matter
    for _ in range(num_iters):
        x = random_x_given_y(y)
        y = random_y_given_x(x)
    return x, y

def compare_distributions(num_samples=1000):
    counts = defaultdict(lambda: [0, 0])
    for _ in range(num_samples):
        counts[gibbs_sample()][0] += 1
        counts[direct_sample()][1] += 1
    return counts

#20.5  TOPIC MODELING 主题建模
def sample_from(weights):
    total = sum(weights)
    rnd = total * random.random()       # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                        # return the smallest i such that
        if rnd <= 0: return i           # sum(weights[:(i+1)]) >= rnd

documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

K = 4

document_topic_counts = [Counter()
                         for _ in documents]

topic_word_counts = [Counter() for _ in range(K)]

topic_counts = [0 for _ in range(K)]

document_lengths = map(len, documents)

distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)

D = len(documents)

def p_topic_given_document(topic, d, alpha=0.1):
    """the fraction of words in document _d_that are assigned to _topic_ (plus some smoothing)"""

    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):
    """the fraction of words assigned to _topic_that equal _word_ (plus some smoothing)"""

    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + W * beta))

def topic_weight(d, word, k):
    """given a document and a word in that document,return the weight for the k-th topic"""

    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k)
                        for k in range(K)])

random.seed(0)
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]

for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

for iter in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1
 topic_names = ["Big Data and programming languages",
                   "Python and statistics",
                   "databases",
                   "machine learning"]
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
         if count > 0: print k, word, count

for document, topic_counts in zip(documents, document_topic_counts):
        print document
        for topic, count in topic_counts.most_common():
            if count > 0:
                print topic_names[topic], count,
        print

------------------------------------------------CP21 Network Analysis(网络分析)
#21.1 Betweenness Centrality (中介中心度)
users = [
    { "id": 0, "name": "Hero" },
    { "id": 1, "name": "Dunn" },
    { "id": 2, "name": "Sue" },
    { "id": 3, "name": "Chi" },
    { "id": 4, "name": "Thor" },
    { "id": 5, "name": "Clive" },
    { "id": 6, "name": "Hicks" },
    { "id": 7, "name": "Devin" },
    { "id": 8, "name": "Kate" },
    { "id": 9, "name": "Klein" }]
friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),(4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]
# give each user a friends list
for user in users:
    user["friends"] = []   
# and populate it
for i, j in friendships:
    # this works because users[i] is the user whose id is i
    users[i]["friends"].append(users[j]) # add i as a friend of j
    users[j]["friends"].append(users[i]) # add j as a friend of i   

# 广度优先搜索算法实现 P239
def shortest_paths_from(from_user):
    # a dictionary from "user_id" to *all* shortest paths to that user
    shortest_paths_to = { from_user["id"] : [[]] }
    # a queue of (previous user, next user) that we need to check.
    # starts out with all pairs (from_user, friend_of_from_user)
    frontier = deque((from_user, friend) for friend in from_user["friends"])
    # keep going until we empty the queue
    while frontier: 
        prev_user, user = frontier.popleft() # take from the beginning
        user_id = user["id"]
        # the fact that we're pulling from our queue means that
        # necessarily we already know a shortest path to prev_user
        paths_to_prev = shortest_paths_to[prev_user["id"]]
        paths_via_prev = [path + [user_id] for path in paths_to_prev]
        # it's possible we already know a shortest path to here as well
        old_paths_to_here = shortest_paths_to.get(user_id, [])
        # what's the shortest path to here that we've seen so far?
        if old_paths_to_here:
            min_path_length = len(old_paths_to_here[0])
        else:
            min_path_length = float('inf')       
        # any new paths to here that aren't too long
        new_paths_to_here = [path_via_prev
                             for path_via_prev in paths_via_prev
                             if len(path_via_prev) <= min_path_length
                             and path_via_prev not in old_paths_to_here]
        shortest_paths_to[user_id] = old_paths_to_here + new_paths_to_here
        # add new neighbors to the frontier
        frontier.extend((user, friend) for friend in user["friends"] if friend["id"] not in shortest_paths_to)
    return shortest_paths_to
#载入路径数值(one-all路径——以{list}形式)
for user in users:
    user['shortest_paths']=shortest_paths_from(user)

#now we calculate Betweenness Centrality (计算中介中心度BC)
#set ori_value of B_C 
for user in users:
    user['betweenness_centrality']=0
#then we get B_C 至此我们得到了中介中心度
for source in users:
    source_id = source["id"]
    for target_id, paths in source["shortest_paths"].items():
        if source_id < target_id:   # don't double count
            num_paths = len(paths)  # how many shortest paths?
            contrib = 1 / num_paths # contribution to centrality
            for path in paths:
                for id in path:
                    if id not in [source_id, target_id]:
                        users[id]["betweenness_centrality"] += contrib
#print the B_C
print([str(u['id'])+' :'+str(u['betweenness_centrality']) for u in users])
##(查看各相对中心度值相对大小)

#计算接近中心度(closeness centrality= 1/疏远度)
#计算疏远度(farness)
def farness(user):
    """the sum of the lengths of the shortest paths to each other user"""
    return sum(len(paths[0]) 
               for paths in user["shortest_paths"].values())
for user in users:
    user["closeness_centrality"] = 1 / farness(user)

#21.2 特征向量中心度
#21.2.1 矩阵乘法(matrix multiplication)
#函数：矩阵A的i行乘以矩阵B的j列
def matrix_product_entry(A, B, i, j):
    return np.dot(get_row(A, i), get_column(B, j))
#函数：矩阵乘法
def matrix_multiply(A, B):
    n1, k1 = np.shape(A)
    n2, k2 = np.shape(B)
    if k1 != n2:
        raise ArithmeticError("incompatible shapes!")             
    return make_matrix(n1, k2, partial(matrix_product_entry, A, B))
#将1维向量转换为矩阵
def vector_as_matrix(v):
    """returns the vector v (represented as a list) as a n x 1 matrix"""
    return [[v_i] for v_i in v]
#将矩阵转为n*1向量
def vector_from_matrix(v_as_matrix):
    """returns the n x 1 matrix as a list of values"""
    return [row[0] for row in v_as_matrix]
#对向量做矩阵乘法（返回值为向量）
def matrix_operate(A, v):
    v_as_matrix = vector_as_matrix(v)
    product = matrix_multiply(A, v_as_matrix)
    return vector_from_matrix(product)

#计算向量特征值
def find_eigenvector(A, tolerance=0.00001):
    guess = [1 for __ in A]
    while True:
        result = matrix_operate(A, guess)
        length = magnitude(result)
        next_guess = scalar_multiply(1/length, result)

        if distance(guess, next_guess) < tolerance:
            return next_guess, length # eigenvector, eigenvalue
        guess = next_guess

#21.2.2 eigenvector centrality(中心度)
#相当于返回一个i行j列的矩阵，再计算eigenvector_centralities就是中心度
def entry_fn(i, j):
    return 1 if (i, j) in friendships or (j, i) in friendships else 0
n = len(users)
adjacency_matrix = make_matrix(n, n, entry_fn)
eigenvector_centralities, _ = find_eigenvector(adjacency_matrix)

#21.3 directed graphs & pagerank（有向图和PageRank） P246
endorsements = [(0, 1), (1, 0), (0, 2), (2, 0), (1, 2), (2, 1), (1, 3),
                (2, 3), (3, 4), (5, 4), (5, 6), (7, 5), (6, 8), (8, 7), (8, 9)]

for user in users:
    user["endorses"] = []       # add one list to track outgoing endorsements
    user["endorsed_by"] = []    # and another to track endorsements
    
for source_id, target_id in endorsements:
    users[source_id]["endorses"].append(users[target_id])
    users[target_id]["endorsed_by"].append(users[source_id])
endorsements_by_id = [(user["id"], len(user["endorsed_by"]))for user in users]

sorted(endorsements_by_id,key=lambda (user_id, num_endorsements): num_endorsements,reverse=True)

def page_rank(users, damping = 0.85, num_iters = 100):
    # initially distribute PageRank evenly
    num_users = len(users)
    pr = { user["id"] : 1 / num_users for user in users }
    # this is the small fraction of PageRank
    # that each node gets each iteration
    base_pr = (1 - damping) / num_users
    for __ in range(num_iters):
        next_pr = { user["id"] : base_pr for user in users }
        for user in users:
            # distribute PageRank to outgoing links
            links_pr = pr[user["id"]] * damping
            for endorsee in user["endorses"]:
                next_pr[endorsee["id"]] += links_pr / len(user["endorses"])
        pr = next_pr     
    return pr


------------------------------------------------CP22 Recommender Systems(推荐系统)
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

#22.2推荐流行
popular_interests = Counter(interest
                            for user_interests in users_interests
                            for interest in user_interests).most_common()
#创建最受欢迎top函数  (推荐前5个流行的)
def most_popular_new_interests(user_interests, max_results=5):
    suggestions = [(interest, frequency) 
                   for interest, frequency in popular_interests
                   if interest not in user_interests]
    return suggestions[:max_results]

#22.3 user-based filtering (基于用户的协同过滤方法)
#创建一个指标衡量2用户之间的相似度（余弦相似度）
def cosine_similarity(v, w):
    return np.dot(v, w) / math.sqrt(np.dot(v, v) * np.dot(w, w))
#独一数组
unique_interests = sorted(list({ interest 
                                 for user_interests in users_interests
                                 for interest in user_interests }))

#创建user-interest向量
def make_user_interest_vector(user_interests):
    """given a list of interests, produce a vector whose i-th element is 1
    if unique_interests[i] is in the list, 0 otherwise"""
    return [1 if interest in user_interests else 0 for interest in unique_interests]
#创建兴趣矩阵（对应axis=1 为unique_interest，axis=0为users_interests）
user_interest_matrix = list(map(make_user_interest_vector, users_interests))
#计算用户兴趣相似度
user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j) 
for interest_vector_j in user_interest_matrix]
for interest_vector_i in user_interest_matrix]

#创建用户相似度函数（输入id返回 id和相似度）
def most_similar_users_to(user_id):
    pairs = [(other_user_id, similarity)                      # find other
             for other_user_id, similarity in                 # users with
                enumerate(user_similarities[user_id])         # nonzero 
             if user_id != other_user_id and similarity > 0]  # similarity
    return sorted(pairs,                                      # sort them
                  key=lambda similarity: similarity,          # most similar
                  reverse=True)                               # first
#创建推荐函数
def user_based_suggestions(user_id, include_current_interests=False):
    # sum up the similarities 
    suggestions = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            suggestions[interest] += similarity
    # convert them to a sorted list
    suggestions = sorted(suggestions.items(),key=lambda weight: weight[1],reverse=True)
    # and (maybe) exclude already-interests
    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight) 
                for suggestion, weight in suggestions
                if suggestion not in users_interests[user_id]]

#22.4 Item-Based Collaborative Filtering (基于物品的协同过滤)
#计算用户兴趣矩阵
interest_user_matrix =[[user_interest_vector[j] for user_interest_vector in user_interest_matrix] for j, _ in enumerate(unique_interests)]
#计算基于物品的兴趣相似度矩阵
interest_similarities =[[cosine_similarity(user_vector_i, user_vector_j) for user_vector_j in interest_user_matrix]for user_vector_i in interest_user_matrix]
#计算与interest_id最相似的项（并推荐兴趣）
def most_similar_interests_to(interest_id):
    similarities = interest_similarities[interest_id]
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if interest_id != other_interest_id and similarity > 0]
    return sorted(pairs,key=lambda similarity: similarity[1],reverse=True)
#基于物品的协同过滤推荐函数
def item_based_suggestions(user_id, include_current_interests=False):
    suggestions = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, is_interested in enumerate(user_interest_vector):
        if is_interested == 1:
            similar_interests = most_similar_interests_to(interest_id)
            for interest, similarity in similar_interests:
                suggestions[interest] += similarity
    suggestions = sorted(suggestions.items(),key=lambda similarity: similarity[1],reverse=True)
    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight) for suggestion, weight in suggestions if suggestion not in users_interests[user_id]]


------------------------------------------------CP23 Databases and SQL
#23.1 create table &insert
users =[[0, "Hero", 0],[1, "Dunn", 2],[2, "Sue", 3],[3, "Chi", 3]]
#sql
create table users(
 user_id INT NOT NULL,
 name VARCHAR(200),
 num_friends INT)
insert into users(user_id,name,num_friends) values(1,'zieox','8')

#py
class Table:
    def __init__(self, columns):
        self.columns = columns
        self.rows = []

    def __repr__(self):
        """pretty representation of the table: columns then rows"""
        return str(self.columns) + "\n" + "\n".join(map(str, self.rows))

    def insert(self, row_values):
        if len(row_values) != len(self.columns):
            raise TypeError("wrong number of elements")
        row_dict = dict(zip(self.columns, row_values))
        self.rows.append(row_dict)


#23.2 update

def update(self, updates, predicate):
        for row in self.rows:
            if predicate(row):
                for column, new_value in updates.iteritems():row[column] = new_value
#sql
update users set num_friends=3 where user_id =1;
#py
users.update({'num_friends':3},lambda row:row['user_id']==1)


#23.3delete 
def delete(self, predicate=lambda row: True):
        """delete all rows matching predicate or all rows if no predicate supplied"""
        self.rows = [row for row in self.rows if not(predicate(row))]

#sql 
delete from users
delete from users where user_id=1
#py
users.delete(lambda row:row['user_id']==1)
users.delete()

#23.4 select 
def select(self, keep_columns=None, additional_columns=None):
        if keep_columns is None:         # if no columns specified,
            keep_columns = self.columns  # return all columns
        if additional_columns is None:
            additional_columns = {}
        # new table for results
        result_table = Table(keep_columns + additional_columns.keys())

        for row in self.rows:
            new_row = [row[column] for column in keep_columns]
            for column_name, calculation in additional_columns.iteritems():
                new_row.append(calculation(row))
            result_table.insert(new_row)
        return result_table

def where(self, predicate=lambda row: True):
        """return only the rows that satisfy the supplied predicate"""
        where_table = Table(self.columns)
        where_table.rows = filter(predicate, self.rows)
        return where_table

def limit(self, num_rows=None):
        """return only the first num_rows rows"""
        limit_table = Table(self.columns)
        limit_table.rows = (self.rows[:num_rows] if num_rows is not None else self.rows)
        return limit_table
#sql-py
select * from users
users.select()

select * from users limit 2
users.limit(2)

select user_id from users
users.select(keep_columns=['user_id'])

select user_id from users where name='Dunn'
users.where(lambda row:row['name']=='Dunn')

#23.5 group by
def group_by(self, group_by_columns, aggregates, having=None):
        grouped_rows = defaultdict(list)
        # populate groups
        for row in self.rows:
            key = tuple(row[column] for column in group_by_columns)
            grouped_rows[key].append(row)
        result_table = Table(group_by_columns + aggregates.keys())
        for key, rows in grouped_rows.iteritems():
            if having is None or having(rows):
                new_row = list(key)
                for aggregate_name, aggregate_fn in aggregates.iteritems():
                    new_row.append(aggregate_fn(rows))
                result_table.insert(new_row)
        return result_table

 def order_by(self, order):
        new_table = self.select()       # make a copy
        new_table.rows.sort(key=order)
        return new_table

def join(self, other_table, left_join=False):
        join_on_columns = [c for c in self.columns if c in other_table.columns]      # columns in both tables
        additional_columns = [c for c in other_table.columns if c not in join_on_columns]   # columns only in right table
        # all columns from left table + additional_columns from right table
        join_table = Table(self.columns + additional_columns)
        for row in self.rows:
            def is_join(other_row):
                return all(other_row[c] == row[c] for c in join_on_columns)
            other_rows = other_table.where(is_join).rows
            # each other row that matches this one produces a result row
            for other_row in other_rows:
                join_table.insert([row[c] for c in self.columns]+[other_row[c] for c in additional_columns])
            # if no rows match and it's a left join, output with Nones
            if left_join and not other_rows:
                join_table.insert([row[c] for c in self.columns] + [None for c in additional_columns])
        return join_table

def min_user_id(rows):return min(row['user_id'] for row in rows)

stats_by_length=users.select(additional_columns={'name_length':name_length}).group_by(group_by_columns=['name_length'],aggregates={'min_user_id':min_user_id,'num_users':len})

def first_letter_of_name(row):return row["name"][0] if row["name"] else ""
def average_num_friends(rows):return sum(row["num_friends"] for row in rows) / len(rows)
def enough_friends(rows):return average_num_friends(rows) > 1
avg_friends_by_letter = users.select(additional_columns={'first_letter' : first_letter_of_name}).group_by(group_by_columns=['first_letter'],aggregates={ "avg_num_friends" : average_num_friends },having=enough_friends)

def sum_user_ids(rows): return sum(row["user_id"] for row in rows)
user_id_sum = users.where(lambda row: row["user_id"] > 1).group_by(group_by_columns=[],aggregates={ "user_id_sum" : sum_user_ids })

friendliest_letters = avg_friends_by_letter.order_by(lambda row: -row["avg_num_friends"]).limit(4)

# 23.7 JOINs
user_interests = Table(["user_id", "interest"])
user_interests.insert([0, "SQL"])
user_interests.insert([0, "NoSQL"])
user_interests.insert([2, "SQL"])
user_interests.insert([2, "MySQL"])
#sql
select users.name from users join user_interests on users.user_id=user_interests.user_id where user_interests.interest='SQL'
#py
sql_users = users.join(user_interests).where(lambda row: row["interest"] == "SQL").select(keep_columns=["name"])

def count_interests(rows):
        """counts how many rows have non-None interests"""
        return len([row for row in rows if row["interest"] is not None])

user_interest_counts = users.join(user_interests, left_join=True).group_by(group_by_columns=["user_id"],aggregates={"num_interests" : count_interests })

#23.8 SUBQUERIES
#sql
select min(user_id) as min_user_id from (select user_id from user_interests where interest='sql') sql_interest
#py
likes_sql_user_ids = user_interests.where(lambda row: row["interest"] == "SQL").select(keep_columns=['user_id'])
likes_sql_user_ids.group_by(group_by_columns=[],aggregates={ "min_user_id" : min_user_id })


------------------------------------------------CP24 mapreduce
data=pd.read_csv('/home/lufax/桌面/py3-into/test.txt')
#24.1 example:words_count
def word_count_old(documents):
    """word count not using MapReduce"""
    return Counter(word for document in documents for word in tokenize(document))

def tokenize(message):
    message = message.lower()                      # convert to lowercase
    all_words = re.findall("[a-z0-9']+", message)  # extract the words
    return set(all_words)                          # remove duplicates
#键值对转换
def wc_mapper(document):
    """for each word in the document, emit (word,1)"""        
    for word in tokenize(document):
        yield (word, 1)
def wc_reducer(word, counts):
    """sum up the counts for a word"""
    yield (word, sum(counts))

def word_count(documents):
    """count the words in the input documents using MapReduce"""
    # place to store grouped values
    collector = defaultdict(list) 
    for document in documents:
        for word, count in wc_mapper(document):
            collector[word].append(count)
    return [output for word, counts in collector.iteritems() for output in wc_reducer(word, counts)]

#
def map_reduce(inputs, mapper, reducer):
    """runs MapReduce on the inputs using mapper and reducer"""
    collector = defaultdict(list)

    for input in inputs:
        for key, value in mapper(input):
            collector[key].append(value)

    return [output for key, values in collector.iteritems() for output in reducer(key,values)]

def reduce_with(aggregation_fn, key, values):
    """reduces a key-values pair by applying aggregation_fn to the values"""
    yield (key, aggregation_fn(values))

def values_reducer(aggregation_fn):
    """turns a function (values -> output) into a reducer"""
    return partial(reduce_with, aggregation_fn)

sum_reducer = values_reducer(sum)
max_reducer = values_reducer(max)
min_reducer = values_reducer(min)
count_distinct_reducer = values_reducer(lambda values: len(set(values)))

# Analyzing Status Updates
def data_science_day_mapper(status_update):
    """yields (day_of_week, 1) if status_update contains "data science" """
    if "data science" in status_update["text"].lower():
        day_of_week = status_update["created_at"].weekday()
    yield (day_of_week, 1)
data_science_days = map_reduce(status_updates,data_science_day_mapper,sum_reducer)

def words_per_user_mapper(status_update):
    user = status_update["username"]
    for word in tokenize(status_update["text"]):
        yield (user, (word, 1))
            
def most_popular_word_reducer(user, words_and_counts):
    """given a sequence of (word, count) pairs, 
    return the word with the highest total count"""
    word_counts = Counter()
    for word, count in words_and_counts:
        word_counts[word] += count
    word, count = word_counts.most_common(1)[0]
    yield (user, (word, count))
user_words = map_reduce(status_updates,words_per_user_mapper,most_popular_word_reducer)

def liker_mapper(status_update):
    user = status_update["username"]
    for liker in status_update["liked_by"]:
        yield (user, liker)
                
distinct_likers_per_user = map_reduce(status_updates, liker_mapper, count_distinct_reducer)

#24.5 matrix multiplication

def matrix_multiply_mapper(m, element):
    """m is the common dimension (columns of A, rows of B)
    element is a tuple (matrix_name, i, j, value)"""
    matrix, i, j, value = element

    if matrix == "A":
        for column in range(m):
            # A_ij is the jth entry in the sum for each C_i_column
            yield((i, column), (j, value))
    else:
        for row in range(m):
            # B_ij is the ith entry in the sum for each C_row_j
    yield((row, j), (i, value))

def matrix_multiply_reducer(m, key, indexed_values):
    results_by_index = defaultdict(list)
    for index, value in indexed_values:
        results_by_index[index].append(value)

    # sum up all the products of the positions with two results
sum_product = sum(results[0] * results[1] for results in results_by_index.values() if len(results) == 2)

if sum_product != 0.0:
    yield (key, sum_product)


entries = [("A", 0, 0, 3), ("A", 0, 1,  2),("B", 0, 0, 4), ("B", 0, 1, -1), ("B", 1, 0, 10)]
mapper = partial(matrix_multiply_mapper, 3)
reducer = partial(matrix_multiply_reducer, 3)




#exp.1
import time
from pandas import Series,DataFrame
import numpy as np
from pandas.tseries.offsets import Hour,Minute,Day,MonthEnd

data={'re':[272, 277, 173, 262, 136, 103, 291, 44, 196, 247],
'wi':[118, 141, 162, 136, 147, 167, 177, 111, 114, 142]}
indx=pd.date_range('4/2/2018',periods = 10)

df2 = DataFrame(data,columns=['re', 'wi'],index=indx)

df2['answer']=df2.cumsum()['re']-df2.cumsum()['wi'].shift(1).fillna(0)
#移动增量
df2['improv']=df2.answer-df2.answer.shift(1).fillna(0)

#exp.2
配置P3
https://blog.csdn.net/yin__ren/article/details/78848171
#计算区域花费占比图
grouped=df['cost'].groupby(df['region'])
grouped.sum().plot(kind='pie',autopct = '%3.1f%%',)
--->
df['cost'].groupby(df['region']).sum().plot(kind='pie',autopct = '%3.1f%%',title='cost% by regions')
#计算BJ的年龄分布图
bins=[i*10 for i in range(1,11)]
pd.cut(df[df.region=='BJ'].age,bins).value_counts().plot(kind='pie',autopct = '%3.1f%%',title='age-T-OF-BJ')

for i in ['ee', 'num', 'co st']:
    for j  in ['ee', 'num', 'cost']:
        print(i+' - '+j+': '+str(df[i].corr(df[j])))
#计算相关系数(并绘图)
a=['vage', 'age', 'ageg', 'ee','num', 'cost']
#这种写法出来的数据有类似AB，BA的重复项
corr=pd.Series({i+' - '+j:df[i].corr(df[j]) for i in a for j in a if i!=j})
#去除重复 写法
b={}
for i in a:
    for j in a:
        if i==j:
            break
        else:
             x=i+' - '+j
             y=df[i].corr(df[j])
             b[x]=y
corr=pd.Series(b)
corr.sort_values(ascending=True).plot(kind='barh',figsize=(10,10),title='corr_value')
corr.plot(kind='barh',figsize=(10,10),title='corr_value')

你可能感兴趣的:(学习笔记,学习,python)

python将网银web工程转换成客户端electron工程案例银行金融科技人工智能机器学习 DeepSeek electron
以下是一个将网银Web工程转换为Electron客户端的技术方案，结合Python和Electron实现桌面端增强功能：bash#项目结构webank-electron/├──main/#Electron主进程代码│├──main.js│└──python_server.py├──renderer/#网页渲染进程│└──webank-web/#原始网银Web工程├──package.json└──
基于ChatGPT、GIS与Python机器学习的地质灾害风险评估、易发性分析、信息化建库及灾后重建高级实践 weixin_贾防洪评价风险评估滑坡泥石流地质灾害
第一章、ChatGPT、DeepSeek大语言模型提示词与地质灾害基础及平台介绍【基础实践篇】1、什么是大模型？大模型（LargeLanguageModel,LLM）是一种基于深度学习技术的大规模自然语言处理模型。代表性大模型：GPT-4、BERT、T5、ChatGPT等。特点：多任务能力：可以完成文本生成、分类、翻译、问答等任务。上下文理解：能理解复杂的上下文信息。广泛适配性：适合科研、教育、行
通过 Kibana 操作 Elasticsearch：从入门到实践格子先生Lab elasticsearch 大数据搜索引擎
引言Kibana是Elasticsearch的可视化工具，提供了一个用户友好的界面来管理和操作Elasticsearch中的数据。通过Kibana，你可以轻松地执行数据搜索、创建可视化图表、构建仪表盘等操作。本文将带你从零开始学习如何通过Kibana操作Elasticsearch，掌握其基本功能和进阶操作。1.Kibana简介1.1什么是Kibana？Kibana是一个开源的数据可视化工具，专为E
anythingLLM 使用教程惟贤箬溪穷玩Ai AIGC 人工智能
一、anythingLLM简介anythingLLM是一款灵活且功能强大的语言模型，它基于先进的深度学习架构构建，旨在为用户提供多样化的自然语言处理服务。其设计理念注重通用性和可扩展性，能够适应多种领域和任务，无论是文本生成、智能问答，还是翻译、摘要提取等，都能展现出出色的性能。与同类模型相比，anythingLLM具有训练数据丰富、模型优化程度高的优势，能够生成更符合逻辑、更具实用性的文本内容。
深度解析大模型推理框架：原理、应用与实践百度_开发者中心人工智能大模型自然语言处理
在当今数据驱动的时代，大模型推理框架已经成为人工智能领域的重要支柱。本文将通过简明扼要、清晰易懂的方式，带领读者深入了解大模型推理框架的原理、应用领域和实践经验，帮助读者更好地掌握这一技术，并在实际工作中发挥其价值。一、大模型推理框架简介大模型推理框架是指一种基于深度学习技术的推理框架，主要用于解决大规模数据集下的复杂问题。该框架通过对海量数据进行高效的训练和推理，能够快速地对各种复杂场景进行分析
大模型推理框架：从理论到实践的全面解析百度_开发者中心人工智能大模型自然语言处理
在数据驱动的时代，深度学习技术已经渗透到各个行业，从图像识别到自然语言处理，从推荐系统到智能客服，其应用无处不在。然而，深度学习模型的训练和推理过程往往涉及大量数据和复杂计算，传统的计算框架难以满足需求。因此，大模型推理框架应运而生，成为解决这一问题的关键。一、大模型推理框架基本概念大模型推理框架是一种基于深度学习技术的推理框架，它通过对海量数据进行高效的训练和推理，能够快速地对各种复杂场景进行分
PHP与Java的区别分析 Monika Zhang java 架构设计 php java 开发语言
一、语言特点php：一种的像Python的动态弱语言类型的服务器脚本语言，不需要编译代码；它是专为Web开发目的而开发和设计的，而且简单容易上手。Java：是一种通用的面向对象编程语言，属于强势优选语言类型，在执行前必须先正确编译。是面向对象的和人类可读的；支持服务器端和客户端；可用于开发独立应用程序或基于Web的应用程序，上手比PHP难。二、语法1.PHP是一种脚本语言，代码在服务器上执行，而结
闭包的概念总结与分析 Monika Zhang java java
1定义闭包又称词法闭包闭包最早定义为一种包含和的实体.在计算机科学中，闭包（英语：Closure），又称词法闭包（LexicalClosure）或函数闭包（functionclosures），是引用了自由变量的函数。解释一：闭包是引用了自由变量的函数，这个被引用的变量将和这个函数一同存在。解释二：闭包是函数和相关引用环境组成的实体。注：：除了局部变量的其他变量《Python核心编程》对闭包的解释:
毕业论文代码实验（Python\MATLAB）基于K-means聚类的EMD-BiLSTM-Attention光伏功率预测模型清风AI 毕业设计代码实现 python lstm 深度学习神经网络人工智能 matlab pytorch
一、项目背景1.1光伏功率预测意义在能源结构转型背景下（国家能源局2025规划），光伏发电渗透率已超过18%。但受天气突变、云层遮挡等因素影响，光伏出力具有显著波动性，导致：电网调度难度增加（±15%功率波动）电力市场交易风险提升光储协同控制效率降低1.2技术挑战多尺度特征耦合：分钟级辐照度变化与小时级天气模式共存非线性映射关系：气象因素与发电功率呈高阶非线性关系数据模态差异：数值天气预报(NWP
DeepSeek 部署指南 (使用 vLLM 本地部署) AGI大模型资料分享员人工智能语言模型学习 chatgpt 深度学习大模型 deepseek
DeepSeek部署指南(使用vLLM本地部署)本文档将指导您如何使用vLLM在本地部署DeepSeek语言模型。我们以deepseek-ai/DeepSeek-R1-Distill-Qwen-7B模型为例进行演示。1、安装Python环境首先，您需要安装Python环境。访问Python官网:https://www.python.org/根据您的操作系统选择安装包:Python官网提供Windo
Python基础知识点总结豆芽819 tip python 开发语言
1Python简介Python特点：解释型语言：无需编译，逐行执行。动态类型：变量类型在运行时确定。简洁易读：语法接近自然语言，代码简洁。跨平台支持：Windows/Linux/macOS均可运行。应用领域：Web开发、数据分析、人工智能、自动化脚本等。开发环境：推荐使用IDLE、PyCharm、VSCode或JupyterNotebook。2Python数值运算基本运算符：算术：+,-,*,/,
Python与区块链隐私保护技术：如何在去中心化世界中保障数据安全 Echo_Wish Python！实战！区块链 python 去中心化
Python与区块链隐私保护技术：如何在去中心化世界中保障数据安全在区块链世界里，透明性和不可篡改性是两大核心优势，但这也带来了一个悖论——如何在公开账本的同时保障用户隐私？如果你的交易记录对所有人可见，如何防止敏感信息泄露？Python作为区块链开发中最受欢迎的语言之一，提供了强大的工具和库来增强隐私保护。本文将深入探讨区块链的隐私保护技术，并结合Python代码示例，带你了解如何在Web3时代
人脸识别的一些代码饿了就干饭 CV相关人脸识别
1、cv2入门函数imread及其相关操作2、（详解）opencv里的cv2.resize改变图片大小Python3、机器学习之人脸识别face_recognition使用4、使用face_recognition进行人脸校准5、简单的人脸识别通用流程示意图（这个看着写的挺好的）6、face_recognition和图像处理中left、top、right、bottom解释7、使用pillow库对图片
Python从入门到精通的系统性学习路径 niuTaylor 编程区 python 学习开发语言
Python从入门到精通的系统性学习路径一、基础语法快速突破1.变量与基础操作#动态类型演示a=10#整型a=3.14#浮点型a="Python"#字符串a=[1,2,3]#列表#格式化输出进阶name="Alice"print(f"{name:*^20}")#居中填充输出：******Alice*******2.运算符优先级实战#常见运算符优先级练习result=5+3*2**2//(4%3)p
Spring系列学习之Spring Messaging消息支持 m0_74825488 面试学习路线阿里巴巴 spring linq java
英文原文：https://docs.spring.io/spring-boot/docs/current/reference/html/boot-features-messaging.html目录JMSActiveMQ支持Artemis支持使用JNDIConnectionFactory发送消息接收消息AMQPRabbitMQ支持发送消息接收消息ApacheKafka支持发送消息接收消息Kafka流
个人学习编程(3-21) leetcode刷题 Rsecret2 编程笔记学习 leetcode 算法
链接列表的中间值：测试用例1：创建链表[1,2,3,4,5]，调用middleNode，预期返回值是3。测试用例2：创建链表[1,2,3,4,5,6]，调用middleNode，预期返回值是3。判断长度，然后length/2structListNode*middleNode(structListNode*head){intlength=0;for(structListNode*curr=head;
Python技术全景解析：从基础到前沿的深度探索靠近彗星 python 开发语言性能优化个人开发极限编程
目录一、Python为何成为开发者首选？1.核心优势矩阵2.性能进化史二、Python核心应用领域1.数据科学黄金三角2.AI开发新范式三、现代Python进阶技巧1.类型提示革命2.异步编程实战四、Python工程化实践1.现代项目架构2.性能优化矩阵五、Python未来生态展望1.前沿技术融合2.性能革命六、学习路线图1.技能成长路径基础阶段（1-3月）专业方向（3-6月）深度进阶（6-12月
如何使用DeepSeek编写测试用例？海姐软件测试 deepseek 大数据测试工具
一、DeepSeek在测试用例设计中的定位DeepSeek作为AI工具，并非直接替代测试设计，而是通过以下方式提升效率：快速生成基础用例框架（等价类、边界值等）智能补充易遗漏场景（如特殊字符、异常流）自动化脚本片段生成（Python/pytest/JUnit等）测试数据构造建议（符合业务规则的Mock数据）二、四步法实战：AI协作编写测试用例Step1：明确需求输入输入质量决定输出质量，需向Dee
LLM-Agent方法评估与效果分析 agent人工智能ai开发
1.引言近年来，随着大型语言模型（LLM）的快速发展，基于强化学习（RL）对LLM进行微调以使其具备代理（Agent）能力成为研究热点。从基础的单智能体强化学习算法（如PPO）到多智能体协作、语料重组以及在线自学习等新技术不断涌现，研究人员致力于探索如何提高LLM在实际应用中的决策能力、推理能力和任务执行效率。本文主要聚焦于当前LLM-Agent方法的检索与评估，旨在全面探讨各类方法的技术实现、实
Explore Model-Based Feature Importance 后端
Question1.ExploreModel-BasedFeatureImportanceThroughoutthisquestion,youmayonlyusePython.Foreachsub-question,providecommentary(ifneeded)alongwithscreenshotsofthecodeused.Pleasealsoprovideacopyofthecode
Java学习------常用类String 日暮南城故里 Java学习记录 java 学习开发语言 String类
1.介绍Java中的String属于引用数据类型，Java专门在堆中准备了一个字符串常量池。我们在开发时，字符串使用的频率是很高的，因此将这些字符串放在常量池中可以省去对象的创建过程，提高效率。常量池属于一种缓存技术，缓存技术是一种可以提高程序执行效率的手段。Strings1=“hello”;Strings2=“hello”;System.out.println(s1==s2);//这里输出的结果
Python 标准库之 logging 模块 36度道 python系列学习笔记 python
1.logging模块简介在软件开发过程中，了解程序的运行状态、记录重要事件以及排查错误是至关重要的。logging模块为Python提供了灵活且强大的日志记录功能。它允许开发者控制日志的输出内容、输出位置（如文件、控制台）、日志级别（用于过滤不同重要程度的日志信息）等，帮助开发者更好地监控和调试程序。2.基本使用简单配置与输出：importlogging#配置日志基本设置logging.basi
python 标准库之 functools 模块 36度道 python系列学习笔记 python
functools模块提供了一系列用于处理函数的工具。其中，像partial可以创建一个新的可调用对象，这个对象固定了原函数的部分参数，有点像给函数穿上了“参数防护服”；reduce能对一个序列进行累积计算，就好比是一个勤劳的小会计，按顺序把序列里的数加起来或者做其他运算；wraps主要用于装饰器，它能帮助装饰器函数保留被装饰函数的元信息，比如函数名、文档字符串等，让被装饰函数“表里如一”。底层原
大模型学习-让其他电脑可访问本地ollama的模型并进行流式响应 Gratitute_林腾大模型学习学习语言模型
目录让其他电脑可访问本地ollama流式响应让其他电脑可访问本地ollama默认情况下，其他电脑不能直接访问本地Ollama服务。解决方法：让Ollama监听局域网地址，而不是localhost我们可以让Ollama监听局域网IP，在Ollama服务器上运行：setOLLAMA_HOST=0.0.0.0:11434ollamaserve注意：这种方式只对当前CMD窗口有效，关闭窗口后就会失效。如果
Python——函数生如雪花 Python python
一、十进制小数转换成二进制小数【问题描述】编写程序，输入十进制小数（只考虑正数），把它转换为以字符串形式存储的二进制小数，输出该二进制小数字符串。对于转换得到的二进制小数，小数点后最多保留10位。小数点后不足10位，则输出这些位，尾部不补0；小数点后超出10位，则直接舍弃超出部分。【输入形式】十进制浮点小数【输出形式】对应输入小数的二进制小数字符串。若整数部分或者小数部分为0，则输出0。比如输入0
Python Web框架 Flask vs Django vs FastAPI ZengDerby python flask fastapi django
如果您需要构建大型的、功能丰富的应用程序，Django可能是一个很好的选择。如果您需要更灵活的框架，可以选择Flask来定制开发。而对于追求极致性能和高并发处理的项目，FastAPI可能是一个更加理想的选择。优缺点Flask在小型项目或微服务理想的选择。Flask灵活且轻量，非常适合快速开发小型应用。Flask是一个非常灵活的框架，它允许您根据项目需求进行定制。您可以根据需要选择合适的插件和扩展。
双一流软件工程大二听闻 Java 前景堪忧，是否该转C++或人工智能或者读研？程序员yt java c++人工智能
今天给大家分享的是一位粉丝的提问，双一流软件工程大二听闻Java前景堪忧，是否该转C++或人工智能或者读研？接下来把粉丝的具体提问和我的回复分享给大家，希望也能给一些类似情况的小伙伴一些启发和帮助。同学提问：yt老师好，我是双一流软件工程的大二学生，一直在学习java方向，目前掌握了数据库，spring框架等内容，大一暑假在老家一个小公司找了段实习，有蓝桥杯java组b组国一，专业排名前2（保研名
python if用法 IT技术土狗 python从入门到入狱 python
pythonif用法流程控制流程控制即控制流程，具体指控制程序的执行流程，而程序的执行流程分为三种结构：顺序结构（之前我们写的代码都是顺序结构）、分支结构（用到if判断）、循环结构（用到while与for）1、分支结构分支结构就是根据条件判断的真假去执行不同分支对应的子代码2、为什么需要分支结构人类某些时候需要根据条件来决定做什么事情，比如：如果今天下雨，就带伞所以程序中必须有相应的机制来控制计算
嵌入式音频框架alsa学习之pcm状态 Liu-Eleven linux声音框架音视频学习 pcm
/**PCMstate*/typedefenum_snd_pcm_state{/**Open*/SND_PCM_STATE_OPEN=0,/**Setupinstalled*/SND_PCM_STATE_SETUP,/**Readytostart*/SND_PCM_STATE_PREPARED,/**Running*/SND_PCM_STATE_RUNNING,/**Stopped:underru
Effective Modern C++ 条款6：auto推导若非己愿，使用显式类型初始化惯用法举个栗子2 Effective Modern C++c++
更多C++学习笔记，关注wx公众号：cpp读书笔记Item6:Usetheexplicitlytypedinitializeridiomwhenautodeducesundesiredtypes在Item5中解释了比起显式指定类型使用auto声明变量有若干技术优势，但是有时当你想向左转auto却向右转。举个例子，假如我有一个函数，参数为Widget，返回一个std::vector，这里的bool表
多线程编程之存钱与取钱周凡杨 java thread 多线程存钱取钱
生活费问题是这样的：学生每月都需要生活费，家长一次预存一段时间的生活费，家长和学生使用统一的一个帐号，在学生每次取帐号中一部分钱，直到帐号中没钱时通知家长存钱，而家长看到帐户还有钱则不存钱，直到帐户没钱时才存钱。问题分析：首先问题中有三个实体，学生、家长、银行账户，所以设计程序时就要设计三个类。其中银行账户只有一个，学生和家长操作的是同一个银行账户，学生的行为是
java中数组与List相互转换的方法征客丶 JavaScript java jsonp
1.List转换成为数组。（这里的List是实体是ArrayList) 　　调用ArrayList的toArray方法。　　toArray 　　public T[] toArray(T[] a)返回一个按照正确的顺序包含此列表中所有元素的数组；返回数组的运行时类型就是指定数组的运行时类型。如果列表能放入指定的数组，则返回放入此列表元素的数组。否则，将根据指定数组的运行时类型和此列表的大小分
Shell 流程控制 daizj 流程控制 if else while case shell
Shell 流程控制和Java、PHP等语言不一样，sh的流程控制不可为空，如(以下为PHP流程控制写法)： <?php if(isset($_GET["q"])){ search(q);}else{// 不做任何事情} 在sh/bash里可不能这么写，如果else分支没有语句执行，就不要写这个else，就像这样 if else if if 语句语
Linux服务器新手操作之二周凡杨 Linux 简单操作
1.利用关键字搜寻Man Pages man -k keyword 其中-k 是选项，keyword是要搜寻的关键字如果现在想使用whoami命令，但是只记住了前3个字符who，就可以使用 man -k who来搜寻关键字who的man命令 [haself@HA5-DZ26 ~]$ man -k
socket聊天室之服务器搭建朱辉辉33 socket
因为我们做的是聊天室，所以会有多个客户端，每个客户端我们用一个线程去实现，通过搭建一个服务器来实现从每个客户端来读取信息和发送信息。我们先写客户端的线程。 public class ChatSocket extends Thread{ Socket socket; public ChatSocket(Socket socket){ this.sock
利用finereport建设保险公司决策分析系统的思路和方法老A不折腾 finereport 金融保险分析系统报表系统项目开发
决策分析系统呈现的是数据页面，也就是俗称的报表，报表与报表间、数据与数据间都按照一定的逻辑设定，是业务人员查看、分析数据的平台，更是辅助领导们运营决策的平台。底层数据决定上层分析，所以建设决策分析系统一般包括数据层处理（数据仓库建设）。项目背景介绍通常，保险公司信息化程度很高，基本上都有业务处理系统（像集团业务处理系统、老业务处理系统、个人代理人系统等）、数据服务系统（通过
始终要页面在ifream的最顶层林鹤霄
index.jsp中有ifream，但是session消失后要让login.jsp始终显示到ifream的最顶层。。。始终没搞定，后来反复琢磨之后，得到了解决办法，在这儿给大家分享下。。 index.jsp--->主要是加了颜色的那一句 <html> <iframe name="top" ></iframe> <ifram
MySQL binlog恢复数据 aigo mysql
1，先确保my.ini已经配置了binlog： # binlog log_bin = D:/mysql-5.6.21-winx64/log/binlog/mysql-bin.log log_bin_index = D:/mysql-5.6.21-winx64/log/binlog/mysql-bin.index log_error = D:/mysql-5.6.21-win
OCX打成CBA包并实现自动安装与自动升级 alxw4616 ocx cab
近来手上有个项目,需要使用ocx控件 (ocx是什么? http://baike.baidu.com/view/393671.htm) 在生产过程中我遇到了如下问题. 1. 如何让 ocx 自动安装? a) 如何签名? b) 如何打包? c) 如何安装到指定目录? 2.
Hashmap队列和PriorityQueue队列的应用百合不是茶 Hashmap队列 PriorityQueue队列
HashMap队列已经是学过了的,但是最近在用的时候不是很熟悉,刚刚重新看以一次, HashMap是K,v键 ,值 put()添加元素 //下面试HashMap去掉重复的 package com.hashMapandPriorityQueue; import java.util.H
JDK1.5 returnvalue实例 bijian1013 java thread java多线程 returnvalue
Callable接口：返回结果并且可能抛出异常的任务。实现者定义了一个不带任何参数的叫做 call 的方法。 Callable 接口类似于 Runnable，两者都是为那些其实例可能被另一个线程执行的类设计的。但是 Runnable 不会返回结果，并且无法抛出经过检查的异常。 ExecutorService接口方
angularjs指令中动态编译的方法(适用于有异步请求的情况) 内嵌指令无效 bijian1013 JavaScript AngularJS
在directive的link中有一个$http请求，当请求完成后根据返回的值动态做element.append('......');这个操作，能显示没问题，可问题是我动态组的HTML里面有ng-click，发现显示出来的内容根本不执行ng-click绑定的方法！
【Java范型二】Java范型详解之extend限定范型参数的类型 bit1129 extend
在第一篇中，定义范型类时，使用如下的方式： public class Generics<M, S, N> { //M,S,N是范型参数 } 这种方式定义的范型类有两个基本的问题： 1. 范型参数定义的实例字段，如private M m = null;由于M的类型在运行时才能确定，那么我们在类的方法中，无法使用m，这跟定义pri
【HBase十三】HBase知识点总结 bit1129 hbase
1. 数据从MemStore flush到磁盘的触发条件有哪些？ a.显式调用flush，比如flush 'mytable' b.MemStore中的数据容量超过flush的指定容量，hbase.hregion.memstore.flush.size,默认值是64M 2. Region的构成是怎么样？ 1个Region由若干个Store组成
服务器被DDOS攻击防御的SHELL脚本 ronin47
mkdir /root/bin vi /root/bin/dropip.sh #!/bin/bash/bin/netstat -na|grep ESTABLISHED|awk ‘{print $5}’|awk -F:‘{print $1}’|sort|uniq -c|sort -rn|head -10|grep -v -E ’192.168|127.0′|awk ‘{if($2!=null&a
java程序员生存手册-craps 游戏-一个简单的游戏 bylijinnan java
import java.util.Random; public class CrapsGame { /** * *一个简单的赌*博游戏，游戏规则如下： *玩家掷两个骰子，点数为1到6，如果第一次点数和为7或11，则玩家胜， *如果点数和为2、3或12，则玩家输， *如果和为其它点数，则记录第一次的点数和，然后继续掷骰，直至点数和等于第一次掷出的点
TOMCAT启动提示NB: JAVA_HOME should point to a JDK not a JRE解决开窍的石头 JAVA_HOME
当tomcat是解压的时候，用eclipse启动正常，点击startup.bat的时候启动报错; 报错如下： The JAVA_HOME environment variable is not defined correctly This environment variable is needed to run this program NB: JAVA_HOME shou
[操作系统内核]操作系统与互联网 comsci 操作系统
我首先申明：我这里所说的问题并不是针对哪个厂商的，仅仅是描述我对操作系统技术的一些看法操作系统是一种与硬件层关系非常密切的系统软件，按理说，这种系统软件应该是由设计CPU和硬件板卡的厂商开发的，和软件公司没有直接的关系，也就是说，操作系统应该由做硬件的厂商来设计和开发
富文本框ckeditor_4.4.7 文本框的简单使用支持IE11 cuityang 富文本框
<html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> <title>知识库内容编辑</tit
Property null not found darrenzhu datagrid Flex Advanced propery null
When you got error message like "Property null not found ***", try to fix it by the following way: 1)if you are using AdvancedDatagrid, make sure you only update the data in the data prov
MySQl数据库字符串替换函数使用 dcj3sjt126com mysql 函数替换
需求：需要将数据表中一个字段的值里面的所有的 . 替换成 _ 原来的数据是 site.title site.keywords .... 替换后要为 site_title site_keywords 使用的SQL语句如下： updat
mac上终端起动MySQL的方法 dcj3sjt126com mysql mac
首先去官网下载: http://www.mysql.com/downloads/ 我下载了5.6.11的dmg然后安装,安装完成之后..如果要用终端去玩SQL.那么一开始要输入很长的:/usr/local/mysql/bin/mysql 这不方便啊,好想像windows下的cmd里面一样输入mysql -uroot -p1这样...上网查了下..可以实现滴. 打开终端,输入: 1
Gson使用一（Gson） eksliang json gson
转载请出自出处：http://eksliang.iteye.com/blog/2175401 一.概述从结构上看Json，所有的数据（data）最终都可以分解成三种类型：第一种类型是标量（scalar），也就是一个单独的字符串（string）或数字（numbers），比如"ickes"这个字符串。第二种类型是序列（sequence），又叫做数组（array）
android点滴4 gundumw100 android
Android 47个小知识 http://www.open-open.com/lib/view/open1422676091314.html Android实用代码七段（一） http://www.cnblogs.com/over140/archive/2012/09/26/2611999.html http://www.cnblogs.com/over140/arch
JavaWeb之JSP基本语法 ihuning javaweb
目录 JSP模版元素 JSP表达式 JSP脚本片断 EL表达式 JSP注释特殊字符序列的转义处理如何查找JSP页面中的错误 JSP模版元素 JSP页面中的静态HTML内容称之为JSP模版元素，在静态的HTML内容之中可以嵌套JSP
App Extension编程指南（iOS8/OS X v10.10）中文版啸笑天 ext
当iOS 8.0和OS X v10.10发布后，一个全新的概念出现在我们眼前，那就是应用扩展。顾名思义，应用扩展允许开发者扩展应用的自定义功能和内容，能够让用户在使用其他app时使用该项功能。你可以开发一个应用扩展来执行某些特定的任务，用户使用该扩展后就可以在多个上下文环境中执行该任务。比如说，你提供了一个能让用户把内容分
SQLServer实现无限级树结构 macroli oracle sql SQL Server
表结构如下：数据库id path titlesort 排序 1 0 首页 0 2 0,1 新闻 1 3 0,2 JAVA 2 4 0,3 JSP 3 5 0,2,3 业界动态 2 6 0,2,3 国内新闻 1 创建一个存储过程来实现，如果要在页面上使用可以设置一个返回变量将至传过去 create procedure test as begin decla
Css居中div，Css居中img，Css居中文本，Css垂直居中div qiaolevip 众观千象学习永无止境每天进步一点点 css
/**********Css居中Div**********/ div.center { width: 100px; margin: 0 auto; } /**********Css居中img**********/ img.center { display: block; margin-left: auto; margin-right: auto; }
Oracle 常用操作(实用) 吃猫的鱼 oracle
SQL>select text from all_source where owner=user and name=upper('&plsql_name'); SQL>select * from user_ind_columns where index_name=upper('&index_name'); 将表记录恢复到指定时间段以前
iOS中使用RSA对数据进行加密解密 witcheryne ios rsa iPhone objective c
RSA算法是一种非对称加密算法,常被用于加密数据传输.如果配合上数字摘要算法, 也可以用于文件签名. 本文将讨论如何在iOS中使用RSA传输加密数据. 本文环境 mac os openssl-1.0.1j, openssl需要使用1.x版本, 推荐使用[homebrew](http://brew.sh/)安装. Java 8 RSA基本原理 RS