Python —— 数据科学的手段 (第2版)配套代码

第一部分

第1章

3*'Python is easy! '
abs(3*4+(-2/5)**2)/(4.5-76)+max(-34,9)
import os
print (os.getcwd()) #查看目前的工作目录
os.chdir('D:/Python work') #改变工作目录

第2章

import os
os.getcwd()
import os
os.chdir('D:/work') #或者os.chdir('D:\\work')
import os
os.mkdir('work2')
import os
os.rmdir('work2')
import os
os.rename('fff.txt','fool.txt') #重命名
os.remove('h.txt')              #删除文件
ST='I am happy and you too'
print('length =',len(ST),'\n',ST[5:10],'\n',(ST+'! ')*2)
x=[list(range(5)),"Python is great!",["Program is art"],abs(-2.34),
   [[1,20],[-34,60]]]
y=(list(range(5)),"Python is great!",["Program is art"],abs(-2.34),
   [[1,20],[-34,60]])
Tup=3,4,6,[2,3],"Time"
z={'seq': list(range(5)), 'string': "Python is great!", 
   'ls': ["Program is art"], 'value': abs(-2.34), 
   'mat':  [[1,20],[-34,60]]}
s1='A great person'
s2=['you', 'I', 'they','we','you','he','they']
s3=(32,64,32,'He is the one',(2,3))
s4={'One': 234, 'Two': 45,'Three': 45}
print(set(s1),'\n',set(s2),'\n',set(s3),'\n',set(s4))
print(type(x),type(y),type(z),'\n',type('string'),type(3.5),type(7))
print(x[2:],'\n',y[-4:],'\n',z['mat'])
x=list(range(10))#=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
print(x[:3],x[7:],x[3:6],x[-3:],x[-1],x[:-4])
x='A poet can survive everything but a misprint.'
x[:10]+x[10:20]+x[20:30]+x[30:40]+x[40:]
x=[[1,15,3],[['People'],' above all']]
y=("Good morning",[2,5,-1])
z={'a': 'A string', 'b': [[2,3],'yes'],'c':{'A': [3,'Three',4],'B':range(5)}}
print(x[0][:2],x[1][1][:3],'\n',y[0][:3],'\n', z['c']['B'][-3:],'\n',z['b'][1][1:] )
for i in x:
    print(i)
    for k in i:
        print(k)
for i in list(z):
    print(z[i])
    for j in z[i]:
        if type(z[i])==dict:
            j=z[i][j]
        print(j)
for (i,j) in (x,y):
    print(i,j)
    for l in i:
        print(l)
    for m in j:
        print(m)
for i in range(len(x)):
    print(x[i])
    for j in range(len(x[i])):
        print(x[i][j])
print(list(range(-1,11,2)), list(range(2,7)), list(range(10,-10,-3)))
'I'+' have to say:'+' "You are '+ 'very '*2 + 'good!"'
print(['Hi!'] * 2+['I am']+['here']+["Isn't It"])
print(('Tiger','Lion')*2+('Wolf','Cat')+([1,-3.9],'Good'))
s='Good morning!'
x=[[1, 15, 3], [['People'], ' above all']]
y=('Good morning', [2, 5, -1])
z={'a': 'A string', 'b': [[2, 3], 'yes'], 
    'c': {'A': [3, 'Three', 4], 'B': range(0, 5)}}

print(len(s),len(x),len(y),len(z))
print(['People'] in x, ['People'] in x[1], 'Good' in s)
print('Good morning' in y, 'A string' in z, 'a' in z)
print(max('A', 'black', 'rose'),max([1,-5]), min(['people','leader']),
        min({"a":2,"b":4}))
x=[[3,7],'Oscar Wilde']
y=['save','the world',['is','impossible']]
x.append(y);print(x)
x=[[3,5,7],'Oscar Wilde']
y=['save','the world',['is','impossible']]
x.extend(y);print (x)
x=[[1,2],'Word',[3,5,7],'Oscar Wilde']
x.pop();print(x) #去掉最后一个
x=[[1,2],'Word',[3,5,7],'Oscar Wilde']
x.pop(2);print(x) #去掉下标为2的元素(即[3,5,7])
x=[[1,2],'Word',[3,5,7],'Oscar Wilde',[3,5,7]]
x.remove([3,5,7]);print(x)
x.remove([3,5,7]);print(x)
x.remove('Word');print(x)
y=('Efficiency', [2, [5, -1]])
print(type(y),'\n',list(y),type(list(y)),'\n',
      tuple(list(y)),type(tuple(list(y))))
z={'a': 'A string', 'b': [[2, 3], 'yes'], 'c': {'A': 'Why', 'B': 4}}
print('keys:\n',z.keys(),'\nget:\n',z.get('a'),
  '\nitems:\n',z.items(),'\nvalues:\n',z.values())
z.pop('c') #去掉'c'
print('pop last:',z.popitem()) #去掉剩下的最后一个('b')
print('after pop:',z) #还剩下'a'
z['new']=[[2,4],[5,7,9]];z
a={'a': (2,3),'b': ['word','sentence']}
b={2:[345,321],'a':("two","three")}
c={2:999,'b':'strong'}
print({**a,**b,**c})
print({**b,**a})
z['new']=34/56.2; z
del z['a'];z
y=zip(('100','A',1202,),'ABCDE',['I', 'like','apple','very much'])
print('y=',y)
print('list(y)=',list(y))
print('list(y)=',list(y))
y2=list(y) 
print('y2=',y2)
print('y=',y)
A=(2,'5','Today');B=[30,'tax',[5,4]]
D=dict()
for i in range(len(A)):
    D[A[i]]=B[i]
print(D)
A=(2,'5','Today');B=[30,'tax',[5,4]]
print(dict(zip(A,B)))
A=(2,'5','Today')
print(list(zip(A)))
A=('What','is','this');B=[30,'tax',[5,4]]
for i in zip(A,B):
    print(i)
for i,j in zip(A,B):
    print(i)
    print(j)
A=('What','is','this');B=[30,'tax',[5,4]]
ZIP=zip(A,B)
for i in ZIP:
    print(i)
for i,j in ZIP:
    print(i)
    print(j)
A='ABCD';B=[1,2,3,4]
x=list(zip(A,B))
print(x)
A,B=zip(*x)
print('A=',A,'; B=',B)
year=[2017,2018,2019];inport=[2800,3496,4765];export=[3990,5023,8766]
for i,j,k in zip(year,export,inport):
    print('In year',i ,' red=',j-k)
height=[1.74,1.83,1.69];weight=[55, 62, 71];name=['Tom', 'Jack','Smith']
print('sort by height:',sorted(zip(height,weight,name)))
print('sort by weight:',sorted(zip(weight,height,name)))
print('sort by name:',sorted(zip(name,height,weight)))
A = set('geography');print(A)
v={(1,3,6),'world',((2,3),(1,7)),'world',('world')};v
for i in v:
    print(i)
A={1,4,'world',(3,4,'country'),'world',1}
print(list(A),type(list(A)),'\n',tuple(A),type(tuple(A)),'\n',
  set(list(A)),type(set(list(A))),'\n',set(tuple(A)),type(set(tuple(A))))
u={1.2,5.7,'word',(1,4),('key',5)}
u.add((2,6,1,'sun'));u
x=set(['I','you','he','I','they','we','we'])
x.remove('I');print(x)
A={1,4,'world',(3,4,'country')}
B={'world',5,1,('one','two')}
A|B #union 可试试 A|=B
A={1,4,'world',(3,4,'country')}
B={'world',5,1,('one','two')}
print(A&B,'\n',A,B) #可试试: A.intersection(B)
A={1,4,'world',(3,4,'country')}
B={'world',5,1,('one','two')}
print(A-B) #可试试 A -= B
A={1,4,'world',(3,4,'country')}
B={'world',5,1,('one','two')}
print(A ^ B,'\n',(A|B)-(A&B))
A = {1, 2, 3, 1, 2}
B = {3, 2, 3, 1}
print(A == B,A!=B,A<=B,B>A&B,{1,3,4}>={1,3},)
print(set.difference(set(['a',2,'5']),set(['a',7])))
print(set.union(set(['a',2,'5']),set(['a','a',7])))
print(set.intersection(set(['a',2,'5']),set(['a','a',7])))
from collections import Counter
s1='pneumonoultramicroscopicsilicovolcanoconiosis'
s2=['you', 'I', 'they','we','you','he','they']
s3=(32,64,32,'He is the one',(2,3))
s4={'One': 234, 'Two': 45,'Three': 45,'One': 299,'One': 23}
s5=set(s2)
print(Counter(s1),'\n',Counter(s2),'\n',Counter(s3),'\n',
  Counter(s4),'\n', Counter(s5))
s=[(1,2,4),'happy',('peace','and', 'war'),'happy']
u=['happy',(1,2,4),('peace','and', 'war')]
print(Counter(s)==Counter(u),set(s)==set(u))
print(Counter(s))
print(dict(Counter(s)))
print(list(Counter(s)))
print(tuple(Counter(s)))
print(set(Counter(s)))
def Positive(x):
    """Return a list holding the strictly positive elements of *x*.

    *x* may be any iterable of numbers (list, tuple, set, ...).  Elements are
    kept in iteration order and repeated values are preserved.
    """
    return [item for item in x if item > 0]
print(Positive([-2,-2,3,5,7,3])) #对list
print(Positive((-2,-2,3,5,7,3))) #对tuple
print(Positive({-2,-2,3,5,7,3})) #对set
def f(x):
    """Return x squared minus x."""
    return x**2 - x


def g(x, y):
    """Return the larger of x**2 and y**3 + x."""
    return max(x**2, y**3 + x)
f(0.8),g(3.4,0.5) #把数值代入函数执行
print(list(map(lambda x: x**2+1-abs(x), [1.2,5.7,23.6,6,1.2])))
print(list(map(lambda x: x**2+1-abs(x), (1.2,5.7,23.6,6,1.2))))
print(list(map(lambda x: x**2+1-abs(x), {1.2,5.7,23.6,6,1.2})))
for i in map(lambda x: x**2+1-abs(x), [1.2,5.7,23.6,6,1.2]): print(i)
for j in map(lambda x: x**2+1-abs(x), (1.2,5.7,23.6,6,1.2)): print(j)
for j in map(lambda x: x**2+1-abs(x), {1.2,5.7,23.6,6,1.2}): print(j)
print(tuple(map(lambda x,y: x**2*y-abs(x)/y, [1.2,5.7],[-45,26])))
print(tuple(map(lambda x,y: x**2*y-abs(x)/y, (1.2,5.7),(-45,26))))
print(tuple(map(lambda x,y: x**2*y-abs(x)/y, {1.2,5.7},{-45,26})))
gg=lambda x,y: x**2*y-abs(x)/y
print(gg(1.2,-45),gg(5.7,26))#表面上的`次序`
print(gg(1.2,26),gg(5.7,-45))#实际次序
print(list(filter(lambda x: x>0,[-1,4,-5,7])))#滤去list的负值
print(list(filter(lambda x: x>0,(-1,4,-5,7,-5,8,7))))#滤去tuple的负值
print(list(filter(lambda x: x>0,{-1,4,-5,7,-5,8,7})))#滤去set的负值
print(list(filter(lambda x: abs(x)>5,range(-10,12,2))))#取绝对值大于5的值
list(filter(bool,('',(1,2,4),'happy',0,None,True,False,2020)))
def Age():
    """Guess the user's age by bisecting the interval [0, 120].

    Asks six yes/no questions on standard input.  An answer of 'Y' or 'y'
    raises the lower bound to the current guess; any other answer lowers the
    upper bound to it.  The final guess is printed truncated to an integer.
    """
    upper = 120.
    lower = 0
    guess = upper/2
    for _ in range(6):
        answer = input("Is your age greater than %s ? Input 'Y' or 'N':" % guess)
        if answer in ('Y', 'y'):
            lower = guess
        else:
            upper = guess
        # The next guess is the midpoint of the narrowed interval.
        guess = lower+(upper-lower)/2
    print('Your age is about {} years old'.format(int(guess)))
Age() #执行上面函数的语句
# Read a name from standard input and echo it back.
x=input('Type your name please: ')
print('My name is',x)
# NOTE(review): eval() executes whatever expression the user types, which is
# unsafe for untrusted input; float() or ast.literal_eval() would be safer.
x=eval(input('Type any number: '))
# x**(1/2) is the square root (a complex value results when x is negative).
print('The square root of your number is',x**(1/2))
print('World'!='word')
print(34==34.0)
print(3>2 and 4>=3)
print(3<2 or 'c'>='a')
print(not 3<2)
print('A'<'a' and 'A'>'1')
def f(x):
    """Cubic polynomial 2x^3 - 4x^2 + 5x - 20 (default target of solf)."""
    return 2*x**3-4*x**2+5*x-20


def solf(f=f):
    """Find a root of *f* by bisection, starting from the bracket [2, 3].

    The search assumes f is negative at the lower end of the bracket and
    positive at the upper end.  It stops as soon as |f(x)| <= 1e-18 or, since
    that tolerance is tighter than double precision can usually deliver, when
    the bracket can no longer be narrowed (the midpoint stops changing).
    Without the second condition the loop would spin forever in that case.

    Returns the last midpoint examined.
    """
    x1=3.
    x0=2.
    x=x1/2.
    e=10**(-18) # requested precision
    while True:
        fx=f(x)
        if abs(fx)<=e:
            break
        # Keep the half of the bracket in which the sign change lies.
        if fx<0:
            x0=x
        else:
            x1=x
        mid=x0+(x1-x0)/2
        if mid==x:
            # Bracket has collapsed to adjacent floats: no further progress
            # is possible, so accept the current estimate.
            break
        x=mid
    return x
solf(f) #运行函数solf
# Read a number and transform it according to its sign.
# NOTE(review): eval() runs arbitrary user-supplied expressions; prefer
# float() or ast.literal_eval() when the input is not trusted.
x=eval(input('Enter a number'))
if x<0:
    # negative input is replaced by its square
    x=x**2
    w='x is negative and change to'
elif x==0:
    # zero is replaced by 1.0
    x=x+1.
    w='x=0 and change to'
else:
    # positive input is replaced by its cube
    x=x**3
    w='x>0 and change to'
print (w,x) 
import random # load the module
random.seed(1010) # fix the seed so the results below are reproducible
print(random.randint(1,100)) # one random integer from 1 to 100 (both included)
print(random.choice([1,2.0,4,'word'])) # one random element of the list
print(random.sample(range(100),5)) # 5 distinct numbers from [0,100) (100 excluded)
print(random.sample([1,2.0,4,'word'],2)) # 2 distinct elements of [1,2.0,4,'word']
print(random.random()) # a random float in [0.0,1) (1 excluded)
print(random.uniform(2,5)) # a uniformly distributed random float between 2 and 5
print(random.gauss(3,5)) # a normal random number with mean 3 and standard deviation 5
x=99;y=x;print(x,y,id(x)==id(y))
y=10;print(x,y,id(x)==id(y))
x=[1,2,3];y=x;y[0]=10;print(x,y,id(x)==id(y))
x[2]='test';print(x,y,id(x)==id(y))
x=[1,2];y=x[:]
print(x,y,id(x)==id(y),id(x[0])==id(y[0]),id(x[1])==id(y[1]))
print(id(x),id(y),id(x[0]),id(y[0]),id(x[1]),id(y[1]))#位置(与电脑有关)
x=[1,2];y=x[:]
y[0]=33;print(x,y)
x=[1,2,5,2];y=x;y[0]=3;y[3]=99
print(x,y)
x=[1,2,5,2];y=x;x[0]=7;x[3]=88;y[2]='string'
print(x,y)
x=[1,2,5,2];y=x[:];y[0]=3;y[3]=99
print(x,y)
x=[1,2,5,2];y=x[:];x[0]=44;y[3]=77
print(x,y)
# Compare the type produced by eval(input(...)) with that of plain input().
# NOTE(review): eval() executes arbitrary user input; ast.literal_eval()
# would still yield int/float here without running code.
x=eval(input('Enter a number'))
print(x,type(x))
# input() on its own always returns a str.
y=input('Enter a word')
print(y,type(y))
p=open('PYGMALION.txt','r') #打开文件
print('file name=',p.name)#打印文件名
print('Is file closed? ', p.closed) #是否关闭了
print('Access mode=',p.mode) #可访问的权限
print('position=', p.tell()) #指针位置
print(p.read(194))            #读取并打印头194字节(byte)     
print('position=', p.tell())  #显示指针(读到哪里了)
import textwrap
p.seek(0,0)                 #指针位置归零
print('Position=',p.tell())
print("\n".join(textwrap.wrap(p.read(194),70)))
print('Position=',p.tell())
p.close()                   #关闭
print('Is file closed? ', p.closed)
a=open('fool.txt','w')
a.write('A message ')
a.write('and more.')
a.close()
b=open('fool.txt','a')
b.write(' OK?')
b.close()
b=open('fool.txt','r+')
print(b.read(100))
b.write(' OK?')
b.seek(0,0) #回到指针0, 再读取, 看加入的内容有没有.
print(b.read(100))
b.close()
# Inspect basic attributes of the text-mode file object.  Each handle is
# opened in a `with` block so it is closed again instead of being leaked.
with open("UN.txt") as O:
    print(O.name)
    print(O.encoding)
    print(O.mode)
# Iterate over the file line by line; every line keeps its trailing newline,
# so print() leaves a blank line after each one.
with open("UN.txt") as O:
    for line in O:
        print(line)
# Read the whole file into one string.
with open("UN.txt", "rt") as O:
    text = O.read()
print(text)
# Collect every whitespace-separated word of UN.txt that ends with 'lity'.
# The file is read inside a `with` block so the handle is always closed.
with open("UN.txt") as O:
    x=[word for line in O for word in line.split() if word.endswith('lity')]
print('There are', len(x), 'words ended with "lity", they are:\n',x)
# b: all lines, c: non-empty lines, d: words, e: characters inside words
# (whitespace is not counted).
b=0;c=0;d=0;e=0
with open("UN.txt") as O:       # `with` closes the file when the loop ends
    for line in O:
        words=line.split()      # split once and reuse the result
        b+=1
        if words:               # a line with no words counts as empty
            c+=1
        d+=len(words)
        e+=sum(len(word) for word in words)
print('Total {} lines with {} no-empty lines, {} words and {} characters'\
      .format(b,c,d,e))
print('Integer: {:2d}, float: {:1.2f}, \
anything: {} and: {}'.format(234,21.5, 2.718, 'Hi!'))
print('Integer: %s, float: %s, anything: %s and: %s' %(234,21.5, 2.718, 'Hi!'))
# Count how many whitespace-separated words of UN.txt are exactly 'Whereas'.
b=0
with open("UN.txt") as O:       # `with` closes the file when the loop ends
    for line in O:
        b+=line.split().count('Whereas')
print('The count of word "Whereas" is %s' %b)
import textwrap
# Print the first three non-empty lines of OW.txt, wrapped at 70 columns,
# together with their word counts.
c=0
with open("OW.txt") as O:       # `with` closes the file when done
    for line in O:
        if c>=3:
            break               # nothing more to print; stop reading
        words=line.split()
        if words:
            c+=1
            print('The line {} has {} words:'.format(c,len(words)))
            print("\n".join(textwrap.wrap(line,70)))
import textwrap
# Print the first three lines of OW.txt that are longer than one character
# (this skips lines holding only a newline), wrapped at 70 columns, together
# with their character counts.
c=0
with open('OW.txt') as g:       # `with` closes the file even on error
    for line in g.readlines():
        if len(line)>1 and c<3:
            c+=1
            # The count must refer to the current line; the original used an
            # unrelated variable `i` here.
            print('Line {} has {} characters'.format(c,len(line)),'\n',
                  "\n".join(textwrap.wrap(line,70)))
import textwrap
g=open('OW.txt')
print('The 9th line:\n', "\n".join(textwrap.wrap(g.readlines()[8],70)) )
g.seek(0,0)
print('The 8th words of the 9th line:\n',\
      "\n".join(textwrap.wrap(g.readlines()[8].split()[7],70)))
g.close()

第3章

class Customer(object):
    """A bank customer holding a single account.

    Attributes:
    name: the customer's name.
    balance: the current account balance.
    p: penalty rate (a fraction) applied when a withdrawal overdraws.
    r: reward rate (a fraction) applied when a deposit leaves a positive balance.
    withd: amount of the most recent withdrawal (set by withdraw).
    depos: amount of the most recent deposit (set by deposit).
    """

    def __init__(self, name, balance=0.0, penalty=0.3, reward=0.1):
        """Create a customer called *name* with opening *balance*,
        penalty rate *penalty* and reward rate *reward*."""
        self.name, self.balance = name, balance
        self.p, self.r = penalty, reward

    def withdraw(self, amount):
        """Take *amount* out of the account and return the new balance.

        If the account ends up overdrawn, the negative balance is enlarged
        by the penalty rate.
        """
        self.withd = amount
        remaining = self.balance - amount
        if remaining < 0:
            remaining = remaining*(1+self.p)
        self.balance = remaining
        return remaining

    def deposit(self, amount):
        """Put *amount* into the account and return the new balance.

        If the account ends up positive, the whole balance is increased
        by the reward rate.
        """
        self.depos = amount
        total = self.balance + amount
        if total > 0:
            total = total*(1+self.r)
        self.balance = total
        return total
print(Customer.__doc__)
print(Customer.withdraw.__doc__)
print(Customer.deposit.__doc__)
Jack=Customer('Jack',1000, 0.7, 0.25)
print('Name=', Jack.name)
print('Original balance=', Jack.balance) 
Jack.withdraw(1500)
print('Withdraw {}, balance={}'.format(Jack.withd,Jack.balance))
print('Penalty rate={}, Reward rate={}'.format(Jack.p, Jack.r))
Jack.deposit(3700)
print('Deposite {}, balance={}'.format(Jack.depos,Jack.balance))
print('Penalty rate={}, Reward rate={}'.format(Jack.p, Jack.r))
June=Customer('Smith',30, 0.44, 0.13)
print('Name=', June.name)
print('Original balance=', June.balance)
June.withdraw(20)
print('Withdraw {}, balance={}'.format(June.withd,June.balance))
print('Penalty rate={}, Reward rate={}'.format(June.p, June.r))
June.deposit(125)
print('Deposite {}, balance={}'.format(June.depos,June.balance))
print('Penalty rate={}, Reward rate={}'.format(June.p, June.r))
print(Jack.__doc__)
print(Jack.withdraw.__doc__)
print(Jack.deposit.__doc__)
class Son(Customer):
    """Customer variant with harsher penalties and larger rewards.

    Each operation records the rates it actually used in p0 (penalty) and
    r0 (reward); the base rates p and r are never modified.
    """

    def withdraw(self, amount):
        """Withdraw *amount*; return (new balance, penalty rate in effect).

        Only when the balance drops below -30 is a penalty applied, at ten
        times the base rate.
        """
        self.withd = amount
        self.r0 = self.r
        effective_penalty = self.p
        remaining = self.balance - amount
        if remaining < -30:
            effective_penalty = effective_penalty*10
            remaining = remaining*(1+effective_penalty)
        self.p0 = effective_penalty
        self.balance = remaining
        return remaining, effective_penalty

    def deposit(self, amount):
        """Deposit *amount*; return (new balance, reward rate in effect).

        When the balance ends up positive it is increased at three times
        the base reward rate.
        """
        self.depos = amount
        self.p0 = self.p
        effective_reward = self.r
        total = self.balance + amount
        if total > 0:
            effective_reward = effective_reward*3
            total = total*(1+effective_reward)
        self.r0 = effective_reward
        self.balance = total
        return total, effective_reward
Jackson=Son('Jackson',30, 0.44, 0.13)
print('Name=', Jackson.name)
print('Original balance=', Jackson.balance) 
print('Original Penalty rate={}, Reward rate={}'.format(Jackson.p,\
       Jackson.r))
Jackson.withdraw(250)
print('Withdraw {}, balance={}'.format(Jackson.withd,Jackson.balance))
print('Penalty rate={}, Reward rate={}'.format(Jackson.p0, Jackson.r0))
Jackson.deposit(5000)
print('Deposite {}, balance={}'.format(Jackson.depos,Jackson.balance))
print('Penalty rate={}, Reward rate={}'.format(Jackson.p0, Jackson.r0))
Jackson.deposit(50)
print('Deposite {}, balance={}'.format(Jackson.depos,Jackson.balance))
print('Penalty rate={}, Reward rate={}'.format(Jackson.p0, Jackson.r0))

第二部分

第4章

import numpy as np
np.random.seed(1010) # random seed
np.random.rand(2,5,3) # 30 random numbers in [0.0,1.0) arranged as a 2x5x3 array
np.random.randn(3,5) # 15 standard normal random numbers arranged as a 3x5 array
np.random.normal(3,5,100) # 100 normal random numbers with mean 3 and standard deviation 5
np.random.uniform(3,7,100) # 100 uniform random numbers with bounds 3 and 7
np.random.randint(3,30,34) # 34 random integers in [3,30) (30 excluded)
# 34 random integers in the closed interval [3,30] (30 included, unlike randint).
# NOTE(review): random_integers is deprecated in NumPy; randint(3,31,34) is the
# documented replacement.
np.random.random_integers(3,30,34)
x=[2,5,-7.6]
# Draw 20 samples from x with replacement, using the probabilities p
np.random.choice(x,20,replace=True,p=[0.1,0.3,0.6])
# Draw 2 samples from x without replacement, all elements equally likely
np.random.choice(x,2,replace=False)
np.random.permutation(range(10)) # random permutation of the integers 0..9
# A plain list holding elements of different types and lengths ...
x0=[[1,3,-5],[3,4],'It is a word',(2,6),{3:51,'I':(2,1)}]
# ... and the same content as a NumPy array.  The elements are ragged, so the
# array has to be created with dtype=object: recent NumPy versions raise
# ValueError for inhomogeneous input when no dtype is given.
x=np.array([[1,3,-5],[3,4],'It is a word',(2,6),{3:51,'I':(2,1)}],dtype=object)

print(x0,'\n', x)
# Element access works the same way for the list and for the object array.
print(x0[0][:2], x0[4][3],x0[2][3:5],x0[4]['I'],len(x0))
print(x[0][:2], x[4][3],x[2][3:5],x[4]['I'],x.shape,x.size)
y=np.array(((2,1,-7),[5.5,21,32],(3,8.,1)))
z=np.array((((2,3),(1,43),[2,8]),[[2,3],[3,1],(9,5)]))
print(y,'\n',z,'\nshape of y ={}, shape of z ={}, \
\ndim of y={}, dim of z={}, size of y={}, size of z={}'.format(y.shape,\
    z.shape,y.ndim,z.ndim,y.size,z.size))
np.arange(3.2),np.arange(3.2,7.8),np.arange(2.2,5.8,.5),np.arange(2.3,-9,-1.5)
np.linspace(-2.1,6,3),np.linspace(-2.5,-16,4)
a=np.array([[2,5,-1,2,10],(3,1,4.,6,34)])
print(np.zeros([2,3]),'\n',np.ones((2,4)),'\n',
      np.full((2,5),-np.inf),'\n',np.zeros_like(a),'\n',np.eye(3),
      '\n',np.identity(2))
a=np.array([[2,5,-1,2],(3,1,4.,6)])
print(np.empty((2,3)),'\n',np.empty_like(a))
np.fromfunction(lambda i, j: i**2 + i*j, (3, 4))
np.random.seed(1010)
a=np.random.rand(3,4)
# Boolean mask that is True where the row index equals the column index.
# It is called diag_mask rather than `id` so the builtin id() stays usable.
diag_mask=np.fromfunction(lambda i, j: i==j, (3, 4))
# Indexing with the mask picks the entries a[0,0], a[1,1], a[2,2].
diag_mask,a,a[diag_mask]
x = np.random.randn(5,3) #产生标准正态随机数组成的5乘3矩阵
np.savetxt('tabs1.txt',x) #存成以制表符分隔的文件
np.savetxt('commas1.csv',x,delimiter=',') #存成以逗号分隔的文件(如csv)
u = np.loadtxt('commas1.csv',delimiter=',') #读取以逗号分隔的文件
v = np.loadtxt('tabs1.txt') #读取以制表符分隔的文件
print('Shape of x, u and v are: [%s, %s ,%s]'%(x.shape,u.shape,v.shape))
print('x has', x.ndim, 'dimensions')
print('x and u are identical? %s' %(np.sum(x!=u)==0))
print('x and v are identical? %s' %(np.sum(x!=v)==0))
# A 2x2x3 array used to illustrate slicing along each axis.
y = np.array([[[1,4,7],[2,5,8]],[[3,6,9],[10,100,1000]]])
# NOTE(review): np.asmatrix is not an equivalent constructor here --
# np.matrix objects are strictly 2-D, whereas y has three dimensions.
print('y=\n',y)
print('y[0,:,:]=\n',y[0,:,:])
print('y[1,:,:]=\n',y[1,:,:])
print('y[:,0,:]=\n',y[:,0,:])
print('y[:,1,:]=\n',y[:,1,:])
print('y[:,:,0]=\n',y[:,:,0])
print('y[:,:,1]=\n',y[:,:,1])
print('y[1,0,0]={}, y[0,1,:]={}'.format(y[1,0,0],y[0,1,:]))
print('shape of y=', np.shape(y),'\ndimension of y=', y.ndim)
# type(y) is the container class; y.dtype is the element type.
print('"type(y)"=%s, "y.dtype"=%s' %(type(y),y.dtype))
x=np.arange(16).reshape(2,8);x
x.reshape(4,4),x.reshape(1,-1)
x.reshape(2,-1,4),x.reshape(4,-1,2).shape #shape (4,2,2)
x=np.arange(4)
print(x[np.newaxis,:],x.reshape(1,-1)) # row vector: 1x4 matrix
print(x[:,np.newaxis]==x.reshape(-1,1)) # column vector: 4x1 matrix
x=np.arange(5)
print(np.resize(x,(2,8)),'\n',np.resize(x,(1,3)))
import numpy as np
u=np.array([0, 1, 2]);v=np.array([5,2,7]) #整型list转换成np.array
print('shape of u=%s; shape of v=%s' %(u.shape,v.shape)) #形状
print('type of u=%s, type of v=%s' %(u.dtype,v.dtype)) #输出u和v类型
print('type of (u+v) is %s, type of (u*v) is %s, \ntype of (u/v)is %s,\
type of (u**v)is %s' %((u+v).dtype,(u*v).dtype,(u/v).dtype,(u**v).dtype))
print("u+v,u*v,u/v:u**v:\n",u+v,u*v,u/v,u**v) 
x=np.array([1,3,2.7]);y=np.array([2,-2.5,-1])
print(x+y,'\n',x-y,'\n',x/y,'\n',x**y)
x=np.array([[1,3,2],[2,3,1]]) 
#上式等价于 x=np.asmatrix([[1,3,2],[2,3,1]])
print('x=\n',x)
print('x**3=\n',x**3,'\n3**x=\n',3**x)
x=np.array([1,3,2]) #行矩阵
y=np.array((2.,-2)).reshape(-1,1) #变成列矩阵
x=np.ones((3,4)) 
y=np.arange(4)
z=np.arange(3)
print(x*y[np.newaxis,:])#等价于 x*y.reshape(1,-1), y*x 和 x*y
print(x*z[:,np.newaxis]) #等价于 x*z.reshape(-1,1))
x = np.array([ 123.858, 112.9652, -16.4278])
print(np.round(x,3),np.round(x, -2)) #四舍五入位数(负数为小数点前位数)
print(np.around(x,3),np.around(x,-2)) #同上
print(np.floor(x),np.ceil(x)) #比x小的最大整数及比x大的最小整数
x=np.array([-2,7,-1,9,6,-5]).reshape(2,3)
print('x=','\n', x)
print('np.max(x)=', np.max(x))
print('np.argmax(x)=', np.argmax(x))
print('x=','\n', x)
print('x.max(0)=' ,x.max(axis=0),'x.argmax(0)=' ,x.argmax(axis=0))
print('x.min(1)=' ,x.min(axis=1),'x.argmin(1)=', x.argmin(axis=1))
x = np.array([123.858, -23.6, 112.9652, -16.4278])
print('sum=', np.sum(x),'\ncumsum=', np.cumsum(x)) #和及累积和
print('prod=',np.prod(x),'\ncumprod=', np.cumprod(x)) #乘积及累积乘积
print('diff(x)=',np.diff(x)) #差分
x.shape=2,2 #把x转换成2乘2矩阵
print('x=\n',x)
print('diff by column =',np.diff(x,axis=0)) #按列(对不同的行元素)差分
print('diff by row =\n',np.diff(x,axis=1)) #按行(对不同的列元素)差分 
y=np.arange(32).reshape(2,2,8)
y.sum(axis=0) # 2x8
y.sum(axis=(0,1)) 
print('sign(x)=\n' ,np.sign(x),'\nexp(x)=\n', np.exp(x))
print('log(abs(x))=\n', np.log(np.abs(x)),'\nx**2=\n', x**2)
x=np.arange(3,5,.5) #从3到5(不包含5)等间隔为0.5的数列
y=np.arange(4)
print(x,y,x.shape,y.shape)
print('np.dot(x,y)={}, np.sum(x*y)={}'.format(np.dot(x,y),np.sum(x*y)))
np.random.seed(1010)
x=np.random.randn(3,5)
y=np.random.randn(3,5)
print(x.dot(y.T)) #x 和 y 的转置做矩阵乘法
print(x.T.dot(y)) # x 转置和 y做矩阵乘法
x=np.arange(3);y=np.linspace(1,10,5)
x,y,np.outer(x,y)
x=np.array(['I', 'am', 'OK'], dtype=object);y=np.arange(5)
x,y,np.outer(x,y)
x = np.array([[1.0,2.0,4],[3.0,4.0,-1]])
y = np.array([[5.0,6.0,-2],[7.0,8.0,9]])
print('x.shape=',x.shape,'y.shape=',y.shape) #都是2乘3矩阵
print('x=\n',x,'\ny=\n',y)
z = np.vstack((x,y)) #x,y纵向叠加合并成4乘3矩阵
z1 = np.hstack((x,y)) ##x,y横向叠加合并成2乘6矩阵
print('z=\n',z,'\nz1=\n',z1, '\nz.shape=',z.shape,
  'z1.shape=', z1.shape)
z = np.concatenate((x,y),axis=0)#等同于 np.vstack((x,y))
z1 = np.concatenate((x,y),axis=1) #等同于 np.hstack((x,y)) 
x = np.arange(24).reshape(4, 6)
print('x= %s \n hsplit=\n%s'%(x,np.hsplit(x,2)))
np.vsplit(x,4)
x=np.arange(9)
np.split(x,(2,5,7,12))
u=np.array([[11,32,26],[47,54,89],[92,64,95]]);u
np.insert(u,1,0) #相当于np.insert(u.flatten(),1,0)
np.insert(u,1,0,axis=1)
np.insert(u,1,0,axis=0)
np.insert(u,1,[[1,2,3],[4,5,6]],axis=0)
np.insert(u,1,np.array([[1,2,3],[4,5,6]]),axis=1)
np.random.seed(1010)
x=np.arange(12).reshape(2,6)
y=x;z=x.copy()
print(y is x,'\n',y==x,'\n',z is x,'\n',z==x)
print(x)
y[0,0]=99;z[0,:]=-777
print(x,'\n',y,'\n',z)
x=np.zeros((4,5))+999 #产生全部元素为999的4乘5矩阵
print('x=\n',x)
x[0,:]=np.pi #第0行全部赋值为圆周率pi
print('x=\n',x)
x[0:2,0:2]=0 #0到1行及0到1列赋值为0
print('\nx=\n',x)
x[:,4]=np.arange(4) #第4列赋值为0,1,2,3
print('\nx=\n',x)
x[1:3,2:4]=np.array([[1,2],[3,4]]) #1到2行及2到3列用2乘2矩阵赋值
print('\nx=\n',x)
x=np.c_[0:12:4] #从0开始, 间隔4, 直到(但不包含)12为止的列向量
y=np.arange(0,12,4).reshape(-1,1) #等价语句
print('x=\n',x)
print('y=\n',y)
print('Is x and y identical? ',np.sum(x-y)==0)
x=np.c_[0:10:3j] #从0开始, 3个元素, 直到(包含)10为止的列向量
y=np.arange(0,11,10/(3-1)).reshape(-1,1) #等价语句
print('x=\n',x)
print('y=\n',y)
print('Is x and y identical? ',np.sum(x-y)==0)
x=np.r_[0:10:4] #从0开始, 间隔4, 直到(但不包含)10为止的行向量
y=np.arange(0,10,4) #等价语句
print('x=\n',x)
print('y=\n',y)
print('Is x and y equal? ',np.sum(x-y)==0)
x=np.c_[0:10:5j] #从0开始, 5个元素, 直到(包含)10为止的列向量
y=np.arange(0,12,10/(5-1))[:,np.newaxis] #等价语句
print('x=\n',x)
print('y=\n',y)
x = np.arange(-10,10,.2)
y = np.arange(-10,10,.2)
X, Y = np.meshgrid(x, y)
Z = X**2 + Y**2
print(X.shape,Y.shape,Z.shape)
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10,4))
ax = fig.add_subplot(111, projection='3d')
ax.plot_wireframe(X, Y, Z, rstride=10, cstride=10)
plt.show()
x = np.array([[10,2,7],[3,5,4],[45,76,100],[30,2,0]]) 
y=np.diag(x) #对角线元素
z=np.diag(y) #x的对角线元素组成的对角型方阵(非对角型元素为0)
print('x=\n{}\ny=diag(x)=\n{}\nz=diag(y)=\n{}'.format(x,y,z))
x = np.array([[10,2,7],[3,5,4],[45,76,100],[30,2,0]]) 
print('np.triu(x)=\n' ,np.triu(x)) #x上三角阵
print('np.tril(x)=\n',np.tril(x)) #x下三角阵
np.random.seed(1010)
x = np.random.randn(50,5)
va,ve=np.linalg.eig(np.corrcoef(x.T))
print('eigen values=\n{}\neigen vectors=\n{}'.format(va,ve))
import numpy as np
np.random.seed(1010)
x=np.random.randn(3,4)
print('x=\n',x)
# Singular value decomposition: d holds the singular values and v is
# returned already (conjugate-)transposed.
u,d,v= np.linalg.svd(x)
print('u=\n',u)
print('D=\n',np.diag(d))
print('v=\n',v)
print('condition number=',np.linalg.cond(x)) # condition number
# Check: the condition number equals the ratio of the largest to the smallest
# singular value.  The comparison uses a floating-point tolerance; the former
# test `difference < 10**15` was true for any values.
print('Are they equal?',np.isclose(np.max(d)/np.min(d),np.linalg.cond(x)))
Z=np.array([[1,-2j],[2j,5]])
print('Z=\n',Z)
L=np.linalg.cholesky(Z) #Cholsky分解
print('L=\n',L)          #L
L1=L.T.conj()
print('L.T.conj()=\n',L1) #L的共轭转置
print(np.sum(np.dot(L,L1)-Z)) #验证其等于Z (差的元素总和为0)
np.random.seed(1010)
A=np.random.randn(3,3)#产生一个标准正态随机数的矩阵A
print('inverse of A=\n',np.linalg.inv(A)) #A的逆
print('determinant of A=\n',np.linalg.det(A)) #行列式|A|
b=np.random.randn(3)
print('solution of Ax=b:\n',np.linalg.solve(A,b)) #解联立方程Ax=b
np.random.seed(1010)
X = np.random.randn(100,3)         #无截距项的自变量
X1=np.hstack((np.ones((100,1)),X)) #有截距项的自变量
y = X.dot(np.array([1,2,3]))+np.random.randn(100)
print('OLS without intercept:')
beta, SSR, rank, sv= np.linalg.lstsq(X,y,rcond=None)#无截距最小二乘法
print('beta={}\nSSR={}\nrank={}\nsv={}'.format(beta, SSR, rank, sv))
print('OLS with intercept:')
beta, SSR, rank, sv= np.linalg.lstsq(X1,y,rcond=None)#有截距最小二乘法
print('beta={}\nSSR={}\nrank={}\nsv={}'.format(beta, SSR, rank, sv))
A=np.eye(3)
B=np.array([[1,2],[3,4]])
print('A=\n{}\n B=\n{}'.format(A,B))
Z = np.kron(A,B) #A和B矩阵的Kronecker积
print('Z=np.kron(A,B)=\n{}\nz.shape={}'.format(Z,Z.shape))
print('trace(Z)={}, rank(Z)={}'.format(np.trace(Z),
  np.linalg.matrix_rank(Z)))
import datetime as dt
yr, mo, dd = 2016, 8, 30
# NOTE: despite its name, ms is passed as the fourth argument of dt.time and
# the seventh of dt.datetime, which is the microsecond field.
hr, mm, ss, ms= 10, 32, 10, 11
print('dt.date(yr, mo, dd)=',dt.date(yr, mo, dd)) # year-month-day in standard form
print('dt.time(hr, mm, ss, ms)=',dt.time(hr, mm, ss, ms))# resolution down to microseconds
d1=dt.datetime(yr, mo, dd, hr, mm, ss, ms)# full date and time
print(d1)
d2 = dt.datetime(yr + 1, mo+2, dd+1, hr-1, mm, ss, ms)
# Subtracting two datetimes yields a timedelta.
print('time difference d2-d1=', d2-d1)
dates = np.array(['2016-09-01','2017-09-02'],dtype='datetime64')
print('dates=\n',dates,'\ntype of dates=',dates.dtype)
print('dates[0]=',dates[0],'dates[1]=',dates[1])
x=np.array([2+3j])
y=np.array([4-13j])
z=np.array(-20-4j)
print(x/z*y+x**2*z/y)
coef = [3.2, 12, 1, 4, -15, 28]
np.roots(coef)
p=np.poly1d([3,-4,6,2])
p1=np.poly1d([2,4])
print ('p=\n',p) #打印p
print ('p1=\n',p1) #打印p1
print ('p(1:9)=',p(np.arange(1,10,1)))#计算x取1,2,. . .,9时p的值
print ('p*p1=\n',p*p1) #打印p*p1
pi27=p.integ(m=2,k=7)
print ('p.integ(m=2,k=7)=\n',pi27)
pd1=p.deriv(m=1)
print ('p.deriv(m=1)=\n',pd1)
def mine(a, b, c):
    """Piecewise scalar function of a, b and c.

    Returns log(a-2b)*c when a > 2b, log(2b-a)*c**2 when a < 2b,
    and pi when neither comparison holds.
    """
    twice_b = 2*b
    if a > twice_b:
        return np.log(a-twice_b)*c
    if a < twice_b:
        return np.log(twice_b-a)*c**2
    return np.pi
mine(3,7,8)
mine([3,5,9,0],[7,-5,7,8],8)
vmine = np.vectorize(mine)
print (vmine([3,5,9,0],[7,-5,7,8],[8,8,8,8]))
print (vmine([3,5,9,0],[7,-5,7,8],8))

第5章

import pandas as pd
d0={'x':5,'y':989}
d1={'y':np.arange(3), 'x':([4.5,9],8),'z': (2,4,2)}
d2={'y': {'a':4,'b': 90}, 'x':([4.5],[9,8])}
z=pd.DataFrame([d0,d1,d2])
print(z)
print('Use "iloc" with indices:\n' ,z.iloc[2,0][1])
print('Use "loc" with indices and names:\n',z.loc[2,'y']['b'])
print('Use column names and indices:\n',z['z'][:1])
print('Use column names and indices:\n',z.y[1])
d3={'x':[-5,7,9,-2.5],'y':[1,-2,9.8,6.4]}
u=pd.DataFrame(d3)
print('u=\n',u)
d4=np.array([[-5,7,9,-2.5],[1,-2,9.8,6.4]]).T
v=pd.DataFrame(d4,columns=['x','y'])
print('v=\n',v)
import numpy as np
np.random.seed(1010)
name1=['X1','X2','X3','Y']
w=pd.DataFrame(np.random.randn(7,4),columns=name1)
w['sex']=['Femal']*3+['Male']*4
print(w)
print(w.head(2)) #前2行(默认值是5行)
print(w.tail(3)) #最后3行(默认值是5行)
print(w.describe()) 
print(w.columns)
print(w.index)
w.index=['A','B','C','D','E','F','G']
print(w[w.columns[2:]][:2]) #输出最后3个变量的头2行
print('size of w=',w.size)
print('shape of w=',w.shape)
df=pd.DataFrame({'price': [12,34,10],'tax': [0.12,0.4,0.5]})
df.columns
df.rename(columns={'price':'P','tax':'T'},inplace=True)
df.columns
np.random.seed(1010)
w={'X':np.random.randn(7),'Y':np.random.randn(7),
   'Year':np.arange(2014,2021,1)}
df=pd.DataFrame(w)
print(df)
# Replace the default 0..6 index by 10..16.
df.index=np.arange(10,17)
print(df)
# Use the 'Year' column as the index; the index is then named 'Year'.
df1=df.set_index('Year')
print(df1)
# Remove the index name.  Assigning None works on every pandas version,
# whereas `del df1.index.name` raises AttributeError on current releases.
df1.index.name=None
print(df1)
# Reorder the rows by reindexing with the reversed index.
new_index = df.index[::-1]
print(df.reindex(new_index))
# reset_index() moves the index back into a column; drop=True discards it.
print(df1.reset_index())
print(df1.reset_index(drop=True))
import numpy as np
np.random.seed(1010)
name1=['X1','X2','X3','Y'] 
w=pd.DataFrame(np.random.randn(7,4),columns=name1) 
w['sex']=['Femal']*3+['Male']*4
w.index=['A','B','C','D','E','F','G']
v=pd.DataFrame(np.random.randn(5,3),columns=['X1','X2','Y']) 
print(w,'\n',v)
w.to_csv('Test.csv',index=False) #index=False意味着文件不置行名字
w.to_csv('Test2.txt',index=True) #index=True在文件中增加了一列
w1=pd.read_csv('Test.csv') 
w2=pd.read_table('Test.csv',sep=',')
w3=pd.read_table('Test2.txt',sep=',')
print('w1:\n',w1,'\nw2==w1:\n',w2==w1,'\nw3:\n',w3)
df=w.to_csv(sep=';',index=True)
print(df)
# Write both frames into one workbook.  The context manager saves and closes
# the file on exit; ExcelWriter.save() is no longer available in pandas 2.x.
with pd.ExcelWriter('Test1.xlsx') as writer:
    w.to_excel(writer,sheet_name='Sheet1',index=True)
    # v goes to Sheet2 with its top-left corner at row 2, column 3 (0-based).
    v.to_excel(writer,sheet_name='Sheet2',startrow=2,startcol=3,index=False)
W=pd.read_excel('Test1.xlsx',sheet_name='Sheet1',index_col=0)
print(W)
# Skip the two blank rows and read only columns 3..5 where v was written.
V=pd.read_excel('Test1.xlsx',sheet_name='Sheet2',usecols=range(3,6),skiprows=2)
# Equivalent call using Excel column letters:
#V=pd.read_excel('Test1.xlsx',sheet_name='Sheet2',usecols='D:F',skiprows=2)
print(V)
w.to_pickle("test.pkl")
w_pkl=pd.read_pickle('test.pkl')
print(w_pkl.head(2))
# Round-trip w through JSON with three different layouts.
w.to_json('test_index.json',orient='index')
w_index_json=pd.read_json('test_index.json') # read back with the default orient
print(w_index_json)
w.to_json('test_records.json',orient='records')
w_records_json=pd.read_json('test_records.json')
print(w_records_json.head(2))
# The 'table' file has to be written before it can be read; this write was
# missing, so the read below failed with a missing-file error.
w.to_json('test_table.json',orient='table')
w_table_json=pd.read_json('test_table.json',orient='table')
print(w_table_json.tail(2))
w.to_hdf('data.h5', key='w', mode='w')
w_h5=pd.read_hdf('data.h5', key='w')
print(w_h5.tail(3))
w.to_parquet('w.parquet.gzip', compression='gzip')  
w_parq=pd.read_parquet('w.parquet.gzip') 
print(w_parq.head(3))
import feather
feather.write_dataframe(w, 'data.feather')
w_feather = feather.read_dataframe('data.feather')
print(w_feather.head(2))
w.to_stata('test.dta')  
w_dta=pd.read_stata('test.dta') 
print(w_dta.head(3))
print(w[['X1','Y']][:2]) #X1和Y的前2行
print(w[:2]) #所有变量的前两行
print(w[w.columns[3:]][-3:]) #第3个变量及后面变量的最后3行
print(w.sex[:4]) #sex变量的前4个元素
print(w.loc['A':'C','X3':'sex']) #index'A'到'C', 变量'X3'到'sex'
print(w.loc[['G','A','F'],['sex','Y','X1']]) # 随意选择的行名和变量名
print(w.iloc[[1,0,3],[0,4,2]])
print(w.iloc[[3,2,0],-3:])
print(w.iloc[:2,-3:])
import pandas as pd 
np.random.seed(8888) 
name1=['X1','X2','X3','Y']
u=pd.DataFrame(np.random.randn(7,4),columns=name1) 
print('u.head(2)=\n',u.head(2)) 
print('u.shape=',u.shape) 
v=pd.DataFrame(np.random.randn(5,3),columns=['X1','X2','Y']) 
print('v.head(2)=\n',v.head(2)) 
print('v.shape=',v.shape)
# 3x4 frame with explicit row labels; column labels are assigned afterwards.
x=pd.DataFrame(np.random.randn(3,4),index=['s','u','t'])
x.columns=['w','u','v','x']
print('x.head(2)=\n',x.head(2))
print('x.shape=',x.shape)  # label now names the frame actually printed
s=pd.DataFrame({'sex':['Male','Female','Male','Female','Male'],'X1': range(5)})
print('s.head(2)=\n',s.head(2)) 
print('s.shape=',s.shape) 
np.random.seed(1010)
name1=['X1','X2','X3','Y'] 
w=pd.DataFrame(np.random.randn(7,4),columns=name1) 
w['sex']=['Femal']*3+['Male']*4
np.random.seed(1010)
df=pd.DataFrame(np.random.randn(7,2),columns=('X1','X2'))
df['sex']=['Female']*4+['Male']*3
print(df,'\n',df.T) #或 df.transpose()
print(s+w)
print('w*v/u=\n',w*v/u,'\nw**u=\n',w**u)
print('v**2+v*5+2*np.exp(v)=\n',v**2+v*5+2*np.exp(v)) #简单运算
print('v-v.iloc[0]=\n',v-v.iloc[0]) #v的每一行减去第0行
print('x-x[index=t]=\n',x-x.loc['t']) #x的每一行减去标签为't'的行
print('x.T.dot(x)=\n',x.T.dot(x)) #用numpy的矩阵转置及矩阵乘法函数
print(x.sum(axis=0),"\n",x.sum(axis=1),"\n",x.mean(axis=0))
print(x.std(axis=0),"\n",x.prod(axis=0),"\n",x.count(axis=0),
           "\n",x.cumsum(axis=0))
# Row selection by condition and sorting.
np.random.seed(1010)
w=pd.DataFrame(np.random.randn(7,4),columns=['X1','X2','X3','Y'])
# The label must be spelled 'Female' so the filters below can match it.
w['sex']=['Female']*3+['Male']*4
w.index=['A','B','C','D','E','F','G']
# Rows with negative X1 or sex=='Female', showing four chosen columns.
print(w.loc[(w['X1']<0) | (w.sex=='Female'),['sex','X1','Y','X3']])
# Same selection written as a boolean mask followed by a column list.
w[(w['X1']<0) | (w.sex=='Female')][['sex','X1','Y','X3']]
print(w.sort_values(by='X1', ascending=False))
print(w.sort_values(by=['sex','Y'], ascending=[False,True]))
np.random.seed(1010)
Grade = {'score': np.random.choice(range(30,100),size=6)}
df = pd.DataFrame(Grade)
print(df.T)
df.loc[df.score<60,'result']='fail'
df.loc[df.score>=60,'result']='pass' 
print(df)
df.insert(loc=0,column='name', value=['Tom','John','Jane','Ted',"Bob",'Lee'])
print(df)
df.insert(3,'extra',0)
print(df)
df.insert(3,'extra',np.arange(6)[::-1],allow_duplicates=True)
print(df)
v=np.random.choice(np.arange(60,100),(12,3))
name=np.repeat(['Tom','Bob','June'],4).reshape(-1,1)
year=np.array([2014,2015,2016,2017]*3).reshape(-1,1)
dd=np.hstack((name,year,v))
u=pd.DataFrame(data=dd,columns=['name','year','Math','Pys','Lit'])
u3=u.set_index(['name','year'])

print('u=\n',u,'\nu3=\n',u3)
u3.drop(['Lit','Math'],axis=1) #等价于 u3.drop(columns=['Lit','Math'])
print(u.drop([0,4,3]))
u3.drop(index='2014',level=1) #这里level=1标明'2014'是第1列index
u3.drop(index='June',level=0) 
print(u.drop(index=[0,4,3],columns='Math'))
u3.rename_axis([None,None],axis=0).drop(index='June',columns='Math')
Df=pd.DataFrame({'Math':[67,83,98],'Pys': [98,25,37]},
  index=['Tom','Bob','June'])
Df
new_index=['Tom', 'June', 'John'] 
Df.reindex(new_index)
Df.reindex(index=new_index,columns=['Math','Hist'],fill_value=999)
Gd=np.array([[87,79,80],[98,65,72],[69,88,86]])
w=pd.DataFrame(data=Gd,index=['Tom','Bob','June'],
  columns=['Math','Phy','Lit'])
print(w)
w1=w.stack() #等同于w.stack(0)
print(w1)
w2=pd.DataFrame(w1)
w2.reset_index(inplace=True)
w2.columns=('name','class','grade')
w2
w1.unstack() #等同于w1.unstack(level=-1)
w1.unstack(0) 
v=np.random.choice(np.arange(60,100),(12,3))
name=np.repeat(['Tom','Bob','June'],4).reshape(-1,1)
year=np.array([2014,2015,2016,2017]*3).reshape(-1,1)
dd=np.hstack((name,year,v))
u=pd.DataFrame(data=dd,columns=['name','year','Math','Pys','Lit'])
print(u)
u.pivot(index ='year',columns ='name',values =['Math','Pys','Lit'])
u1=pd.DataFrame(u.set_index(['name','year']).stack())
u1.reset_index(inplace=True)
u1.columns=['name','year','class','grade']
print(u1)
Tom=u1[u1['name']=='Tom'].pivot(index='year', columns='class',
    values='grade')
print(Tom)
Tom.rename_axis(None,axis=1).rename_axis(None,axis=0)
Tom.rename_axis(None,axis=1).reset_index('year')
y2014=u1[u1['year']=='2014'].pivot(index='name',columns='class',values='grade')
y2014.reset_index(level="name").rename_axis(None,axis=1)
Math=u1[u1['class']=='Math'].pivot(index='year',columns='name',values='grade')
Math.reset_index(level='year').rename_axis(None,axis=1)
df1=pd.DataFrame({'X1': [1, 3., 2],'X2': [-2., -1, 9]},index=[0, 1, 2])
df2=pd.DataFrame({'X1': [1/2, 3.5, 12, 43],'X2': [6., -5, 4, 7]},index=[0, 1, 2, 3])
print(df1,'\n',df2)
print(pd.concat((df1,df2))) #相当于 pd.concat((df1,df2),axis=0)
print(pd.concat((df1,df2),ignore_index=True))
df3=pd.DataFrame({'X3': ['Male', 'Female', 'Female', 'Male'],
                  'X4': ['H','P','G', 'H']},index=[5, 6, 7, 8])
print(df3)
print(pd.concat((df1,df2,df3),axis=1))
print(pd.concat((df1,df2,df3),ignore_index=True,axis=1))
s1=pd.Series([1, 2, 3], name='H')
s2=pd.Series([6, 5, 4], name='A')
s3=pd.Series([8, 9, 7], name='C')
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
# Three small grade tables used by the merge/join examples that follow.
df1=pd.DataFrame({'id': [1,3,5,2],'m_grade': [98,60,81,70]})
df2=pd.DataFrame({'id':[1,2,5,6],'s_grade':[50,90,78,60],'m_grade':[99,75,60,78]})
df3=pd.DataFrame({'xid': [6,1,2,4],'c_grade': [20,65,83,98]})
# Only df1..df3 exist; printing a fourth frame raised NameError.
print(df1,'\n',df2,'\n',df3)
pd.merge(df1,df2[['id','s_grade']])#默认 how='inner'
pd.merge(df1,df2[['id','s_grade']],how='outer')#默认 how='inner'
pd.merge(df1,df2,left_on='id', right_on='id',suffixes=('_first', '_second'))
pd.merge(left=df2,right=df3,left_on='id',right_on='xid',how='outer')
pd.merge(left=df2,right=df3,left_on='id',right_on='xid',how='left')
df2.join(df3,how='inner')
df1.join(df2,lsuffix='_l', rsuffix='_r',how='inner')
df1.join(df2,on='id', lsuffix='_l', rsuffix='_r',how='inner')
df1.join(df2,on='id', lsuffix='_l', rsuffix='_r',how='outer')
# Appending rows. DataFrame.append was removed in pandas 2.0, so every case
# is written with pd.concat, which also works on older versions.
df1=pd.DataFrame({'id': [1,3,5,2],'m_grade': [98,60,81,70]})
df2=pd.DataFrame({'id':[1,2,5,6],'s_grade':[50,90,78,60],'m_grade':[99,75,60,78]})
pd.concat([df1,df2],sort=True,ignore_index=False)
pd.concat([df1,df2],sort=False,ignore_index=True)
# Rows given as a list of dicts are turned into a frame first.
d = [{'id':6,'m_grade':100},{'id':8,'m_grade':50}]
print(pd.concat([df1,pd.DataFrame(d)],ignore_index=True))
# A single row given as a Series becomes a one-row frame.
s = pd.Series([6, 100], index=['id', 'm_grade'])
pd.concat([df1,pd.DataFrame([s])],ignore_index=True)
# Series creation, selection and arithmetic.
np.random.seed(1010)
s=pd.Series(np.random.randn(4),index=['a','b','c','d'])
print('s=\n',s)
d=pd.Series({'a':2.7,'b':-3.6})
print('d=\n',d)
print('s[:3]=\n',s[:3])
# Positional selection on a string-labelled Series goes through .iloc;
# plain s[[0,3]] is deprecated because integers are treated as labels.
print('s.iloc[[0,3]]=\n',s.iloc[[0,3]])
print("s[s.index>'b']=\n",s[s.index>'b'])
print("s[(s>-1.2) & (s<1.5)]=\n",s[(s>-1.2) & (s<1.5)])
print('s*2+np.exp(s)-abs(s**3)=\n',s*2+np.exp(s)-abs(s**3))
# Addition aligns on labels, so labels present on one side only give NaN.
print('s[:2]+s[1:]=\n',s[:2]+s[1:])
import pandas as pd
import numpy as np
import matplotlib
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline 
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
np.random.seed(1010)
dates = pd.date_range('1989-01', periods=100, freq='M')
s1=pd.Series(np.random.randn(100).cumsum(), index=dates)
fig=plt.figure(figsize=(15,4))
plt.plot(s1)
import pandas as pd
diamonds=pd.read_csv("diamonds.csv")
print(diamonds.head()) # first few rows
print('diamonds.columns=\n',diamonds.columns) # variable names
print('sample shape=', diamonds.shape) # (number of rows, number of columns)
print(diamonds.iloc[:,:7].describe()) # summary of the numeric variables among the first 7 columns
cut=diamonds.groupby("cut") # group the rows by the levels of cut
# numeric_only=True skips the string columns, which otherwise make median() raise in pandas 2.x
print('cut.median()=\n',cut.median(numeric_only=True)) # per-level medians
print('Cross table=\n',pd.crosstab(diamonds.cut,diamonds.color))
np.random.seed(1010)
n=1000
x=pd.Series(np.random.randn(n),
index=pd.date_range('1/1/2014',periods=n,freq='D'))
x=x.cumsum() 
x5=pd.DataFrame(np.random.randn(n,5),index=x.index,
columns=['One','Two','Three','Four','Five'])
x5=x5.cumsum()
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(12,4))
x.plot(ax=axes[0])
x5.plot(ax=axes[1])
xw=pd.get_dummies(adult['workclass']).sum(axis=0) #转换成哑元再求和
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(12,3)) #两个图的排列
xw.plot(kind='barh',ax=axes[0]) #条形图
xw.plot(kind='pie',ax=axes[1]) #饼图
w=adult
M=w['hours_per_week'][w['sex']=='Male']
F=w['hours_per_week'][w['sex']=='Female']
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(12,4))
w[['hours_per_week']].plot(kind='hist',orientation='horizontal',
alpha=0.5,ax=axes[0])
M.plot(kind='hist',alpha=0.5,ax=axes[1],label='Male')
F.plot(kind='hist',alpha=0.5,ax=axes[1],label='Female')
plt.legend()
np.random.seed(1010)
fig,axes=plt.subplots(nrows=1,ncols=3,figsize=(12,4))
x3=pd.DataFrame(np.random.rand(10,3),columns=['ABC','NBC','CBS'])
x3.plot(kind='bar',ax=axes[0])
x3.plot(kind='bar',stacked=True,ax=axes[1])
x3.plot(kind='barh',stacked=True,ax=axes[2])#水平叠放条形图
diamonds=pd.read_csv("diamonds.csv")
diamonds.boxplot(column='carat',by='cut',figsize=(12,4))
diamonds.boxplot(column=['price'],by=['color','cut'],figsize=(12,5))
x=np.sin(np.arange(0,5,.2))+1
y=np.cos(np.arange(0,5,.2))+1
w=np.stack((x,y),axis=1)
w=pd.DataFrame(w,columns=['sin','cos'])
fig,axes=plt.subplots(nrows=1,ncols=2,figsize=(12,3.5))
w.plot(kind='area',ax=axes[0])
w.plot(kind='area',stacked=False,ax=axes[1])
X=pd.DataFrame(np.random.rand(60, 4), columns=['One', 'Two', 'Three','Four'])
X.plot(kind='scatter',x='One',y='Two',c='Four', s=X.Three*500,figsize=(12,4))

第6章

import matplotlib
%matplotlib inline 
#如果输入上面一行, 则会产生在输出结果之间的插图(不是独立的图)
import matplotlib.pyplot as plt 
# Plot y=sin(50x)/x with a zero reference line and save it as a PDF.
# Labels containing LaTeX are raw strings so the backslashes reach mathtext unchanged.
x=np.arange(0.1,4,.01)
plt.figure(figsize=(20,7))
plt.plot(x,np.sin(x*50)/x,linewidth=3) # solid curve, default style 'b-'
plt.plot(x,np.zeros(len(x)),'g--',linewidth=2) # dashed green horizontal line
plt.title(r'Curve $y=\sin(50x)/x$',fontsize=40,y=1.04)
plt.xlabel('$x$',fontsize=30)
plt.ylabel(r'$y=\sin(50x)/x$',fontsize=30)
plt.grid(True)
plt.savefig('mplsin.pdf') # write the figure to a file
x=np.arange(0.,2.,.05)
plt.figure(figsize=(20,7))
plt.plot(x,np.cos(x),'r:',x,np.cos(x**2),'b^',
x,np.cos(x**3),'g-.',x,np.cos(x**4),'mo',
linewidth=15,markersize=30)
plt.ylim((-1.5,1.5))
plt.title('4 curves in one figure',fontsize=30)
x=np.arange(0.,2.,.05)
plt.figure(figsize=(20,7))
plt.plot(x,np.cos(x),'r:',linewidth=15,markersize=30)
plt.plot(x,np.cos(x**2),'b^',markersize=30)
plt.plot(x,np.cos(x**3),'g-.',linewidth=15)
plt.plot(x,np.cos(x**4),'mo',markersize=30)
plt.ylim((-1.5,1.5)) #确定图形的纵向空间范围
plt.title('4 curves in one figure',fontsize=30)
import scipy.stats as stats
plt.figure(figsize=(27,9))
plt.subplot(2, 3, 1) # first panel of the 2x3 grid
y = 50*np.exp(.0004 + np.cumsum(.01*np.random.randn(100)))
plt.plot(y) # default style: solid blue line
# Raw strings are required here: in a normal string '\tau' starts with a TAB.
plt.xlabel(r'time ($\tau$)') # x-axis label
plt.ylabel('Price',fontsize=16) # y-axis label
plt.title(r'Random walk: $d\ln p_t = \mu dt + \sigma dW_t$',fontsize=16)

y = np.random.rand(5)
x = np.arange(5)
plt.subplot(2, 3, 2) #2x3图形阵的第2个
colors = ['#FF0000','#FFFF00','#00FF00','#00FFFF','#0000FF'] #颜色代码
plt.barh(x, y, height = 0.5, color = colors, \
edgecolor = '#000000', linewidth = 5) #水平条形图(barh)
plt.title('Bar plot')

y = np.random.rand(5)
y = y / sum(y)
y[y < .05] = .05
plt.subplot(2, 3, 3)
plt.pie(y) #饼图
plt.title('Pie plot')

z = np.random.randn(100, 2)
z[:, 1] = 0.5 * z[:, 0] + np.sqrt(0.5) * z[:, 1]
x = z[:, 0]
y = z[:, 1]
plt.subplot(2, 3, 4)
plt.scatter(x, y)
plt.title('Scatter plot')

plt.subplot(2, 3, 5)
x = np.random.randn(100)
plt.hist(x, bins=30, label='Empirical') #画直方图
xlim = plt.xlim()
ylim = plt.ylim()
pdfx = np.linspace(xlim[0], xlim[1], 200)
pdfy = stats.norm.pdf(pdfx) #scipy模块中的标准正态分布密度函数
pdfy = pdfy / pdfy.max() * ylim[1]
plt.plot(pdfx, pdfy,'r-',label='PDF')
plt.ylim((ylim[0], 1.2 * ylim[1]))
plt.legend()
plt.title('Histogram')

plt.subplot(2, 3, 6)
x = np.cumsum(np.random.randn(100,4), axis = 0)
plt.plot(x[:,0],'b-',label = 'Series 1')
plt.plot(x[:,1],'g-.',label = 'Series 2')
plt.plot(x[:,2],'r:',label = 'Series 3')
plt.plot(x[:,3],'h--',label = 'Series 4')
plt.legend()
plt.title('Random lines')
fig=plt.figure(figsize=(10,3))
f1=fig.add_subplot(1,2,1)
x=np.linspace(0.1,1)
f1.plot(x,stats.chi2.pdf(x,1),'-', label='$\chi^2(1)$')
f1.plot(x,stats.chi2.pdf(x,2),'-.', label='$\chi^2(2)$')
f1.plot(x,stats.chi2.pdf(x,3),'--', label='$\chi^2(3)$')
f1.plot(x,stats.chi2.pdf(x,4),':', label='$\chi^2(4)$')
f1.set_title('$\chi^2$ density functions')
f1.legend()
f2=fig.add_subplot(1,2,2)
x=np.linspace(0.01,.5,50)
f2.plot(x,stats.f.pdf(x,1,2),'-', label='$F(1,2)$')
f2.plot(x,stats.f.pdf(x,2,1),'-', label='$F(2,1)$')
f2.plot(x,stats.f.pdf(x,2,2),'-.', label='$F(2,2)$')
f2.plot(x,stats.f.pdf(x,1,1),'--', label='$F(1,1)$')
f2.plot(x,stats.f.pdf(x,1,3),':', label='$F(1,3)$')
f2.set_title('$F$ density functions')
f2.legend()
# Triangulated 3D surface of z=sin(sqrt(x^2+y^2)) over a square grid.
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
X=np.arange(-5,5,0.25)
Y=np.arange(-5,5,0.25)
X,Y=np.meshgrid(X,Y) # X has identical rows; Y is the transpose of X
Z=np.sin(np.sqrt(X**2+Y**2))
# Flatten each grid into a vector, as plot_trisurf expects 1-D coordinates.
x=X.ravel()
y=Y.ravel()
z=Z.ravel()
fig=plt.figure()
# fig.gca(projection='3d') was removed in matplotlib 3.6; create the 3D axes explicitly.
ax=fig.add_subplot(projection='3d')
ax.plot_trisurf(x,y,z,cmap=cm.jet,linewidth=0.3)
z=np.linspace(-1,1,1000)
x=z*np.sin(100*z)
y=z*np.cos(100*z)
plt.figure(figsize=(30,10))
plt.axes(projection='3d')
plt.plot(x,y,z,'-b')

第7章

import scipy.stats as stats
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd 
from scipy import io as sio
np.random.seed(789)
data = np.random.randn(5, 4)
sio.savemat("randn.mat", {'normal': data}) 
data = sio.loadmat('randn.mat', struct_as_record=True)
data['normal']
plt.figure(figsize=(18,4))
plt.subplots_adjust(top=1.5) #调节每个图四周的空间
x=np.arange(-4,4,.01)
plt.subplot(2,2,1)
plt.plot(x,stats.norm.cdf(x))
plt.title('cdf of $N(0,1): \Phi(x)$')
plt.subplot(2,2,2)
plt.plot(x,stats.norm.pdf(x))
plt.title('pdf of $N(0,1): \phi(x)$')
plt.subplot(2,2,3)
plt.plot(x,stats.norm.sf(x))
plt.title('sf of $N(0,1): 1-\Phi(x)$')
x=np.arange(.01,.99,.01)
plt.subplot(2,2,4)
plt.plot(x,stats.norm.ppf(x))
plt.title('ppf of $N(0,1): \Phi^{-1}(x)$')
np.random.seed(999)
stats.norm.rvs(size=10,loc=5,scale=2)
stats.norm.rvs(size=10,random_state=999,loc=5,scale=2)
fr=stats.norm(loc=5,scale=2) #把N(5,2)冻结到对象fr下面可得到各种有关结果
print('rvs(size=3,random=999): %s \nmean: %s \nstd: %s\
\ncdf(5.97): %s\npdf([-0.5,2.96]): %s \nkwds %s' \
      %(fr.rvs(size=3,random_state=999),fr.mean(),fr.std(),\
        fr.cdf(5.97),fr.pdf([-0.5,2.96]),fr.kwds))
stats.norm.isf([0.1,0.05,0.025,0.01,0.001]) 
#等价代码: -stats.norm.ppf([0.1,0.05,0.025,0.01,0.001])     
stats.t.isf([0.1,0.05,0.025,0.01,0.001],[[2],[5],[500]])
#等价代码: -stats.t.ppf([0.1,0.05,0.025,0.01,0.001],[[2],[5],[500]])
from scipy.stats import rv_continuous
class exponential_gen(rv_continuous):
    '''Exponential distribution with rate L, defined through its pdf and cdf.

    The formulas below are only valid for x >= 0, so instances must be
    created with the lower support bound a=0.
    '''
    def _pdf(self,x,L):
        # density L*exp(-L*x)
        return L*np.exp(-x*L)
    def _cdf(self,x,L):
        # distribution function 1-exp(-L*x)
        return 1-np.exp(-x*L)
# a=0 restricts the support to [0, inf). Without it the support defaults to
# the whole real line, giving negative cdf values for x<0 and divergent moments.
Exp=exponential_gen(a=0,name='exponential')
print('Exp.cdf:\n',Exp.cdf(np.arange(1,4,.3),.5))
print('Exp.pdf:\n',Exp.pdf(np.arange(1,4,.3),.6))
print('Exp.ppf:\n',Exp.ppf([0.1,0.05,0.01],.6))
print('Exp.rvs:\n',Exp.rvs(.6,size=7))
print('Exp.mean(.6):\n',Exp.mean(.6), Exp.var(.7),Exp.std(.7))  
from scipy.stats import rv_continuous
class gaussian_gen(rv_continuous):
    '''Gaussian distribution'''
    def _pdf(self,x,m,s):
        return np.exp(-(x-m)**2/2./s**2)/np.sqrt(2.0*s**2*np.pi)
Gaussian=gaussian_gen(name='gaussian')
print('Gaussian.cdf:\n',Gaussian.cdf(np.arange(-4,4,1),.01,3))
print('Gaussian.pdf:\n',Gaussian.pdf(np.arange(-4,4,1),0.01,2))
print('Gaussian.rvs:\n',Gaussian.rvs(0.001,2,size=3))
print('Gaussian.mean:\n',Gaussian.mean(0.001,2),Gaussian.var(0.1,2))
print('Gaussian.ppf:\n',Gaussian.ppf([.1,.2,.5,.9],2,4))
print(Gaussian.cdf([2,0.1],0,2))
from scipy.stats import rv_discrete
from scipy.special import gammaln
import math
class pois_gen(rv_discrete):
    '''Poisson distribution with mean m, defined through its pmf.'''
    def _pmf(self,k,m):
        # scipy passes k as an array, which math.factorial cannot handle.
        # exp(-m)*m**k/k! is therefore evaluated on the log scale, with
        # gammaln(k+1)=log(k!).
        return np.exp(k*np.log(m)-m-gammaln(k+1))
x=np.arange(1,6,1)
p=np.array([.1,.2,.3,.3,.1])
mydf=rv_discrete(name='mydf',values=(x,p))
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plt.xlim((0,6))
plt.ylim((0,.4))
plt.plot(x,mydf.pmf(x),'bo',ms=12,mec='r')
plt.title('PMF')
#上面ms为markersize简写, mec为markeredgecolor简写
plt.vlines(x,0,mydf.pmf(x),colors='k',lw=5)
plt.subplot(1,2,2)
plt.xlim((0,5))
plt.ylim((0,1.5))
plt.step(x,mydf.cdf(x),'b--',lw=4)
plt.title('CDF')    
from scipy import integrate
f = lambda x: 6*x**3-2*x**2+x-1 #定义被积函数f
integrate.quad(f, 0, 4)   #做f的从0到4的积分
def g(t, n, x):
    return np.exp(-x*t) / t**n
def gint(n, x):
        return integrate.quad(g, 1, np.inf, args=(n, x))[0]
vec_gint = np.vectorize(gint)   #向量化    
vec_gint(5, [4.3,3.1,0.2,0.21])
integrate.quad(lambda x: gint(4, x), 0, np.inf)
integrate.dblquad(lambda x, y: x*y, 0, 1/3.,lambda x: 0,lambda x: 1-3.*x)
def f(x, y):return x*y
def by():return [0, 1/3.]
def bx(y):return [0, 1-3.*y]
integrate.nquad(f, [bx, by])
def f(t, x):return np.exp(-x*t) / t**4
integrate.nquad(f, [[1, np.inf],[0, np.inf]])

第三部分

第8章

(np.arange(2.5),np.arange(3,8),np.arange(2,-4.6,-.5))
def Arange(x,y=None,step=None):
    """Hand-written counterpart of range/np.arange that returns a list.

    Arange(x)         -> 0, 1, 2, ... while the value is below x
    Arange(x, y)      -> from x towards y (y excluded) in steps of +1 or -1
    Arange(x, y, step)-> from x towards y (y excluded) in steps of `step`

    If the sign of `step` does not lead from x to y, 'Error1' is printed and
    an empty list is returned. If `step` is given without y, 'Error2' is
    printed and None is returned.
    """
    if y is None:
        if step is not None:
            print('Error2')
            return None
        start,stop,step=0,x,1
    elif step is None:
        # the direction follows from the order of the two end points
        start,stop=x,y
        step=1 if y>x else -1
    else:
        start,stop=x,y
        if not ((y>x and step>0) or (y<x and step<0)):
            print('Error1')
            return []
    z=[]
    i=start
    if step>0:
        while i<stop:
            z.append(i)
            i+=step
    else:
        while i>stop:
            z.append(i)
            i+=step
    return z
    
(Arange(2.5),Arange(3,8),Arange(2,-4.6,-.5),Arange(2,-5))
def Length(x):
    """Number of elements in x; an object without __iter__ counts as 1."""
    if hasattr(x, '__iter__'):
        # count by walking the iterable, so no __len__ is required
        return sum(1 for _ in x)
    return 1

x=[-6.3830e-01, -9.0400e-02, 2.2958e+00, -6.2140e-01, 8.5560e-01,
-9.6000e-03, 5.4290e-01, 3.4290e-01, 1.5519e+00, -8.4850e-01,
 1.0000e-04, 1.9846e+00, 1.2267e+00, 1.6071e+00, 2.4000e-03,
 6.4780e-01, 2.4260e-01, -1.5200e+00, 2.4870e-01, -7.4300e-01,
-5.8180e-01, -1.6385e+00, -4.3300e-02, 1.5950e+00, -7.1700e-01]
y='I am a student'
z={1:[2,4,6],'b':'list'}
(Length(x),Length(-2.3),Length('a'),Length(y),Length(z))
def Dim(x):
    """Return (rows, columns) of x.

    Rows is Length(x); columns is the number of elements of x[0], or 1 when
    x[0] is not iterable.
    """
    rows=Length(x)
    head=x[0]
    cols=sum(1 for _ in head) if hasattr(head, '__iter__') else 1
    return (rows,cols)
# 测试
import numpy as np
x=np.random.randn(137,560)    
Dim(x),Dim([[3,2],[3,4],[3,3]]),Dim([2,-1,5]),Dim(np.random.randn(12))
def DimList(x):
    """Return (number of items in x, list with the size of each item).

    An item without __iter__ has size 1; any other item has the number of
    elements obtained by iterating over it.
    """
    sizes=[]
    for item in x:
        if hasattr(item, '__iter__'):
            sizes.append(sum(1 for _ in item))
        else:
            sizes.append(1)
    return (len(sizes),sizes)

# 测试:
x=[{2:1,'s':3},'a',['I am', 'a', 'student'],'probability',(1,'a')]
DimList(x)
def SumV(x):
    """Sum of the elements of a vector, accumulated left to right from 0."""
    total=0
    for value in x:
        total=total+value
    return total

def SumAll(x):
    """Sum of all elements of a two-level list or a matrix.

    An item whose Length is greater than 1 is summed element by element;
    any other item is added as it is.
    """
    total=0
    for row in x:
        if Length(row)<=1:
            total+=row
            continue
        for entry in row:
            total+=entry
    return total

def Mean(x):
    """Mean of a vector: SumV(x) divided by Length(x)."""
    count=Length(x)
    total=SumV(x)
    return total/count

def Prod(x):
    """Product of the elements of a vector, accumulated left to right from 1."""
    result=1
    for factor in x:
        result=result*factor
    return result

def Var(x):
    """Sample variance: squared deviations from Mean(x), summed, divided by n-1."""
    deviations=np.array(x)-Mean(x)
    return SumV(deviations**2)/(Length(x)-1)

def Sd(x):
    """Sample standard deviation: the square root of Var(x)."""
    variance=Var(x)
    return variance**(.5)

def Max(x):
    """Largest element of x; x must be non-empty and support x[0]."""
    best=x[0]
    for item in x:
        best=item if item>best else best
    return best

def Min(x):
    """Smallest element of x; x must be non-empty and support x[0].

    Mirrors Max, with the comparison reversed.
    """
    m=x[0]
    for i in x:
        if i<m:
            m=i
    return m
def FApply(M,axis=0,fun=Mean):
    R=[]
    X=np.array(M)
    r,c=Dim(X)
    if axis==0:
        j=0
        while j
# Try FApply with each summary function, once with axis=0 (labelled "column")
# and once with axis=1 (labelled "row").
x=[[-1.,4.,2.,7.],[12,6,-1,.9],[5,16,-11,5.9]]
print('\ncolumn sum:',FApply(x,0,SumV),'\nrow sum:',FApply(x,1,SumV))
print('\ncolumn max:',FApply(x,0,Max),'\nrow max:',FApply(x,1,Max))
print('\ncolumn min:',FApply(x,0,Min),'\nrow min:',FApply(x,1,Min))
print('\ncolumn mean:',FApply(x,0,Mean),'\nrow mean:',FApply(x,1,Mean))
print('\ncolumn var:',FApply(x,0,Var),'\nrow var:',FApply(x,1,Var))
# The standard-deviation line must apply Sd; it previously repeated SumV.
print('\ncolumn sd:',FApply(x,0,Sd),'\nrow sd:',FApply(x,1,Sd))
print('\ncolumn scale:\n',Trans(np.array(FApply(x,0,Scale))),'\nrow scale:\n',
      np.array(FApply(x,1,Scale)))
print('\ncolumn cummean:\n',Trans(np.array(FApply(x,0,CumMean))),'\nrow cummean:\n',
      np.array(FApply(x,1,CumMean)))
print('\ncolumn cumsum:\n',Trans(np.array(FApply(x,0,CumSum))),'\nrow cumsum:\n',
      np.array(FApply(x,1,CumSum)))
def AllConst(c,n,m=None):
    if m==None:
        z=[]
        i=0
        while i
def DM(x):
    n,m=Dim(x)
    if n!=m:
        print('Not square matrix')
    if n>m: 
        w=m
    else:
        w=n
    z=[]
    i=0
    while i
def Diag(n):
    i=0
    Z=[]
    while i
def MD(x):
    n=Length(x)
    i=0
    Z=[]
    while i
# Sort names into three age groups: under 30, 30 to 69, and 70 or older.
a={'Ray':2,'Tom': 64,'Babara':99,'Ted':47,'John':53,'Jane':30,'Titi':21,
'Baby': 10,'Lucy': 5}
old,middle,young=[],[],[]
for k in a:
    (young if a[k]<30 else middle if a[k]<70 else old).append(k)
print('old:',old,'middle:',middle,'young:',young)
def Straighten(y):
    z=[]; k=0
    if Dim(y)[1]==1: return y
    while (k
def Reshape(y,r=3,c=8):
    if Prod(Dim(y))!=r*c: 
        return print('Wrong dimension!')
    if Dim(y)[1]!=1:
        y1=Straighten(y)
    else:
        y1=y
    m=[]
    k=1
    while (k<=r):
        m.append(y1[(k-1)*c:k*c])
        k+=1
    return np.array(m)

# 测试
Reshape(Y),Reshape(y) #使用前面生成的Y和y
def Stack(x,y,axis=0):
    """Join two 2-D numpy arrays: axis=1 like hstack, axis=0 like vstack.

    Returns the string 'Dimension wrong' when the shapes do not fit, and
    None for any other axis value.
    """
    rows_x,cols_x=Dim(x)
    rows_y,cols_y=Dim(y)
    if axis==1:
        if rows_x!=rows_y:
            return('Dimension wrong')
        # each output row is row i of x followed by row i of y
        return np.array([x[i,:].tolist()+y[i,:].tolist()
                         for i in range(rows_x)])
    if axis==0:
        if cols_x!=cols_y:
            return('Dimension wrong')
        # the rows of y follow the rows of x
        return np.array(x.tolist()+y.tolist())
    return None

# 测试:
np.random.seed(1010)
x=np.random.rand(3,2)
y=np.random.rand(3,3)
z=np.random.rand(2,5)
w=np.random.rand(3,5)
xy=Stack(x,y,axis=1)
zw=Stack(z,w,axis=0)
(xy,zw)
def Stack2(x,y,axis=0):
    """Join two matrices by filling a pre-allocated result from AllConst.

    axis=1 behaves like hstack and axis=0 like vstack. Returns the string
    'Dimension wrong' when the shapes do not fit, and None for any other
    axis value.
    """
    rows_x,cols_x=Dim(x)
    rows_y,cols_y=Dim(y)
    if axis==1:
        if rows_x!=rows_y:
            return('Dimension wrong')
        out=AllConst(0.,rows_x,cols_x+cols_y)
        out[:,:cols_x]=x   # left block
        out[:,cols_x:]=y   # right block
        return out
    if axis==0:
        if cols_x!=cols_y:
            return('Dimension wrong')
        out=AllConst(0.,rows_x+rows_y,cols_x)
        out[:rows_x,:]=x   # upper block
        out[rows_x:,:]=y   # lower block
        return out
    return None

#产生数据测试函数:
np.random.seed(1010)
x=np.random.randn(2,3)
y=np.random.randn(2,2)
z=np.random.randn(3,3)
xy=Stack2(x,y,axis=1)
xz=Stack2(x,z,axis=0)
(xy,xz)
def Trans(x):
    r,c=Dim(x)
    i=0
    R=[]
    while i
def Outer(x,y,math=["+","-","*","/","%","**",">","<"]):
    """Outer operation: R[i,j] = x[i] <op> y[j] for a binary operator.

    `math` is either an operator symbol such as "*" or "**", or a list of
    symbols, in which case the first entry is used (the default is "+").
    The result matrix is allocated with AllConst(0., len(x), len(y)).

    Raises ValueError for an operator symbol that is not supported.
    """
    import operator
    table={"+":operator.add,"-":operator.sub,"*":operator.mul,
           "/":operator.truediv,"//":operator.floordiv,"%":operator.mod,
           "**":operator.pow,">":operator.gt,"<":operator.lt,
           ">=":operator.ge,"<=":operator.le,"==":operator.eq,
           "!=":operator.ne}
    # A string is taken whole: indexing it with [0] would turn "**" into "*".
    op=math if isinstance(math,str) else math[0]
    if op not in table:
        raise ValueError('Unsupported operator: '+repr(op))
    fun=table[op]
    nx=Length(x)
    ny=Length(y)
    R=AllConst(0.,nx,ny)
    # The operator is applied to the values themselves. Gluing str(x[i]),
    # op and str(y[j]) together and evaluating the text parses a negative
    # left operand wrongly ("-0.5**2" means -(0.5**2)).
    for i in range(nx):
        for j in range(ny):
            R[i,j]=fun(x[i],y[j])
    return R

# 测试
np.random.seed(8888)
x=np.random.randn(3)
y=np.random.randn(5)
Outer(x,y,"*"),Outer(x,y,"%"),Outer(x,y,"+"),Outer(x,y,"/"),Outer(x,y,"<")
def Sweep(M,V,axis=0,math=["+","-","*","/","%","**",">","<"]):
    """Combine every element of matrix M with the matching element of V.

    axis=0: R[i,j] = M[i,j] <op> V[j]  (V needs one entry per column)
    other : R[i,j] = M[i,j] <op> V[i]  (V needs one entry per row)

    `math` is either an operator symbol such as "/" or "**", or a list of
    symbols, in which case the first entry is used (the default is "+").
    Returns the string 'Wrong dimension!' when the length of V does not fit.

    Raises ValueError for an operator symbol that is not supported.
    """
    import operator
    table={"+":operator.add,"-":operator.sub,"*":operator.mul,
           "/":operator.truediv,"//":operator.floordiv,"%":operator.mod,
           "**":operator.pow,">":operator.gt,"<":operator.lt,
           ">=":operator.ge,"<=":operator.le,"==":operator.eq,
           "!=":operator.ne}
    # A string is taken whole: indexing it with [0] would turn "**" into "*".
    op=math if isinstance(math,str) else math[0]
    if op not in table:
        raise ValueError('Unsupported operator: '+repr(op))
    fun=table[op]
    r,c=Dim(M);n=Length(V)
    if axis==0:
        if c!=n: return ('Wrong dimension!')
    else:
        if r!=n: return ('Wrong dimension!')
    R=AllConst(0.,r,c)
    # The operator is applied to the values themselves rather than to their
    # text, which mis-parsed negative operands such as "-0.5**2".
    for i in range(r):
        for j in range(c):
            R[i,j]=fun(M[i,j],V[j] if axis==0 else V[i])
    return R

# 测试
np.random.seed(1010)
M=np.random.randn(3,4); V=np.random.randn(4)
W=np.random.choice(np.arange(10),3)
Sweep(M,V,0,"%"),Sweep(M,W,1,"**"),Sweep(M,V,0,"/"),Sweep(M,W,1,"-")
def MProd(x,y):
    """Matrix product of x and y computed with three nested loops.

    Returns the string 'Wrong dimension' when the number of columns of x
    differs from the number of rows of y.
    """
    rows_x,cols_x=Dim(x)
    rows_y,cols_y=Dim(y)
    if cols_x!=rows_y:
        return ('Wrong dimension')
    out=AllConst(0.,rows_x,cols_y)
    row_ids=Arange(rows_x)
    col_ids=Arange(cols_y)
    inner_ids=Arange(cols_x)
    for i in row_ids:
        for j in col_ids:
            # dot product of row i of x with column j of y
            acc=out[i,j]
            for k in inner_ids:
                acc=acc+x[i,k]*y[k,j]
            out[i,j]=acc
    return out

# 测试
np.random.seed(1010)
x=np.random.rand(3,20)
y=np.random.rand(4,20)
MProd(x,Trans(y))
def Inv(w):
    # Invert the square matrix w by Gauss-Jordan elimination on the augmented
    # matrix [w | I] and return the right-hand block.
    # NOTE(review): there is no pivoting, so a zero diagonal entry W[i,i]
    # divides by zero.
    n=Dim(w) 
    I=Diag(n[0])  # presumably the n[0] x n[0] identity matrix - confirm against Diag
    W=Stack(w.astype('float'),I,1)  # float copy of w with I appended on the right
    for i in Arange(n[1]):
        W[i,:]=W[i,:]/W[i,i]  # scale row i so that its diagonal entry becomes 1
        for j in Arange(n[0]):
            if j!=i:
                # remove the column-i entry from every other row
                W[j,:]=W[j,:]-W[i,:]*W[j,i]
            else:
                continue
    return W[:,n[0]:] 
 
# 测试:
np.random.seed(1010)
w=np.random.rand(5,5)
np.round(MProd(Inv(w),w),12)
def SimpleSort(z, increasing=True):
    """Bubble sort on a copy of z; the input itself is left unchanged.

    Sorts ascending when `increasing` is true, otherwise descending.
    """
    out=z.copy()
    size=Length(out)
    for done in Arange(size-1):
        # each pass moves the next extreme value to the end of the unsorted part
        for pos in Arange(0, size-done-1):
            left,right=out[pos],out[pos+1]
            misplaced=left>right if increasing else left<right
            if misplaced:
                out[pos],out[pos+1]=right,left
    return out
  
# 测试
x = [64, 34, 25, -2,12, 22, 11, 90,25,-1]  
SimpleSort(x,increasing=False),SimpleSort(x,True),x
def SORT(x,decreasing=False):
    if x[0]s[j]:
                s.insert(j+1,x[i])
                break
            else:    
                j+=1
    if decreasing==True:
        s=s[::-1]
    return s   

# 测试:
np.random.seed(1010)
x=np.random.randn(10)
print(np.array(SORT(x)))
print(np.array(SORT(x,decreasing=True)))  
y=np.array(['I', 'am', 'a', 'student', 'and', 'you','are', 'a', 'teacher'])
print(SORT(y))       
print(SORT(y,decreasing=True))
def Sort(x, decreasing=False):
    m=Length(x)
    s=AllConst(0.,m,m)
    for i in range(m):
        for j in Arange(m):
            s[i,j]=(x[i]
def Order(x, decreasing=False):
    m=Length(x)
    s=AllConst(0.,m,m)
    for i in range(m):
        for j in Arange(m):
            s[i,j]=(x[i]
def ORDERSORT(x,decreasing=False):
    if x[0]s[j]:
                s.insert(j+1,x[i])
                O.insert(j+1,i)
                break
            else:    
                j+=1
    if decreasing==True:
        s=s[::-1]
        O=O[::-1]
    return O,s 

# 测试:
np.random.seed(1010)
x=np.random.randn(10)
y=np.array(['I', 'am', 'a', 'student', 'and', 'you','are', 'a', 'teacher'])
print(np.array(ORDERSORT(x)))
print(np.array(ORDERSORT(x,decreasing=True)))  
print(ORDERSORT(y)[0],'\n',ORDERSORT(y)[1])       
print(ORDERSORT(y,decreasing=True)[0],'\n',ORDERSORT(y,decreasing=True)[1])           
import random
import matplotlib.pyplot as plt
x=0;y=0;X=[];Y=[]
random.seed(1010)
for i in range(1000):
    x=x+random.normalvariate(0,1)
    y=y+random.normalvariate(0,1)
    X.append(x)
    Y.append(y)
plt.figure(figsize=(20,7))
plt.plot(X,Y,'b.-')
np.random.seed(1010)
X=np.hstack((np.ones(200).reshape(-1,1),np.random.normal(20,3,(200,10))))
y=X.dot(np.array([29,10,-7,8,2,9,-2,-12,23,3,6]))+np.random.randn(200)
np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
def f(x):
    return np.log(x)+3/(16-2*np.exp(x))

def df(x):
    return 1/x+6*np.exp(x)/(16-2*np.exp(x))**2

ep=30;x0=.5;k=0
while abs(ep) > 10**-15 and k<10000:
    x0=x0-f(x0)/df(x0)
    ep=f(x0)
    k+=1
print('root=',x0,'\nf(x)=',f(x0),'\nafter',k,'iterations')
def ee(x):
    y=(np.exp(x)+np.exp(-x))/2
    return y

def ee1(x):
    y=(np.exp(x)-np.exp(-x))/2
    return y

def p3(a,b):
    U=ee1(b);V=ee1(a);
    Z=3*(ee(b)-ee(a))/(b-a)-U-V
    W=np.sqrt(Z**2-U*V)
    y=a+(b-a)*(1-(U+W+Z)/(U-V+2*W))
    return y

k=0
a=-200;b=300
y=p3(a,b)
while abs(ee1(y))>10**-15 and k<100:
    if y>0:
        b=y
        y=p3(a,b)
    else:
        a=y
        y=p3(a,b)
    k+=1
print('y=',y,'\nf(x)=',ee(y),'f(x)=',ee1(y),'k=',k)
def f(x):
    y=(np.exp(x)+np.exp(-x))/2
    return y

G=(np.sqrt(5)-1)/2
a1=-200;a2=300
a3=a2-G*(a2-a1)
a4=a1+G*(a2-a1)
f3=f(a3);f4=f(a4)
k=0
while abs(a3-a4)>10**-15 and k<1000:
    if f3
def RandN(n,loc,sd,seed=1010):
    A=[]
    i=0
    Seed=(np.array(Rand(n,seed))*10000)
    while i
import random 
random.seed(1010)
np.random.seed(1010)
n=50000
x_np=np.random.randn(n)
y=[]
for i in range(n):
    y.append(random.normalvariate(0,1))
x_sq=CumMean(x_np)
y_sq=CumMean(y)
print(x_sq[-1],y_sq[-1])

plt.figure(figsize=(20,6))
plt.subplot(121)
plt.plot(x_sq[1000:],'b-',label='numpy.random',linewidth=1)
plt.plot(y_sq[1000:],'g-',label='random',linewidth=1)
plt.plot(np.zeros(n-1000),'k--',label='zeros',linewidth=1)
plt.legend(loc='best')
plt.subplot(122)
plt.plot(x_sq[-50:],'b-o',label='numpy.random',linewidth=5)
plt.plot(y_sq[-50:],'g-s',label='random',linewidth=5)
plt.plot(np.zeros(50),'k--',label='zeros',linewidth=5)
plt.legend(loc='best')
print('numpy.random: ',np.mean(np.array(x_sq[-3000:])>0),
      '\nrandom: ',np.mean(np.array(y_sq[-3000:])>0))
import os

# Walk the data directory and collect every Excel file, keeping both the
# full path (files) and the bare file name (file_names) in the same order.
path = '/users/data/站点监测数据/'
files = []
file_names=[]
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        if '.xlsx' in file:
            files.append(os.path.join(r, file))
            file_names.append(file)
# The part of the file name before the first '_' identifies the province/city.
CP=np.unique([name.split('_')[0] for name in file_names])
CP
# FD maps each entry of CP to the paths of its files. The prefix is taken
# from the file name itself instead of from a fixed position in the split
# path, so the result no longer depends on how deep the data directory is
# or on the path separator.
FD={cp: [] for cp in CP}
for j, name in zip(files, file_names):
    FD[name.split('_')[0]].append(j)
def DY(w):
    """Build the keys used to group one station table by date, year, month and hour.

    Reads w.f_datetime, whose entries are split on ' ' (date part first) and
    the date part on '/' (year first). A date or year is recorded each time
    it differs from the one recorded last, so the lists follow the order of
    the rows.

    Returns two dicts with the keys 'Date', 'Year', 'Month' and 'Hour'. Both
    hold the same Date and Year lists; the first holds search strings for
    month ('/1/' ... '/12/') and hour (' 0:' ... ' 23:'), the second holds
    the corresponding integer arrays.
    """
    stamps=w.f_datetime
    first_day=stamps[0].split(' ')[0]
    Date=[first_day]
    Year=[first_day.split('/')[0]]
    for stamp in stamps:
        day=stamp.split(' ')[0]
        if day!=Date[-1]:
            Date.append(day)
        yr=day.split('/')[0]
        if yr!=Year[-1]:
            Year.append(yr)
    Month=['/'+str(m)+'/' for m in range(1,13)]
    Hour=[' '+str(h)+':' for h in range(24)]
    text_keys={'Date':Date,'Year':Year,'Month':Month,'Hour':Hour}
    value_keys={'Date':Date,'Year':Year,'Month':np.arange(1,13),
                'Hour':np.arange(24)}
    return text_keys,value_keys
w=pd.read_excel(files[0])
B,B0=DY(w)
print(B.keys(),B['Year'],B['Month'],B['Hour'],B['Date'][:10])
print(B0.keys(),'\n',B0['Hour'],'\n',B0['Month'])
def EX(w):
    """Summarise one station file by date, year, month and hour.

    For every key returned by DY(w), the rows whose f_datetime contains that
    key are averaged over the measurement columns. Returns a dict with one
    DataFrame per grouping ('Date', 'Year', 'Month', 'Hour'); each frame
    starts with the identifying columns w.columns[1:6] and the grouping
    value. The 'Date' frame also gets ISO-formatted dates plus 'Year',
    'Month' and 'YearMonth' columns.
    """
    from astropy.time import Time
    B,B0=DY(w) # date and year keys come from the data; month and hour keys are fixed
    nm=w.columns # all column names of the file
    X=w[nm[7:]] # measurement columns only (from column 7 on)
    X=pd.get_dummies(X, dummy_na=False) # dummy-code the categorical column(s)
    X.iloc[:,-6:]=X.iloc[:,-6:].astype('float') # make the last six columns float
    X=w[nm[1:7]].join(X) # put the leading identification/time columns back in front
    W=dict() # one summary frame per grouping
    for b in B:
        rows=[] # one single-row frame per key, concatenated once at the end
        for i in B[b]:
            # A date key is matched together with the blank that follows it,
            # so that '2014/1/1' does not also select '2014/1/10' ... '2014/1/19'.
            key=i+' ' if b=='Date' else i
            A=X[X['f_datetime'].str.contains(key,regex=False)].iloc[:,6:].mean()
            rows.append(pd.DataFrame(data=A.values.reshape(1,-1),columns=A.index))
        # DataFrame.append was removed in pandas 2.0; concat keeps the same rows and index.
        df=pd.concat(rows,sort=False)
        df.insert(0,value=B0[b],column=b) # grouping value as the first column
        for i in np.arange(1,6)[::-1]:
            df.insert(0,value=w[nm[i]][0],column=nm[i])
        if b=='Date': # convert the date format
            df['Year']=list(map(lambda x: x.split('/')[0],df.Date))
            df.Date=list(map(lambda x: (Time(x.replace('/','-')).value)\
                .split(' ')[0],df.Date))
            df['Month']=list(map(lambda x: x.split('-')[1],df.Date))
            df['YearMonth']=list(map(lambda x: x.split('-')[0]+'-'+x.split('-')[1],\
                df.Date))
        W[b]=df
    return W
# RS[province/city][grouping] holds the summaries of all stations of that
# place stacked into a single DataFrame.
RS=dict()
for cp in CP: # province/city name
    print(cp) # progress: name of the place
    parts={k: [] for k in B} # per grouping, the summaries of each station
    for i in FD[cp]: # file path
        print(i) # progress: path of the file
        u=pd.read_excel(i) # one station of this place
        W=EX(u)
        for k in parts:
            parts[k].append(W[k])
    # DataFrame.append was removed in pandas 2.0; concatenate once per
    # grouping instead of growing a frame file by file.
    U={k: (pd.concat(v,ignore_index=True) if v else pd.DataFrame())
       for k,v in parts.items()}
    RS[cp]=U # dict keyed by province/city
for i in RS:
    print(i)
    for j in RS[i]:
        print(j, RS[i][j].shape, type(RS[i][j]))
for i in RS:
    print(i)
    with pd.ExcelWriter(i+'.xlsx') as writer: 
        for j in RS[i]:
            print(j)
            RS[i][j].to_excel(writer,sheet_name=j)
# Yearly level-1 share per Beijing station: one column per station.
v1=RS['北京']['Year'].pivot(index='Year',columns='c_station',
    values='g_level_一级')
# Derive the station count from the data instead of hard-coding 12, so that
# a different number of stations neither raises IndexError nor drops columns.
st=list(map(lambda x: 'Station-'+str(x),range(len(v1.columns))));st
for i in range(len(st)):
    print(st[i],'=',v1.columns[i])
Marker=['o','^','s','P','p','*','D','v','<','>','X','H']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(v1.columns)):
    # cycle through the markers if there are more stations than markers
    plt.plot(v1.iloc[:,i],marker=Marker[i%len(Marker)], linestyle='dashed',
        linewidth=2, markersize=12)
plt.legend(loc='best',ncol=6, shadow=True,labels=st)
plt.title('Beijing level-1 percentage for 12 stations from 2014 to 2019')
H=dict()
for i in CP:
    H[i]=RS[i]['Hour'].groupby(['Hour']).mean().reset_index()['g_level_一级']
H=pd.DataFrame(H)
print(H.head())
Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(Marker)):
    plt.plot(H[CP[i]],'.-',marker=Marker[i], linestyle='dashed',linewidth=2, 
        markersize=12)
plt.legend(loc='best',ncol=4, shadow=True,labels=CP0)
plt.title('Level-1 percentage of 24 hours for 4 places')
D=dict()
for i in CP:
    D1=RS[i]['Date'].loc[:,['Date','i_gkd_pm25(μg/m3)']]
    D1.rename(columns={'i_gkd_pm25(μg/m3)':i},inplace=True)
    D1=D1.set_index('Date')
    D[i]=D1
Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(CP)):
    t=list(map(lambda x: np.datetime64(Time(x.replace('/','-')).value,'D')\
               -np.datetime64('2014-01-01','D'),D[CP[i]].index))
    t=list(map(lambda x: x.astype('int'),t))
    plt.scatter(t,D[CP[i]].values,marker=Marker[i])
plt.legend(loc='best',ncol=4, shadow=True,labels=CP0)
plt.title('Daily PM-2.5 in 4 places for 6 years')
M=RS['北京']['Month'].loc[:,['Month','i_gkd_pm25(μg/m3)']].groupby('Month').mean()
M.rename(columns={'i_gkd_pm25(μg/m3)':'北京'},inplace=True)
for i in CP[1:]:
    M1=RS[i]['Month'].loc[:,['Month','i_gkd_pm25(μg/m3)']].groupby('Month').mean()
    M1.rename(columns={'i_gkd_pm25(μg/m3)':i},inplace=True)
    M=pd.merge(M,M1,left_on='Month', right_on='Month')
print(M.head())
Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(Marker)):
    plt.plot(M[CP[i]],'.-',marker=Marker[i], linestyle='dashed',
        linewidth=2, markersize=12)
plt.legend(loc='best',ncol=4, shadow=True,labels=CP0)
plt.title('Monthly PM-2.5 in 4 places for 6 year mean')
vc=RS['重庆']['Month'].pivot(index='Month',columns='c_station',
    values='i_gkd_pm25(μg/m3)')
st=list(map(lambda x: 'Station-'+str(x),range(len(vc.columns))));st
for i in range(len(st)):
    print(st[i],'=',vc.columns[i])
Marker=['o','^','s','P','p','*','D','v','<','X']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(vc.columns)):
    plt.plot(vc.iloc[:,i],marker=Marker[i], linestyle='dashed',
        linewidth=2, markersize=12)
plt.legend(loc='best',ncol=5, shadow=True,labels=st)
plt.title('Chongqing monthly mean pm2.5 for 10 stations')
Y=RS['北京']['Year'].loc[:,['Year','i_gkd_pm25(μg/m3)']].groupby('Year').mean()
Y.rename(columns={'i_gkd_pm25(μg/m3)':'北京'},inplace=True)

for i in CP[1:]:
    Y1=RS[i]['Year'].loc[:,['Year','i_gkd_pm25(μg/m3)']].groupby('Year').mean()
    Y1.rename(columns={'i_gkd_pm25(μg/m3)':i},inplace=True)
    Y=pd.concat((Y,Y1),axis=1,sort=False)
print(Y)
Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for i in range(len(CP)):
    plt.plot(Y.index,Y.iloc[:,i],marker=Marker[i], linestyle='dashed',
        linewidth=2, markersize=12)
plt.legend(loc='best',ncol=len(CP), shadow=True,labels=CP0)
plt.title('Yearly PM-2.5 in 4 places')
Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
for j in range(len(CP)):
    pm=RS[CP[j]]['Date'].groupby('YearMonth').mean()
    plt.plot(pm.index,pm['i_gkd_pm25(μg/m3)'],marker=Marker[j], 
                 linestyle='dashed',linewidth=1, markersize=5)
plt.legend(loc='best',shadow=True,labels=CP0)
plt.xticks(rotation=45)
plt.title('Monthly mean PM-2.5 in 4 places through years')
Marker=['o','^','s','P']
CP0=['Beijing','Anhui','Hebei','Chongqing']
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
plt.subplot(121)
for j in range(len(CP)):
    pm=RS[CP[j]]['Date'].groupby('YearMonth').mean()
    plt.plot(pm.index,pm['i_gkd_pm25(μg/m3)'],marker=Marker[j], 
                 linestyle='dashed',linewidth=1, markersize=5)
plt.legend(loc='best',shadow=True,labels=CP0)
plt.xticks(rotation=45,fontsize=5)
plt.ylim(top=180)  
plt.title('Monthly mean PM-2.5 in 4 places through years')
plt.subplot(122)
for j in range(len(CP)):
    pm=RS[CP[j]]['Date'].groupby('YearMonth').median()
    plt.plot(pm.index,pm['i_gkd_pm25(μg/m3)'],marker=Marker[j], 
                 linestyle='dashed',linewidth=1, markersize=5)
plt.legend(loc='best',shadow=True,labels=CP0)
plt.xticks(rotation=45,fontsize=5)
plt.ylim(top=180)  
plt.title('Monthly median PM-2.5 in 4 places through years')

第9章

def iter(x, P, n):
    """Push the row vector ``x`` through the matrix ``P`` for ``n`` steps.

    Returns an array of shape (n+1, len(x)): row 0 is ``x`` itself and row k
    is ``x`` after k successive right-multiplications by ``P``.
    Note: the name shadows the builtin ``iter`` (kept for callers).
    """
    path = np.zeros((n+1, len(x)))
    path[0] = x
    current = x
    for k in range(1, n+1):
        current = current.dot(P)
        path[k] = current
    return path

# Transition matrix of a 3-state chain (each row sums to 1).
P=np.array([[.5,.23,.27],[.3,.25,.45],[.1,.5,.4]])

# Start once from each state (rows of the identity) and iterate n steps.
x=np.identity(3)
n=10
y={}
for i in range(x.shape[0]): # one trajectory per starting row of x
    y[i]=iter(x[i,:], P, n)
# Stationary distribution = left eigenvector of P for eigenvalue 1.
# np.linalg.eig does not order its eigenvalues, so select the one closest
# to 1 explicitly instead of assuming it is in column 0.
vals,vecs=np.linalg.eig(P.T)
ev=np.real(vecs[:,np.argmin(np.abs(vals-1))])
ev=ev/ev.sum() # normalise so that the entries sum to 1

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20,6))
for i in range(len(y)):
    plt.plot(y[i])
    # mark the stationary probabilities at the last step
    plt.plot([n]*len(ev),ev,'p')
def run(i,P, n): # i: starting state, P: transition matrix, n: number of steps
    """Simulate ``n`` steps of the chain with transition matrix ``P``.

    Each step draws the next state from row ``P[current state]``.
    Returns a list of ``n`` one-element integer arrays (the visited states,
    not including the starting state), the same shape callers already get.
    """
    res = []
    for _ in range(n):
        # After the first step the state is a 1-element array; unwrap it
        # explicitly because int() on a non-0-d array is deprecated in NumPy.
        state=int(np.asarray(i).reshape(-1)[0])
        i=np.random.choice(P.shape[0], size=1, p=P[state])
        res.append(i)
    return res

np.random.seed(1010)
s = run(0,P, 100)

fig = plt.figure(figsize=(20,6))
plt.step(range(len(s)),s)
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(20,8))

for i in np.unique(s):
    ss=np.cumsum(s==i)/np.arange(1,len(s)+1)
    plt.plot(ss,label='stat={}'.format(int(i)))
plt.legend(loc='best') 
np.random.seed(1010)
s = run(0,P, 5000)
fig = plt.figure(figsize=(20,6))
for i in np.unique(s):
    ss=np.cumsum(s==i)/np.arange(1,len(s)+1)
    plt.plot(ss,label='stat={}'.format(int(i)))
plt.legend(loc='best') 
# Gibbs sampler for (mu, tau) given the summary statistics below.
# summary statistics of sample
n = 30
ybar = 15
s2 = 3
N=11000
# sample from the joint posterior (mu, tau | data)
mu   = np.zeros(N)
tau = np.zeros(N)
T = 1000    # burnin
tau[0]=1 # initialisation
for i in range(1,N):
    # Draw scalars (no size=1): storing a 1-element array into a single
    # array slot relies on a deprecated NumPy array-to-scalar conversion.
    # mu | tau is normal with mean ybar and variance 1/(n*tau)
    mu[i] = np.random.normal(loc=ybar, scale=np.sqrt(1/(n*tau[i - 1])))
    # tau | mu is gamma with shape n/2 and rate ((n-1)*s2+n*(mu-ybar)^2)/2
    tau[i] = np.random.gamma(shape = n/2,
                             scale = 2/((n-1)*s2+n*(mu[i]-ybar)**2))

mu  = mu[T:]   # remove burnin
tau = tau[T:] # remove burnin

import matplotlib.pyplot as plt
import seaborn as sns
fig = plt.figure(figsize=(20,6))
ax = fig.add_subplot(221) 
ax.plot(mu)
ax = fig.add_subplot(222) 
ax.plot(tau)
ax = fig.add_subplot(223) 
sns.distplot(mu, hist=True, kde=True, 
             bins=int(300/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
ax = fig.add_subplot(224) 
sns.distplot(tau, hist=True, kde=True, 
             bins=int(300/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
# Two-coin example: labelled MLE versus EM estimates of the head probabilities.
# coin[k] is the label (1 or 2) used to group experiment k for the MLE;
# heads[k] is the number of heads observed in experiment k.
coin=[1, 2, 1, 2, 2, 1, 1, 2, 1, 2]
heads=[55, 20, 57, 14, 13, 57, 56, 10, 49, 16]

heads=np.array(heads).astype(float) 
coin=np.array(coin)

# NOTE(review): n (tosses per experiment) is not assigned in this snippet.
# The nearest earlier assignment in this file is n = 30, which is smaller than
# several entries of heads and would make binom.pmf below return 0 for them.
# Confirm the intended value of n before running.
p1MLE = heads[coin==1].sum()/(sum(coin==1)*n)
p2MLE = heads[coin==2].sum()/(sum(coin==2)*n)
 
print('MLE of p1 =',p1MLE,'\nMLE of p2 =', p2MLE)
np.random.seed(1010)
p1ME = np.random.uniform(0,1,1) # initial guess for p1 drawn from Uniform(0,1)
p2ME = np.random.uniform(0,1,1) # initial guess for p2 drawn from Uniform(0,1)
 
P1 = 0 # holds the estimate from the previous iteration
P2 = 0

from scipy.stats import binom

# NOTE(review): with `&` the loop ends as soon as EITHER estimate stops
# changing; presumably both should have converged (`|`) - confirm.
while (np.abs(p1ME-P1)>10**-15)& (np.abs(p2ME-P2)>10**-15):
    P1 = p1ME # keep the previous estimate p1(t) so successive estimates can be compared
    P2 = p2ME
 
    den1 = binom.pmf(heads,n,p1ME) # binomial pmf of each heads count under p1(t)
    den2 = binom.pmf(heads,n,p2ME)
    # E-step
    h1 = den1/(den1+den2)*heads # heads of each experiment weighted by den1/(den1+den2)
    h2 = den2/(den1+den2)*heads # heads of each experiment weighted by den2/(den1+den2)
 
    t1 = den1/(den1+den2)*(n-heads) # tails (n-heads) weighted by den1/(den1+den2)
    t2 = den2/(den1+den2)*(n-heads)
    # M-step
    p1ME = np.sum(h1)/np.sum((h1,t1)) # updated estimate p1(t+1), used by the next iteration
    p2ME = np.sum(h2)/np.sum((h2,t2))

# MLE estimates computed earlier
print("MLE estimates: p1MLE=%s, p2MLE=%s"%(p1MLE,p2MLE))

# EM estimates
print("EM estimates: p1EM=%s, p2EM=%s"%(np.round(p1ME,10),np.round(p2ME,10)))
# Two-component normal mixture: weight of the first component, the two
# means and the two standard deviations.
p = 0.4
mu = (-1, 2)
sd = (.5, 2)

from scipy.stats import norm
import numpy as np

def f(x,p=p,mu=mu,sd=sd):
    """Density at ``x`` of the mixture p*N(mu[0], sd[0]) + (1-p)*N(mu[1], sd[1]).

    The defaults are bound to the module-level ``p``, ``mu`` and ``sd`` at
    definition time.
    """
    first = norm.pdf(x, mu[0], sd[0])
    second = norm.pdf(x, mu[1], sd[1])
    return p*first+(1-p)*second

def q(x,sd=4):
    """Draw one proposal from a normal centred at ``x`` with standard deviation ``sd``."""
    draw = np.random.normal(loc=x, scale=sd, size=1)
    return draw[0]

def step(x, f, q, sd=4):
    """Perform one Metropolis step from ``x``.

    ``q(x, sd)`` proposes a new point and ``f`` is the target density (it may
    be unnormalised, only the ratio f(xp)/f(x) is used).
    Returns the proposal if it is accepted, otherwise ``x`` unchanged.
    """
    xp = q(x,sd) # propose a point around x with spread sd
    alpha = min(1, f(xp) / f(x)) # acceptance probability (<= 1)
    if (np.random.uniform(0,1,1)[0] < alpha): # accept with probability alpha
        x = xp
    return x


def run(x, f, q, sd, nsteps):
    """Run ``nsteps`` Metropolis steps starting at ``x``.

    ``sd`` is the proposal spread handed to ``q`` on every step.
    Returns an array of length ``nsteps`` holding the state after each step.
    """
    res = np.zeros(nsteps)
    for i in range(nsteps):
        # Forward sd: it used to be dropped here, so every chain silently
        # ran with step()'s default sd=4 whatever the caller asked for.
        x = step(x, f, q, sd)
        res[i] = x
    return res

res = run(0, f, q, 4,5000)


import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20,6))
plt.plot(res)
import matplotlib.pyplot as plt
import seaborn as sns
fig = plt.figure(figsize=(20,6))
sns.distplot(res, hist=True, kde=True, 
             bins=int(300/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
resfast=run(0, f, q, 33,5000)
resslow=run(0, f, q, .3,5000)

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20,6))
plt.subplot(221)
plt.plot(resfast)
plt.title('Trace for fast sampling')
plt.subplot(222)
sns.distplot(resfast, hist=True, kde=True, 
             bins=int(300/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
plt.title('Histogram and density for fast sampling')
plt.subplot(223)
plt.plot(resslow)
plt.title('Trace for slow sampling')
plt.subplot(224)
sns.distplot(resslow, hist=True, kde=True, 
             bins=int(300/5), color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
plt.title('Histogram and density for slow sampling')
resfast=run(0, f, q, 33,1000)
resslow=run(0, f, q, .3,1000)
# Indicator that a parameter lies inside [0,1]: proposals outside the
# interval get weight 0 and are therefore never accepted.
def watch(p):
    if p<0 or p>1:
        return 0.
    else:
        return 1.

# Likelihood of the counts (nAA, nAa, naa) given p and f
def lh(p, f, nAA, nAa, naa):
    r=(f*p+(1-f)*p*p)**nAA*((1-f)*2*p*(1-p))**nAa*(f*(1-p)+(1-f)*(1-p)*(1-p))**naa
    return r

# Main sampling routine
def fp(nAA, nAa, naa, niter, f0, p0, fsd, psd):
    """Random-walk Metropolis sampler for the pair (f, p).

    nAA, nAa, naa : counts passed to the likelihood ``lh``.
    niter         : length of the chains (must be >= 1).
    f0, p0        : starting values, stored at index 0.
    fsd, psd      : standard deviations of the normal proposals for f and p.

    Returns the two chains ``(f, p)`` as arrays of length ``niter``.
    """
    f=np.ones(niter)*0.5
    p=np.ones(niter)*0.5
    f[0]=f0
    p[0]=p0
    # Start at index 1: starting at 2 left index 1 at its filler value 0.5,
    # so the chain never actually continued from (f0, p0).
    for i in np.arange(1,niter):
        oldf=f[i-1]
        oldp=p[i-1]
        # Use the caller's proposal spreads; they were ignored before and
        # the proposals were always drawn with standard deviation 1.
        newf=oldf+np.random.normal(0,fsd,1)[0]
        newp=oldp+np.random.normal(0,psd,1)[0]
        # NOTE(review): the ratio for f compares (newp,newf) with (oldp,oldf)
        # although only f is updated here - confirm this is intended.
        Af = watch(newf)*watch(newp)*lh(newp,newf,nAA,nAa,naa)/\
        lh(oldp,oldf,nAA,nAa,naa)
        if np.random.uniform(0,1,1)[0] < Af: # accept newf with probability Af
            f[i] = newf
        else:
            f[i] =oldf
        Ap = watch(newf)*watch(newp)*lh(newp,f[i],nAA,nAa,naa)/\
        lh(oldp,f[i],nAA,nAa,naa)
        if np.random.uniform(0,1,1)[0] < Ap: # accept newp with probability Ap
            p[i] = newp
        else:
            p[i] =oldp
    return f,p        
# 执行抽样
f,p=fp(8,7,12,50000,0.5,0.5,0.01,0.01)     
# 画痕迹图及直方图
import scipy.stats as stats
import seaborn as sns
import scipy.stats as stats
import seaborn as sns
plt.figure(figsize=(20,7))
plt.subplot(221)
plt.plot(f)
plt.title('Trace plot of f')
plt.subplot(222)
sns.distplot(f, hist=True, kde=True, 
             bins=15, color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
plt.title('Histogram of f')
plt.subplot(223)
plt.plot(p)
plt.title('Trace plot of p')
plt.subplot(224)
sns.distplot(p, hist=True, kde=True, 
             bins=15, color = 'darkblue', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
plt.title('Histogram of p')

第四部分

第10章

import pandas as pd
import numpy as np
w=pd.read_csv('SYB58_35_Index of industrial production.csv',skiprows=1)
w.head()
import pandas as pd
import numpy as np
w=pd.read_csv('SYB58_35_Index of industrial production.csv',skiprows=1)
w.head()
w.rename(columns={'Unnamed: 1':'CountryArea'},inplace=True)
w.columns
print('w.shape =',w.shape)
for i in [0,1,2,3]:
    print('Number of',w.columns[i],'=',len(set(w.iloc[:,i])))
print('Series;\n')
for i in set(w.iloc[:,2]):
    print(i)
for i in w.columns[[0,5,6]]:
    del w[i]
w.columns
for i in set(list(w['Series'])):
    w["Series"]= w["Series"].replace(i, i.split(': ')[1].split(' (')[0]) 

set(w['Series'])
w.to_csv('II.csv',index=False)
import pandas as pd
import numpy as np
u=pd.read_csv('II.csv', thousands=',')
print(u.head())
G=u[u['CountryArea']=='Germany'].pivot(index='Year',columns='Series',values='Value')
G
import matplotlib.pyplot as plt
G.plot(style='.-',figsize=(20,6))
plt.title('Index of industrial production of Germany')
plt.show()
T=u[u["Series"]=="Manufacturing"].pivot(index='Year',
    columns='CountryArea', values='Value') 
print(T[["Denmark","Finland", "Sweden", 'Norway','Japan']].head())
import matplotlib.pyplot as plt
T[["Denmark","Finland", "Sweden", 'Norway','Japan']].plot(style='.-',
    figsize=(20,6)) 
plt.show()
I2014=u[u.Year==2014].pivot(index='Series',columns='CountryArea',
    values='Value')
I2014[["Denmark","Finland", "Sweden", 'Norway','Japan']].plot(style='.-',
    figsize=(20,6))
plt.show()
I14=u[u.Year==2014].pivot(index='CountryArea',
    columns='Series',values='Value')
I14.loc[["Denmark","Finland", "Sweden", 'Norway', 'Japan'],:].\
    plot(style='.-', figsize=(20,6))
plt.show()
import pandas as pd
import numpy as np;import scipy.stats as stats
adult=pd.read_csv("adult.csv",header=None)
names=["age","workclass","fnlwgt","education","education_nnum",
"marital_status","occupation","relationship","race",
"sex","capital_gain","capital_loss","hours_per_week",
"native_location","income"]
adult.columns=names
print(adult.head())
print(adult.columns,'\n',adult.shape)
adult.describe()
print(stats.describe(adult.age))
adult.occupation.dtype
cat_cols = [adult.columns.get_loc(col) \
            for col in adult.select_dtypes(['object']).columns.tolist()]
print(cat_cols, '\n',adult.columns[cat_cols])
workclass=adult.groupby("workclass")
print(len(workclass))
workclass.mean()
adult.isna().sum(axis=0)#默认值是axis=0
print(pd.crosstab(adult.income,adult.race))
pd.crosstab([adult.income,adult.sex],adult.occupation)
xtb=pd.crosstab(adult.race,adult.marital_status)
print(xtb)
import matplotlib.pyplot as plt
%matplotlib inline 
fig=plt.figure(figsize=(10,4.5))
plt.subplot(1,2,1)
plt.pie(xtb.sum(0),labels=xtb.columns,autopct='%1.2f%%') #7
plt.title('marital status')
plt.subplot(1,2,2)
plt.pie(xtb.sum(1),labels=xtb.index,autopct='%1.1f') #5
plt.title('race')
print('1000*pi={:1.4f},\n1000*pi={:20.5f}'.format(np.pi*1000,np.pi*1000))
fig=plt.figure(figsize=(10,4.5))
plt.subplot(1,1,1)
plt.barh(y=range(len(xtb.columns)),width=xtb.sum(0),
    tick_label=xtb.columns) 
plt.title('marital status')
fig=plt.figure(figsize=(10,3))
plt.hist(adult.age,density=True,bins=15)
kde=stats.gaussian_kde(adult.age)
x=np.sort(adult.age)
plt.plot(x,kde(x),'k-')
plt.title('Age histogram and density estimation')
import numpy as np
import pandas as pd
import seaborn as sns
w = pd.read_csv('iris.csv')
print(w.head())
L=set(w['Species']);L
u=w.iloc[[1,3,5,51,53,55,101,103,105],:]
print(u)
np.random.seed(999)
u_nan=u.mask(np.random.random(u.shape)<0.3)
print(u_nan)
print(u_nan.isna().sum())#按列计算, 和u_nan.isna().sum(axis=0)
print('Total number of missing values =',u_nan.isna().sum().sum())
# Encode Species as integer codes 1..k and keep missing entries as NaN.
S=u_nan['Species']
# MM[0] stands for "missing"; MM[1:] are the observed levels in sorted order.
# Building the list this way (instead of list(set(...))) does not depend on
# set iteration order, which varies between runs for strings and does not
# guarantee that NaN comes first.
MM=[np.nan]+sorted(S.dropna().unique())
SS=np.zeros(len(S))>0
for i in np.arange(1,len(MM)):
    SS=SS+(S==MM[i])*i # level MM[i] gets code i; missing stays 0
u_nan['Species']=SS 
u_nan
u_nan['Species']=u_nan['Species'].mask(SS==0) # code 0 means missing: back to NaN
print(u_nan)
from missingpy import MissForest
imputer = MissForest(random_state=1010)
imputed = imputer.fit_transform(u_nan, cat_vars=4)#标明第4个是分类变量
imputed #得到的是np.array
u2=pd.DataFrame(imputed,columns=w.columns,)
print(u2)
Y=pd.DataFrame({'sex':[1,0,0,0,1,1,1,0]})
print('type before:', Y.sex.dtype)
Y['sex']=Y['sex'].astype('category') #改变类型
print('type after:', Y.sex.dtype)
u2["Species"] = u2["Species"].astype('category')
u2=pd.get_dummies(u2,drop_first=False)
print(u2.iloc[:,4:])
on=u2.columns[4:];on
nm=dict()
for i in range(len(on)):
    nm[on[i]]='Species_'+MM[i+1]
u2=u2.rename(columns=nm)
print(u2.iloc[:,4:])#只显示哑元化的几列
import numpy as np
import pandas as pd
adult=pd.read_csv("adult.csv",header=None)
names=["age","workclass","fnlwgt","education","education_nnum",
"marital_status","occupation","relationship","race",
"sex","capital_gain","capital_loss","hours_per_week",
"native_location","income"]
adult.columns=names
adult.isna().sum() #查看缺失值情况
print(adult.isna().sum())
print('Ratio of NaN =', adult.isna().sum().sum()/adult.size)
cat_cols=[]
for i in range(len(adult.columns)):
    if adult.iloc[:,i].dtype=='O':
        cat_cols.append(i)
print(cat_cols, '\n',adult.columns[cat_cols])
mm=[]
for i in cat_cols:
    l=list(set(adult.iloc[:,i]))
    mm.append([x for x in l if type(x) != float])
print(mm,len(mm))
v=adult.copy()

for i in range(len(cat_cols)):
    S=v.iloc[:,cat_cols[i]]
    SS=np.zeros(len(S))>0
    for j in np.arange(len(mm[i])):
        SS=SS+(S==mm[i][j])*(j+1)
    v.iloc[:,cat_cols[i]]=SS
    v.iloc[:,cat_cols[i]]=v.iloc[:,cat_cols[i]].mask(SS==0) 
v.head() #不显示, 所有字符型水平已经转换成数字
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
w=pd.read_csv('iris.csv')
le.fit(w['Species'])
le.classes_
S1=le.transform(w.Species);S1
le.inverse_transform(S1)
from missingpy import MissForest
imputer = MissForest(random_state=1010)
imputed = imputer.fit_transform(v, cat_vars=cat_cols)
imputed #得到的是np.array
v2=pd.DataFrame(imputed,columns=v.columns,)
for i in cat_cols:
    v2.iloc[:,i] = v2.iloc[:,i].astype('category')
v3=pd.get_dummies(v2,drop_first=False)
v3.columns
k=0;d=dict()
for i in cat_cols:
    print(v2.columns[i])
    for j in range(len(set(v2.iloc[:,i]))):
        no=v2.columns[i]+'_'+str(list(set(v2.iloc[:,i]))[j])
        nn=v2.columns[i]+'_'+mm[k][j]
        d[no]=nn
    k=k+1

v3 = v3.rename(columns=d)
def ImpDum(df, drop=False):
    """Impute missing values with MissForest, then dummy-encode the categorical columns.

    df   : DataFrame; columns of dtype object are treated as categorical.
    drop : passed to ``pd.get_dummies`` as ``drop_first``.

    Returns a new DataFrame (fresh default index) in which every categorical
    column is replaced by indicator columns named ``<column>_<level>``.
    """
    import numpy as np
    import pandas as pd
    from missingpy import MissForest
    # positions of the object-dtype (categorical) columns
    cat_cols=[i for i in range(len(df.columns)) if df.iloc[:,i].dtype=='O']
    # Levels of each categorical column (floats, i.e. NaN, are skipped).
    # They are sorted so that the integer codes - and with them the imputation
    # and the column dropped by drop_first - are the same on every run; plain
    # set iteration order changes between interpreter runs for strings.
    mm=[sorted((x for x in set(df.iloc[:,i]) if type(x) != float),key=str)
        for i in cat_cols]
    v=df.copy()

    for k,c in enumerate(cat_cols):
        S=v.iloc[:,c]
        SS=np.zeros(len(S))>0
        for j in np.arange(len(mm[k])):
            SS=SS+(S==mm[k][j])*(j+1) # level j gets code j+1, missing stays 0
        v.iloc[:,c]=SS
        v.iloc[:,c]=v.iloc[:,c].mask(SS==0) # code 0 means missing: back to NaN
    imputer = MissForest(random_state=1010)
    v2 = imputer.fit_transform(v, cat_vars=cat_cols)
    v2=pd.DataFrame(v2,columns=v.columns,)
    for c in cat_cols:
        v2.iloc[:,c] = v2.iloc[:,c].astype('category')
    v3=pd.get_dummies(v2,drop_first=drop)

    # Rename "<column>_<code>" back to "<column>_<level>". The level is looked
    # up from the code itself, so a level that does not occur after imputation
    # cannot shift the names of the remaining ones.
    d=dict()
    for k,c in enumerate(cat_cols):
        for code in set(v2.iloc[:,c]):
            if code != code: # skip NaN, should one survive the imputation
                continue
            d[v2.columns[c]+'_'+str(code)]=v2.columns[c]+'_'+str(mm[k][int(code)-1])

    v3 = v3.rename(columns=d)
    return v3
w=ImpDum(df=adult)
print('w.shape =',w.shape,'\nw.columns =',w.columns)
def Imp(df):
    """Impute missing values of ``df`` with MissForest.

    Columns of dtype object are treated as categorical: their levels are
    replaced by integer codes 1..k (levels taken in sorted order) before
    imputation, and they are returned as those numeric codes.

    Returns a new DataFrame with the same column names and a fresh default index.
    """
    import numpy as np
    import pandas as pd
    from missingpy import MissForest
    # positions of the object-dtype (categorical) columns
    cat_cols=[i for i in range(len(df.columns)) if df.iloc[:,i].dtype=='O']
    # Levels of each categorical column (floats, i.e. NaN, are skipped), sorted
    # so that the codes are reproducible; plain set iteration order changes
    # between interpreter runs for strings.
    mm=[sorted((x for x in set(df.iloc[:,i]) if type(x) != float),key=str)
        for i in cat_cols]
    v=df.copy()

    for k,c in enumerate(cat_cols):
        S=v.iloc[:,c]
        SS=np.zeros(len(S))>0
        for j in np.arange(len(mm[k])):
            SS=SS+(S==mm[k][j])*(j+1) # level j gets code j+1, missing stays 0
        v.iloc[:,c]=SS
        v.iloc[:,c]=v.iloc[:,c].mask(SS==0) # code 0 means missing: back to NaN
    imputer = MissForest(random_state=1010)
    v2 = imputer.fit_transform(v, cat_vars=cat_cols)
    v2=pd.DataFrame(v2,columns=v.columns,)
    return v2
w = pd.read_csv('iris.csv')
u=w.iloc[[1,3,5,51,53,55,101,103,105],:]
np.random.seed(999)
u_nan=u.mask(np.random.random(u.shape)<0.3)
u_imp=Imp(u_nan)
print(u_imp) #打印结果
def Dum(df, drop=False):
    """Dummy-encode the categorical columns of ``df``.

    df   : DataFrame; it is not modified.
    drop : passed to ``pd.get_dummies`` as ``drop_first``.

    Returns a new DataFrame in which every categorical column is replaced by
    indicator columns named ``<column>_<level>``, levels in sorted order.
    Missing values produce a row of zeros in that column's indicators.
    """
    import pandas as pd
    # get_dummies already names the indicators "<column>_<level>", so the
    # former round trip through integer codes is unnecessary. That round trip
    # took the level order from set iteration, which changes between runs for
    # strings: the column order and the level removed by drop_first were
    # arbitrary. Here both are fixed by the sorted level order.
    # Note: genuine float values inside an object column are now encoded as
    # levels instead of being treated as missing.
    return pd.get_dummies(df,drop_first=drop)
w = pd.read_csv('iris.csv')
w_dum=Dum(w)
print(w_dum.iloc[[0,1,51,52,101,102],4:])

第11章

import pandas as pd
import numpy as np

w=pd.read_csv('Boston.csv') 
print(w.head(3))
y=w.MEDV  #因变量
n=len(y)  #样本量
X=w.iloc[:,:-1] #自变量
print(X.columns)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

names = ['HGBoost',"Adaboost","Bagging", "Random Forest",\
         "Linear Model"]
regressors = [
    HistGradientBoostingRegressor(random_state=1010),
    AdaBoostRegressor(random_state=1010, n_estimators=100),
    BaggingRegressor(n_estimators=100),
    RandomForestRegressor(n_estimators=500,random_state=1010),
    LinearRegression()]
REG=dict(zip(names,regressors))
def Rfold(n,Z,seed):
    """Assign ``n`` observations to ``Z`` folds at random.

    The labels 0..Z-1 are repeated to length ``n`` and then shuffled, so the
    fold sizes differ by at most one. Seeds NumPy's global random generator
    with ``seed``. Returns an integer array of length ``n``.
    """
    repeats=int(n/Z+1)
    labels=list(range(Z))*repeats
    labels=labels[:n]
    np.random.seed(seed)
    np.random.shuffle(labels)
    return np.array(labels)
# Z-fold cross-validation of every regressor in REG on (X, y).
Z=10
zid=Rfold(n,Z,1010)    
YPred=dict();
for i in REG: # i is the model name (a key of REG)
    Y_pred=np.zeros(n)
    for j in range(Z):
        reg=REG[i]
        reg.fit(X[zid!=j],y[zid!=j]) # train on every fold except j
        Y_pred[zid==j]=reg.predict(X[zid==j]) # predict the held-out fold
    # Key by the name itself: names[i] indexed a list with a string and
    # raised TypeError.
    YPred[i]=Y_pred 
R=pd.DataFrame(YPred)    
# Normalised MSE: squared prediction error over the total sum of squares of y.
M=np.sum((y-np.mean(y))**2)
A=dict()
for i in REG:
    A[i]=np.sum((y-YPred[i])**2)/M
import matplotlib.pyplot as plt
plt.figure(figsize = (12,4))
plt.barh(range(len(A)), A.values(), color = 'navy', height = 0.6)
plt.xlabel('NMSE')
plt.ylabel('Model')
plt.title('Normalized MSE for 5 Models') 
plt.yticks(np.arange(len(A)),A.keys())
for v,u in enumerate(A.values()): 
    plt.text(u, v, str(round(u,4)), va = 'center')
plt.show()
def RegCV(X,y,regress, Z=10, seed=8888, trace=True):
    """Cross-validate several regressors with ``Z`` folds.

    regress : mapping from model name to an estimator with fit/predict.
    seed    : forwarded to ``Rfold`` for the fold assignment.
    trace   : when true, print each model name with a timestamp, and a final
              timestamp at the end.

    Returns ``(R, A)``: ``R`` is a DataFrame of out-of-fold predictions with
    one column per model, ``A`` maps each model name to its normalised MSE
    (squared prediction error divided by the total sum of squares of ``y``).
    """
    from datetime import datetime
    n=len(y)
    zid=Rfold(n,Z,seed)
    predictions=dict()
    total_ss=np.sum((y-np.mean(y))**2)
    nmse=dict()
    for name in regress:
        if trace:
            print(name,'\n',datetime.now())
        fitted=np.zeros(n)
        for fold in range(Z):
            model=regress[name]
            train_rows=zid!=fold
            test_rows=zid==fold
            model.fit(X[train_rows],y[train_rows])
            fitted[test_rows]=model.predict(X[test_rows])
        predictions[name]=fitted
        nmse[name]=np.sum((y-predictions[name])**2)/total_ss
    if trace:
        print(datetime.now())
    return pd.DataFrame(predictions),nmse
R,A=RegCV(X,y,REG)
def BarPlot(A,xlab='',ylab='',title='',size=[None,None,None,None,None]):
    """Draw a horizontal bar chart of the values of the dict ``A``.

    The keys of ``A`` label the bars and each bar is annotated with its value
    rounded to 4 decimals. ``size`` holds five font sizes, in order: x label,
    y label, title, tick labels, value annotations.
    """
    import matplotlib.pyplot as plt
    plt.figure(figsize = (12,4))
    bar_count=len(A)
    plt.barh(range(bar_count), A.values(), color = 'navy')
    plt.xlabel(xlab,size=size[0])
    plt.ylabel(ylab,size=size[1])
    plt.title(title,size=size[2]) 
    plt.yticks(np.arange(bar_count),A.keys(),size=size[3])
    for row,value in enumerate(A.values()):
        label=str(round(value,4))
        plt.text(value, row, label, va = 'center',color='navy',size=size[4])
    plt.show()
BarPlot(A,'NMSE','Model','Normalized MSE for 5 Models')
import pandas as pd
import numpy as np

w=pd.read_csv("DNA.csv") 
X=w.iloc[:,:-1];y=w.iloc[:,-1];n=len(y) 
y=pd.get_dummies(y).dot(np.arange(1,4)) 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# A multi-line "from ... import" must be parenthesised; ending the line with
# a bare comma is a SyntaxError.
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
    BaggingClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier

# Model names and the matching classifiers, in the same order.
names = ["Bagging", "Linear SVM", "RBF SVM", "Decision Tree",
    "Random Forest", "AdaBoost", "Naive Bayes",'HGboost']
classifiers = [
    BaggingClassifier(n_estimators=100,random_state=1010),
    SVC(kernel="linear", C=0.025,random_state=0),
    SVC(gamma='auto', C=1,random_state=0),
    DecisionTreeClassifier(max_depth=5,random_state=0),
    RandomForestClassifier(n_estimators=500,random_state=0),
    AdaBoostClassifier(n_estimators=100,random_state=0),
    GaussianNB(),
    HistGradientBoostingClassifier(random_state=0)]

# name -> classifier
CLS=dict(zip(names,classifiers))
def Fold(u,Z=10,seed=8888):
    """Assign observations to ``Z`` folds, balanced within each class of ``u``.

    u    : class labels (any array-like; it is flattened).
    Z    : number of folds.
    seed : seed for NumPy's global random generator.

    Returns an integer array of fold labels 0..Z-1 aligned with ``u``. Within
    every class the fold sizes differ by at most one.
    """
    u=np.array(u).reshape(-1)
    idx=np.arange(len(u))
    zid=[];ID=[];np.random.seed(seed)
    for i in np.unique(u):
        members=idx[u==i] # positions of the observations of class i
        n=len(members)
        # Cut the repeated labels down to the class size BEFORE shuffling.
        # Shuffling the longer list first and truncating afterwards kept a
        # random subset of labels, so folds were not balanced within a class.
        k=(list(range(Z))*int(n/Z+1))[:n]
        np.random.shuffle(k)
        ID.extend(members)
        zid.extend(k)
    zid=np.array(zid,dtype=int);ID=np.array(ID,dtype=int)
    zid=zid[np.argsort(ID)] # put the labels back into the original order
    return zid
# Z-fold cross-validation of every classifier in CLS on (X, y).
# datetime is used for the progress output below; it was only imported inside
# RegCV/ClaCV, so this cell raised NameError.
from datetime import datetime
Z=10
Zid=Fold(y,Z=Z,seed=8888)

YCPred=dict();
for i in CLS: # i is the model name (a key of CLS)
    print(i,'\n',datetime.now())
    Y_pred=np.zeros(len(y))
    for j in range(Z):
        clf=CLS[i]
        clf.fit(X[Zid!=j],y[Zid!=j]) # train on every fold except j
        Y_pred[Zid==j]=clf.predict(X[Zid==j]) # predict the held-out fold
    YCPred[i]=Y_pred   
    print(datetime.now()) 
R=pd.DataFrame(YCPred)
# Misclassification rate of each model.
A=dict()
for i in CLS:
    A[i]=np.mean(y!=R[i])
BarPlot(A,'Error rate','Model','Error rates of 8 models')
def ClaCV(X,y,CLS, Z=10,seed=8888, trace=True):
    """Cross-validate several classifiers with ``Z`` folds.

    CLS   : mapping from model name to an estimator with fit/predict.
    seed  : forwarded to ``Fold`` for the fold assignment.
    trace : when true, print each model name with a timestamp, and a final
            timestamp at the end.

    Returns ``(R, A)``: ``R`` is a DataFrame of out-of-fold predictions with
    one column per model, ``A`` maps each model name to its error rate.
    """
    from datetime import datetime
    n=len(y)
    Zid=Fold(y,Z,seed=seed)
    predictions=dict()
    errors=dict()
    for name in CLS:
        if trace:
            print(name,'\n',datetime.now())
        guessed=np.zeros(n)
        for fold in range(Z):
            model=CLS[name]
            train_rows=Zid!=fold
            test_rows=Zid==fold
            model.fit(X[train_rows],y[train_rows])
            guessed[test_rows]=model.predict(X[test_rows])
        predictions[name]=guessed
        errors[name]=np.mean(y!=predictions[name])
    if trace:
        print(datetime.now())
    return pd.DataFrame(predictions), errors
R,A=ClaCV(X,y,CLS)
import pandas as pd
import numpy as np
w=pd.read_csv('diamonds.csv') 

u=Dum(w)
u1=Dum(w,drop=True)
y=w.price
n=len(y)
X=u.copy();del X['price']
X1=u1.copy();del X1['price']
    
Z=10
zid=Rfold(n,Z,1010)    
from sklearn.linear_model import LinearRegression
lm=LinearRegression()

lm_pred=np.zeros(n)
for j in range(Z):
    lm.fit(X1[zid!=j],y[zid!=j])
    lm_pred[zid==j]=lm.predict(X1[zid==j]) 
lm_NMSE=((y-lm_pred)**2).sum()/np.sum((y-y.mean())**2)
lm_NMSE
def SRCV(X,y,REG,Z=10,seed=1010):
    """Cross-validate a single regressor ``REG`` with ``Z`` folds.

    The folds come from ``Rfold(len(y), Z, seed)``.
    Returns ``(NMSE, pred)``: the normalised MSE (squared prediction error
    over the total sum of squares of ``y``) and the out-of-fold predictions.
    """
    n=len(y)
    zid=Rfold(n,Z,seed) 
    pred=np.zeros(n)
    for fold in range(Z):
        train_rows=zid!=fold
        test_rows=zid==fold
        REG.fit(X[train_rows],y[train_rows])
        pred[test_rows]=REG.predict(X[test_rows])
    error_ss=((y-pred)**2).sum()
    total_ss=np.sum((y-y.mean())**2)
    NMSE=error_ss/total_ss
    return NMSE, pred
NMSE, pred=SRCV(X1,y,lm);print(NMSE)
lm.fit(X1,y)
print('Coef:\n',lm.coef_,'\nIntercept =',lm.intercept_)
# Compare the coefficients of one multiple regression with those of the
# separate single-covariate regressions (all without intercept).
df=pd.read_csv('commun123.csv')
df_y=df['ViolentCrimesPerPop'];df_X=df.iloc[:,:-1]
# The `normalize` argument was removed from LinearRegression in scikit-learn
# 1.2; its former default (False) is what was passed here anyway.
LM=LinearRegression(fit_intercept=False)

M_coef=LM.fit(df_X,df_y).coef_ # multiple regression on all covariates
S_coef=[]
for i in range(df_X.shape[1]): # one regression per single covariate
    S_coef.extend(LM.fit(np.array(df_X.iloc[:,i]).reshape(-1,1),df_y).coef_)
S_coef=np.array(S_coef)

plt.style.use('ggplot')
n = df_X.shape[1] # number of covariates, taken from the data (was hard-coded 122)
fig, ax = plt.subplots(figsize=(18,6))
index = np.arange(n)
bar_width = 0.35
opacity = 0.9
ax.bar(index, M_coef, bar_width, alpha=opacity, color='r',\
       label='Coefficients of multiple regressions')
ax.bar(index+bar_width, S_coef, bar_width, alpha=opacity, color='b',\
       label='Coefficients of univariate regressions')
ax.set_xlabel('Covariates')
ax.set_ylabel('Coefficients')
# The backslash continuation inside the old literal glued "regression" and
# "without" together; adjacent literals keep the space.
ax.set_title('Coefficient comparison between multiple and univariate regression '
             'without constant term')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(df_X.columns,rotation=90)
ax.legend(loc='upper left')
plt.show()
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
w=pd.read_csv('trans.csv')

X=w.iloc[:,:3]
y=w['Donate']
n=len(y);Z=10
Zid=Fold(y,Z=10,seed=1010)
pred=np.zeros(len(y))
clf=LogisticRegression(solver='lbfgs')
for j in range(Z):
    clf.fit(X[Zid!=j],y[Zid!=j])
    pred[Zid==j]=clf.predict(X[Zid==j])

from sklearn.metrics import confusion_matrix
print("confusion matrix:\n",confusion_matrix(y, pred))
print('error rate =',np.mean(y!=pred))
def SCCV(X,y,CLS,Z=10,seed=1010):
    """Cross-validate a single classifier ``CLS`` with ``Z`` folds.

    The folds come from ``Fold(y, Z, seed)``.
    Returns ``(error, pred)``: the misclassification rate and the
    out-of-fold predictions.
    """
    n=len(y)
    Zid=Fold(y,Z,seed)
    pred=np.zeros(n)
    for fold in range(Z):
        train_rows=Zid!=fold
        test_rows=Zid==fold
        CLS.fit(X[train_rows],y[train_rows])
        pred[test_rows]=CLS.predict(X[test_rows])
    error=np.mean(y!=pred)
    return error, pred
error, pred=SCCV(X,y,clf);print(error)
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz 

w=pd.read_csv("mushroom.csv")
w.columns
X=pd.get_dummies(w.iloc[:,1:],drop_first=False)
X.columns
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(w['type'])
y=le.transform(w['type'])
print('original levels =',le.classes_,'\nafter transform: y =',y)
print('inverse transform back =',le.inverse_transform(y))
clf=DecisionTreeClassifier(random_state=0, max_depth=None) #'gini'准则
clf=clf.fit(X,y)
dot_data=tree.export_graphviz(clf,out_file=None, 
    feature_names = X.columns,rounded=True, filled=True)
graph = graphviz.Source(dot_data) 
graph.render("mushroom") #输出图到mushroom.pdf文件
graph #显示图
clf=DecisionTreeClassifier(random_state=0, max_depth=None)
error, pred=SCCV(X,y,clf)
print('confusion matrix:\n',confusion_matrix(y, pred))
print('error rate = ', error)
from sklearn.model_selection import cross_val_score
clf=DecisionTreeClassifier(random_state=0, max_depth=None)#'gini'准则
cross_val_score(clf, X, y, cv=10)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
import graphviz 

w=pd.read_csv('Boston.csv') 
y=w.MEDV
n=len(y)
X=w.iloc[:,:-1]
reg = DecisionTreeRegressor(random_state=0,max_depth=2)
reg=reg.fit(X,y)
dot_data=tree.export_graphviz(reg,out_file=None, 
    feature_names = X.columns,rounded=True, filled=True)
graph = graphviz.Source(dot_data) 
graph.render("Bostontree") #输出图到Bostontree.pdf文件
graph #显示图
print(((y[w.RM <= 6.941]-y[w.RM <= 6.941].mean())**2).mean(),
((y[w.RM > 6.941]-y[w.RM > 6.941].mean())**2).mean())
reg = DecisionTreeRegressor(random_state=0)
NMSE, pred=SRCV(X,y,reg,seed=1010);print(NMSE)
from sklearn.model_selection import cross_val_score
reg = DecisionTreeRegressor(random_state=0)
cross_val_score(reg, X, y, cv=10)
import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

w=pd.read_csv('diamonds.csv') 
u=Dum(w)

y=w.price
n=len(y)
X=u.copy();del X['price']
names = ["Bagging", "Random Forest", "HGboost",'Linear Model']
regressors = [
    BaggingRegressor(n_estimators=100,random_state=1010),
    RandomForestRegressor(n_estimators=500,random_state=1010),
    HistGradientBoostingRegressor(random_state=1010),
    LinearRegression()]
REG=dict(zip(names,regressors))
R,A=RegCV(X,y,REG,seed=1010)
xlab='NMSE'
ylab='Model'
title='Normalized MSE for 4 Models'
BarPlot(A,xlab,ylab,title)
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

# Compare three ensemble classifiers on the pendigits data and draw a bar plot
# of the result. ClaCV and BarPlot are helpers defined elsewhere in this file.
v = pd.read_csv('pendigits.csv', index_col=False)
X = v[v.columns[:16]]   # predictors: the first 16 columns
y = v[v.columns[16]]    # response: the 17th column

names = ["Bagging", "Random Forest", "HGBoost"]
classifiers = [
    BaggingClassifier(n_estimators=100, random_state=1010),
    RandomForestClassifier(n_estimators=500, random_state=1010),
    HistGradientBoostingClassifier(random_state=1010),
]
# Map each display name to its model, in the order listed above.
CLS = {label: model for label, model in zip(names, classifiers)}

R, A = ClaCV(X, y, CLS, seed=1010)

xlab, ylab = 'Error rate', 'Model'
title = 'Error rate for 3 models'
BarPlot(A, xlab, ylab, title)
import pandas as pd
import numpy as np 
# Evaluate a default multilayer-perceptron classifier on the pima data with
# the SCCV helper (defined elsewhere in this file), then report the results.
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

w = pd.read_csv('pima.csv')
X = w.iloc[:, :-1]   # every column except the last
y = w.iloc[:, -1]    # the last column

CLS = MLPClassifier()
error, pred = SCCV(X, y, CLS, seed=1010)

print("confusion matrix:\n", confusion_matrix(y, pred))
print('error rate =', error)
import pandas as pd
import numpy as np

# Evaluate a multilayer-perceptron regressor (up to 1000 iterations) on the
# Boston data with the SRCV helper (defined elsewhere in this file).
from sklearn.neural_network import MLPRegressor

w = pd.read_csv('Boston.csv')
y = w.MEDV
X = w.iloc[:, :-1]   # every column except the last

REG = MLPRegressor(max_iter=1000)
NMSE, pred = SRCV(X, y, REG)

NMSE  # shown only when run interactively
import pandas as pd
import numpy as np 
# Evaluate a 50-nearest-neighbour classifier on the pima data with the SCCV
# helper (defined elsewhere in this file), then report the results.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

w = pd.read_csv('pima.csv')
X = w.iloc[:, :-1]   # every column except the last
y = w.iloc[:, -1]    # the last column

CLS = KNeighborsClassifier(n_neighbors=50)
error, pred = SCCV(X, y, CLS, seed=1010)

print("confusion matrix:\n", confusion_matrix(y, pred))
print('error rate =', error)
import pandas as pd
import numpy as np

# Evaluate a 3-nearest-neighbour regressor on the Boston data with the SRCV
# helper (defined elsewhere in this file).
from sklearn.neighbors import KNeighborsRegressor

w = pd.read_csv('Boston.csv')
y = w.MEDV
X = w.iloc[:, :-1]   # every column except the last

REG = KNeighborsRegressor(n_neighbors=3)
NMSE, pred = SRCV(X, y, REG)

NMSE  # shown only when run interactively
import pandas as pd
import numpy as np 
# Compare two support-vector classifiers on the pima data and draw a bar plot
# of the result. ClaCV and BarPlot are helpers defined elsewhere in this file.
from sklearn.svm import SVC

w = pd.read_csv('pima.csv')
X = w.iloc[:, :-1]   # every column except the last
y = w.iloc[:, -1]    # the last column

names = ['Linear SVM', 'RBF SVM']
Cls = [
    SVC(kernel="linear", C=0.025, random_state=0),
    SVC(gamma='auto', C=1, random_state=0),  # kernel left at the SVC default
]
# Map each display name to its model, in the order listed above.
CLS = {label: model for label, model in zip(names, Cls)}

R, A = ClaCV(X, y, CLS, seed=8888)

xlab, ylab = 'Error Rate', 'Model'
title = 'Error rate of 2 SVM models'
BarPlot(A, xlab, ylab, title)
import pandas as pd
import numpy as np 
# Evaluate a Gaussian naive Bayes classifier on the pima data with the SCCV
# helper (defined elsewhere in this file), then report the results.
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

w = pd.read_csv('pima.csv')
X = w.iloc[:, :-1]   # every column except the last
y = w.iloc[:, -1]    # the last column

CLS = GaussianNB()
error, pred = SCCV(X, y, CLS, seed=1010)

print("confusion matrix:\n", confusion_matrix(y, pred))
print('error rate =', error)

你可能感兴趣的:(python)