Python数据分析学习总结
概述
数据分析的含义与目标
方法:统计分析方法
目标:提取有用信息
手段:研究、概括、总结
Python与数据分析
Python特点:简洁、开发效率高、运算速度慢、胶水特性(集成C语言)
Python数据分析:numpy、scipy、matplotlib、pandas、scikit-learn、keras…
Python数据分析大家族
numpy:数据结构基础
scipy:强大的科学计算方法(矩阵分析、信号分析、数理分析…)
matplotlib:丰富的可视化套件
pandas:基础数据分析套件
scikit-learn:强大的数据分析建模库
keras:人工神经网络
Python数据分析环境搭建
平台:Windows、Linux
科学计算工具:Anaconda
Python数据分析基础
numpy
开源、数据计算扩展;ndarray、多维操作、线性代数
numpy使用程序:
import numpy as np
def main():
    """Build a 2x3 ndarray from a nested list and print its basic attributes."""
    lst = [[1, 3, 5], [2, 4, 6]]
    print(type(lst))
    np_lst = np.array(lst)
    print(type(np_lst))
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy (1.24); np.float64 is the dtype it actually produced.
    np_lst = np.array(lst, dtype=np.float64)
    print(np_lst.shape)
    print(np_lst.ndim)
    print(np_lst.dtype)
    print(np_lst.itemsize)  # bytes used by a single element
    print(np_lst.size)      # total number of elements


if __name__ == "__main__":
    main()
执行结果:
<class 'list'>
<class 'numpy.ndarray'>
(2, 3)
2
float64
8
6
numpy常用数组
# Constant-filled arrays of a given shape.
print(np.zeros((2, 4)))
print(np.ones((3, 5)))
# Uniform samples: a 2x4 array, then a single scalar.
print(np.random.rand(2, 4))
print(np.random.rand())
print("RandInt:")
print(np.random.randint(1, 10, size=3))
print("Randn:")  # standard normal distribution
print(np.random.randn(2, 4))
print("Choice")
print(np.random.choice([10, 20, 30]))
print("Distribute:")  # Beta distribution
print(np.random.beta(1, 10, size=100))
执行结果:
[[ 0. 0. 0. 0.]
[ 0. 0. 0. 0.]]
[[ 1. 1. 1. 1. 1.]
[ 1. 1. 1. 1. 1.]
[ 1. 1. 1. 1. 1.]]
[[ 0.80307088 0.25491367 0.54381007 0.10159737]
[ 0.71565024 0.62473538 0.66892166 0.41078071]]
0.16467244260637237
RandInt:
[5 3 2]
Randn:
[[-0.51707383 -1.46091351 -0.78197086 0.44640286]
[-0.0998081 0.40701679 0.07750661 0.66041753]]
Choice
10
Distribute:
[ 0.03897375 0.09804991 0.1617222 ..., 0.12878516 0.11699157
0.05681225]
numpy常用操作
print("Arange:")
print(np.arange(1, 11))
# Apply each element-wise function to `lst` and print it under its label.
# NOTE(review): `lst` is not defined in this fragment; presumably it is the
# nested list from the earlier numpy example - confirm before running alone.
for _label, _ufunc in (
    ("Exp:", np.exp),
    ("Exp2:", np.exp2),
    ("Sqrt:", np.sqrt),
    ("Sin:", np.sin),
    ("Log:", np.log),
):
    print(_label)
    print(_ufunc(lst))
执行结果:
Arange:
[ 1 2 3 4 5 6 7 8 9 10]
Exp:
[[ 2.71828183 20.08553692 148.4131591 ]
[ 7.3890561 54.59815003 403.42879349]]
Exp2:
[[ 2. 8. 32.]
[ 4. 16. 64.]]
Sqrt:
[[ 1. 1.73205081 2.23606798]
[ 1.41421356 2. 2.44948974]]
Sin:
[[ 0.84147098 0.14112001 -0.95892427]
[ 0.90929743 -0.7568025 -0.2794155 ]]
Log:
[[ 0. 1.09861229 1.60943791]
[ 0.69314718 1.38629436 1.79175947]]
# A 3x2x4 integer array used to demonstrate reductions along each axis.
lst = np.array(
    [
        [[1, 2, 3, 4], [4, 5, 6, 7]],
        [[7, 8, 9, 10], [10, 11, 12, 13]],
        [[14, 15, 16, 17], [18, 19, 20, 11]],
    ]
)
# Sums over the innermost, middle and outermost axis, in that order.
print(np.sum(lst, axis=2))
print(np.sum(lst, axis=1))
print(np.sum(lst, axis=0))
print("Max:")
print(np.max(lst, axis=1))
print("Min:")
print(np.min(lst, axis=0))
执行结果:
[[10 22]
[34 46]
[62 68]]
[[ 5 7 9 11]
[17 19 21 23]
[32 34 36 28]]
[[22 25 28 31]
[32 35 38 31]]
Max:
[[ 4 5 6 7]
[10 11 12 13]
[18 19 20 17]]
Min:
[[1 2 3 4]
[4 5 6 7]]
# Two 1-D operands shared by every demonstration below.
lst1 = np.array([10, 20, 30, 40])
lst2 = np.array([4, 3, 2, 1])

# Element-wise arithmetic.
print("Add:")
print(np.add(lst1, lst2))
print("Sub:")
print(np.subtract(lst1, lst2))
print("Mul:")
print(np.multiply(lst1, lst2))
print("Div:")
print(np.divide(lst1, lst2))
print("Square:")
print(np.square(lst1))

# Matrix product of the two operands viewed as 2x2 matrices.
print("Dot:")
print(lst1.reshape(2, 2).dot(lst2.reshape(2, 2)))

# Joining arrays.
print("Concatenate:")
print(np.concatenate([lst1, lst2], axis=0))
print("vstack:")
print(np.vstack([lst1, lst2]))
print("hstack:")
print(np.hstack([lst1, lst2]))

# Splitting into 2 and into 4 equal parts.
print("Split:")
print(np.split(lst1, 2))
print(np.split(lst1, 4))

print("Copy:")
print(lst1.copy())
执行结果:
Add:
[14 23 32 41]
Sub:
[ 6 17 28 39]
Mul:
[40 60 60 40]
Div:
[ 2.5 6.66666667 15. 40. ]
Square:
[ 100 400 900 1600]
Dot:
[[ 80 50]
[200 130]]
Concatenate:
[10 20 30 40 4 3 2 1]
vstack:
[[10 20 30 40]
[ 4 3 2 1]]
hstack:
[10 20 30 40 4 3 2 1]
Split:
[array([10, 20]), array([30, 40])]
[array([10]), array([20]), array([30]), array([40])]
Copy:
[10 20 30 40]
线性方程组
import numpy as np
from numpy.linalg import *
def main():
    """Print a 3x3 identity matrix and basic linear-algebra results for a 2x2 matrix."""
    print(np.eye(3))
    lst = np.array([[1, 2], [3, 4]])
    # The routines are called through np.linalg explicitly so this function
    # does not depend on a wildcard import being present.
    print("Inv:")
    print(np.linalg.inv(lst))
    print("T:")
    print(lst.transpose())
    print("Det:")
    print(np.linalg.det(lst))
    print("Eig:")
    # eig returns the eigenvalues together with the eigenvectors.
    print(np.linalg.eig(lst))


if __name__ == "__main__":
    main()
执行结果:
[[ 1. 0. 0.]
[ 0. 1. 0.]
[ 0. 0. 1.]]
Inv:
[[-2. 1. ]
[ 1.5 -0.5]]
T:
[[1 3]
[2 4]]
Det:
-2.0
Eig:
(array([-0.37228132, 5.37228132]), array([[-0.82456484, -0.41597356],
[ 0.56576746, -0.90937671]]))
numpy其他方面应用
import numpy as np
from numpy.linalg import *
def main():
    """Print an FFT, a correlation-coefficient matrix and a polynomial object."""
    print("FFT:")
    # FFT of a constant signal of eight ones.
    print(np.fft.fft(np.array([1, 1, 1, 1, 1, 1, 1, 1])))
    print("Coef:")
    print(np.corrcoef([1, 0, 1], [0, 2, 1]))
    print("Poly:")
    # Coefficients are listed from the highest power: 2x^2 + 1x + 3.
    print(np.poly1d([2, 1, 3]))


if __name__ == "__main__":
    main()
执行结果:
FFT:
[ 8.+0.j 0.+0.j 0.+0.j 0.+0.j 0.+0.j 0.+0.j 0.+0.j 0.+0.j]
Coef:
[[ 1. -0.8660254]
[-0.8660254 1. ]]
Poly:
2
2 x + 1 x + 3
matplotlib
概述
matplotlib是关键的绘图库。
实现
import numpy as np
import matplotlib.pyplot as plt
def main():
    """Show a styled cos/sin line chart, then one figure holding seven demo plots.

    The second figure uses a 3x3 grid for the scatter, bar, pie, polar,
    heat-map and 3-D panels, with the contour plot spanning the bottom row.
    It is displayed once, after every panel has been drawn; calling
    plt.show() after each panel would close the figure before the remaining
    panels were added.
    """
    # Only needed on older matplotlib releases, where importing Axes3D is
    # what registers the "3d" projection used for the last panel.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    # line
    x = np.linspace(-np.pi, np.pi, 256, endpoint=True)
    c, s = np.cos(x), np.sin(x)
    plt.figure(1)
    plt.plot(x, c, color="blue", linewidth=1.0, linestyle="-", label="COS", alpha=0.5)
    plt.plot(x, s, "r*", label="SIN")
    plt.title("COS & SIN")
    ax = plt.gca()
    # Hide the top/right frame and move the remaining spines to the origin.
    ax.spines["right"].set_color("none")
    ax.spines["top"].set_color("none")
    ax.spines["left"].set_position(("data", 0))
    ax.spines["bottom"].set_position(("data", 0))
    ax.xaxis.set_ticks_position("bottom")
    ax.yaxis.set_ticks_position("left")
    plt.legend()  # the label= arguments above are only visible with a legend
    plt.show()

    fig = plt.figure()

    # scatter
    ax = fig.add_subplot(3, 3, 1)
    n = 128
    X = np.random.normal(0, 1, n)
    Y = np.random.normal(0, 1, n)
    T = np.arctan2(Y, X)  # colour each point by its angle around the origin
    ax.scatter(X, Y, s=75, c=T, alpha=0.5)
    ax.set_xlim(-1.5, 1.5)
    ax.set_xticks([])
    ax.set_ylim(-1.5, 1.5)
    ax.set_yticks([])
    ax.set_title("scatter")
    ax.set_xlabel("x")
    ax.set_ylabel("y")

    # bar
    ax = fig.add_subplot(3, 3, 2)
    n = 10
    X = np.arange(n)
    Y1 = (1 - X / float(n)) * np.random.uniform(0.5, 1.0, n)
    Y2 = (1 - X / float(n)) * np.random.uniform(0.5, 1.0, n)
    ax.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
    ax.bar(X, -Y2, facecolor='#9999ff', edgecolor='white')
    for xi, yi in zip(X, Y1):
        ax.text(xi + 0.4, yi + 0.05, '%.2f' % yi, ha='center', va='bottom')
    for xi, yi in zip(X, Y2):
        # Downward bars: hang the label below the bar end, not over it.
        ax.text(xi + 0.4, -yi - 0.05, '%.2f' % yi, ha='center', va='top')

    # pie
    ax = fig.add_subplot(3, 3, 3)
    n = 20
    Z = np.ones(n)
    Z[-1] *= 2  # make the last wedge twice as large as the others
    ax.pie(
        Z,
        explode=Z * .05,
        colors=['%s' % (i / float(n)) for i in range(n)],  # grey levels
        labels=['%.2f' % (i / float(n)) for i in range(n)],
    )
    ax.set_aspect('equal')
    ax.set_xticks([])
    ax.set_yticks([])

    # polar
    ax = fig.add_subplot(3, 3, 4, polar=True)
    n = 20
    theta = np.arange(0.0, 2 * np.pi, 2 * np.pi / n)
    radii = 10 * np.random.rand(n)
    ax.plot(theta, radii)

    # heatmap
    ax = fig.add_subplot(3, 3, 5)
    data = np.random.rand(3, 3)
    ax.imshow(data, interpolation='nearest', cmap=plt.cm.Blues,
              aspect='auto', vmin=0, vmax=1)

    # hot map (filled contours), spanning the whole bottom row
    ax = fig.add_subplot(3, 1, 3)

    def f(x, y):
        return (1 - x / 2 + x ** 5 + y ** 3) * np.exp(-x ** 2 - y ** 2)

    n = 256
    x = np.linspace(-3, 3, n)
    y = np.linspace(-3, 3, n)
    X, Y = np.meshgrid(x, y)
    ax.contourf(X, Y, f(X, Y), 8, alpha=.75, cmap=plt.cm.hot)

    # 3D
    ax = fig.add_subplot(3, 3, 6, projection="3d")
    ax.scatter(1, 1, 3, s=100)

    plt.show()


if __name__ == "__main__":
    main()
scipy
简介
数值计算库
积分
程序:
import numpy as np
from scipy.integrate import quad,dblquad,nquad
def main():
    """Print the results of a single, a double and an N-dimensional integral.

    Each line printed is the (value, estimated error) pair returned by the
    corresponding scipy.integrate routine.
    """
    # Integral of exp(-x) over [0, inf).
    print(quad(lambda x: np.exp(-x), 0, np.inf))
    # Double integral: x runs over [0, inf), t over [1, inf).
    print(dblquad(lambda t, x: np.exp(-x * t) / t ** 3,
                  0, np.inf, lambda x: 1, lambda x: np.inf))

    def f(x, y):
        return x * y

    def bound_y():
        # Outermost variable: fixed limits.
        return [0, 0.5]

    def bound_x(y):
        # Inner variable: the upper limit depends on y.
        return [0, 1 - 2 * y]

    print(nquad(f, [bound_x, bound_y]))


if __name__ == "__main__":
    main()
执行结果:
(1.0000000000000002, 5.842607038578007e-11)
(0.3333333333366853, 1.3888461883425516e-08)
(0.010416666666666668, 4.101620128472366e-16)
优化器
import numpy as np
from scipy.optimize import minimize
def main():
    """Minimise a 5-variable Rosenbrock function with Nelder-Mead and print the result."""
    # Optimizer
    def rosen(x):
        """Rosenbrock function of the vector x."""
        return np.sum(100.0 * (x[1:] - x[:-1] ** 2.0) ** 2.0 + (1 - x[:-1]) ** 2.0)

    x0 = np.array([1.3, 0.7, 0.8, 1.9, 1.2])
    # "xatol" is the Nelder-Mead name for the absolute tolerance on x; the
    # old spelling "xtol" is no longer a recognised option for this method.
    res = minimize(rosen, x0, method="nelder-mead",
                   options={"xatol": 1e-8, "disp": True})
    print("ROSE MINI:", res)


if __name__ == "__main__":
    main()
执行结果:
Optimization terminated successfully.
Current function value: 0.000000
Iterations: 339
Function evaluations: 571
ROSE MINI: final_simplex: (array([[ 1. , 1. , 1. , 1. , 1. ],
[ 1. , 1. , 1. , 1. , 1. ],
[ 1. , 1. , 1. , 1.00000001, 1.00000001],
[ 1. , 1. , 1. , 1. , 1. ],
[ 1. , 1. , 1. , 1. , 1. ],
[ 1. , 1. , 1. , 1. , 0.99999999]]), array([ 4.86115343e-17, 7.65182843e-17, 8.11395684e-17,
8.63263255e-17, 8.64080682e-17, 2.17927418e-16]))
fun: 4.8611534334221152e-17
message: 'Optimization terminated successfully.'
nfev: 571
nit: 339
status: 0
success: True
x: array([ 1., 1., 1., 1., 1.])
插值
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import root


def main():
    """Solve x + 2*cos(x) = 0, then cubic-interpolate a sampled sine wave.

    The root and the interpolated samples are printed first; the original
    samples (red) and the interpolated curve (black) are then plotted.
    """
    def fun(x):
        return x + 2 * np.cos(x)

    sol = root(fun, 0.1)  # 0.1 is the initial guess
    print("ROOT:", sol.x, sol.fun)

    # Interpolation
    x = np.linspace(0, 1, 10)
    y = np.sin(2 * np.pi * x)
    li = interp1d(x, y, kind="cubic")
    x_new = np.linspace(0, 1, 50)
    y_new = li(x_new)
    print(y_new)

    # Imported here so the numeric part above still runs (and prints) on a
    # machine where no plotting library is installed.
    import matplotlib.pyplot as plt

    plt.figure()
    plt.plot(x, y, "r")
    plt.plot(x_new, y_new, "k")
    plt.show()


if __name__ == "__main__":
    main()
线性计算与矩阵分解
程序:
import numpy as np
from scipy import linalg as lg
def main():
    """Print determinant, inverse, a linear solve and four decompositions of a 2x2 matrix."""
    arr = np.array([[1, 2], [3, 4]])
    print("Det:", lg.det(arr))
    print("Inv:", lg.inv(arr))
    # Solve arr @ x = b for x.
    b = np.array([6, 14])
    print("Sol:", lg.solve(arr, b))
    print("Eig:", lg.eig(arr))
    print("LU:", lg.lu(arr))
    print("QR:", lg.qr(arr))
    print("SVD:", lg.svd(arr))
    print("Schur:", lg.schur(arr))


if __name__ == "__main__":
    main()
执行结果:
Det: -2.0
Inv: [[-2. 1. ]
[ 1.5 -0.5]]
Sol: [ 2. 2.]
Eig: (array([-0.37228132+0.j, 5.37228132+0.j]), array([[-0.82456484, -0.41597356],
[ 0.56576746, -0.90937671]]))
LU: (array([[ 0., 1.],
[ 1., 0.]]), array([[ 1. , 0. ],
[ 0.33333333, 1. ]]), array([[ 3. , 4. ],
[ 0. , 0.66666667]]))
QR: (array([[-0.31622777, -0.9486833 ],
[-0.9486833 , 0.31622777]]), array([[-3.16227766, -4.42718872],
[ 0. , -0.63245553]]))
SVD: (array([[-0.40455358, -0.9145143 ],
[-0.9145143 , 0.40455358]]), array([ 5.4649857 , 0.36596619]), array([[-0.57604844, -0.81741556],
[ 0.81741556, -0.57604844]]))
Schur: (array([[-0.37228132, -1. ],
[ 0. , 5.37228132]]), array([[-0.82456484, -0.56576746],
[ 0.56576746, -0.82456484]]))
pandas
简介
数据分析库
基础数据分析技术
import numpy as np
import pandas as pd
def main():
    """Walk through basic pandas usage on a random 8x5 DataFrame.

    Covers construction, inspection, selection, assignment, missing values,
    concatenation, merging and grouping; every step prints its result.
    """
    # Data Structure
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df)

    # Basic
    print(df.head(3))
    print(df.tail(3))
    print(df.index)
    print(df.values)
    print(df.T)
    # DataFrame.sort(columns=...) no longer exists; sort_values(by=...) is
    # the replacement for sorting rows by a column's values.
    print(df.sort_values(by="C"))
    print(df.sort_index(axis=1, ascending=False))
    print(df.describe())

    # Select
    print(type(df["A"]))
    print(df[:3])
    print(df["20170301":"20170304"])
    print(df.loc[dates[0]])
    print(df.loc["20170301":"20170304", ["B", "D"]])
    print(df.iloc[1:2, 2:4])
    print(df.iloc[1, 4])
    # One combined mask instead of df[mask1][mask2], which indexes the
    # already-filtered frame with a mask built from the full frame.
    print(df[(df.B > 0) & (df.A < 0)])
    print(df[df > 0])
    print(df[df["E"].isin([1, 2])])

    # Set
    s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20170301", periods=8))
    df["F"] = s1
    print(df)
    df.at[dates[0], "A"] = 0
    print(df)
    df.iat[1, 1] = 1
    df.loc[:, "D"] = np.array([4] * len(df))
    df2 = df.copy()
    # Flip the sign of every positive entry. Assigning df2 itself here would
    # leave the frame unchanged.
    df2[df2 > 0] = -df2
    print(df2)

    # Missing Value
    df1 = df.reindex(index=dates[:4], columns=list("ABCD") + ["G"])
    df1.loc[dates[0]:dates[1], "G"] = 1  # the other rows of "G" stay NaN
    print(df1)
    print(df1.dropna())
    print(df1.fillna(value=2))

    # Concat
    pieces = [df[:3], df[-3:]]
    print(pd.concat(pieces))
    left = pd.DataFrame({"key": ["x", "y"], "value": [1, 2]})
    right = pd.DataFrame({"key": ["x", "z"], "value": [3, 4]})
    print("LEFT:", left)
    print("RIGHT:", right)
    print(pd.merge(left, right, on="key", how="left"))
    df3 = pd.DataFrame({"A": ["a", "b", "c", "b"], "B": list(range(4))})
    print(df3.groupby("A").sum())


if __name__ == "__main__":
    main()
时间、绘图
import numpy as np
import pandas as pd
from pylab import *
def main():
    """Print a ten-step one-second date range, then plot a 1000-step random walk."""
    # Time Series
    # Lower-case "s" is the second alias; the upper-case spelling is
    # deprecated in recent pandas releases.
    t_exam = pd.date_range("20170301", periods=10, freq="s")
    print(t_exam)

    # Graph
    ts = pd.Series(np.random.randn(1000),
                   index=pd.date_range("20170301", periods=1000))
    ts = ts.cumsum()
    # Imported explicitly so show() does not depend on a wildcard import.
    import matplotlib.pyplot as plt

    ts.plot()
    plt.show()


if __name__ == "__main__":
    main()
scikit-learn
简介
数据挖掘建模、机器学习
机器学习与决策树
机器学习:因子–>结果
结果:
不带标记–>无监督学习(聚类);带标记–>监督学习
有限离散–>分类;连续–>回归
决策树:监督学习;树形结构
Iris数据集
实现
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics


def main():
    """Train an entropy-based decision tree on Iris and print accuracy and confusion matrix."""
    # Pre-processing
    iris = load_iris()
    print(iris)
    print(len(iris["data"]))
    # Hold out 20% of the samples for testing; fixed seed for repeatability.
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)

    # Model
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf.fit(train_data, train_target)
    y_pred = clf.predict(test_data)

    # Verify
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))


if __name__ == "__main__":
    main()
keras
简介
人工神经网络
简单神经网络实现
Keras安装步骤:Anaconda CMD;conda install mingw libpython;pip install keras;pip install np_utils
实例
注意:需要修改C:/user/username/.keras/keras.json,具体改后内容如下:{"backend": "theano","image_data_format": "th","epsilon": 1e-07,"floatx": "float32"}。
import numpy as np
from keras.models import Sequential
from keras.layers import Dense,Activation
from keras.optimizers import SGD
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelBinarizer
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split


def main():
    """Train a small dense network on Iris and print the predicted test classes."""
    iris = load_iris()
    print(iris["target"])
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)

    # Fit one binarizer on the training labels and reuse it for the test
    # labels, so both are one-hot encoded with the same column order.
    binarizer = LabelBinarizer()
    labels_train = binarizer.fit_transform(train_target)
    labels_test = binarizer.transform(test_target)  # not used below; kept for evaluation

    # 4 input features -> 5 hidden units -> 3 output units.
    # NOTE(review): "softmax" is the usual output activation with
    # categorical_crossentropy; "sigmoid" is kept as in the original.
    model = Sequential(
        [
            Dense(5, input_dim=4),
            Activation("relu"),
            Dense(3),
            Activation("sigmoid"),
        ]
    )

    # Optimizer
    # NOTE(review): newer Keras releases spell `lr` as `learning_rate` and
    # may not accept `decay`; confirm against the installed version.
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss="categorical_crossentropy")
    # `epochs` is the Keras 2 name of the former `nb_epoch` argument.
    model.fit(train_data, labels_train, epochs=200, batch_size=40)
    # Index of the largest output per sample; does not rely on
    # Sequential.predict_classes, which newer Keras no longer provides.
    print(np.argmax(model.predict(test_data), axis=-1))
    # model.save_weights("D:/w")
    # model.load_weights("D:/w")


if __name__ == "__main__":
    main()
参考文献
http://www.imooc.com/learn/843