Python数据分析学习总结

Python数据分析学习总结

  • 概述

    • 数据分析的含义与目标

      方法:统计分析方法

      目标:提取有用信息

      手段:研究、概括、总结

    • Python与数据分析

      Python特点:简洁、开发效率高、运算速度慢、胶水特性(集成C语言)

      Python数据分析:numpy、scipy、matplotlib、pandas、scikit-learn、keras…

    • Python数据分析大家族

      numpy:数据结构基础

      scipy:强大的科学计算方法(矩阵分析、信号分析、数理分析…)

      matplotlib:丰富的可视化套件

      pandas:基础数据分析套件

      scikit-learn:强大的数据分析建模库

      keras:人工神经网络

    • Python数据分析环境搭建

      平台:Windows、Linux

      科学计算工具:Anaconda

  • Python数据分析基础

    • numpy

      开源、数据计算扩展;ndarray、多维操作、线性代数

      • numpy使用程序:

        import numpy as np
        
        def main():
            """Build an ndarray from a nested list and print its basic attributes.

            Prints, in order: the type of the source list, the type of the array,
            then the shape, number of dimensions, dtype, bytes per element and
            total element count of a float64 copy of the data.
            """
            lst=[[1,3,5],[2,4,6]]
            print(type(lst))
            np_lst=np.array(lst)
            print(type(np_lst))
            # np.float was only an alias of the builtin float; it was deprecated in
            # NumPy 1.20 and removed in 1.24, so name the 64-bit dtype explicitly.
            np_lst=np.array(lst, dtype=np.float64)
            print(np_lst.shape)     # (rows, columns)
            print(np_lst.ndim)      # number of axes
            print(np_lst.dtype)     # element type
            print(np_lst.itemsize)  # bytes per element
            print(np_lst.size)      # total number of elements

        if __name__=="__main__":
            main()
        执行结果:
        <class 'list'>
        <class 'numpy.ndarray'>
        (2, 3)
        2
        float64
        8
        6
        
      • numpy常用数组

        # Fragment: common ways to create arrays. Assumes `import numpy as np`.
        # The random results differ on every run because no seed is set.
        # 2x4 array filled with zeros
        print(np.zeros([2,4]))
        # 3x5 array filled with ones
        print(np.ones([3,5]))
        # 2x4 array of uniform random samples from [0, 1)
        print(np.random.rand(2,4))
        # A single uniform random float
        print(np.random.rand())
        print("RandInt:")
        # Three random integers, lower bound 1 inclusive, upper bound 10 exclusive
        print(np.random.randint(1,10,3))
        print("Randn:")  # standard normal distribution
        print(np.random.randn(2,4)) 
        print("Choice")
        # One element picked at random from the given list
        print(np.random.choice([10,20,30]))
        print("Distribute:")  # Beta distribution
        # 100 samples from Beta(a=1, b=10)
        print(np.random.beta(1,10,100))
        
        执行结果:
        [[ 0.  0.  0.  0.]
         [ 0.  0.  0.  0.]]
        [[ 1.  1.  1.  1.  1.]
         [ 1.  1.  1.  1.  1.]
         [ 1.  1.  1.  1.  1.]]
        [[ 0.80307088  0.25491367  0.54381007  0.10159737]
         [ 0.71565024  0.62473538  0.66892166  0.41078071]]
        0.16467244260637237
        RandInt:
        [5 3 2]
        Randn:
        [[-0.51707383 -1.46091351 -0.78197086  0.44640286]
         [-0.0998081   0.40701679  0.07750661  0.66041753]]
        Choice
        10
        Distribute:
        [ 0.03897375  0.09804991  0.1617222  ...,  0.12878516  0.11699157
          0.05681225]
        
      • numpy常用操作

        # Fragment: element-wise math functions. Assumes `import numpy as np`.
        # NOTE(review): `lst` is not defined in this fragment; judging by the
        # printed results it is the nested list [[1,3,5],[2,4,6]] from the first
        # example -- confirm before running standalone.
        print("Arange:")
        # Integers 1..10 (the stop value 11 is excluded)
        print(np.arange(1,11))
        print("Exp:")
        # e ** x for every element
        print(np.exp(lst))
        print("Exp2:")
        # 2 ** x for every element
        print(np.exp2(lst))
        print("Sqrt:")
        print(np.sqrt(lst))
        print("Sin:")
        # Arguments are interpreted as radians
        print(np.sin(lst))
        print("Log:")
        # Natural logarithm
        print(np.log(lst))
        执行结果:
        Arange:
        [ 1  2  3  4  5  6  7  8  9 10]
        Exp:
        [[   2.71828183   20.08553692  148.4131591 ]
         [   7.3890561    54.59815003  403.42879349]]
        Exp2:
        [[  2.   8.  32.]
         [  4.  16.  64.]]
        Sqrt:
        [[ 1.          1.73205081  2.23606798]
         [ 1.41421356  2.          2.44948974]]
        Sin:
        [[ 0.84147098  0.14112001 -0.95892427]
         [ 0.90929743 -0.7568025  -0.2794155 ]]
        Log:
        [[ 0.          1.09861229  1.60943791]
         [ 0.69314718  1.38629436  1.79175947]]
        
        # Fragment: reductions along an axis. `lst` has shape (3, 2, 4); reducing
        # over an axis removes that axis from the result.
        lst=np.array([[[1,2,3,4],[4,5,6,7]],[[7,8,9,10],[10,11,12,13]],[[14,15,16,17],[18,19,20,11]]])
        # Sum over the innermost axis -> shape (3, 2)
        print(lst.sum(axis=2))
        # Sum over the middle axis -> shape (3, 4)
        print(lst.sum(axis=1))
        # Sum over the outermost axis -> shape (2, 4)
        print(lst.sum(axis=0))
        print("Max:")
        # Largest value along the middle axis -> shape (3, 4)
        print(lst.max(axis=1))
        print("Min:")
        # Smallest value along the outermost axis -> shape (2, 4)
        print(lst.min(axis=0))  
        执行结果:
        [[10 22]
         [34 46]
         [62 68]]
        [[ 5  7  9 11]
         [17 19 21 23]
         [32 34 36 28]]
        [[22 25 28 31]
         [32 35 38 31]]
        Max:
        [[ 4  5  6  7]
         [10 11 12 13]
         [18 19 20 17]]
        Min:
        [[1 2 3 4]
         [4 5 6 7]]
        
        # Fragment: arithmetic between arrays and ways to join, split and copy
        # them. Assumes `import numpy as np`. The arithmetic operators below all
        # work element by element.
        lst1=np.array([10,20,30,40])
        lst2=np.array([4,3,2,1])
        print("Add:")
        print(lst1+lst2)
        print("Sub:")
        print(lst1-lst2)
        print("Mul:")
        # Element-wise product, not a matrix product
        print(lst1*lst2)
        print("Div:")
        # True division: the result is a float array
        print(lst1/lst2)
        print("Square:")
        print(lst1**2)
        print("Dot:")
        # Matrix product of the two arrays viewed as 2x2 matrices
        print(np.dot(lst1.reshape([2,2]),lst2.reshape([2,2])))
        print("Concatenate:")
        # Joins the two 1-D arrays end to end
        print(np.concatenate((lst1,lst2),axis=0))
        print("vstack:")
        # Stacks the arrays as rows -> shape (2, 4)
        print(np.vstack((lst1,lst2)))
        print("hstack:")
        # For 1-D inputs this gives the same result as concatenate
        print(np.hstack((lst1,lst2)))
        print("Split:")
        # Split into 2, then into 4, equally sized sub-arrays
        print(np.split(lst1,2))
        print(np.split(lst1,4))
        print("Copy:")
        # Independent copy of the data
        print(np.copy(lst1))
        执行结果:
        Add:
        [14 23 32 41]
        Sub:
        [ 6 17 28 39]
        Mul:
        [40 60 60 40]
        Div:
        [  2.5          6.66666667  15.          40.        ]
        Square:
        [ 100  400  900 1600]
        Dot:
        [[ 80  50]
         [200 130]]
        Concatenate:
        [10 20 30 40  4  3  2  1]
        vstack:
        [[10 20 30 40]
         [ 4  3  2  1]]
        hstack:
        [10 20 30 40  4  3  2  1]
        Split:
        [array([10, 20]), array([30, 40])]
        [array([10]), array([20]), array([30]), array([40])]
        Copy:
        [10 20 30 40]
        
      • 线性方程组

        import numpy as np
        from numpy.linalg import *
        
        def main():
            """Print an identity matrix, then the inverse, transpose, determinant
            and eigen-decomposition of the 2x2 matrix [[1, 2], [3, 4]]."""
            identity=np.eye(3)
            print(identity)

            matrix=np.array([[1,2],[3,4]])
            # Each result is printed under its own heading, in this order.
            sections=(
                ("Inv:",inv(matrix)),
                ("T:",matrix.transpose()),
                ("Det:",det(matrix)),
                ("Eig:",eig(matrix)),
            )
            for heading,value in sections:
                print(heading)
                print(value)

        if __name__=="__main__":
            main()
        
        执行结果:
        [[ 1.  0.  0.]
         [ 0.  1.  0.]
         [ 0.  0.  1.]]
        Inv:
        [[-2.   1. ]
         [ 1.5 -0.5]]
        T:
        [[1 3]
         [2 4]]
        Det:
        -2.0
        Eig:
        (array([-0.37228132,  5.37228132]), array([[-0.82456484, -0.41597356],
               [ 0.56576746, -0.90937671]]))
        
      • numpy其他方面应用

        import numpy as np
        from numpy.linalg import *
        
        def main():
            """Show three further NumPy features: the discrete Fourier transform,
            the correlation-coefficient matrix and polynomial objects."""
            samples=np.array([1,1,1,1,1,1,1,1])
            spectrum=np.fft.fft(samples)
            print("FFT:")
            print(spectrum)

            correlation=np.corrcoef([1,0,1],[0,2,1])
            print("Coef:")
            print(correlation)

            # Coefficients are given from the highest power down: 2x^2 + 1x + 3
            polynomial=np.poly1d([2,1,3])
            print("Poly:")
            print(polynomial)

        if __name__=="__main__":
            main()
        
        执行结果:
        FFT:
        [ 8.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j]
        Coef:
        [[ 1.        -0.8660254]
         [-0.8660254  1.       ]]
        Poly:
           2
        2 x + 1 x + 3
        
    • matplotlib

      • 概述

        matplotlib是关键的绘图库。

      • 实现

        import numpy as np
        import matplotlib.pyplot as plt
        
        def main():
            """Draw a styled line chart, then a grid of common chart types.

            Two figures are shown one after the other:

            1. cos/sin curves with the axes moved to the origin;
            2. one figure laid out on a 3x3 grid holding scatter, bar, pie, polar,
               heat-map and 3-D scatter subplots, with a filled-contour plot
               spanning the bottom third.
            """
            #line
            x=np.linspace(-np.pi,np.pi,256,endpoint=True)
            c,s=np.cos(x),np.sin(x)
            plt.figure(1)
            plt.plot(x,c,color="blue",linewidth=1.0,linestyle="-",label="COS",alpha=0.5)
            plt.plot(x,s,"r*",label="SIN")
            plt.title("COS & SIN")
            ax=plt.gca()
            # Hide the top/right borders and move the other two to the origin so
            # the plot looks like a textbook coordinate system.
            ax.spines["right"].set_color("none")
            ax.spines["top"].set_color("none")
            ax.spines["left"].set_position(("data",0))
            ax.spines["bottom"].set_position(("data",0))
            ax.xaxis.set_ticks_position("bottom")
            ax.yaxis.set_ticks_position("left")
            plt.show()

            # Every remaining chart is a subplot of this one figure. The figure has
            # to stay open until all subplots are added, so plt.show() is called a
            # single time at the very end: a blocking show() returns only after the
            # window is closed, after which further subplots added to `fig` are
            # never displayed.
            fig=plt.figure()

            #scatter
            ax=fig.add_subplot(3,3,1)
            n=128
            X=np.random.normal(0,1,n)
            Y=np.random.normal(0,1,n)
            T=np.arctan2(Y,X)  # colour every point by its angle around the origin
            ax.scatter(X,Y,s=75,c=T,alpha=0.5)
            ax.set_xlim(-1.5,1.5)
            ax.set_xticks([])
            ax.set_ylim(-1.5,1.5)
            ax.set_yticks([])
            ax.set_title("scatter")
            ax.set_xlabel("x")
            ax.set_ylabel("y")

            #bar
            ax=fig.add_subplot(3,3,2)
            n=10
            X=np.arange(n)
            Y1=(1-X/float(n))*np.random.uniform(0.5,1.0,n)
            Y2=(1-X/float(n))*np.random.uniform(0.5,1.0,n)
            ax.bar(X,+Y1,facecolor='#9999ff',edgecolor='white')
            ax.bar(X,-Y2,facecolor='#9999ff',edgecolor='white')
            for xi,yi in zip(X,Y1):
                ax.text(xi+0.4,yi+0.05,'%.2f' % yi,ha='center',va='bottom')
            for xi,yi in zip(X,Y2):
                # Anchor by the top edge so the label hangs below the downward bar
                # instead of being drawn over it.
                ax.text(xi+0.4,-yi-0.05,'%.2f' % yi,ha='center',va='top')

            #Pie
            ax=fig.add_subplot(3,3,3)
            n=20
            Z=np.ones(n)
            Z[-1]*=2  # make the last slice twice as large as the others
            ax.pie(Z,explode=Z*.05,colors=['%s' % (i / float(n)) for i in range(n)],
                   labels=['%.2f' % (i / float(n)) for i in range(n)])
            ax.set_aspect('equal')
            ax.set_xticks([])
            ax.set_yticks([])

            #polar
            # polar=True is what turns this subplot into a polar chart; without it
            # theta/radii are drawn as an ordinary x/y line.
            ax=fig.add_subplot(3,3,4,polar=True)
            n=20
            theta=np.arange(0.0,2*np.pi,2*np.pi/n)
            radii=10*np.random.rand(n)
            ax.plot(theta,radii)

            #heatmap
            ax=fig.add_subplot(3,3,5)
            from matplotlib import cm
            data=np.random.rand(3,3)
            ax.imshow(data,interpolation='nearest',cmap=cm.Blues,aspect='auto',vmin=0,vmax=1)

            #hot map (filled contours, spanning the bottom third of the figure)
            ax=fig.add_subplot(3,1,3)
            def f(x,y):
                return (1-x/2+x**5+y**3)*np.exp(-x**2-y**2)
            n=256
            x=np.linspace(-3,3,n)
            y=np.linspace(-3,3,n)
            X,Y=np.meshgrid(x,y)
            ax.contourf(X,Y,f(X,Y),8,alpha=.75,cmap=plt.cm.hot)

            #3D
            # Importing Axes3D registers the "3d" projection on matplotlib < 3.2;
            # newer releases register it automatically and the import is harmless.
            from mpl_toolkits.mplot3d import Axes3D
            ax=fig.add_subplot(3,3,6,projection="3d")
            ax.scatter(1,1,3,s=100)

            plt.show()

        if __name__=="__main__":
            main()
        
    • scipy

      • 简介

        数值计算库

      • 积分

        程序:
        import numpy as np
        from scipy.integrate import quad,dblquad,nquad
        
        def main():
            """Evaluate a single, a double and an n-fold integral with
            scipy.integrate and print each (value, error estimate) pair."""
            # Integral
            # Single integral of exp(-x) over [0, inf)
            def decay(x):
                return np.exp(-x)
            print(quad(decay,0,np.inf))

            # Double integral: the outer variable x runs over [0, inf), the inner
            # variable t over [1, inf)
            def integrand(t,x):
                return np.exp(-x*t)/t**3
            def t_lower(x):
                return 1
            def t_upper(x):
                return np.inf
            print(dblquad(integrand,0,np.inf,t_lower,t_upper))

            # n-fold integral of x*y: y runs over [0, 0.5] and the range of x
            # depends on y
            def f(x,y):
                return x*y
            def bound_y():
                return [0,0.5]
            def bound_x(y):
                return [0,1-2*y]
            ranges=[bound_x,bound_y]
            print(nquad(f,ranges))

        if __name__=="__main__":
            main()
        
        执行结果:
        (1.0000000000000002, 5.842607038578007e-11)
        (0.3333333333366853, 1.3888461883425516e-08)
        (0.010416666666666668, 4.101620128472366e-16)
        
      • 优化器

        import numpy as np
        from scipy.optimize import minimize
        
        def main():
            """Minimise the Rosenbrock function with the Nelder-Mead simplex method
            and print the optimisation result."""
            # Optimizer
            def rosen(x):
                """Rosenbrock function; its minimum value 0 is reached at x = (1, ..., 1)."""
                return sum(100.0*(x[1:]-x[:-1]**2.0)**2.0+(1-x[:-1])**2.0)
            x0=np.array([1.3,0.7,0.8,1.9,1.2])
            # For Nelder-Mead the absolute tolerance on x is the option "xatol".
            # The old key "xtol" is deprecated and, on current SciPy, reported as
            # an unknown option and ignored, so the tolerance would not apply.
            res=minimize(rosen,x0,method="nelder-mead",options={"xatol":1e-8,"disp":True})
            print("ROSE MINI:", res)

        if __name__=="__main__":
            main()
        
        执行结果:
        Optimization terminated successfully.
                     Current function value: 0.000000
                 Iterations: 339
                 Function evaluations: 571
        ROSE MINI:  final_simplex: (array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
               [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
               [ 1.        ,  1.        ,  1.        ,  1.00000001,  1.00000001],
               [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
               [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
               [ 1.        ,  1.        ,  1.        ,  1.        ,  0.99999999]]), array([  4.86115343e-17,   7.65182843e-17,   8.11395684e-17,
                 8.63263255e-17,   8.64080682e-17,   2.17927418e-16]))
                   fun: 4.8611534334221152e-17
               message: 'Optimization terminated successfully.'
                  nfev: 571
                   nit: 339
                status: 0
               success: True
                     x: array([ 1.,  1.,  1.,  1.,  1.])
        
      • 插值

        import numpy as np
        import matplotlib.pyplot as plt
        from scipy.interpolate import interp1d
        from scipy.optimize import root

        def main():
            """Find a root of x + 2*cos(x), then cubic-interpolate a sampled sine wave.

            Prints the root and the residual there, shows the 10 original samples
            (red) together with the 50-point interpolated curve (black), and
            finally prints the interpolated values.
            """
            def fun(x):
                return x+2*np.cos(x)
            # root() comes from scipy.optimize; 0.1 is the starting guess.
            sol=root(fun,0.1)
            print("ROOT:",sol.x,sol.fun)
            #Interpolation
            x=np.linspace(0,1,10)
            y=np.sin(2*np.pi*x)
            # The class is interp1d (digit one), not "interpld".
            li=interp1d(x,y,kind="cubic")
            x_new=np.linspace(0,1,50)
            y_new=li(x_new)
            plt.figure()
            plt.plot(x,y,"r")
            plt.plot(x_new,y_new,"k")
            plt.show()
            print(y_new)

        if __name__=="__main__":
            main()
        
      • 线性计算与矩阵分解

        程序:
        import numpy as np
        from scipy import linalg as lg
        
        def main():
            """Run the common scipy.linalg routines on the matrix [[1, 2], [3, 4]]
            and print every result after its label."""
            matrix=np.array([[1,2],[3,4]])
            rhs=np.array([6,14])

            # Determinant and inverse
            for label,routine in (("Det:",lg.det),("Inv:",lg.inv)):
                print(label,routine(matrix))

            # Solution x of the linear system matrix @ x = rhs
            print("Sol:",lg.solve(matrix,rhs))

            # Eigen-decomposition followed by the matrix factorisations
            factorisations=(
                ("Eig:",lg.eig),
                ("LU:",lg.lu),
                ("QR:",lg.qr),
                ("SVD:",lg.svd),
                ("Schur:",lg.schur),
            )
            for label,routine in factorisations:
                print(label,routine(matrix))

        if __name__=="__main__":
            main()
        
        执行结果:
        Det: -2.0
        Inv: [[-2.   1. ]
         [ 1.5 -0.5]]
        Sol: [ 2.  2.]
        Eig: (array([-0.37228132+0.j,  5.37228132+0.j]), array([[-0.82456484, -0.41597356],
               [ 0.56576746, -0.90937671]]))
        LU: (array([[ 0.,  1.],
               [ 1.,  0.]]), array([[ 1.        ,  0.        ],
               [ 0.33333333,  1.        ]]), array([[ 3.        ,  4.        ],
               [ 0.        ,  0.66666667]]))
        QR: (array([[-0.31622777, -0.9486833 ],
               [-0.9486833 ,  0.31622777]]), array([[-3.16227766, -4.42718872],
               [ 0.        , -0.63245553]]))
        SVD: (array([[-0.40455358, -0.9145143 ],
               [-0.9145143 ,  0.40455358]]), array([ 5.4649857 ,  0.36596619]), array([[-0.57604844, -0.81741556],
               [ 0.81741556, -0.57604844]]))
        Schur: (array([[-0.37228132, -1.        ],
               [ 0.        ,  5.37228132]]), array([[-0.82456484, -0.56576746],
               [ 0.56576746, -0.82456484]]))
        
    • pandas

      • 简介

        数据分析库

      • 基础数据分析技术

        import numpy as np
        import pandas as pd
        
        def main():
            """Walk through basic pandas usage and print every intermediate result.

            Sections, in order: building a Series/DataFrame, inspecting and sorting,
            selecting rows/columns/cells, assigning values, handling missing
            values, and combining frames with concat / merge / groupby.
            """
            #Data Structure
            s=pd.Series([i*2 for i in range(1,11)])
            print(type(s))
            dates=pd.date_range("20170301",periods=8)
            df=pd.DataFrame(np.random.randn(8,5),index=dates,columns=list("ABCDE"))
            print(df)
            #Basic
            print(df.head(3))
            print(df.tail(3))
            print(df.index)
            print(df.values)
            print(df.T)
            # DataFrame.sort() was removed from pandas; sort_values() replaces it.
            print(df.sort_values(by="C"))
            print(df.sort_index(axis=1,ascending=False))
            print(df.describe())
            #Select
            print(type(df["A"]))
            print(df[:3])
            print(df["20170301":"20170304"])
            print(df.loc[dates[0]])
            print(df.loc["20170301":"20170304",["B","D"]])
            print(df.iloc[1:2,2:4])
            print(df.iloc[1,4])
            # Combine both conditions into one mask; chaining df[...][...] applies
            # the second mask to an already filtered frame and triggers a
            # reindexing warning.
            print(df[(df.B>0)&(df.A<0)])
            print(df[df>0])
            print(df[df["E"].isin([1,2])])

            #Set
            s1=pd.Series(list(range(10,18)),index=pd.date_range("20170301",periods=8))
            df["F"]=s1
            print(df)
            df.at[dates[0],"A"]=0
            print(df)
            df.iat[1,1]=1
            df.loc[:,"D"]=np.array([4]*len(df))
            df2=df.copy()
            # Flip the sign of every positive entry. Assigning df2 itself (as the
            # original text did) changes nothing; the minus sign appears to have
            # been lost when the snippet was copied.
            df2[df2>0]=-df2
            print(df2)

            #Missing Value
            # Reindexing with the new column "G" creates it filled with NaN.
            df1=df.reindex(index=dates[:4],columns=list("ABCD")+["G"])
            df1.loc[dates[0]:dates[1],"G"]=1
            print(df1)
            print(df1.dropna())
            print(df1.fillna(value=2))

            #Concat
            pieces=[df[:3],df[-3:]]
            print(pd.concat(pieces))
            left=pd.DataFrame({"key":["x","y"],"value":[1,2]})
            right=pd.DataFrame({"key":["x","z"],"value":[3,4]})
            print("LEFT:",left)
            print("RIGHT:",right)
            print(pd.merge(left,right,on="key",how="left"))
            df3=pd.DataFrame({"A":["a","b","c","b"],"B":list(range(4))})
            print(df3.groupby("A").sum())

        if __name__=="__main__":
            main()
        
      • 时间、绘图

        import numpy as np
        import pandas as pd
        from pylab import *
        
        def main():
            """Print a second-resolution date range, then plot a cumulative random walk."""
            #Time Series
            # Lower-case "s" is the seconds alias; upper-case "S" is deprecated
            # since pandas 2.2, while "s" is accepted by older releases as well.
            t_exam=pd.date_range("20170301",periods=10,freq="s")
            print(t_exam)

            #Graph
            # 1000 daily values whose running sum gives a random-walk curve
            ts=pd.Series(np.random.randn(1000),index=pd.date_range("20170301",periods=1000))
            ts=ts.cumsum()
            ts.plot()
            show()  # provided by `from pylab import *`

        if __name__=="__main__":
            main()
        
    • scikit-learn

      • 简介

        数据挖掘建模、机器学习

      • 机器学习与决策树

        机器学习:因子–>结果

        结果:

        不带标记–>无监督学习(聚类);带标记–>监督学习

        有限离散–>分类;连续–>回归

        决策树:监督学习;树形结构

      • Iris数据集

        • 花萼长度
        • 花萼宽度
        • 花瓣长度
        • 花瓣宽度
        • 种类:Iris Setosa(山鸢尾)、Iris Versicolour(杂色鸢尾)、Iris Virginica(维吉尼亚鸢尾)
      • 实现

        import numpy as np
        import pandas as pd
        from sklearn.datasets import load_iris
        # sklearn.cross_validation was removed in scikit-learn 0.20;
        # train_test_split now lives in sklearn.model_selection.
        from sklearn.model_selection import train_test_split
        from sklearn import tree
        from sklearn import metrics

        def main():
            """Train an entropy-based decision tree on Iris and print its test
            accuracy and confusion matrix."""
            #Pre-processing
            iris=load_iris()
            print(iris)
            print(len(iris["data"]))
            # Hold out 20% of the samples for testing; random_state fixes the split.
            train_data,test_data,train_target,test_target=train_test_split(iris.data,iris.target,test_size=0.2,random_state=1)

            #Model
            clf=tree.DecisionTreeClassifier(criterion="entropy")
            clf.fit(train_data,train_target)
            y_pred=clf.predict(test_data)

            #Verify
            print(metrics.accuracy_score(y_true=test_target,y_pred=y_pred))
            print(metrics.confusion_matrix(y_true=test_target,y_pred=y_pred))

        if __name__=="__main__":
            main()
        
    • keras

      • 简介

        人工神经网络

      • 简单神经网络实现

        Keras安装步骤:Anaconda CMD;conda install mingw libpython;pip install keras;pip install np_utils

      • 实例

        注意:需要修改C:/Users/username/.keras/keras.json,修改后的内容如下:{"backend": "theano", "image_data_format": "th", "epsilon": 1e-07, "floatx": "float32"}。

        import numpy as np
        from keras.models import Sequential
        from keras.layers import Dense,Activation
        from keras.optimizers import SGD
        from sklearn.datasets import load_iris
        from sklearn.preprocessing import LabelBinarizer
        # sklearn.cross_validation was removed in scikit-learn 0.20;
        # train_test_split now lives in sklearn.model_selection.
        from sklearn.model_selection import train_test_split

        def main():
            """Train a small dense network on Iris and print the predicted class
            index for every sample of the held-out test split."""
            iris=load_iris()
            print(iris["target"])
            train_data,test_data,train_target,test_target=train_test_split(iris.data,iris.target,test_size=0.2,random_state=1)
            # One-hot encode the integer training labels for the 3-unit output layer.
            labels_train=LabelBinarizer().fit_transform(train_target)

            # 4 input features -> 5 hidden units (relu) -> 3 output units (sigmoid)
            model=Sequential(
                    [
                            Dense(5,input_dim=4),
                            Activation("relu"),
                            Dense(3),
                            Activation("sigmoid"),
                    ]
                    )
            # Optimizer
            # NOTE(review): newer Keras releases rename `lr` to `learning_rate` and
            # drop `decay`; adjust these arguments to the installed version.
            sgd=SGD(lr=0.01,decay=1e-6,momentum=0.9,nesterov=True)
            model.compile(optimizer=sgd,loss="categorical_crossentropy")
            # `epochs` is the Keras 2 name of the former `nb_epoch` argument.
            model.fit(train_data,labels_train,epochs=200,batch_size=40)
            # The index of the largest output is the predicted class; this is what
            # Sequential.predict_classes() computed before it was removed.
            print(np.argmax(model.predict(test_data),axis=1))

        if __name__=="__main__":
            main()
        
    • 参考文献

      http://www.imooc.com/learn/843

你可能感兴趣的:(Python学习)