利用Python 进行数据分析 Python_for_data_analysis 笔记

 

利用Python 进行数据分析  徐敬一  译

Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython  Author: Wes McKinney

 

1.3 重要的Python 库

  1. NumPy
  2. pandas
  3. matplotlib
  4. IPython 与 Jupyter
  5. SciPy
  6. scikit-learn
  7. statsmodels

第二章 Python 语言基础

第三章 内建数据结构,函数及文件

3.1.1元组

3.1.2 列表

3.1.3 内建序列函数

3.1.4字典

3.1.5集合

3.2 函数

 

第四章 Numpy

 

#chapter4 numpy basic 
from numpy import *
# Bind the conventional alias first so every call below is namespace-qualified
# and does not depend on names injected by a wildcard import.
import numpy as np

np.eye(4)  # 4x4 identity matrix

# 2x3 array of samples drawn from the standard normal distribution.
data = np.random.randn(2, 3)
data

data.shape  # (2, 3)
data.dtype  # float64

# 4.1.1 Creating ndarrays

# A flat list becomes a 1-D array; the mixed ints/floats end up as float64.
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1

# A list of equal-length lists becomes a 2-D array.
data2 = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
]
arr2 = np.array(data2)
arr2


arr2.ndim   # 2
arr2.shape  # (2, 4)

# Creation helpers: zero-filled 1-D and 2-D arrays, and an uninitialised
# 3-D array.
np.zeros(10)
np.zeros((3, 6))
np.empty((2, 3, 2))

# Array-valued counterpart of the built-in range().
np.arange(15)




#深度学习入门 4.3.2 数值微分的例子

import numpy as np
import matplotlib.pylab as plt

def function_1(x):
    """Evaluate the quadratic f(x) = 0.01 * x**2 + 0.1 * x.

    Works on anything that supports ``*``, ``**`` and ``+``, e.g. a plain
    number or a NumPy array (evaluated element-wise).
    """
    quadratic_term = 0.01 * x ** 2
    linear_term = 0.1 * x
    return quadratic_term + linear_term

# Sample f(x) on [0, 20) in steps of 0.1.
x = np.arange(0.0, 20.0, 0.1)
y = function_1(x)

# Label both axes, draw the curve and show the figure.
plt.xlabel("x")
plt.ylabel("f(x)")
plt.plot(x, y)
plt.show()



# 4.1.2 ndarray data types
# 4.1.3 Arithmetic with NumPy arrays
# 4.1.4 Basic indexing and slicing

arr = np.arange(10)
arr
arr[5]    # single element: 5
arr[5:8]  # slice: array([5, 6, 7])
# 4.1.5 Boolean indexing

# One name per row of ``data``.  'Will' is capitalised the same way in both
# entries so that an equality test against 'Will' matches both of them.
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = np.random.randn(7, 4)
names

data
names == 'Bob'        # boolean mask, True where the name is 'Bob'
data[names == 'Bob']  # rows of data picked out by the mask


# 4.1.6 Fancy indexing


# Build an 8x4 array whose i-th row is filled with the value i.
arr = np.empty((8, 4))
for i in range(len(arr)):
    arr[i, :] = i
arr

# Rebind arr to the integers 0..31 arranged as 8 rows of 4.
arr = np.arange(32).reshape(8, 4)
arr

# 4.1.7 Transposing arrays and swapping axes
arr = np.arange(15).reshape(3, 5)
arr
arr.T  # the 5x3 transpose

# Inner-product matrix of a random 6x3 array with itself (3x3 result).
arr = np.random.randn(6, 3)
arr.T.dot(arr)

# 4.2 Universal functions (ufuncs): fast element-wise array functions

arr = np.arange(10)
arr
np.sqrt(arr)  # element-wise square root
np.exp(arr)   # element-wise e**x


# 4.3 Array-oriented programming with arrays
points = np.arange(-5, 5, 0.01)  # 1000 equally spaced points
xs, ys = np.meshgrid(points, points)
ys

# Distance of every grid point from the origin.
z = np.sqrt(xs ** 2 + ys ** 2)
z
import matplotlib.pyplot as plt
# Render z as a greyscale image, then add a colour bar.
plt.imshow(z, cmap=plt.cm.gray)
plt.colorbar()


# 4.3.2 Mathematical and statistical methods
# 4.5 Linear algebra
x = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
y = np.array([[6.0, 23.0], [-1, 7], [8, 9]])

x
y
# Matrix product of the 2x3 and 3x2 arrays, first as a method and then as a
# function call.
x.dot(y)
np.dot(x, y)

from numpy.linalg import inv,qr
# Product of a random 5x5 matrix's transpose with the matrix itself.
X = np.random.randn(5, 5)
mat = np.dot(X.T, X)
inv(mat)
np.dot(mat, inv(mat))  # approximately the identity matrix

# QR decomposition of mat.
q, r = qr(mat)
r

# 4.6 Pseudorandom number generation

# 4x4 array of draws from the normal distribution (default parameters).
samples = np.random.normal(size=(4, 4))
samples
# 4.7 Random walks

import random

# Simulate 1000 coin flips starting at 0: a 1 moves one step up, a 0 moves
# one step down.  ``walk`` records the position after every step.
steps = 1000
position = 0
walk = [position]
for i in range(steps):
    step = 2 * random.randint(0, 1) - 1  # maps 1 -> +1 and 0 -> -1
    position += step
    walk.append(position)

# Plot the first 100 recorded positions of the walk.
plt.plot(walk[0:100])

第五章pandas入门

#chapter 5 pandas 入门

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

# A Series is a one-dimensional array-like object; a DataFrame is a
# rectangular table of data.

obj = pd.Series([4, 7, -5, -3])
obj
obj.values
obj.index  # default integer index, analogous to range(4)

# Supply explicit labels instead of the default integer index.
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2
obj2.index
obj2['a']       # select by label
obj2[obj2 > 0]  # boolean filtering keeps the labels
obj2 * 2
np.exp(obj2)

# A dict maps straight onto a Series: keys become the index.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}

obj3 = pd.Series(sdata)
obj3
states = ['California', 'Ohio', 'Oregon', 'Texas']

# With an explicit index, labels missing from the dict ('California') become
# NaN and dict keys not listed ('Utah') are left out.
obj4 = pd.Series(sdata, index=states)
obj4

pd.isnull(obj4)
pd.notnull(obj4)
obj4.isnull()

# Both the Series and its index can carry a name.
obj4.name = 'population'
obj4.index.name = 'state'
obj4

# Replace the index of obj in place by assignment.
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

# 5.1.2 DataFrame
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)

frame
frame.head()
# Pick the column order explicitly.
pd.DataFrame(data, columns=['year', 'state', 'pop'])

# 'debt' is not a key of data, so that column starts out as missing values.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)

frame2

frame2.columns
frame2['state']      # column by key
frame2.year          # column by attribute
frame2.loc['three']  # row by label
frame2['debt'] = 16.5  # a scalar is broadcast to every row
frame2
frame2['debt'] = np.arange(6.)  # an array must match the number of rows
frame2

# Assigning a Series aligns on the index; rows it does not mention get NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2


# Assigning to a column that does not exist yet creates it.
frame2['eastern'] = frame2.state == 'Ohio'


import numpy as np
import pandas as pd 
from pandas import Series ,DataFrame
# 5.1.3 Index objects
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index

index[1:]

# Index objects are immutable: item assignment raises TypeError.  Catch it so
# the demonstration does not abort the rest of the script.
try:
    index[1] = 'd'
except TypeError as exc:
    print('Index is immutable:', exc)

labels = pd.Index(np.arange(3))
labels

# Build a Series on top of an existing Index object.
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2

obj2.index is labels


# 5.2 Essential functionality


# 5.2.1 Reindexing

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
# Rearrange the values to follow the new labels; 'e' does not exist in obj,
# so it is introduced with a missing value (NaN).
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

# Forward-fill: each new label takes the value of the preceding label.
obj3.reindex(range(6), method='ffill')

frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)

frame
# Reindexing the rows introduces an all-NaN row for the new label 'b'.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

# Columns are reindexed with the ``columns`` keyword.
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

# Reindex rows and columns in one go.  .loc with a list of labels raises
# KeyError when any label is missing (here 'b' and 'Utah'), so reindex is
# used: it inserts NaN for the missing labels instead.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

# 5.2.2 Dropping entries from an axis

obj = pd.Series(np.arange(5.), index=list('abcde'))
obj

# drop returns a new object without the given label(s).
new_obj = obj.drop('c')

new_obj

obj.drop(['d', 'c'])

data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

data

# Row labels are dropped by default; pass axis=1 / axis='columns' to drop
# column labels instead.
data.drop(['Colorado', 'Ohio'])
data.drop('two', axis=1)

data.drop(['two', 'four'], axis='columns')
# inplace=True modifies obj itself instead of returning a new Series.
obj.drop('c', inplace=True)
obj

# 5.2.3 Indexing, selection and filtering

obj = pd.Series(np.arange(4.), index=list('abcd'))
obj
obj['b']

obj[obj < 2]

data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)


data

data[['three', 'one']]  # a list of keys selects columns
data[:2]                # a slice selects rows

data[data['three'] > 5]  # a boolean Series selects rows

data < 5

# Overwrite every element smaller than 5 with 0.
data[data < 5] = 0

data

# .loc selects by label, .iloc by integer position.
data.loc['Colorado', ['two', 'three']]

data.iloc[2, [3, 0, 1]]

data.iloc[2]

data.iloc[[1, 2], [3, 0, 1]]

# Slicing
data.loc[:'Utah', 'two']

data.iloc[:, :3][data.three > 5]

# 5.2.4 Integer indexes
ser = pd.Series(np.arange(3.))
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

# ser2 carries string labels, so positional access goes through .iloc;
# plain [] with an integer key relies on a deprecated positional fallback.
ser2.iloc[-1]
ser2.iloc[:1]

# .loc is label-based, so ser2 is sliced by label.  An integer bound such as
# .loc[:1] raises TypeError on a string index.
ser2.loc[:'b']
# On the integer-labelled ser, .loc[:1] includes label 1 (end-inclusive)
# while .iloc[:1] stops before position 1.
ser.loc[:1]
ser.iloc[:1]

# 5.2.5 Arithmetic and data alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

s1
s2
# The result is indexed by the union of the labels; a label present on only
# one side yields NaN.
s1 + s2

df1 = pd.DataFrame(
    np.arange(9.).reshape(3, 3),
    columns=list('bcd'),
    index=['Ohio', 'Texas', 'Colorado'],
)

df2 = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)

df1
df2

# For DataFrames the alignment happens on both rows and columns.
df1 + df2

# 5.2.5.2 Operations between DataFrame and Series
# 5.2.6 Function application and mapping

frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)


frame

np.abs(frame)  # NumPy ufuncs work element-wise on a DataFrame


def f(x):
    """Return the spread of *x*: its maximum minus its minimum."""
    return x.max() - x.min()


frame.apply(f)  # called once per column

frame.apply(f, axis='columns')  # called once per row

def f(x):
    """Summarise *x* as a two-entry Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
frame.apply(f)


def format_2dp(x):
    """Render *x* as a string with two decimal places.

    Deliberately not named ``format`` so the built-in of that name stays
    available.
    """
    return '%.2f' % x


# Element-wise formatting of the whole DataFrame.  DataFrame.applymap was
# renamed to DataFrame.map in pandas 2.1; use the new name when it exists and
# fall back to applymap on older versions.
if hasattr(frame, 'map'):
    frame.map(format_2dp)
else:
    frame.applymap(format_2dp)

# Series.map applies the function element-wise to a single column.
frame['e'].map(format_2dp)

#5.2.7 排序和排名

 

第六章 数据载入,存储,文件格式

第七章 数据清洗 准备

第八章数据规整:连接,联合与重塑

第九章绘图与可视化

 

 

你可能感兴趣的:(利用Python,进行数据分析,python,数据分析)