利用Python 进行数据分析 徐敬一 译
Python for Data Analysis: Data Wrangling with pandas, NumPy, and IPython — Author: Wes McKinney
1.3 重要的Python 库
第二章 Python 语言基础
第三章 内建数据结构,函数及文件
3.1.1元组
3.1.2 列表
3.1.3 内建序列函数
3.1.4字典
3.1.5集合
3.2 函数
第四章 Numpy
# Chapter 4: NumPy basics
# NOTE(review): the star import below copies every public NumPy name into this
# namespace; the bare `eye` call on the next line only resolves because of it.
from numpy import *
eye(4)  # 4x4 identity matrix via the star-imported numpy.eye
import numpy as np
# 2x3 array of samples drawn from the standard normal distribution.
data = np.random.randn(2,3)
data
data.shape  # tuple with the size of each dimension
data.dtype  # element type of the array
# 4.1.1 Creating ndarrays
# A flat Python list becomes a one-dimensional array.
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1

# A list of equal-length lists becomes a two-dimensional array.
data2 = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
]
arr2 = np.array(data2)
arr2
arr2.ndim
arr2.shape

# Constructors that allocate an array of a requested shape.
np.zeros(10)
np.zeros((3, 6))
np.empty((2, 3, 2))  # contents are uninitialized

# Array counterpart of the built-in range.
np.arange(15)
#深度学习入门 4.3.2 数值微分的例子
import numpy as np
import matplotlib.pylab as plt
def function_1(x):
    """Evaluate the quadratic f(x) = 0.01*x**2 + 0.1*x.

    Only arithmetic operators are applied to *x*, so it may be a scalar or
    anything that supports ``**``, ``*`` and ``+`` element-wise (for example
    a NumPy array); the result has the same kind as the input.
    """
    return 0.01*x**2 + 0.1*x
# Sample function_1 over the range below and plot the resulting curve.
x=np.arange(0.0,20.0,0.1) # array x from 0 up to (not including) 20 in steps of 0.1
y=function_1(x)
plt.xlabel("x")
plt.ylabel("f(x)")
plt.plot(x,y)
plt.show()
# 4.1.2 ndarray data types
# 4.1.3 Arithmetic with NumPy arrays
# 4.1.4 Basic indexing and slicing
arr = np.arange(10)
arr
arr[5]    # single element
arr[5:8]  # slice covering positions 5, 6 and 7
# 4.1.5 Boolean indexing
# One label per row of `data`. Labels repeat on purpose so that a comparison
# such as `names == 'Bob'` selects several rows at once. Both 'Will' entries
# use the same capitalisation so they compare equal to each other.
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = np.random.randn(7, 4)
names
data
names == 'Bob'        # boolean mask, one entry per name
data[names == 'Bob']  # rows of `data` where the mask is True
# 4.1.6 Fancy indexing
# Start from an uninitialized 8x4 array and fill every row with its own index.
arr = np.empty((8, 4))
for i in range(8):
    arr[i] = i  # scalar is broadcast across the whole row
arr

# Rebind `arr` to the integers 0..31 laid out as 8 rows of 4.
arr = np.arange(32).reshape((8, 4))
arr
# 4.1.7 Transposing arrays and swapping axes
arr = np.arange(15).reshape((3, 5))
arr
arr.T  # rows and columns exchanged: shape (5, 3)

# Matrix product of a random 6x3 array's transpose with the array itself.
arr = np.random.randn(6, 3)
np.dot(arr.T, arr)
# 4.2 Universal functions (ufuncs): fast element-wise array functions
arr = np.arange(10)
arr
np.sqrt(arr)  # element-wise square root
np.exp(arr)   # element-wise exponential
# 4.3 Array-oriented programming with arrays
points = np.arange(-5,5,0.01) # 1000 equally spaced points
# Expand the 1-D coordinates into two 2-D grids, one per axis.
xs,ys = np.meshgrid(points,points)
ys
# sqrt(x^2 + y^2) evaluated at every grid position.
z= np.sqrt(xs**2+ys**2)
z
import matplotlib.pyplot as plt
# Show z as a grayscale image and attach a colorbar to the figure.
plt.imshow(z,cmap = plt.cm.gray);plt.colorbar()
# 4.3.2 Mathematical and statistical methods
# 4.5 Linear algebra
x = np.array([[1., 2., 3.],
              [4., 5., 6.]])
y = np.array([[6., 23.],
              [-1, 7],
              [8, 9]])
x
y
# Matrix product, once as a method and once as a function.
x.dot(y)
np.dot(x, y)

from numpy.linalg import inv, qr

# Product of a random 5x5 matrix's transpose with itself, then its inverse
# and its QR factorization.
X = np.random.randn(5, 5)
mat = X.T.dot(X)
inv(mat)
mat.dot(inv(mat))  # approximately the identity matrix
q, r = qr(mat)
r
# 4.6 Pseudorandom number generation
# 4x4 array of draws from the normal distribution (default parameters).
samples = np.random.normal(size=(4, 4))
samples
# 4.7 Random walks
import random

# Simple random walk: start at 0 and move by +1 or -1 on every step.
# `walk` records the position after each step, including the start.
position = 0
walk = [position]
steps = 1000
for _ in range(steps):
    # randint(0, 1) yields 0 or 1; 1 is truthy -> step up, 0 -> step down.
    step = 1 if random.randint(0, 1) else -1
    position += step
    walk.append(position)
# Plot the first 100 recorded positions of the random walk.
plt.plot(walk[:100])
第五章pandas入门
#chapter 5 pandas 入门
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
# Series: one-dimensional array-like object; DataFrame: tabular (matrix-like) data
obj = pd.Series([4, 7, -5, -3])
obj
obj.values
obj.index  # default integer index, comparable to range(4)

# Series with explicit string labels.
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2
obj2.index
obj2['a']        # lookup by label
obj2[obj2 > 0]   # boolean filtering keeps the labels
obj2 * 2
np.exp(obj2)

# Building a Series from a dict: the keys become the index.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(sdata)
obj3

# Passing an explicit index selects and orders the entries; labels that are
# absent from the dict ('California') end up as missing values.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4
pd.isnull(obj4)
pd.notnull(obj4)
obj4.isnull()

# Both the Series and its index carry a `name` attribute.
obj4.name = 'population'
obj4.index.name = 'state'
obj4

# The index can be replaced wholesale by assignment.
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj
# 5.1.2 DataFrame
# Dict of equal-length lists: each key becomes a column.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
frame
frame.head()

# `columns` fixes the column order.
pd.DataFrame(data, columns=['year', 'state', 'pop'])

# A requested column that is missing from the dict ('debt') is created empty.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame2
frame2.columns
frame2['state']      # column by key
frame2.year          # column by attribute
frame2.loc['three']  # row by label

# Column assignment: a scalar, an array of matching length, then a Series
# (aligned on the index; unmatched rows become missing).
frame2['debt'] = 16.5
frame2
frame2['debt'] = np.arange(6.)
frame2
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

# Assigning to a new key adds a column.
frame2['eastern'] = frame2.state == 'Ohio'
import numpy as np
import pandas as pd
from pandas import Series ,DataFrame
# 5.1.3 Index objects
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index
index[1:]

# Index objects are immutable: item assignment raises TypeError. Catch it so
# the demonstration reports the error instead of aborting the whole script.
try:
    index[1] = 'd'
except TypeError as exc:
    print(f"Index does not support item assignment: {exc}")

# An Index built up front can be handed to a Series constructor.
labels = pd.Index(np.arange(3))
labels
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2
obj2.index is labels
# 5.2 Essential functionality
# 5.2.1 Reindexing
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
# Reindex rearranges the data to the new labels; 'e' is not present in `obj`,
# so its value is introduced as missing (NaN).
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

# With ordered data, method='ffill' forward-fills the gaps created by reindex.
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3
obj3.reindex(range(6), method='ffill')

frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
frame
# Row reindex: the new label 'b' yields a row of missing values.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2
# Column reindex: 'Utah' is new, 'Ohio' is dropped.
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
# Reindex rows and columns together. `.loc` is not used here because it
# raises KeyError when any requested label ('b', 'Utah') is missing.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)
# 5.2.2 Dropping entries from an axis
obj = pd.Series(np.arange(5.), index=list('abcde'))
obj
# drop returns a new object and leaves `obj` untouched.
new_obj = obj.drop('c')
new_obj
obj.drop(['d', 'c'])

data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data
data.drop(['Colorado', 'Ohio'])            # rows (default axis)
data.drop('two', axis=1)                   # a column, axis given by number
data.drop(['two', 'four'], axis='columns') # columns, axis given by name

# inplace=True modifies `obj` itself instead of returning a new Series.
obj.drop('c', inplace=True)
obj
# 5.2.3 Indexing, selection and filtering
obj = pd.Series(np.arange(4.), index=list('abcd'))
obj
obj['b']
obj[obj < 2]

data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data
data[['three', 'one']]    # list of keys selects columns
data[:2]                  # slice selects rows
data[data['three'] > 5]   # boolean Series selects rows
data < 5                  # element-wise comparison gives a boolean frame
data[data < 5] = 0        # assign through a boolean frame
data

# Label-based (loc) and position-based (iloc) selection.
data.loc['Colorado', ['two', 'three']]
data.iloc[2, [3, 0, 1]]
data.iloc[2]
data.iloc[[1, 2], [3, 0, 1]]

# Slicing
data.loc[:'Utah', 'two']
data.iloc[:, :3][data.three > 5]
# 5.2.4 Integer indexes
ser = pd.Series(np.arange(3.))
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
# Positional access on a label-indexed Series goes through iloc; plain
# `ser2[-1]` relies on a positional fallback that pandas has deprecated.
ser2.iloc[-1]
ser2[:1]
# loc is strictly label-based: slice bounds must be labels of the index (an
# integer bound raises TypeError on a string index), and the end label is
# included in the result.
ser2.loc[:'b']
ser.loc[:1]   # label slice on the default integer index: labels 0 and 1
ser.iloc[:1]  # position slice: first element only
# 5.2.5 Arithmetic and data alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1
s2
# Addition aligns on labels; labels present on only one side give NaN.
s1 + s2

df1 = pd.DataFrame(
    np.arange(9.).reshape((3, 3)),
    columns=list('bcd'),
    index=['Ohio', 'Texas', 'Colorado'],
)
df2 = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
df1
df2
# For DataFrames the alignment happens on both rows and columns.
df1 + df2
# 5.2.5.2 Operations between DataFrame and Series
# 5.2.6 Function application and mapping
frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
frame
np.abs(frame)  # NumPy ufuncs work element-wise on a DataFrame

# Spread (max minus min) of a one-dimensional slice of the frame.
f = lambda x: x.max() - x.min()
frame.apply(f)                  # called once per column
frame.apply(f, axis='columns')  # called once per row
def f(x):
    """Return the minimum and maximum of *x* as a two-element Series.

    *x* is any object exposing ``min()`` and ``max()`` methods (for example a
    pandas Series). The result is a ``pd.Series`` labelled ``'min'`` and
    ``'max'``, in that order.
    """
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
# Applying a function that returns a Series yields one result row per label.
frame.apply(f)

# Element-wise formatting of every value as a string with two decimals.
# NOTE(review): the name `format` shadows the built-in format(); it is kept
# unchanged here in case other cells refer to it.
format = lambda x: '%.2f' % x
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas 2.1+);
# use the new name when it exists and fall back on older pandas versions.
_elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
_elementwise(format)
# Series.map applies the function to each element of one column.
frame['e'].map(format)
# 5.2.7 Sorting and ranking
第六章 数据载入,存储,文件格式
第七章 数据清洗 准备
第八章数据规整:连接,联合与重塑
第九章绘图与可视化