【机器学习】pandas基础铺垫

什么是 Pandas?

Pandas 是基于 NumPy 构建的数据处理与分析库,方便对大规模数据进行处理和整理,用法上有点类似数据库中的表操作。
 

基本数据结构

import pandas as pd
import numpy as np


# Series: a one-dimensional labelled array. Labels are shown on the left and
# values on the right; when no index is given the labels default to 0 .. N-1.
values = [1, 3, np.nan, 44, 1]  # np.nan marks a missing value
s = pd.Series(values)
print(s)

# DataFrame: a two-dimensional table with a row index and column labels;
# each column may hold a different value type.
dates = pd.date_range(start="20160101", periods=6)
column_names = list("abcd")
noise = np.random.randn(6, 4)
df = pd.DataFrame(data=noise, index=dates, columns=column_names)  # index -> rows, columns -> columns
print(df)
print(df["b"])

# Without explicit row/column labels both axes are numbered 0, 1, 2, ...
grid = np.arange(12).reshape(3, 4)
df1 = pd.DataFrame(grid)
print(df1)

# Build a DataFrame from a dict: every key becomes a column. The scalar
# entries ("A", "B", "F") are repeated for each of the four rows.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="float32"),
        # Categorical column: the labels can be mapped to integer codes.
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

print(df2)
print(df2.dtypes)

# .codes turns the categorical labels into their integer category numbers.
category_codes = pd.Categorical(df2["E"]).codes
print(category_codes)

print(df2.index, df2.columns)
print(df2.values)  # the raw data only, without labels
print(df2.describe())  # summary statistics for the table
print(df2.T)

# There is no sort_column method; sort_index handles both axes.
# axis=1 orders the column labels, axis=0 orders the row labels;
# ascending=False requests descending order.
print(df2.sort_index(axis="columns", ascending=False))
print(df2.sort_index(axis="index", ascending=False))

# Sort the rows by the values of one column.
print(df2.sort_values("B"))

 

pandas 选择数据

import pandas as pd
import numpy as np

# Sample data: six daily rows, four integer columns holding 0 .. 23.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=["A", "B", "C", "D"])

# Basic selection: one column, a positional row slice, a label row slice.
print(df["A"])
print(df[0:3])
print(df["20130102":"20130104"])

# Label-based selection with loc.
print(df.loc["20130102"])
print(df.loc[:, ["A", "B"]])
print(df.loc["20130102", ["A", "B"]])

# Position-based selection with iloc.
print(df.iloc[3, 1])
print(df.iloc[3:5, 1:3])
print(df.iloc[[1, 3, 5], 1:3])

# Mixed selection: the first three rows (by position) of two named columns.
# The old .ix indexer was removed in pandas 1.0, so the row positions are
# translated to labels with df.index and the lookup is done through .loc.
first_rows_ac = df.loc[df.index[:3], ["A", "C"]]
print(first_rows_ac)

# Boolean filtering: keep the rows whose B value is greater than 10.
print(df[df.B > 10])

 

pandas 设置值

import pandas as pd
import numpy as np

# Create the sample data: six daily rows, four integer columns holding 0 .. 23.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=["A", "B", "C", "D"])

# Assign single cells by position and by label.
df.iloc[2, 2] = 1111
df.loc["20130101", "B"] = 222

# Where A > 4, set B to 0. A single .loc call is used instead of the chained
# form df.B[mask] = 0, which may assign into a temporary copy and leave df
# unchanged (SettingWithCopyWarning / copy-on-write).
df.loc[df.A > 4, "B"] = 0

# Add a new column initialised to NaN for every row.
df["F"] = np.nan
print(df)

# Add a new column from a Series; values are aligned on the index labels.
df["E"] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130101", periods=6))
print(df)

 

pandas 处理丢失数据

import pandas as pd
import numpy as np

# Build the sample data and blank out two cells.
dates = pd.date_range("20130101", periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=["A", "B", "C", "D"])
# NaN cannot be stored in an integer column, so the two columns that receive
# a missing value are cast to float explicitly instead of relying on an
# implicit upcast during assignment.
df = df.astype({"B": "float64", "C": "float64"})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

# dropna() returns a NEW frame without the rows (axis=0) that contain NaN;
# df itself is left untouched, so the result has to be captured.
# how="any" drops a row if any value is NaN, how="all" only if every value is.
dropped = df.dropna(axis=0, how="any")
print(dropped)

# fillna() returns a new frame with every NaN replaced by the given value.
filled = df.fillna(value=0)
print(filled)

# isnull() returns a boolean frame that is True where a value is missing.
null_mask = df.isnull()
print(null_mask)

# Collapse the mask to a single flag: is there at least one missing value?
has_null = bool(null_mask.to_numpy().any())
print(has_null)

 

pandas 导入导出

import pandas as pd
import numpy as np

# Load the CSV file into a DataFrame and display it.
csv_path = "F:/train_set.csv"
data = pd.read_csv(csv_path)
print(data)

# Persist the DataFrame as a pickle (a serialized file format, often
# given the extension .pkl).
pickle_path = "F:/grade123.pickle"
data.to_pickle(pickle_path)

 

concat 数据合并

import pandas as pd
import numpy as np

# Three 3x4 frames filled with 0.0, 1.0 and 2.0 that share the same columns.
shared_columns = list("abcd")
df1 = pd.DataFrame(np.zeros((3, 4)), columns=shared_columns)
df2 = pd.DataFrame(np.ones((3, 4)), columns=shared_columns)
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=shared_columns)

# Stack the frames vertically. Without ignore_index each frame keeps its own
# row labels (0, 1, 2 repeated); ignore_index=True discards them and
# renumbers the rows 0 .. N-1.
frames = [df1, df2, df3]
res = pd.concat(frames, axis="index")
res = pd.concat(frames, axis="index", ignore_index=True)
print(res)

# Join modes: the two frames now only partly share their columns.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list("abcd"))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list("bcde"))

# Vertical concatenation of df1 and df2:
#   join="outer" keeps the union of the columns and fills the gaps with NaN,
#   join="inner" keeps only the columns present in both frames.
pair = [df1, df2]
res = pd.concat(pair, axis="index", join="outer")
res1 = pd.concat(pair, axis="index", join="inner")
print(res)
print(res1)

# Column-wise concatenation and appending rows.
# (The join_axes argument of concat was removed in pandas 1.0 and is not
# used here.)
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])

# axis=1 places the frames side by side, aligned on the row index; row labels
# missing from one of the frames would be filled with NaN.
res = pd.concat([df1, df2], axis=1)
print(res)

# Appending rows: DataFrame.append was removed in pandas 2.0, so pd.concat is
# used instead. ignore_index=True renumbers the rows of the result.
res1 = pd.concat([df1, df2], ignore_index=True)
print(res1)
res2 = pd.concat([df1, df2, df3], ignore_index=True)  # several frames at once
print(res2)

# Append a Series as one extra row: turn it into a one-row frame first so
# that its index labels line up with the column names.
res3 = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res3)

 

merge 数据合并

import pandas as pd

# Two frames that share a single "key" column.
row_ids = range(4)
left = pd.DataFrame(
    {
        "key": [f"K{i}" for i in row_ids],
        "A": [f"A{i}" for i in row_ids],
        "B": [f"B{i}" for i in row_ids],
    }
)
right = pd.DataFrame(
    {
        "key": [f"K{i}" for i in row_ids],
        "C": [f"C{i}" for i in row_ids],
        "D": [f"D{i}" for i in row_ids],
    }
)
print(left)
print(right)

# Merge on one key column.
res = left.merge(right, on="key")
print(res)

# Frames that are matched on a pair of key columns.
left = pd.DataFrame(
    {
        "key1": ["K0", "K0", "K1", "K2"],
        "key2": ["K0", "K1", "K0", "K1"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)
right = pd.DataFrame(
    {
        "key1": ["K0", "K1", "K1", "K2"],
        "key2": ["K0", "K0", "K0", "K0"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)

# Merge on both key columns with each join type:
#   inner - only key pairs present in both frames
#   outer - every key pair from either frame
#   left  - every row of `left`, plus whatever matches from `right`
#   right - every row of `right`, plus whatever matches from `left`
key_columns = ["key1", "key2"]
res = left.merge(right, on=key_columns, how="inner")
print(res)
res1 = left.merge(right, on=key_columns, how="outer")
print(res1)
res2 = left.merge(right, on=key_columns, how="left")
print(res2)
res3 = left.merge(right, on=key_columns, how="right")
print(res3)

# The indicator option adds a column recording where each merged row came
# from (left frame only, right frame only, or both).
df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]})
df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]})

print(df1)
print(df2)

# Outer merge on col1 with indicator=True: the extra column gets the
# default name.
res = df1.merge(df2, on="col1", how="outer", indicator=True)
print(res)

# Passing a string instead of True chooses the name of that extra column.
res = df1.merge(df2, on="col1", how="outer", indicator="Result")
print(res)

# Merging on the row index instead of on a column.
left = pd.DataFrame(
    {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]},
    index=["K0", "K1", "K2"],
)
right = pd.DataFrame(
    {"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]},
    index=["K0", "K2", "K3"],
)

# left_index / right_index tell merge to use each frame's index as the key.
# "outer" keeps every index label, "inner" only the labels found in both.
res = left.merge(right, left_index=True, right_index=True, how="outer")
print(res)
res = left.merge(right, left_index=True, right_index=True, how="inner")
print(res)

# Overlapping column names: both frames have an "age" column, so suffixes
# are appended to tell the two apart in the merged result.
boys = pd.DataFrame({"K": ["K0", "K1", "K2"], "age": [1, 2, 3]})
girls = pd.DataFrame({"K": ["K0", "K0", "K3"], "age": [4, 5, 6]})
res = boys.merge(girls, on="K", how="inner", suffixes=("_boy", "_girl"))
print(res)

 

利用 matplotlib 作图

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Plot a Series: 1000 random values turned into their running total.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# cumsum() returns a new object, so the result must be assigned back;
# otherwise the plot would show the raw noise instead of the cumulative sum.
data = data.cumsum()
data.plot()
plt.show()

# Plot a DataFrame: four columns of random values, accumulated per column.
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()

# Scatter plots: draw A against B, then add A against C to the same axes by
# passing the first plot's Axes object through `ax`.
ax = data.plot.scatter(x="A", y="B", color="DarkBlue", label="Class")
data.plot.scatter(x="A", y="C", color="LightGreen", label="Class2", ax=ax)
plt.show()

你可能感兴趣的:(机器学习竞赛)