pandas学习二

genre [ˈʒɒnrə] 让
column 靠乐母

操作pandas的index三种方法(重新设置行index,将某列设置为index,将index设置为列)

import pandas as pd

# Demonstrates three index operations: rebuilding the row index, promoting a
# column to the index, and turning the index back into a column.
df = pd.DataFrame({
    '地区': ['山东', '北京'],
    '人口': [1.1, 1.2],
    '省会': ['济南', '北京'],
})
print(df)
print(df.index)

# Rebuild the row index. reindex returns a new frame with one extra (all-NaN)
# row; the original frame is left unchanged.
df2 = df.reindex([0, 1, 2])
print(df2)
df2.loc[2] = ['山西', 0.7, '太原']  # filling the new label effectively adds a row
print(df2)

# Promote a column to the index (returns a copy; the original is unchanged).
df3 = df.set_index('地区')
print('*' * 30)
print(df3.index)
print(df3.loc['山东'])  # label lookup now works like a dictionary access

# Turn the index back into an ordinary column.
df4 = df3.reset_index()
print(df4)

列自动对齐

import pandas as pd

df = pd.DataFrame({
    '地区': ['山东', '北京'],
    '人口': [1.1, 1.2],
    '省会': ['济南', '北京'],
})
# Same columns as df but declared in a different order.
df5 = pd.DataFrame({'地区': ['陕西'], '省会': ['西安'], '人口': [1.5]})

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Columns are aligned by name; a column missing on one side would be added
# and filled with NaN for the rows that lack it.
combined = pd.concat([df, df5])
print(combined)

空值检查及填充(利用isnull和notnull两个函数)

import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')
# info() writes its report itself and returns None, so it is called directly
# rather than wrapped in print(). The non-null counts reveal missing values.
df.info()

# Rows whose revenue is present (note: the method is notnull).
df_not_revenue_null = df[df['Revenue (Millions)'].notnull()]
# Mean revenue over the rows that have a value.
df_not_revenue_average = df_not_revenue_null['Revenue (Millions)'].mean()
# Overwrite only the missing revenue cells with that mean.
df.loc[df['Revenue (Millions)'].isnull(), 'Revenue (Millions)'] = df_not_revenue_average

df.info()  # the revenue column should now show no missing values

使用dataframe去重(使用drop_duplicates函数)

import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')

print(df.head())

# Two key columns, 'Director' and 'Year': rows are duplicates when both match.
# keep='first' retains the first row of every duplicate group.
df2 = df.drop_duplicates(subset=['Director', 'Year'], keep='first')
# Rows of a single director, kept for manual inspection of the duplicates.
ret = df[df['Director'] == 'James Gunn']
print(df.shape)
print(df2.shape)  # the difference in row counts is the number of duplicates dropped

dataframe分组,分组后求平均值和总投票数

import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')

# Method 1: iterate over the groups and compute each statistic by hand.
# Grouping by a plain column name (not a one-element list) keeps the group
# key a scalar instead of a 1-tuple on recent pandas versions.
for dn, gc in df.groupby('Director'):
    # Per director: mean rating and total number of votes.
    print(dn, gc['Rating'].mean(), gc['Votes'].sum())

# Method 2 (recommended): group once, then aggregate with a
# column -> function mapping passed as a dictionary.
ret = df.groupby('Director').aggregate({'Rating': 'mean', 'Votes': 'sum'})
print(ret)          # aggregated result
print(ret.index)    # the index holds the director names
print(ret.columns)  # two columns: Rating and Votes

使用join连接两个dataframe

import pandas as pd

df1 = pd.DataFrame({
    '地区': ['北京', '山东', '天津'],
    '人口': [1.2, 1.1, 0.7],
})
df2 = pd.DataFrame({
    '地区': ['北京', '山东', '陕西'],
    '省会': ['北京', '济南', '西安'],
})

# join matches on the index, so the region column is moved into the index
# of both frames first.
df1_dist = df1.set_index('地区')
df2_dist = df2.set_index('地区')

# how: 'left', 'right', 'outer' (union of keys) or 'inner' (intersection).
df_join = df1_dist.join(df2_dist, how='left')
print(df_join)

使用pivot_table透视dataframe

import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')
print(df.head())  # first five rows

# Build the pivot directly with pivot_table.
# index='Director' groups along the rows, columns='Year' groups along the
# columns; each value column gets its own aggregation function.
print(
    df.pivot_table(
        index='Director',
        columns='Year',
        values=['Rating', 'Votes'],
        aggfunc={'Rating': 'mean', 'Votes': 'sum'},
    )
)

dataframe插入一行数据(两种方法的案例)

import pandas as pd
import numpy as np

df = pd.DataFrame([[1, 3], [4, 1], [6, 2]], columns=['a', 'b'])
print(df)

# DataFrame.insert only inserts columns, not rows, so a row has to be
# inserted by other means.

# Method 1: slice the frame in two and concatenate with the new row between.
df1 = df.iloc[:1, :]
df2 = df.iloc[1:, :]
# The data must be a list of rows, hence the nested [[10, 20]].
add_row = pd.DataFrame([[10, 20]], index=[10], columns=['a', 'b'])
result_row = pd.concat([df1, add_row, df2], axis=0)
print(result_row)

# Method 2: drop to a numpy array, insert there, then rebuild the frame.
arr = df.values
print(arr)
arr2 = np.insert(arr, 1, [10, 20], axis=0)  # insert before row position 1
print(arr2)
result2 = pd.DataFrame(arr2, columns=['a', 'b'])
print(result2)

使用标签或位置选择数据

import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')
# Label-based selection: all rows, columns from 'Rank' through 'Genre'
# (label slices include both end points).
print(df.loc[:, 'Rank':'Genre'])
# Position-based selection: all rows, first three columns.
print(df.iloc[:, :3])

检查所有空值情况

import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')
# info() gives the overall picture of missing values; it prints its own
# report and returns None, so it is not wrapped in print().
df.info()
# To see the affected rows of one column, select with loc: rows where the
# revenue is null, all columns.
df2 = df.loc[df['Revenue (Millions)'].isnull(), :]
print(df2)

列转为datetime类型

import pandas as pd

d = {'year': [2019, 2020, 2021], 'day': [40, 50, 60]}
df = pd.DataFrame(d)
print(df)

# Pack year and day-of-year into one integer, e.g. 2019 and 40 -> 2019040.
df['int_number'] = df['year'] * 1000 + df['day']
print(df)

# %Y is the four-digit year, %j the day of the year. The integers are
# converted to strings explicitly so the format is applied to text.
df['date'] = pd.to_datetime(df['int_number'].astype(str), format='%Y%j')
print(df)

生成时间序列的数据集

import pandas as pd
import numpy as np

# pandas.util.testing and its makeTimeDataFrame / makeDateIndex helpers were
# removed from pandas, so the sample data is built from public API instead.

# Ten business days of random floats in four columns.
ret = pd.DataFrame(
    np.random.randn(10, 4),
    columns=list('ABCD'),
    index=pd.date_range('2000-01-03', periods=10, freq='B'),
)
print(ret.shape)

# A data set indexed by hourly timestamps.
ret2 = pd.DataFrame(
    np.random.randint(1, 1000, size=(10, 3)),
    columns=['编码', '销量', '库存'],
    index=pd.date_range('2000-01-01', periods=10, freq='h'),
)
print(ret2)

使用apply做类型检查,value_counts检查值出现的次数

import pandas as pd

d = {
    '商品': ['牙刷', '药膏', '牙杯', '毛巾'],
    '销量': [100, '40', 50.0, 200],  # deliberately mixed int / str / float
}
df = pd.DataFrame(d)
print(df)

# How many values of each Python type the column holds.
type_counts = df['销量'].apply(type).value_counts()
print(type_counts)
# The Python type of each individual value in the column.
print(df['销量'].apply(type))

# Convert everything to integers; summing the mixed column without this
# conversion fails because int and str cannot be added.
df['销量'] = df['销量'].astype('int')
print(df['销量'].sum())

使用map做特征工程

import pandas as pd

d = {
    "gender": ["male", "female", "male", "female"],
    "color": ["red", "green", "blue", "green"],
    "age": [25, 30, 15, 32],
}
df = pd.DataFrame(d)
print(df)

# Encode each categorical column as integers via a lookup dictionary.
# The mappings get their own names instead of reusing `d`, which holds the
# source data.
gender_codes = {'male': 0, 'female': 1}
df['gender2'] = df['gender'].map(gender_codes)
color_codes = {'red': 0, 'green': 1, 'blue': 2}
df['color2'] = df['color'].map(color_codes)
print(df)

重新排序所有列

import pandas as pd

df = pd.read_csv('IMDB-Movie-Data.csv')

print(df.head(3))

print(df.columns)

# New order: columns 1-2 first, then column 0, then the rest.
# Variant A: a single concatenation of list slices.
col1 = list(df.columns[1:3]) + list(df.columns[0:1]) + list(df.columns[3:])

# Variant B: the same list built step by step (overwrites variant A with an
# identical result).
col1 = list(df.columns[1:3])
col1.append(df.columns[0])
col1.extend(df.columns[3:])
print(df[col1])

对数据做下采样

import pandas as pd
import numpy as np

# pd.util.testing.makeDateIndex was removed from pandas; date_range builds
# the same kind of hourly index: 240 hours, i.e. ten full days.
time_index = pd.date_range('2000-01-01', periods=240, freq='h')

print(time_index)

data = np.random.randint(1, 10, size=(240, 1))
df = pd.DataFrame(data, index=time_index)

print(df)

# Downsample from hourly to daily, summing the single column (label 0).
daily = df.resample('D')[0].sum()
print(daily)

使用replace做数据清洗

import pandas as pd

# NOTE(review): the line that built this frame was garbled by the page's
# math renderer. The last sales value and the '[$]' pattern below are
# reconstructed from the surviving fragments -- confirm against the original.
d = {
    "customer": ["A", "B", "C", "D"],
    "sales": [1100, "950.5", "$400", "$1250.75"],
}
df = pd.DataFrame(d)
print(df)

# Strip the dollar sign. With regex=True only string values are rewritten;
# numeric values pass through untouched.
df1 = df['sales'].replace('[$]', '', regex=True)
print(df1)
print(df1.apply(type))  # still a mix of int and str at this point

# Once the symbols are gone the whole column converts cleanly to float.
df['sales2'] = df1.astype('float')
print(df)

替换小分类数据(使用value_counts做数据统计)

import pandas as pd

d = {
    'carname': ['A', 'B', 'A', 'A', 'B', 'C', 'B', 'D', 'E', 'A'],
    '销量': [100, 100, 200, 300, 100, 50, 20, 30, 40, 10],
}
print(d)
df = pd.DataFrame(d)
print(df)

# Count how often each car name occurs and keep those seen fewer than twice.
freq = df['carname'].value_counts()
small_category = freq[freq < 2]
print(freq)
print(small_category)
print(small_category.index)  # the rare names: C, D and E

# Replace every rare name with a single catch-all label. The source column
# is unchanged; a new Series is returned.
df2 = df['carname'].replace(list(small_category.index), '其他')
print(df2)

替换小分类数据方法2(使用map做特征提取)

import pandas as pd

d = {
    'carname': ['A', 'B', 'A', 'A', 'B', 'C', 'B', 'D', 'E', 'A'],
    '销量': [100, 100, 200, 300, 100, 50, 20, 30, 40, 10],
}
df = pd.DataFrame(d)

# Computed once up front: the frequency table does not change between calls,
# so there is no reason to rebuild it for every mapped element.
freq = df['carname'].value_counts()
small_category = set(freq[freq < 2].index)


def f(c):
    """Return '其他' when car name `c` occurs fewer than twice, else `c` itself."""
    return '其他' if c in small_category else c


print(df['carname'].map(f))

你可能感兴趣的:(pandas,学习,python)