import numpy as np
import pandas as pd

# Read the four CSV pieces of the training table. Judging by the file names
# they are presumably the left/right and upper/lower quarters of one table.
dflp = pd.read_csv('./data/train-left-up.csv')
dfrp = pd.read_csv('./data/train-right-up.csv')
dfld = pd.read_csv('./data/train-left-down.csv')
dfrd = pd.read_csv('./data/train-right-down.csv')

# axis=1 places the frames side by side, aligning rows on the index.
result_up = pd.concat([dflp, dfrp], axis=1)
result_down = pd.concat([dfld, dfrd], axis=1)
# The default axis=0 stacks the two halves vertically; each half keeps its
# own row labels because ignore_index is left at its default (False).
result = pd.concat([result_up, result_down])
# join combines frames side by side (left/right), matching rows on the index.
result1_up = dflp.join(dfrp)
result1_down = dfld.join(dfrd)
# Stack the two halves vertically (top/bottom). DataFrame.append used to do
# this, but it was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and gives the same result.
result1 = pd.concat([result1_up, result1_down])
# merge only combines tables side by side (left/right); here the rows are
# matched on the index of both frames.
Result_up = pd.merge(dflp, dfrp, left_index=True, right_index=True)
Result_down = pd.merge(dfld, dfrd, left_index=True, right_index=True)
# Stack the two halves vertically (top/bottom). DataFrame.append was removed
# in pandas 2.0, so pd.concat is used for the vertical step.
Result = pd.concat([Result_up, Result_down])
pd.merge()参数比较多,用法比较灵活,这边给出官方文档和解析:
pd.merge() 官方文档
DataFrame.merge(right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
这篇写的挺好的:Pandas学习笔记六——合并数据集:连接(join)和合并(merge)操作
# Save the merged table without the index column. The original line referred
# to `Resul`, which is never defined; `Result` (the merge-based table built
# just above) is the name it was presumably meant to be.
Result.to_csv('result.csv', index=False)
1. DataFrame → Series: df.stack(), 把列旋转成行
# Read result.csv back in and preview the first rows (the bare .head()
# only displays something in an interactive/notebook session).
text = pd.read_csv('result.csv')
text.head()
# stack() pivots the column labels into the innermost level of the row index.
unit_result = text.stack()
Output:
⚠️ 注意:stack() 得到的是 Series,但把它保存为 CSV 后再用 pd.read_csv() 读回来,得到的仍然是 DataFrame 格式
# Write the stacked data to CSV, then load that file again.
unit_result.to_csv('unit_result.csv')
# pd.read_csv returns a DataFrame, whatever object wrote the file.
test = pd.read_csv('unit_result.csv')
# Preview the first rows (displayed only in an interactive/notebook session).
test.head()
2. Series → DataFrame: Series.unstack(), 把最内层的行索引旋转成列
# unstack() pivots the innermost row-index level back into columns,
# reversing the earlier stack() and rebinding unit_result to the result.
unit_result = unit_result.unstack()
groupby的运用
《利用Python进行数据分析·第二版》第十章 数据聚合与分组运算
import numpy as np
import pandas as pd
# Read result.csv from the current directory.
df = pd.read_csv('./result.csv')
# Result is a Series (single-bracket column selection)
# Approach 1: group the whole frame by 'Sex', then select 'Fare'
df.groupby('Sex')['Fare'].mean()
# Approach 2: select 'Fare' first, then group it by the 'Sex' column
df1 = df['Fare'].groupby(df['Sex']).mean()
# Result is a DataFrame (double-bracket column selection)
# Approach 1: group the whole frame, then select [['Fare']]
df.groupby('Sex')[['Fare']].mean()
# Approach 2: select [['Fare']] first, then group by the 'Sex' column.
# This rebinds df1, replacing the Series above with a DataFrame.
df1 = df[['Fare']].groupby(df['Sex']).mean()
# Sum of 'Survived' for each 'Sex' value; single brackets, so a Series.
df2 = df.groupby('Sex')['Survived'].sum()
2.6.1 和 2.6.2 中的这两个运算可以通过 agg() 函数一次同时完成,并且可以使用 rename() 函数修改列名
# Maps each column to the aggregation applied to it. Wrapping a function name
# in a list (e.g. ['mean'], ['sum']) would add an extra header row that shows
# the function name.
num_agg = {'Fare': 'mean', 'Survived': 'sum'}
# The result is a DataFrame. as_index=False keeps the grouping key 'Sex' as
# an ordinary column, so it sits on the same header row as the other column
# names, with no trailing reset_index() needed.
(
    df.groupby('Sex', as_index=False)
    .agg(num_agg)
    .rename(columns={'Fare': 'mean_fare', 'Survived': 'sum_Survived'})
)
将2.6.1和2.6.2的数据合并,并保存到sex_fare_survived.csv
# merge accepts a named Series as one of its operands; the merged result is
# a DataFrame.
sex_fare_survived = df1.merge(df2, on='Sex')
# Turn the index back into an ordinary column before saving.
sex_fare_survived = sex_fare_survived.reset_index()
sex_fare_survived.to_csv('sex_fare_survived.csv', index=False)
# Sum of 'Survived' for each 'Pclass' value.
df.groupby('Pclass')['Survived'].sum()
# Mean 'Fare' for each ('Pclass', 'Age') pair; show only the first rows.
df['Fare'].groupby([df['Pclass'], df['Age']]).mean().head()
# Average Titanic fare for male and female passengers. reset_index() turns
# 'Sex' into an ordinary column, leaving a default 0..n-1 row index.
dfd1 = df[['Fare']].groupby(df['Sex']).mean().reset_index()
# Number of survivors per sex; this frame is indexed by 'Sex'.
dfd2 = df.groupby('Sex')[['Survived']].sum()
# Combine the two DataFrames. A plain dfd1.join(dfd2) would match dfd1's
# integer index against dfd2's 'Sex' index, find no matches and fill
# 'Survived' with NaN. on='Sex' matches dfd1's 'Sex' column against dfd2's
# index instead.
dfd1_dfd2 = dfd1.join(dfd2, on='Sex')
# Number of survivors at each age.
age_survived = df['Survived'].groupby(df['Age']).sum()
# Largest per-age survivor count.
ageSurvivedMax = age_survived.max()
# Share of all survivors that belongs to that single age.
survivedMax = ageSurvivedMax / df['Survived'].sum()
# Recorded output: 0.043859649122807015