# Pandas常用功能小结:
查看表信息:df.shape
读取:pd.DataFrame([{},{}]), pd.read_sql()
查询:df.where() df.query()
判断逻辑: ‘&’‘and’ ‘|’‘or’ .isin()
替换:replace() fillna()
筛选:loc[] iloc[]
合并:merge
移动:shift
删除:drop() 去重:drop_duplicates()
数据转换:tolist() to_dict()
设置索引:set_index() reindex()
官网DataFrame 【点击】
# 返回表的行/列 数
df.shape
# 判断为空
df.empty
# numpy & pandas 相互转换
## dataframe转化成array
df=df.values
## array转化成dataframe
import pandas as pd
df = pd.DataFrame(df)
# df.shape[0] # 查看表的行数
# 生成df
# df --> list of dicts(字典列表)
minute_net_inflow_info_dic_list = minute_net_inflow_info_df.to_dict('records')
# 设置索引
# base_info_df = base_info_df.set_index('code', drop=True)
# base_info_df = base_info_df.reindex(index=['code'])
# 选取数据
df.iloc[:2, :].to_dict(orient='records') # 提取前两行,所有列,并转换成字典
data.loc[['Ohio', 'Utah'], ['two', 'four']] # 使用行名和列名选取
### 条件查询
查询:【1】 【2】【3】【逻辑、条件】 【4】
# 条件查询,更改指定字段的值
new_df.loc[(new_df['code'].map(lambda d: d[:3])).isin(['688']), ['category']] = 14906
# 条件删除
df.drop(df[(df.score < 50) & (df.score > 20)].index, inplace=True)
数据提取:https://blog.csdn.net/qq_41797451/article/details/80542060
inc_max_stock = new_df[new_df['rise_fall_rate'] == new_df['rise_fall_rate'].max()]['name'].values[0]
# 按索引列提取
new_index_list = list(set(index_list)) # 去重
bad_rec = bad_rec.iloc[new_index_list, :] # 提取达标数据
# 任何列
bad_rec.loc[df['column_name'].isin(some_values)] # 按某列取值是否在 some_values 中筛选行
## 条件筛选
__del_codes = list(_del_codes)
_df_old_adjust_del = _df_old_adjust[~_df_old_adjust.index.isin(__del_codes)]
new_df = base_info_df[base_info_df['code'].isin(str_code_list)]
new_df = base_info_df.loc[str_code_list, :]
new_df = new_df[~(new_df['code'].map(lambda d: d[:3])).isin(['688'])] # 过滤掉科创板(代码以688开头)
## 保存成excel
new_user_info_df.to_excel(os.path.join(REPORT_PATH, '{}.xlsx'.format(now)))
## 分组
grouped = df.groupby(by = ['code','category'])
pd.options.display.max_columns = None
grouplist = []
for code,group in grouped:
    fixs = Fix()
    b = group.apply(lambda x: fixs.func(x), axis=1)
    b['amount'] = b['amount'].cumsum()
    b['volume'] = b['volume'].cumsum()
    grouplist.append(b)
df.dropna(axis=0, how='any', inplace=True) # 筛选删除含有空值的行, how='all':所有为空才删除
basic_info = basic_info.where(basic_info.notna(), None) # 将NaN强制转成None
# 去重
df.drop_duplicates(["trader"], keep="last", inplace=True)
df = df.drop_duplicates(subset=['code', 'industry'], keep=False) # 去重
df['hello'] = df['hello'].replace({np.nan:None})
关于值替换:
# 将NaN替换成0
__tmp_df['cash'] = __tmp_df['cash'].fillna(0)
# 将指定值替换成NaN
df[df.isin(['0000-00-00', '0000-00-00 00:00:00', '', 0.0])] = np.nan
df= df.where(df.notna(), None) # 将nan值替换成None
df.replace('目标值', '结果值', inplace=True)
timing.index = pd.to_datetime(timing['date'])
df = df[['befor_date', 'date', 'status', 'ups', 'close', 'open']].dropna()
df = df.rename(columns={'befor_date': 'befor_day', 'status': 'niubear'}) # 改列名
# 移动运算
df['column_name'] = df['balance'].rolling(window=ma).mean() # 移动求平均
# 取精度值
df['s'].round(4)
# 排序
df.sort_values("xxx", inplace=True)
# 合并 pandas dataframe的合并(append, merge, concat)
_count_res_df = pd.merge(_code_name_res_df, _count_res_df, on='code')
# 设置df全量输出
pd.set_option('display.max_colwidth',500)
pd.set_option('display.max_rows',None)