Pandas常见操作

 

# Pandas常用功能小结:

查看表信息:df.shape

读取:pd.DataFrame([{},{}]),  pd.read_sql()

查询:df.where()  df.query() 

判断逻辑:  ‘&’‘and’ ‘|’‘or’ .isin() 

替换:replace()  fillna()

筛选:loc[] iloc[]

合并:merge

移动:shift

删除:去重 :.drop_duplicates

数据转换:tolist() to_dict() 

设置索引:set_index() reindex()


官网DataFrame  【点击】

# 返回表的行/列 数

df.shape

# 判断为空

df.empty

# numpy &  pandas 相互转换

## dataframe转化成array

    df=df.values

## array转化成dataframe

    import pandas as pd

    df = pd.DataFrame(df)

# df.shape[0]  # 查看表的行数

# 生成df

# df --> list of dicts(字典列表)

minute_net_inflow_info_dic_list = minute_net_inflow_info_df.to_dict('records')

# 设置索引

# base_info_df = base_info_df.set_index('code', drop=True)

# base_info_df = base_info_df.reindex(index=['code'])

 

# 选取数据

df.iloc[:2, :].to_dict(orient='records') # 提取前两行,所有列,并转换成字典

data.loc[['Ohio', 'Utah'], ['two', 'four']] # 使用行名和列名选取

### 条件查询

查询:【1】 【2】【3】【逻辑、条件】 【4】

# 条件查询,更改指定字段的值

new_df.loc[(new_df['code'].map(lambda d: d[:3])).isin(['688']), ['category']] = 14906

# 条件删除

df.drop(df[(df.score < 50) & (df.score > 20)].index, inplace=True)

数据提取:https://blog.csdn.net/qq_41797451/article/details/80542060

inc_max_stock = new_df[new_df['rise_fall_rate'] == new_df['rise_fall_rate'].max()]['name'].values[0]

# 按索引列提取

new_index_list = list(set(index_list)) # 去重

bad_rec = bad_rec.iloc[new_index_list, :] # 提取达标数据

 

# 任何列

bad_rec.loc[df['column_name'].isin(some_values)] # 按任意列的取值筛选行

## 条件筛选

__del_codes = list(_del_codes)

_df_old_adjust_del = _df_old_adjust[~_df_old_adjust.index.isin(__del_codes)]

new_df = base_info_df[base_info_df['code'].isin(str_code_list)]

new_df = base_info_df.loc[str_code_list, :]

new_df = new_df[~(new_df['code'].map(lambda d: d[:3])).isin(['688'])] # 过滤掉科创板(代码以688开头)

## 保存成excel

new_user_info_df.to_excel(os.path.join(REPORT_PATH, '{}.xlsx'.format(now)))

 

## 分组

grouped = df.groupby(by = ['code','category'])

pd.options.display.max_columns = None

grouplist = []

for code,group in grouped:

    fixs = Fix()

    b = group.apply(lambda x: fixs.func(x), axis=1)

    b['amount'] = b['amount'].cumsum()

    b['volume'] = b['volume'].cumsum()

    grouplist.append(b)

df.dropna(axis=0, how='any', inplace=True) # 筛选删除含有空值的行, how='all':所有为空才删除

basic_info = basic_info.where(basic_info.notna(), None) # 将NaN强制转成None

 

# 去重

df.drop_duplicates(["trader"], keep="last", inplace=True)

df = df.drop_duplicates(subset=['code', 'industry'], keep=False) # 去重

df['hello'] = df['hello'].replace({np.nan:None})

关于值替换:

# 将NaN替换成0

__tmp_df['cash'] = __tmp_df['cash'].fillna(0)

# 将指定值替换成NaN

df[df.isin(['0000-00-00', '0000-00-00 00:00:00', '', 0.0])] = np.nan

df= df.where(df.notna(), None)  # 将nan值替换成None

df.replace('目标值', '结果值', inplace=True)

timing.index = pd.to_datetime(timing['date'])

df = df[['befor_date', 'date', 'status', 'ups', 'close', 'open']].dropna()

df = df.rename(columns={'befor_date': 'befor_day', 'status': 'niubear'}) # 改列名

# 移动运算

df['column_name'] = df['balance'].rolling(window=ma).mean()  # 移动求平均

# 取精度值

df['s'].round(4)

# 排序

df.sort_values("xxx", inplace=True)

# 合并 pandas dataframe的合并(append, merge, concat;注意 append 已在 pandas 2.0 中移除,请改用 pd.concat)

_count_res_df = pd.merge(_code_name_res_df, _count_res_df, on='code')

# 设置df全量输出

pd.set_option('display.max_colwidth',500)

pd.set_option('display.max_rows',None)

 

 

 

你可能感兴趣的:(Python)