pandas使用笔记

1、根据已有列的值生成新的一列
# Derive a question stem from the description text: one branch strips the
# matches of `patt` out of `question_desc_text`, the other falls back to the
# placeholder "###_" followed by the question id.
# NOTE(review): the `if` line below is truncated in this copy of the notes --
# the marker being tested and the `patt = re.compile(...)` assignment appear
# to have been lost (presumably markup stripped during extraction). The
# snippet does not parse as-is; restore that line from the original source.
def get_ques_stem(question_id, question_desc_text):
    if "(.*)', re.S)
        backgroud = re.sub(patt, '', question_desc_text)
        return backgroud
    else:
        return "###_" + str(question_id) 

# Load the "税法二" sheet, then derive the `ques_stem` column row by row.
df = pd.read_excel(path, sheet_name="税法二", index_col=None)


def _row_to_stem(row):
    """Return the question stem computed from a single DataFrame row."""
    return get_ques_stem(row['question_id'], row['question_desc_text'])


df['ques_stem'] = df.apply(_row_to_stem, axis=1)
2、根据已有列生成新的多列
def remove_invalid_item(s):
    """Parse an HTML string and return its re-serialized and plain-text forms.

    Args:
        s: HTML markup to parse with BeautifulSoup's ``html.parser``.

    Returns:
        A 3-tuple ``(handled_html_text, handled_search_text, part_invalid)``:
        the markup as re-serialized by BeautifulSoup, the text extracted from
        it via ``get_text()``, and a boolean flag (see the note below).
    """
    soup = BeautifulSoup(s, 'html.parser')
    handled_html_text = soup.encode_contents().decode()
    handled_search_text = soup.get_text()
    # The original returned ``part_invalid`` without ever assigning it, so
    # every call raised NameError. No removal step is present in this
    # function, so nothing is flagged as invalid.
    # NOTE(review): set this from the real clean-up logic once it is restored.
    part_invalid = False
    return handled_html_text, handled_search_text, part_invalid

# Read a JSON-lines file and fan the tuple returned for each row out into
# three new columns (result_type='expand' turns the tuple into columns).
df = pd.read_json(file_path, lines=True, orient='records')
expanded = df.apply(
    lambda row: remove_invalid_item(row['html_text']),
    axis=1,
    result_type='expand',
)
df[['handled_html_text', 'handled_search_text', 'part_invalid']] = expanded

3、聚合分组

根据数据的某一个字段进行聚合

def get_attachment_dic():
    """Group attachment rows by ``parent_id``, print them, and return them.

    Runs the query below through ``fu.mysql_pd`` (presumably returning a
    DataFrame -- confirm against that helper), groups the rows by
    ``parent_id`` and prints the group keys followed by each group's rows.

    Returns:
        dict mapping each ``parent_id`` to the list of its rows, each row a
        ``{'parent_id': ..., 'download_url': ...}`` dict. Earlier versions
        only printed and implicitly returned ``None``.
    """
    q_sql = "SELECT parent_id, download_url FROM shuiben_attachment_relevance WHERE deleted = 0 limit 10"
    df = fu.mysql_pd(q_sql, "prod")
    # Group by the column name itself, not a one-element list: with a
    # length-1 list key newer pandas versions hand back 1-tuples as group
    # keys and deprecate scalar lookups via get_group().
    group_df = df.groupby('parent_id')
    print("#########打印集合后的key##########")
    print(group_df.groups.keys())
    attachment_dic = {}
    # Iterating the GroupBy yields (key, sub-frame) pairs directly, which
    # avoids a separate get_group() lookup for every key.
    for parent_id, group in group_df:
        # All attachment rows that share this parent_id.
        grouped_rows = group.to_dict('records')
        print("#########打印法规下的所有附件信息##########")
        print(grouped_rows)
        attachment_dic[parent_id] = grouped_rows
    return attachment_dic

打印信息如下:

#########打印集合后的key##########
dict_keys([4, 6, 8, 12, 13])
#########打印法规下的所有附件信息##########
[{'parent_id': 4, 'download_url': 'upload/2023/01-18/00-03-130082-1841407387.docx'}]
#########打印法规下的所有附件信息##########
[{'parent_id': 6, 'download_url': 'upload/2022/11-14/16-23-470077701932422.pdf'}, {'parent_id': 6, 'download_url': 'upload/2022/11-14/16-23-570084-90822133.pdf'}]
#########打印法规下的所有附件信息##########
[{'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-1109521442465714.doc'}, {'parent_id': 8, 'download_url': None}, {'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-120183-2102821684.doc'}, {'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-120515-1103417705.doc'}, {'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-120730301869269.doc'}]
#########打印法规下的所有附件信息##########
[{'parent_id': 12, 'download_url': 'upload/2023/01-18/00-03-150019365343013.docx'}]
#########打印法规下的所有附件信息##########
[{'parent_id': 13, 'download_url': 'upload/2023/01-18/00-03-150460-1843529730.pdf'}]

你可能感兴趣的:(Python,pandas,笔记,python)