python: a roundup of common pandas operations

Examples of commonly used pandas methods:

"""
列
# df = DataFrame()
# df["a"] # 显示索引取列
# df.iloc[0] # 隐试索引取列
行
df.iloc[0] # 隐试索引取列
df.loc["a"] # 显示索引取列
元素
df.loc["B","b"]
df.iloc[1,1]
切行
df[0:2]
切列
df[:,0:2]
"""
df.drop_duplicates(keep="first")  # drop duplicate rows, keeping the first occurrence
df.dropna(axis=0)  # drop every row that contains a NaN
df.fillna(value=666)  # fill NaN values with 666
df.fillna(method="ffill", axis=0)  # axis=0 fills down each column; ffill propagates the previous valid value forward
df.fillna(method="bfill", axis=0)  # bfill fills NaN backward, from the next valid value
df.fillna(method="ffill", axis=0).fillna(method="bfill", axis=0)  # forward fill, then backward fill for any leading NaN
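# A quick sketch (my own toy frame 'toy') of what ffill/bfill do to a column with NaN:
import numpy as np
import pandas as pd

toy = pd.DataFrame({"c": [np.nan, 1.0, np.nan, 3.0, np.nan]})
print(toy.fillna(method="ffill"))  # [NaN, 1, 1, 3, 3] - a leading NaN stays
print(toy.fillna(method="bfill"))  # [1, 1, 3, 3, NaN] - a trailing NaN stays
print(toy.fillna(method="ffill").fillna(method="bfill"))  # both ends filled
# (newer pandas also offers toy.ffill() / toy.bfill() for the same thing)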
# Filter rows on a condition (e.g. to drop outliers)
data = 24
df = df.loc[~(df["c"] > data)]  # df["c"] > data returns a boolean Series; ~ keeps the rows where it is False

Concatenation and merging
import pandas as pd
import numpy as np
df1 = pd.DataFrame(data=np.random.randint(0,100,size=(5,3)),columns=["A","B","C"])
df2 = pd.DataFrame(data=np.random.randint(0,100,size=(5,3)),columns=["A","D","C"])
print(df1)
print(df2)
# Concatenation
df = pd.concat([df1, df2], axis=0, join='inner')  # join='inner' keeps only the columns both frames share
print(df)
# Check whether each column contains any NaN
df.isnull().any(axis=0)

# Merging
# how='outer' keeps the union of keys, how='inner' keeps the intersection
# one-to-one
df = pd.merge(df1, df2, on="A")
# one-to-many: matching rows are repeated in the result
df4 = pd.merge(df1, df2)
# many-to-many
df5 = pd.merge(df1, df2, how="left")  # keep every row of the left frame
# many-to-many: shared key values are matched; keys present in only one frame get NaN in the other frame's columns
df6 = pd.merge(df1, df2, on="A", how='outer')
df7 = pd.merge(df1, df2, left_on="A", right_on="C")  # the key columns have different names but hold the same kind of data
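# A small sketch (my own toy frames 'left' and 'right') contrasting how='inner' and how='outer':
import pandas as pd

left = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]})
right = pd.DataFrame({"A": [2, 3, 4], "D": ["p", "q", "r"]})
print(pd.merge(left, right, on="A", how="inner"))  # only A = 2, 3
print(pd.merge(left, right, on="A", how="outer"))  # A = 1..4, missing cells are NaN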

# Find the rows whose 'state' value is NaN
df['state'].isnull()
# Use the boolean Series as a row mask to pull out the rows where it is True
df.loc[df['state'].isnull()]
# Take out the (abbreviation) column
df.loc[df['state'].isnull()]["state"]
# Then deduplicate it
df.loc[df['state'].isnull()]["state"].unique()
# Find the rows whose state is "pr"
df["state"] == "pr"
df.loc[df["state"] == "pr"]  # the rows where state == "pr"
# Grab those rows' index labels
indexs = df.loc[df["state"] == "pr"].index
# Assign to them in bulk
df.loc[indexs, 'state'] = 'pr'
# Conditional query
df.query('ages == "total" & year == 2010')
# Computed column
df["计算"] = df1["A"] / df1["C"]
# Sorting
df.sort_values(by="计算", ascending=False)  # ascending=False for descending order, True for ascending
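# A runnable sketch (my own toy frame 'tmp') of the query syntax used above:
import pandas as pd

tmp = pd.DataFrame({"ages": ["total", "under18"], "year": [2010, 2010]})
print(tmp.query('ages == "total" & year == 2010'))  # keeps only the first row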

Advanced operations
import pandas as pd
import numpy as np

def after_sal(s):
    # after-tax salary: 50% tax on the portion of the salary above 3000
    if s > 3000:
        return s - (s-3000) * 0.5
    else:
        return s

def my_mean(s):
    # hand-rolled mean, used later as a custom aggregation function
    total = 0
    for i in s:
        total += i
    return total / len(s)


if __name__ == '__main__':

    df1 = pd.DataFrame(data=np.random.randint(0,100,size=(5,3)),columns=["A","B","C"])
    df2 = pd.DataFrame(data=np.random.randint(0,100,size=(5,3)),columns=["A","D","C"])

    # Replacement
    df1.replace(to_replace=8, value="eight")  # replace every value 8 in the frame with "eight"
    df1.replace(to_replace={
        8: "eight"
    })  # dict form: replace several values at once
    df1.replace(to_replace={"A": 74}, value=7744)  # column-specific replacement: replace 74 only in column "A"

    # Mapping
    dic = {
        'name': ["张三", "李四", '张三'],
        'salary': [15000, 2000, 15000]
    }
    df = pd.DataFrame(data=dic)
    print(df)
    dic1 = {
        "张三": "Tom",
        "李四": "Jerry"
    }
    df["e_name"] = df["name"].map(dic1)  # map each name to its English name
    print(df)

    # map() also accepts a function: compute a derived column
    df["运算"] = df["salary"].map(after_sal)  # pass a function instead of a dict
    print(df)

    # Random sampling implemented by shuffling
    # Shuffle the data (shuffle both the row and the column order)
    print(np.random.permutation(4))
    df = df.take(indices=np.random.permutation(4), axis=1)  # shuffle the 4 columns by positional (implicit) index
    print(df)
    # Then shuffle the rows as well
    df = df.take(indices=np.random.permutation(4), axis=1).take(indices=np.random.permutation(3))
    print(df)
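    # Note (my addition): the row shuffle can also be done with the built-in sampler:
    print(df.sample(frac=1))  # all rows, in random order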

    # Grouping and aggregation
    df = pd.DataFrame({
        'item': ['a', 'b', 'c', 'b', 'c', 'a'],
        'price': [4, 3, 3, 2.5, 4, 2],
        'color': ['red', 'yellow', 'yellow', 'green', 'green', 'green'],
        'weight': [12, 20, 50, 30, 20, 44]
    })
    # print(df)
    # # Group
    # df.groupby(by='item').groups
    # # Mean of the numeric columns per group
    # df.groupby(by='item').mean(numeric_only=True)
    # # Recommended: aggregate a single column
    # df1 = df.groupby(by='item')['price'].mean()
    # print(df1)
    # df_dic = df1.to_dict()
    # print(df_dic)
    # df["mean_price"] = df["item"].map(df_dic)
    # print(df)

    # Advanced aggregation (custom aggregation function)
    a = df.groupby(by='item')["price"].apply(my_mean).to_dict()
    # b = df.groupby(by='item')["price"].transform(my_mean).to_dict()  # transform maps the result back onto the original rows, while apply does not
    # print(b)
    df["aaa"] = df["item"].map(a)
    print(df)
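    # A small sketch (my own; 'per_group' and 'per_row' are illustrative names) of the
    # apply-vs-transform difference mentioned above:
    per_group = df.groupby('item')['price'].apply(my_mean)    # one value per group (length 3)
    per_row = df.groupby('item')['price'].transform(my_mean)  # one value per original row (length 6)
    print(per_group)
    print(per_row)
    # so the map() step above could also be written as:
    # df["aaa"] = df.groupby('item')['price'].transform(my_mean)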

    # Read a plain-text file
    # pd.read_csv('1.txt', sep='-', header=None)
    """
    # Pivot table
    pivot_table
    index: the key(s) to group by
    values: which columns to aggregate
    aggfunc: the aggregation function(s) to apply  # defaults to the mean
    columns: adds a column level (splits the values by another key)
    fill_value: the value used to replace NaN (e.g. 0)
    """
    df.pivot_table(index=["列名1", "列名2"],
                   values=["列名3", "列名4"],
                   aggfunc='sum'
                   )
    df.pivot_table(index=["列名1", "列名2"],
                   aggfunc={
                       '列名3': 'sum',
                       '列名4': 'mean'
                   }
                   )
    df.pivot_table(index="列名1",
                   aggfunc={
                       '列名3': 'sum'
                   },
                   columns="列名2"
                   )  # grouped by 列名1, then the columns are further split by 列名2
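    # A concrete sketch (my own) of the placeholder calls above, using the item/price df:
    print(df.pivot_table(index="item", values="price", aggfunc="sum"))
    print(df.pivot_table(index="item", columns="color", values="price",
                         aggfunc="sum", fill_value=0))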

    # Cross-tabulation
    """
    pd.crosstab()
    index: the values to group by along the rows
    columns: the values to group by along the columns
    """
    pd.crosstab(df.smoke, df.smoke)  # illustrative only: assumes a df with a 'smoke' column
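    # A runnable sketch (my own) of crosstab, using the item/color columns defined above:
    print(pd.crosstab(df.item, df.color))  # count of rows for each item/color combination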

Fuzzy matching
import pandas as pd
from fuzzywuzzy import process

def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    """
    df_1 is the left table to join
    df_2 is the right table to join
    key1 is the key column of the left table
    key2 is the key column of the right table
    threshold is the minimum fuzzywuzzy score (0-100, Levenshtein-based) a match must reach to be kept
    limit is the number of matches returned per value, sorted from best to worst
    """
    s = df_2[key2].tolist()

    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m

    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2

    return df_1
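
# A usage sketch (my own toy data; 'left'/'right' are illustrative names and the
# fuzzywuzzy package must be installed):
left = pd.DataFrame({'company': ['Appl Inc', 'Goggle LLC']})
right = pd.DataFrame({'name': ['Apple Inc', 'Google LLC', 'Amazon']})
print(fuzzy_merge(left, right, 'company', 'name', threshold=80))
# 'Appl Inc' should match 'Apple Inc', and 'Goggle LLC' should match 'Google LLC'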
    

Extras
import pandas as pd
import math
def page(li, num):
    """
    :param li: the list to paginate
    :param num: number of items per page
    :return: a list of page-sized sublists
    Requires the math module.
    """
    zu = math.ceil(len(li) / num)  # number of pages
    return [li[i * num:(i + 1) * num] for i in range(zu)]
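
# A quick usage check (my own example):
# page(list(range(10)), 3)  ->  [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]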
if __name__ == '__main__':
    # (snippet-style: df, task, i, df1 and df2 are assumed to already exist)
    df.index = range(len(df))  # reset the index to 0..n-1
    task.drop(i, inplace=True)  # drop row i
    # Flag whether each value of one column appears in a column of another frame
    df["列名3"] = df.apply(lambda x: 'True' if x.列名1 in task.列名2.values else 'False', axis=1)
    task["列名3"] = task["列名3"].astype('str')  # convert a column's dtype
    df = df.iloc[1:10, :]  # slice the DataFrame
    df = df.dropna(subset=['日期'], axis=0)  # drop the rows where 日期 (date) is NaN
    df1["出库"] = df1.订单号.isin(df2.原始单号)  # new boolean column: is df1's 订单号 (order no.) present in df2's 原始单号 (original order no.)?
    df = df.drop(labels=["列名1", '列名2'], axis=1)  # drop columns
    df[df["列名1"].isin(["11"])]  # select the rows where 列名1 equals "11"; returns a DataFrame
