六、pandas模块:(8)多条件删除-替换

import pandas as pd
import numpy as  np
# Load the example table into a DataFrame. The path is relative, so
# "pandas-test.csv" is looked up in the current working directory.
df = pd.read_csv("pandas-test.csv")
# Modification names accepted by the isin() filter further down. The names are
# kept in one comma-separated string and str.split(",") -- a method called on
# the string, hence the dot -- turns that string into a list.
modification = (
    "Deamidation,None,PGE,Gln->Pyro-Glu,Lys loss,"
    "A2G0F,A1G0F,Unglycosylated,Oxidation,Isomerization"
).split(",")
def site_clean(s):
    """Return the row's ``Site`` value with every ``~`` character removed.

    Written to be applied row by row, e.g. ``df.apply(site_clean, axis=1)``.

    Args:
        s: A row-like mapping (for example a ``pandas.Series``) that has a
            ``"Site"`` entry.

    Returns:
        NaN when the ``Site`` entry is missing, otherwise the ``Site`` string
        with all ``~`` characters stripped out.
    """
    site = s["Site"]
    if pd.isna(site):
        # The ``pd.np`` alias was deprecated in pandas 1.0 and removed in 2.0,
        # so take NaN from numpy directly.
        return np.nan
    return site.replace("~", "")
# Clean the Site column row by row: with axis=1, apply() hands each row to
# site_clean and the returned values become the new column.
df["Site"] = df.apply(site_clean, axis=1)
# Drop rows with a missing value in specific columns: keep only the rows where
# both Modification and Delta (ppm) are present. (notna() is a method, so the
# parentheses are required.)
df = df.loc[df["Modification"].notna() & df["Delta (ppm)"].notna()]
# Multi-condition selection: keep the rows whose Modification is in the
# accepted list, whose absolute Delta (ppm) is at most 10 and whose
# Confidence Score is at least 80.
df = df.loc[
    df["Modification"].isin(modification)
    & (df["Delta (ppm)"].abs() <= 10)
    & (df["Confidence Score"] >= 80)
]
# Rows whose Modification is the literal string "None".
df2 = df.loc[df["Modification"] == "None"]
print(df2)
# .loc[] is used here to pick rows by a boolean condition on their values.
    Peptide Sequence Modification Site  Delta (ppm)  Confidence Score ID Type  \
1   VVSVLTVLHQDWLNGK         None  NaN         3.00             100.0    Both   
6       QVQLVQSGAEVK         None  NaN        -0.88             100.0    Both   
13           DTLMISR         None  NaN         1.32             100.0    Both   
15    DSTYSLSSTLTLSK         None  NaN         1.06             100.0    Both   

    Avg MS Area: NR    Protein  
1      2.350000e+09  TY_ABC_HC  
6      2.989130e+07  TY_ABC_HC  
13     5.581042e+08  TY_ABC_HC  
15     5.981615e+07  TY_ABC_LC  
# Using drop: drop removes whole rows or whole columns, not a single cell.
# To discard individual values, select the rows to keep with a boolean mask
# in .loc instead, as b and c do below.
a = df.drop(["Modification"],axis = 1)# drop one column (axis = 1 means columns)
b = df.loc[(df.Modification == "PGE") & (df.Site != "E1")]# rows where Modification == "PGE" and Site != "E1"
c = df.loc[(df.Modification == "PGE") & (df.Site != "E1"),]# same mask as b, written with a trailing comma inside the brackets
print(b)
    Peptide Sequence Modification  Site  Delta (ppm)  Confidence Score  \
2  TTPPVLDSDGSFFLYSK          PGE  T397         1.22              99.4   

  ID Type  Avg MS Area: NR                Protein  
2    Both       16900000.0  C483CEC010_HeavyChain  
# Show c for comparison with b; the printed rows are the same for both.
print(c)
    Peptide Sequence Modification  Site  Delta (ppm)  Confidence Score  \
2  TTPPVLDSDGSFFLYSK          PGE  T397         1.22              99.4   

  ID Type  Avg MS Area: NR                Protein  
2    Both       16900000.0  C483CEC010_HeavyChain  
# drop can also remove rows. First collect the index labels of the rows that
# match the condition (Modification == "PGE"), then drop the rows by label.
df3 = df.drop(index=df.index[df["Modification"] == "PGE"])
print(df3)
    Peptide Sequence    Modification  Site  Delta (ppm)  Confidence Score  \
0   VVSVLTVLHQDWLNGK     Deamidation  N315         7.11             100.0   
1   VVSVLTVLHQDWLNGK            None   NaN         3.00             100.0   
3   SVSTSGFNYMHWYQQK   Gln->Pyro-Glu   S28         8.39              97.6   
4           SLSLSPGK        Lys loss  K451        -0.51              80.1   
5       QVQLVQSGAEVK   Gln->Pyro-Glu    Q1         1.76             100.0   
6       QVQLVQSGAEVK            None   NaN        -0.88             100.0   
7       NYLAWYQQKPGK        Lys loss   K42        -0.41             100.0   
9          EEQFNSTYR           A2G0F  N297         0.26             100.0   
10         EEQFNSTYR           A1G0F  N297        -1.94             100.0   
11         EEQFNSTYR  Unglycosylated  N297        -0.52             100.0   
12           DTLMISR       Oxidation  M252         0.74             100.0   
13           DTLMISR            None   NaN         1.32             100.0   
14    DSTYSLSSTLTLSK   Isomerization  D174         3.01             100.0   
15    DSTYSLSSTLTLSK            None   NaN         1.06             100.0   

   ID Type  Avg MS Area: NR                Protein  
0     Both     2.440000e+08              TY_ABC_HC  
1     Both     2.350000e+09              TY_ABC_HC  
3      MS2     7.290000e+05              TY_ABC_LC  
4     Both     1.110000e+09  C483CEC010_HeavyChain  
5     Both     1.140000e+09              TY_ABC_HC  
6     Both     2.989130e+07              TY_ABC_HC  
7     Both     1.330000e+06  C483CEC010_LightChain  
9     Both     6.950000e+07              TY_ABC_HC  
10    Both     1.120000e+07              TY_ABC_HC  
11    Both     5.266161e+06              TY_ABC_HC  
12    Both     1.540000e+07              TY_ABC_HC  
13    Both     5.581042e+08              TY_ABC_HC  
14     MS2     2.640000e+08              TY_ABC_LC  
15    Both     5.981615e+07              TY_ABC_LC  
# Replacement by assignment: for every row whose Modification is
# "Isomerization", set Modification to the string "None" and Site to an
# empty string in a single statement.
df.loc[df.Modification == "Isomerization",["Modification","Site"]]=["None",""]
# Note the form: .loc receives the row mask and then the list of columns, and
# the right-hand list supplies one new value per listed column.
print(df)
     Peptide Sequence    Modification  Site  Delta (ppm)  Confidence Score  \
0    VVSVLTVLHQDWLNGK     Deamidation  N315         7.11             100.0   
1    VVSVLTVLHQDWLNGK            None   NaN         3.00             100.0   
2   TTPPVLDSDGSFFLYSK             PGE  T397         1.22              99.4   
3    SVSTSGFNYMHWYQQK   Gln->Pyro-Glu   S28         8.39              97.6   
4            SLSLSPGK        Lys loss  K451        -0.51              80.1   
5        QVQLVQSGAEVK   Gln->Pyro-Glu    Q1         1.76             100.0   
6        QVQLVQSGAEVK            None   NaN        -0.88             100.0   
7        NYLAWYQQKPGK        Lys loss   K42        -0.41             100.0   
8    EVQLVESGGGLVQPGR             PGE    E1         0.49             100.0   
9           EEQFNSTYR           A2G0F  N297         0.26             100.0   
10          EEQFNSTYR           A1G0F  N297        -1.94             100.0   
11          EEQFNSTYR  Unglycosylated  N297        -0.52             100.0   
12            DTLMISR       Oxidation  M252         0.74             100.0   
13            DTLMISR            None   NaN         1.32             100.0   
14     DSTYSLSSTLTLSK            None               3.01             100.0   
15     DSTYSLSSTLTLSK            None   NaN         1.06             100.0   

   ID Type  Avg MS Area: NR                Protein  
0     Both     2.440000e+08              TY_ABC_HC  
1     Both     2.350000e+09              TY_ABC_HC  
2     Both     1.690000e+07  C483CEC010_HeavyChain  
3      MS2     7.290000e+05              TY_ABC_LC  
4     Both     1.110000e+09  C483CEC010_HeavyChain  
5     Both     1.140000e+09              TY_ABC_HC  
6     Both     2.989130e+07              TY_ABC_HC  
7     Both     1.330000e+06  C483CEC010_LightChain  
8     Both     1.850000e+07  C483CEC010_HeavyChain  
9     Both     6.950000e+07              TY_ABC_HC  
10    Both     1.120000e+07              TY_ABC_HC  
11    Both     5.266161e+06              TY_ABC_HC  
12    Both     1.540000e+07              TY_ABC_HC  
13    Both     5.581042e+08              TY_ABC_HC  
14     MS2     2.640000e+08              TY_ABC_LC  
15    Both     5.981615e+07              TY_ABC_LC  
# Replace a single value: the "Modification" label after the comma restricts
# the assignment to that column of the rows where Modification is "A2G0F".
df.loc[df["Modification"] =="A2G0F","Modification"]="G0F"
# The bare string below is the author's original note, kept verbatim. In
# short: without the column label after the comma, every column of the
# matching rows would be set to "G0F". It also points out the two patterns
# shown so far: replacing one column, and replacing two related columns at once.
"""
#**一定要注意这个[]内加了逗号,"Modification",主要是为了告诉程序,
只替换Modification列的这个A2G0F值,如果不加“,Modification”,
程序会替换“A2G0F所在行的所有值为G0F

#注意上述代码的,单个替换和2列相互关联的条件替换

"""
print(df)
     Peptide Sequence    Modification  Site  Delta (ppm)  Confidence Score  \
0    VVSVLTVLHQDWLNGK     Deamidation  N315         7.11             100.0   
1    VVSVLTVLHQDWLNGK            None   NaN         3.00             100.0   
2   TTPPVLDSDGSFFLYSK             PGE  T397         1.22              99.4   
3    SVSTSGFNYMHWYQQK   Gln->Pyro-Glu   S28         8.39              97.6   
4            SLSLSPGK        Lys loss  K451        -0.51              80.1   
5        QVQLVQSGAEVK   Gln->Pyro-Glu    Q1         1.76             100.0   
6        QVQLVQSGAEVK            None   NaN        -0.88             100.0   
7        NYLAWYQQKPGK        Lys loss   K42        -0.41             100.0   
8    EVQLVESGGGLVQPGR             PGE    E1         0.49             100.0   
9           EEQFNSTYR             G0F  N297         0.26             100.0   
10          EEQFNSTYR           A1G0F  N297        -1.94             100.0   
11          EEQFNSTYR  Unglycosylated  N297        -0.52             100.0   
12            DTLMISR       Oxidation  M252         0.74             100.0   
13            DTLMISR            None   NaN         1.32             100.0   
14     DSTYSLSSTLTLSK            None               3.01             100.0   
15     DSTYSLSSTLTLSK            None   NaN         1.06             100.0   

   ID Type  Avg MS Area: NR                Protein  
0     Both     2.440000e+08              TY_ABC_HC  
1     Both     2.350000e+09              TY_ABC_HC  
2     Both     1.690000e+07  C483CEC010_HeavyChain  
3      MS2     7.290000e+05              TY_ABC_LC  
4     Both     1.110000e+09  C483CEC010_HeavyChain  
5     Both     1.140000e+09              TY_ABC_HC  
6     Both     2.989130e+07              TY_ABC_HC  
7     Both     1.330000e+06  C483CEC010_LightChain  
8     Both     1.850000e+07  C483CEC010_HeavyChain  
9     Both     6.950000e+07              TY_ABC_HC  
10    Both     1.120000e+07              TY_ABC_HC  
11    Both     5.266161e+06              TY_ABC_HC  
12    Both     1.540000e+07              TY_ABC_HC  
13    Both     5.581042e+08              TY_ABC_HC  
14     MS2     2.640000e+08              TY_ABC_LC  
15    Both     5.981615e+07              TY_ABC_LC  
#**************
#df.loc[df['Modification'] != 'PGE'] = "xx"   这个代码的意思是将"Modification"列不等于"PGE"的那些行的所有列的值都替换为"xx"(没有指定列名,所以整行都会被替换),
#df.loc[df['Modification'] != 'PGE', "Site"] = "xx"   这个代码的意思是只将"Modification"列不等于"PGE"的那些行所对应的Site列的值替换为"xx"

你可能感兴趣的:(python学习笔记)