import pandas as pd
import numpy as np
# Load the peptide table.  The steps further down rely on the literal string
# "None" in the Modification column (they filter on it and later write it
# back).  pandas 2.0 added "None" to read_csv's default NA markers, which would
# turn those cells into NaN and cause the not-null filter to drop the rows.
# The NA markers are therefore listed explicitly: the pre-2.0 defaults, i.e.
# everything pandas normally recognises except "None".
df = pd.read_csv(
    "pandas-test.csv",
    keep_default_na=False,
    na_values=[
        "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
        "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "n/a", "nan",
        "null",
    ],
)
# Modification labels accepted by the isin() filter applied to the table.
modification = [
    "Deamidation",
    "None",
    "PGE",
    "Gln->Pyro-Glu",
    "Lys loss",
    "A2G0F",
    "A1G0F",
    "Unglycosylated",
    "Oxidation",
    "Isomerization",
]
def site_clean(s):
    """Return the row's "Site" value with every "~" character removed.

    Args:
        s: A row-like mapping (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) that has a "Site" entry.

    Returns:
        NaN when the "Site" entry is missing according to ``pd.isna``;
        otherwise the "Site" string with all "~" characters stripped.
    """
    site = s["Site"]
    if pd.isna(site):
        # The ``pd.np`` alias was deprecated in pandas 1.0 and removed in 2.0,
        # so take NaN from numpy directly.
        return np.nan
    return site.replace("~", "")
# Normalise the Site column row by row: site_clean removes the "~" characters
# and leaves missing values as NaN.
df["Site"] = df.apply(site_clean, axis=1)

# Discard rows that lack a Modification or a Delta (ppm) value.
df = df.loc[df["Modification"].notna() & df["Delta (ppm)"].notna()]

# Keep rows whose modification is in the allowed list, whose Delta (ppm) is
# within 10 in absolute value, and whose Confidence Score is at least 80.
df = df.loc[
    df["Modification"].isin(modification)
    & (df["Delta (ppm)"].abs() <= 10)
    & (df["Confidence Score"] >= 80)
]

# Rows carrying the literal string "None" in the Modification column.
df2 = df.loc[df["Modification"] == "None"]
print(df2)
Peptide Sequence Modification Site Delta (ppm) Confidence Score ID Type \
1 VVSVLTVLHQDWLNGK None NaN 3.00 100.0 Both
6 QVQLVQSGAEVK None NaN -0.88 100.0 Both
13 DTLMISR None NaN 1.32 100.0 Both
15 DSTYSLSSTLTLSK None NaN 1.06 100.0 Both
Avg MS Area: NR Protein
1 2.350000e+09 TY_ABC_HC
6 2.989130e+07 TY_ABC_HC
13 5.581042e+08 TY_ABC_HC
15 5.981615e+07 TY_ABC_LC
# Copy of the table without the Modification column.
a = df.drop(columns=["Modification"])

# PGE rows whose site is anything other than "E1".
b = df.loc[(df["Modification"] == "PGE") & (df["Site"] != "E1")]

# Same row mask, but handed to .loc as a one-element tuple (what the
# trailing-comma spelling `df.loc[mask,]` produces).
c = df.loc[((df["Modification"] == "PGE") & (df["Site"] != "E1"),)]
print(b)
Peptide Sequence Modification Site Delta (ppm) Confidence Score \
2 TTPPVLDSDGSFFLYSK PGE T397 1.22 99.4
ID Type Avg MS Area: NR Protein
2 Both 16900000.0 C483CEC010_HeavyChain
# Display c, the selection made with the trailing-comma form of the .loc indexer.
print(c)
Peptide Sequence Modification Site Delta (ppm) Confidence Score \
2 TTPPVLDSDGSFFLYSK PGE T397 1.22 99.4
ID Type Avg MS Area: NR Protein
2 Both 16900000.0 C483CEC010_HeavyChain
# Build a copy of the table with every PGE row removed, dropping by the index
# labels of the rows whose Modification equals "PGE".
df3 = df.drop(index=df.index[df["Modification"] == "PGE"])
print(df3)
Peptide Sequence Modification Site Delta (ppm) Confidence Score \
0 VVSVLTVLHQDWLNGK Deamidation N315 7.11 100.0
1 VVSVLTVLHQDWLNGK None NaN 3.00 100.0
3 SVSTSGFNYMHWYQQK Gln->Pyro-Glu S28 8.39 97.6
4 SLSLSPGK Lys loss K451 -0.51 80.1
5 QVQLVQSGAEVK Gln->Pyro-Glu Q1 1.76 100.0
6 QVQLVQSGAEVK None NaN -0.88 100.0
7 NYLAWYQQKPGK Lys loss K42 -0.41 100.0
9 EEQFNSTYR A2G0F N297 0.26 100.0
10 EEQFNSTYR A1G0F N297 -1.94 100.0
11 EEQFNSTYR Unglycosylated N297 -0.52 100.0
12 DTLMISR Oxidation M252 0.74 100.0
13 DTLMISR None NaN 1.32 100.0
14 DSTYSLSSTLTLSK Isomerization D174 3.01 100.0
15 DSTYSLSSTLTLSK None NaN 1.06 100.0
ID Type Avg MS Area: NR Protein
0 Both 2.440000e+08 TY_ABC_HC
1 Both 2.350000e+09 TY_ABC_HC
3 MS2 7.290000e+05 TY_ABC_LC
4 Both 1.110000e+09 C483CEC010_HeavyChain
5 Both 1.140000e+09 TY_ABC_HC
6 Both 2.989130e+07 TY_ABC_HC
7 Both 1.330000e+06 C483CEC010_LightChain
9 Both 6.950000e+07 TY_ABC_HC
10 Both 1.120000e+07 TY_ABC_HC
11 Both 5.266161e+06 TY_ABC_HC
12 Both 1.540000e+07 TY_ABC_HC
13 Both 5.581042e+08 TY_ABC_HC
14 MS2 2.640000e+08 TY_ABC_LC
15 Both 5.981615e+07 TY_ABC_LC
# Conditional two-column replacement: on every Isomerization row, set
# Modification to the string "None" and Site to the empty string.
df.loc[
    df["Modification"].eq("Isomerization"),
    ["Modification", "Site"]
] = ["None", ""]
print(df)
Peptide Sequence Modification Site Delta (ppm) Confidence Score \
0 VVSVLTVLHQDWLNGK Deamidation N315 7.11 100.0
1 VVSVLTVLHQDWLNGK None NaN 3.00 100.0
2 TTPPVLDSDGSFFLYSK PGE T397 1.22 99.4
3 SVSTSGFNYMHWYQQK Gln->Pyro-Glu S28 8.39 97.6
4 SLSLSPGK Lys loss K451 -0.51 80.1
5 QVQLVQSGAEVK Gln->Pyro-Glu Q1 1.76 100.0
6 QVQLVQSGAEVK None NaN -0.88 100.0
7 NYLAWYQQKPGK Lys loss K42 -0.41 100.0
8 EVQLVESGGGLVQPGR PGE E1 0.49 100.0
9 EEQFNSTYR A2G0F N297 0.26 100.0
10 EEQFNSTYR A1G0F N297 -1.94 100.0
11 EEQFNSTYR Unglycosylated N297 -0.52 100.0
12 DTLMISR Oxidation M252 0.74 100.0
13 DTLMISR None NaN 1.32 100.0
14 DSTYSLSSTLTLSK None 3.01 100.0
15 DSTYSLSSTLTLSK None NaN 1.06 100.0
ID Type Avg MS Area: NR Protein
0 Both 2.440000e+08 TY_ABC_HC
1 Both 2.350000e+09 TY_ABC_HC
2 Both 1.690000e+07 C483CEC010_HeavyChain
3 MS2 7.290000e+05 TY_ABC_LC
4 Both 1.110000e+09 C483CEC010_HeavyChain
5 Both 1.140000e+09 TY_ABC_HC
6 Both 2.989130e+07 TY_ABC_HC
7 Both 1.330000e+06 C483CEC010_LightChain
8 Both 1.850000e+07 C483CEC010_HeavyChain
9 Both 6.950000e+07 TY_ABC_HC
10 Both 1.120000e+07 TY_ABC_HC
11 Both 5.266161e+06 TY_ABC_HC
12 Both 1.540000e+07 TY_ABC_HC
13 Both 5.581042e+08 TY_ABC_HC
14 MS2 2.640000e+08 TY_ABC_LC
15 Both 5.981615e+07 TY_ABC_LC
# NOTE: the column label "Modification" after the comma inside [] matters: it
# tells pandas to replace only the "A2G0F" value in the Modification column.
# Without ', "Modification"' every value in the rows that contain A2G0F would
# be overwritten with "G0F".
# Compare the two assignments in this script: a single-column replacement
# (this one) and a conditional replacement that sets two related columns at
# once.
df.loc[df["Modification"].eq("A2G0F"), "Modification"] = "G0F"
print(df)
Peptide Sequence Modification Site Delta (ppm) Confidence Score \
0 VVSVLTVLHQDWLNGK Deamidation N315 7.11 100.0
1 VVSVLTVLHQDWLNGK None NaN 3.00 100.0
2 TTPPVLDSDGSFFLYSK PGE T397 1.22 99.4
3 SVSTSGFNYMHWYQQK Gln->Pyro-Glu S28 8.39 97.6
4 SLSLSPGK Lys loss K451 -0.51 80.1
5 QVQLVQSGAEVK Gln->Pyro-Glu Q1 1.76 100.0
6 QVQLVQSGAEVK None NaN -0.88 100.0
7 NYLAWYQQKPGK Lys loss K42 -0.41 100.0
8 EVQLVESGGGLVQPGR PGE E1 0.49 100.0
9 EEQFNSTYR G0F N297 0.26 100.0
10 EEQFNSTYR A1G0F N297 -1.94 100.0
11 EEQFNSTYR Unglycosylated N297 -0.52 100.0
12 DTLMISR Oxidation M252 0.74 100.0
13 DTLMISR None NaN 1.32 100.0
14 DSTYSLSSTLTLSK None 3.01 100.0
15 DSTYSLSSTLTLSK None NaN 1.06 100.0
ID Type Avg MS Area: NR Protein
0 Both 2.440000e+08 TY_ABC_HC
1 Both 2.350000e+09 TY_ABC_HC
2 Both 1.690000e+07 C483CEC010_HeavyChain
3 MS2 7.290000e+05 TY_ABC_LC
4 Both 1.110000e+09 C483CEC010_HeavyChain
5 Both 1.140000e+09 TY_ABC_HC
6 Both 2.989130e+07 TY_ABC_HC
7 Both 1.330000e+06 C483CEC010_LightChain
8 Both 1.850000e+07 C483CEC010_HeavyChain
9 Both 6.950000e+07 TY_ABC_HC
10 Both 1.120000e+07 TY_ABC_HC
11 Both 5.266161e+06 TY_ABC_HC
12 Both 1.540000e+07 TY_ABC_HC
13 Both 5.581042e+08 TY_ABC_HC
14 MS2 2.640000e+08 TY_ABC_LC
15 Both 5.981615e+07 TY_ABC_LC