import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
import sklearn
# 正则表达式
import re
import sys
# 归一化(将特征按最小值/最大值缩放到 [0, 1] 区间)
from sklearn.preprocessing import MinMaxScaler
# 标准化
from sklearn.preprocessing import StandardScaler
# 自定义的工具包
from my_tools import *
import warnings
# Load the raw dataset; every later step rebinds or mutates `jibing`.
jibing = pd.read_excel("./某疾病数据.xlsx")

# Encode the sex column ("性别"): "女" -> 0, "男" -> 1.
dict_sex = {"女": 0, "男": 1}
# Assign the whole column at once.  The previous per-row chained assignment
# (`jibing.loc[:, col][i] = ...`) wrote to a temporary Series, which triggers
# SettingWithCopyWarning and is a silent no-op under pandas Copy-on-Write.
# Indexing the dict (rather than passing the dict to `map`) keeps the original
# behaviour of raising KeyError on any label missing from the mapping.
jibing["性别"] = jibing["性别"].map(lambda label: dict_sex[label])
{'A', 'AB', 'B', 'O'}
# Encode the blood-type column ("血型") as integer codes 0-3.
dict_blood = {"A": 0, "AB": 1, "B": 2, "O": 3}
# The original stored this mapping under the name `dict_sex`; keep that name
# bound to the same dictionary so any later reader of `dict_sex` is unaffected.
dict_sex = dict_blood
# Whole-column assignment instead of per-row chained assignment, which writes
# to a temporary Series and is a silent no-op under pandas Copy-on-Write.
# Unknown labels still raise KeyError, as the original lookup did.
jibing["血型"] = jibing["血型"].map(lambda label: dict_blood[label])
{0, 1, 2, 3}
{'双侧', '右', '女', '左'}
# 查看“左右”一列各取值所占的比例:“左”和“右”两者合计接近 99%。
# 占比很小的“女”和“双侧”可以视为噪声,直接删除对应的行。
# Report what share of the rows carries each value of the side column ("左右").
length = len(jibing["左右"])
length_nv, length_sc, length_z, length_y = (
    len(jibing[jibing["左右"] == label]) for label in ("女", "双侧", "左", "右")
)

# Printed in the same order as before: left, right, both sides, "女".
for label, count in (
    ("左", length_z),
    ("右", length_y),
    ("双侧", length_sc),
    ("女", length_nv),
):
    print(f"{label}:{count / length * 100}%")
# 删除行之后调用 reset_index(drop=True),保证索引仍然从 0 开始连续编号。
# Remove the rows whose side ("左右") is "女" or "双侧", then renumber the
# index from 0 so that it is contiguous again.
noise_rows = jibing["左右"].isin(["女", "双侧"])
jibing = jibing[~noise_rows].reset_index(drop=True)
# Encode the side column ("左右"): "左" -> 0, "右" -> 1.
dict_lr = {"左": 0, "右": 1}
# Whole-column assignment instead of per-row chained assignment
# (`jibing.loc[:, col][i] = ...`), which writes to a temporary Series and is a
# silent no-op under pandas Copy-on-Write.  Any label missing from the mapping
# still raises KeyError, exactly as the original dictionary lookup did.
jibing["左右"] = jibing["左右"].map(lambda label: dict_lr[label])
{0, 1}
'10月 ',
'2月余 ',
'3月余 ',
'4月余 ',
'6月 ',
'6月余 ',
'8月 ',
'9月余 ',
# Time units looked for in the free-text duration, with the ordinal code each
# one maps to.  The order matters: the first unit found wins, which reproduces
# the original sequence of checks (hour, day, week, month, year).
_DURATION_UNITS = (("小时", 0), ("天", 1), ("周", 2), ("月", 3), ("年", 4))


def _encode_duration(value):
    """Return the ordinal code for a symptom-duration cell.

    The cell is converted to text and searched for each unit in
    ``_DURATION_UNITS``.  Cells that mention no known unit (for example the
    placeholder "无") are returned unchanged so they can be handled later.
    """
    text = str(value)
    for unit, code in _DURATION_UNITS:
        # The units are plain literals, so a substring test is equivalent to
        # the regular-expression search used originally.
        if unit in text:
            return code
    return value


# Rewrite the whole column in one assignment.  The original per-row statements
# had lost their `re.search(` call (a syntax error) and used chained assignment,
# which writes to a temporary Series and is a silent no-op under pandas
# Copy-on-Write.
jibing["症状持续时间"] = jibing["症状持续时间"].map(_encode_duration)
{0, 1, 2, 3, 4, '无'}
4 797
3 756
1 58
2 18
无 1
0 1
Name: 症状持续时间, dtype: int64
# 用 pandas 获取指定列中等于某个值(或落在某个范围内)的行的索引。
# Notebook-style inspection only: evaluates to the index labels of the rows
# whose "症状持续时间" value is still the placeholder "无".  The result is not
# stored, so this line has no effect when the file is run as a script.
jibing[jibing['症状持续时间'] == "无"].index
Int64Index([1515], dtype='int64')
# Discard rows whose duration is the placeholder "无" or the code 0.
# The index is intentionally left as-is (no reset), so the surviving rows keep
# their previous labels.
is_placeholder = jibing['症状持续时间'] == "无"
is_zero_code = jibing['症状持续时间'] == 0
jibing = jibing[~(is_placeholder | is_zero_code)]
左右 | 是否外伤 | 症状持续时间 | 明显夜间痛 | 性别 | 年龄 | 高血压 | 高血脂 | 2型糖尿病 | 吸烟与否 | ... | 果糖胺 | 肌酸激酶 | α-L-盐藻糖苷酶 | 乳酸 | 淀粉酶 | 同型半胱氨酸 | 铁 | 总铁结合力 | 血型 | 结果 | |
0 | 1 | 0 | 3 | 0.0 | 1 | 54 | 0 | 0 | 0 | 1 | ... | 2.03 | 73.0 | 39.0 | 2.0 | 48.0 | 15.5 | 13.4 | 59.4 | 0 | 0 |
1 | 1 | 1 | 1 | 1.0 | 0 | 63 | 1 | 0 | 1 | 0 | ... | 2.90 | 84.0 | 20.0 | 3.1 | 71.0 | 17.7 | 12.4 | 67.1 | 0 | 0 |
2 | 1 | 0 | 4 | 1.0 | 0 | 65 | 0 | 0 | 0 | 0 | ... | 1.55 | 121.0 | 7.0 | 1.8 | 63.0 | 11.4 | 19.6 | 50.5 | 2 | 0 |
3 | 0 | 1 | 1 | 0.0 | 1 | 45 | 0 | 0 | 0 | 1 | ... | 1.90 | 187.0 | 19.0 | 2.3 | 42.0 | 9.4 | 9.8 | 55.8 | 2 | 0 |
4 | 1 | 1 | 3 | 1.0 | 1 | 55 | 0 | 0 | 0 | 0 | ... | 2.19 | 66.0 | 25.0 | 2.0 | 111.0 | 15.3 | 26.1 | 54.8 | 0 | 0 |
5 rows × 63 columns
Int64Index: 1629 entries, 0 to 1630
Data columns (total 63 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 左右 1629 non-null object
1 是否外伤 1629 non-null int64
2 症状持续时间 1629 non-null object
3 明显夜间痛 1629 non-null float64
4 性别 1629 non-null object
5 年龄 1629 non-null int64
6 高血压 1629 non-null int64
7 高血脂 1629 non-null int64
8 2型糖尿病 1629 non-null int64
9 吸烟与否 1629 non-null int64
10 饮酒与否 1629 non-null int64
11 红细胞计数*10^12/L 1629 non-null float64
12 血红蛋白 1629 non-null float64
13 红细胞压积 1629 non-null float64
14 血小板计数 1629 non-null float64
15 血小板压积 1629 non-null float64
16 总蛋白g/L 1628 non-null float64
17 白蛋白g/L 1629 non-null float64
18 球蛋白g/L 1629 non-null object
19 白球比 1629 non-null float64
20 ALT丙氨酸氨基转移酶 1629 non-null int64
21 AST天门冬氨酸氨基转移酶 1629 non-null int64
22 碱性磷酸酶 1629 non-null int64
23 谷氨酸转肽酶 1629 non-null int64
24 AST:ALT 1629 non-null float64
25 总胆红素 1629 non-null float64
26 直接胆红素 1629 non-null float64
27 间接胆红素 1629 non-null float64
28 钾 1629 non-null float64
29 钠 1629 non-null float64
30 氯 1629 non-null float64
31 钙 1629 non-null object
32 磷 1629 non-null float64
33 镁 1629 non-null float64
34 葡萄糖 1629 non-null float64
35 肌酐 1629 non-null float64
36 尿素 1629 non-null float64
37 尿酸 1629 non-null float64
38 甘油三酯 1629 non-null float64
39 总胆固醇 1629 non-null float64
40 H高密度胆固醇 1629 non-null float64
41 L低密度胆固醇 1629 non-null float64
42 载脂蛋白A1 1629 non-null float64
43 载脂蛋白B 1629 non-null float64
44 载脂蛋白E mg/l 1629 non-null float64
45 aPoB/aPoA1 1629 non-null float64
46 脂蛋白小a 1629 non-null object
47 乳酸脱氢酶LDH 1629 non-null int64
48 β-2微球蛋白 1629 non-null float64
49 胆碱酯酶 1629 non-null int64
50 前白蛋白mg/l 1629 non-null int64
51 总胆汁酸 1629 non-null float64
52 腺苷脱氨酶ADA 1629 non-null float64
53 果糖胺 1629 non-null float64
54 肌酸激酶 1629 non-null float64
55 α-L-盐藻糖苷酶 1629 non-null float64
56 乳酸 1628 non-null float64
57 淀粉酶 1629 non-null float64
58 同型半胱氨酸 1629 non-null float64
59 铁 1629 non-null float64
60 总铁结合力 1629 non-null float64
61 血型 1629 non-null object
62 结果 1629 non-null int64
dtypes: float64(41), int64(15), object(7)
memory usage: 814.5+ KB
左右 | 是否外伤 | 症状持续时间 | 明显夜间痛 | 性别 | 年龄 | 高血压 | 高血脂 | 2型糖尿病 | 吸烟与否 | ... | 果糖胺 | 肌酸激酶 | α-L-盐藻糖苷酶 | 乳酸 | 淀粉酶 | 同型半胱氨酸 | 铁 | 总铁结合力 | 血型 | 结果 | |
0 | 1 | 0 | 3 | 0.0 | 1 | 54 | 0 | 0 | 0 | 1 | ... | 2.03 | 73.0 | 39.0 | 2.0 | 48.0 | 15.5 | 13.4 | 59.4 | 0 | 0 |
1 | 1 | 1 | 1 | 1.0 | 0 | 63 | 1 | 0 | 1 | 0 | ... | 2.90 | 84.0 | 20.0 | 3.1 | 71.0 | 17.7 | 12.4 | 67.1 | 0 | 0 |
2 | 1 | 0 | 4 | 1.0 | 0 | 65 | 0 | 0 | 0 | 0 | ... | 1.55 | 121.0 | 7.0 | 1.8 | 63.0 | 11.4 | 19.6 | 50.5 | 2 | 0 |
3 | 0 | 1 | 1 | 0.0 | 1 | 45 | 0 | 0 | 0 | 1 | ... | 1.90 | 187.0 | 19.0 | 2.3 | 42.0 | 9.4 | 9.8 | 55.8 | 2 | 0 |
4 | 1 | 1 | 3 | 1.0 | 1 | 55 | 0 | 0 | 0 | 0 | ... | 2.19 | 66.0 | 25.0 | 2.0 | 111.0 | 15.3 | 26.1 | 54.8 | 0 | 0 |
5 rows × 63 columns
# 问题:部分单元格的文本中含有多余的 “+” 等字符。
# 用 KMP 算法在每个单元格的文本中查找这类异常字符。
# 不处理时转换为浮点数会报错,例如:ValueError: could not convert string to float: ‘22…9’
# Walks every (row, column) position of `jibing` and runs a KMP pattern match
# on it; `drop_index` appears intended to collect the rows that match.
# NOTE(review): this cell is incomplete as it stands. `s`, the first value of
# `p`, and `next` are never assigned here, and neither `if` statement has a
# body (presumably `drop_index.append(i)`); restore the missing lines before
# running. The nesting of the loops was also lost.
# NOTE(review): `Get_next` and `kmp_match` are not defined in this file;
# presumably they come from `from my_tools import *`, and `kmp_match` seems to
# return -1 when the pattern is absent. TODO confirm against my_tools.
# NOTE(review): the name `next` shadows the Python builtin of the same name.
drop_index = []
for i in range(jibing.shape[0]):
for j in range(jibing.shape[1]):
Get_next(p, next)
if kmp_match(s,p,next) != -1:
# Second pattern: a literal plus sign.
p = "+"
Get_next(p, next)
if kmp_match(s,p,next) != -1:
[155, 265, 356]
# Remove the rows labelled 156, 266 and 357 in a single call, then rebuild the
# index.  reset_index() is called without drop=True, so the old labels are kept
# as a new leading "index" column.
jibing = jibing.drop(labels=[156, 266, 357], axis=0).reset_index()
import random

# Shuffle the rows into a random order.
# The previous version built the position list but never shuffled it, so its
# row-by-row copy left the order unchanged even though `random` was imported.
# It also shadowed the builtin `id`, and the per-row `iloc` assignment was slow
# and could upcast column dtypes.
order = list(range(len(jibing)))
random.shuffle(order)

# Untouched snapshot of the pre-shuffle frame, kept under its original name.
jibing_copy = jibing.copy()

# Take the rows in shuffled order and renumber them 0..n-1, which is the layout
# the row-by-row copy produced.
jibing = jibing_copy.iloc[order].reset_index(drop=True)

# Drop the first column (the old labels left behind by the earlier
# reset_index() call).
jibing = jibing.iloc[:, 1:]
左右 | 是否外伤 | 症状持续时间 | 明显夜间痛 | 性别 | 年龄 | 高血压 | 高血脂 | 2型糖尿病 | 吸烟与否 | ... | 果糖胺 | 肌酸激酶 | α-L-盐藻糖苷酶 | 乳酸 | 淀粉酶 | 同型半胱氨酸 | 铁 | 总铁结合力 | 血型 | 结果 | |
0 | 0 | 0 | 3 | 0.0 | 0 | 65 | 1 | 0 | 0 | 0 | ... | 1.32 | 48.0 | 12.0 | 1.9 | 49.0 | 9.9 | 12.3 | 43.5 | 3 | 0 |
1 | 1 | 1 | 2 | 0.0 | 0 | 62 | 1 | 0 | 0 | 0 | ... | 1.67 | 77.0 | 16.0 | 1.4 | 81.0 | 9.2 | 16.9 | 55.5 | 0 | 1 |
2 | 1 | 0 | 4 | 1.0 | 0 | 55 | 0 | 0 | 0 | 0 | ... | 1.86 | 78.0 | 22.0 | 1.9 | 89.0 | 9.9 | 7.0 | 51.4 | 0 | 1 |
3 | 1 | 0 | 3 | 0.0 | 0 | 60 | 0 | 0 | 0 | 0 | ... | 1.68 | 92.0 | 12.0 | 1.4 | 69.0 | 9.3 | 15.8 | 53.0 | 0 | 0 |
4 | 0 | 1 | 3 | 0.0 | 0 | 61 | 0 | 0 | 0 | 0 | ... | 1.60 | 58.0 | 14.0 | 1.7 | 153.0 | 8.1 | 13.2 | 45.9 | 0 | 1 |
5 rows × 63 columns
# Persist the cleaned table, reload it from disk, filter it and save it again
# to the same file.
final_path = "./jibing_yuchuli_final.xlsx"
jibing.to_excel(final_path, index=False)

# Reading the file back lets pandas re-infer the column dtypes from the
# written values before the filtering step runs.
# NOTE(review): `three_sigema` is not defined in this file; presumably it comes
# from `from my_tools import *` and applies a three-sigma outlier filter.
# TODO confirm against my_tools.
jibing = three_sigema(pd.read_excel(final_path))
jibing.to_excel(final_path, index=False)
(1598, 63)