import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
import sklearn
# 正则表达式
import re
import sys
# 中心化
from sklearn.preprocessing import MinMaxScaler
# 标准化
from sklearn.preprocessing import StandardScaler
# 自定义的工具包
from my_tools import *
import warnings
warnings.filterwarnings("ignore")
# Load the raw disease dataset; every later step works on this frame.
jibing = pd.read_excel("./某疾病数据.xlsx")
# Show the distinct values of the sex ("性别") column before encoding it.
set(jibing.loc[:, "性别"])
# Integer codes for the sex column: "女" (female) -> 0, "男" (male) -> 1.
dict_sex = {"女": 0, "男": 1}
# Encode the whole column with a single assignment. Writing through
# `jibing.loc[:, col][i] = ...` is chained assignment, which pandas does not
# guarantee will reach the original frame. The explicit dictionary lookup
# keeps the strict behaviour of raising KeyError for a value that is missing
# from the mapping instead of silently producing NaN.
jibing["性别"] = jibing["性别"].map(lambda value: dict_sex[value])
# Show the distinct values of the blood-type ("血型") column.
set(jibing.loc[:, "血型"])
{'A', 'AB', 'B', 'O'}
# Integer codes for the blood-type ("血型") column: A -> 0, AB -> 1, B -> 2,
# O -> 3. Kept under its own name so the sex mapping `dict_sex` is not
# overwritten with unrelated content.
dict_blood = {"A": 0, "AB": 1, "B": 2, "O": 3}
# Encode the column with one assignment rather than element-wise chained
# assignment (`jibing.loc[:, col][i] = ...`), which pandas does not guarantee
# will modify the original frame. An unknown blood type raises KeyError.
jibing["血型"] = jibing["血型"].map(lambda value: dict_blood[value])
# Confirm that only the integer codes remain.
set(jibing.loc[:, "血型"])
{0, 1, 2, 3}
左右
# Show the distinct values found in the left/right ("左右") column.
set(jibing["左右"])
{'双侧', '右', '女', '左'}
查看各取值所占的比例,发现“左”和“右”两者合计约占 99.9%。
占比极小的“女”和“双侧”可以视为噪声,直接删除对应的行。
# Report what percentage of all rows each value of the "左右" column accounts for.
side = jibing["左右"]
length = len(side)
# Row counts per value, kept under the same names as before.
length_nv = int((side == "女").sum())
length_sc = int((side == "双侧").sum())
length_z = int((side == "左").sum())
length_y = int((side == "右").sum())
for label, count in (
    ("左", length_z),
    ("右", length_y),
    ("双侧", length_sc),
    ("女", length_nv),
):
    print(f"{label}:{count / length * 100}%")
左:32.76178812002449%
右:67.11573790569504%
双侧:0.0612369871402327%
女:0.0612369871402327%
删除某一行后使用 reset_index 保证索引的连续性
# Remove the rare "女" and "双侧" rows from the "左右" column, then renumber
# the index so that row labels are contiguous again.
jibing = jibing[~jibing["左右"].isin(["女", "双侧"])].reset_index(drop=True)
# Integer codes for the remaining values: "左" (left) -> 0, "右" (right) -> 1.
dict_lr = {"左": 0, "右": 1}
# Encode the column with one assignment rather than element-wise chained
# assignment (`jibing.loc[:, col][i] = ...`), which pandas does not guarantee
# will modify the original frame. An unexpected value raises KeyError.
jibing["左右"] = jibing["左右"].map(lambda value: dict_lr[value])
# Confirm that only the integer codes remain.
set(jibing.loc[:, "左右"])
{0, 1}
# Show the distinct raw values of the symptom-duration ("症状持续时间") column.
set(jibing["症状持续时间"])
{'10余天',
'10余年',
'10天',
'10年余',
'10月',
'10月 ',
'10月余',
'11天',
'11月',
'11月余',
'12天',
'14年',
'15天',
'15年',
'17天',
'17年',
'18月',
'1周',
'1周余',
'1天',
'1天余',
'1年',
'1年余',
'1年半',
'1年半余',
'1月',
'1月余',
'20余天',
'20余年',
'20天',
'20天余',
'20年余',
'21天',
'2周',
'2周余',
'2天',
'2天余',
'2年',
'2年余',
'2月',
'2月余',
'2月余 ',
'3周余',
'3周月',
'3天',
'3年',
'3年余',
'3月',
'3月余',
'3月余 ',
'40余天',
'4天',
'4小时',
'4年',
'4年余',
'4月',
'4月余',
'4月余 ',
'50天',
'5天',
'5年',
'5年余',
'5月',
'5月余',
'6年余',
'6月',
'6月 ',
'6月余',
'6月余 ',
'7天',
'7年余',
'7月余',
'8年',
'8年余',
'8月',
'8月 ',
'8月余',
'9年余',
'9月',
'9月余',
'9月余 ',
'半年',
'半年余',
'半月余',
'数年',
'无'}
def _encode_duration(raw):
    """Map a free-text duration to an ordinal code for its time unit.

    The units are tested in the order hour, day, week, month, year and the
    first one found in the text decides the code (0 to 4). A value that
    mentions none of the units (for example "无") is returned unchanged.
    """
    text = str(raw)
    for code, unit in enumerate(("小时", "天", "周", "月", "年")):
        if unit in text:
            return code
    return raw


# Encode the column with one assignment rather than element-wise chained
# assignment (`jibing.loc[:, col][i] = ...`), which pandas does not guarantee
# will modify the original frame.
jibing["症状持续时间"] = jibing["症状持续时间"].map(_encode_duration)
# Show which values remain after encoding.
set(jibing.loc[:, "症状持续时间"])
{0, 1, 2, 3, 4, '无'}
统计某一列各元素出现的个数
https://blog.csdn.net/zr1213159840/article/details/107818784
# Count how many rows carry each encoded duration value.
jibing.loc[:, '症状持续时间'].value_counts()
4 797
3 756
1 58
2 18
无 1
0 1
Name: 症状持续时间, dtype: int64
无占很小的比例,直接删掉那一行
pandas 获取指定列中的某个值(范围)所属的行
https://blog.csdn.net/weixin_44222183/article/details/106187018
# Row labels whose symptom duration is the placeholder "无".
jibing.index[jibing['症状持续时间'] == "无"]
Int64Index([1515], dtype='int64')
pandas 删除某一行
https://blog.csdn.net/LHJCSDNYL/article/details/124784943
# Discard rows whose duration is the placeholder "无" or the code 0.
# The index is not renumbered here, so row labels are no longer contiguous
# after this step.
duration = jibing['症状持续时间']
unusable = (duration == "无") | (duration == 0)
jibing = jibing.drop(index=duration[unusable].index)
# Preview the first rows of the cleaned frame.
jibing.head(n=5)
左右 | 是否外伤 | 症状持续时间 | 明显夜间痛 | 性别 | 年龄 | 高血压 | 高血脂 | 2型糖尿病 | 吸烟与否 | ... | 果糖胺 | 肌酸激酶 | α-L-盐藻糖苷酶 | 乳酸 | 淀粉酶 | 同型半胱氨酸 | 铁 | 总铁结合力 | 血型 | 结果 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0.0 | 1 | 54 | 0 | 0 | 0 | 1 | ... | 2.03 | 73.0 | 39.0 | 2.0 | 48.0 | 15.5 | 13.4 | 59.4 | 0 | 0 |
1 | 1 | 1 | 1 | 1.0 | 0 | 63 | 1 | 0 | 1 | 0 | ... | 2.90 | 84.0 | 20.0 | 3.1 | 71.0 | 17.7 | 12.4 | 67.1 | 0 | 0 |
2 | 1 | 0 | 4 | 1.0 | 0 | 65 | 0 | 0 | 0 | 0 | ... | 1.55 | 121.0 | 7.0 | 1.8 | 63.0 | 11.4 | 19.6 | 50.5 | 2 | 0 |
3 | 0 | 1 | 1 | 0.0 | 1 | 45 | 0 | 0 | 0 | 1 | ... | 1.90 | 187.0 | 19.0 | 2.3 | 42.0 | 9.4 | 9.8 | 55.8 | 2 | 0 |
4 | 1 | 1 | 3 | 1.0 | 1 | 55 | 0 | 0 | 0 | 0 | ... | 2.19 | 66.0 | 25.0 | 2.0 | 111.0 | 15.3 | 26.1 | 54.8 | 0 | 0 |
5 rows × 63 columns
查看各列的缺失值情况(总蛋白g/L 和 乳酸 两列各有 1 个缺失值)
# Print each column's non-null count and dtype to check for missing values.
jibing.info()
Int64Index: 1629 entries, 0 to 1630
Data columns (total 63 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 左右 1629 non-null object
1 是否外伤 1629 non-null int64
2 症状持续时间 1629 non-null object
3 明显夜间痛 1629 non-null float64
4 性别 1629 non-null object
5 年龄 1629 non-null int64
6 高血压 1629 non-null int64
7 高血脂 1629 non-null int64
8 2型糖尿病 1629 non-null int64
9 吸烟与否 1629 non-null int64
10 饮酒与否 1629 non-null int64
11 红细胞计数*10^12/L 1629 non-null float64
12 血红蛋白 1629 non-null float64
13 红细胞压积 1629 non-null float64
14 血小板计数 1629 non-null float64
15 血小板压积 1629 non-null float64
16 总蛋白g/L 1628 non-null float64
17 白蛋白g/L 1629 non-null float64
18 球蛋白g/L 1629 non-null object
19 白球比 1629 non-null float64
20 ALT丙氨酸氨基转移酶 1629 non-null int64
21 AST天门冬氨酸氨基转移酶 1629 non-null int64
22 碱性磷酸酶 1629 non-null int64
23 谷氨酸转肽酶 1629 non-null int64
24 AST:ALT 1629 non-null float64
25 总胆红素 1629 non-null float64
26 直接胆红素 1629 non-null float64
27 间接胆红素 1629 non-null float64
28 钾 1629 non-null float64
29 钠 1629 non-null float64
30 氯 1629 non-null float64
31 钙 1629 non-null object
32 磷 1629 non-null float64
33 镁 1629 non-null float64
34 葡萄糖 1629 non-null float64
35 肌酐 1629 non-null float64
36 尿素 1629 non-null float64
37 尿酸 1629 non-null float64
38 甘油三酯 1629 non-null float64
39 总胆固醇 1629 non-null float64
40 H高密度胆固醇 1629 non-null float64
41 L低密度胆固醇 1629 non-null float64
42 载脂蛋白A1 1629 non-null float64
43 载脂蛋白B 1629 non-null float64
44 载脂蛋白E mg/l 1629 non-null float64
45 aPoB/aPoA1 1629 non-null float64
46 脂蛋白小a 1629 non-null object
47 乳酸脱氢酶LDH 1629 non-null int64
48 β-2微球蛋白 1629 non-null float64
49 胆碱酯酶 1629 non-null int64
50 前白蛋白mg/l 1629 non-null int64
51 总胆汁酸 1629 non-null float64
52 腺苷脱氨酶ADA 1629 non-null float64
53 果糖胺 1629 non-null float64
54 肌酸激酶 1629 non-null float64
55 α-L-盐藻糖苷酶 1629 non-null float64
56 乳酸 1628 non-null float64
57 淀粉酶 1629 non-null float64
58 同型半胱氨酸 1629 non-null float64
59 铁 1629 non-null float64
60 总铁结合力 1629 non-null float64
61 血型 1629 non-null object
62 结果 1629 non-null int64
dtypes: float64(41), int64(15), object(7)
memory usage: 814.5+ KB
缺失值占了很少的比例,所以直接去掉缺失值所在的行
# Remove, in place, every row that contains at least one missing value,
# then preview the first rows.
jibing.dropna(axis="index", how="any", inplace=True)
jibing.head(n=5)
左右 | 是否外伤 | 症状持续时间 | 明显夜间痛 | 性别 | 年龄 | 高血压 | 高血脂 | 2型糖尿病 | 吸烟与否 | ... | 果糖胺 | 肌酸激酶 | α-L-盐藻糖苷酶 | 乳酸 | 淀粉酶 | 同型半胱氨酸 | 铁 | 总铁结合力 | 血型 | 结果 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0.0 | 1 | 54 | 0 | 0 | 0 | 1 | ... | 2.03 | 73.0 | 39.0 | 2.0 | 48.0 | 15.5 | 13.4 | 59.4 | 0 | 0 |
1 | 1 | 1 | 1 | 1.0 | 0 | 63 | 1 | 0 | 1 | 0 | ... | 2.90 | 84.0 | 20.0 | 3.1 | 71.0 | 17.7 | 12.4 | 67.1 | 0 | 0 |
2 | 1 | 0 | 4 | 1.0 | 0 | 65 | 0 | 0 | 0 | 0 | ... | 1.55 | 121.0 | 7.0 | 1.8 | 63.0 | 11.4 | 19.6 | 50.5 | 2 | 0 |
3 | 0 | 1 | 1 | 0.0 | 1 | 45 | 0 | 0 | 0 | 1 | ... | 1.90 | 187.0 | 19.0 | 2.3 | 42.0 | 9.4 | 9.8 | 55.8 | 2 | 0 |
4 | 1 | 1 | 3 | 1.0 | 1 | 55 | 0 | 0 | 0 | 0 | ... | 2.19 | 66.0 | 25.0 | 2.0 | 111.0 | 15.3 | 26.1 | 54.8 | 0 | 0 |
5 rows × 63 columns
小数点输入重复
多余的 +
在去量纲化的过程中,发现部分数字的小数点被重复输入(或带有多余的 +),
为了不对整体产生影响,决定将这些行删除
kmp 算法
ValueError: could not convert string to float: '22..9'
# Substrings that mark a malformed numeric entry: a doubled decimal point
# ("..") or a stray plus sign ("+").
bad_markers = ("..", "+")
# Positional (iloc) row numbers of every row holding at least one malformed
# cell. Each row is listed once, in ascending order, even when several of its
# cells are malformed. A plain substring test replaces the hand-rolled KMP
# search, which also avoids rebinding the builtin name `next`.
drop_index = [
    row_pos
    for row_pos, row in enumerate(jibing.itertuples(index=False, name=None))
    if any(marker in str(cell) for cell in row for marker in bad_markers)
]
# Show the positions that were found.
drop_index
[155, 265, 356]
# `drop_index` holds positional row numbers, while the frame's labels are no
# longer contiguous after the earlier row removals. Translate positions to
# labels here instead of hard-coding them, so the step stays correct when the
# input data changes.
bad_labels = jibing.index[sorted(set(drop_index))]
# reset_index() without drop=True stores the old labels in a new leading
# "index" column; the later `jibing.iloc[:, 1:]` step removes that column.
jibing = jibing.drop(index=bad_labels).reset_index()
import random

# Draw a random permutation of the row positions (unseeded, so the order
# differs from run to run). The list gets a descriptive name so the builtin
# `id` is not shadowed.
row_order = list(range(len(jibing)))
random.shuffle(row_order)
# Snapshot of the frame in its pre-shuffle order.
jibing_copy = jibing.copy()
# Reorder all rows with one positional take instead of copying them one at a
# time, then renumber the index 0..n-1.
jibing = jibing_copy.iloc[row_order].reset_index(drop=True)
# Drop the leading column, i.e. the "index" column added by the reset_index()
# call of the preceding step.
jibing = jibing.iloc[:, 1:]
# Preview the shuffled frame.
jibing.head()
左右 | 是否外伤 | 症状持续时间 | 明显夜间痛 | 性别 | 年龄 | 高血压 | 高血脂 | 2型糖尿病 | 吸烟与否 | ... | 果糖胺 | 肌酸激酶 | α-L-盐藻糖苷酶 | 乳酸 | 淀粉酶 | 同型半胱氨酸 | 铁 | 总铁结合力 | 血型 | 结果 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 3 | 0.0 | 0 | 65 | 1 | 0 | 0 | 0 | ... | 1.32 | 48.0 | 12.0 | 1.9 | 49.0 | 9.9 | 12.3 | 43.5 | 3 | 0 |
1 | 1 | 1 | 2 | 0.0 | 0 | 62 | 1 | 0 | 0 | 0 | ... | 1.67 | 77.0 | 16.0 | 1.4 | 81.0 | 9.2 | 16.9 | 55.5 | 0 | 1 |
2 | 1 | 0 | 4 | 1.0 | 0 | 55 | 0 | 0 | 0 | 0 | ... | 1.86 | 78.0 | 22.0 | 1.9 | 89.0 | 9.9 | 7.0 | 51.4 | 0 | 1 |
3 | 1 | 0 | 3 | 0.0 | 0 | 60 | 0 | 0 | 0 | 0 | ... | 1.68 | 92.0 | 12.0 | 1.4 | 69.0 | 9.3 | 15.8 | 53.0 | 0 | 0 |
4 | 0 | 1 | 3 | 0.0 | 0 | 61 | 0 | 0 | 0 | 0 | ... | 1.60 | 58.0 | 14.0 | 1.7 | 153.0 | 8.1 | 13.2 | 45.9 | 0 | 1 |
5 rows × 63 columns
# Write the cleaned, shuffled table to disk, read it back, and pass it through
# the project helper `three_sigema`.
# NOTE(review): `three_sigema` comes from my_tools; presumably it removes
# three-sigma outliers -- confirm against its definition.
cleaned_path = "./jibing_yuchuli_final.xlsx"
jibing.to_excel(cleaned_path, index=False)
jibing = three_sigema(pd.read_excel(cleaned_path))
操作前有1624行
操作后有1598行
删去了26行
# Overwrite the saved workbook with the current table (row index omitted),
# then show the table's (rows, columns) shape.
jibing.to_excel(excel_writer="./jibing_yuchuli_final.xlsx", index=False)
jibing.shape
(1598, 63)