zoodD顶真

数模1232

适应赛

题目1

随机森林

增加一列BMI参数

# Add a BMI column: BMI = weight (kg) / height (m) squared.
height_m = df3['身高cm'] / 100
df3['BMI'] = df3['体重kg'] / (height_m ** 2)
# Take the label column out and re-insert it at position 14 so that it
# comes after the newly added BMI column.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)
print(df3.head(5))

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics


# step1 数据预处理

# Read the data: the workbook holds one sheet of negative cases ('阴性') and
# one sheet of positive cases ('阳性'); the first column is used as the index.
path = 'xiaochuan.xlsx'
# Open the workbook in a with-block so the file handle is released as soon as
# both sheets are parsed (the bare ExcelFile object was never closed).
with pd.ExcelFile(path) as xl:
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)

# Keep only the negative rows whose index label lies in 0..1600.  Notes from an
# earlier run record 4923 negative vs 1597 positive rows, so this brings the
# two classes to a similar size.
df1 = df1.loc[0:1600, :]

# Append the label column 是否得病 (0 = negative, 1 = positive) as the last
# column, whatever the current column count is, instead of hard-coding 13.
df1.insert(loc=df1.shape[1], column='是否得病', value=0)
df2.insert(loc=df2.shape[1], column='是否得病', value=1)

# Stack both groups and renumber the rows 0..n-1.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1

[6520 rows x 14 columns]
'''

# Add a BMI column: BMI = weight (kg) / height (m) squared.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column back to the end so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(df3.shape[1], '是否得病', d)

# Encode gender numerically: '男' (male) -> 1, '女' (female) -> 0.
# Only the gender column is rewritten instead of scanning the whole frame.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})

# Shuffle the rows.  The seed makes the row order (and therefore the seeded
# train/test split performed later) reproducible from run to run.
df3 = shuffle(df3, random_state=42)

# Question 1 only analyses the basic body measurements.
# .copy() gives an independent frame rather than a slice of df3.
df = df3[['性别', '年龄', '身高cm', '体重kg', 'BMI', '是否得病']].copy()
print(df.head(5))


# step3: model building
# One-hot encode the categorical gender feature only.  The binary label stays
# a single 0/1 column: one-hot encoding it makes scikit-learn fit two
# independent outputs, so a prediction row can be [0, 0] or [1, 1], and the
# argmax-based confusion matrix then disagrees with the exact-row accuracy.
dataset = pd.get_dummies(df, columns=['性别'])

y = dataset['是否得病']                  # target
X = dataset.drop(columns=['是否得病'])    # input features

# Split into training (70%) and test (30%) sets.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Standardise the continuous columns.  The scaler is fitted on the training
# rows only and then applied to the test rows, so that no test-set statistics
# leak into the training data.
columns_to_scale = ['年龄', '身高cm', '体重kg', 'BMI']
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
print(x_train.head())

# Random forest classifier with 100 trees and a fixed seed.
randomforest = RandomForestClassifier(random_state=42, n_estimators=100)
model = randomforest.fit(x_train, y_train)

# Training-set performance (predict once, reuse for both metrics).
train_pred = model.predict(x_train)
xm = confusion_matrix(y_train, train_pred)  # confusion matrix
print('训练集混淆矩阵为：\n',xm)
print('训练集准确率:', metrics.accuracy_score(y_train, train_pred))

# Test-set performance.
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)  # confusion matrix
print('测试集混淆矩阵为：\n',cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))
'''
训练集混淆矩阵为：
 [[1102   16]
 [  17 1102]]
训练集准确率: 0.9852481001341081
测试集混淆矩阵为：
 [[261 221]
 [239 239]]
测试集准确率: 0.5166666666666667
'''

SVM

注意：SVM 的数据集的类别分布不能相差过大

import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

# Read the data: the workbook holds one sheet of negative cases ('阴性') and
# one sheet of positive cases ('阳性'); the first column is used as the index.
path = 'xiaochuan.xlsx'
# Open the workbook in a with-block so the file handle is released as soon as
# both sheets are parsed (the bare ExcelFile object was never closed).
with pd.ExcelFile(path) as xl:
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)

# Keep only the negative rows whose index label lies in 0..1600.  Notes from an
# earlier run record 4923 negative vs 1597 positive rows; using all negatives
# would leave the classes unbalanced.
df1 = df1.loc[0:1600, :]

# Append the label column 是否得病 (0 = negative, 1 = positive) as the last
# column, whatever the current column count is, instead of hard-coding 13.
df1.insert(loc=df1.shape[1], column='是否得病', value=0)
df2.insert(loc=df2.shape[1], column='是否得病', value=1)

# Stack both groups and renumber the rows 0..n-1.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1

[6520 rows x 14 columns]
'''

# Add a BMI column: BMI = weight (kg) / height (m) squared.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column back to the end so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(df3.shape[1], '是否得病', d)

# Encode gender numerically: '男' (male) -> 1, '女' (female) -> 0.
# Only the gender column is rewritten instead of scanning the whole frame.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})

# Shuffle the rows.  The seed makes the row order (and therefore the seeded
# train/test split performed later) reproducible from run to run.
df3 = shuffle(df3, random_state=42)

# Question 1 only analyses the basic body measurements (gender is left out
# for the SVM).  .copy() makes df an independent frame, because the modelling
# code that follows writes scaled values into it.
df = df3[['身高cm', '年龄', '体重kg', 'BMI', '是否得病']].copy()
print(df.head(5))


# step3: model building
# The label is already a numeric 0/1 column, so no one-hot encoding is used.
# Work on an explicit copy instead of aliasing df, which is itself a
# selection taken from df3.
dataset = df.copy()

# Target as a 1-D Series (SVC expects a 1-D y, not a single-column frame).
y = dataset['是否得病']
X = dataset.drop(['是否得病'], axis=1)  # input features

# Split into training (70%) and test (30%) sets.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Standardise the features.  The scaler is fitted on the training rows only
# and then applied to the test rows, so that no test-set statistics leak into
# the training data.
columns_to_scale = ['身高cm','年龄', '体重kg', 'BMI']
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
print(x_train.head())

# Create the SVM classifier (default settings) and fit it on the training set.
model = svm.SVC()
model = model.fit(x_train, y_train)
print(y_train)

# Accuracy on the training set.
prediction = model.predict(x_train)
print('训练集准确率:', metrics.accuracy_score(y_train, prediction))

# Accuracy on the test set; the test predictions are also saved to CSV.
prediction = model.predict(x_test)
p_svm = pd.DataFrame(prediction)
p_svm.to_csv("test_svm.csv", index=False, sep=',')
print('测试集准确率:', metrics.accuracy_score(y_test, prediction))

神经网络MLP

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.neural_network import MLPClassifier


# step1 数据预处理

# Read the data: the workbook holds one sheet of negative cases ('阴性') and
# one sheet of positive cases ('阳性'); the first column is used as the index.
path = 'xiaochuan.xlsx'
# Open the workbook in a with-block so the file handle is released as soon as
# both sheets are parsed (the bare ExcelFile object was never closed).
with pd.ExcelFile(path) as xl:
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)

# Keep only the negative rows whose index label lies in 0..1600.  Notes from an
# earlier run record 4923 negative vs 1597 positive rows, so this brings the
# two classes to a similar size.
df1 = df1.loc[0:1600, :]

# Append the label column 是否得病 (0 = negative, 1 = positive) as the last
# column, whatever the current column count is, instead of hard-coding 13.
df1.insert(loc=df1.shape[1], column='是否得病', value=0)
df2.insert(loc=df2.shape[1], column='是否得病', value=1)

# Stack both groups and renumber the rows 0..n-1.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1

[6520 rows x 14 columns]
'''

# Add a BMI column: BMI = weight (kg) / height (m) squared.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column back to the end so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(df3.shape[1], '是否得病', d)

# Encode gender numerically: '男' (male) -> 1, '女' (female) -> 0.
# Only the gender column is rewritten instead of scanning the whole frame.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})

# Shuffle the rows.  The seed makes the row order (and therefore the seeded
# train/test split performed later) reproducible from run to run.
df3 = shuffle(df3, random_state=42)

# Question 1 only analyses the basic body measurements.
# .copy() gives an independent frame rather than a slice of df3.
df = df3[['性别', '年龄', '身高cm', '体重kg', 'BMI', '是否得病']].copy()
print(df.head(5))


# step3: model building
# One-hot encode the categorical gender feature only.  The binary label stays
# a single 0/1 column: a two-column one-hot target makes MLPClassifier fit a
# multilabel problem, in which a prediction row can be [0, 0] or [1, 1].
dataset = pd.get_dummies(df, columns=['性别'])

y = dataset['是否得病']                  # target
X = dataset.drop(columns=['是否得病'])    # input features

# Split into training (70%) and test (30%) sets.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Standardise the continuous columns.  The scaler is fitted on the training
# rows only and then applied to the test rows, so that no test-set statistics
# leak into the training data.
columns_to_scale = ['年龄', '身高cm', '体重kg', 'BMI']
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
print(x_train.head())

# Multi-layer perceptron: two hidden layers (5 and 4 units), logistic
# activation, L-BFGS solver.
mlp = MLPClassifier(solver='lbfgs',hidden_layer_sizes=(5,4),activation='logistic',max_iter=5000)
mlp.fit(x_train, y_train)

print("神经网络方法")
y_t = mlp.predict(x_train)
print('训练集准确率:', metrics.accuracy_score(y_train, y_t))
y_pred = mlp.predict(x_test)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))

题目2

随机森林

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix




# step1 数据预处理

# Read the data: the workbook holds one sheet of negative cases ('阴性') and
# one sheet of positive cases ('阳性'); the first column is used as the index.
path = 'xiaochuan.xlsx'
# Open the workbook in a with-block so the file handle is released as soon as
# both sheets are parsed (the bare ExcelFile object was never closed).
with pd.ExcelFile(path) as xl:
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)

# Keep only the negative rows whose index label lies in 0..1600.  Notes from an
# earlier run record 4923 negative vs 1597 positive rows, so this brings the
# two classes to a similar size.
df1 = df1.loc[0:1600, :]

# Append the label column 是否得病 (0 = negative, 1 = positive) as the last
# column, whatever the current column count is, instead of hard-coding 13.
df1.insert(loc=df1.shape[1], column='是否得病', value=0)
df2.insert(loc=df2.shape[1], column='是否得病', value=1)

# Stack both groups and renumber the rows 0..n-1.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# Column names of the combined frame as a plain list, kept for inspection.
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI column: BMI = weight (kg) / height (m) squared.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column back to the end so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(df3.shape[1], '是否得病', d)

# Encode gender numerically: '男' (male) -> 1, '女' (female) -> 0.
# Only the gender column is rewritten instead of scanning the whole frame.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})

# Shuffle the rows.  The seed makes the row order (and therefore the seeded
# train/test split performed later) reproducible from run to run.
df3 = shuffle(df3, random_state=42)

# Question 2: the nine lung-function measurements plus the label.
# .copy() makes df an independent frame, because later steps modify it.
# The label takes the values {0, 1}: 0 = negative, 1 = positive.
df = df3[['潮气量（L） ','用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']].copy()
print(df.shape)
# Count the missing values per column, then print summary statistics.
print(df.isna().sum())
print(df.describe())
'''
# 潮气量有大量空值
(6520, 10)
潮气量（L）         1974
用力肺活量（%）          0
中心气道参数（%）         0
FEV1/FVC（%）       0
最高呼气流速（%）         0
外周气道参数A（%）        0
外周气道参数B（%）        0
外周气道参数C（%）        0
外周气道参数D（%）        0
是否得病              0
dtype: int64
           潮气量（L）      用力肺活量（%）  ...   外周气道参数D（%）         是否得病
count  4546.000000  6520.000000  ...  6520.000000  6520.000000
mean      1.388097    98.463413  ...    83.906838     0.244939
std       0.558965    11.734124  ...    33.676596     0.430084
min       0.160000    68.711656  ...     0.000000     0.000000
25%       0.990000    89.880350  ...    61.896197     0.000000
50%       1.310000    97.291561  ...    79.493590     0.000000
75%       1.730000   105.834854  ...   101.187574     0.000000
max       4.070000   188.603989  ...   679.464286     1.000000

[8 rows x 10 columns]

Process finished with exit code 0
'''

# Fill the missing tidal-volume readings with the column mean.
# Work on an independent copy and assign the filled column back: calling
# fillna(..., inplace=True) on a column taken from a sliced frame is chained
# assignment, which pandas warns about and which leaves df unchanged when
# copy-on-write is enabled.
df = df.copy()
mean_val = df['潮气量（L） '].mean()
df['潮气量（L） '] = df['潮气量（L） '].fillna(mean_val)
print('填充之后','\n',df.head(5))
print(df.isna().sum())
'''
1636  0.650000  106.319703  110.917031  ...   84.887460   62.626263     0
901   1.020000  108.368201  111.500000  ...   91.600000   61.160714     0
478   0.810000  100.000000   99.130435  ...  111.003861   93.534483     0
6145  1.388097  127.960526  124.809160  ...   69.047619   33.913043     1
4401  1.060000   83.266932   91.866029  ...   80.276134   73.660714     0

[5 rows x 10 columns]
潮气量（L）         0
用力肺活量（%）       0
中心气道参数（%）      0
FEV1/FVC（%）    0
最高呼气流速（%）      0
外周气道参数A（%）     0
外周气道参数B（%）     0
外周气道参数C（%）     0
外周气道参数D（%）     0
是否得病           0
dtype: int64
'''

# step3: model building
# Keep the binary label as one 0/1 column.  One-hot encoding it would make the
# forest fit two independent outputs instead of one binary classifier.
dataset = df.copy()
y = dataset['是否得病']                  # target
x = dataset.drop(columns=['是否得病'])    # input features

# Split into training (70%) and test (30%) sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

# Standardise the features.  The scaler is fitted on the training rows only
# and then applied to the test rows, so that no test-set statistics leak into
# the training data.
columns_to_scale = ['潮气量（L） ','用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）']
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
print(x_train.head(5))

# Feature names, taken from the feature frame itself so they always line up
# with the column order the forest is trained on.
feat_labels = x.columns
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1,max_depth=3)
forest.fit(x_train, y_train)
score = forest.score(x_test, y_test)  # mean accuracy on the test split (not printed)
importances = forest.feature_importances_  # importance the forest assigns to each feature
indices = np.argsort(importances)[::-1]  # feature positions, most important first
for rank, idx in enumerate(indices, start=1):
    print(f"{rank:2d}) {feat_labels[idx]:<30} {importances[idx]:f}")

'''
1) FEV1/FVC（%）                    0.573146
 2) 外周气道参数A（%）                     0.172901
 3) 外周气道参数B（%）                     0.079313
 4) 中心气道参数（%）                      0.068985
 5) 潮气量（L）                         0.037866
 6) 用力肺活量（%）                       0.033652
 7) 最高呼气流速（%）                      0.022079
 8) 外周气道参数D（%）                     0.006881
 9) 外周气道参数C（%）                     0.005177

选前面5个的效果可能会好一点
'''

先看下不做选择时模型的预测效果

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics




# step1 数据预处理

# Read the data: the workbook holds one sheet of negative cases ('阴性') and
# one sheet of positive cases ('阳性'); the first column is used as the index.
path = 'xiaochuan.xlsx'
# Open the workbook in a with-block so the file handle is released as soon as
# both sheets are parsed (the bare ExcelFile object was never closed).
with pd.ExcelFile(path) as xl:
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)

# Keep only the negative rows whose index label lies in 0..1600.  Notes from an
# earlier run record 4923 negative vs 1597 positive rows, so this brings the
# two classes to a similar size.
df1 = df1.loc[0:1600, :]

# Append the label column 是否得病 (0 = negative, 1 = positive) as the last
# column, whatever the current column count is, instead of hard-coding 13.
df1.insert(loc=df1.shape[1], column='是否得病', value=0)
df2.insert(loc=df2.shape[1], column='是否得病', value=1)

# Stack both groups and renumber the rows 0..n-1.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# Column names of the combined frame as a plain list, kept for inspection.
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI column: BMI = weight (kg) / height (m) squared.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column back to the end so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(df3.shape[1], '是否得病', d)

# Encode gender numerically: '男' (male) -> 1, '女' (female) -> 0.
# Only the gender column is rewritten instead of scanning the whole frame.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})

# Shuffle the rows.  The seed makes the row order (and therefore the seeded
# train/test split performed later) reproducible from run to run.
df3 = shuffle(df3, random_state=42)

# Question 2: the nine lung-function measurements plus the label.
# .copy() makes df an independent frame, because later steps modify it.
# The label takes the values {0, 1}: 0 = negative, 1 = positive.
df = df3[['潮气量（L） ','用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']].copy()
print(df.shape)
# Count the missing values per column, then print summary statistics.
print(df.isna().sum())
print(df.describe())
'''
# 潮气量有大量空值
(6520, 10)
潮气量（L）         1974
用力肺活量（%）          0
中心气道参数（%）         0
FEV1/FVC（%）       0
最高呼气流速（%）         0
外周气道参数A（%）        0
外周气道参数B（%）        0
外周气道参数C（%）        0
外周气道参数D（%）        0
是否得病              0
dtype: int64
           潮气量（L）      用力肺活量（%）  ...   外周气道参数D（%）         是否得病
count  4546.000000  6520.000000  ...  6520.000000  6520.000000
mean      1.388097    98.463413  ...    83.906838     0.244939
std       0.558965    11.734124  ...    33.676596     0.430084
min       0.160000    68.711656  ...     0.000000     0.000000
25%       0.990000    89.880350  ...    61.896197     0.000000
50%       1.310000    97.291561  ...    79.493590     0.000000
75%       1.730000   105.834854  ...   101.187574     0.000000
max       4.070000   188.603989  ...   679.464286     1.000000

[8 rows x 10 columns]

Process finished with exit code 0
'''

# Fill the missing tidal-volume readings with the column mean.
# Work on an independent copy and assign the filled column back: calling
# fillna(..., inplace=True) on a column taken from a sliced frame is chained
# assignment, which pandas warns about and which leaves df unchanged when
# copy-on-write is enabled.
df = df.copy()
mean_val = df['潮气量（L） '].mean()
df['潮气量（L） '] = df['潮气量（L） '].fillna(mean_val)
print('填充之后','\n',df.head(5))
print(df.isna().sum())
'''
1636  0.650000  106.319703  110.917031  ...   84.887460   62.626263     0
901   1.020000  108.368201  111.500000  ...   91.600000   61.160714     0
478   0.810000  100.000000   99.130435  ...  111.003861   93.534483     0
6145  1.388097  127.960526  124.809160  ...   69.047619   33.913043     1
4401  1.060000   83.266932   91.866029  ...   80.276134   73.660714     0

[5 rows x 10 columns]
潮气量（L）         0
用力肺活量（%）       0
中心气道参数（%）      0
FEV1/FVC（%）    0
最高呼气流速（%）      0
外周气道参数A（%）     0
外周气道参数B（%）     0
外周气道参数C（%）     0
外周气道参数D（%）     0
是否得病           0
dtype: int64
'''

# step3: model building
# Keep the binary label as one 0/1 column.  One-hot encoding it makes the
# forest fit two independent outputs, so a prediction row can be [0, 0] or
# [1, 1] and the argmax-based confusion matrix disagrees with the accuracy.
dataset = df.copy()
y = dataset['是否得病']                  # target
x = dataset.drop(columns=['是否得病'])    # input features

# Split into training (70%) and test (30%) sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

# Standardise the features.  The scaler is fitted on the training rows only
# and then applied to the test rows, so that no test-set statistics leak into
# the training data.
columns_to_scale = ['潮气量（L） ','用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）']
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
print(x_train.head(5))

# Random forest classifier with 500 trees and a fixed seed.
randomforest = RandomForestClassifier(random_state=42, n_estimators=500)
model = randomforest.fit(x_train, y_train)

# Training-set performance (predict once, reuse for both metrics).
train_pred = model.predict(x_train)
xm = confusion_matrix(y_train, train_pred)  # confusion matrix

print('训练集混淆矩阵为：\n',xm)
print('训练集准确率:', metrics.accuracy_score(y_train, train_pred))

# Test-set performance.
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)  # confusion matrix

print('测试集混淆矩阵为：\n',cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))

进行特征选择，选择前7个

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics



# step1 数据预处理

# Read the data: the workbook holds one sheet of negative cases ('阴性') and
# one sheet of positive cases ('阳性'); the first column is used as the index.
path = 'xiaochuan.xlsx'
# Open the workbook in a with-block so the file handle is released as soon as
# both sheets are parsed (the bare ExcelFile object was never closed).
with pd.ExcelFile(path) as xl:
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)

# Keep only the negative rows whose index label lies in 0..1600.  Notes from an
# earlier run record 4923 negative vs 1597 positive rows, so this brings the
# two classes to a similar size.
df1 = df1.loc[0:1600, :]

# Append the label column 是否得病 (0 = negative, 1 = positive) as the last
# column, whatever the current column count is, instead of hard-coding 13.
df1.insert(loc=df1.shape[1], column='是否得病', value=0)
df2.insert(loc=df2.shape[1], column='是否得病', value=1)

# Stack both groups and renumber the rows 0..n-1.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# Column names of the combined frame as a plain list, kept for inspection.
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI column: BMI = weight (kg) / height (m) squared.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column back to the end so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(df3.shape[1], '是否得病', d)

# Encode gender numerically: '男' (male) -> 1, '女' (female) -> 0.
# Only the gender column is rewritten instead of scanning the whole frame.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})

# Shuffle the rows.  The seed makes the row order (and therefore the seeded
# train/test split performed later) reproducible from run to run.
df3 = shuffle(df3, random_state=42)

# Question 2: the nine lung-function measurements plus the label.
# .copy() makes df an independent frame, because later steps modify it.
# The label takes the values {0, 1}: 0 = negative, 1 = positive.
df = df3[['潮气量（L） ','用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']].copy()
print(df.shape)
# Count the missing values per column, then print summary statistics.
print(df.isna().sum())
print(df.describe())
'''
# 潮气量有大量空值
(6520, 10)
潮气量（L）         1974
用力肺活量（%）          0
中心气道参数（%）         0
FEV1/FVC（%）       0
最高呼气流速（%）         0
外周气道参数A（%）        0
外周气道参数B（%）        0
外周气道参数C（%）        0
外周气道参数D（%）        0
是否得病              0
dtype: int64
           潮气量（L）      用力肺活量（%）  ...   外周气道参数D（%）         是否得病
count  4546.000000  6520.000000  ...  6520.000000  6520.000000
mean      1.388097    98.463413  ...    83.906838     0.244939
std       0.558965    11.734124  ...    33.676596     0.430084
min       0.160000    68.711656  ...     0.000000     0.000000
25%       0.990000    89.880350  ...    61.896197     0.000000
50%       1.310000    97.291561  ...    79.493590     0.000000
75%       1.730000   105.834854  ...   101.187574     0.000000
max       4.070000   188.603989  ...   679.464286     1.000000

[8 rows x 10 columns]

Process finished with exit code 0
'''

# Fill the missing tidal-volume readings with the column mean.
# Work on an independent copy and assign the filled column back: calling
# fillna(..., inplace=True) on a column taken from a sliced frame is chained
# assignment, which pandas warns about and which leaves df unchanged when
# copy-on-write is enabled.
df = df.copy()
mean_val = df['潮气量（L） '].mean()
df['潮气量（L） '] = df['潮气量（L） '].fillna(mean_val)
print('填充之后','\n',df.head(5))
print(df.isna().sum())
'''
1636  0.650000  106.319703  110.917031  ...   84.887460   62.626263     0
901   1.020000  108.368201  111.500000  ...   91.600000   61.160714     0
478   0.810000  100.000000   99.130435  ...  111.003861   93.534483     0
6145  1.388097  127.960526  124.809160  ...   69.047619   33.913043     1
4401  1.060000   83.266932   91.866029  ...   80.276134   73.660714     0

[5 rows x 10 columns]
潮气量（L）         0
用力肺活量（%）       0
中心气道参数（%）      0
FEV1/FVC（%）    0
最高呼气流速（%）      0
外周气道参数A（%）     0
外周气道参数B（%）     0
外周气道参数C（%）     0
外周气道参数D（%）     0
是否得病           0
dtype: int64
'''

# step3 模型建立


# Feature selection: keep seven feature columns plus the label column.
selected_columns = [
    'FEV1/FVC（%）',
    '外周气道参数A（%）',
    '外周气道参数B（%）',
    '中心气道参数（%）',
    '潮气量（L） ',
    '用力肺活量（%）',
    '最高呼气流速（%）',
    '是否得病',
]
df = df[selected_columns]
print(df.head(5))
'''
      FEV1/FVC（%）  外周气道参数A（%）  外周气道参数B（%）  ...   用力肺活量（%）   最高呼气流速（%）  是否得病
1491    88.235294   56.373938   91.962617  ...  82.156134   89.898990     0
80      79.794521   45.555556   91.546763  ...  97.333333   92.721519     0
296     82.951654   84.552846   94.329897  ...  85.249458  116.040956     0
2187    87.401575   66.954023   67.620751  ...  82.200647   76.863354     1
1071    86.764706   68.318966   99.506579  ...  97.142857  101.744186     0
'''
# Keep the binary label as one 0/1 column.  One-hot encoding it makes the
# forest fit two independent outputs, so a prediction row can be [0, 0] or
# [1, 1] and the argmax-based confusion matrix disagrees with the accuracy.
dataset = df.copy()
y = dataset['是否得病']                  # target
x = dataset.drop(columns=['是否得病'])    # input features

# Split into training (70%) and test (30%) sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

# Standardise the features.  The scaler is fitted on the training rows only
# and then applied to the test rows, so that no test-set statistics leak into
# the training data.
columns_to_scale = ['FEV1/FVC（%）','外周气道参数A（%）','外周气道参数B（%）','中心气道参数（%）','潮气量（L） ','用力肺活量（%）','最高呼气流速（%）']
standardScaler = StandardScaler()
x_train = x_train.copy()
x_test = x_test.copy()
x_train[columns_to_scale] = standardScaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = standardScaler.transform(x_test[columns_to_scale])
print(x_train.head(5))

# Random forest classifier with 500 trees and a fixed seed.
randomforest = RandomForestClassifier(random_state=42, n_estimators=500)
model = randomforest.fit(x_train, y_train)

# Training-set performance (predict once, reuse for both metrics).
train_pred = model.predict(x_train)
xm = confusion_matrix(y_train, train_pred)  # confusion matrix
print("特征选择之后")
print('训练集混淆矩阵为：\n',xm)
print('训练集准确率:', metrics.accuracy_score(y_train, train_pred))

# Test-set performance.
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)  # confusion matrix

print('测试集混淆矩阵为：\n',cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))

'''
训练集混淆矩阵为：
 [[1119    0]
 [   0 1118]]
训练集准确率: 1.0
测试集混淆矩阵为：
 [[372 109]
 [156 323]]
测试集准确率: 0.7166666666666667
'''

svm

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm


# step1 数据预处理

# Read the data: the workbook holds one sheet of negative cases ('阴性') and
# one sheet of positive cases ('阳性'); the first column is used as the index.
path = 'xiaochuan.xlsx'
# Open the workbook in a with-block so the file handle is released as soon as
# both sheets are parsed (the bare ExcelFile object was never closed).
with pd.ExcelFile(path) as xl:
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)

# Keep only the negative rows whose index label lies in 0..1600.  Notes from an
# earlier run record 4923 negative vs 1597 positive rows, so this brings the
# two classes to a similar size.
df1 = df1.loc[0:1600, :]

# Append the label column 是否得病 (0 = negative, 1 = positive) as the last
# column, whatever the current column count is, instead of hard-coding 13.
df1.insert(loc=df1.shape[1], column='是否得病', value=0)
df2.insert(loc=df2.shape[1], column='是否得病', value=1)

# Stack both groups and renumber the rows 0..n-1.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# Column names of the combined frame as a plain list, kept for inspection.
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI column: BMI = weight (kg) / height (m) squared.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label column back to the end so it follows the new BMI column.
d = df3.pop('是否得病')
df3.insert(df3.shape[1], '是否得病', d)

# Encode gender numerically: '男' (male) -> 1, '女' (female) -> 0.
# Only the gender column is rewritten instead of scanning the whole frame.
df3['性别'] = df3['性别'].replace({'男': 1, '女': 0})

# Shuffle the rows.  The seed makes the row order (and therefore the seeded
# train/test split performed later) reproducible from run to run.
df3 = shuffle(df3, random_state=42)

# Question 2: the nine lung-function measurements plus the label.
# .copy() makes df an independent frame, because later steps modify it.
# The label takes the values {0, 1}: 0 = negative, 1 = positive.
df = df3[['潮气量（L） ','用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']].copy()
print(df.shape)
# Count the missing values per column, then print summary statistics.
print(df.isna().sum())
print(df.describe())
'''
# 潮气量有大量空值
(6520, 10)
潮气量（L）         1974
用力肺活量（%）          0
中心气道参数（%）         0
FEV1/FVC（%）       0
最高呼气流速（%）         0
外周气道参数A（%）        0
外周气道参数B（%）        0
外周气道参数C（%）        0
外周气道参数D（%）        0
是否得病              0
dtype: int64
           潮气量（L）      用力肺活量（%）  ...   外周气道参数D（%）         是否得病
count  4546.000000  6520.000000  ...  6520.000000  6520.000000
mean      1.388097    98.463413  ...    83.906838     0.244939
std       0.558965    11.734124  ...    33.676596     0.430084
min       0.160000    68.711656  ...     0.000000     0.000000
25%       0.990000    89.880350  ...    61.896197     0.000000
50%       1.310000    97.291561  ...    79.493590     0.000000
75%       1.730000   105.834854  ...   101.187574     0.000000
max       4.070000   188.603989  ...   679.464286     1.000000

[8 rows x 10 columns]

Process finished with exit code 0
'''

# Impute missing tidal-volume values with the column mean.
# fillna() is applied to the frame and the result is rebound: an in-place
# fillna on a selected column is a chained assignment, which pandas does not
# write back to the frame when Copy-on-Write is active.
mean_val = df['潮气量（L） '].mean()
df = df.fillna({'潮气量（L） ': mean_val})
print('填充之后','\n',df.head(5))
print(df.isna().sum())
'''
1636  0.650000  106.319703  110.917031  ...   84.887460   62.626263     0
901   1.020000  108.368201  111.500000  ...   91.600000   61.160714     0
478   0.810000  100.000000   99.130435  ...  111.003861   93.534483     0
6145  1.388097  127.960526  124.809160  ...   69.047619   33.913043     1
4401  1.060000   83.266932   91.866029  ...   80.276134   73.660714     0

[5 rows x 10 columns]
潮气量（L）         0
用力肺活量（%）       0
中心气道参数（%）      0
FEV1/FVC（%）    0
最高呼气流速（%）      0
外周气道参数A（%）     0
外周气道参数B（%）     0
外周气道参数C（%）     0
外周气道参数D（%）     0
是否得病           0
dtype: int64
'''

# step3 模型建立


# Feature selection: keep seven examination parameters and the label.
selected_columns = [
    'FEV1/FVC（%）',
    '外周气道参数A（%）',
    '外周气道参数B（%）',
    '中心气道参数（%）',
    '潮气量（L） ',
    '用力肺活量（%）',
    '最高呼气流速（%）',
    '是否得病',
]
df = df[selected_columns]
print(df.head(5))
'''
      FEV1/FVC（%）  外周气道参数A（%）  外周气道参数B（%）  ...   用力肺活量（%）   最高呼气流速（%）  是否得病
1491    88.235294   56.373938   91.962617  ...  82.156134   89.898990     0
80      79.794521   45.555556   91.546763  ...  97.333333   92.721519     0
296     82.951654   84.552846   94.329897  ...  85.249458  116.040956     0
2187    87.401575   66.954023   67.620751  ...  82.200647   76.863354     1
1071    86.764706   68.318966   99.506579  ...  97.142857  101.744186     0
'''
# Work on an independent copy: df is a column selection of another frame,
# and assigning scaled columns into it directly would write into a slice.
dataset = df.copy()
standardScaler = StandardScaler()
columns_to_scale = ['FEV1/FVC（%）','外周气道参数A（%）','外周气道参数B（%）','中心气道参数（%）','潮气量（L） ','用力肺活量（%）','最高呼气流速（%）']
# NOTE(review): the scaler is fitted on all rows before the train/test
# split, so test-set statistics influence the training features.
dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
print(dataset.head(5))

y = dataset[['是否得病']]  # target
x = dataset.drop(['是否得病'], axis=1)  # input features

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
model = svm.SVC()  # SVM classifier with default hyper-parameters
# SVC expects a 1-D label vector; flattening the one-column frame avoids
# scikit-learn's DataConversionWarning.
model = model.fit(x_train, y_train.values.ravel())
print(y_train)
prediction = model.predict(x_train)  # predictions on the TRAINING set

# accuracy_score takes (y_true, y_pred).
print('训练集准确率:', metrics.accuracy_score(y_train, prediction))

prediction = model.predict(x_test)  # predictions on the TEST set

# Persist the test-set predictions for later inspection.
p_svm = pd.DataFrame(prediction)
p_svm.to_csv("test_svm.csv", index=False, sep=',')
print('测试集准确率:', metrics.accuracy_score(y_test, prediction))

MLP

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.neural_network import MLPClassifier


# step1 数据预处理

# Read both sheets of the workbook. The context manager closes the file
# handle as soon as parsing is finished.
path = 'xiaochuan.xlsx'
with pd.ExcelFile(path) as xl:
    # Sheet '阴性' holds the negative cases, '阳性' the positive ones.
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only the negative rows whose index label lies in 0..1600
# (.loc slicing is label-based and inclusive at both ends).
df1 = df1.loc[0:1600, :]

# Add the target column at position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)

# Stack both groups and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label back to the last position (index 14) so that it follows
# the newly appended BMI column.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)

# Encode the gender strings numerically: male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)

# Randomise the row order (no random_state is passed, so runs differ).
df3 = shuffle(df3)

# Question 2: keep the nine examination parameters plus the label.
# The explicit .copy() makes df independent of df3, so the column
# assignments performed later do not operate on a slice.
df = df3[['潮气量（L） ','用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']].copy()
# Label values: 0 = negative, 1 = positive.
print(df.shape)
# Missing values per column, then summary statistics.
print(df.isna().sum())
print(df.describe())
'''
# 潮气量有大量空值
(6520, 10)
潮气量（L）         1974
用力肺活量（%）          0
中心气道参数（%）         0
FEV1/FVC（%）       0
最高呼气流速（%）         0
外周气道参数A（%）        0
外周气道参数B（%）        0
外周气道参数C（%）        0
外周气道参数D（%）        0
是否得病              0
dtype: int64
           潮气量（L）      用力肺活量（%）  ...   外周气道参数D（%）         是否得病
count  4546.000000  6520.000000  ...  6520.000000  6520.000000
mean      1.388097    98.463413  ...    83.906838     0.244939
std       0.558965    11.734124  ...    33.676596     0.430084
min       0.160000    68.711656  ...     0.000000     0.000000
25%       0.990000    89.880350  ...    61.896197     0.000000
50%       1.310000    97.291561  ...    79.493590     0.000000
75%       1.730000   105.834854  ...   101.187574     0.000000
max       4.070000   188.603989  ...   679.464286     1.000000

[8 rows x 10 columns]

Process finished with exit code 0
'''

# Impute missing tidal-volume values with the column mean.
# fillna() is applied to the frame and the result is rebound: an in-place
# fillna on a selected column is a chained assignment, which pandas does not
# write back to the frame when Copy-on-Write is active.
mean_val = df['潮气量（L） '].mean()
df = df.fillna({'潮气量（L） ': mean_val})
print('填充之后','\n',df.head(5))
print(df.isna().sum())
'''
1636  0.650000  106.319703  110.917031  ...   84.887460   62.626263     0
901   1.020000  108.368201  111.500000  ...   91.600000   61.160714     0
478   0.810000  100.000000   99.130435  ...  111.003861   93.534483     0
6145  1.388097  127.960526  124.809160  ...   69.047619   33.913043     1
4401  1.060000   83.266932   91.866029  ...   80.276134   73.660714     0

[5 rows x 10 columns]
潮气量（L）         0
用力肺活量（%）       0
中心气道参数（%）      0
FEV1/FVC（%）    0
最高呼气流速（%）      0
外周气道参数A（%）     0
外周气道参数B（%）     0
外周气道参数C（%）     0
外周气道参数D（%）     0
是否得病           0
dtype: int64
'''

# step3 模型建立


# Feature selection: keep seven examination parameters and the label.
selected_columns = [
    'FEV1/FVC（%）',
    '外周气道参数A（%）',
    '外周气道参数B（%）',
    '中心气道参数（%）',
    '潮气量（L） ',
    '用力肺活量（%）',
    '最高呼气流速（%）',
    '是否得病',
]
df = df[selected_columns]
print(df.head(5))
'''
      FEV1/FVC（%）  外周气道参数A（%）  外周气道参数B（%）  ...   用力肺活量（%）   最高呼气流速（%）  是否得病
1491    88.235294   56.373938   91.962617  ...  82.156134   89.898990     0
80      79.794521   45.555556   91.546763  ...  97.333333   92.721519     0
296     82.951654   84.552846   94.329897  ...  85.249458  116.040956     0
2187    87.401575   66.954023   67.620751  ...  82.200647   76.863354     1
1071    86.764706   68.318966   99.506579  ...  97.142857  101.744186     0
'''
# One-hot encode the label into the two indicator columns listed below.
label_columns = ['是否得病_0','是否得病_1']
dataset = pd.get_dummies(df, columns=['是否得病'])

# Standardise every feature column.
standardScaler = StandardScaler()
columns_to_scale = ['FEV1/FVC（%）','外周气道参数A（%）','外周气道参数B（%）','中心气道参数（%）','潮气量（L） ','用力肺活量（%）','最高呼气流速（%）']
scaled_values = standardScaler.fit_transform(dataset[columns_to_scale])
dataset[columns_to_scale] = scaled_values
print(dataset.head(5))

# Targets are the two indicator columns; everything else is an input feature.
y = dataset[label_columns]
x = dataset.drop(label_columns, axis=1)

# 70 % training / 30 % test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# Two hidden layers (5 and 4 units), logistic activation, L-BFGS solver.
mlp = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(5, 4),
    activation='logistic',
    max_iter=5000,
)
mlp.fit(x_train, y_train)

print("神经网络方法")
y_t = mlp.predict(x_train)
train_accuracy = metrics.accuracy_score(y_t, y_train)
print('训练集准确率:', train_accuracy)
y_pred = mlp.predict(x_test)
test_accuracy = metrics.accuracy_score(y_pred, y_test)
print('测试集准确率:', test_accuracy)

题目三

随机森林

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import svm


# step1 数据预处理

# Read both sheets of the workbook. The context manager closes the file
# handle as soon as parsing is finished.
path = 'xiaochuan.xlsx'
with pd.ExcelFile(path) as xl:
    # Sheet '阴性' holds the negative cases, '阳性' the positive ones.
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only the negative rows whose index label lies in 0..1600
# (.loc slicing is label-based and inclusive at both ends).
df1 = df1.loc[0:1600, :]

# Add the target column at position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)

# Stack both groups and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label back to the last position (index 14), after BMI.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)

# Encode the gender strings numerically: male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)

# Randomise the row order (no random_state is passed, so runs differ).
df3 = shuffle(df3)

# Question 3: this run keeps every column (no feature is dropped).
df = df3
# Impute missing tidal-volume values with the column mean. The filled
# column is assigned back explicitly: an in-place fillna on a selected
# column is a chained assignment, which pandas does not write back to the
# frame when Copy-on-Write is active.
mean_val = df['潮气量（L） '].mean()
df['潮气量（L） '] = df['潮气量（L） '].fillna(mean_val)

print(df.head(10))
print(df.columns)
'''
Index(['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）',
       'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）',
       '外周气道参数D（%）', 'BMI', '是否得病'],
      dtype='object')
'''

# step3 模型建立

# One-hot encode the label into '是否得病_0' / '是否得病_1'.
dataset = pd.get_dummies(df, columns=['是否得病'])
standardScaler = StandardScaler()
columns_to_scale = ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）',
       'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）',
       '外周气道参数D（%）', 'BMI']
# NOTE(review): the scaler is fitted on all rows before the train/test
# split, so test-set statistics influence the training features.
dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
print(dataset.head(5))

y = dataset[['是否得病_0','是否得病_1']]  # targets (indicator columns)
x = dataset.drop(['是否得病_0','是否得病_1'], axis=1)  # input features

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Random forest with 500 trees and a fixed seed, fitted on the training split.
randomforest = RandomForestClassifier(random_state=42, n_estimators=500)
model = randomforest.fit(x_train, y_train)

y_train = numpy.array(y_train)
# Predict once and reuse the result; running the 500-tree forest twice over
# the training set only doubled the cost.
train_pred = model.predict(x_train)
# argmax turns the indicator rows back into class ids 0 / 1.
xm = confusion_matrix(y_train.argmax(axis=1), train_pred.argmax(axis=1))
print('训练集混淆矩阵为：\n',xm)
# accuracy_score takes (y_true, y_pred).
print('训练集准确率:', metrics.accuracy_score(y_train, train_pred))

y_test = numpy.array(y_test)

y_pred = model.predict(x_test)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))

print('测试集混淆矩阵为：\n',cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))

'''
[5 rows x 16 columns]
训练集混淆矩阵为：
 [[1130    0]
 [   0 1107]]
训练集准确率: 1.0
测试集混淆矩阵为：
 [[380  90]
 [154 336]]
测试集准确率: 0.74375
'''

SVM

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import svm


# step1 数据预处理

# Read both sheets of the workbook. The context manager closes the file
# handle as soon as parsing is finished.
path = 'xiaochuan.xlsx'
with pd.ExcelFile(path) as xl:
    # Sheet '阴性' holds the negative cases, '阳性' the positive ones.
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only the negative rows whose index label lies in 0..1600
# (.loc slicing is label-based and inclusive at both ends).
df1 = df1.loc[0:1600, :]

# Add the target column at position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)

# Stack both groups and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label back to the last position (index 14), after BMI.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)

# Encode the gender strings numerically: male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)

# Randomise the row order (no random_state is passed, so runs differ).
df3 = shuffle(df3)

# Question 3: this run keeps every column (no feature is dropped).
df = df3
# Impute missing tidal-volume values with the column mean. The filled
# column is assigned back explicitly: an in-place fillna on a selected
# column is a chained assignment, which pandas does not write back to the
# frame when Copy-on-Write is active.
mean_val = df['潮气量（L） '].mean()
df['潮气量（L） '] = df['潮气量（L） '].fillna(mean_val)

print(df.head(10))
print(df.columns)
'''
Index(['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）',
       'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）',
       '外周气道参数D（%）', 'BMI', '是否得病'],
      dtype='object')
'''

# step3 模型建立

# dataset is the same object as df, so the scaling below modifies df too.
dataset = df
standardScaler = StandardScaler()
columns_to_scale = ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）',
       'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）',
       '外周气道参数D（%）', 'BMI']
# NOTE(review): the scaler is fitted on all rows before the train/test
# split, so test-set statistics influence the training features.
dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
print(dataset.head(5))

y = dataset[['是否得病']]  # target
x = dataset.drop(['是否得病'], axis=1)  # input features

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
model = svm.SVC()  # SVM classifier with default hyper-parameters
# SVC expects a 1-D label vector; flattening the one-column frame avoids
# scikit-learn's DataConversionWarning.
model = model.fit(x_train, y_train.values.ravel())
print(y_train)
prediction = model.predict(x_train)  # predictions on the TRAINING set

# accuracy_score takes (y_true, y_pred).
print('训练集准确率:', metrics.accuracy_score(y_train, prediction))

prediction = model.predict(x_test)  # predictions on the TEST set

# Persist the test-set predictions for later inspection.
p_svm = pd.DataFrame(prediction)
p_svm.to_csv("test_svm.csv", index=False, sep=',')
print('测试集准确率:', metrics.accuracy_score(y_test, prediction))

特征选择之后

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import svm


# step1 数据预处理

# Read both sheets of the workbook. The context manager closes the file
# handle as soon as parsing is finished.
path = 'xiaochuan.xlsx'
with pd.ExcelFile(path) as xl:
    # Sheet '阴性' holds the negative cases, '阳性' the positive ones.
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only the negative rows whose index label lies in 0..1600
# (.loc slicing is label-based and inclusive at both ends).
df1 = df1.loc[0:1600, :]

# Add the target column at position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)

# Stack both groups and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label back to the last position (index 14), after BMI.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)

# Encode the gender strings numerically: male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)

# Randomise the row order (no random_state is passed, so runs differ).
df3 = shuffle(df3)

# Question 3: metadata = age, weight, height, BMI; examination parameters =
# the first seven.
# NOTE(review): the list previously named '外周气道参数C（%）' twice. "First
# seven parameters" suggests '外周气道参数D（%）' was meant as the second
# entry, but the scaling step later in this script still expects column D,
# so only the duplicate is removed here - confirm the intended feature set.
drop_features = ['性别','外周气道参数C（%）']
df = df3.drop(drop_features, axis=1)
# Impute missing tidal-volume values with the column mean. The filled
# column is assigned back explicitly: an in-place fillna on a selected
# column is a chained assignment, which pandas does not write back to the
# frame when Copy-on-Write is active.
mean_val = df['潮气量（L） '].mean()
df['潮气量（L） '] = df['潮气量（L） '].fillna(mean_val)

print(df.head(10))
print(df.columns)
'''
Index(['年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）',
       '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数D（%）', 'BMI', '是否得病'],
      dtype='object')
'''

# step3 模型建立

# dataset is the same object as df, so the scaling below modifies df too.
dataset = df
standardScaler = StandardScaler()
columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）',
       '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数D（%）', 'BMI']
# NOTE(review): the scaler is fitted on all rows before the train/test
# split, so test-set statistics influence the training features.
dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
print(dataset.head(5))

y = dataset[['是否得病']]  # target
x = dataset.drop(['是否得病'], axis=1)  # input features

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
model = svm.SVC()  # SVM classifier with default hyper-parameters
# SVC expects a 1-D label vector; flattening the one-column frame avoids
# scikit-learn's DataConversionWarning.
model = model.fit(x_train, y_train.values.ravel())
print(y_train)
prediction = model.predict(x_train)  # predictions on the TRAINING set

# accuracy_score takes (y_true, y_pred).
print('训练集准确率:', metrics.accuracy_score(y_train, prediction))

prediction = model.predict(x_test)  # predictions on the TEST set

# Persist the test-set predictions for later inspection.
p_svm = pd.DataFrame(prediction)
p_svm.to_csv("test_svm.csv", index=False, sep=',')
print('测试集准确率:', metrics.accuracy_score(y_test, prediction))

神经网络

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neural_network import MLPClassifier


# step1 数据预处理

# Read both sheets of the workbook. The context manager closes the file
# handle as soon as parsing is finished.
path = 'xiaochuan.xlsx'
with pd.ExcelFile(path) as xl:
    # Sheet '阴性' holds the negative cases, '阳性' the positive ones.
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)
# Keep only the negative rows whose index label lies in 0..1600
# (.loc slicing is label-based and inclusive at both ends).
df1 = df1.loc[0:1600, :]

# Add the target column at position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)

# Stack both groups and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label back to the last position (index 14), after BMI.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)

# Encode the gender strings numerically: male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)

# Randomise the row order (no random_state is passed, so runs differ).
df3 = shuffle(df3)

# Question 3: metadata = age, weight, height, BMI; examination parameters =
# the first seven.
# NOTE(review): the list previously named '外周气道参数C（%）' twice. "First
# seven parameters" suggests '外周气道参数D（%）' was meant as the second
# entry, but the scaling step later in this script still expects column D,
# so only the duplicate is removed here - confirm the intended feature set.
drop_features = ['性别','外周气道参数C（%）']
df = df3.drop(drop_features, axis=1)
# Impute missing tidal-volume values with the column mean. The filled
# column is assigned back explicitly: an in-place fillna on a selected
# column is a chained assignment, which pandas does not write back to the
# frame when Copy-on-Write is active.
mean_val = df['潮气量（L） '].mean()
df['潮气量（L） '] = df['潮气量（L） '].fillna(mean_val)

print(df.head(10))
print(df.columns)
'''
Index(['年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）',
       '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数D（%）', 'BMI', '是否得病'],
      dtype='object')
'''

# step3 模型建立
# One-hot encode the label into the two indicator columns listed below.
label_columns = ['是否得病_0','是否得病_1']
dataset = pd.get_dummies(df, columns=['是否得病'])

# Standardise every feature column.
standardScaler = StandardScaler()
columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）',
       '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数D（%）', 'BMI']
scaled_values = standardScaler.fit_transform(dataset[columns_to_scale])
dataset[columns_to_scale] = scaled_values
print(dataset.head(5))

# Targets are the two indicator columns; everything else is an input feature.
y = dataset[label_columns]
x = dataset.drop(label_columns, axis=1)

# 70 % training / 30 % test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# Two hidden layers (5 and 4 units), logistic activation, L-BFGS solver.
mlp = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(5, 4),
    activation='logistic',
    max_iter=5000,
)
mlp.fit(x_train, y_train)

print("神经网络方法")
y_t = mlp.predict(x_train)
train_accuracy = metrics.accuracy_score(y_t, y_train)
print('训练集准确率:', train_accuracy)
y_pred = mlp.predict(x_test)
test_accuracy = metrics.accuracy_score(y_pred, y_test)
print('测试集准确率:', test_accuracy)

K折交叉验证

机器学习—模型选择与优化7-1（k-fold交叉验证法） - 橘子橘子呀 - 博客园 (cnblogs.com)

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

# step1 数据预处理

# Read both sheets of the workbook. The context manager closes the file
# handle as soon as parsing is finished.
path = 'xiaochuan.xlsx'
with pd.ExcelFile(path) as xl:
    # Sheet '阴性' holds the negative cases, '阳性' the positive ones.
    # All negative rows are kept in this script (no row-range restriction).
    df1 = xl.parse('阴性', index_col=0)
    df2 = xl.parse('阳性', index_col=0)

# Add the target column at position 13: 0 = negative, 1 = positive.
df1.insert(loc=13, column='是否得病', value=0)
df2.insert(loc=13, column='是否得病', value=1)

# Stack both groups and renumber the rows from 0.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
ColNames_List = df3.columns.values.tolist()
# print('------------------------------------------------------')
# print(ColNames_List,type(ColNames_List))
#  ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ',
#  '用力肺活量（%）', '中心气道参数（%）', 'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）', '外周气道参数D（%）', '是否得病']
# print(df3)
'''
     性别  年龄   身高cm  体重kg  ...  外周气道参数B（%）  外周气道参数C（%）  外周气道参数D（%）  是否得病
0     男  40  176.0  67.0  ...   64.661654  105.070423   76.190476     0
1     男  55  151.0  49.0  ...   59.870550  101.844262   83.653846     0
2     男  30  181.0  69.0  ...   64.519906  103.146067  107.100592     0
3     男  25  179.0  75.0  ...   62.237762  104.092072   92.814371     0
4     男  23  171.0  59.0  ...   54.024390   74.943567   48.076923     0
...  ..  ..    ...   ...  ...         ...         ...         ...   ...
6515  女  36  167.0  60.0  ...   96.875000  102.512563  100.000000     1
6516  男  28  183.0  68.0  ...  102.870264   63.942308   82.269504     1
6517  女  36  160.0  55.0  ...   64.957265   58.203125   54.585153     1
6518  女  60  159.0  74.0  ...  100.957854   72.000000   48.529412     1
6519  女  63  156.0  46.0  ...   66.336634   92.553191   87.368421     1
[6520 rows x 14 columns]
'''

# Add a BMI feature: BMI = weight (kg) / height (m)^2; height is stored in cm.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the label back to the last position (index 14), after BMI.
d = df3.pop('是否得病')
df3.insert(14, '是否得病', d)

# Encode the gender strings numerically: male -> 1, female -> 0.
df3.replace('男', 1, inplace=True)
df3.replace('女', 0, inplace=True)

# Randomise the row order (no random_state is passed, so runs differ).
df3 = shuffle(df3)

# Question 3: this run keeps every column (no feature is dropped).
df = df3
# Impute missing tidal-volume values with the column mean. The filled
# column is assigned back explicitly: an in-place fillna on a selected
# column is a chained assignment, which pandas does not write back to the
# frame when Copy-on-Write is active.
mean_val = df['潮气量（L） '].mean()
df['潮气量（L） '] = df['潮气量（L） '].fillna(mean_val)

print(df.head(10))
print(df.columns)
'''
Index(['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）',
       'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）',
       '外周气道参数D（%）', 'BMI', '是否得病'],
      dtype='object')
'''

# step3 模型建立

# One-hot encode the label into '是否得病_0' / '是否得病_1'.
dataset = pd.get_dummies(df, columns=['是否得病'])
standardScaler = StandardScaler()
columns_to_scale = ['性别', '年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）',
       'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', '外周气道参数C（%）',
       '外周气道参数D（%）', 'BMI']
dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
print(dataset.head(5))

y = dataset[['是否得病_0','是否得病_1']]  # targets (indicator columns)
x = dataset.drop(['是否得病_0','是否得病_1'], axis=1)  # input features

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# Cross-validation runs on the training split only; the test split is
# reserved for the final evaluation.

# Both estimators are classifiers, so they are scored with accuracy.
# 'r2' is a regression metric and is not meaningful for class labels.
# The names rf_mse / mlp_mse are kept for compatibility although they hold
# accuracy scores.
rf = RandomForestClassifier(random_state=42, n_estimators=500)
rf_mse = cross_val_score(estimator=rf, X=x_train, y=y_train, scoring='accuracy', cv=5, verbose=1, n_jobs=6)

print('随机森林模型中，准确率的平均数是 %.4f，标准差是 %.4f' % (rf_mse.mean(), rf_mse.std()))

mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5, 4), activation='logistic', max_iter=5000)
mlp_mse = cross_val_score(estimator=mlp, X=x_train, y=y_train, scoring='accuracy', cv=5, verbose=1, n_jobs=6)
print('MLP模型中，准确率的平均数是 %.4f，标准差是 %.4f' % (mlp_mse.mean(), mlp_mse.std()))

# Selection rule: prefer the highest mean score (low bias) together with
# the smallest standard deviation (low variance).
# Results recorded from an earlier run that scored with scoring='r2':
#   random forest: mean -0.2770, std 0.0515
#   MLP:           mean -0.3461, std 0.0922
# On that basis the random forest was selected; re-run to obtain the
# accuracy figures.

随机森林参数调优

RandomForest 随机森林算法与模型参数的调优 - 码农充电站 - 博客园 (cnblogs.com)

gridsearchcv参数_随机森林算法参数解释及调优_weixin_39953578的博客-CSDN博客

# GridSearchCV is not among the imports of the preceding script, so it is
# imported here.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search over n_estimators (1, 11, ..., 91) using 10-fold
# cross-validation scored by ROC AUC.
# The seeded estimator is the one passed to the search; previously it was
# created but an unseeded RandomForestClassifier() was searched instead.
randomforest = RandomForestClassifier(random_state=42)
param_test1 = {"n_estimators": range(1, 101, 10)}
gsearch1 = GridSearchCV(estimator=randomforest, param_grid=param_test1,
                        scoring='roc_auc', cv=10)
gsearch1.fit(x_train, y_train)

print(gsearch1.best_score_)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated ROC AUC, not an accuracy.
print("best roc_auc:%f" % gsearch1.best_score_)

'''
0.7900814280926616
{'n_estimators': 81}
best accuracy:0.790081
'''

# GridSearchCV is not among the imports of the preceding script, so it is
# imported here.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search over max_features (1..10) with n_estimators fixed
# at 81, using 10-fold cross-validation scored by ROC AUC.
param_test2 = {"max_features": range(1, 11, 1)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=81,
                        random_state=10),
                        param_grid=param_test2, scoring='roc_auc', cv=10)

gsearch1.fit(x_train, y_train)

print(gsearch1.best_score_)
print(gsearch1.best_params_)
# best_score_ is the mean cross-validated ROC AUC, not an accuracy.
print("best roc_auc:%f" % gsearch1.best_score_)

'''
0.7900814280926616
{'n_estimators': 81}
best accuracy:0.790081
0.7973612760484702
{'max_features': 9}
best accuracy:0.797361
'''

比较优化后的结果

优化后的结果几乎没有变化。

# Refit the forest with the tuned values (n_estimators=81, max_features=9).
randomforest = RandomForestClassifier(random_state=42, n_estimators=81, max_features=9)
model = randomforest.fit(x_train, y_train)

y_train = numpy.array(y_train)
# Predict once and reuse the result for both the confusion matrix and the
# accuracy instead of running the forest twice.
train_pred = model.predict(x_train)
# argmax turns the indicator rows back into class ids 0 / 1.
xm = confusion_matrix(y_train.argmax(axis=1), train_pred.argmax(axis=1))
print('训练集混淆矩阵为：\n',xm)
# accuracy_score takes (y_true, y_pred).
print('训练集准确率:', metrics.accuracy_score(y_train, train_pred))

y_test = numpy.array(y_test)

y_pred = model.predict(x_test)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))

print('测试集混淆矩阵为：\n',cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))

'''
训练集混淆矩阵为：
 [[1108    0]
 [   0 1129]]
训练集准确率: 1.0
测试集混淆矩阵为：
 [[393  99]
 [147 321]]
测试集准确率: 0.74375
'''

样本不平衡问题

import numpy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import recall_score  #召回率



# Step 1: data preprocessing for the class-imbalance experiment.

# Read both sheets of the workbook. The context manager closes the underlying
# file handle as soon as the sheets are parsed (it was left open before).
path = 'xiaochuan.xlsx'
with pd.ExcelFile(path) as xl:
    # Sheet names are '阴性' (negative) and '阳性' (positive).
    # Unlike the earlier balanced experiment, the negative sheet is NOT
    # truncated with df1.loc[0:1600, :]; all rows are kept.
    df1 = xl.parse('阴性', index_col=0)  # author's note: 4923 rows x 13 columns
    df2 = xl.parse('阳性', index_col=0)  # author's note: 1597 rows x 13 columns

# Add the target column: 0 = negative, 1 = positive.
# Plain assignment appends the column at the end, which is where the former
# insert(loc=13, ...) placed it for the 13-column sheets, but it no longer
# depends on the exact column count.
df1['是否得病'] = 0
df2['是否得病'] = 1

# Stack negatives and positives into one frame with a fresh 0..n-1 index.
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
# BMI = weight (kg) / height (m)^2; height is stored in cm, hence the /100.
df3['BMI'] = df3['体重kg'] / ((df3['身高cm'] / 100) ** 2)
# Move the target back to the last column so iloc[:, -1] below selects it.
d = df3.pop('是否得病')
df3['是否得病'] = d

# Encode gender: '男' (male) -> 1, '女' (female) -> 0.
df3.replace({'男': 1, '女': 0}, inplace=True)

# Shuffle with a fixed seed so the row order is reproducible between runs.
df3 = shuffle(df3, random_state=42)
drop_features = ['性别', '外周气道参数C（%）', '外周气道参数D（%）']  # features judged unimportant by the earlier correlation analysis
df = df3.drop(drop_features, axis=1)

# Fill missing tidal-volume values with the column mean (author's note: the
# mean was about 1.388). The column key ends with a space, presumably matching
# the sheet header. Assign the result back instead of calling
# fillna(inplace=True) on df[col]: that chained in-place form is deprecated
# and does not update `df` under pandas Copy-on-Write.
# NOTE(review): the mean is computed over all rows, i.e. before any
# train/test split, so held-out rows influence the imputed value.
tidal_col = '潮气量（L） '
mean_val = df[tidal_col].mean()
df[tidal_col] = df[tidal_col].fillna(mean_val)

dataset = df
x = dataset.iloc[:, :-1]  # features: every column except the last
y = dataset.iloc[:, -1]  # target: the last column ('是否得病')
# Count rows per class to quantify the class imbalance.
groupby_data_o = dataset.groupby(['是否得病'])['是否得病'].count()
print(groupby_data_o)

'''
是否得病
0    4923
1    1597 存在样本不平衡问题
Name: 是否得病, dtype: int64
'''
# Balance the classes by oversampling the minority class with SMOTE.
from imblearn.over_sampling import SMOTE  # SMOTE oversampler

# A fixed random_state makes the synthetic samples, and therefore every
# metric reported afterwards, reproducible between runs.
# NOTE(review): SMOTE is fitted on the complete x / y here, before the
# train/test split that happens further down. The held-out set will therefore
# contain synthetic rows interpolated from rows that end up in the training
# set, so the reported test accuracy is optimistic. Resampling only the
# training split would give an honest estimate.
model_smote = SMOTE(random_state=42)
x_smote_resampled, y_smote_resampled = model_smote.fit_resample(x, y)  # oversample the minority class
y_smote_resampled = pd.DataFrame(y_smote_resampled, columns=['是否得病'])

# Re-attach the target to the features and check the new class counts.
smote_resampled = pd.concat([x_smote_resampled, y_smote_resampled], axis=1)
group_data_smote = smote_resampled.groupby(['是否得病'])['是否得病'].count()
print(group_data_smote)
'''
是否得病
0    4923
1    4923
Name: 是否得病, dtype: int64
'''


# One-hot encode the target of the oversampled frame, which replaces
# '是否得病' with the two indicator columns '是否得病_0' and '是否得病_1'.
label_columns = ['是否得病_0','是否得病_1']
dataset = pd.get_dummies(smote_resampled, columns=['是否得病'])

# Standardise the numeric feature columns in place.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down, so test rows contribute to the fitted statistics.
columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）',
                    'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', 'BMI']
standardScaler = StandardScaler()
scaled_values = standardScaler.fit_transform(dataset[columns_to_scale])
dataset[columns_to_scale] = scaled_values
print(dataset.head(5))

# Target: the two indicator columns. Features: everything else.
y = dataset[label_columns]
x = dataset.drop(columns=label_columns)
# print(X) # 输入的特征


# Random forest trained on the SMOTE-balanced data. (The former comment,
# "ignoring class imbalance", was copied from the baseline experiment and no
# longer applied here.)
# 70 % / 30 % train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
# Forest with the hyper-parameters chosen by the grid searches.
randomforest = RandomForestClassifier(random_state=42,n_estimators=81,max_features=9)
# Fit on the training split.
model = randomforest.fit(x_train, y_train)

# y holds the two indicator columns '是否得病_0' / '是否得病_1';
# argmax(axis=1) maps each row back to a class index for confusion_matrix.
y_train = numpy.array(y_train)
# Predict once and reuse the result; the previous version ran the whole
# forest over the training set twice.
y_train_pred = model.predict(x_train)
xm = confusion_matrix(y_train.argmax(axis=1), y_train_pred.argmax(axis=1))  # rows: truth, columns: prediction
print("使用过采样之后")
print('训练集混淆矩阵为：\n',xm)
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the conventional order keeps the call consistent with confusion_matrix.
print('训练集准确率:', metrics.accuracy_score(y_train, y_train_pred))

y_test = numpy.array(y_test)

y_pred = model.predict(x_test)
cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))  # rows: truth, columns: prediction

print('测试集混淆矩阵为：\n',cm)
print('测试集准确率:', metrics.accuracy_score(y_test, y_pred))


'''
使用过采样之后
训练集混淆矩阵为：
 [[3477    0]
 [   0 3415]]
训练集准确率: 1.0
测试集混淆矩阵为：
 [[1130  316]
 [ 226 1282]]
测试集准确率: 0.8165199729180772
'''


# # 使用RandomUnderSampler进行欠抽样处理
# from imblearn.under_sampling import RandomUnderSampler  # 欠抽样处理库RandomUnderSampler
#
# model_RandomUnderSampler = RandomUnderSampler()  # 实例化
# x_RandomUnderSampler_resampled, y_RandomUnderSampler_resampled = model_RandomUnderSampler.fit_resample(x,y)  # 输入数据进行欠抽样处理
# y_RandomUnderSampler_resampled = pd.DataFrame(y_RandomUnderSampler_resampled, columns=['是否得病'])
#
# RandomUnderSampler_resampled = pd.concat([x_RandomUnderSampler_resampled, y_RandomUnderSampler_resampled],
#                                          axis=1)  # 将特征和标签重新拼接
# group_data_RandomUnderSampler = RandomUnderSampler_resampled.groupby(['是否得病'])['是否得病'].count()  # 查看标签类别个数
#
# print(group_data_RandomUnderSampler)
#
# # RandomUnderSampler_resampled  采样后的数据
#
# dataset = pd.get_dummies(RandomUnderSampler_resampled, columns=['是否得病'])  # 将分类变量转成独热变量
# standardScaler = StandardScaler()  # 标准化
# columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）',
#                     'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', 'BMI']
# dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
# print(dataset.head(5))
#
# y = dataset[['是否得病_0','是否得病_1']]
# # print(y) # target目标
# x = dataset.drop(['是否得病_0','是否得病_1'],axis=1)
# # print(X) # 输入的特征
#
#
# # 随机森林 不管样本分类不平衡
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
# # 创建一个随机森林分类器的实例
# randomforest = RandomForestClassifier(random_state=42,n_estimators=81,max_features=9)
# # 利用训练集样本对分类器模型进行训练
# model = randomforest.fit(x_train, y_train)
#
# # print(model.predict(x_train))
# # print(y_train)
# # print(type(model.predict(x_train)))
# y_train = numpy.array(y_train)
# xm = confusion_matrix(y_train.argmax(axis=1),model.predict(x_train).argmax(axis=1))  # 混淆矩阵
# print("使用欠采样之后")
# print('训练集混淆矩阵为：\n',xm)
# print('训练集准确率:', metrics.accuracy_score(model.predict(x_train), y_train))
#
# y_test = numpy.array(y_test)
#
# y_pred = model.predict(x_test)
# cm = confusion_matrix(y_test.argmax(axis=1),y_pred.argmax(axis=1))  # 混淆矩阵
#
# print('测试集混淆矩阵为：\n',cm)
# print('测试集准确率:', metrics.accuracy_score(y_pred, y_test))
#
# '''
# 使用欠采样之后
# 训练集混淆矩阵为：
#  [[1089    0]
#  [   0 1146]]
# 训练集准确率: 1.0
# 测试集混淆矩阵为：
#  [[321 187]
#  [145 306]]
# 测试集准确率: 0.6538060479666319
# '''


















# dataset = pd.get_dummies(df, columns=['是否得病'])  # 将分类变量转成独热变量
# standardScaler = StandardScaler()  # 标准化
# columns_to_scale = ['年龄', '身高cm', '体重kg', '潮气量（L） ', '用力肺活量（%）', '中心气道参数（%）',
#                     'FEV1/FVC（%）', '最高呼气流速（%）', '外周气道参数A（%）', '外周气道参数B（%）', 'BMI']
# dataset[columns_to_scale] = standardScaler.fit_transform(dataset[columns_to_scale])
# print(dataset.head(5))
#
# y = dataset[['是否得病_0','是否得病_1']]
# # print(y) # target目标
# x = dataset.drop(['是否得病_0','是否得病_1'],axis=1)
# # print(X) # 输入的特征
#
#
# # 随机森林 不管样本分类不平衡
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
# # 创建一个随机森林分类器的实例
# randomforest = RandomForestClassifier(random_state=42,n_estimators=81,max_features=9)
# # 利用训练集样本对分类器模型进行训练
# model = randomforest.fit(x_train, y_train)
#
# # print(model.predict(x_train))
# # print(y_train)
# # print(type(model.predict(x_train)))
# y_train = numpy.array(y_train)
# xm = confusion_matrix(y_train.argmax(axis=1),model.predict(x_train).argmax(axis=1))  # 混淆矩阵
# # print("特征选择之后")
# print('训练集混淆矩阵为：\n',xm)
# print('训练集准确率:', metrics.accuracy_score(model.predict(x_train), y_train))
#
# y_test = numpy.array(y_test)
#
# y_pred = model.predict(x_test)
# cm = confusion_matrix(y_test.argmax(axis=1),y_pred.argmax(axis=1))  # 混淆矩阵
#
# print('测试集混淆矩阵为：\n',cm)
# print('测试集准确率:', metrics.accuracy_score(y_pred, y_test))
# '''
# 训练集混淆矩阵为：
#  [[3459    0]
#  [   0 1105]]
# 训练集准确率: 1.0
# 测试集混淆矩阵为：
#  [[1362  102]
#  [ 380  112]]
# 测试集准确率: 0.7535787321063395
# '''

# Draw the test-set confusion matrix `cm` as a blue heat map with one tick per
# class, a colour bar, and the cell value written into every cell.
class_labels = [0,1]
tick_positions = range(len(cm))

plt.imshow(cm, cmap=plt.cm.Blues)
plt.xticks(tick_positions, class_labels)
plt.yticks(tick_positions, class_labels)
plt.colorbar()
plt.xlabel('guess')  # x axis: predicted class (matrix column)
plt.ylabel('fact')  # y axis: true class (matrix row)

# plt.text takes (x, y), i.e. (column, row), so the value shown at
# (col, row) is cm[row][col].
for col in tick_positions:
    for row in range(len(cm[col])):
        plt.text(col, row, cm[row][col])

plt.title('优化随机森林测试集混淆矩阵')

plt.show()

你可能感兴趣的:(笔记,大数据)

C++ 11 Lambda表达式和min_element()与max_element()的使用_c++ lamda函数 min_element((1) 2401_84976182 程序员 c语言 c++学习
既有适合小白学习的零基础资料，也有适合3年以上经验的小伙伴深入学习提升的进阶课程，涵盖了95%以上CC++开发知识点，真正体系化！由于文件比较多，这里只是将部分目录截图出来，全套包含大厂面经、学习笔记、源码讲义、实战项目、大纲路线、讲解视频，并且后续会持续更新如果你需要这些资料，可以戳这里获取#include#include#includeusingnamespacestd;boolcmp(int
基于链家网的二手房数据采集清洗与可视化分析 Mint_Datazzh 项目 selenium 网络爬虫
个人学习内容笔记，仅供参考。项目链接：https://gitee.com/rongwu651/lianjia原文链接：基于链家网的二手房数据采集清洗与可视化分析–笔墨云烟研究内容该课题的主要目的是通过将二手房网站上的存量与已销售房源，构建一个二手房市场行情情况与房源特点的可视化平台。该平台通过HTML架构和Echarts完成可视化的搭建。因此，该课题的主要研究内容就是如何利用相关技术设计并实现这样
算法学习笔记：17.蒙特卡洛算法 ——从原理到实战，涵盖 LeetCode 与考研 408 例题
在计算机科学和数学领域，蒙特卡洛算法（MonteCarloAlgorithm）以其独特的随机抽样思想，成为解决复杂问题的有力工具。从圆周率的计算到金融风险评估，从物理模拟到人工智能，蒙特卡洛算法都发挥着不可替代的作用。本文将深入剖析蒙特卡洛算法的思想、解题思路，结合实际应用场景与Java代码实现，并融入考研408的相关考点，穿插图片辅助理解，帮助你全面掌握这一重要算法。蒙特卡洛算法的基本概念蒙特卡
分布式学习笔记_04_复制模型 NzuCRAS 分布式学习笔记架构后端
常见复制模型使用复制的目的在分布式系统中，数据通常需要被分布在多台机器上，主要为了达到：拓展性：数据量因读写负载巨大，一台机器无法承载，数据分散在多台机器上仍然可以有效地进行负载均衡，达到灵活的横向拓展高容错&高可用：在分布式系统中单机故障是常态，在单机故障的情况下希望整体系统仍然能够正常工作，这时候就需要数据在多台机器上做冗余，在遇到单机故障时能够让其他机器接管统一的用户体验：如果系统客户端分布
算法学习笔记：15.二分查找 ——从原理到实战，涵盖 LeetCode 与考研 408 例题呆呆企鹅仔算法学习算法学习笔记考研二分查找
在计算机科学的查找算法中，二分查找以其高效性占据着重要地位。它利用数据的有序性，通过不断缩小查找范围，将原本需要线性时间的查找过程优化为对数时间，成为处理大规模有序数据查找问题的首选算法。二分查找的基本概念二分查找（BinarySearch），又称折半查找，是一种在有序数据集合中查找特定元素的高效算法。其核心原理是：通过不断将查找范围减半，快速定位目标元素。与线性查找逐个遍历元素不同，二分查找依赖
入门html这篇文章就够了 ξ流ぁ星ぷ132 html 前端
HTML笔记文章目录HTML笔记html介绍什么是htmlhtml的作用HTML标签介绍常用标签标签and标签and标签u标签del删除线br标签用于换行pre标签，预处理标签span标签div标签sub标签andsup标签hr标签h1,h2...h6标签：HTML5中的语义标签：特殊字符img标签a标签第一种用法：超链接第二种用法：锚点video标签表格标签：form标签input标签selec
数字孪生技术为UI前端注入新活力：实现产品设计的沉浸式体验 ui设计前端开发老司机 ui
hello宝子们...我们是艾斯视觉擅长ui设计、前端开发、数字孪生、大数据、三维建模、三维动画10年+经验!希望我的分享能帮助到您!如需帮助可以评论关注私信我们一起探讨!致敬感谢感恩!一、引言：从“平面交互”到“沉浸体验”的UI革命当用户在电商APP中翻看3D家具模型却无法感知其与自家客厅的匹配度，当设计师在2D屏幕上绘制汽车内饰却难以预判实际乘坐体验——传统UI设计的“平面化、静态化、割裂感”
OKHttp3源码分析——学习笔记 Sincerity_ 源码相关 Okhttp 源码解析读书笔记 httpclient cache
文章目录1.HttpClient与HttpUrlConnection的区别2.OKHttp源码分析使用步骤:dispatcher任务调度器,（后面有详细说明）Request请求RealCallAsyncCall3.OKHttp架构分析1.异步请求线程池,Dispather2.连接池清理线程池-ConnectionPool3.缓存整理线程池DisLruCache4.Http2异步事务线程池,http
vue3面试题(个人笔记) 武昌库里写JAVA 面试题汇总与解析课程设计 spring boot vue.js java 学习
vue3比vue2有什么优势？性能更好，打包体积更小，更好的ts支持，更好的代码组织，更好的逻辑抽离，更多的新功能。描述Vue3生命周期CompositionAPI的生命周期：onMounted()onUpdated()onUnmounted()onBeforeMount()onBeforeUpdate()onBeforeUnmount()onErrorCaptured()onRenderTrac
提升企业级数据处理效率！TDengine 四个集群优化点详解 TDengine （老段） TDengine 运维大数据数据库物联网时序数据库服务器运维 tdengine
为了帮助企业更好地进行大数据处理，我们在此前TDengine3.x系列版本中进行了几项与集群相关的优化和新功能开发，以提升集群的稳定性和在异常情况下的恢复能力。这些优化包括clusterID隔离、leaderrebalance、raftlearner和restorednode。本文将对这几项重要优化进行详细阐述，以解答企业在此领域的疑问，并帮助大家更好地应对相关挑战。clusterID隔离问题fi
Python学习笔记5|条件语句和循环语句 iamecho9 Python从0到1学习笔记 python 学习笔记
一、条件语句条件语句用于根据不同的条件执行不同的代码块。1、if语句基本语法：if布尔型语句1:代码块#语句1为True时执行的代码示例：age=int(input("请输入你的年龄:"))ifage>=18:print("你已成年")2、if-else语句如果if条件不成立，则执行else代码块：if布尔型语句1:代码块#语句1为True时执行的代码else:代码块#语句1为False时执行的代
swagger【个人笔记】撰卢笔记 java
文章目录swagger导入mave坐标在配置类(WebMvcConfiguration)中加入knife4j相关配置设置静态资源映射，主要是让拦截器放行swagger常用注解@Api(tags="\[描述这个类的作用]")@ApiModel(description="\[描述这个类的作用]")@ApiModelProPerty("描述这个类的作用")@ApiOperation("\[描述方法的作用
【个人笔记】负载均衡撰卢笔记负载均衡运维
文章目录nginx反向代理的好处负载均衡负载均很的配置方式均衡负载的方式nginx反向代理的好处提高访问速度进行负载均衡保证后端服务安全负载均衡负载均衡，就是把大量的请求按照我们指定的方式均衡的分配给集群中的每台服务器负载均很的配置方式upstreamwebservers{server192.168.100.128:8080server192.168.100.129:8080}server{lis
在 Obsidian 中本地使用 DeepSeek — 无需互联网！知识大胖 NVIDIA GPU和大语言模型开发教程人工智能 deepseek
简介您是否想在Obsidian内免费使用类似于ChatGPT的本地LLM？如果是，那么本指南适合您！我将引导您完成在Obsidian中安装和使用DeepSeek-R1模型的确切步骤，这样您就可以在笔记中拥有一个由AI驱动的第二大脑。推荐文章《24GBGPU中的DeepSeekR1：UnslothAI针对671B参数模型进行动态量化》权重1，DeepSeek类《在RaspberryPi上运行语音识别
中国银联豪掷1亿采购海光C86架构服务器信创新态势海光芯片 C86 国产芯片海光信息
近日，中国银联国产服务器采购大单正式敲定，基于海光C86架构的服务器产品中标，项目金额超过1亿元。接下来，C86服务器将用于支撑中国银联的虚拟化、大数据、人工智能、研发测试等技术场景，进一步提升其业务处理能力、用户服务效率和信息安全水平。作为我国重要的银行卡组织和金融基础设施，中国银联在全球183个国家和地区设有银联受理网络，境内外成员机构超过2600家，是世界三大银行卡品牌之一。此次中国银联发力
5G标准学习笔记14 - CSI--RS概述刘孬孬沉迷学习 5G 学习笔记信息与通信
5G标准学习笔记14-CSI–RS概述大家好~，这里是刘孬孬，今天带着大家一起学习一下5GNR中一个非常非常重要的参考信号------------------CSI-RS信号，CSI-RS不是持续发送，UE只能在网络明确配置了CSI-RS的情况下才能使用其进行信道测量。前言对于CSI-RS，肯定还离不开前面所说的CSI（channelstateinformation），前面也讲过CSI对于MIMO
5G标准学习笔记06-基于AI/ML波束管理刘孬孬沉迷学习 5G 学习笔记
5G标准学习笔记06-基于AI/ML波束管理前言前面对于孬孬学习了波束管理的概述，下面要进一步来看一下传统波束管理和现在3GPP中推动的AL/ML波束管理之前的区别联系。一、传统波束管理方法流程传统BM流程主要包括以下步骤：波束扫描（BeamSweeping）：gNB通过顺序发送多个窄波束（SSB或CSI-RS），覆盖整个服务区域，UE测量每个波束的信号质量（如L1-RSRP或L1-SINR）。波
5G标准学习笔记03- CSI 反馈增强概述刘孬孬沉迷学习 5G 笔记学习
5G标准学习笔记03-CSI反馈增强概述大家好，最近在研究AI/ML3gpp标准NR空口的有关内容，后面可能会给大家介绍一下对应的有关内容AI/ML在3GPP标准中的研究进展在AI/ML在NR空口的应用中，对应标准主要聚焦了3个case进行讨论研究分别是：CSI反馈增强；波束管理；定位精度增强；这三个内容可能比较涉及RAN1/2的具体内容，后面会基于这个进行一定的介绍。今天主要是主要介绍CSI反馈
运维笔记＜4＞ xxl-job打通 GeminiJM 运维 java xxl-job
新的一天，来点新的运维业务，今天是xxl-job的打通其实在非集群中，xxl-job的使用相对是比较简单的，相信很多人都有使用的经验这次我们的业务场景是在k8s集群中，用xxl-job来做定时调度加上第一次倒腾，也是遇到了不少问题，在这里做一些记录1.xxl-job的集群安装首先是xxl-job的集群安装先贴上xxl-jobsql初始化文件的地址：xxl-job/doc/db/tables_xxl
全面探索Kafka：架构、应用与流处理
Kafka：企业级消息系统与流处理平台的深度解析ApacheKafka作为分布式流处理平台，广泛应用于大数据处理和实时分析领域。本文将基于其官方文档，详细探讨Kafka的核心功能、应用场景以及如何进行有效管理。背景简介Kafka作为高吞吐量的消息系统，支持企业级的发布-订阅模式。它能够处理大量实时数据，并支持高并发读写操作。本文将依据Kafka官方文档的内容，逐层深入，从入门到高级应用，帮助读者全
Flink时间窗口详解 bxlj_jcj Flink flink 大数据
一、引言在大数据流处理的领域中，Flink的时间窗口是一项极为关键的技术，想象一下，你要统计一个电商网站每小时的订单数量。由于订单数据是持续不断产生的，这就形成了一个无界数据流。如果没有时间窗口的概念，你就需要处理无穷无尽的数据，难以进行有效的统计分析。而时间窗口的作用，就是将这无界的数据流按照时间维度切割成一个个有限的“数据块”，方便我们对这些数据进行处理和分析。比如，我们可以定义一个1小时的时
探索实时流处理的未来：Kafka Streams 深度指南秋或依
探索实时流处理的未来：KafkaStreams深度指南项目介绍欢迎进入KafkaStreams：实时流处理的世界！这不仅仅是一本书，更是一个通往流处理领域深层奥秘的门户。由PrashantPandey编著，这本书以ApacheKafka2.1中的KafkaStreams库为核心，为读者铺就了一条从理解基础概念到熟练掌握KafkaStreams编程的路径。无论是软件工程师、数据架构师，还是对大数据处
Elasticsearch搜索引擎存储：从原理到实践的全景解析 Python×CATIA工业智造搜索引擎 elasticsearch 大数据
引言在大数据时代，数据规模呈指数级增长，传统数据库的模糊查询、实时分析能力逐渐成为瓶颈。Elasticsearch（简称ES）凭借其分布式架构、实时搜索和灵活的数据分析能力，成为企业级搜索与存储的核心引擎。截至2025年，ES在全球日志分析、电商搜索、实时监控等场景的市场占有率超过60%。本文将从存储架构、核心技术、应用场景及优化策略四个维度，深入解析Elasticsearch的设计哲学与实践价值
两台pc如何高速度传输大文件费城之鹰其他两台电脑高速传输文件局域网不适用U盘传输资料网线直连两台电脑传资料
今天笔记本跑一个大一点的项目，8G的内存直接100%，i5的CPU直接75%并且在超频工作了，原本1.6Ghz的频率直接飙到了3.8Ghz，由于项目性质原因，采用的是公司配的笔记本，但是年初采购的联想E480，还在三包时间段内，公司不允许拆机增加内存，只能换一台新的台式机，听起来挺爽，有新设备，但是办公区域不准使用U盘这一类的存储设备，这就蛋疼了，大半年了项目代码，资料全在这个不够用的笔记本里，问
学习笔记(33):matplotlib绘制简单图表-绘制混淆矩阵热图宁儿数据安全 #机器学习学习笔记 matplotlib
学习笔记(33):matplotlib绘制简单图表-绘制混淆矩阵热图一、绘制混淆矩阵热图代码解析1.1、导入必要的库importmatplotlib.pyplotaspltfromsklearn.metricsimportconfusion_matriximportseabornassnsmatplotlib.pyplot：Python中最常用的绘图库，用于创建各种图表confusion_matr
玩转Docker | 使用Docker部署NotepadMX笔记应用程序心随_风动玩转Docker docker 笔记 eureka
玩转Docker|使用Docker部署NotepadMX笔记应用程序前言一、NotepadMX介绍工具简介主要特点二、系统要求环境要求环境检查Docker版本检查检查操作系统版本三、部署NotepadMX服务下载NotepadMX镜像编辑部署文件创建容器检查容器状态检查服务端口安全设置四、访问NotepadMX服务访问NotepadMX首页设置访问验证编辑笔记总结前言在如今快节奏的工作与学习中，一
【前端】异步任务风控验证与轮询机制技术方案（通用笔记版）
一、背景场景在某类生成任务中，例如用户点击“执行任务”按钮后触发一个较耗时的后端操作（如生成报告、渲染图像、转码视频等），由于其调用了模型、渲染服务或需要较长处理时间，为了防止接口被频繁恶意调用，系统需要加入风控验证机制。此外，因任务处理为异步，前端无法立即获得最终结果，因此需通过轮询方式定期查询任务状态，等待任务完成后展示结果。二、整体流程说明1.用户点击“执行任务”按钮：前端调用风控接口/ap
【Kafka专栏 13】Kafka的消息确认机制：不是所有的“收到”都叫“确认”！
作者名称：夏之以寒作者简介：专注于Java和大数据领域，致力于探索技术的边界，分享前沿的实践和洞见文章专栏：夏之以寒-kafka专栏专栏介绍：本专栏旨在以浅显易懂的方式介绍Kafka的基本概念、核心组件和使用场景，一步步构建起消息队列和流处理的知识体系，无论是对分布式系统感兴趣，还是准备在大数据领域迈出第一步，本专栏都提供所需的一切资源、指导，以及相关面试题，立刻免费订阅，开启Kafka学习之旅！
数据分析案例-电脑笔记本价格数据可视化分析3 艾派森数据分析信息可视化 python 数据分析数据挖掘电脑
‍♂️个人主页：@艾派森的个人主页✍作者简介：Python学习者希望大家多多支持，我们一起进步！如果文章对你有帮助的话，欢迎评论点赞收藏加关注+目录1.项目背景2.数据集介绍3.技术工具
LLaMA 学习笔记 AI算法网奇深度学习基础人工智能深度学习
目录LLaMA模型结构：模型微调手册：推理示例：指定位置加载模型测试ok：模型下载：llama-stack下载modelscope下载LLaMA优化技术RMSNormSwiGLU激活函数旋转位置编码（RoPE）LLaMA模型结构：llama3结构详解-CSDN博客模型微调手册：大模型微调LLaMA详细指南（准备环境、数据、配置微调参数+微调过程）_llama微调-CSDN博客显存占用：FP16/B
统一思想认识永夜-极光思想
1.统一思想认识的基础,才能有的放矢原因: 总有一种描述事物的方式最贴近本质,最容易让人理解. 如何让教育更轻松,在于找到最适合学生的方式. 难点在于,如何模拟对方的思维基础选择合适的方式. &
Joda Time使用笔记 bylijinnan java joda time
Joda Time的介绍可以参考这篇文章： http://www.ibm.com/developerworks/cn/java/j-jodatime.html 工作中也常常用到Joda Time，为了避免每次使用都查API，记录一下常用的用法： /** * DateTime变化（增减） */ @Tes
FileUtils API eksliang FileUtils FileUtils API
转载请出自出处：http://eksliang.iteye.com/blog/2217374 一、概述这是一个Java操作文件的常用库，是Apache对java的IO包的封装，这里面有两个非常核心的类FilenameUtils跟FileUtils，其中FilenameUtils是对文件名操作的封装;FileUtils是文件封装，开发中对文件的操作，几乎都可以在这个框架里面找到。非常的好用。
各种新兴技术不懂事的小屁孩技术
1:gradle Gradle 是以 Groovy 语言为基础，面向Java应用为主。基于DSL（领域特定语言）语法的自动化构建工具。现在构建系统常用到maven工具，现在有更容易上手的gradle，搭建java环境: http://www.ibm.com/developerworks/cn/opensource/os-cn-gradle/ 搭建android环境： http://m
tomcat6的https双向认证酷的飞上天空 tomcat6
1.生成服务器端证书 keytool -genkey -keyalg RSA -dname "cn=localhost,ou=sango,o=none,l=china,st=beijing,c=cn" -alias server -keypass password -keystore server.jks -storepass password -validity 36
托管虚拟桌面市场势不可挡蓝儿唯美
用户还需要冗余的数据中心，dinCloud的高级副总裁兼首席营销官Ali Din指出。该公司转售一个MSP可以让用户登录并管理和提供服务的用于DaaS的云自动化控制台，提供服务或者MSP也可以自己来控制。在某些情况下，MSP会在dinCloud的云服务上进行服务分层，如监控和补丁管理。 MSP的利润空间将根据其参与的程度而有所不同，Din说。 “我们有一些合作伙伴负责将我们推荐给客户作为个
spring学习——xml文件的配置 a-john spring
在Spring的学习中，对于其xml文件的配置是必不可少的。在Spring的多种装配Bean的方式中，采用XML配置也是最常见的。以下是一个简单的XML配置文件： <?xml version="1.0" encoding="UTF-8"?> <beans xmlns="http://www.springframework.or
HDU 4342 History repeat itself 模拟 aijuans 模拟
来源：http://acm.hdu.edu.cn/showproblem.php?pid=4342 题意：首先让求第几个非平方数，然后求从1到该数之间的每个sqrt(i)的下取整的和。思路：一个简单的模拟题目，但是由于数据范围大，需要用__int64。我们可以首先把平方数筛选出来，假如让求第n个非平方数的话，看n前面有多少个平方数，假设有x个，则第n个非平方数就是n+x。注意两种特殊情况，即
java中最常用jar包的用途 asia007 java
java中最常用jar包的用途 jar包用途axis.jarSOAP引擎包commons-discovery-0.2.jar用来发现、查找和实现可插入式接口，提供一些一般类实例化、单件的生命周期管理的常用方法.jaxrpc.jarAxis运行所需要的组件包saaj.jar创建到端点的点到点连接的方法、创建并处理SOAP消息和附件的方法，以及接收和处理SOAP错误的方法. w
ajax获取Struts框架中的json编码异常和Struts中的主控制器异常的解决办法百合不是茶 js json编码返回异常
一:ajax获取自定义Struts框架中的json编码出现以下问题: 1,强制flush输出 json编码打印在首页 2, 不强制flush js会解析json 打印出来的是错误的jsp页面却没有跳转到错误页面 3, ajax中的dataType的json 改为text 会
JUnit使用的设计模式 bijian1013 java 设计模式 JUnit
JUnit源代码涉及使用了大量设计模式 1、模板方法模式（Template Method）定义一个操作中的算法骨架，而将一些步骤延伸到子类中去，使得子类可以不改变一个算法的结构，即可重新定义该算法的某些特定步骤。这里需要复用的是算法的结构，也就是步骤，而步骤的实现可以在子类中完成。
Linux常用命令（摘录） sunjing crond chkconfig
chkconfig --list 查看linux所有服务 chkconfig --add servicename 添加linux服务 netstat -apn | grep 8080 查看端口占用 env 查看所有环境变量 echo $JAVA_HOME 查看JAVA_HOME环境变量安装编译器 yum install -y gcc
【Hadoop一】Hadoop伪集群环境搭建 bit1129 hadoop
结合网上多份文档，不断反复的修正hadoop启动和运行过程中出现的问题，终于把Hadoop2.5.2伪分布式安装起来，跑通了wordcount例子。Hadoop的安装复杂性的体现之一是，Hadoop的安装文档非常多，但是能一个文档走下来的少之又少，尤其是Hadoop不同版本的配置差异非常的大。Hadoop2.5.2于前两天发布，但是它的配置跟2.5.0，2.5.1没有分别。 &nb
Anychart图表系列五之事件监听白糖_ chart
创建图表事件监听非常简单：首先是通过addEventListener('监听类型',js监听方法)添加事件监听，然后在js监听方法中定义具体监听逻辑。以钻取操作为例，当用户点击图表某一个point的时候弹出point的name和value，代码如下： <script> //创建AnyChart var chart = new AnyChart(); //添加钻取操作&quo
Web前端相关段子 braveCS web前端
Web标准：结构、样式和行为分离使用语义化标签 0）标签的语义：使用有良好语义的标签，能够很好地实现自我解释，方便搜索引擎理解网页结构，抓取重要内容。去样式后也会根据浏览器的默认样式很好的组织网页内容，具有很好的可读性，从而实现对特殊终端的兼容。 1）div和span是没有语义的：只是分别用作块级元素和行内元素的区域分隔符。当页面内标签无法满足设计需求时，才会适当添加div
编程之美-24点游戏 bylijinnan 编程之美
import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; public class PointGame { /**编程之美
主页面子页面传值总结 chengxuyuancsdn 总结
1、showModalDialog returnValue是javascript中html的window对象的属性,目的是返回窗口值,当用window.showModalDialog函数打开一个IE的模式窗口时,用于返回窗口的值主界面 var sonValue=window.showModalDialog("son.jsp"); 子界面 window.retu
[网络与经济]互联网+的含义 comsci 互联网+
互联网+后面是一个人的名字 = 网络控制系统互联网+你的名字 = 网络个人数据库每日提示:如果人觉得不舒服,千万不要外出到处走动,就呆在床上,玩玩手游,更不能够去开车,现在交通状况不
oracle 创建视图 with check option daizj 视图 view oralce
我们来看下面的例子： create or replace view testview as select empno,ename from emp where ename like ‘M%’ with check option; 这里我们创建了一个视图，并使用了with check option来限制了视图。然后我们来看一下视图包含的结果： select * from testv
ToastPlugin插件在cordova3.3下使用 dibov Cordova
自己开发的Todos应用，想实现“ 再按一次返回键退出程序 ”的功能，采用网上的ToastPlugins插件，发现代码或文章基本都是老版本，运行问题比较多。折腾了好久才弄好。下面吧基于cordova3.3下的ToastPlugins相关代码共享。 ToastPlugin.java package&nbs
C语言22个系统函数 dcj3sjt126com c function
C语言系统函数一、数学函数下列函数存放在math.h头文件中Double floor(double num) 求出不大于num的最大数。Double fmod(x, y) 求整数x/y的余数。Double frexp(num, exp); double num; int *exp; 将num分为数字部分（尾数）x和以2位的指数部分n，即num=x*2n，指数n存放在exp指向的变量中，返回x。D
开发一个类的流程 dcj3sjt126com 开发
本人近日根据自己的开发经验总结了一个类的开发流程。这个流程适用于单独开发的构件，并不适用于对一个项目中的系统对象开发。开发出的类可以存入私人类库，供以后复用。以下是开发流程： 1. 明确类的功能，抽象出类的大概结构 2. 初步设想类的接口 3. 类名设计（驼峰式命名） 4. 属性设置(权限设置) 判断某些变量是否有必要作为成员属
java 并发 shuizhaosi888 java 并发
能够写出高伸缩性的并发是一门艺术在JAVA SE5中新增了3个包 java.util.concurrent java.util.concurrent.atomic java.util.concurrent.locks 在java的内存模型中，类的实例字段、静态字段和构成数组的对象元素都会被多个线程所共享，局部变量与方法参数都是线程私有的，不会被共享。
Spring Security（11）——匿名认证 234390216 Spring Security ROLE_ANNOYMOUS 匿名
匿名认证目录 1.1 配置 1.2 AuthenticationTrustResolver 对于匿名访问的用户，Spring Security支持为其建立一个匿名的AnonymousAuthenticat
NODEJS项目实践0.2[ express,ajax通信...] 逐行分析JS源代码 Ajax nodejs express
一、前言通过上节学习，我们已经 ubuntu系统搭建了一个可以访问的nodejs系统，并做了nginx转发。本节原要做web端服务及 mongodb的存取，但写着写着，web端就
在Struts2 的Action中怎样获取表单提交上来的多个checkbox的值 lhbthanks java html struts checkbox
第一种方法：获取结果String类型在 Action 中获得的是一个 String 型数据，每一个被选中的 checkbox 的 value 被拼接在一起，每个值之间以逗号隔开(,)。所以在 Action 中定义一个跟 checkbox 的 name 同名的属性来接收这些被选中的 checkbox 的 value 即可。以下是实现的代码：前台 HTML 代码：
003.Kafka基本概念 nweiren hadoop kafka
Kafka基本概念：Topic、Partition、Message、Producer、Broker、Consumer。 Topic：消息源（Message）的分类。 Partition： Topic物理上的分组，一
Linux环境下安装JDK roadrunners jdk linux
1、准备工作创建JDK的安装目录： mkdir -p /usr/java/ 下载JDK，找到适合自己系统的JDK版本进行下载： http://www.oracle.com/technetwork/java/javase/downloads/index.html 把JDK安装包下载到/usr/java/目录，然后进行解压： tar -zxvf jre-7
Linux忘记root密码的解决思路 tomcat_oracle linux
1：使用同版本的linux启动系统，chroot到忘记密码的根分区passwd改密码　　2：grub启动菜单中加入init=/bin/bash进入系统，不过这时挂载的是只读分区。根据系统的分区情况进一步判断. 　　3: grub启动菜单中加入 single以单用户进入系统. 　　4:用以上方法mount到根分区把/etc/passwd中的root密码去除　　例如: 　　ro
跨浏览器 HTML5 postMessage 方法以及 message 事件模拟实现 xueyou jsonp jquery 框架 UI html5
postMessage 是 HTML5 新方法，它可以实现跨域窗口之间通讯。到目前为止，只有 IE8+, Firefox 3, Opera 9, Chrome 3和 Safari 4 支持，而本篇文章主要讲述 postMessage 方法与 message 事件跨浏览器实现。postMessage 方法 JSONP 技术不一样，前者是前端擅长跨域文档数据即时通讯，后者擅长针对跨域服务端数据通讯，p