本章建模的数据是从kaggle网站上下载的印第安人糖尿病数据库。
数据链接:https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database
数据集介绍:该数据集最初来自美国国家糖尿病、消化和肾脏疾病研究所。目的是根据数据集中包含的某些诊断测量值,诊断性地预测患者是否患有糖尿病。这里的所有患者都是至少 21 岁的印第安血统的女性。数据集由几个医学预测变量和一个目标变量组成。预测变量包括患者的怀孕次数、BMI、胰岛素水平、年龄等。
图1 糖尿病数据库(数据预览)
数据集中各变量的含义介绍如下:
Pregnancies:怀孕次数
Glucose:口服葡萄糖耐量试验中 2 小时的血浆葡萄糖浓度
BloodPressure:舒张压 (mm Hg)
SkinThickness:三头肌皮褶厚度(mm)
Insulin:2 小时血清胰岛素 (mu U/ml)
BMI:体重指数(体重公斤/(身高米)^2)
DiabetesPedigreeFunction:糖尿病谱系函数(根据家族病史计算的糖尿病遗传风险数值指标)
Age:年龄(岁)
Outcome:目标变量(0 或 1)。数据集中 268 条记录为 1,500 条记录为 0;0 表示不患糖尿病,1 表示患糖尿病。
(1)读取csv数据
(2)将字符串类型的数据变成浮点型
(3)找到数据集中每列数据的最小值和最大值
(4)将数据集进行最小-最大值归一化
(5)划分训练集和测试集
(6)梯度下降法求解逻辑回归的系数
(7)建立逻辑回归模型
(8)计算模型的准确率
from random import seed
from random import randrange
from csv import reader
from math import exp
# Load the CSV file into memory (each row becomes a list of strings).
def load_data_from_csv_file(file_name):
    """Read *file_name* as CSV and return its non-empty rows as a list of lists.

    Blank lines (which the csv reader yields as empty lists) are skipped.
    Every field is returned as a raw string; numeric conversion happens later.
    """
    with open(file_name, "r") as file:
        return [row for row in reader(file) if row]
# Convert one column of string values to floats, in place.
def string_to_float(dataset, column):
    """Replace dataset[row][column] with float(value) for every row.

    Leading/trailing whitespace is stripped before conversion so fields
    such as " 1.5 " parse cleanly.
    """
    for record in dataset:
        record[column] = float(record[column].strip())
# Compute the per-column minimum and maximum of the dataset.
def find_min_and_max(dataset):
    """Return [[min, max], ...], one pair per column of *dataset*.

    Requires a non-empty dataset (same precondition as the original,
    which indexed dataset[0]).
    """
    # zip(*dataset) transposes the row-major data into column tuples.
    return [[min(column), max(column)] for column in zip(*dataset)]
# Min-max normalize the whole dataset in place.
def rescale_dataset(dataset, min_max_list):
    """Scale every value of *dataset* into [0, 1] in place.

    min_max_list holds one [min, max] pair per column, as produced by
    find_min_and_max().

    Fix: a constant column (min == max) used to raise ZeroDivisionError;
    such a column carries no information, so its values are set to 0.0.
    """
    for row in dataset:
        for i in range(len(row)):
            col_min, col_max = min_max_list[i]
            span = col_max - col_min
            # Guard the degenerate constant column instead of dividing by zero.
            row[i] = (row[i] - col_min) / span if span else 0.0
# Randomly partition the dataset into k folds for cross-validation.
def k_fold_cross_validation(dataset, folds):
    """Split *dataset* into *folds* random folds of equal size.

    Rows are drawn without replacement via randrange; any leftover rows
    (when len(dataset) is not divisible by *folds*) are simply discarded.
    The caller's list is never mutated — a shallow copy is consumed instead.
    """
    pool = list(dataset)  # work on a copy so the original survives
    fold_size = len(dataset) // folds
    split = []
    for _ in range(folds):
        current_fold = []
        while len(current_fold) < fold_size:
            # Remove a random remaining row so each fold is a random sample.
            current_fold.append(pool.pop(randrange(len(pool))))
        split.append(current_fold)
    return split
# Compute classification accuracy as a percentage.
def calculate_the_accuracy_of_our_model(actual_data, predicted_data):
    """Return the share of positions where prediction equals truth, in percent.

    Indexes predicted_data by position, so a predicted list shorter than
    actual_data raises IndexError (same contract as before).
    """
    total = len(actual_data)
    matches = sum(1 for i in range(total) if actual_data[i] == predicted_data[i])
    return matches / float(total) * 100.0
# Evaluate an algorithm with k-fold cross-validation.
def how_good_is_our_algo(dataset, algo, folds, *args):
    """Return the per-fold accuracy scores of *algo* under k-fold CV.

    For each fold: the remaining folds (flattened) form the training set;
    the fold itself — with its label column blanked to None so the model
    cannot peek at the answer — forms the test set. *args is forwarded
    to *algo* unchanged (e.g. learning rate and epoch count).
    """
    all_folds = k_fold_cross_validation(dataset, folds)
    scores = []
    for fold in all_folds:
        held_in = list(all_folds)
        held_in.remove(fold)  # hold the current fold out of training
        # Flatten the list of folds into a single list of rows.
        training_rows = [row for part in held_in for row in part]
        masked_rows = []
        for row in fold:
            masked = list(row)
            masked[-1] = None  # hide the true label from the model
            masked_rows.append(masked)
        predicted = algo(training_rows, masked_rows, *args)
        actual = [row[-1] for row in fold]
        scores.append(calculate_the_accuracy_of_our_model(actual, predicted))
    return scores
# Predict a probability with the fitted logistic-regression coefficients.
def prediction(row, coefficients):
    """Return sigmoid(intercept + sum(coef_i * feature_i)).

    coefficients[0] is the intercept; coefficients[1:] pair with
    row[:-1] — the last element of *row* is the label and is ignored.
    """
    z = coefficients[0]
    # enumerate(start=1) lines each feature up with its coefficient.
    for idx, feature in enumerate(row[:-1], start=1):
        z += coefficients[idx] * feature
    return 1 / (1.0 + exp(-z))
# Fit logistic-regression coefficients with stochastic gradient descent.
def using_sgd_to_calculate_logistic_coefficients(training_data, learning_rate, epochs):
    """Run SGD for *epochs* passes over *training_data* and return the coefficients.

    coef[0] is the intercept; coef[i+1] pairs with feature column i.
    Each row's last element is the 0/1 label. The update uses the gradient
    of squared error through the sigmoid: lr * error * y_hat * (1 - y_hat).
    """
    coef = [0.0] * len(training_data[0])
    for _ in range(epochs):
        for row in training_data:
            y_hat = prediction(row, coef)
            # Common factor of every coefficient update for this row.
            delta = learning_rate * (row[-1] - y_hat) * y_hat * (1.0 - y_hat)
            coef[0] += delta
            for i, feature in enumerate(row[:-1]):
                coef[i + 1] += delta * feature
    return coef
def logistic_regression(training_data, testtng_data, learning_rate, epochs):
    """Train on *training_data*, then classify every row of *testtng_data*.

    Returns a list of hard 0/1 labels: round() snaps the sigmoid output
    to the nearest class.
    """
    coef = using_sgd_to_calculate_logistic_coefficients(training_data, learning_rate, epochs)
    return [round(prediction(row, coef)) for row in testtng_data]
if __name__ == '__main__':
    # Fixed seed so the random k-fold split is reproducible run to run.
    seed(1)
    # Fix: the original plain literal ".\dataset\..." contains the invalid
    # escape sequence "\d" (DeprecationWarning today, a SyntaxError in future
    # Python). A raw string keeps the exact same runtime path value.
    # [1:] drops the CSV header row.
    dataset = load_data_from_csv_file(r".\dataset\diabetes\diabetes.csv")[1:]
    # All CSV fields arrive as strings; convert every column to float.
    for i in range(len(dataset[0])):
        string_to_float(dataset, i)
    # Min-max normalize so a single learning rate suits every column.
    min_max_value = find_min_and_max(dataset)
    rescale_dataset(dataset, min_max_value)
    folds = 10
    learning_rate = 0.1
    epochs = 1000
    scores = how_good_is_our_algo(dataset, logistic_regression, folds, learning_rate, epochs)
    print("The scores of our algo are %s" % scores)
    print("The average accuracy of our model is %.3f" % (sum(scores) / float(len(scores))))
链接:https://pan.baidu.com/s/1EpRUJvyDRrsi_OQ9br_A_g
提取码:7cd6
--来自百度网盘超级会员V7的分享