使用同态加密和Logistic Regression对冠心病(coronary heart disease ,CHD)进行预测
数据集使用的是Kaggle的数据集,可以点击here进行下载。
另外,实验的目的是探究HE密文训练与明文训练的差异,为了减少训练的计算量,会对数据及计算过程进行一些取舍。如果不想用CHD的数据,也可以用随机数来测试。
CHD数据集的格式如下:
male | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 39 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 195 | 106 | 70 | 26.97 | 80 | 77 | 0 |
这里采用了CHD的数据进行测试,但是也提供了随机数的实现,只需要改变注释的代码即可。
import torch
import tenseal as ts
import pandas as pd
import random
from time import time
# those are optional and are not necessary for training
import numpy as np
import matplotlib.pyplot as plt
# Fix the random seeds so that model weight initialisation and the
# train/test shuffle are reproducible.
random.seed(73)
torch.manual_seed(73)
def split_train_test(x, y, test_ratio=0.3):
    """Randomly split paired samples into a train and a test subset.

    Args:
        x: Samples; must support ``len`` and indexing with a list of integer
            positions (e.g. a ``torch.Tensor``).
        y: Labels aligned with ``x``; must have the same length.
        test_ratio: Fraction of the samples placed in the test subset,
            within ``[0, 1]``.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length, or ``test_ratio`` is
            outside ``[0, 1]``.
    """
    if len(x) != len(y):
        raise ValueError(f"x and y must have the same length, got {len(x)} and {len(y)}")
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio}")
    idxs = list(range(len(x)))
    # Shuffles with the module-level ``random`` state, so the split is
    # reproducible only when that state has been seeded beforehand.
    random.shuffle(idxs)
    # delimiter between test and train data
    delim = int(len(x) * test_ratio)
    test_idxs, train_idxs = idxs[:delim], idxs[delim:]
    return x[train_idxs], y[train_idxs], x[test_idxs], y[test_idxs]
def heart_disease_data(path="./data/framingham.csv"):
    """Load the CHD dataset and return a balanced, normalised train/test split.

    Steps: read the CSV, drop rows containing missing values, drop six
    feature columns, down-sample every ``TenYearCHD`` class to the size of
    the smallest class, separate the label, standardise the remaining
    features, and split with ``split_train_test``.

    Args:
        path: Location of the CSV file.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)`` of float tensors;
        the label tensors have shape ``(n, 1)``.
    """
    data = pd.read_csv(path)
    # drop samples that contain missing values
    data = data.dropna()
    # drop the features that are not used for training
    data = data.drop(columns=["education", "currentSmoker", "BPMeds", "diabetes", "diaBP", "BMI"])
    # Balance the classes by sampling each one down to the minority size.
    # Sampling group by group (instead of ``groupby.apply``) keeps the label
    # as an ordinary column whatever pandas does with grouping columns.
    minority_size = data["TenYearCHD"].value_counts().min()
    data = pd.concat(
        [group.sample(minority_size, random_state=73) for _, group in data.groupby("TenYearCHD")],
        ignore_index=True,
    )
    # extract the labels as an (n, 1) float tensor
    y = torch.tensor(data["TenYearCHD"].values).float().unsqueeze(1)
    # ``columns=`` keyword: the positional ``axis`` argument of ``drop`` was
    # removed in pandas 2.0.
    data = data.drop(columns="TenYearCHD")
    # standardise every feature column (zero mean, unit standard deviation)
    data = (data - data.mean()) / data.std()
    x = torch.tensor(data.values).float()
    return split_train_test(x, y)
def random_data(m=1024, n=2):
    """Generate synthetic data separable by the line ``y = x``.

    Args:
        m: Number of training samples; the test set gets ``m // 2`` samples.
        n: Number of features per sample.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``. Features are drawn
        with ``torch.randn``; a label is 1.0 when the first feature is greater
        than or equal to the second, else 0.0, shaped as a column.
    """

    def label(points):
        # Column vector of 0.0/1.0 flags comparing the first two features.
        return (points[:, 0] >= points[:, 1]).float().unsqueeze(1)

    train_points = torch.randn(m, n)
    test_points = torch.randn(m // 2, n)
    return train_points, label(train_points), test_points, label(test_points)
# You can use whatever data you want without modification to the tutorial
# Uncomment the next line (and comment out the one after it) to train on
# synthetic data instead of the CHD dataset.
# x_train, y_train, x_test, y_test = random_data()
x_train, y_train, x_test, y_test = heart_disease_data()
# Print the shape of every split as a quick sanity check.
print("############# Data summary #############")
print(f"x_train has shape: {x_train.shape}")
print(f"y_train has shape: {y_train.shape}")
print(f"x_test has shape: {x_test.shape}")
print(f"y_test has shape: {y_test.shape}")
print("#######################################")
输出如下
############# Data summary #############
x_train has shape: torch.Size([780, 9])
y_train has shape: torch.Size([780, 1])
x_test has shape: torch.Size([334, 9])
y_test has shape: torch.Size([334, 1])
#######################################
没什么需要注意的点,就是一个简单的logistic回归的过程。
class LR(torch.nn.Module):
    """Logistic regression: a single linear layer followed by a sigmoid."""

    def __init__(self, n_features):
        """Create the linear layer mapping ``n_features`` inputs to one output."""
        super().__init__()
        self.lr = torch.nn.Linear(in_features=n_features, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer applied to ``x``."""
        logits = self.lr(x)
        return torch.sigmoid(logits)
# The number of input features is the second dimension of the training data.
n_features = x_train.shape[1]
model = LR(n_features)
# use gradient descent with a learning_rate=1
optim = torch.optim.SGD(model.parameters(), lr=1)
# use Binary Cross Entropy Loss
criterion = torch.nn.BCELoss()
# define the number of epochs for both plain and encrypted training
EPOCHS = 5
def train(model, optim, criterion, x, y, epochs=EPOCHS):
    """Train ``model`` on the whole of ``(x, y)`` once per epoch.

    Args:
        model: Callable model whose parameters ``optim`` updates.
        optim: Optimizer providing ``zero_grad()`` and ``step()``.
        criterion: Loss function called as ``criterion(prediction, y)``.
        x: Training inputs, passed to ``model`` in a single batch.
        y: Training targets.
        epochs: Number of passes over the data.

    Returns:
        The same ``model`` object after training.
    """
    for e in range(1, epochs + 1):
        # Clear the gradients left over from the previous step.
        optim.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optim.step()
        print(f"Loss at epoch {e}: {loss.data}")
    return model


# Train the plaintext model on the full training set.
model = train(model, optim, criterion, x_train, y_train)
def accuracy(model, x, y):
    """Return the fraction of samples that ``model`` classifies correctly.

    A prediction counts as correct when it lies within 0.5 of its label.

    Args:
        model: Callable returning predictions for ``x``.
        x: Input samples.
        y: Labels with a shape compatible with the model output.

    Returns:
        A scalar tensor holding the mean of the per-sample correctness flags.
    """
    # Evaluation only: no gradients are needed, so skip autograd recording.
    with torch.no_grad():
        out = model(x)
    correct = torch.abs(y - out) < 0.5
    return correct.float().mean()
# Evaluate the plaintext-trained model on the held-out test set.
# (The listing repeated the ``accuracy`` definition and this evaluation a
# second time; the duplicate was redundant and has been removed.)
plain_accuracy = accuracy(model, x_test, y_test)
print(f"Accuracy on plain test_set: {plain_accuracy}")
最后的正确率约为70.35%,现在我们用HE加密的方式来测试一下。
采用同态加密的方式来对模型进行评估。
评估方式伪码如下
明文方式:
model(x_test) compare with y_test
HE方式:
model.weight.decrypt()
model.bias.decrypt()
x_test = x_test.decrypt()
y_test = y_test.decrypt()
model(x_test) compare with y_test
具体代码
def encrypted_evaluation(model, enc_x_test, y_test):
    """Evaluate ``model`` on encrypted samples and report its accuracy.

    Every encrypted sample is passed through ``model``; the result is
    decrypted, passed through a plain sigmoid and compared with its label.
    A prediction counts as correct when it lies within 0.5 of the label.

    Args:
        model: Callable taking one encrypted sample and returning an object
            that provides ``decrypt()``.
        enc_x_test: Iterable of encrypted samples.
        y_test: Iterable of plain labels aligned with ``enc_x_test``.

    Returns:
        The fraction of correctly classified samples, or 0.0 when no sample
        was evaluated.
    """
    t_start = time()
    correct = 0
    # Count the evaluated pairs here rather than reading the size of a
    # module-level ``x_test``, so the function depends only on its arguments.
    total = 0
    for enc_x, y in zip(enc_x_test, y_test):
        # encrypted evaluation
        enc_out = model(enc_x)
        # plain comparison
        out = torch.sigmoid(torch.tensor(enc_out.decrypt()))
        if torch.abs(out - y) < 0.5:
            correct += 1
        total += 1
    t_end = time()
    # Guard against an empty test set instead of dividing by zero.
    result = correct / total if total else 0.0
    print(f"Evaluated test_set of {total} entries in {int(t_end - t_start)} seconds")
    print(f"Accuracy: {correct}/{total} = {result}")
    return result
# Evaluate on the encrypted test set, then compare with the plain baseline.
encrypted_accuracy = encrypted_evaluation(eelr, enc_x_test, y_test)
diff_accuracy = plain_accuracy - encrypted_accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")
# A negative difference means the encrypted evaluation scored higher.
if diff_accuracy < 0:
    print("Oh! We got a better accuracy on the encrypted test-set! The noise was on our side...")
最后的正确率在62.57%,相差不大,可以认为同态加密对模型的保护较好。
通过加密模型进行评估的方式,验证了HE在加密模型中进行运算的可能性。现在只是使用HE对训练数据进行加密训练,并对模型效果进行评估。
明文方式:
pred = model(x_train)
loss = loss(pred, y_train)
loss.backward()
model.updateParam()
acc = model(x_test) compare with y_test
密文方式
model.encrypt()
pred = model(enc_x_train)
loss = loss(pred, enc_y_train)
loss.backward()
model.updateParam()
model.decrypt()
acc = model(x_test) compare with y_test
代码:
import torch
import tenseal as ts
import pandas as pd
import random
from time import time
# those are optional and are not necessary for training
import numpy as np
import matplotlib.pyplot as plt
# Fix the random seeds so that model weight initialisation and the
# train/test shuffle are reproducible.
random.seed(73)
torch.manual_seed(73)
def split_train_test(x, y, test_ratio=0.3):
    """Randomly split paired samples into a train and a test subset.

    Args:
        x: Samples; must support ``len`` and indexing with a list of integer
            positions (e.g. a ``torch.Tensor``).
        y: Labels aligned with ``x``; must have the same length.
        test_ratio: Fraction of the samples placed in the test subset,
            within ``[0, 1]``.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length, or ``test_ratio`` is
            outside ``[0, 1]``.
    """
    if len(x) != len(y):
        raise ValueError(f"x and y must have the same length, got {len(x)} and {len(y)}")
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio}")
    idxs = list(range(len(x)))
    # Shuffles with the module-level ``random`` state, so the split is
    # reproducible only when that state has been seeded beforehand.
    random.shuffle(idxs)
    # delimiter between test and train data
    delim = int(len(x) * test_ratio)
    test_idxs, train_idxs = idxs[:delim], idxs[delim:]
    return x[train_idxs], y[train_idxs], x[test_idxs], y[test_idxs]
def heart_disease_data(path="./data/framingham.csv"):
    """Load the CHD dataset and return a balanced, normalised train/test split.

    Steps: read the CSV, drop rows containing missing values, drop six
    feature columns, down-sample every ``TenYearCHD`` class to the size of
    the smallest class, separate the label, standardise the remaining
    features, and split with ``split_train_test``.

    Args:
        path: Location of the CSV file.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)`` of float tensors;
        the label tensors have shape ``(n, 1)``.
    """
    data = pd.read_csv(path)
    # drop samples that contain missing values
    data = data.dropna()
    # drop the features that are not used for training
    data = data.drop(columns=["education", "currentSmoker", "BPMeds", "diabetes", "diaBP", "BMI"])
    # Balance the classes by sampling each one down to the minority size.
    minority_size = data["TenYearCHD"].value_counts().min()
    data = pd.concat(
        [group.sample(minority_size, random_state=73) for _, group in data.groupby("TenYearCHD")],
        ignore_index=True,
    )
    # Extract the labels; this listing previously returned ``y`` without ever
    # defining it, which raised NameError.
    y = torch.tensor(data["TenYearCHD"].values).float().unsqueeze(1)
    data = data.drop(columns="TenYearCHD")
    # standardise every feature column (zero mean, unit standard deviation)
    data = (data - data.mean()) / data.std()
    x = torch.tensor(data.values).float()
    return split_train_test(x, y)
def random_data(m=1024, n=2):
    """Generate synthetic data separable by the line ``y = x``.

    Args:
        m: Number of training samples; the test set gets ``m // 2`` samples.
        n: Number of features per sample.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``. Features are drawn
        with ``torch.randn``; a label is 1.0 when the first feature is greater
        than or equal to the second, else 0.0, shaped as a column.
    """

    def label(points):
        # Column vector of 0.0/1.0 flags comparing the first two features.
        return (points[:, 0] >= points[:, 1]).float().unsqueeze(1)

    train_points = torch.randn(m, n)
    test_points = torch.randn(m // 2, n)
    return train_points, label(train_points), test_points, label(test_points)
# You can use whatever data you want without modification to the tutorial
# Uncomment the next line (and comment out the one after it) to train on
# synthetic data instead of the CHD dataset.
# x_train, y_train, x_test, y_test = random_data()
x_train, y_train, x_test, y_test = heart_disease_data()
# Print the shape of every split as a quick sanity check.
print("############# Data summary #############")
print(f"x_train has shape: {x_train.shape}")
print(f"y_train has shape: {y_train.shape}")
print(f"x_test has shape: {x_test.shape}")
print(f"y_test has shape: {y_test.shape}")
print("#######################################")
class LR(torch.nn.Module):
    """Logistic regression: a single linear layer followed by a sigmoid."""

    def __init__(self, n_features):
        """Create the linear layer mapping ``n_features`` inputs to one output."""
        super().__init__()
        self.lr = torch.nn.Linear(in_features=n_features, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer applied to ``x``."""
        logits = self.lr(x)
        return torch.sigmoid(logits)
# The number of input features is the second dimension of the training data.
n_features = x_train.shape[1]
model = LR(n_features)
# use gradient descent with a learning_rate=1
optim = torch.optim.SGD(model.parameters(), lr=1)
# use Binary Cross Entropy Loss
criterion = torch.nn.BCELoss()
# define the number of epochs for both plain and encrypted training
EPOCHS = 5
def train(model, optim, criterion, x, y, epochs=EPOCHS):
    """Train ``model`` on the whole of ``(x, y)`` once per epoch.

    Args:
        model: Callable model whose parameters ``optim`` updates.
        optim: Optimizer providing ``zero_grad()`` and ``step()``.
        criterion: Loss function called as ``criterion(prediction, y)``.
        x: Training inputs, passed to ``model`` in a single batch.
        y: Training targets.
        epochs: Number of passes over the data.

    Returns:
        The same ``model`` object after training.
    """
    for e in range(1, epochs + 1):
        # Clear the gradients left over from the previous step.
        optim.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optim.step()
        print(f"Loss at epoch {e}: {loss.data}")
    return model


# Train the plaintext model on the full training set.
model = train(model, optim, criterion, x_train, y_train)
def accuracy(model, x, y):
    """Return the fraction of samples that ``model`` classifies correctly.

    A prediction counts as correct when it lies within 0.5 of its label.

    Args:
        model: Callable returning predictions for ``x``.
        x: Input samples.
        y: Labels with a shape compatible with the model output.

    Returns:
        A scalar tensor holding the mean of the per-sample correctness flags.
    """
    # Evaluation only: no gradients are needed, so skip autograd recording.
    with torch.no_grad():
        out = model(x)
    correct = torch.abs(y - out) < 0.5
    return correct.float().mean()
# Evaluate the plaintext-trained model on the held-out test set.
# (The listing repeated the ``accuracy`` definition and this evaluation a
# second time; the duplicate was redundant and has been removed.)
plain_accuracy = accuracy(model, x_test, y_test)
print(f"Accuracy on plain test_set: {plain_accuracy}")
class EncryptedLR:
    """Logistic regression whose parameters can be encrypted for training.

    The weight and bias start as plain Python lists copied from a torch
    model; ``encrypt`` turns them into CKKS vectors and ``decrypt`` turns
    them back into plain values.
    """

    def __init__(self, torch_lr):
        """Copy the weight row and the bias out of ``torch_lr.lr``."""
        linear = torch_lr.lr
        self.weight = linear.weight.data.tolist()[0]
        self.bias = linear.bias.data.tolist()
        # gradient accumulators and the number of accumulated iterations
        self._reset_accumulators()

    def _reset_accumulators(self):
        """Zero the accumulated gradients and the iteration counter."""
        self._delta_w = 0
        self._delta_b = 0
        self._count = 0

    def forward(self, enc_x):
        """Return the approximated sigmoid of ``enc_x . weight + bias``."""
        linear_out = enc_x.dot(self.weight) + self.bias
        return EncryptedLR.sigmoid(linear_out)

    def backward(self, enc_x, enc_out, enc_y):
        """Accumulate the gradient contribution of a single sample."""
        residual = enc_out - enc_y
        self._delta_w += enc_x * residual
        self._delta_b += residual
        self._count += 1

    def update_parameters(self):
        """Apply the averaged accumulated gradients, then reset them.

        Raises:
            RuntimeError: If nothing has been accumulated since the last
                update.
        """
        if self._count == 0:
            raise RuntimeError("You should at least run one forward iteration")
        averaging = 1 / self._count
        # The extra ``weight * 0.05`` term is a small regularization that
        # keeps the linear output within the range where the sigmoid
        # approximation holds.
        self.weight -= self._delta_w * averaging + self.weight * 0.05
        self.bias -= self._delta_b * averaging
        self._reset_accumulators()

    @staticmethod
    def sigmoid(enc_x):
        """Evaluate a degree-3 polynomial approximation of the sigmoid.

        sigmoid(x) = 0.5 + 0.197 * x - 0.004 * x^3, taken from
        https://eprint.iacr.org/2018/462.pdf; it fits the function pretty
        well in the range [-5, 5].
        """
        coefficients = [0.5, 0.197, 0, -0.004]
        return enc_x.polyval(coefficients)

    def plain_accuracy(self, x_test, y_test):
        """Return the accuracy on the plain ``(x_test, y_test)`` dataset."""
        weight = torch.tensor(self.weight)
        bias = torch.tensor(self.bias)
        probabilities = torch.sigmoid(x_test.matmul(weight) + bias).reshape(-1, 1)
        hits = torch.abs(y_test - probabilities) < 0.5
        return hits.float().mean()

    def encrypt(self, context):
        """Replace the weight and bias with CKKS vectors built in ``context``."""
        self.weight = ts.ckks_vector(context, self.weight)
        self.bias = ts.ckks_vector(context, self.bias)

    def decrypt(self):
        """Replace the weight and bias with their decrypted values."""
        self.weight = self.weight.decrypt()
        self.bias = self.bias.decrypt()

    def __call__(self, *args, **kwargs):
        """Forward every call to ``forward``."""
        return self.forward(*args, **kwargs)
# parameters
# Polynomial modulus degree passed to the CKKS context below.
poly_mod_degree = 8192
# Bit sizes of the coefficient modulus chain: 40 bits at both ends and six
# 21-bit entries in between.
coeff_mod_bit_sizes = [40, 21, 21, 21, 21, 21, 21, 40]
# create TenSEALContext
ctx_training = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
# The scale exponent (21) equals the bit size of the inner modulus entries.
ctx_training.global_scale = 2 ** 21
# NOTE(review): Galois keys are presumably needed by the vector rotations
# behind ``dot`` - confirm against the TenSEAL documentation.
ctx_training.generate_galois_keys()
# Encrypt every training sample and label as its own CKKS vector and time it.
t_start = time()
enc_x_train = [ts.ckks_vector(ctx_training, x.tolist()) for x in x_train]
enc_y_train = [ts.ckks_vector(ctx_training, y.tolist()) for y in y_train]
t_end = time()
print(f"Encryption of the training_set took {int(t_end - t_start)} seconds")
def normal_dist(x, mean, var):
    """Return the normal probability density evaluated at ``x``.

    Args:
        x: Scalar or NumPy array of evaluation points.
        mean: Mean of the distribution.
        var: Variance of the distribution.

    Returns:
        The density ``exp(-(x - mean)^2 / (2 * var)) / sqrt(2 * pi * var)``,
        with the same shape as ``x``.
    """
    return np.exp(-np.square(x - mean) / (2 * var)) / np.sqrt(2 * np.pi * var)
def plot_normal_dist(mean, var, rmin=-10, rmax=10):
    """Plot the normal density with the given mean and variance.

    The curve is drawn on the current matplotlib axes; the caller is
    responsible for showing the figure.

    Args:
        mean: Mean of the distribution.
        var: Variance of the distribution.
        rmin: Lower bound of the plotted range (inclusive).
        rmax: Upper bound of the plotted range (exclusive).
    """
    x = np.arange(rmin, rmax, 0.01)
    y = normal_dist(x, mean, var)
    # The return value of ``plt.plot`` was bound to an unused local; dropped.
    plt.plot(x, y)
# plain distribution
# A freshly initialised model is used here, not the one trained above.
lr = LR(n_features)
# Output of the linear layer only, i.e. the values before the sigmoid.
data = lr.lr(x_test)
# Variance is obtained by squaring the standard deviation.
mean, var = map(float, [data.mean(), data.std() ** 2])
plot_normal_dist(mean, var)
print("Distribution on plain data:")
plt.show()
# encrypted distribution
def encrypted_out_distribution(eelr, enc_x_test):
    """Plot a normal curve fitted to the linear outputs on encrypted inputs.

    For every encrypted sample the linear output ``enc_x . weight + bias`` is
    computed and decrypted; the mean and variance of those values define the
    curve that is plotted and shown.

    Args:
        eelr: Model providing the ``weight`` and ``bias`` attributes.
        enc_x_test: Iterable of encrypted samples.
    """
    weight, bias = eelr.weight, eelr.bias
    decrypted = [(enc_x.dot(weight) + bias).decrypt() for enc_x in enc_x_test]
    outputs = torch.tensor(decrypted)
    mean = float(outputs.mean())
    var = float(outputs.std() ** 2)
    plot_normal_dist(mean, var)
    print("Distribution on encrypted data:")
    plt.show()
# Check the distribution of the linear outputs on encrypted training data,
# using the same untrained model as the plain distribution plot.
eelr = EncryptedLR(lr)
eelr.encrypt(ctx_training)
encrypted_out_distribution(eelr, enc_x_train)

# Train a freshly initialised model on the encrypted training set.
eelr = EncryptedLR(LR(n_features))
# The result is stored under its own name: binding it to ``accuracy`` would
# overwrite the module-level ``accuracy()`` function.
enc_accuracy = eelr.plain_accuracy(x_test, y_test)
print(f"Accuracy at epoch #0 is {enc_accuracy}")
times = []
for epoch in range(EPOCHS):
    eelr.encrypt(ctx_training)
    # if you want to keep an eye on the distribution to make sure
    # the function approximation is still working fine
    # WARNING: this operation is time consuming
    # encrypted_out_distribution(eelr, enc_x_train)
    t_start = time()
    for enc_x, enc_y in zip(enc_x_train, enc_y_train):
        enc_out = eelr.forward(enc_x)
        eelr.backward(enc_x, enc_out, enc_y)
    eelr.update_parameters()
    t_end = time()
    times.append(t_end - t_start)
    # Decrypt the parameters so the model can be scored on the plain test
    # set; they are encrypted again at the start of the next epoch.
    eelr.decrypt()
    enc_accuracy = eelr.plain_accuracy(x_test, y_test)
    print(f"Accuracy at epoch #{epoch + 1} is {enc_accuracy}")
print(f"\nAverage time per epoch: {int(sum(times) / len(times))} seconds")
print(f"Final accuracy is {enc_accuracy}")
diff_accuracy = plain_accuracy - enc_accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")
if diff_accuracy < 0:
    print("Oh! We got a better accuracy when training on encrypted data! The noise was on our side...")
迭代训练5次后,在明文训练中的准确率为0.703592836856842,在密文训练中的准确率为0.667664647102356,因为训练次数较少,所以两者的差异可以接受。
明文训练时间为5秒,密文训练时间为31秒,密文在保证安全性的同时加大了训练时间。