题目来源于2021年中国研究生数学建模竞赛D题——抗乳腺癌候选药物的优化建模。
在本次实战的数据分析过程中,涉及以下技术内容:
(1)数据预处理
(2)相关系数
(3)NCA算法
(4)重采样
(5)遗传算法
(6)求AUC值
这里包括数据读取、删除相关性大的列、删除值全都是0的列、将超出mean±4σ的异常值替换为mean±4σ。
% Declare the trained models as globals so the GA objective/constraint
% functions (pIC50_max, NC) defined below can access them.
global Mdl_Caco_2 Mdl_CYP3A4 Mdl_hERG Mdl_HOB Mdl_MN Mdl_nca_pIC50_ensemble
% Read the input spreadsheets
warning('off');
file_descriptors='Molecular_Descriptor.xlsx';
file_activity='ERα_activity.xlsx';
sheet_list=sheetnames(file_descriptors);
data1=readtable(file_descriptors,'Sheet',sheet_list(1)); % training-set molecular descriptors
data2=readtable(file_activity);                          % ERα activity (pIC50 target)
data3=readtable(file_descriptors,'Sheet',sheet_list(2)); % test-set molecular descriptors
%Preprocessing: drop all-zero columns, drop highly correlated columns,
%and clip outliers to mean±4σ.
%Output: table_1_ir — the cleaned descriptor table used by all later steps.
table_1=data1(:,2:end);
num_1=data1{:,2:end};
%First delete columns that are identically zero. Use all(...==0) rather
%than sum(...)==0: a column sum of zero can also occur when positive and
%negative entries cancel, which would wrongly delete an informative column.
del_0=find(all(num_1==0,1));
table_1(:,del_0)=[];
num_1(:,del_0)=[];
%Delete one of every pair of features with |correlation| >= 0.9
corr_data=abs(corrcoef(num_1));
del_x=[];
for i=1:size(corr_data,1)
if ~ismember(i,del_x) %columns already marked for deletion are skipped
find_x=find(corr_data(i,:)>=0.9);
find_x(find_x==i)=[]; %a column has correlation 1 with itself, so drop its own index
del_x=[del_x,find_x];
end
end
u_del_x=unique(del_x);
table_1_ir=table_1;
table_1_ir(:,u_del_x)=[]; %table with highly correlated columns removed
%Clip outliers to mean±4σ. Compute each column's statistics ONCE before
%any replacement: the original code recomputed mean/std on line two after
%the upper tail had already been clipped, so the lower-tail threshold was
%derived from modified data.
tem3=table_1_ir{:,:};
tem4=zeros(size(tem3)); %preallocate instead of growing by concatenation
for i=1:size(tem3,2)
col=tem3(:,i);
mu=mean(col);
sd=std(col);
col(col>mu+4*sd)=mu+4*sd;
col(col<mu-4*sd)=mu-4*sd;
tem4(:,i)=col;
end
tem1=table_1_ir.Properties.VariableNames;
table_1_ir=array2table(tem4);
table_1_ir.Properties.VariableNames=tem1;
本来我使用了NCA算法和树模型两种方法求特征重要度,后来发现用NCA算法求出的特征训练的模型精度相对较高,所以最终确定了用NCA算法求特征重要度。写论文时大家可以多尝试几种求特征重要度的方法,以使论文内容丰富。
%Feature importance via NCA (fsrnca), with 5-fold cross-validation over
%the regularization parameter Lambda to guard against overfitting.
%Output: nca_sort — feature indices sorted by descending NCA weight.
rng(1) % For reproducibility
Xtrain=table_1_ir{:,:};
ytrain=data2{:,2}; % pIC50 regression target
n = length(ytrain);
cvp = cvpartition(length(ytrain),'kfold',5);
numvalidsets = cvp.NumTestSets;
%Candidate Lambda grid, scaled by std(y)/n (as in the fsrnca docs example)
lambdavals = linspace(0,50,20)*std(ytrain)/n;
lossvals = zeros(length(lambdavals),numvalidsets);
for i = 1:length(lambdavals)
for k = 1:numvalidsets
X = Xtrain(cvp.training(k),:);
y = ytrain(cvp.training(k),:);
Xvalid = Xtrain(cvp.test(k),:);
yvalid = ytrain(cvp.test(k),:);
%minibatch-lbfgs with a small iteration limit keeps the tuning loop cheap
nca = fsrnca(X,y,'FitMethod','exact', ...
'Solver','minibatch-lbfgs','Lambda',lambdavals(i), ...
'GradientTolerance',1e-4,'IterationLimit',30);
%validation MSE for this (Lambda, fold) pair
lossvals(i,k) = loss(nca,Xvalid,yvalid,'LossFunction','mse');
end
end
%Average the validation loss across folds and visualize it vs Lambda
meanloss = mean(lossvals,2);
figure
plot(lambdavals,meanloss,'ro-')
xlabel('Lambda')
ylabel('Loss (MSE)')
grid on
%Fit the NCA feature selection model for regression using the best λ value.
[~,idx] = min(meanloss)
bestlambda = lambdavals(idx)
bestloss = meanloss(idx)
nca = fsrnca(Xtrain,ytrain,'FitMethod','exact', ...
'Solver','lbfgs','Lambda',bestlambda);
figure
plot(nca.FeatureWeights,'ro')
xlabel('Feature Index')
ylabel('Feature Weight')
grid on
%Sort features by importance, in descending order of NCA weight
nca_w=nca.FeatureWeights;
[nca_w_d,nca_sort]=sort(nca_w,"descend");
这里之所以只用集成学习来训练模型,也是提前进行对比尝试后确定的。
%Retrain the pIC50 regression model on the top-20 NCA-selected features
nca_sel=nca_sort(1:20);
table_train_nca=table_1_ir(:,nca_sel);
table_train_nca.pIC50=data2{:,2};
options = struct('UseParallel',true);
%fitrauto searches learner types and hyperparameters automatically;
%restricted to ensemble learners (chosen after earlier comparisons)
Mdl_nca_pIC50_ensemble = fitrauto(table_train_nca,'pIC50','OptimizeHyperparameters','all', ...
'HyperparameterOptimizationOptions',options,...
"Learners",'ensemble');
%Predict pIC50 for the new (test) compounds in data3, using exactly the
%selected descriptor columns, matched by name
tem1=table_train_nca.Properties.VariableNames;
sel_VariableNames=tem1(1:end-1); % drop the appended 'pIC50' response name
table_tem=data3(:,2:end);
table_predict=table_tem(:,sel_VariableNames);
Y_predict = predict(Mdl_nca_pIC50_ensemble,table_predict);
figure,plot(Y_predict,'b-o');
dir_data4='ADMET-train.xlsx';
data4=readtable(dir_data4);
%Train the Caco-2 classifier on the top-20 NCA-selected descriptors
nca_sel=nca_sort(1:20);
table_train_Caco_2=table_1_ir(:,nca_sel);
Caco_2=data4{:,2};
Caco_2=categorical(Caco_2);
table_train_Caco_2.Caco_2=Caco_2;
%Balance the classes by oversampling label 1 (categorical code 2):
%randomly duplicate minority rows until the class counts are equal.
%Duplicate row indices are collected first and appended once, instead of
%growing the arrays inside the loop (quadratic copying in the original).
d_Caco_2=double(Caco_2);
hang_2=find(d_Caco_2==2);
Xdata =table_train_Caco_2{:,1:end-1};
Ydata =d_Caco_2;
rng(1);
n_dup=sum(d_Caco_2==1)-sum(d_Caco_2==2);
dup_rows=zeros(n_dup,1);
for k=1:n_dup
r = randi([1,length(hang_2)],1,1); %one scalar draw per iteration keeps the rng(1) stream identical to the original
dup_rows(k)=hang_2(r);
end
Xdata_chong=[Xdata;Xdata(dup_rows,:)];
Ydata_chong=[Ydata;Ydata(dup_rows,:)];
table_train_Caco_2_new=array2table(Xdata_chong);
table_train_Caco_2_new.Caco_2=categorical(Ydata_chong);
table_train_Caco_2_new.Properties.VariableNames=table_train_Caco_2.Properties.VariableNames;
%Split into training and test sets (80/20 holdout, stratified by label)
rng(1); % For reproducibility of the data partition
c = cvpartition(Ydata_chong,'Holdout',0.2);
trainingIdx = training(c); % Training set indices
S1Train = table_train_Caco_2_new(trainingIdx,:);
testIdx = test(c); % Test set indices
S1Test = table_train_Caco_2_new(testIdx,:);
options = struct('UseParallel',true);
%Automatically select and tune an ensemble classifier
Mdl_Caco_2 = fitcauto(S1Train,'Caco_2','HyperparameterOptimizationOptions',options,'Learners','ensemble');
%Test-set accuracy
testAccuracy = 1 - loss(Mdl_Caco_2,S1Test,'Caco_2')
%Confusion matrix
confusionchart(S1Test.Caco_2,predict(Mdl_Caco_2,S1Test))
%AUC. NOTE(review): the resampled labels are the double codes {1,2}, so
%categorical(1) selects the code-1 class (original label 0) and
%posterior(:,1) is that same class's score — mutually consistent, but
%confirm that is the intended positive class.
tem1=table_train_Caco_2_new.Caco_2;
Ystats = tem1 == categorical(1);
[~,posterior] = predict(Mdl_Caco_2,S1Test);
[fpr,tpr,~,auc] = perfcurve(Ystats(testIdx),posterior(:,1),true);
auc
%Train the CYP3A4 classifier on the top-20 NCA-selected descriptors
nca_sel=nca_sort(1:20);
table_train_CYP3A4=table_1_ir(:,nca_sel);
CYP3A4=data4{:,3};
CYP3A4=categorical(CYP3A4);
table_train_CYP3A4.CYP3A4=CYP3A4;
%Balance the classes by oversampling label 0 (categorical code 1):
%randomly duplicate minority rows until the class counts are equal.
%Duplicate row indices are collected first and appended once, instead of
%growing the arrays inside the loop (quadratic copying in the original).
d_CYP3A4=double(CYP3A4);
hang_2=find(d_CYP3A4==1);
Xdata =table_train_CYP3A4{:,1:end-1};
Ydata =d_CYP3A4;
rng(1);
n_dup=sum(d_CYP3A4==2)-sum(d_CYP3A4==1);
dup_rows=zeros(n_dup,1);
for k=1:n_dup
r = randi([1,length(hang_2)],1,1); %one scalar draw per iteration keeps the rng(1) stream identical to the original
dup_rows(k)=hang_2(r);
end
Xdata_chong=[Xdata;Xdata(dup_rows,:)];
Ydata_chong=[Ydata;Ydata(dup_rows,:)];
table_train_CYP3A4_new=array2table(Xdata_chong);
table_train_CYP3A4_new.CYP3A4=categorical(Ydata_chong);
table_train_CYP3A4_new.Properties.VariableNames=table_train_CYP3A4.Properties.VariableNames;
%Split into training and test sets (80/20 holdout, stratified by label)
rng(1); % For reproducibility of the data partition
c = cvpartition(Ydata_chong,'Holdout',0.2);
trainingIdx = training(c); % Training set indices
S1Train = table_train_CYP3A4_new(trainingIdx,:);
testIdx = test(c); % Test set indices
S1Test = table_train_CYP3A4_new(testIdx,:);
options = struct('UseParallel',true);
%Automatically select and tune an ensemble classifier
Mdl_CYP3A4 = fitcauto(S1Train,'CYP3A4','HyperparameterOptimizationOptions',options,'Learners','ensemble');
%Test-set accuracy
testAccuracy = 1 - loss(Mdl_CYP3A4,S1Test,'CYP3A4')
%Confusion matrix
confusionchart(S1Test.CYP3A4,predict(Mdl_CYP3A4,S1Test))
%AUC. NOTE(review): the resampled labels are the double codes {1,2}, so
%categorical(1) selects the code-1 class (original label 0) and
%posterior(:,1) is that same class's score — mutually consistent, but
%confirm that is the intended positive class.
tem1=table_train_CYP3A4_new.CYP3A4;
Ystats = tem1 == categorical(1);
[~,posterior] = predict(Mdl_CYP3A4,S1Test);
[fpr,tpr,~,auc] = perfcurve(Ystats(testIdx),posterior(:,1),true);
auc
%Train the hERG classifier on the top-20 NCA-selected descriptors
nca_sel=nca_sort(1:20);
table_train_hERG=table_1_ir(:,nca_sel);
hERG=data4{:,4};
hERG=categorical(hERG);
table_train_hERG.hERG=hERG;
%Balance the classes by oversampling label 0 (categorical code 1):
%randomly duplicate minority rows until the class counts are equal.
%Duplicate row indices are collected first and appended once, instead of
%growing the arrays inside the loop (quadratic copying in the original).
d_hERG=double(hERG);
hang_2=find(d_hERG==1);
Xdata =table_train_hERG{:,1:end-1};
Ydata =d_hERG;
rng(1);
n_dup=sum(d_hERG==2)-sum(d_hERG==1);
dup_rows=zeros(n_dup,1);
for k=1:n_dup
r = randi([1,length(hang_2)],1,1); %one scalar draw per iteration keeps the rng(1) stream identical to the original
dup_rows(k)=hang_2(r);
end
Xdata_chong=[Xdata;Xdata(dup_rows,:)];
Ydata_chong=[Ydata;Ydata(dup_rows,:)];
table_train_hERG_new=array2table(Xdata_chong);
table_train_hERG_new.hERG=categorical(Ydata_chong);
table_train_hERG_new.Properties.VariableNames=table_train_hERG.Properties.VariableNames;
%Split into training and test sets (80/20 holdout, stratified by label)
rng(1); % For reproducibility of the data partition
c = cvpartition(Ydata_chong,'Holdout',0.2);
trainingIdx = training(c); % Training set indices
S1Train = table_train_hERG_new(trainingIdx,:);
testIdx = test(c); % Test set indices
S1Test = table_train_hERG_new(testIdx,:);
options = struct('UseParallel',true);
%Automatically select and tune an ensemble classifier
Mdl_hERG = fitcauto(S1Train,'hERG','HyperparameterOptimizationOptions',options,'Learners','ensemble');
%Test-set accuracy
testAccuracy = 1 - loss(Mdl_hERG,S1Test,'hERG')
%Confusion matrix
confusionchart(S1Test.hERG,predict(Mdl_hERG,S1Test))
%AUC. NOTE(review): the resampled labels are the double codes {1,2}, so
%categorical(1) selects the code-1 class (original label 0) and
%posterior(:,1) is that same class's score — mutually consistent, but
%confirm that is the intended positive class.
tem1=table_train_hERG_new.hERG;
Ystats = tem1 == categorical(1);
[~,posterior] = predict(Mdl_hERG,S1Test);
[fpr,tpr,~,auc] = perfcurve(Ystats(testIdx),posterior(:,1),true);
auc
%Train the HOB classifier on the top-20 NCA-selected descriptors
nca_sel=nca_sort(1:20);
table_train_HOB=table_1_ir(:,nca_sel);
HOB=data4{:,5};
HOB=categorical(HOB);
table_train_HOB.HOB=HOB;
%Balance the classes by oversampling label 1 (categorical code 2):
%randomly duplicate minority rows until the class counts are equal.
%Duplicate row indices are collected first and appended once, instead of
%growing the arrays inside the loop (quadratic copying in the original).
d_HOB=double(HOB);
hang_2=find(d_HOB==2);
Xdata =table_train_HOB{:,1:end-1};
Ydata =d_HOB;
rng(1);
n_dup=sum(d_HOB==1)-sum(d_HOB==2);
dup_rows=zeros(n_dup,1);
for k=1:n_dup
r = randi([1,length(hang_2)],1,1); %one scalar draw per iteration keeps the rng(1) stream identical to the original
dup_rows(k)=hang_2(r);
end
Xdata_chong=[Xdata;Xdata(dup_rows,:)];
Ydata_chong=[Ydata;Ydata(dup_rows,:)];
table_train_HOB_new=array2table(Xdata_chong);
table_train_HOB_new.HOB=categorical(Ydata_chong);
table_train_HOB_new.Properties.VariableNames=table_train_HOB.Properties.VariableNames;
%Split into training and test sets (80/20 holdout, stratified by label)
rng(1); % For reproducibility of the data partition
c = cvpartition(Ydata_chong,'Holdout',0.2);
trainingIdx = training(c); % Training set indices
S1Train = table_train_HOB_new(trainingIdx,:);
testIdx = test(c); % Test set indices
S1Test = table_train_HOB_new(testIdx,:);
options = struct('UseParallel',true);
%Automatically select and tune an ensemble classifier
Mdl_HOB = fitcauto(S1Train,'HOB','HyperparameterOptimizationOptions',options,'Learners','ensemble');
%Test-set accuracy
testAccuracy = 1 - loss(Mdl_HOB,S1Test,'HOB')
%Confusion matrix
confusionchart(S1Test.HOB,predict(Mdl_HOB,S1Test))
%AUC. NOTE(review): the resampled labels are the double codes {1,2}, so
%categorical(1) selects the code-1 class (original label 0) and
%posterior(:,1) is that same class's score — mutually consistent, but
%confirm that is the intended positive class.
tem1=table_train_HOB_new.HOB;
Ystats = tem1 == categorical(1);
[~,posterior] = predict(Mdl_HOB,S1Test);
[fpr,tpr,~,auc] = perfcurve(Ystats(testIdx),posterior(:,1),true);
auc
%Train the MN classifier on the top-20 NCA-selected descriptors
nca_sel=nca_sort(1:20);
table_train_MN=table_1_ir(:,nca_sel);
MN=data4{:,6};
MN=categorical(MN);
table_train_MN.MN=MN;
%Balance the classes by oversampling label 0 (categorical code 1):
%randomly duplicate minority rows until the class counts are equal.
%Duplicate row indices are collected first and appended once, instead of
%growing the arrays inside the loop (quadratic copying in the original).
d_MN=double(MN);
hang_2=find(d_MN==1);
Xdata =table_train_MN{:,1:end-1};
Ydata =d_MN;
rng(1);
n_dup=sum(d_MN==2)-sum(d_MN==1);
dup_rows=zeros(n_dup,1);
for k=1:n_dup
r = randi([1,length(hang_2)],1,1); %one scalar draw per iteration keeps the rng(1) stream identical to the original
dup_rows(k)=hang_2(r);
end
Xdata_chong=[Xdata;Xdata(dup_rows,:)];
Ydata_chong=[Ydata;Ydata(dup_rows,:)];
table_train_MN_new=array2table(Xdata_chong);
table_train_MN_new.MN=categorical(Ydata_chong);
table_train_MN_new.Properties.VariableNames=table_train_MN.Properties.VariableNames;
%Split into training and test sets (80/20 holdout, stratified by label)
rng(1); % For reproducibility of the data partition
c = cvpartition(Ydata_chong,'Holdout',0.2);
trainingIdx = training(c); % Training set indices
S1Train = table_train_MN_new(trainingIdx,:);
testIdx = test(c); % Test set indices
S1Test = table_train_MN_new(testIdx,:);
options = struct('UseParallel',true);
%Automatically select and tune an ensemble classifier
Mdl_MN = fitcauto(S1Train,'MN','HyperparameterOptimizationOptions',options,'Learners','ensemble');
%Test-set accuracy
testAccuracy = 1 - loss(Mdl_MN,S1Test,'MN')
%Confusion matrix
confusionchart(S1Test.MN,predict(Mdl_MN,S1Test))
%AUC. NOTE(review): the resampled labels are the double codes {1,2}, so
%categorical(1) selects the code-1 class (original label 0) and
%posterior(:,1) is that same class's score — mutually consistent, but
%confirm that is the intended positive class.
tem1=table_train_MN_new.MN;
Ystats = tem1 == categorical(1);
[~,posterior] = predict(Mdl_MN,S1Test);
[fpr,tpr,~,auc] = perfcurve(Ystats(testIdx),posterior(:,1),true);
auc
%Lower/upper bounds for the 20 selected descriptors, taken from the
%observed training ranges; used as the GA box constraints lb/ub.
tem1=table_train_nca{:,1:end-1};
lb=min(tem1,[],1);
ub=max(tem1,[],1);
可能是约束函数和目标函数的特殊性,导致实际运行时不能使用并行运算。注意,在自定义函数中使用workspace里算出的数据时,要提前在主函数和自定义函数中都将其声明为全局变量。另外,我发现多次运行算出的最高pIC50值不同,应该和每次在ADMET分类时随机重采样的样本不同有关。
%Genetic algorithm: maximize predicted pIC50 (via pIC50_max, which
%negates the prediction) subject to the ADMET constraint in NC, within
%the observed descriptor ranges [lb,ub] over 20 variables.
nonlcon = @NC;
fun = @pIC50_max;
%NOTE(review): with 'UseVectorized' true, ga passes the whole population
%matrix to fun/nonlcon in one call; in that mode the 'UseParallel'
%setting is not used for fitness evaluation — confirm this is intended.
options = optimoptions('ga','UseParallel', true, 'UseVectorized', true, ...
'ConstraintTolerance',1e-6,'PlotFcn', @gaplotbestf,'MigrationFraction',0.2);
rng default % For reproducibility
[x,fval,exitflag,output,population,scores]= ga(fun,20,[],[],[],[],lb,ub,nonlcon,options)
%Evaluate the constraint quantity at the optimum and display it: the sum
%of the five classifiers' label codes minus 5 (each code is 1 or 2, so
%this counts how many models returned the code-2 label)
double(predict(Mdl_Caco_2,x))...
+double(predict(Mdl_CYP3A4,x))...
+double(predict(Mdl_hERG,x))...
+double(predict(Mdl_HOB,x))...
+double(predict(Mdl_MN,x))-5
其中,
function y=pIC50_max(x)
%GA objective: the negated ensemble prediction of pIC50, so that
%minimizing y with ga maximizes the predicted activity.
%x may be a single row or a whole population matrix (ga runs vectorized).
global Mdl_nca_pIC50_ensemble
predicted=predict(Mdl_nca_pIC50_ensemble,x);
y=-predicted;
end
function [c,ceq] = NC(x)
%GA nonlinear constraint on the five ADMET classifiers.
%Each double(predict(...)) is a categorical label code (1 or 2), so the
%summed codes minus 5 counts how many models returned the code-2 label;
%c <= 0 requires that count to be at least 3. No equality constraints.
global Mdl_Caco_2 Mdl_CYP3A4 Mdl_hERG Mdl_HOB Mdl_MN
code_sum=double(predict(Mdl_Caco_2,x)) ...
    +double(predict(Mdl_CYP3A4,x)) ...
    +double(predict(Mdl_hERG,x)) ...
    +double(predict(Mdl_HOB,x)) ...
    +double(predict(Mdl_MN,x));
c=3-(code_sum-5);
ceq = [];
end