SVM and Logistic Regression for Binary Classification: Predicting User Ad-Click Behavior

This dataset comes from a competition on predicting users' ad-click behavior. I have uploaded it along with my code; if you want to run the tests yourself, you can download it from there.
Let's go straight to the code.

import os
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
train=pd.read_csv("E:/bigdata/train.csv")

label=pd.read_csv("E:/bigdata/train_label.csv")
print(train)
print(label)

df = train.iloc[:, 2:]                        # keep columns from index 2 onward as features
print(df.isnull().sum())                      # check for missing values
df = (df - df.min()) / (df.max() - df.min())  # min-max normalization to [0, 1]


print(df)
print(df.describe())


labels = label['label']
#df['label'] = labels

#print(df)

pca=PCA()
pca.fit(df)
#print(pca.components_)  # principal axes (component vectors) of the fitted model
print(pca.explained_variance_ratio_)  # proportion of variance explained by each component
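# A common heuristic (optional, not in the original script): keep the smallest number
# of components whose cumulative explained variance reaches a threshold such as 95%,
# instead of hard-coding the count below. A minimal sketch:
cum_ratio = np.cumsum(pca.explained_variance_ratio_)
print("components needed for 95% of the variance:", int(np.argmax(cum_ratio >= 0.95)) + 1)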

pca = PCA(5)  # keep 5 principal components
pca.fit(df)

low_d=pca.transform(df)
print(low_d)  # the data after dimensionality reduction
print(len(low_d))


#print(df)
train_data, test_data, train_label, test_label = train_test_split(low_d, labels, random_state=1, train_size=0.7, test_size=0.3)


classifier = svm.SVC(gamma=0.1)  # RBF-kernel SVM with a hand-picked gamma
classifier.fit(train_data, train_label)

print("Training set:", classifier.score(train_data, train_label))
print("Test set:", classifier.score(test_data, test_label))
os.system("pause")

The above is the SVM prediction code.
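The SVC above uses a hand-picked gamma=0.1 and the default C. As a possible refinement (a sketch of mine, not part of the original script), the two hyperparameters could be tuned with a cross-validated grid search on the same training split:

from sklearn import svm
from sklearn.model_selection import GridSearchCV

# illustrative search grid; the value ranges are guesses, not tuned for this dataset
param_grid = {"C": [0.1, 1, 10, 100], "gamma": [0.01, 0.1, 1]}
search = GridSearchCV(svm.SVC(), param_grid, cv=5)  # 5-fold cross-validation
search.fit(train_data, train_label)
print(search.best_params_, search.best_score_)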

Below is the logistic regression prediction code.


import os
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
train=pd.read_csv("E:/bigdata/train.csv")

label=pd.read_csv("E:/bigdata/train_label.csv")
print(train)
print(label)

df = train.iloc[:, 2:]                        # keep columns from index 2 onward as features
print(df.isnull().sum())                      # check for missing values
df = (df - df.min()) / (df.max() - df.min())  # min-max normalization to [0, 1]


print(df)
print(df.describe())


labels = label['label']
#df['label'] = labels

#print(df)

pca=PCA()
pca.fit(df)
#print(pca.components_)  # principal axes (component vectors) of the fitted model
print(pca.explained_variance_ratio_)  # proportion of variance explained by each component

pca = PCA(3)  # keep 3 principal components
pca.fit(df)

low_d=pca.transform(df)
print(low_d)  # the data after dimensionality reduction
print(len(low_d))


#print(df)
train_data, test_data, train_label, test_label = train_test_split(low_d, labels, random_state=1, train_size=0.7, test_size=0.3)

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# penalty: type of regularization, 'l1' or 'l2'
# C: inverse of the regularization strength (larger C = weaker regularization)
# multi_class='ovr': one-vs-rest scheme for multi-class problems
# solver='liblinear' supports the l1 penalty (newer scikit-learn defaults to lbfgs, which does not)
lor = LogisticRegression(penalty='l1', C=100, multi_class='ovr', solver='liblinear')
lor.fit(train_data,train_label)
print(lor.score(test_data,test_label))
 
sgdv = SGDClassifier(penalty='l1')
sgdv.fit(train_data,train_label)
print(sgdv.score(test_data,test_label))
os.system("pause")
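Both scripts only report accuracy via .score(). Click data is usually imbalanced, so ROC AUC can be a useful complementary metric; a minimal sketch, reusing lor, test_data and test_label from the script above:

from sklearn.metrics import roc_auc_score

# probability of the positive class (column 1 of predict_proba)
proba = lor.predict_proba(test_data)[:, 1]
print("ROC AUC:", roc_auc_score(test_label, proba))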

This binary classification problem turned out to be a bit special: both of the algorithms above rely on PCA for dimensionality reduction, and after reducing and extracting the features, both only reach an accuracy of around 0.83, so next I plan to test a neural network.
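For that neural-network test, one possible starting point (a sketch of my own, not the final model) is scikit-learn's MLPClassifier on the same PCA-reduced features and train/test split:

from sklearn.neural_network import MLPClassifier

# a small multilayer perceptron; hidden_layer_sizes and max_iter are illustrative, untuned values
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=1)
mlp.fit(train_data, train_label)
print("Test set:", mlp.score(test_data, test_label))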
