class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
主要是看有没有缺失值、异常值;
逻辑回归还需要把分类变量转化为数值变量(分类变量不能是字符串)
import pandas as pd
# Load the raw data set from disk
path = "../Data/classify.csv"
rawdata = pd.read_csv(path)
# Features: the first 13 columns (positions 0-12)
X = rawdata.iloc[:, :13]
# Target: the column at position 14, integer-encoded through pd.Categorical,
# e.g. {"A": 0, "B": 1, "C": 2} -- the codes start at 0, not 1.
# NOTE(review): position 13 is used neither as a feature nor as the target
# -- confirm that skipping it is intended.
Y = pd.Categorical(rawdata.iloc[:, 14]).codes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Split into training and test sets, holding out 30% of the samples for testing
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
# Train a multinomial logistic regression with the newton-cg solver
log_model = LogisticRegression(
    multi_class="multinomial",
    solver="newton-cg",
    max_iter=1000,
)
log_model.fit(x_train, y_train)
核心就是fit
分类模型,一般采用准确率和召回率评价性能的好坏。
from sklearn.metrics import accuracy_score, recall_score
# Predict labels for the held-out test samples
pred_test = log_model.predict(x_test)
# Accuracy on the test set
acu = accuracy_score(y_true=y_test, y_pred=pred_test)
# Recall on the test set, macro-averaged over the classes
recall = recall_score(y_true=y_test, y_pred=pred_test, average="macro")
注意:传给 accuracy_score 和 recall_score 的真实标签与预测标签应为一维的类数组(例如 np.array)。
待优化的代码:
def log(x_train, y_train, x_test, y_test, multi_class, solver):
    """Fit a logistic regression and score it on the test split.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        multi_class: Forwarded to ``LogisticRegression``; the notes list
            ``'ovr'`` and ``'multinomial'``.
        solver: Forwarded to ``LogisticRegression``; the notes list
            ``'newton-cg'``, ``'lbfgs'``, ``'liblinear'``, ``'sag'`` and
            ``'saga'``.

    Returns:
        A ``(accuracy, macro_recall)`` tuple computed on the test split.
    """
    classifier = LogisticRegression(
        multi_class=multi_class,
        solver=solver,
        max_iter=1000,
    )
    # fit() returns the estimator itself, so prediction can be chained
    predictions = classifier.fit(x_train, y_train).predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    macro_recall = recall_score(y_test, predictions, average="macro")
    return accuracy, macro_recall
def run_log_reg(times, test_size):
    """Average test accuracy and macro recall over repeated random splits.

    For each of ``times`` repetitions the module-level ``X`` / ``Y`` data are
    split afresh with ``train_test_split``, and ``log`` is called once for
    every ``multi_class`` / ``solver`` pair except multinomial + liblinear.
    The per-run scores are then averaged per pair.

    Args:
        times: Number of repetitions; must be at least 1.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        ``(ave_acu, ave_recall)``: two pandas Series indexed by
        ``(multi_class, solver)`` holding the mean accuracy and the mean
        macro recall over all repetitions.

    Raises:
        ValueError: If ``times`` is less than 1 (there would be nothing to
            average).
    """
    if times < 1:
        raise ValueError(f"times must be at least 1, got {times!r}")

    records = []
    for i in range(times):
        # Draw a new random split on every repetition so the averages are
        # taken over different train/test partitions.
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=test_size
        )
        for multi_class in ("ovr", "multinomial"):
            for solver in ("newton-cg", "liblinear", "lbfgs", "sag", "saga"):
                # scikit-learn's liblinear solver does not support the
                # multinomial option, so that combination is skipped.
                if multi_class == "multinomial" and solver == "liblinear":
                    continue
                acu, recall = log(
                    x_train, y_train, x_test, y_test, multi_class, solver
                )
                records.append((i, multi_class, solver, acu, recall))

    df_re = pd.DataFrame(
        records,
        columns=["times", "multi_class", "solver", "acu", "recall"],
    )
    # One groupby yields both means; selecting a column gives the same
    # Series that a separate per-column groupby would.
    means = df_re.groupby(["multi_class", "solver"])[["acu", "recall"]].mean()
    return means["acu"], means["recall"]
这样就求得了每种 multi_class 与 solver 组合下,模型在多次运行中的平均准确率和平均召回率。