使用Logistic Regression进行数据分类。
def generate_data(seed, data_size_1=300, data_size_2=400):
    """Generate a shuffled two-class, two-feature synthetic data set.

    Class 0 draws feature 1 from N(5, 1) and feature 2 from N(4, 1).
    Class 1 draws feature 1 from N(5, 2) and feature 2 from N(4, 2).

    :param seed: seed for the random number generator
    :param data_size_1: number of class-0 samples (default 300)
    :param data_size_2: number of class-1 samples (default 400)
    :return: ``(x, y)`` where ``x`` has shape
        ``(data_size_1 + data_size_2, 2)`` and ``y`` holds the matching
        integer labels (0 or 1), both shuffled with the same permutation
    """
    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, without reseeding the global generator.
    rng = np.random.RandomState(seed)

    # The draw order below (class 0 features, class 1 features, permutation)
    # determines the generated values for a given seed; do not reorder.
    # class 0
    x1_1 = rng.normal(loc=5.0, scale=1.0, size=data_size_1)
    x2_1 = rng.normal(loc=4.0, scale=1.0, size=data_size_1)
    # class 1
    x1_2 = rng.normal(loc=5.0, scale=2.0, size=data_size_2)
    x2_2 = rng.normal(loc=4.0, scale=2.0, size=data_size_2)

    # Stack the two features as columns of one 2-D matrix.
    x = np.column_stack((
        np.concatenate((x1_1, x1_2), axis=0),
        np.concatenate((x2_1, x2_2), axis=0),
    ))
    y = np.concatenate((
        np.zeros(data_size_1, dtype=int),
        np.ones(data_size_2, dtype=int),
    ), axis=0)

    # Shuffle samples and labels together.
    shuffled_index = rng.permutation(data_size_1 + data_size_2)
    return x[shuffled_index], y[shuffled_index]
其中80%数据用于训练,20%数据用于测试,由于数据量小,不设置验证数据集
# Train/test split; the data set is small, so no validation set is used.
def data_split(x_data, y_data, train_ratio=0.8):
    """Split samples and labels into a leading train part and a trailing test part.

    No shuffling is done here; the split is purely positional.

    :param x_data: sliceable sequence/array of samples
    :param y_data: sliceable sequence/array of labels, same length as ``x_data``
    :param train_ratio: fraction of the data used for training (default 0.8)
    :return: ``(x_train, y_train, x_test, y_test)``
    :raises ValueError: if ``train_ratio`` is outside ``[0, 1]``
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError("train_ratio must be within [0, 1]")
    train_split = int(len(y_data) * train_ratio)
    # Slice the function's own arguments (the original sliced undefined
    # names ``x``/``y`` here).
    x_train, x_test = x_data[:train_split], x_data[train_split:]
    y_train, y_test = y_data[:train_split], y_data[train_split:]
    return x_train, y_train, x_test, y_test
根据上文:李宏毅机器学习(四),以及大佬:王佳旭同学代码。
# Logistic regression model
class LogisticRegression():
    """Binary logistic regression trained with full-batch gradient descent.

    :param lr: learning rate
    :param num_iters: number of update iterations
    :param seed: random seed used to initialise the parameters
    """

    def __init__(self, lr=0.1, num_iters=100, seed=None):
        self.seed = seed
        self.lr = lr
        self.num_iters = num_iters

    def fit(self, x, y):
        """Initialise ``w``/``b`` randomly and run ``num_iters`` updates.

        :param x: training samples, 2-D array-like (samples x features)
        :param y: training labels (0 or 1), one per sample
        """
        # A private RandomState gives the same draws as np.random.seed(seed)
        # without reseeding the global generator.
        rng = np.random.RandomState(self.seed)
        # Training set, stored as float arrays so the arithmetic below also
        # works when plain lists are passed in.
        self.x = np.asarray(x, dtype=float)
        self.y = np.asarray(y, dtype=float)
        # Parameter initialisation: w first, then b.
        self.w = rng.normal(loc=0.0, scale=1.0, size=self.x.shape[1])
        self.b = rng.normal(loc=0.0, scale=1.0)
        # Iterative updates.
        for _ in range(self.num_iters):
            self._update_step()

    def _update_step(self):
        """Run a single parameter update (one gradient-descent step)."""
        self.gradient_descent()

    def _sigmoid(self, z):
        """Return the logistic sigmoid of ``z``.

        Computed as ``exp(-log(1 + exp(-z)))`` via ``np.logaddexp`` so that
        large negative ``z`` does not overflow ``exp``.
        """
        return np.exp(-np.logaddexp(0.0, -z))

    def _f(self, x, w, b):
        """Model function: sigmoid of ``x . w + b``."""
        z = x.dot(w) + b
        return self._sigmoid(z)

    def predict_proba(self, x=None):
        """Return the predicted probability of class 1.

        :param x: samples to predict; defaults to the training samples
        """
        if x is None:
            x = self.x
        return self._f(x, self.w, self.b)

    def predict(self, x=None):
        """Return hard 0/1 labels: 0 where probability < 0.5, otherwise 1.

        :param x: samples to predict; defaults to the training samples
        """
        y_pred_proba = self.predict_proba(x)
        return np.where(y_pred_proba < 0.5, 0, 1)

    def score(self, y_true=None, y_pred=None):
        """Return classification accuracy.

        If either argument is omitted, the training labels and the
        predictions on the training samples are used.
        """
        if y_true is None or y_pred is None:
            y_true = self.y
            y_pred = self.predict()
        return np.mean(np.asarray(y_true) == np.asarray(y_pred))

    def loss(self, y_true=None, y_pred_proba=None):
        """Return the mean binary cross-entropy.

        If either argument is omitted, the training labels and the predicted
        probabilities on the training samples are used.
        """
        if y_true is None or y_pred_proba is None:
            y_true = self.y
            y_pred_proba = self.predict_proba()
        y_true = np.asarray(y_true, dtype=float)
        # Keep probabilities strictly inside (0, 1) so log() stays finite.
        eps = 1e-12
        proba = np.clip(np.asarray(y_pred_proba, dtype=float), eps, 1.0 - eps)
        return np.mean(-1.0 * (y_true * np.log(proba) + (1.0 - y_true) * np.log(1.0 - proba)))

    def gradient_descent(self):
        """Apply one gradient-descent step and return the updated ``(w, b)``.

        The cross-entropy gradient uses the predicted probabilities, not the
        thresholded 0/1 labels.
        """
        residual = self.predict_proba() - self.y
        d_w = residual.dot(self.x) / len(self.y)
        d_b = np.mean(residual)
        self.w = self.w - self.lr * d_w
        self.b = self.b - self.lr * d_b
        return self.w, self.b
import matplotlib.pyplot as plt
import numpy as np
def main():
    """Generate data, train the classifier, plot the boundary, print test metrics."""
    # Generate and split the data.
    x, y = generate_data(seed=514)
    x_train, y_train, x_test, y_test = data_split(x, y)

    # Min-max normalisation. The test set is scaled with the TRAINING
    # statistics so both sets live in the same feature space as the model.
    feat_min = np.min(x_train, axis=0)
    feat_range = np.max(x_train, axis=0) - feat_min
    # Avoid division by zero for a constant feature.
    feat_range = np.where(feat_range == 0, 1.0, feat_range)
    x_train = (x_train - feat_min) / feat_range
    x_test = (x_test - feat_min) / feat_range

    # Logistic regression classifier.
    clf = LogisticRegression(lr=0.1, num_iters=500, seed=514)
    clf.fit(x_train, y_train)

    # Visualise the training data and the decision boundary
    # w0 * x1 + w1 * x2 + b = 0, solved for x2.
    def split_boundary_func(x1):
        return (-clf.b - clf.w[0] * x1) / clf.w[1]

    xx = np.arange(0.1, 0.6, 0.1)
    cValue = ['g', 'b']
    plt.scatter(x_train[:, 0], x_train[:, 1], c=[cValue[i] for i in y_train], marker='o')
    plt.plot(xx, split_boundary_func(xx), c='red')
    plt.show()

    # Accuracy and loss on the test set.
    y_test_pred = clf.predict(x_test)
    y_test_pred_proba = clf.predict_proba(x_test)
    print(clf.score(y_test, y_test_pred))
    print(clf.loss(y_test, y_test_pred_proba))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
本人在代码方面还是有所欠缺,对numpy、matplotlib的使用不熟悉。感谢王同学提供的代码。