数据:train.csv test.csv correct_answer.csv
收集到一些美国人的年龄、学历、地区等资料给你,如果是训练集,会在同一个csv中以‘income’告诉你这些人的工资收入是否超过50K。如果是测试集,工资收入会在correct_answer中,以Label形式告诉你是否为“ >50K”。
我们要做的是,通过人的14项基本信息,预测他的工资是否超过50K,从而判断他是Winner or Loser。
在B数据集中我们发现 'sex' 和 ‘income’ 这两列都只有两种结果,直接布尔化处理。
# 'sex' and 'income' each take only two values, so they are encoded as booleans.
# Other comparisons in this file use values with a leading space (' >50K',
# ' Female'), so 'sex' is normalised before comparing; a bare 'male' would
# never match such values.
trainData['sex'] = (trainData['sex'].str.strip().str.lower() == 'male')
trainData['income'] = (trainData['income'] == ' >50K')
B = pd.get_dummies(B)  # one-hot encode the non-numeric (string) columns
trainData = pd.concat([A, B], axis=1)  # join parts A and B column-wise
这样的数据我们还不能直接使用,因为此案例中,训练的列数会比测试的列数多一列:native_country_ Holand-Netherlands。我们要把此列删除,使测试时参数数目对应。
# Remove the dummy column that exists only in the training data so the feature
# count matches the test data, then convert the frame to a NumPy array.
trainData = trainData.drop(columns=['native_country_ Holand-Netherlands']).values
我们使用Logistic Regression(逻辑回归)来处理这个问题。Logistic Regression和Linear Regression的差别主要在于前者的输出范围是0-1,后者可以是任何值。我们可以这样理解:Logistic Regression其实就是在Linear Regression的公式外面再套一个sigmoid函数 $\sigma(z)=\frac{1}{1+e^{-z}}$,使其输出落在0-1之间。具体推导可以看李宏毅的机器学习视频。
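由图右可知,我们先求出u1, u2, E1, E2(再由E1, E2加权相加,得到E)。由于我不会打公式,原文都是直接上图;u与E的公式如下:$\mu_k = \frac{1}{N_k}\sum_{x_i \in C_k} x_i$,$\Sigma_k = \frac{1}{N_k}\sum_{x_i \in C_k}(x_i-\mu_k)(x_i-\mu_k)^T$,$\Sigma = \frac{N_1}{N}\Sigma_1 + \frac{N_2}{N}\Sigma_2$(其中 $k=1,2$,$N=N_1+N_2$)。图中的79是逻辑回归两分类中其中一个分类的样本数目 $N_k$,两个分类各自的u、E以及样本数目是不一样的。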
def g_train(X, Y):
    """Estimate the parameters of the shared-covariance generative model.

    Rows whose label equals 1 form class 1; every other row forms class 2.
    The per-class means and covariances are computed, and the two covariances
    are combined into one matrix weighted by class frequency.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        Y: labels of shape (n_samples,) or (n_samples, 1).

    Returns:
        Tuple ``(u1, u2, E, cnt1, cnt2)``: class means, the shared covariance
        matrix and the number of samples in each class.

    Raises:
        ValueError: if either class has no samples (its mean is undefined).
    """
    X = np.asarray(X, dtype=float)
    labels = np.asarray(Y).reshape(-1)
    num = X.shape[0]

    # Split the rows by class; the published listing lost the ``else``
    # branches, which mixed class 1 rows into the class 2 statistics.
    in_class1 = labels == 1
    X1 = X[in_class1]
    X2 = X[~in_class1]
    cnt1 = X1.shape[0]
    cnt2 = X2.shape[0]
    if cnt1 == 0 or cnt2 == 0:
        raise ValueError('g_train needs at least one sample in each class')

    u1 = X1.sum(axis=0) / cnt1
    u2 = X2.sum(axis=0) / cnt2

    # Sum of outer products of the centred rows, written as one matrix product.
    d1 = X1 - u1
    d2 = X2 - u2
    E1 = np.dot(d1.T, d1) / float(cnt1)
    E2 = np.dot(d2.T, d2) / float(cnt2)

    E = E1 * (float(cnt1) / num) + E2 * (float(cnt2) / num)
    return u1, u2, E, cnt1, cnt2
def g_pridict(X, Y, u1, u2, E, N1, N2):
    """Classify ``X`` with the generative model and print the accuracy.

    The linear decision function ``z = w.x + b`` is built from the class
    means, the shared covariance and the class counts, then passed through
    ``sigmoid`` and rounded to 0/1.

    Args:
        X: 2-D feature array, one row per sample.
        Y: true labels, shape (n_samples,) or (n_samples, 1).
        u1, u2: class means as returned by ``g_train``.
        E: shared covariance matrix.
        N1, N2: number of training samples in each class.

    Returns:
        Array of rounded predictions, one per row of ``X``.
    """
    # The author hit a singular-matrix error here; fall back to the
    # pseudo-inverse when the covariance cannot be inverted.
    try:
        E_inv = inv(E)
    except np.linalg.LinAlgError:
        E_inv = np.linalg.pinv(E)

    w = np.dot((u1 - u2), E_inv)
    b = ((-0.5) * np.dot(np.dot(u1.T, E_inv), u1)
         + (0.5) * np.dot(np.dot(u2.T, E_inv), u2)
         + np.log(float(N1) / N2))
    z = np.dot(w, X.T) + b
    y = np.around(sigmoid(z))

    # Count hits and misses; the published listing lost the ``else`` that
    # separated the two counters.
    total = Y.shape[0]
    cnt1 = int(np.count_nonzero(y == np.reshape(Y, -1)))
    cnt2 = total - cnt1
    print ('[Generative]测试数据共', Y.shape[0], '个,判断正确', cnt1, '个,判断错误', cnt2, '个')
    print ('准确率:', float(cnt1) / Y.shape[0]*100, '%')
    return y
剩下我还使用了mini-batch SGD、Discriminative方式。效果如下:
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import floor, log
import matplotlib.pyplot as plt
import os
import argparse
import pandas as pd
# Directory holding train.csv, test.csv and correct_answer.csv; also receives my_ans_1.csv.
# NOTE(review): this name shadows the builtin dir(); the driver lines further
# down read it, so renaming needs a coordinated change.
dir = 'F:/machine/HW2/data'
def washData(pathData, pathAnswer='Nothing'):
    """Load a CSV, clean it and return standardised features plus labels.

    Steps: attach the labels, drop every row containing the ' ?' placeholder,
    encode 'sex' as 0/1, one-hot encode the remaining non-numeric columns and
    standardise every column to zero mean and unit (sample) standard
    deviation.

    Args:
        pathData: path of the feature CSV. In training mode it must contain
            an 'income' column.
        pathAnswer: path of the answer CSV with a 'label' column (test mode).
            The default 'Nothing' (or ``None``) selects training mode.
            NOTE(review): 'label' is assumed to be numeric 0/1 already --
            confirm against correct_answer.csv.

    Returns:
        Tuple ``(data_x, data_y)`` of NumPy arrays: the feature matrix and a
        single-column int64 label matrix.
    """
    is_test = pathAnswer is not None and pathAnswer != 'Nothing'

    df_x = pd.read_csv(pathData)
    # Merge features and labels first so row removal keeps them aligned.
    if is_test:
        df_ans = pd.read_csv(pathAnswer)
        df_x = pd.concat([df_x, df_ans['label']], axis=1)
        df_x = df_x.rename(columns={'label': 'income'})  # match the training column name
    else:
        df_x['income'] = (df_x['income'] == ' >50K')

    # Rows with the ' ?' placeholder are treated as incomplete and removed.
    df_x = df_x.replace(' ?', np.nan).dropna()

    # Encode sex as 1 (male) / 0 (other). Values are normalised first because
    # the raw strings carry a leading space, so a bare 'male' never matched;
    # the int64 cast keeps the column among the numeric features instead of
    # letting it be dropped as a bool column.
    df_x['sex'] = (df_x['sex'].str.strip().str.lower() == 'male').astype(np.int64)

    # Double brackets keep a DataFrame, so .values yields a 2-D column.
    data_y = df_x[['income']].astype(np.int64)
    df_x = df_x.drop(columns=['income'])

    # Split into numeric and non-numeric columns.
    object_columns = [col for col in df_x.columns
                      if not pd.api.types.is_numeric_dtype(df_x[col])]
    no_object_columns = [col for col in df_x.columns if col not in object_columns]
    object_data = pd.get_dummies(df_x[object_columns])  # one column per category value
    no_object_data = df_x[no_object_columns]

    data_x = pd.concat([no_object_data, object_data], axis=1).astype(np.int64)

    # Standardise; a constant column would give 0/0, so its divisor becomes 1.
    std = data_x.std().replace(0, 1)
    data_x = (data_x - data_x.mean()) / std

    if not is_test:
        # This country appears only in train.csv; drop it so both sets have
        # the same columns. Ignored when the column is absent.
        data_x = data_x.drop(columns=['native_country_ Holand-Netherlands'],
                             errors='ignore')

    # Callers index rows numerically, so return plain arrays.
    return data_x.values, data_y.values
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

    Computed as ``exp(-log(1 + exp(-z)))`` through ``np.logaddexp`` so large
    negative inputs do not overflow in ``exp``.

    Args:
        z: scalar or array-like of real numbers.

    Returns:
        Value(s) in the closed interval [0, 1] with the shape of ``z``.
    """
    z = np.asarray(z, dtype=float)
    return np.exp(-np.logaddexp(0.0, -z))
def sgd_train(X, Y, batchSize=300, eta=0.0001, lambdaL2=0.0):
    """Fit a linear model ``X.w + b`` by one pass of mini-batch SGD.

    Each batch minimises the mean squared error between the raw linear
    output and the 0/1 labels. Only complete batches are used: trailing rows
    that do not fill a batch are skipped.

    Args:
        X: 2-D feature array of shape (n_samples, n_features).
        Y: labels of shape (n_samples,) or (n_samples, 1).
        batchSize: number of rows per update step.
        eta: learning rate.
        lambdaL2: L2 penalty coefficient applied to ``w`` (0.0 disables it).

    Returns:
        Tuple ``(w, b)``: weight vector of shape (n_features,) and scalar bias.
    """
    X = np.asarray(X, dtype=float)
    targets = np.asarray(Y, dtype=float).reshape(-1)
    w = np.zeros(X.shape[1])
    b = 0.0

    usable = X.shape[0] // batchSize * batchSize
    for start in range(0, usable, batchSize):
        batch = X[start:start + batchSize, :]
        y_ = targets[start:start + batchSize]

        # Keep the hypothesis continuous: rounding it here made the error
        # independent of w, so the model could not learn.
        hypo = np.dot(batch, w) + b
        loss = hypo - y_

        # One gradient entry per feature (the earlier version collapsed the
        # whole gradient into a single scalar).
        grad_w = np.dot(batch.T, loss) / batchSize + lambdaL2 * w
        grad_b = np.sum(loss) / batchSize

        w = w - eta * grad_w
        b = b - eta * grad_b
    return w, b
def sgd_predict(X, Y, w, b):
    """Print the accuracy of the linear model ``X.w + b`` against ``Y``.

    A score above 0.5 is read as class 1, anything else as class 0. Plain
    rounding would turn scores such as 3.0 or -2.0 into labels outside
    {0, 1} that can never match.

    Args:
        X: 2-D feature array, one row per sample.
        Y: true labels, shape (n_samples,) or (n_samples, 1).
        w: weight vector of shape (n_features,).
        b: scalar bias.
    """
    scores = np.dot(X, w) + b
    y = (scores > 0.5).astype(float)

    # Count hits and misses; the published listing lost the ``else`` that
    # separated the two counters.
    total = Y.shape[0]
    cnt1 = int(np.count_nonzero(y == np.reshape(Y, -1)))
    cnt2 = total - cnt1
    print ('[mini-batch-SGD]预测数据共', Y.shape[0], '个,判断正确', cnt1, '个,判断错误', cnt2, '个')
    print ('准确率:', float(cnt1) / Y.shape[0] * 100, '%')
def lr_train(X, Y, batchSize=300, eta=0.001, lambdaL2=0.0):
    """Fit logistic regression by one pass of mini-batch gradient descent.

    Only complete batches are used: trailing rows that do not fill a batch
    are skipped.

    Args:
        X: 2-D feature array of shape (n_samples, n_features).
        Y: 0/1 labels of shape (n_samples,) or (n_samples, 1).
        batchSize: number of rows per update step.
        eta: learning rate.
        lambdaL2: L2 penalty coefficient applied to ``w`` (0.0 disables it).

    Returns:
        Tuple ``(w, b)``: weight vector of shape (n_features,) and scalar bias.
    """
    w = np.zeros(X.shape[1])
    b = 0.0
    usable = X.shape[0] // batchSize * batchSize
    for i in range(0, usable, batchSize):
        batch = X[i:i + batchSize, :]
        y_ = np.squeeze(Y[i:i + batchSize])

        y = sigmoid(np.dot(batch, w) + b)
        loss = y - y_  # gradient of the cross-entropy with respect to z

        # The cross-entropy value was computed but never used, and it
        # evaluated log(0) whenever a prediction saturated; only its
        # gradient is needed for the update, so it is no longer computed.

        # NOTE(review): the weight step uses the batch-summed gradient while
        # the bias step uses the batch mean. Kept as in the original so
        # results with the default eta do not change.
        w = w - eta * (np.dot(batch.T, loss) + lambdaL2 * w)
        b = b - eta * (np.sum(loss) / batchSize)
    return w, b
def lr_pridect(X, Y, w, b):
    """Print the accuracy of the logistic-regression model against ``Y``.

    Predictions are ``sigmoid(X.w + b)`` rounded to the nearest integer.

    Args:
        X: 2-D feature array, one row per sample.
        Y: true labels; indexed by row.
        w: weight vector.
        b: scalar bias.
    """
    predicted = np.around(sigmoid(np.dot(X, w) + b))
    total = Y.shape[0]
    # Correct predictions; every remaining sample is a miss.
    cnt1 = sum(1 for idx in range(total) if predicted[idx] == Y[idx])
    cnt2 = total - cnt1
    print ('[Logistic Regression]预测数据共', Y.shape[0], '个,判断正确', cnt1, '个,判断错误', cnt2, '个')
    print ('准确率:', float(cnt1) / Y.shape[0] * 100, '%')
# Load and clean both data sets; washData returns (features, labels) as NumPy arrays.
trainX, trainY = washData(dir+'/train.csv') # author's note from the original run: shapes (30162, 101) and (30162,)
testX, testY = washData(dir+'/test.csv', dir+'/correct_answer.csv') # author's note from the original run: shapes (15060, 101) and (15060,)
# Generative model (closed-form formulas); predictions are saved to my_ans_1.csv
u1, u2, E, N1, N2 = g_train(trainX, trainY)
my_ans = g_pridict(testX, testY, u1, u2, E, N1, N2)
np.savetxt(dir+'/my_ans_1.csv', my_ans)
# Mini-batch SGD; the author notes that this approach gives low accuracy
w, b = sgd_train(trainX, trainY)
sgd_predict(testX, testY, w, b)
# Logistic regression; w and b are reused for the new model
w, b = lr_train(trainX, trainY)
lr_pridect(testX, testY, w, b)
# Directory used for the prediction CSV written by the __main__ section below.
output_dir = "output/"
def dataProcess_X(rawData):
    """Turn the raw frame into a standardised, fully numeric feature frame.

    'sex' becomes the first column, encoded 1 for ' Female' and 0 otherwise.
    Numeric columns follow in their original order, then the one-hot columns
    of every non-numeric column. Each column is standardised with the sample
    standard deviation (pandas default, ddof=1). No rows are removed, so
    placeholder values stay in as ordinary categories.

    Args:
        rawData: DataFrame with a 'sex' column; an 'income' column, when
            present (training data), is excluded from the features.

    Returns:
        DataFrame of float features with the same number of rows.
    """
    # 'sex' is handled separately; 'income' is the label, not a feature.
    excluded = ["sex", "income"] if "income" in rawData.columns else ["sex"]
    Data = rawData.drop(excluded, axis=1)

    # Dtype test that does not rely on strings being stored as "object".
    listObjectColumn = [col for col in Data.columns
                        if not pd.api.types.is_numeric_dtype(Data[col])]
    listNonObjedtColumn = [col for col in Data.columns if col not in listObjectColumn]

    ObjectData = Data[listObjectColumn]
    # Copy before insert() so the column is not added to a slice of Data.
    NonObjectData = Data[listNonObjedtColumn].copy()

    # "int64" replaces np.int, which no longer exists in current NumPy.
    NonObjectData.insert(0, "sex", (rawData["sex"] == " Female").astype("int64"))

    # One new column per distinct value, which is why the column count grows.
    ObjectData = pd.get_dummies(ObjectData)

    Data = pd.concat([NonObjectData, ObjectData], axis=1)
    Data_x = Data.astype("int64")

    # Standardise; a constant column would give 0/0, so its divisor becomes 1.
    std = Data_x.std().replace(0, 1)
    Data_x = (Data_x - Data_x.mean()) / std
    return Data_x
def dataProcess_Y(rawData):
    """Return the 0/1 label frame derived from the 'income' column.

    A row is labelled 1 when its income text, ignoring surrounding
    whitespace, is '>50K', and 0 otherwise.

    Args:
        rawData: DataFrame containing an 'income' column.

    Returns:
        Single-column int64 DataFrame named 'income'.
    """
    # Strip first so the match does not depend on the exact leading space.
    income_text = rawData['income'].astype(str).str.strip()
    return (income_text == '>50K').astype("int64").to_frame(name="income")
def split_valid_set(X, Y, percentage):
    """Shuffle the data and split it into training and validation parts.

    The first ``floor(n_rows * percentage)`` shuffled rows become the
    validation set and the rest the training set.

    Args:
        X: feature array, one row per sample.
        Y: label array aligned with ``X``.
        percentage: fraction of the rows to use for validation.

    Returns:
        Tuple ``(X_train, Y_train, X_valid, Y_valid)``.
    """
    n_valid = int(floor(X.shape[0] * percentage))
    # _shuffle is defined elsewhere; it returns the shuffled X and Y.
    mixed_X, mixed_Y = _shuffle(X, Y)
    return (mixed_X[n_valid:], mixed_Y[n_valid:],
            mixed_X[:n_valid], mixed_Y[:n_valid])
def valid(X, Y, mu1, mu2, shared_sigma, N1, N2):
    """Print the accuracy of the generative model on a validation set.

    Args:
        X: 2-D feature array, one row per sample.
        Y: true labels; squeezed to one dimension before comparison.
        mu1, mu2: class means.
        shared_sigma: shared covariance matrix.
        N1, N2: number of training samples in each class.
    """
    # Fall back to the pseudo-inverse when the covariance is singular.
    try:
        sigma_inv = inv(shared_sigma)
    except np.linalg.LinAlgError:
        sigma_inv = np.linalg.pinv(shared_sigma)

    w = np.dot((mu1 - mu2), sigma_inv)
    b = ((-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1)
         + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2)
         + np.log(float(N1) / N2))

    # a is the linear score; sigmoid maps it into the range 0-1.
    a = np.dot(w, X.T) + b
    y = sigmoid(a)
    y_ = np.around(y)  # round to the nearest class label

    # squeeze() removes the length-1 axis so Y lines up with y_.
    result = (np.squeeze(Y) == y_)
    print('Valid acc = %f' % (float(result.sum()) / result.shape[0]))
def train(X_train, Y_train):
    """Estimate the Gaussian parameters of the generative model.

    Rows whose label equals 1 form class 1; every other row forms class 2.

    Args:
        X_train: 2-D array of shape (n_samples, n_features).
        Y_train: labels of shape (n_samples,) or (n_samples, 1).

    Returns:
        Tuple ``(mu1, mu2, shared_sigma, N1, N2)``: class means, the
        frequency-weighted shared covariance and the class sample counts.

    Raises:
        ValueError: if either class has no samples (its mean is undefined).
    """
    X_train = np.asarray(X_train, dtype=float)
    labels = np.asarray(Y_train).reshape(-1)
    train_data_size = X_train.shape[0]

    # Split the rows by class; the published listing lost the ``else``
    # branches, which mixed class 1 rows into the class 2 statistics.
    in_class1 = labels == 1
    rows1 = X_train[in_class1]
    rows2 = X_train[~in_class1]
    N1 = rows1.shape[0]
    N2 = rows2.shape[0]
    if N1 == 0 or N2 == 0:
        raise ValueError('train needs at least one sample in each class')

    mu1 = rows1.sum(axis=0) / N1
    mu2 = rows2.sum(axis=0) / N2

    # Sum of outer products of the centred rows, written as one matrix product.
    centred1 = rows1 - mu1
    centred2 = rows2 - mu2
    sigma1 = np.dot(centred1.T, centred1) / N1
    sigma2 = np.dot(centred2.T, centred2) / N2

    shared_sigma = ((float(N1) / train_data_size) * sigma1
                    + (float(N2) / train_data_size) * sigma2)
    return mu1, mu2, shared_sigma, N1, N2
if __name__ == "__main__":
    # Shapes in the comments below are the author's notes from the original run.
    trainData = pd.read_csv("F:/machine/HW2/data/train.csv")  # first row is the header; (32561, 15)
    testData = pd.read_csv("F:/machine/HW2/data/test.csv")  # (16281, 14), no label column
    ans = pd.read_csv("F:/machine/HW2/data/correct_answer.csv")  # (16281, 2): id + label

    # The training data has one dummy column the test data lacks (that
    # country never appears in test.csv), so it is dropped from training.
    x_train = dataProcess_X(trainData).drop(['native_country_ Holand-Netherlands'], axis=1).values
    x_test = dataProcess_X(testData).values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans['label'].values  # per the author: 1 when income exceeds 50K, else 0

    # Hold out part of the training data to check the model first.
    vaild_set_percetage = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, vaild_set_percetage)
    mu1, mu2, shared_sigma, N1, N2 = train(X_train, Y_train)
    valid(X_valid, Y_valid, mu1, mu2, shared_sigma, N1, N2)

    # Retrain on the full training set and classify the test set.
    mu1, mu2, shared_sigma, N1, N2 = train(x_train, y_train)
    # Fall back to the pseudo-inverse when the covariance is singular.
    try:
        sigma_inv = inv(shared_sigma)
    except np.linalg.LinAlgError:
        sigma_inv = np.linalg.pinv(shared_sigma)
    w = np.dot((mu1 - mu2), sigma_inv)
    b = ((-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1)
         + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2)
         + np.log(float(N1) / N2))
    a = np.dot(w, x_test.T) + b
    y = sigmoid(a)
    # "int64" replaces np.int, which no longer exists in current NumPy.
    y_ = np.around(y).astype("int64")

    result = (np.squeeze(y_ans) == y_)
    print('Test acc = %f' % (float(result.sum()) / result.shape[0]))

    # Ids run from 1 to the number of predictions.
    df = pd.DataFrame({"id": np.arange(1, len(y_) + 1), "label": y_})
    # Create the output directory when needed and always write the file; the
    # listing as published wrote it only when the directory was missing.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, 'gd_output.csv'), sep='\t', index=False)