The task: given data about the passengers on the Titanic, predict the probability that each of them survived. A training set is provided; the test set has the same format, except that the Survived column has to be predicted by us.
This is just a practice problem, a chance to exercise the machine learning algorithms covered earlier. I mainly used the logistic regression material and methods from Andrew Ng's course; the data processing is done with pandas.
Main steps:
1. Data preparation: cleaning and conversion, done in Python with pandas and NumPy
2. Logistic regression analysis and prediction, done in Octave
The code is as follows.
1. Data preparation
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd


def trainDataPrd():
    df = pd.read_csv('train.csv')
    # Count missing values per column (False=0, True=1, so the sum is the null count)
    print(df.isnull().sum())
    # Encode Sex as a numeric Gender column
    df['Gender'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
    # Fill missing ages with the mean age of the passenger's (Gender, Pclass) group
    ageArr = np.zeros((2, 3))
    for i in range(2):
        for j in range(3):
            ageArr[i, j] = df[(df['Gender'] == i) & (df['Pclass'] == j + 1)]['Age'].dropna().mean()
            # Only fill rows whose Age is null, so existing ages are not overwritten
            df.loc[(df['Gender'] == i) & (df['Pclass'] == j + 1) & (df['Age'].isnull()), 'Age'] = ageArr[i, j]
    # Merge the two relative counts into a single feature
    df['Relative'] = (df['SibSp'] + df['Parch']).astype(int)
    # Drop unused columns; drop() returns a new frame, so the result must be reassigned
    df = df.drop(['Name', 'Ticket', 'Cabin', 'Embarked', 'SibSp', 'Parch'], axis=1)
    # Write out the training samples
    of = df[['Pclass', 'Age', 'Fare', 'Gender', 'Relative', 'Survived']]
    of.to_csv('pytrain.csv', index=False,
              header=['Pclass', 'Age', 'Fare', 'Gender', 'Relative', 'Survived'])


def testDataPrd():
    df = pd.read_csv('test.csv')
    # Count missing values per column
    print(df.isnull().sum())
    # Encode Sex as a numeric Gender column
    df['Gender'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
    # Fill missing ages with the mean age of the passenger's (Gender, Pclass) group
    ageArr = np.zeros((2, 3))
    for i in range(2):
        for j in range(3):
            ageArr[i, j] = df[(df['Gender'] == i) & (df['Pclass'] == j + 1)]['Age'].dropna().mean()
            df.loc[(df['Gender'] == i) & (df['Pclass'] == j + 1) & (df['Age'].isnull()), 'Age'] = ageArr[i, j]
    # Merge the two relative counts into a single feature
    df['Relative'] = (df['SibSp'] + df['Parch']).astype(int)
    # The test set has a missing Fare; fill it with a rough typical value
    df['Fare'] = df['Fare'].fillna(8.0)
    # Drop unused columns; drop() returns a new frame, so the result must be reassigned
    df = df.drop(['Name', 'Ticket', 'Cabin', 'Embarked', 'SibSp', 'Parch'], axis=1)
    # Write out the test samples
    of = df[['Pclass', 'Age', 'Fare', 'Gender', 'Relative']]
    of.to_csv('pytest.csv', index=False,
              header=['Pclass', 'Age', 'Fare', 'Gender', 'Relative'])


if __name__ == '__main__':
    trainDataPrd()
    testDataPrd()
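As an aside, the nested loops above can be collapsed into a single pandas groupby, which is what the comment about grouping by gender and class hints at. A minimal sketch of that alternative, assuming the same column layout (this one-liner is not part of the original script):

import pandas as pd

df = pd.read_csv('train.csv')
df['Gender'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
# Fill each missing Age with the mean Age of its (Gender, Pclass) group
df['Age'] = df.groupby(['Gender', 'Pclass'])['Age'].transform(lambda s: s.fillna(s.mean()))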
2. Logistic regression classification and prediction in Octave
% Logistic regression

%% Initialization
clear; close all; clc

% Load the training data
% (pytrain.txt is pytrain.csv with the header line stripped,
%  since load() cannot parse a header row)
data = load('pytrain.txt');
cols = size(data, 2);
X = data(:, 2:cols-1);
y = data(:, cols);

fprintf('\nProgram paused. Press enter to continue.\n');
pause;

%% ============ Part 2: Compute Cost and Gradient ============
[m, n] = size(X);

% Add a column of ones for the intercept term
X = [ones(m, 1) X];

% Initialize fitting parameters
initial_theta = zeros(n + 1, 1);

% Compute and display initial cost and gradient
[cost, grad] = costFunction(initial_theta, X, y);

fprintf('Cost at initial theta (zeros): %f\n', cost);
fprintf('Gradient at initial theta (zeros): \n');
fprintf(' %f \n', grad);

fprintf('\nProgram paused. Press enter to continue.\n');
pause;

%% ============= Part 3: Optimizing using fminunc =============
% Use the built-in optimizer fminunc rather than hand-tuning a
% learning rate for plain gradient descent, which I'd probably
% do badly.

% Set options for fminunc
options = optimset('GradObj', 'on', 'MaxIter', 400);

% Run fminunc to obtain the optimal theta and its cost
[theta, cost] = ...
    fminunc(@(t)(costFunction(t, X, y)), initial_theta, options);

% Print theta to screen
fprintf('Cost at theta found by fminunc: %f\n', cost);
fprintf('theta: \n');
fprintf(' %f \n', theta);

%% ============== Part 4: Predict and Accuracies ==============
% Accuracy on the training set
p = predict(theta, X);
fprintf('Train Accuracy: %f\n', mean(double(p == y)) * 100);

fprintf('\nProgram paused. Press enter to continue.\n');
pause;

% Predict the unseen test data; keep the same feature columns
% as in training (drop the first column) and add the intercept
data = load('pytest.txt');
[m, n] = size(data);
Xtest = [ones(m, 1) data(:, 2:end)];
p = predict(theta, Xtest);
save 'predict.txt' p;
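costFunction and predict above come from the Coursera assignment scaffolding and are not reproduced here. For readers who would rather stay in one language, the same fit can be sketched in Python with scipy.optimize standing in for fminunc; this is a hedged equivalent under my assumptions about the CSV layout produced in step 1, not the original Octave run:

import numpy as np
import pandas as pd
from scipy.optimize import minimize

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def cost(theta, X, y):
    # Negative log-likelihood of logistic regression (what costFunction computes)
    h = np.clip(sigmoid(X @ theta), 1e-12, 1 - 1e-12)
    return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

train = pd.read_csv('pytrain.csv').values
X = np.c_[np.ones(len(train)), train[:, 1:-1]]  # intercept + Age, Fare, Gender, Relative
y = train[:, -1]

# BFGS plays the role of Octave's fminunc here
theta = minimize(cost, np.zeros(X.shape[1]), args=(X, y), method='BFGS').x

p = (sigmoid(X @ theta) >= 0.5).astype(int)
print('Train accuracy: %.2f%%' % (100 * np.mean(p == y)))

test = pd.read_csv('pytest.csv').values
Xtest = np.c_[np.ones(len(test)), test[:, 1:]]  # same feature columns as training
np.savetxt('predict.txt', (sigmoid(Xtest @ theta) >= 0.5).astype(int), fmt='%d')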
The resulting accuracy lands between 70% and 75%. Perhaps I kept too few feature dimensions, or my logistic regression setup just isn't very good; either way, it scores lower than the single-variable classifier from the tutorial that predicts from gender alone...
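For reference, that gender-only baseline is easy to reproduce: the tutorial's rule is simply "predict survival iff the passenger is female". A minimal sketch (the output file name gender_baseline.csv is my own choice):

import pandas as pd

test = pd.read_csv('test.csv')
# The tutorial's one-variable model: women survive, men do not
pred = (test['Sex'] == 'female').astype(int)
pd.DataFrame({'PassengerId': test['PassengerId'],
              'Survived': pred}).to_csv('gender_baseline.csv', index=False)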