题记:第一个手写的机器学习项目实战选择了经典的 Kaggle 竞赛题目:泰坦尼克号沉船生还预测。这是一个典型的二分类问题(预测乘客是否生还),希望在此过程中熟练掌握机器学习的基本流程,以及数据预处理与特征选择的思路。同为入门级的经典赛题:阿里云天池大赛的工业蒸汽量预测问题(该题属于回归问题)。
PyCharm 和 Jupyter 的选择问题:Kaggle 在线训练平台提供的是类似于 Jupyter 的界面,可以方便、直观地看到数据的流动,但结构性和逻辑性不足;而 PyCharm 通过函数的封装能很好地体现逻辑性。个人更倾向于用 Jupyter 做前期的数据分析工作,后面的算法用 PyCharm 实现。
Score:0.77751
# -*- coding: utf-8 -*-
"""
@Time : 2023/4/4 10:39
@Auth : zyt
@File :01regression.py
@IDE :PyCharm
@Motto:ABC(Always Be Coding)
"""
'''
问题描述:泰坦尼克号沉船预测事件
'''
import pandas as pd
import sklearn.preprocessing as preprocessing
from sklearn import svm
def pre(data_train):
    """Strip the identifier-like columns from a passenger table.

    The ``Name`` and ``Ticket`` columns are removed in place, one after the
    other.

    Args:
        data_train: DataFrame that contains ``Name`` and ``Ticket`` columns.

    Returns:
        The same DataFrame object that was passed in, without the two columns.

    Raises:
        KeyError: Propagated from ``DataFrame.drop`` when a column is absent.
    """
    # Names and ticket numbers are treated as unique identifiers that carry
    # no predictive signal, so they are simply discarded.
    for column in ('Name', 'Ticket'):
        data_train.drop(columns=column, inplace=True)
    return data_train
#缺失值
def set_missing_train(data_train):
    """Handle missing values in the training table (modifies it in place).

    Steps, in order:
      1. Print the per-column count of missing values.
      2. Fill missing ``Age`` entries with the median of the observed ages.
      3. Drop the ``Cabin`` column.
      4. Drop every row that still contains a missing value in any column
         (the original note says this targets ``Embarked``; rows with gaps in
         other columns are removed as well).

    Args:
        data_train: DataFrame with at least ``Age`` and ``Cabin`` columns.

    Returns:
        The same DataFrame object, cleaned in place. Dropped rows leave gaps
        in the index; it is not reset.

    Raises:
        KeyError: If ``Age`` or ``Cabin`` is not present.
    """
    print(data_train.isnull().sum())

    # 1. Age: fill with the median. Assign the result back to the column
    #    instead of calling fillna(inplace=True) on the attribute-accessed
    #    Series; that chained form does not update the frame under pandas
    #    Copy-on-Write.
    data_train['Age'] = data_train['Age'].fillna(data_train['Age'].median())

    # 2. Cabin: remove the whole column.
    data_train.drop(columns='Cabin', inplace=True)

    # 3. Remove rows that still have any missing value. Only `how` is passed:
    #    pandas 2.x raises TypeError when both `how` and `thresh` are given,
    #    even if `thresh` is None.
    data_train.dropna(axis=0, how='any', inplace=True)
    return data_train
def set_missing_test(data_test):
    """Handle missing values in the test table (modifies it in place).

    Steps, in order:
      1. Print the per-column count of missing values.
      2. Fill missing ``Age`` entries with the median of the observed ages.
      3. Drop the ``Cabin`` column.
      4. Fill missing ``Fare`` entries with the median of the observed fares.

    No rows are removed here, so the row count stays the same as the input.

    Args:
        data_test: DataFrame with at least ``Age``, ``Cabin`` and ``Fare``
            columns.

    Returns:
        The same DataFrame object, cleaned in place.

    Raises:
        KeyError: If ``Age``, ``Cabin`` or ``Fare`` is not present.
    """
    print(data_test.isnull().sum())

    # 1. Age: fill with the median. The result is assigned back to the column
    #    because fillna(inplace=True) on an attribute-accessed Series is a
    #    chained assignment that does not update the frame under pandas
    #    Copy-on-Write.
    data_test['Age'] = data_test['Age'].fillna(data_test['Age'].median())

    # 2. Cabin: remove the whole column.
    data_test.drop(columns='Cabin', inplace=True)

    # 3. Fare: fill with the median, same assignment pattern as Age.
    data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].median())
    return data_test
#标准化
def set_norm(data_train):
    """One-hot encode the categorical columns and standardize Age and Fare.

    ``Sex``, ``Embarked`` and ``Pclass`` are replaced by indicator columns
    named ``<column>_<value>``, appended after the remaining columns in that
    order. ``Age`` and ``Fare`` are then each rescaled with a
    ``StandardScaler`` fitted on the values of the frame passed in.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        data_train: DataFrame containing ``Sex``, ``Embarked``, ``Pclass``,
            ``Age`` and ``Fare`` columns.

    Returns:
        A new DataFrame with the encoded and scaled columns.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    # Feature factorization: a single get_dummies call encodes all three
    # columns and yields the same column names and order as encoding them one
    # at a time, without dropping anything from the caller's frame.
    data_train = pd.get_dummies(data_train, columns=['Sex', 'Embarked', 'Pclass'])

    # Feature standardization: StandardScaler scales each column
    # independently, so one fit_transform over both columns is enough. The
    # `y` argument of fit_transform is ignored by StandardScaler and is
    # therefore not passed.
    numeric_cols = ['Age', 'Fare']
    scaler = preprocessing.StandardScaler()
    data_train[numeric_cols] = scaler.fit_transform(data_train[numeric_cols])
    return data_train
def predict(data_train,data_test):
    """Train an RBF-kernel SVC on the training table and write a submission.

    The classifier is fitted on a fixed list of feature columns with
    ``Survived`` as the target, then applied to the same columns of the test
    table. The predictions are written, together with ``PassengerId``, to
    ``./data/submission.csv`` and a confirmation message is printed.

    Args:
        data_train: Preprocessed training DataFrame holding ``Survived`` and
            every feature column listed below.
        data_test: Preprocessed test DataFrame holding ``PassengerId`` and
            every feature column listed below.

    Returns:
        None.

    Raises:
        KeyError: If a required column is missing from either table.
    """
    feature_cols = [
        'Age', 'SibSp', 'Parch', 'Fare',
        'Sex_female', 'Sex_male',
        'Pclass_1', 'Pclass_2', 'Pclass_3',
        'Embarked_C', 'Embarked_Q', 'Embarked_S',
    ]
    target = data_train['Survived']
    train_matrix = data_train.loc[:, feature_cols]
    test_matrix = data_test.loc[:, feature_cols]

    classifier = svm.SVC(C=3, kernel='rbf', gamma=0.1)
    classifier.fit(train_matrix, target)
    predicted = classifier.predict(test_matrix)

    submission = pd.DataFrame({
        'PassengerId': data_test['PassengerId'],
        'Survived': predicted,
    })
    submission.to_csv('./data/submission.csv', index=False)
    print("Your submission was successfully saved!")
if __name__ == '__main__':
    # Training data: load, drop identifier columns, handle missing values,
    # then encode and scale.
    train_frame = (
        pd.read_csv('./data/train.csv')
        .pipe(pre)
        .pipe(set_missing_train)
        .pipe(set_norm)
    )
    # Test data: same pipeline, using the test-specific missing-value step.
    test_frame = (
        pd.read_csv('./data/test.csv')
        .pipe(pre)
        .pipe(set_missing_test)
        .pipe(set_norm)
    )
    # Fit the model and write the submission file.
    predict(train_frame, test_frame)