机器学习实战【泰坦尼克号沉船预测】(附python代码)

题记:第一个手写的机器学习项目实战选择了经典的kaggle竞赛题目:泰坦尼克号沉船预测。这是一个典型的二分类问题(预测乘客是否生还),希望在此过程中熟练掌握机器学习的基本流程,以及数据预处理与特征选择的思路。同为入门级的经典赛题:阿里云天池大赛的工业蒸汽量预测问题(回归问题)。

Pycharm和jupyter的选择问题:kaggle在线训练平台提供的是类似于jupyter的接口,方便直观地看到数据的流动,但是结构性和逻辑性不足。而Pycharm通过函数的封装能很好地体现逻辑性。个人更倾向于用Jupyter做前期的数据分析工作,后面的算法由Pycharm实现。

Score:0.77751

# -*- coding: utf-8 -*-
"""
@Time : 2023/4/4 10:39
@Auth : zyt
@File :01regression.py
@IDE :PyCharm
@Motto:ABC(Always Be Coding)
"""

'''
问题描述:泰坦尼克号沉船预测事件
'''
import pandas as pd
import sklearn.preprocessing as preprocessing
from sklearn import svm

def pre(data_train):
    """Remove the columns that are not used as features.

    ``Name`` and ``Ticket`` are treated as per-passenger identifiers and are
    dropped from ``data_train`` in place. The same (mutated) DataFrame is
    returned so the call can be chained.
    """
    for column in ('Name', 'Ticket'):
        data_train.drop(column, axis=1, inplace=True)
    return data_train

# Missing-value handling
def set_missing_train(data_train):
    """Handle missing values in the training frame, in place.

    Prints the per-column null counts, then:

    1. fills missing ``Age`` values with the column median,
    2. drops the ``Cabin`` column,
    3. drops every row that still contains a missing value (the original
       comment targets the rows lacking ``Embarked``).

    Returns the same (mutated) DataFrame.
    """
    null_all = data_train.isnull().sum()
    print(null_all)

    # 1. Age: fill with the median. Assign the filled column back instead of
    #    calling fillna(inplace=True) on ``data_train.Age``: that chained
    #    in-place call does not update the frame under pandas Copy-on-Write.
    age_median = data_train['Age'].median()
    data_train['Age'] = data_train['Age'].fillna(age_median)

    # 2. Cabin: drop the whole column.
    data_train.drop('Cabin', axis=1, inplace=True)

    # 3. Drop rows with any remaining missing value. ``thresh`` is left out on
    #    purpose: pandas >= 2.0 raises TypeError when both ``how`` and
    #    ``thresh`` are passed.
    data_train.dropna(axis=0, how='any', inplace=True)

    return data_train

def set_missing_test(data_test):
    """Handle missing values in the test frame, in place.

    Prints the per-column null counts, then:

    1. fills missing ``Age`` values with the column median,
    2. drops the ``Cabin`` column,
    3. fills missing ``Fare`` values with the column median.

    No rows are removed. Returns the same (mutated) DataFrame.
    """
    null_all = data_test.isnull().sum()
    print(null_all)

    # 1 and 3. Median fill. Assign the filled column back instead of calling
    # fillna(inplace=True) on an attribute-accessed column: that chained
    # in-place call does not update the frame under pandas Copy-on-Write.
    for column in ('Age', 'Fare'):
        data_test[column] = data_test[column].fillna(data_test[column].median())

    # 2. Cabin: drop the whole column.
    data_test.drop('Cabin', axis=1, inplace=True)

    return data_test

# Encoding and standardization
def set_norm(data_train):
    """One-hot encode the categorical columns and standardize the numeric ones.

    ``Sex``, ``Embarked`` and ``Pclass`` are replaced by dummy columns named
    ``<column>_<value>``, appended after the remaining columns in that order.
    ``Age`` and ``Fare`` are each rescaled with a ``StandardScaler`` fitted on
    the values of the frame that was passed in.

    The input frame is not modified; a new DataFrame is returned.

    NOTE: the scaler is fitted on whatever frame is passed, so calling this
    separately on the training and the test frame scales them with different
    parameters.
    """
    # Categorical features -> dummy columns. Passing ``columns=`` builds a new
    # frame, so the caller's frame is left untouched (the previous
    # implementation removed ``Sex`` from the caller's frame as a side effect).
    encoded = pd.get_dummies(data_train, columns=['Sex', 'Embarked', 'Pclass'])

    # Numeric features -> zero mean, unit variance. ``fit_transform`` takes only
    # the data; the result is flattened from (n, 1) to (n,) before assignment.
    for column in ('Age', 'Fare'):
        values = encoded[column].values.reshape(-1, 1)
        encoded[column] = preprocessing.StandardScaler().fit_transform(values).ravel()

    return encoded

def predict(data_train,data_test):
    """Fit an RBF-kernel SVM on the training frame and write test predictions.

    The model is trained on a fixed list of feature columns with ``Survived``
    as the target, then used to predict the same columns of ``data_test``.
    The predictions are written, together with ``PassengerId``, to
    ``./data/submission.csv``. Nothing is returned.
    """
    feature_cols = [
        'Age', 'SibSp', 'Parch', 'Fare',
        'Sex_female', 'Sex_male',
        'Pclass_1', 'Pclass_2', 'Pclass_3',
        'Embarked_C', 'Embarked_Q', 'Embarked_S',
    ]
    train_x = data_train[feature_cols]
    train_y = data_train['Survived']
    test_x = data_test[feature_cols]

    classifier = svm.SVC(kernel='rbf', C=3, gamma=0.1)
    classifier.fit(train_x, train_y)

    submission = pd.DataFrame({
        'PassengerId': data_test['PassengerId'],
        'Survived': classifier.predict(test_x),
    })
    submission.to_csv('./data/submission.csv', index=False)
    print("Your submission was successfully saved!")

if __name__ == '__main__':
    # Training data: load, drop unused columns, handle missing values,
    # then encode and standardize.
    data_train = set_norm(set_missing_train(pre(pd.read_csv('./data/train.csv'))))

    # Test data: same steps, using the test-specific missing-value handling.
    data_test = set_norm(set_missing_test(pre(pd.read_csv('./data/test.csv'))))

    # Train the model and write the submission file.
    predict(data_train, data_test)

你可能感兴趣的:(研究生入门,代码精进,python,机器学习)