泰坦尼克号幸存者预测 python 详解

数据集:泰坦尼克号幸存者数据(压缩包中的第一个文件)

# Load the Titanic passenger table from the local CSV file
import pandas as pd
csv_path = r"C:\Users\lhf\PycharmProjects\python+MachineLeaning\泰坦尼克号数据\data.csv"
data = pd.read_csv(csv_path)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() also emits the trailing "None" seen in the
# recorded output.
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 66.2+ KB
None
# Data preparation
# 1. Feature selection: the name and the ticket number are treated as
#    unimportant and Cabin has too many missing values, so drop all three.
unwanted_columns = ["Name", "Cabin", "Ticket"]
# inplace=True modifies `data` itself; axis="columns" removes columns, not rows.
data.drop(labels=unwanted_columns, axis="columns", inplace=True)

# 2. Missing values
#    Age has many gaps: fill them with the mean age
#    (note the square brackets used for column access).
mean_age = data["Age"].mean()
data["Age"] = data["Age"].fillna(mean_age)
#    Embarked is missing only two values: drop those rows.
data = data.dropna(axis=0)  # axis=0 (the default) drops rows that contain a missing value

# 3. Turn the non-numeric columns into numbers; two approaches are shown.
# 3.1 First approach: collect the distinct Embarked values into a list
#     (set-like) and replace each value by its position in that list.
label = data["Embarked"].unique().tolist()
embarked_code = {port: position for position, port in enumerate(label)}
data["Embarked"] = data["Embarked"].map(embarked_code)
# 3.2 Second approach: compare against one value and cast the boolean
#     result to int, giving male = 1 and female = 0.
data["Sex"] = data["Sex"].eq("male").astype("int")

# 4. Separate features and label
# NOTE(review): PassengerId stays in x; it looks like a row identifier --
# confirm that it is really meant to be used as a feature.
is_target = data.columns == "Survived"
x = data.loc[:, ~is_target]  # every column whose name is not "Survived"
y = data.loc[:, is_target]   # only the "Survived" column (kept as a one-column DataFrame)

# Train / test split
from sklearn.model_selection import train_test_split
# test_size=0.3: x is divided 7:3 into Xtrain/Xtest and y 7:3 into Ytrain/Ytest.
# NOTE(review): no random_state is passed, so the split presumably differs
# from run to run -- confirm whether reproducibility is wanted here.
splits = train_test_split(x, y, test_size=0.3)
Xtrain, Xtest, Ytrain, Ytest = splits
# After the split the row labels are out of order, so give every piece a
# fresh 0..n-1 index (the original note cites 0-621).
for piece in splits:
    piece.index = range(len(piece))
# Grid search: an exhaustive (enumeration) technique that tunes several
# hyper-parameters at once by trying every combination in `parameters`.
# The three names imported below are used by this section but were never
# imported, so it failed with NameError; they are imported next to their
# use, like train_test_split.
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

parameters = {"criterion":("gini","entropy")
              ,"splitter":("best","random")
              ,"max_depth":list(range(1,10))
              ,"min_samples_leaf":list(range(1,50,5))
              ,"min_impurity_decrease":list(np.linspace(0,0.5,20))}
clf = DecisionTreeClassifier(random_state= 25)
# GridSearchCV bundles fit, score and cross-validation; cv=10 requests
# 10-fold cross-validation for every parameter combination.
GS = GridSearchCV(clf,parameters,cv=10)
GS = GS.fit(Xtrain,Ytrain)
print(GS.best_params_)  # best combination found among the candidate values above
print(GS.best_score_)   # cross-validated score of that best combination
{'criterion': 'gini', 'max_depth': 5, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'splitter': 'best'}
0.8408360128617364

你可能感兴趣的:(学习)