# 泰坦尼克号生死预测(决策树)

# -*- coding: utf-8 -*-

from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from  sklearn.feature_extraction.text import TfidfVectorizer
from  sklearn.metrics import classification_report
import pandas as pd
from sklearn.feature_extraction  import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
def mul():
    """Train and evaluate a multinomial naive Bayes classifier on 20 Newsgroups.

    Loads the complete 20 Newsgroups corpus with ``fetch_20newsgroups``,
    holds out 25 % of the documents for testing, converts the text to TF-IDF
    features, fits ``MultinomialNB(alpha=1.0)`` and prints the vocabulary,
    the predicted labels, the accuracy on the held-out split and a
    per-class classification report.

    Returns:
        None. All results are written to standard output.
    """
    # Load the 20-category news data set.
    news = fetch_20newsgroups(subset='all')

    # Split into training and test portions (the split is random on every run).
    x_train, x_test, y_train, y_test = train_test_split(
        news.data, news.target, test_size=0.25
    )

    # TF-IDF weighting: the vocabulary is learned from the training documents
    # only and then reused unchanged for the test documents.
    tf = TfidfVectorizer()
    x_train = tf.fit_transform(x_train)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when present and fall back on older releases.
    # .tolist() keeps the printed output a plain list of strings, as before.
    if hasattr(tf, "get_feature_names_out"):
        feature_names = tf.get_feature_names_out().tolist()
    else:
        feature_names = tf.get_feature_names()
    print(feature_names)

    x_test = tf.transform(x_test)

    mu = MultinomialNB(alpha=1.0)
    mu.fit(x_train, y_train)
    y_predict = mu.predict(x_test)

    print("预测文章的类别", y_predict)
    print("准确率", mu.score(x_test, y_test))
    print(classification_report(y_test, y_predict, target_names=news.target_names))

def desc():
    """Train a decision tree on the Titanic passenger data and print its accuracy.

    Reads the Titanic table from a remote URL, uses the ``pclass``, ``age``
    and ``sex`` columns as features and ``survived`` as the target, replaces
    missing ages with the mean age, vectorizes the rows with
    ``DictVectorizer``, fits a ``DecisionTreeClassifier`` on a random 75 %
    split and prints the accuracy on the remaining 25 %.

    Returns:
        None. All results are written to standard output.
    """
    taitan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")

    # Work on an explicit copy so the imputation below modifies this frame
    # unambiguously instead of a possible view of ``taitan``.
    x = taitan[['pclass', 'age', 'sex']].copy()
    y = taitan['survived']
    print(x)

    # Missing-value handling. Assign the result back: a chained
    # ``x['age'].fillna(..., inplace=True)`` raises SettingWithCopyWarning and
    # has no effect on ``x`` when pandas copy-on-write is active.
    # NOTE: the mean is taken over all rows, including those that later land
    # in the test split.
    x['age'] = x['age'].fillna(x['age'].mean())

    # Split into training and test portions (the split is random on every run).
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature extraction: each row becomes a dict, which DictVectorizer turns
    # into a numeric vector (string-valued fields become one-hot columns).
    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(x_train.to_dict(orient="records"))

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when present and fall back on older releases.
    if hasattr(dv, "get_feature_names_out"):
        feature_names = dv.get_feature_names_out().tolist()
    else:
        feature_names = dv.get_feature_names()
    print(feature_names)

    x_test = dv.transform(x_test.to_dict(orient="records"))
    print(x_train)

    dc = DecisionTreeClassifier()
    dc.fit(x_train, y_train)
    print("预测准确率", dc.score(x_test, y_test))

# Script entry point: when executed directly, run only the Titanic
# decision-tree demo (the newsgroups demo ``mul`` is not invoked).
if __name__ == "__main__":
    desc()

 

# 你可能感兴趣的:(Python)