# -*- coding: utf-8 -*-
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
def mul():
    """Train and evaluate a multinomial naive Bayes classifier on 20 Newsgroups.

    Loads the complete 20 Newsgroups corpus, holds out 25% of the documents
    for testing, converts the text to TF-IDF features (vocabulary learned
    from the training split only), fits ``MultinomialNB(alpha=1.0)`` and
    prints the learned vocabulary, the predicted test labels, the test
    accuracy and a per-class classification report.

    The train/test split is not seeded, so the printed numbers vary between
    runs.

    Returns:
        None. All results are written to stdout.
    """
    # Load all 20 newsgroup categories (train and test subsets combined).
    news = fetch_20newsgroups(subset='all')

    # Hold out a quarter of the documents for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        news.data, news.target, test_size=0.25)

    # TF-IDF weights each term by how distinctive it is for a document.
    tf = TfidfVectorizer()
    # Learn the vocabulary and IDF weights from the training documents only.
    x_train = tf.fit_transform(x_train)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations. tolist() keeps the
    # printed value a plain list of str, as the old method returned.
    if hasattr(tf, "get_feature_names_out"):
        feature_names = tf.get_feature_names_out().tolist()
    else:
        feature_names = tf.get_feature_names()
    print(feature_names)

    # Reuse the training vocabulary so both matrices share the same columns.
    x_test = tf.transform(x_test)

    # alpha is the additive smoothing parameter of the naive Bayes model.
    mu = MultinomialNB(alpha=1.0)
    mu.fit(x_train, y_train)

    y_predict = mu.predict(x_test)
    print("预测文章的类别", y_predict)
    print("准确率", mu.score(x_test, y_test))
    print(classification_report(y_test, y_predict,
                                target_names=news.target_names))
    return None
def desc(source="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"):
    """Train and evaluate a decision tree on the Titanic passenger data.

    Reads the CSV at ``source``, keeps the ``pclass``, ``age`` and ``sex``
    columns as features and ``survived`` as the target, replaces missing
    ages with the mean age, holds out 25% of the rows for testing,
    vectorizes the feature records with ``DictVectorizer`` and fits a
    ``DecisionTreeClassifier`` with default settings. The raw feature
    frame, the vectorized feature names, the training matrix and the test
    accuracy are printed.

    The train/test split is not seeded, so the printed accuracy varies
    between runs.

    Args:
        source: Path or URL passed to ``pandas.read_csv``. Defaults to the
            location this script has always used.
            NOTE(review): confirm the default URL is still being served.

    Returns:
        None. All results are written to stdout.
    """
    taitan = pd.read_csv(source)

    # Take an explicit copy: filling values on a slice of ``taitan`` raises
    # SettingWithCopyWarning and, under pandas copy-on-write, would leave
    # the missing ages in place.
    x = taitan[['pclass', 'age', 'sex']].copy()
    y = taitan['survived']
    print(x)

    # Impute missing ages with the column mean (assignment, not inplace, so
    # the result is guaranteed to land in ``x``).
    x['age'] = x['age'].fillna(x['age'].mean())

    # Hold out a quarter of the rows for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # DictVectorizer consumes one dict per row and produces a dense matrix
    # (sparse=False); the mapping is learned from the training rows only.
    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(x_train.to_dict(orient="records"))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations. tolist() keeps the
    # printed value a plain list of str, as the old method returned.
    if hasattr(dv, "get_feature_names_out"):
        feature_names = dv.get_feature_names_out().tolist()
    else:
        feature_names = dv.get_feature_names()
    print(feature_names)

    # Reuse the fitted mapping so test columns line up with training columns.
    x_test = dv.transform(x_test.to_dict(orient="records"))
    print(x_train)

    dc = DecisionTreeClassifier()
    dc.fit(x_train, y_train)
    print("预测准确率", dc.score(x_test, y_test))
    return None
# Script entry point: when executed directly, run only the Titanic
# decision-tree demo. Nothing runs when the module is imported.
if __name__ == "__main__":
    desc()