第1关:使用scikit-learn导入数据集
from sklearn import datasets
def getIrisData():
    '''
    Load the Iris dataset and return a small preview of it.

    Returns:
        X - the first 5 rows of the feature data
        y - the class labels of those first 5 rows
        X_shape - the shape of the complete feature array
    '''
    bunch = datasets.load_iris()
    features, labels = bunch.data, bunch.target
    # Slice for the preview, but report the shape of the full feature array.
    return features[:5], labels[:5], features.shape
第2关:数据预处理 — 标准化
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
'''
Data description:
The data contains 20,640 observations on 9 variables.
This dataset contains the average house value as target variable
and the following input variables (features): average income,
housing average age, average rooms, average bedrooms, population,
average occupation, latitude, and longitude in that order.
dataset : dict-like object with the following attributes:
dataset.data : ndarray, shape [20640, 8]
Each row corresponding to the 8 feature values in order.
dataset.target : numpy array of shape (20640,)
Each value corresponds to the average house value in units of 100,000.
dataset.feature_names : array of length 8
Array of ordered feature names used in the dataset.
dataset.DESCR : string
Description of the California housing dataset.
'''
# Load the California housing data, using ./step4/ as the scikit-learn data
# directory. data_home is passed by keyword because the parameters of
# fetch_california_housing are keyword-only in current scikit-learn releases;
# the keyword form also works on older releases.
dataset = fetch_california_housing(data_home="./step4/")
X_full, y = dataset.data, dataset.target
# Keep only two feature columns: index 0 (average income) and index 5
# (average occupation), following the feature order described above.
X = X_full[:, [0, 5]]
def getMinMaxScalerValue():
    '''
    Apply MinMaxScaler to the module-level feature data X.

    Returns:
        X_first5 - the first 5 rows of the transformed data
    '''
    scaler = MinMaxScaler()
    rescaled = scaler.fit_transform(X)
    return rescaled[:5]
def getScaleValue():
    '''
    Standardize the module-level target data y with sklearn's scale().

    Returns:
        y_first5 - the first 5 values of the transformed target
    '''
    standardized = scale(y)
    return standardized[:5]
def getStandardScalerValue():
    '''
    Fit a StandardScaler on the module-level feature data X.

    Returns:
        X_mean - the fitted scaler's mean_ attribute
        X_scale - the fitted scaler's scale_ attribute
    '''
    # Named "fitted" rather than "scale" so the imported scale() function is
    # not shadowed inside this function.
    fitted = StandardScaler().fit(X)
    return fitted.mean_, fitted.scale_
**第3关:文本数据特征提取**
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Newsgroup categories to load.
categories = [
    'alt.atheism',
    'talk.religion.misc',
]
# Load the training split for the categories above, using ./step5/ as the
# scikit-learn data directory (the original author's note says this yields
# 857 documents). data_home is passed by keyword because the parameters of
# fetch_20newsgroups are keyword-only in current scikit-learn releases; the
# keyword form also works on older releases.
data = fetch_20newsgroups(data_home="./step5/", subset='train', categories=categories)
X = data.data
def transfer2CountVector():
    '''
    Fit a CountVectorizer on the module-level documents X and tokenize a
    sample sentence with the vectorizer's analyzer.

    Returns:
        vocab_len - scalar, number of entries in the fitted vocabulary
        tokenizer_list - tokens produced by the analyzer for test_str
    '''
    test_str = "what's your favorite programming language?"
    counter = CountVectorizer().fit(X)
    # The analyzer bundles the vectorizer's preprocessing and tokenization.
    analyzer = counter.build_analyzer()
    return len(counter.vocabulary_), analyzer(test_str)
def transfer2TfidfVector():
    '''
    Fit a TfidfVectorizer on the module-level documents X and apply the
    fitted vectorizer to a new test document.

    TfidfVectorizer settings:
        min_df = 2, stop_words = "english"

    Returns:
        transfer_test_data - 2-D ndarray, the TF-IDF vectors of test_data
    '''
    # The literal below was split across two physical lines in the source,
    # which is an unterminated-string syntax error; the two halves are
    # rejoined here with no characters added or removed.
    test_data = ['Once again, to not believe in God is different than saying\n>I BELIEVE that God does not exist. I still maintain the position, even\n>after reading the FAQs, that strong atheism requires faith.\n>\n \nNo it in the way it is usually used. In my view, you are saying here that\ndriving a car requires faith that the car drives.\n \nFor me it is a conclusion, and I have no more faith in it than I have in the\npremises and the argument used.\n \n \n>But first let me say the following.\n>We might have a language problem here - in regards to "faith" and\n>"existence". I, as a Christian, maintain that God does not exist.\n>To exist means to have being in space and time. God does not HAVE\n>being - God IS Being. Kierkegaard once said that God does not\n>exist, He is eternal. With this said, I feel it\'s rather pointless\n>to debate the so called "existence" of God - and that is not what\n>I\'m doing here. I believe that God is the source and ground of\n>being. When you say that "god does not exist", I also accept this\n>statement - but we obviously mean two different things by it. However,\n>in what follows I will use the phrase "the existence of God" in it\'s\n>\'usual sense\' - and this is the sense that I think you are using it.\n>I would like a clarification upon what you mean by "the existence of\n>God".\n>\n \nNo, that\'s a word game. The term god is used in a different way usually.\nWhen you use a different definition it is your thing, but until it is\ncommonly accepted you would have to say the way I define god is ... and\nthat does not exist, it is existence itself, so I say it does not exist.\n \nInterestingly, there are those who say that "existence exists" is one of\nthe indubitable statements possible.\n \nFurther, saying god is existence is either a waste of time, existence is\nalready used and there is no need to replace it by god, or you are implying\nmore with it, in which case your definition and your argument so far\nare incomplete, making it a fallacy.\n \n \n(Deletion)\n>One can never prove that God does or does not exist. When you say\n>that you believe God does not exist, and that this is an opinion\n>"based upon observation", I will have to ask "what observtions are\n>you refering to?" There are NO observations - pro or con - that\n>are valid here in establishing a POSITIVE belief.\n(Deletion)\n \nWhere does that follow? Aren\'t observations based on the assumption\nthat something exists?\n \nAnd wouldn\'t you say there is a level of definition that the assumption\n"god is" is meaningful. If not, I would reject that concept anyway.\n \nSo, where is your evidence for that "god is" is meaningful at some level?\n Benedikt\n']
    tfidf_vectorizer = TfidfVectorizer(min_df=2, stop_words="english")
    # Learn the vocabulary and IDF weights from the training documents only,
    # then reuse the fitted vectorizer on the unseen test document.
    tfidf_vectorizer.fit(X)
    return tfidf_vectorizer.transform(test_data).toarray()
第4关:使用scikit-learn分类器SVM对digits数据分类
import matplotlib.pyplot as plt
# 导入数据集,分类器相关包
from sklearn import datasets, svm, metrics
# Load the digits dataset.
digits = datasets.load_digits()
data = digits.data
n_samples = len(data)
# The first half of the samples is the training set and the remaining
# samples are the test set.
train_data = data[:n_samples // 2]
train_target = digits.target[:n_samples // 2]
test_data = data[n_samples // 2:]
test_target = digits.target[n_samples // 2:]
def createModelandPredict():
    '''
    Train an SVC classifier on the training half of the digits data and
    classify the test half.

    Returns:
        predicted - predicted class for each sample in test_data
    '''
    model = svm.SVC()
    model.fit(train_data, train_target)
    return model.predict(test_data)
第5关:模型持久化
# 导入数据集,分类器相关包
from sklearn import datasets, svm, metrics
import pickle
# Load the digits dataset.
digits = datasets.load_digits()
data = digits.data
n_samples = len(data)
# The first half of the samples is the training set and the remaining
# samples are the test set.
train_data = data[:n_samples // 2]
train_target = digits.target[:n_samples // 2]
test_data = data[n_samples // 2:]
test_target = digits.target[n_samples // 2:]
def createModel():
    '''
    Train an SVC classifier on the module-level training data.

    Returns:
        classifier - the fitted svm.SVC instance
    '''
    model = svm.SVC()
    model.fit(train_data, train_target)
    return model
# Path of the file that dumpModel() writes the pickled classifier to and that
# loadModel() reads it back from.
local_file = 'dumpfile'
def dumpModel():
    '''
    Train a classifier with createModel() and store it in local_file
    using pickle.
    '''
    clf = createModel()
    # The context manager closes (and therefore flushes) the file even if
    # pickling fails, so a later loadModel() never reads a partial dump.
    with open(local_file, 'wb') as f_model:
        pickle.dump(clf, f_model)
def loadModel():
    '''
    Load the pickled classifier from local_file and use it to classify the
    module-level test data.

    Returns:
        predicted - the model's predictions for test_data
    '''
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # program wrote itself (see dumpModel), never untrusted input.
    with open(local_file, 'rb') as model_file:
        classifier = pickle.load(model_file)
    return classifier.predict(test_data)
第6关:模型评估-量化预测效果
from sklearn.metrics import accuracy_score,precision_score,f1_score,precision_recall_fscore_support
from sklearn.svm import LinearSVC,SVC
def bin_evaluation(X_train, y_train, X_test, y_test):
    '''
    Train a LinearSVC and evaluate it as a binary classifier.

    :param X_train: training feature data
    :param y_train: training class labels
    :param X_test: test feature data
    :param y_test: true class labels of the test data
    :return:
        correct_num - number of correctly classified test samples
        prec - precision of the positive class (label 1)
        recall - recall of the positive class
        fscore - F-score of the positive class
    '''
    model = LinearSVC()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # normalize=False makes accuracy_score return a count, not a fraction.
    correct_num = accuracy_score(y_test, predictions, normalize=False)
    scores = precision_recall_fscore_support(
        y_test, predictions, average="binary", pos_label=1
    )
    # The fourth element (support) is not part of this function's result.
    prec, recall, fscore = scores[:3]
    return correct_num, prec, recall, fscore
def multi_evaluation(X_train, y_train, X_test, y_test):
    '''
    Train a linear-kernel SVC and evaluate it as a multi-class classifier.

    :param X_train: training feature data
    :param y_train: training class labels
    :param X_test: test feature data
    :param y_test: true class labels of the test data
    :return:
        acc - accuracy of the model on the test data
        prec - macro-averaged precision
        f_score - macro-averaged F-score
    '''
    model = SVC(kernel='linear')
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    # Recall and support are computed by the helper but are not returned.
    prec, _recall, f_score, _support = precision_recall_fscore_support(
        y_test, y_hat, average='macro'
    )
    acc = accuracy_score(y_test, y_hat)
    return acc, prec, f_score