1. Dictionary feature extraction
from sklearn.feature_extraction import DictVectorizer
# The sklearn.feature_extraction module extracts features from raw data
# DictVectorizer turns a list of feature-value mappings (dicts) into vectors
def dict_demo():
    data = [{'city': '北京', 'tem': 100}, {'city': '上海', 'tem': 60}, {'city': '深圳', 'tem': 30}]
    # Instantiate a transformer
    transfer1 = DictVectorizer()
    # Call fit_transform()
    data_new1 = transfer1.fit_transform(data)
    # With sparse=True (the default), a sparse matrix is returned
    print("data_new1:\n", data_new1)
    print("type of data_new1:", type(data_new1))
    transfer2 = DictVectorizer(sparse=False)
    data_new2 = transfer2.fit_transform(data)
    # Written out in full, the two results are equivalent; the sparse form saves memory
    print("data_new2:\n", data_new2)
    print("type of data_new2:", type(data_new2))
    print("feature names:\n", transfer1.feature_names_)
    return None
if __name__ == "__main__":
    dict_demo()
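For reference, on the three samples above the dense result should look roughly like the sketch below: the categorical city field is one-hot encoded into one column per value, the numeric tem field is passed through, and DictVectorizer sorts the feature names by default (exact column order and float formatting may differ):
# feature names: ['city=上海', 'city=北京', 'city=深圳', 'tem']
# data_new2:
# [[  0.   1.   0. 100.]
#  [  1.   0.   0.  60.]
#  [  0.   0.   1.  30.]]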
2. Text feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# sklearn.feature_extraction.text provides utilities for building feature vectors from text documents
# CountVectorizer converts a collection of text documents into a matrix of token counts (how often each token appears in each sample)
# TfidfVectorizer converts a collection of raw documents into a matrix of TF-IDF features
import jieba
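As a quick reference for the TF-IDF demo further down: with scikit-learn's defaults (smooth_idf=True, followed by L2 normalization of each row), the weight of term t in document d is computed roughly as
    tfidf(t, d) = tf(t, d) * idf(t),  where  idf(t) = ln((1 + n) / (1 + df(t))) + 1,
with n the number of documents, tf(t, d) the raw count of t in d, and df(t) the number of documents containing t.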
def count_demo():
    data = ["life is short,i like like python", "life is too long,i dislike python"]
    # Instantiate a transformer
    transfer = CountVectorizer()
    # Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    print("feature names:\n", transfer.get_feature_names_out())
    # Convert to a dense (non-sparse) array
    print("data_new:\n", data_new.toarray())
    return None
# Chinese text extraction with manual word segmentation
def count_chinese_demo():
    data = ["大胆 你 在 狗叫 什么", "我 什么 身份 你 什么 地位"]
    transfer = CountVectorizer()
    data_new = transfer.fit_transform(data)
    print("feature names:\n", transfer.get_feature_names_out())
    print("data_new:\n", data_new.toarray())
    return None
# Chinese word segmentation
def cut_words(text):
    text = " ".join(list(jieba.cut(text)))
    # print(text)
    # print(type(text))
    return text
# Chinese text extraction with automatic word segmentation
def count_chinese_demo2():
    # Segment the Chinese text first
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所有每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]
    data_new = []
    for sentence in data:
        data_new.append(cut_words(sentence))
    # print(data_new)
    transfer = CountVectorizer()
    data_final = transfer.fit_transform(data_new)
    print("feature names:\n", transfer.get_feature_names_out())
    print("data_final:\n", data_final.toarray())
    return None
# Text feature extraction with TF-IDF
def tfidf_demo():
    data = ["一种还是一种今天很残酷,明天更残酷,后天很美好,但绝对大部分是死在明天晚上,所有每个人不要放弃今天。",
            "我们看到的从很远星系来的光是在几百万年之前发出的,这样当我们看到宇宙时,我们是在看它的过去。",
            "如果只用一种方式了解某样事物,你就不会真正了解它。了解事物真正含义的秘密取决于如何将其与我们所了解的事物相联系。"]
    data_new = []
    for sentence in data:
        data_new.append(cut_words(sentence))
    # print(data_new)
    transfer = TfidfVectorizer(stop_words=["一种", "所以"])
    data_final = transfer.fit_transform(data_new)
    print("feature names:\n", transfer.get_feature_names_out())
    print("data_final:\n", data_final.toarray())
    return None
if __name__ == "__main__":
    # count_demo()
    # count_chinese_demo()
    # cut_words("今天天气真好")
    # count_chinese_demo2()
    tfidf_demo()
3. Feature preprocessing
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# sklearn.preprocessing is the preprocessing and normalization module, covering scaling, centering, normalization and binarization
# MinMaxScaler scales each feature individually so that it lies within a given range on the training set, e.g. between zero and one (min-max normalization)
# StandardScaler transforms the raw data so that each feature has mean 0 and standard deviation 1 (standardization)
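The two transforms are easy to state per feature: min-max normalization computes x' = (x - min) / (max - min) and then rescales x' into the target range, while standardization computes z = (x - mean) / std, with min, max, mean and std all taken over the training set.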
# Min-max normalization
def minmax_demo():
    # Load the data
    data = pd.read_csv("dating.txt")
    data = data.iloc[:, :3]  # all rows, columns 0, 1 and 2
    print("data:\n", data)
    # Instantiate a transformer
    transfer = MinMaxScaler()  # defaults to the range (0, 1)
    # Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None
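If a range other than (0, 1) is needed, MinMaxScaler accepts a feature_range argument, e.g.:
transfer = MinMaxScaler(feature_range=(2, 3))  # scale each feature into [2, 3]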
# Standardization
def stand_demo():
    data = pd.read_csv("dating.txt")
    data = data.iloc[:, :3]  # first three columns
    print("data:\n", data)
    transfer = StandardScaler()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new)
    return None
if __name__ == "__main__":
    # minmax_demo()
    stand_demo()
4. Dimensionality reduction
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
# VarianceThreshold is a feature selector that removes all low-variance features
from scipy.stats import pearsonr
# Pearson correlation coefficient
from sklearn.decomposition import PCA
# sklearn.decomposition contains matrix decomposition algorithms such as PCA, NMF and ICA; most of them can be used as dimensionality reduction techniques
# PCA is principal component analysis: linear dimensionality reduction that uses the singular value decomposition of the data to project it onto a lower-dimensional space
# Low-variance feature filtering and correlation coefficients
def variance_demo():
    # Load the data
    data = pd.read_csv("factor_returns.csv")
    data = data.iloc[:, 1:-2]  # columns 1 up to (but not including) the second-to-last
    print("data:\n", data)
    # Instantiate a transformer
    transfer = VarianceThreshold(threshold=10)  # the default threshold of 0 only drops zero-variance features; here features with variance <= 10 are dropped
    # Call fit_transform()
    data_new = transfer.fit_transform(data)
    print("data_new:\n", data_new, data_new.shape)
    # There were originally 9 features; 7 remain after filtering
    # Compute the correlation coefficient between two variables
    r1 = pearsonr(data["pe_ratio"], data["pb_ratio"])
    # The first value of r1 is the Pearson correlation coefficient, the second is the p-value; the smaller the p-value, the more significant the correlation
    print("correlation:\n", r1)
    r2 = pearsonr(data["revenue"], data["total_expense"])
    print("correlation:\n", r2)
    return None
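As a quick sanity check on how to read pearsonr's output, two perfectly linearly related variables give a coefficient of 1.0 (a minimal standalone example):
from scipy.stats import pearsonr
r = pearsonr([1, 2, 3, 4], [2, 4, 6, 8])
print(r)  # coefficient 1.0, followed by the p-value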
# Principal component analysis
def pca_demo():
    data = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]
    # Instantiate transformers
    transfer1 = PCA(n_components=2)  # an integer reduces the data to exactly 2 features
    transfer2 = PCA(n_components=0.95)  # a float keeps enough components to retain 95% of the variance
    # Note: with only 3 samples, the centered data has rank at most 2, so at most 2 components are available anyway
    # Call fit_transform()
    data_new1 = transfer1.fit_transform(data)
    data_new2 = transfer2.fit_transform(data)
    print("data_new1:\n", data_new1)
    print("data_new2:\n", data_new2)
    return None
if __name__ == "__main__":
    # variance_demo()
    pca_demo()
5. A first look at the iris dataset
from sklearn.datasets import load_iris
# Loads and returns the iris dataset
from sklearn.model_selection import train_test_split
# Splits arrays or matrices into random train and test subsets
# Using an sklearn dataset
def datasets_demo():
    # Load the dataset
    # iris is a dictionary-like Bunch object
    iris = load_iris()
    print("iris dataset:\n", iris)
    print("dataset description:\n", iris["DESCR"])
    print("feature names:\n", iris["feature_names"])
    print("target names:\n", iris["target_names"])
    # Split the dataset
    # x holds the features, y holds the labels
    # test_size sets the size of the test set, usually as a float fraction
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2)
    print("training features:\n", x_train, x_train.shape)
    print("training labels:\n", y_train, y_train.shape)
    print("test features:\n", x_test, x_test.shape)
    print("test labels:\n", y_test, y_test.shape)
    return None
if __name__ == "__main__":
    datasets_demo()
1. Predicting iris species with KNN
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# sklearn.neighbors implements the k-nearest-neighbors algorithm; KNeighborsClassifier classifies by a vote among the k nearest neighbors
from sklearn.model_selection import GridSearchCV
# GridSearchCV exhaustively searches over the specified parameter values for an estimator
def knn_iris():
    # 1. Load the data
    iris = load_iris()
    # 2. Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2)
    # print("training features:\n", x_train, x_train.shape)
    print("training labels:\n", y_train, y_train.shape)
    # print("test features:\n", x_test, x_test.shape)
    print("test labels:\n", y_test, y_test.shape)
    # 3. Feature engineering: standardization
    transfer = StandardScaler()  # instantiate a transformer
    x_train = transfer.fit_transform(x_train)
    # fit has already computed the mean and standard deviation on the training data; the test set comes from
    # the same population, so it must not be fit again: apply transform only
    x_test = transfer.transform(x_test)
    # 4. KNN estimator
    estimator = KNeighborsClassifier(n_neighbors=3)  # instantiate an estimator
    estimator.fit(x_train, y_train)  # train the model
    # 5. Model evaluation
    y_predict = estimator.predict(x_test)
    # (1) Compare the predictions with the true labels directly
    print("y_predict:\n", y_predict)
    print("predictions equal true labels:\n", y_predict == y_test)
    # (2) Compute the accuracy
    score = estimator.score(x_test, y_test)
    print("accuracy:\n", score)
    return None
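The fit/transform split above can be checked by hand: StandardScaler stores the training statistics in its mean_ and scale_ attributes, and transform simply applies them. A minimal standalone sketch (the numbers are made up for illustration):
import numpy as np
from sklearn.preprocessing import StandardScaler
x_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
x_test = np.array([[2.0, 25.0]])
transfer = StandardScaler().fit(x_train)
manual = (x_test - transfer.mean_) / transfer.scale_
print(np.allclose(manual, transfer.transform(x_test)))  # True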
# Add grid search and cross-validation
def knn_iris_grid():
    # 1. Load the data
    iris = load_iris()
    # 2. Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=6)
    # print("training features:\n", x_train, x_train.shape)
    print("training labels:\n", y_train, y_train.shape)
    # print("test features:\n", x_test, x_test.shape)
    print("test labels:\n", y_test, y_test.shape)
    # 3. Feature engineering: standardization
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    # As above, the scaler is already fit on the training data, so only transform the test set
    x_test = transfer.transform(x_test)
    # 4. KNN estimator
    estimator = KNeighborsClassifier()
    # Add grid search and cross-validation
    # Candidate parameters
    param_dict = {"n_neighbors": [1, 3, 5, 7, 9, 11]}
    estimator = GridSearchCV(estimator, param_grid=param_dict, cv=10)  # cv=10 means 10-fold cross-validation
    estimator.fit(x_train, y_train)
    # 5. Model evaluation
    y_predict = estimator.predict(x_test)
    # (1) Compare the predictions with the true labels directly
    print("y_predict:\n", y_predict)
    print("predictions equal true labels:\n", y_predict == y_test)
    # (2) Compute the accuracy
    score = estimator.score(x_test, y_test)
    print("accuracy:\n", score)
    print("best parameters:\n", estimator.best_params_)
    print("best cross-validation score:\n", estimator.best_score_)
    print("best estimator:\n", estimator.best_estimator_)
    print("cross-validation results:\n", estimator.cv_results_)
    return None
if __name__ == "__main__":
    # knn_iris()
    knn_iris_grid()
2. Classifying news with naive Bayes
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# MultinomialNB is the naive Bayes classifier for multinomial models, suited to classification with discrete features (e.g. word counts for text classification)
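The alpha used below is the Laplace smoothing term. Roughly, MultinomialNB estimates the smoothed probability of word i in class c as
    P(w_i | c) = (N_ci + alpha) / (N_c + alpha * n),
where N_ci is the count of word i in documents of class c, N_c the total word count in class c, and n the vocabulary size; alpha > 0 prevents zero probabilities for words unseen in a class.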
def nb_news():
    # 1) Load the data
    news = fetch_20newsgroups(subset="all")
    # 2) Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target)
    # 3) Feature engineering:
    # text feature extraction with TF-IDF
    transfer = TfidfVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4) Naive Bayes estimator
    estimator = MultinomialNB(alpha=1)  # alpha defaults to 1 (the Laplace smoothing term)
    estimator.fit(x_train, y_train)
    # 5) Model evaluation
    y_predict = estimator.predict(x_test)
    # (1) Compare the predictions with the true labels directly
    print("y_predict:\n", y_predict)
    print("predictions equal true labels:\n", y_predict == y_test)
    # (2) Compute the accuracy
    score = estimator.score(x_test, y_test)
    print("accuracy:\n", score)
    return None
if __name__ == "__main__":
    nb_news()
3. Classifying iris with a decision tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# sklearn.tree contains decision-tree-based models for classification and regression
# DecisionTreeClassifier is the decision tree classifier
# export_graphviz exports a decision tree in DOT format
def decision_iris():
    # 1) Load the data
    iris = load_iris()
    # 2) Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2)
    # 3) Decision tree estimator
    estimator = DecisionTreeClassifier(criterion="entropy")  # split nodes by information gain
    estimator.fit(x_train, y_train)
    # 4) Model evaluation
    y_predict = estimator.predict(x_test)
    # (1) Compare the predictions with the true labels directly
    print("y_predict:\n", y_predict)
    print("predictions equal true labels:\n", y_predict == y_test)
    # (2) Compute the accuracy
    score = estimator.score(x_test, y_test)
    print("accuracy:\n", score)
    # Visualize the decision tree
    export_graphviz(estimator, out_file="iris_decision_tree.dot", feature_names=iris.feature_names)
    return None
if __name__ == "__main__":
    decision_iris()
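The exported .dot file can be rendered to an image with the Graphviz command-line tool (assuming Graphviz is installed), e.g.:
dot -Tpng iris_decision_tree.dot -o iris_decision_tree.png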
1. Boston house price prediction
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
# sklearn.linear_model implements a variety of linear models
# LinearRegression is ordinary least squares linear regression
# SGDRegressor fits a linear model by minimizing a regularized empirical loss with stochastic gradient descent
# Ridge is linear least squares with L2 regularization (ridge regression)
from sklearn.metrics import mean_squared_error
# sklearn.metrics includes scoring functions, performance metrics, and pairwise metrics and distance computations
# mean_squared_error is the mean squared error regression loss
import joblib  # saving and loading models
# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this code needs an older version
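For reference, ordinary least squares minimizes ||y - Xw||^2 (solvable in closed form via the normal equation w = (X^T X)^(-1) X^T y), ridge regression minimizes ||y - Xw||^2 + alpha * ||w||^2, and the metric reported below is MSE = (1/n) * sum_i (y_i - y_hat_i)^2.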
# Predicting Boston house prices with the normal equation
def Linear1():
    # 1) Load the dataset
    boston = load_boston()
    print("data shape:\n", boston.data.shape)
    # 2) Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=22)
    # 3) Feature engineering:
    # remove scale differences by standardizing
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4) Estimator flow
    # fit() produces the model; coef_ and intercept_ hold the learned parameters
    estimator = LinearRegression()
    estimator.fit(x_train, y_train)
    print("normal equation coefficients:\n", estimator.coef_)
    print("normal equation intercept:\n", estimator.intercept_)
    # 5) Model evaluation
    y_predict = estimator.predict(x_test)
    print("predicted prices:\n", y_predict)
    error = mean_squared_error(y_test, y_predict)
    print("normal equation MSE:\n", error)
    return None
# Predicting Boston house prices with gradient descent
def Linear2():
    # 1) Load the dataset
    boston = load_boston()
    # 2) Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=22)
    # 3) Feature engineering:
    # remove scale differences by standardizing
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4) Estimator flow
    # fit() produces the model; coef_ and intercept_ hold the learned parameters
    estimator = SGDRegressor()
    estimator.fit(x_train, y_train)
    print("gradient descent coefficients:\n", estimator.coef_)
    print("gradient descent intercept:\n", estimator.intercept_)
    # 5) Model evaluation
    y_predict = estimator.predict(x_test)
    print("predicted prices:\n", y_predict)
    error = mean_squared_error(y_test, y_predict)
    print("gradient descent MSE:\n", error)
    return None
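SGDRegressor's behavior can be tuned through its constructor; for example (these particular values are just an illustration, not a recommendation):
estimator = SGDRegressor(learning_rate="constant", eta0=0.01, max_iter=10000, penalty="l2")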
# Predicting Boston house prices with ridge regression
def Linear3():
    # 1) Load the dataset
    boston = load_boston()
    # 2) Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=22)
    # 3) Feature engineering:
    # remove scale differences by standardizing
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4) Estimator flow
    # fit() produces the model; coef_ and intercept_ hold the learned parameters
    estimator = Ridge()
    estimator.fit(x_train, y_train)
    print("ridge regression coefficients:\n", estimator.coef_)
    print("ridge regression intercept:\n", estimator.intercept_)
    # 5) Model evaluation
    y_predict = estimator.predict(x_test)
    print("predicted prices:\n", y_predict)
    error = mean_squared_error(y_test, y_predict)
    print("ridge regression MSE:\n", error)
    # Save the model
    joblib.dump(estimator, "my_ridge.pkl")
    return None
# Loading the saved ridge regression model
def Linear4():
    # 1) Load the dataset
    boston = load_boston()
    # 2) Split the dataset (random_state=22 reproduces the split used when the model was trained)
    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=22)
    # 3) Feature engineering: standardization
    # Refitting on the identical training split reproduces the same statistics; in practice the
    # fitted scaler should be saved alongside the model
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # Load the model
    estimator = joblib.load("my_ridge.pkl")
    # 4) Model evaluation
    y_predict = estimator.predict(x_test)
    print("predicted prices:\n", y_predict)
    error = mean_squared_error(y_test, y_predict)
    print("ridge regression MSE:\n", error)
    return None
if __name__ == "__main__":
    # Linear1()
    # Linear2()
    # Linear3()
    Linear4()