数据集 = 特征值 + 目标值
pip3 install Scikit-learn==0.19.1
# 导入鸢尾花
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load the iris dataset first: `iris` was previously referenced without
# ever being created, which raised NameError.
iris = load_iris()
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=22
)
# The array printed here is the training split, so it is labelled as the
# training set (the old label said "test set").
print(f"训练集:{x_train}, 大小:{x_train.shape}")
sklearn.feature_extraction
from sklearn.feature_extraction import DictVectorizer
def dict_demo():
    """Demonstrate dictionary feature extraction with ``DictVectorizer``.

    Vectorizes three city/temperature records and prints the transformed
    data together with the feature names reported by the transformer.

    Returns:
        None. Results are printed to stdout.
    """
    # The key was misspelled 'temperayure', and the typo showed up in the
    # printed feature names.
    data = [
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ]
    # 1. Instantiate the transformer. sparse=False asks for a dense array,
    #    so no data_new.toarray() call is needed afterwards.
    transfer = DictVectorizer(sparse=False)
    # 2. Fit on the records and transform them in one step.
    data_new = transfer.fit_transform(data)
    print("数据:", data_new)
    print("特征名字:", transfer.get_feature_names())
    return None
from sklearn.feature_extraction.text import CountVectorizer
stop_words 停用词
**sparse.toarray()** — 稀疏矩阵(sparse)自带转换为数组的方法,字典/文本特征抽取均可使用
def dict_cn_demo():
    """Segment a fixed Chinese sentence with jieba and print the result.

    Example: "我爱北京天安门" -> "我 爱 北京 天安门"
    """
    sentence = "我爱北京天安门"
    # jieba.cut returns an iterator of tokens; str.join consumes it directly
    # and produces the space-separated string.
    print(" ".join(jieba.cut(sentence)))
def cut_CN(text):
    """Return *text* segmented by jieba, with tokens joined by single spaces.

    Example: "我爱北京天安门" -> "我 爱 北京 天安门"

    Args:
        text: The Chinese string to segment.

    Returns:
        str: The tokens produced by ``jieba.cut`` joined with " ".
    """
    # Call jieba.cut, not the module itself: `jieba(text)` raised TypeError
    # because a module object is not callable.
    # NOTE(review): assumes `jieba` is imported at module level — no such
    # import is visible in this file; confirm.
    return " ".join(jieba.cut(text))
from sklearn.feature_extraction.text import TfidfVectorizer
sklearn.preprocessing
归一化鲁棒性较差,最大值/最小值为异常值或 NaN 时容易出现问题
from sklearn.preprocessing import MinMaxScaler
def minmax_demo():
    """Min-max normalize the first three columns of ``data.csv``.

    Reads ``data.csv`` (path relative to the working directory), keeps its
    first three columns, rescales them with ``MinMaxScaler`` using the
    feature range (0, 1), and prints the data before and after scaling.

    Returns:
        None. Results are printed to stdout.
    """
    # Imported locally because no module-level `import pandas as pd` is
    # visible in this file; without it `pd` raised NameError.
    import pandas as pd

    # 1. Load the data and keep only the first three columns.
    data = pd.read_csv("data.csv")
    data = data.iloc[:, :3]
    print("data:", data)
    # 2. Instantiate the transformer with the target range.
    transfer = MinMaxScaler(feature_range=(0, 1))
    # 3. Fit on the data and transform it in one step.
    data_new = transfer.fit_transform(data)
    print("data_new", data_new)
    return None
# 适合大样本
from sklearn.preprocessing import StandardScaler
def stand_demo():
    """Standardize the first three columns of ``data.csv``.

    Reads ``data.csv`` (path relative to the working directory), keeps its
    first three columns, transforms them with ``StandardScaler``, and
    prints the data before and after scaling.

    Returns:
        None. Results are printed to stdout.
    """
    # Imported locally because no module-level `import pandas as pd` is
    # visible in this file; without it `pd` raised NameError.
    import pandas as pd

    # 1. Load the data and keep only the first three columns.
    data = pd.read_csv("data.csv")
    data = data.iloc[:, :3]
    print("data:", data)
    # 2. Instantiate the transformer.
    transfer = StandardScaler()
    # 3. Fit on the data and transform it in one step.
    data_new = transfer.fit_transform(data)
    print("data_new", data_new)
    return None
from sklearn.feature_selection import VarianceThreshold
【Filter过滤式】
【Embedded嵌入式】
…
def variance_demo():
    """Filter out low-variance features from ``data.csv``.

    Reads ``data.csv`` (path relative to the working directory), prints its
    first rows, applies ``VarianceThreshold`` with its default threshold,
    and prints the filtered data and its shape.

    Returns:
        None. Results are printed to stdout.
    """
    # Imported locally because no module-level `import pandas as pd` is
    # visible in this file; without it `pd` raised NameError.
    import pandas as pd

    # 1. Load the data.
    data = pd.read_csv("data.csv")
    print("data:", data.head())
    # 2. Instantiate the transformer (default threshold).
    transfer = VarianceThreshold()
    # 3. Fit on the data and drop the features below the threshold.
    # NOTE(review): the whole frame is passed in, so every column of
    # data.csv is presumably numeric — confirm.
    data_new = transfer.fit_transform(data)
    print("data_new:", data_new, data_new.shape)
    return None
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-imaB5JQX-1583501771196)(C:\Users\ASUS\AppData\Roaming\Typora\typora-user-images\image-20200304174051073.png)]
from scipy.stats import pearsonr
# Compute the correlation between two variables. pearsonr returns
# r = (correlation coefficient, p-value); the smaller the p-value, the
# more significant the correlation.
# NOTE(review): `data` is not defined in this snippet — presumably the
# DataFrame loaded from data.csv; confirm.
r = pearsonr(data["pe_ratio"], data["pb_ratio"])
from sklearn.decomposition import PCA
def pca_demo():
    """Reduce a small 3x4 sample matrix to two features with PCA and print it."""
    samples = [
        [2, 8, 4, 5],
        [6, 3, 0, 8],
        [5, 4, 9, 1],
    ]
    # Ask for two components, i.e. four input features become two.
    reducer = PCA(n_components=2)
    # Fit on the samples and project them in one step.
    reduced = reducer.fit_transform(samples)
    print("data_new\n", reduced)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-oYztJ3rr-1583502198782)(C:\Users\ASUS\AppData\Roaming\Typora\typora-user-images\image-20200304220403929.png)]
K Nearest Neighbor
K 值取得过小,容易受到异常点的影响
K 值取得过大,容易受到样本不均衡的影响
【训练流程】
【示例】
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
def knn_iris():
    """Classify the iris dataset with a K-nearest-neighbours model (K=3).

    Splits the data, standardizes the features, trains the classifier and
    prints the predictions, their element-wise comparison with the true
    labels, and the accuracy on the test split.
    """
    # 1. Load the data.
    dataset = load_iris()

    # 2. Train/test split (10% held out, fixed seed).
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.1, random_state=6
    )

    # 3. Feature engineering: standardize. The scaler is fitted on the
    #    training split only, and the same fit is reused for the test split.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # 4. KNN estimator with K = 3.
    classifier = KNeighborsClassifier(n_neighbors=3)
    classifier.fit(x_train, y_train)

    # 5. Evaluation.
    # Method 1: compare predictions with the true labels directly.
    predictions = classifier.predict(x_test)
    print("y_predict:\n", predictions)
    print("直接比对真实值和预测值:\n", predictions == y_test)
    # Method 2: compute the accuracy.
    accuracy = classifier.score(x_test, y_test)
    print("准确率:\n", accuracy)
if __name__ == "__main__":
    # Classify the iris dataset with the KNN algorithm.
    knn_iris()
寻找合适的K值
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
def knn_iris_gscv():
    """Classify iris with KNN, choosing K by grid search with cross-validation.

    Splits the data, standardizes the features, searches the candidate K
    values with 8-fold cross-validation, then prints the predictions, the
    accuracy on the test split, and the search results.
    """
    # 1. Load the data.
    dataset = load_iris()

    # 2. Train/test split (10% held out, fixed seed).
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.1, random_state=8
    )

    # 3. Feature engineering: standardize. The scaler is fitted on the
    #    training split only, and the same fit is reused for the test split.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # 4. KNN estimator wrapped in a grid search over K, with cv folds.
    k_candidates = {"n_neighbors": [1, 3, 5, 7, 9, 11]}
    search = GridSearchCV(KNeighborsClassifier(), param_grid=k_candidates, cv=8)
    search.fit(x_train, y_train)

    # 5. Evaluation.
    # Method 1: compare predictions with the true labels directly.
    predictions = search.predict(x_test)
    print("y_predict:\n", predictions)
    print("直接比对真实值和预测值:\n", predictions == y_test)
    # Method 2: compute the accuracy.
    accuracy = search.score(x_test, y_test)
    print("准确率:\n", accuracy)

    # Grid-search diagnostics: best parameters, best score, best estimator
    # and the full cross-validation results.
    print("最佳参数:\n", search.best_params_)
    print("最佳结果:\n", search.best_score_)
    print("最佳估计器:\n", search.best_estimator_)
    print("交叉验证结果:\n", search.cv_results_)
if __name__ == "__main__":
    # Classify the iris dataset with the KNN algorithm (grid-search variant).
    knn_iris_gscv()
朴素(相互独立)+贝叶斯(贝叶斯公式)
应用:文本分类,情感分析
from sklearn.naive_bayes import MultinomialNB
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-T5hPOnwv-1583501771257)(C:\Users\ASUS\AppData\Roaming\Typora\typora-user-images\image-20200305120042622.png)]
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
def nb_news():
    """Classify the 20-newsgroups posts with multinomial naive Bayes.

    Splits the corpus, extracts TF-IDF features, trains the classifier and
    prints the predictions, their element-wise comparison with the true
    labels, and the accuracy on the test split.
    """
    # 1. Load the data.
    corpus = fetch_20newsgroups(subset="all")

    # 2. Train/test split with the default proportions.
    x_train, x_test, y_train, y_test = train_test_split(corpus.data, corpus.target)

    # 3. Feature engineering: TF-IDF text extraction. The vectorizer is
    #    fitted on the training texts and reused for the test texts.
    vectorizer = TfidfVectorizer()
    x_train = vectorizer.fit_transform(x_train)
    x_test = vectorizer.transform(x_test)

    # 4. Naive Bayes estimator.
    classifier = MultinomialNB()
    classifier.fit(x_train, y_train)

    # 5. Evaluation.
    # Method 1: compare predictions with the true labels directly.
    predictions = classifier.predict(x_test)
    print("y_predict:\n", predictions)
    print("直接比对真实值和预测值:\n", predictions == y_test)
    # Method 2: compute the accuracy.
    accuracy = classifier.score(x_test, y_test)
    print("准确率:\n", accuracy)
if __name__ == "__main__":
    # Run the naive Bayes news-classification demo.
    nb_news()
如何高效地进行决策
from sklearn.tree import DecisionTreeClassifier, export_graphviz
def decision_iris():
    """Classify the iris dataset with an entropy-based decision tree.

    Splits the data, trains the tree, prints the predictions, their
    element-wise comparison with the true labels and the accuracy on the
    test split, then exports the fitted tree to ``iris_tree.dot``.
    """
    # 1. Load the data.
    dataset = load_iris()

    # 2. Train/test split with the default proportions.
    x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target)

    # 3. Decision-tree estimator using the entropy criterion.
    tree = DecisionTreeClassifier(criterion="entropy")
    tree.fit(x_train, y_train)

    # 4. Evaluation.
    # Method 1: compare predictions with the true labels directly.
    predictions = tree.predict(x_test)
    print("y_predict:\n", predictions)
    print("直接比对真实值和预测值:\n", predictions == y_test)
    # Method 2: compute the accuracy.
    accuracy = tree.score(x_test, y_test)
    print("准确率:\n", accuracy)

    # Visualization: write the tree in DOT format. The generated file can be
    # pasted into http://www.webgraphviz.com/ to render it.
    export_graphviz(
        tree,
        out_file="iris_tree.dot",
        feature_names=dataset.feature_names,
    )
适合大数据,处理高维特征数据
【示例:网格搜索+随机森林】
from sklearn.model_selection import GridSearchCV # 网格搜索
from sklearn.ensemble import RandomForestClassifier #随机森林
# NOTE(review): this fragment starts at step 4 and uses x_train / x_test /
# y_train / y_test without defining them — presumably it continues the iris
# pipeline shown earlier; confirm.
# - 4. Random-forest estimator
estimator = RandomForestClassifier()
# ------ Add grid search and cross-validation
# ------ Prepare the search grid (param_dict) and the number of folds (cv)
param_dict = {"n_estimators":[120, 150, 300], "max_depth":[5, 8, 15]}
estimator = GridSearchCV(estimator, param_grid = param_dict, cv = 3)
# ------ End of the added part
estimator.fit(x_train, y_train)
# - 5. Model evaluation
# Method 1: compare predictions with the true labels directly
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("直接比对真实值和预测值:\n", y_predict == y_test)
# Method 2: compute the accuracy
score = estimator.score(x_test, y_test)
print("准确率:\n", score)
# Grid-search diagnostics: best parameters, best score, best estimator and
# the full cross-validation results.
print("最佳参数:\n", estimator.best_params_)
print("最佳结果:\n", estimator.best_score_)
print("最佳估计器:\n", estimator.best_estimator_)
print("交叉验证结果:\n", estimator.cv_results_)
适合大数据
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor # 正规方程/梯度下降
from sklearn.metrics import mean_squared_error # 均方误差
def liner1():
    """Fit Boston house prices with ``LinearRegression`` (normal equation).

    Splits the data, standardizes the features, fits the model and prints
    its coefficients, intercept, predictions and mean squared error.
    """
    # 1. Load the dataset.
    housing = load_boston()

    # 2. Train/test split with the default proportions.
    x_train, x_test, y_train, y_test = train_test_split(housing.data, housing.target)

    # 3. Feature engineering: standardize. The scaler is fitted on the
    #    training split and reused for the test split.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # 4. Estimator: fit() produces the model.
    model = LinearRegression()
    model.fit(x_train, y_train)

    # 5. Show the fitted model.
    print("正规方程w权重系数:\n", model.coef_)
    print("正规方程b偏置:\n", model.intercept_)

    # 6. Evaluation: predict on the test split and report the MSE.
    predicted_prices = model.predict(x_test)
    print("预测房价:\n", predicted_prices)
    mse = mean_squared_error(y_test, predicted_prices)
    print("正规方程,均方误差为:", mse)
def liner2():
    """Fit Boston house prices with ``SGDRegressor`` (gradient descent).

    Splits the data, standardizes the features, fits the model and prints
    its coefficients, intercept, predictions and mean squared error.
    """
    # 1. Load the dataset.
    housing = load_boston()

    # 2. Train/test split with the default proportions.
    x_train, x_test, y_train, y_test = train_test_split(housing.data, housing.target)

    # 3. Feature engineering: standardize. The scaler is fitted on the
    #    training split and reused for the test split.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # 4. Estimator: fit() produces the model.
    model = SGDRegressor()
    model.fit(x_train, y_train)

    # 5. Show the fitted model.
    print("梯度下降w权重系数:\n", model.coef_)
    print("梯度下降b偏置:\n", model.intercept_)

    # 6. Evaluation: predict on the test split and report the MSE.
    predicted_prices = model.predict(x_test)
    print("预测房价:\n", predicted_prices)
    mse = mean_squared_error(y_test, predicted_prices)
    print("梯度下降,均方误差为:", mse)
if __name__ == "__main__":
    # Normal equation.
    liner1()
    # Gradient descent.
    liner2()
解决过拟合问题
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge # 正规方程/梯度下降
from sklearn.metrics import mean_squared_error # 均方误差
def liner3():
    """Fit Boston house prices with ridge regression.

    Splits the data, standardizes the features, fits a ``Ridge`` model
    (alpha=0.5, max_iter=10000) and prints its coefficients, intercept,
    predictions and mean squared error.
    """
    # 1. Load the dataset.
    housing = load_boston()

    # 2. Train/test split with the default proportions.
    x_train, x_test, y_train, y_test = train_test_split(housing.data, housing.target)

    # 3. Feature engineering: standardize. The scaler is fitted on the
    #    training split and reused for the test split.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # 4. Estimator: fit() produces the model.
    model = Ridge(alpha=0.5, max_iter=10000)
    model.fit(x_train, y_train)

    # 5. Show the fitted model.
    print("岭回归w权重系数:\n", model.coef_)
    print("岭回归b偏置:\n", model.intercept_)

    # 6. Evaluation: predict on the test split and report the MSE.
    predicted_prices = model.predict(x_test)
    print("预测房价:\n", predicted_prices)
    mse = mean_squared_error(y_test, predicted_prices)
    print("岭回归,均方误差为:", mse)
from sklearn.linear_model import LogisticRegression
# Logistic-regression estimator. The variable keeps its original name
# `transfer` even though it holds an estimator rather than a transformer.
# NOTE(review): x_train / y_train are not defined in this snippet —
# presumably they come from an earlier train_test_split; confirm.
transfer = LogisticRegression()
# Train on the training features and their labels. The original passed
# x_test as the second argument, but fit() takes (X, y), so the target must
# be y_train.
transfer.fit(x_train, y_train)
transfer.coef_  # regression coefficients
transfer.intercept_  # bias (intercept)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-JujAUqgk-1583501771300)(C:\Users\ASUS\AppData\Roaming\Typora\typora-user-images\image-20200306203133157.png)]
from sklearn.metrics import classification_report
# Build a classification report for the predictions.
# NOTE(review): class codes 2 and 4 are displayed as "良性" (benign) and
# "恶性" (malignant) — presumably the label encoding of the dataset in use;
# y_test / y_predict must come from an earlier evaluation step. Confirm.
report = classification_report(y_test, y_predict, labels = [2, 4], target_names = ["良性", "恶性"])
rf为预估器estimator
from sklearn.externals import joblib
# ------ Save the model ------
joblib.dump(estimator, "my_ridge.pkl")
# ------ Load the model ------
# NOTE(review): joblib files are pickle-based, so only load files that come
# from a trusted source.
estimator = joblib.load("my_ridge.pkl")
无目标值
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# KMeans estimator with three clusters.
# NOTE(review): `data_new` is not defined in this snippet — presumably the
# feature matrix produced by an earlier preprocessing step; confirm.
estimator = KMeans(n_clusters=3)
# Train.
estimator.fit(data_new)
# Assign every sample to a cluster. The original never computed y_predict
# from this model before scoring, so silhouette_score received an undefined
# (or unrelated) value.
y_predict = estimator.predict(data_new)
# Evaluate with the silhouette coefficient of the cluster assignments.
score = silhouette_score(data_new, y_predict)