# 1.1 创建向量 array
#一维数组
import numpy as np
vector_row = np.array([1,2,3]) # 创建行向量
vector_column = np.array([[1],[2],[3]]) # 创建列向量
# 1.2 创建矩阵
matrix = np.array([[1,2],[1,2],[1,2]])
# 1.3 创建稀疏矩阵(只保存非零值,节省计算成本)
from scipy import sparse
matrix = np.array([[0,0],[0,1],[3,0]])
matrix_sparse = sparse.csr_matrix(matrix) # 创建压缩的稀疏行矩阵
print(matrix_sparse) # (1, 1) 1 左侧为坐标,行列从0计
# (2, 0) 3 右侧为储存的非零值
# 1.4 选择元素(切片区间左闭右开,负数索引从末尾数起)
vector[2] # 选择向量第3个元素
vector[:] # 选取所有元素
vector[:3] # 0到第3个(含3)元素
vector[3:] # 第3个后所有元素
vector[-1] # 最后一个元素
matrix[1,1] # 选择矩阵第2行第2列
matrix[:2,:] # 选择矩阵1、2行
matrix[:,1:2] # 选择矩阵第2列
# 1.5 查看矩阵属性
matrix.shape # 行列数,如(3,2)
matrix.size # 元素数量(行*列),如6
matrix.ndim # 维数 2
# 1.6 对数组中多个元素同时应用某函数 vectorize
add_100 = lambda i: i + 100 # 创建函数,每个值加一百
vectorized_add_100 = np.vectorize(add_100) # 创建向量化函数
vectorized_add_100(matrix) # 对矩阵所有元素应用函数
matrix + 100 # 结果同上
# 1.7 找到最大最小值
np.max(matrix)
np.min(matrix)
np.max(matrix,axis=0) # 每列最大元素 [3,1]
np.max(matrix,axis=1) # 每行最大元素
# 1.8 平均值方差标准差
np.mean(matrix)
np.var(matrix)
np.std(matrix)
np.mean(matrix,axis=0) # 每列平均值
# 1.9 矩阵变形
matrix.reshape(2,3)
matrix.reshape(2,-1) # -1:自动识别列数
# 1.10 转置
matrix.T
# 1.11 展开矩阵
matrix.flatten() # 展开成一维数组;matrix.reshape(1,-1)得到的则是1行的二维数组
# 1.12 矩阵的秩 matrix_rank
np.linalg.matrix_rank(matrix)
# 1.13 行列式 det
matrix = np.array([[1,-1,3],[1,1,6],[3,8,9]])
np.linalg.det(matrix)
# 1.14 对角线元素 diagonal
matrix.diagonal()
matrix.diagonal(offset=1) # 主对角线向上偏移1的对角线元素
matrix.diagonal(offset=-1) # 主对角线向下偏移1的对角线元素
# 1.15 矩阵的迹 trace
matrix.trace() # 对角线元素之和
# 1.16 特征值和特征向量
a,b = np.linalg.eig(matrix)
a # 特征值
b # 特征向量
# 1.17 点积 dot
a = np.array([1,2,3])
b = np.array([4,5,6])
np.dot(a,b) # 两个向量之积
# 1.18 矩阵加减
np.add(a,b) # 同 a + b
np.subtract(a,b) # 同 a - b
# 1.19 矩阵乘法
np.dot(a,b)
a @ b # 同上
a * b # 矩阵对应元素相乘
# 1.20 矩阵的逆 inv
np.linalg.inv(matrix)
# 1.21 生成随机数 random
np.random.seed(0)
np.random.random(3) # 生成3个0到1之间浮点数
np.random.randint(0,11,3) # 生成3个0到10之间整数
np.random.normal(0,1,3) # 生成3个(0,1)正态分布
np.random.uniform(1,2,3) # 生成3个大于等于1且小于2的浮点数
# 2.1 加载样本数据集
from sklearn import datasets # 加载scikit-learn的数据集
digits = datasets.load_digits() # 加载手写数字数据集,研究图像分类
features = digits.data # 创建特征矩阵
target = digits.target # 创建目标向量
features[0] # 查看第一个样本数据
# load_boston:506个波士顿房价样本,研究回归
# load_iris:150个鸢尾花尺寸,研究分类
# 2.2 创建仿真数据集
from sklearn.datasets import make_regression # 用于线性回归
from sklearn.datasets import make_classification # 用于分类
from sklearn.datasets import make_blobs # 用于聚类
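# 一个最小示例(参数仅作演示,非原文内容):生成可用于分类的仿真数据
features, target = make_classification(n_samples=100, # 样本数
                                        n_features=3, # 特征总数
                                        n_informative=3, # 有信息量的特征数
                                        n_redundant=0, # 冗余特征数
                                        n_classes=2, # 分类数
                                        random_state=1)
features[:3] # 查看前3个样本
# make_regression(..., coef=True)会额外返回系数;make_blobs(..., centers=3)可生成含3个聚类中心的数据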
# 2.3 加载CSV文件
import pandas as pd
url = 'https://tinyurl.com/simulated_data' # 可加载本地或远端CSV
data = pd.read_csv(url) # 可设置sep=分隔符;header=None表示没有列名行
data.head(2) # 查看前两行
# 2.4 加载Excel文件
url = 'https://tinyurl.com/simulated_excel'
data = pd.read_excel(url,sheetname=0,header=1) # sheetname可用表名、序号、列表
data.head(2)
# 2.5 加载JSON文件
data = pd.read_json(url,orient='columns') # orient是JSON结构参数
# json_normalize:将半结构化json数据转换为Dataframe
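# json_normalize 的一个最小示例(数据为假设的示例,非原文内容):
from pandas.io.json import json_normalize # 较新版本pandas可直接用pd.json_normalize
records = [{'name':'aaa','info':{'age':38}},{'name':'bbb','info':{'age':25}}]
json_normalize(records) # 嵌套的info.age被展开成单独的列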
# 2.6 查询SQL数据库 create_engine、read_sql_query
from sqlalchemy import create_engine
database_connection = create_engine('sqlite:///sample.db') # 创建数据库连接
dataframe = pd.read_sql_query('SELECT * FROM data',database_connection) # 加载数据,返回data表的所有列
# 3.1 创建数据帧
data = pd.DataFrame() # 创建
data['name'] = ['aaa','bbb'] # 增加列
data['age'] = [38,25]
data['driver'] = [True,False]
new_person = pd.Series(['ccc',40,True],index=['name','age','driver']) # 创建一行
data.append(new_person,ignore_index=True) # 附加一行
# 3.2 描述数据
data.head(2) # 查看前两行
data.shape # 查看行列
data.describe() # 查看数值型变量的描述性统计量
# 3.3 浏览数据帧(对行) iloc
data.iloc[0] # 第一行
data.iloc[1:4] # 2,3,4行
data.iloc[:4] # 1,2,3,4行
data.loc[:,'name'] # loc-标签(字符串),iloc-序号
# 3.4 条件语句选行
data[data['age'] == 38].head(1) # age为38的第一行
data[(data['age'] <= 38) & (data['driver'] == False)] # 多个条件
# 3.5 替换值 replace
data['age'].replace(38,40) # 38换成40
data['age'].replace([38,40],[25,50]) # 同时替换多值
data.replace(1,'one') # 替换整个表中数据
data.replace(r'1st','First',regex=True) # 可用正则
# 3.6 重命名列 rename
data.rename(columns={'age':'Age'}) # Age替换age
data.rename(columns={'age':'Age','name':'Name'}) # 改多个
# 3.7 计算值
print('max:',data['age'].max())
print('min:',data['age'].min())
print('mean:',data['age'].mean())
print('sum:',data['age'].sum())
print('count:',data['age'].count())
# 方差var,标准差std,峰态kurt,偏态skew,平均值标准误差sem,众数mode,中位数median
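# 这些统计量的调用方式相同,例如:
print('var:',data['age'].var()) # 方差
print('std:',data['age'].std()) # 标准差
print('median:',data['age'].median()) # 中位数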
# 3.8 查找唯一值 unique
data['age'].unique() # 筛选唯一值
data['age'].value_counts() # 所有唯一值和出现的次数
data['age'].nunique() # 多少个唯一值
# 3.9 缺失值 isnull、na_values
data[data['age'].isnull()] # 或notnull
data = pd.read_csv(url,na_values=[np.nan,'NONE',-999]) # 设置这3个为缺失值
# 3.10 删列 drop
data.drop('age',axis=1) # 删一列
data.drop(['age','name'],axis=1) # 删多列 inplace=True会修改本身
# 3.11 删行
data[data['age'] != 38]
# 3.12 删重复行 drop_duplicates
data.drop_duplicates()
data.drop_duplicates(subset=['age']) # 删age中重复行,默认保存先出现的行
data.drop_duplicates(subset=['age'],keep='last') # 保存后出现的行
# 3.13 根据值对行分组
data.groupby('age').mean() # 根据age分组,计算平均值
data.groupby(['name','age'])['driver'].mean() # name分组后age分组,再计算driver平均数
# 3.14 按时间段对行分组 resample
import pandas as pd
import numpy as np
time = pd.date_range('06/06/2017',periods=100000,freq='30s') # 创建日期范围
data = pd.DataFrame(index=time) # 创建数据帧
data['count'] = np.random.randint(1,10,100000) # 创建一列1到10整数随机变量
data.resample('w').sum() # 按周分组,计算每周总和
# '2w'两周,'m'月
data.resample('w',label='left').sum() # 默认返回时间右边界值,label可改成左
# 3.15 遍历一个列的数据
for name in data['name'][0:2]:
    print(name.upper()) # 大写打印前两行名字
# 3.16 对一列元素应用某函数 apply
def uppercase(x): # 创建函数
    return x.upper()
data['name'].apply(uppercase)[0:2]
# 3.17 对所有分组应用函数
data.groupby('age').apply(lambda x: x.count()) # 对行分组,每组应用函数
# 3.18 连接多个数据帧 concat
pd.concat([data_a,data_b],axis=0) # 沿着行的方向连接两个数据帧(上下)
pd.concat([data_a,data_b],axis=1) # 沿着列的方向连接两个数据帧(左右)
# 3.19 合并两个数据帧 merge
pd.merge(data_a,data_b,on='id') # 等值连接(交集)
pd.merge(data_a,data_b,on='id',how='outer') # 并集
pd.merge(data_a,data_b,on='id',how='left') # 左连接(留下左表有的id)
# 4.1 min-max特征的缩放 MinMaxScaler
import numpy as np
from sklearn import preprocessing
feature = np.array([[-500.5],[-100.1],[0],[100.1],[900.9]])
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1)) # 创建缩放器
scaled_feature = minmax_scale.fit_transform(feature) # 缩放特征的值
scaled_feature
# 4.2 0-1特征的标准化 StandardScaler
x = np.array([[-1000.1],[-200.2],[500.5],[600.6],[9000.9]])
scaler = preprocessing.StandardScaler() # 创建缩放器
standardized = scaler.fit_transform(x) # 转换特征
standardized
# 中位数和四分位数间距进行缩放(存在异常值) RobustScaler
robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit_transform(x)
# 4.3 归一化观察值 Normalizer
from sklearn.preprocessing import Normalizer
features = np.array([[0.5,0.5],[1.1,3.4],[1.5,20.2],[1.63,34.4],[10.9,3.3]])
normalizer = Normalizer(norm='l2') # L2范数(欧式范数)
normalizer.transform(features)
# 4.4 生成多项式和交互特征 PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures
features = np.array([[2,3],[2,3],[2,3]]) # 创建特征矩阵
polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False) # 阶数最高为2
polynomial_interaction.fit_transform(features) # 创建多项式特征
# 4.5 转换特征
from sklearn.preprocessing import FunctionTransformer
def add_ten(x): # 创建函数,每个值加10
    return x+10
ten_transformer = FunctionTransformer(add_ten) # 创建转换器
ten_transformer.transform(features) # 转换特征矩阵
# apply也可
import pandas as pd
df = pd.DataFrame(features,columns=['feature_1','feature_2']) # 创建数据帧
df.apply(add_ten) # 应用函数
# 4.6 识别异常值 EllipticEnvelope (正常1,异常-1)
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
features,_ = make_blobs(n_samples = 10,n_features = 2,centers = 1,random_state = 1) # 创建模拟数据
features[0,0] = 10000 # 将第一个观察值替换为极端值
features[0,1] = 10000
outlier_detector = EllipticEnvelope(contamination=.1) # 创建识别器
outlier_detector.fit(features) # 拟合识别器
outlier_detector.predict(features) # 预测异常值
# 4.7 处理异常值
houses = pd.DataFrame()
houses['Price'] = [534433,392333,293222,4322032]
houses['Bathrooms'] = [2,3.5,2,116]
houses['Square_Feet'] = [1500,2500,1500,48000]
# 1.丢弃
houses[houses['Bathrooms'] < 20]
# 2.标记 where
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1) # 异常值标1
houses
houses['Log_Of_Square_Feet'] = [np.log(x) for x in houses['Square_Feet']] # 对特征值取对数值
houses
# 4.8 特征离散化(可用于数值型数据编码)
from sklearn.preprocessing import Binarizer
age = np.array([[6],[12],[20],[36],[65]])
# 根据阈值将特征二值化 Binarizer
binarizer = Binarizer(18) # 创建二值化器
binarizer.fit_transform(age) # 转换特征
# 根据多个阈值将特征离散化 digitize
np.digitize(age,bins=[20,30,64]) # bins左闭右开
np.digitize(age,bins=[20,30,64],right=True) # 改为左开右闭
# 4.9 使用聚类将观察值分组 KMeans
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
features,_ = make_blobs(n_samples = 50,n_features = 2,centers = 3,random_state = 1)
data = pd.DataFrame(features,columns=['feature_1','feature_2'])
clusterer = KMeans(3,random_state=0) # 创建K-Mean聚类器
clusterer.fit(features)
data['group'] = clusterer.predict(features) # 预测聚类的值
data.head(5)
# 4.10 删除带有缺失值的观察值
# numpy
features = np.array([[1.1,11.1],[2.2,22.2],[3.3,33.3],[4.4,44.4],[np.nan,55]])
features[~np.isnan(features).any(axis=1)] # ~:非 保留没有缺失值的
# pandas dropna
data = pd.DataFrame(features,columns=['feature_1','feature_2'])
data.dropna()
# 4.11 填充缺失值
# 数据量不大,KNN预测缺失值
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
features,_ = make_blobs(n_samples = 1000,n_features = 2,random_state = 1)
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features) # 标准化特征值
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan # 第一个特征向量的第一个值替换为缺失值
features_knn_imputed = KNN(k=5,verbose=0).complete(standardized_features) # 预测缺失值
print('TRUE:',true_value)
print('Imputed:',features_knn_imputed[0,0]) # 对比真实值和预测值
# 平均数、中位数、众数填充,效果较KNN差
from sklearn.preprocessing import Imputer
mean_imputer = Imputer(strategy='mean',axis=0) # 创建填充器
features_mean_imputed = mean_imputer.fit_transform(features)
print('TRUE:',true_value)
print('Imputed:',features_mean_imputed[0,0])
# 5.1 对无序分类特征编码
# LabelBinarizer
import numpy as np
from sklearn.preprocessing import LabelBinarizer,MultiLabelBinarizer
feature = np.array([['a'],['b'],['a'],['c'],['a']])
one_hot = LabelBinarizer() # 创建one_hot编码
one_hot.fit_transform(feature)
one_hot.classes_ # 输出分类
one_hot.inverse_transform(one_hot.transform(feature)) # 对one_hot编码逆转换
# MultiLabelBinarizer(每个观察值有多分类)
multiclass_feature = [('a','b'),('a','c'),('b','c')] # 示例:每个观察值包含多个分类
one_hot_multiclass = MultiLabelBinarizer()
one_hot_multiclass.fit_transform(multiclass_feature)
one_hot_multiclass.classes_
# pandas
import pandas as pd
pd.get_dummies(feature[:,0]) # 创建虚拟变量
# (没有内在顺序不宜用1,2,3编码,one-hot编码后应删除一个编码特征)
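# 删除一个编码特征可以用get_dummies的drop_first参数(示例):
pd.get_dummies(feature[:,0],drop_first=True) # 丢弃第一个虚拟变量列,避免完全共线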
# 5.2 对有序分类特征编码 replace
data = pd.DataFrame({'score':['low','low','medium','medium','high']})
scale_mapper = {'low':1,'medium':2,'high':3} # 创建映射器
data['score'].replace(scale_mapper) # 使用映射器替换特征
# 5.3 对特征字典编码 DictVectorizer
from sklearn.feature_extraction import DictVectorizer
data = [{'red':2,'blue':4},{'red':4,'blue':3},{'red':1,'yellow':2},{'red':2,'yellow':2}]
dictvectorizer = DictVectorizer(sparse=False) # 创建字典向量化器,False输出稠密矩阵,默认稀疏矩阵
features = dictvectorizer.fit_transform(data) # 将字典转成特征矩阵
features
feature_names = dictvectorizer.get_feature_names() # 获取特征的名字
feature_names
pd.DataFrame(features,columns=feature_names) # 转成数据帧
# 5.4 填充缺失的分类值
# KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
x = np.array([[0,2.1,1.45],[1,1.18,1.33],[0,1.22,1.27],[1,-0.21,-1.19]])
x_with_nan = np.array([[np.nan,0.87,1.31],[np.nan,-0.67,-0.22]])
clf = KNeighborsClassifier(3,weights='distance')
trained_model = clf.fit(x[:,1:],x[:,0]) # 训练knn分类器
imputed_values = trained_model.predict(x_with_nan[:,1:]) # 预测缺失值的分类
x_with_imputed = np.hstack((imputed_values.reshape(-1,1),x_with_nan[:,1:])) # 连接预测分类和其他特征
np.vstack((x_with_imputed,x)) # 连接两个特征矩阵
# 用出现次数最多的值填充 Imputer
from sklearn.preprocessing import Imputer
x_complete = np.vstack((x_with_imputed,x)) # 连接两个特征矩阵
imputer = Imputer(strategy='most_frequent',axis=0)
imputer.fit_transform(x_complete)
# 5.5 处理不均衡分类(3种方法)
# 移除40个山鸢尾数据,合并另两个,得到10个是山鸢尾,100个不是山鸢尾
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
iris = load_iris() # 加载数据
features= iris.data # 创建特征矩阵
target = iris.target # 创建目标向量
features = features[40:,:]
target = target[40:] # 移除前40个观察值
target = np.where((target == 0),0,1) # 创建二元目标向量标识观察值是否为类别0
target
# 1.对不均衡分类进行加权 class_weight
weights = {0: 0.9,1: 0.1} # 创建权重
RandomForestClassifier(class_weight=weights)
RandomForestClassifier(class_weight='balanced') # balanced自动创建与分类频数成反比的权重
# 2.对占多数的分类进行下采样(抽取与少数分类相同的子集)
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0] # 给每个分类观察值打标签
n_class0 = len(i_class0)
n_class1 = len(i_class1) # 确定每个分类观察值数量
i_class1_downsampled = np.random.choice(i_class1,size=n_class0,replace=False) # 从分类1中无放回随机抽样,数量与分类0相同
np.hstack((target[i_class0],target[i_class1_downsampled])) # 连接0和抽的1向量
np.vstack((features[i_class0,:],features[i_class1_downsampled,:]))[0:5] # 连接0和抽的1矩阵
# 3.对占少数的分类上采样(对少的有放回随机抽样)
i_class0_upsampled = np.random.choice(i_class0,size=n_class1,replace=True)
np.concatenate((target[i_class0_upsampled],target[i_class1])) # 连接0和抽的1向量
np.vstack((features[i_class0_upsampled,:],features[i_class1,:]))[0:5] # 连接0和抽的1矩阵
# 常用字符串方法(str代表任意字符串对象):
# str.upper() 把所有小写字母转换成大写字母
# str.lower() 把所有大写字母转换成小写字母
# str.capitalize() 把第一个字母转化为大写字母,其余小写
# str.title() 把每个单词的第一个字母转化为大写,其余小写
# 6.1 清洗文本 strip
data = [' interrobang. by aishwarya henriette ','parking and going. by karl gautier',' today is the night. by jarek prakash '] # 创建文本
strip_whitespace = [string.strip() for string in data] # 去除文本两端空格
remove_periods = [string.replace('.','') for string in strip_whitespace] # 删除句点
def capitalizer(string: str) -> str: # 创建大写转换函数
    return string.upper()
[capitalizer(string) for string in remove_periods] # 应用函数,改大写
import re
def replace_letters_with_x(string: str) -> str: # 创建用x替换字母的函数
    return re.sub(r'[a-zA-Z]','x',string)
[replace_letters_with_x(string) for string in remove_periods] # 用正则
# 6.2 解析并清洗HTML BeautifulSoup
from bs4 import BeautifulSoup
html = "<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>" # 示例HTML
soup = BeautifulSoup(html,'lxml')
soup.find('div',{'class':'full_name'}).text # 查找class是'full_name'的div标签,查看文本
# 6.3 移除标点
import unicodedata
import sys
data = ['hi!!!! I.love.this.song.....','100000% agree!!!! #loveit','right?!?!']
punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P')) # 创建一个标点字典
[string.translate(punctuation) for string in data] # 移除每个字符串中的标点
# 6.4 文本分词 word_tokenize sent_tokenize
from nltk.tokenize import word_tokenize
string = 'the science of today is the technology of tomorrow'
word_tokenize(string) # 分词
from nltk.tokenize import sent_tokenize
string = 'the science of today is the technology of tomorrow. tomorrow is today.'
sent_tokenize(string) # 切分成句子
# 6.5 删除停止词 stopwords
import nltk
nltk.download('stopwords') # 第一次使用要下载
from nltk.corpus import stopwords # 假设单词都是小写的
tokenized_words = ['i','am','going','to','go','to','the','store','and'] # 创建单词序列
stop_words = stopwords.words('english') # 加载停止词
stop_words[:5] # 查看停止词
[word for word in tokenized_words if word not in stop_words] # 删除停止词
# 6.6 提取词干 PorterStemmer
from nltk.stem.porter import PorterStemmer
tokenized_words =['i','am','humbled','by','this','traditional','meeting']
porter = PorterStemmer() # 创建词干转换器
[porter.stem(word) for word in tokenized_words]
# 6.7 标注词性 pos_tag
from nltk import pos_tag
from nltk import word_tokenize
data = 'Chris loved outdoor running'
tagged = pos_tag(word_tokenize(data)) # 词性标注
tagged # NNP专有名词单数 NN名词单数或复数 RB副词 VBD动词过去式 VBG现在分词 JJ形容词 PRP人称代词
[word for word,tag in tagged if tag in ['NN','NNS','NNP','NNPS']] # 过滤单词
# 将句中单词转换为词性编码
from sklearn.preprocessing import MultiLabelBinarizer
text = ['I am eating a burrito for breakfast','Political science is an amazing field','San Francisco is an awesome city']
tagged = [] # 创建列表
for data in text:
    tagg = nltk.pos_tag(word_tokenize(data))
    tagged.append([tag for word,tag in tagg]) # 给所有单词加标签
one_hot = MultiLabelBinarizer()
one_hot.fit_transform(tagged) # 转换标签为特征
one_hot.classes_ # 查看特征名
# 6.8 将文本编码成词袋 CountVectorizer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
data = np.array(['I love Brazil. Brazil!','Sweden is best','Germany beats both'])
count = CountVectorizer()
#count = CountVectorizer(ngram_range=(1,2),stop_words='english') # 每个特征一个单词或两个单词
bag_of_words = count.fit_transform(data) # 创建一个词袋特征矩阵
bag_of_words # 默认输出稀疏矩阵
bag_of_words.toarray() # 查看每个特征值的词频统计矩阵
count.get_feature_names() # 查看特征名
# 6.9 按单词的重要性加权
from sklearn.feature_extraction.text import TfidfVectorizer
text = np.array(['I love Brazil. Brazil!','Sweden is best','Germany beats both'])
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text) # 创建TF-IDF特征矩阵
feature_matrix # 稀疏矩阵
feature_matrix.toarray() # 稠密矩阵
tfidf.vocabulary_ # 查看特征名
七、处理日期和时间
# 7.1 字符串转换成时间 to_datetime
import numpy as np
import pandas as pd
data_strings = np.array(['03-04-2005 11:35 PM','23-05-2010 12:01 AM','04-09-2009 09:09 PM'])
[pd.to_datetime(data, format='%d-%m-%Y %I:%M %p') for data in data_strings] # 转换成时间类型数据
[pd.to_datetime(data, format='%d-%m-%Y %I:%M %p',errors='coerce') for data in data_strings] # errors='coerce'出现错误时不会异常,将错误值设置成NaT(缺失值)
# %d-日 %m-月 %Y-年 %I-小时 %M-分钟 %S-秒 %p-AM/PM 都需要完整的两位或四位数
# 7.2 处理时区
pd.Timestamp('2017-05-01 06:00:00',tz='Europe/London') # tz指定时区
date = pd.Timestamp('2017-05-01 06:00:00')
date_in_london = date.tz_localize('Europe/London') # tz_localize添加时区
date_in_london.tz_convert('Africa/Abidjan') # tz_convert改变时区
dates = pd.Series(pd.date_range('2/2/2002',periods=3,freq='M')) # 创建3个日期,Series对象能对每个元素应用tz_localize、tz_convert
dates.dt.tz_localize('Africa/Abidjan') # 添加时区
from pytz import all_timezones # 所有时区的字符串
all_timezones[0:4]
# 7.3 选择日期时间
data = pd.DataFrame()
data['date'] = pd.date_range('1/1/2001',periods=100000,freq='H')
data[(data['date'] > '2002-1-1 01:00:00')&(data['date'] <= '2002-1-1 04:00:00')] # 筛选出两个日期之间观察值
data = data.set_index(data['date']) # 将日期列设置为索引
data.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00'] # 适用复杂的时间操作,用索引选择时间
# 7.4 将日期数据切分成多个特征 Series.dt
data = pd.DataFrame()
data['date'] = pd.date_range('1/1/2001',periods=150,freq='W')
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day
data['hour'] = data['date'].dt.hour
data['minute'] = data['date'].dt.minute
data.head(3)
# 7.5 计算日期差值
data = pd.DataFrame()
data['Arrived'] = [pd.Timestamp('01-01-2017'),pd.Timestamp('01-04-2017')]
data['Left'] = [pd.Timestamp('01-01-2017'),pd.Timestamp('01-06-2017')] # 创建两个特征
data['Left'] - data['Arrived'] # 计算时间间隔
pd.Series(delta.days for delta in (data['Left'] - data['Arrived'])) # 移除输出中的days字符串
# 7.6 对一周内各天进行编码 .dt.weekday
dates = pd.Series(pd.date_range('2/2/2002',periods=3,freq='M'))
dates.dt.weekday_name # 输出星期英文
dates.dt.weekday # 输出数字,周一是0,周日是6
# 7.7 创建一个滞后的特征 shift
data = pd.DataFrame()
data['date'] = pd.date_range('1/1/2001',periods=5,freq='D')
data['stock_price'] = [1.1,2.2,3.3,4.4,5.5]
data['previous_days_stock_price'] = data['stock_price'].shift(1) # 让值滞后一行
data
# 7.8 使用滚动时间窗口 rolling
time_index = pd.date_range('01/01/2010',periods=5,freq='M')
data = pd.DataFrame(index=time_index) # 创建数据帧,设置索引
data['stock_price'] = [1,2,3,4,5] # 创建特征
data.rolling(window=2).mean() # 计算滚动平均值
# 7.9 处理时间序列中缺失值 interpolate、ffill、bfill
time_index = pd.date_range('01/01/2010',periods=5,freq='M')
data = pd.DataFrame(index=time_index)
data['sales'] = [1.0,2.0,np.nan,np.nan,5.0]
data.interpolate() # 对缺失数据进行插值(线性)
data.interpolate(method='quadratic') # 曲线填充缺失值
data.interpolate(limit=1,limit_direction='forward') # 最多填充1个缺失值,且只沿索引正方向填充
data.ffill() # 用前面值填充
data.bfill() # 用后面值填充
八、图像处理
# 略导入图像
# 8.1 加载图像 imread
import cv2
from matplotlib import pyplot as plt
image = cv2.imread('.jpg',cv2.IMREAD_GRAYSCALE) # 将图像导入灰度图
plt.imshow(image,cmap='gray'),plt.axis('off') # 查看图像
plt.show()
# 8.2 保存图像 imwrite
cv2.imwrite('.jpg',image)
# 8.3 调整图像大小 resize
image = cv2.resize(image,(50,50)) # 将图片尺寸调整为50x50像素
# 常见图像规格:32x32,64x64,96x96,256x256
# 8.4 裁剪图像
image = image[:,:128] # 选择所有行和前128列
# 8.5 平滑处理图像(变糊) blur
image_blurry = cv2.blur(image,(5,5)) # 用5x5的核对每个像素周围的值取平均值
# 8.6 图像锐化 filter2D
kernel = np.array([[0,-1,0],
[-1,5,-1],
[0,-1,0]]) # 创建核
image_sharp = cv2.filter2D(image,-1,kernel) # 锐化图像,突出边缘
# 8.7 提升对比度 equalizeHist
# 灰度图
image_enhanced = cv2.equalizeHist(image) # 增强图像
# 彩色图
image_yuv = cv2.cvtColor(image,cv2.COLOR_BGR2YUV) # 转换成YUV格式
image_yuv[:,:,0] = cv2.equalizeHist(image_yuv[:,:,0]) # 对图像应用直方图均衡
image_rgb = cv2.cvtColor(image_yuv,cv2.COLOR_YUV2RGB) # 转换成RGB格式
# 8.8 颜色分离(将某颜色分离出来)
image_hsv = cv2.cvtColor(image,cv2.COLOR_BGR2HSV) # 将BGR格式转换成HSV格式
lower_blue = np.array([50,100,50])
upper_blue = np.array([130,255,255]) # 定义HSV格式中蓝色分量的区间
mask = cv2.inRange(image_hsv,lower_blue,upper_blue) # 创建掩模
image_masked = cv2.bitwise_and(image,image,mask=mask) # 应用掩模
image_rgb = cv2.cvtColor(image_masked,cv2.COLOR_BGR2RGB) # BGR格式转换成RGB格式
# HSV:H色调 S饱和度 V亮度
# 8.9 图像二值化(仅黑白) adaptiveThreshold
max_output_value = 255 # 输出像素最大强度
neighborhood_size = 99
subtract_from_mean = 10
image_binarized = cv2.adaptiveThreshold(image,
max_output_value,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C, # 阈值设为相邻像素强度的加权和减去常数;ADAPTIVE_THRESH_MEAN_C则用相邻像素的平均值
cv2.THRESH_BINARY,
neighborhood_size,
subtract_from_mean) # 自适应阈值处理
# 8.10 移除背景 grabCut
image_rgb = cv2.cvtColor(image,cv2.COLOR_BGR2RGB) # BGR格式转换成RGB格式
rectangle = (0,56,256,150) # 矩形的值:左上角的x坐标,左上角的y坐标,宽,高
mask = np.zeros(image_rgb.shape[:2],np.uint8) # 创建初始掩模
bgdModel = np.zeros((1,65),np.float64)
fgdModel = np.zeros((1,65),np.float64) # 临时数组
cv2.grabCut(image_rgb, # 图像
mask,rectangle,bgdModel,fgdModel,
5, # 迭代次数
cv2.GC_INIT_WITH_RECT) # 使用定义的矩形初始化
mask_2 = np.where((mask==2)|(mask==0),0,1).astype('uint8') # 背景设0,其余1
image_rgb_nobg = image_rgb * mask_2[:,:,np.newaxis] # 图像与掩模相乘除去背景
# 8.11 边缘检测 Canny
median_intensity = np.median(image) # 计算像素强度的中位数
lower_threshold = int(max(0,(1.0 - 0.33) * median_intensity))
upper_threshold = int(min(255,(1.0 + 0.33) * median_intensity)) # 设置阈值
image_canny = cv2.Canny(image,lower_threshold,upper_threshold) # 应用边缘检测器
# 8.12 角点预测 cornerHarris
image_bgr = cv2.imread('.jpg')
image_gray = cv2.cvtColor(image_bgr,cv2.COLOR_BGR2GRAY)
image_gray = np.float32(image_gray)
block_size = 2 # 角点检测中窗口的尺寸
aperture = 29 # Sobel算子的尺寸
free_parameter = 0.04 # 控制角点检测的严格程度,越大,可以识别的角点越平滑
detector_responses = cv2.cornerHarris(image_gray,block_size,aperture,free_parameter) # 检测角点
detector_responses = cv2.dilate(detector_responses,None) # 放大角点标志
threshold = 0.02
image_bgr[detector_responses > threshold * detector_responses.max()]=[255,255,255] # 只保留大于阈值的检测结果,并标记白色
image_gray = cv2.cvtColor(image_bgr,cv2.COLOR_BGR2GRAY) # 转换成灰度图
# 8.13 为机器学习创建特征 flatten
image_10x10 = cv2.resize(image,(10,10)) # 转换尺寸
image_10x10.flatten() # 将图像数据转换成一维向量
# 8.14 将颜色平均值编码成特征
image_bgr = cv2.imread('.jpg',cv2.IMREAD_COLOR) # BGR格式加载图像
channels = cv2.mean(image_bgr) # 计算每个通道的平均值
observation = np.array([(channels[2],channels[1],channels[0])]) # 交换红蓝通道,BGR转换RGB
observation
# 8.15 将色彩直方图编码成特征
image_rgb = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
features = [] # 创建特征列表
colors = ('r','g','b') # 为每个颜色通道计算直方图
for i,channel in enumerate(colors): # 为每一个通道计算直方图并把它加入特征列表中
    histogram = cv2.calcHist([image_rgb], # 图像
                             [i], # 颜色通道序号
                             None, # 不使用掩模
                             [256], # 直方图尺寸
                             [0,256]) # 范围
    features.extend(histogram)
observation = np.array(features).flatten() # 展开成一维数组
observation[0:5]
for i,channel in enumerate(colors): # 对每个通道绘制直方图
    histogram = cv2.calcHist([image_rgb], # 图像
                             [i], # 颜色通道序号
                             None, # 不使用掩模
                             [256], # 直方图尺寸
                             [0,256]) # 范围
    plt.plot(histogram, color = channel)
plt.xlim([0,256])
九、利用特征提取进行特征降维(创建新特征降低维度)
# 9.1 使用主成分进行特征降维 PCA(对线性可分的效果好)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets
digits = datasets.load_digits() # 加载数据
features = StandardScaler().fit_transform(digits.data) # 标准化特征矩阵
pca = PCA(n_components=0.99,whiten=True) # 创建可以保留99%信息量(方差表示)的PCA(通常0.99或0.95)
features_pca = pca.fit_transform(features) # 执行pca
print('Original number of features:',features.shape[1])
print('Reduced number of features:',features_pca.shape[1]) # 减少了10个特征
# 9.2 对线性不可分数据进行特征降维 KernelPCA
from sklearn.decomposition import PCA,KernelPCA
from sklearn.datasets import make_circles
features,_ = make_circles(n_samples=1000,random_state=1,noise=0.1,factor=0.1) # 创建线性不可分数据
kpca = KernelPCA(kernel='rbf',gamma=15,n_components=1)
features_kpca = kpca.fit_transform(features) # 应用基于径向基函数核rbf的Kernel PCA方法(其他核:poly、sigmoid)
print('Original number of features:',features.shape[1])
print('Reduced number of features:',features_kpca.shape[1])
# 9.3 通过最大化类间可分性进行特征降维 LinearDiscriminantAnalysis
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
iris = datasets.load_iris()
features = iris.data
target = iris.target # 加载数据
lda = LinearDiscriminantAnalysis(n_components=1) # 返回特征数量最优值参考explained_variance_ratio_
features_lda = lda.fit(features,target).transform(features) # 创建并运行LDA,对特征做变换
print('Original number of features:',features.shape[1])
print('Reduced number of features:',features_lda.shape[1])
lda.explained_variance_ratio_ # 查看每个成分保留的信息量
# ===========自动识别最优特征数 n_components=None===========
lda = LinearDiscriminantAnalysis(n_components=None)
features_lda = lda.fit(features,target) # 创建并运行LDA
lda_var_ratios = lda.explained_variance_ratio_ # 获取方差的百分比数组
def select_n_components(var_ratio, goal_var: float) -> int:
    total_variance = 0.0 # 设置总方差的初始值
    n_components = 0 # 设置特征数量的初始值
    for explained_variance in var_ratio: # 遍历方差百分比数组的元素
        total_variance += explained_variance # 将该百分比加入总方差
        n_components += 1 # n_components的值加1
        if total_variance >= goal_var: # 如果达到目标阈值
            break # 结束遍历
    return n_components # 返回n_components的值
select_n_components(lda_var_ratios,0.95) # 运行函数
# 9.4 使用矩阵分解法进行特征降维(非负矩阵) NMF
from sklearn.decomposition import NMF
from sklearn import datasets
digits = datasets.load_digits() # 加载数据
features = digits.data # 加载特征矩阵
nmf = NMF(n_components=10,random_state=1)
features_nmf = nmf.fit_transform(features) # 创建NMF,进行变换并应用
print('Original number of features:',features.shape[1])
print('Reduced number of features:',features_nmf.shape[1])
# 9.5 对稀疏数据进行特征降维 TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn import datasets
import numpy as np
digits = datasets.load_digits() # 加载数据
features = StandardScaler().fit_transform(digits.data) # 标准化特征矩阵
features_sparse = csr_matrix(features) # 生成稀疏矩阵
tsvd = TruncatedSVD(n_components=10) # 创建tsvd
features_sparse_tsvd = tsvd.fit(features_sparse).transform(features_sparse) # 在稀疏矩阵上执行TSVD
print('Original number of features:',features_sparse.shape[1])
print('Reduced number of features:',features_sparse_tsvd.shape[1])
tsvd.explained_variance_ratio_[0:3].sum() # 对前三个成分的信息量占比求和
十、使用特征选择进行降维(丢弃信息量低的特征)
# 方法一 过滤器:根据特征的统计信息选择最优特征(见10.1~10.4)
# 方法二 包装器:通过不断试错,找出能产生高质量预测值的特征子集(见10.5)
# 方法三 嵌入式:将选择最优特征子集作为机器学习算法训练过程的一部分(下面给出一个简单示例)
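# 嵌入式方法的一个简单示例(示意,非原文内容):用带L1正则化的逻辑回归配合SelectFromModel选择特征
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
iris = load_iris()
l1_logistic = LogisticRegression(penalty='l1', solver='liblinear') # L1会把部分特征的系数压缩为0
embedded_selector = SelectFromModel(l1_logistic)
features_embedded = embedded_selector.fit_transform(iris.data, iris.target) # 只保留系数高于阈值的特征
features_embedded.shape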
# 10.1 数值型特征方差的阈值化(挑出方差大于给定阈值的特征) VarianceThreshold
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
iris = datasets.load_iris()
features = iris.data
target = iris.target # 加载数据
thresholder = VarianceThreshold(threshold=.5) # 创建对象(手动选阈值)
features_high_variance = thresholder.fit_transform(features) # 创建大方差特征矩阵
features_high_variance[0:3] # 显示大方差特征矩阵
thresholder.fit(features).variances_ # 显示方差
# 10.2 二值特征的方差阈值化(特征只有两个分类)
from sklearn.feature_selection import VarianceThreshold
features = [[0,1,0],[0,1,1],[0,1,0],[0,1,1],[1,0,0]]
thresholder = VarianceThreshold(threshold=(.75 * (1 - .75)))
thresholder.fit_transform(features) # 创建对象并运行
# 10.3 处理高度相关性的特征
import pandas as pd
import numpy as np
features = np.array([[1,1,1],[2,2,0],[3,3,1],[4,4,0],[5,5,1],[6,6,0],[7,7,1],[8,7,0],[9,7,1]]) # 特征矩阵,包含两个高度相关的特征
data = pd.DataFrame(features) # 特征矩阵转换成DataFrame
corr_matrix = data.corr().abs() # 创建相关矩阵
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool)) # 选择相关矩阵的上三角阵
to_drop = [column for column in upper.columns if any(upper[column]>0.95)] # 找到相关性大于0.95的特征列的索引
data.drop(data.columns[to_drop],axis=1).head(3)
# 10.4 删除与分类任务不相关的特征
# 分类型特征(卡方统计量)——选择固定数量特征 SelectKBest chi2
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif
iris = load_iris()
features = iris.data
target = iris.target
features = features.astype(int) #分类数据转换成整数型数据
chi2_selector = SelectKBest(chi2,k=2)
features_Kbest = chi2_selector.fit_transform(features,target) # 选择卡方最大的两个特征
print('Original number of features:',features.shape[1])
print('Reduced number of features:',features_Kbest.shape[1])
# 数值型特征(F值)——选择固定数量特征 SelectKBest f_classif
fvalue_selector = SelectKBest(f_classif,k=2)
features_Kbest = fvalue_selector.fit_transform(features,target) # 选择F值最大的两个特征
print('Original number of features:',features.shape[1])
print('Reduced number of features:',features_Kbest.shape[1])
# 选择前n%特征 SelectPercentile
from sklearn.feature_selection import SelectPercentile
fvalue_selector = SelectPercentile(f_classif,percentile=75)
features_kbest = fvalue_selector.fit_transform(features,target) # 选择F值位于前75%的特征
print('Original number of features:',features.shape[1])
print('Reduced number of features:',features_kbest.shape[1])
# 10.5 递归式特征消除(自动选择保留最优) RFECV
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets,linear_model
warnings.filterwarnings(action='ignore',module='scipy',message='internal gelsd') # 忽略无意义警告
features,target = make_regression(n_samples=10000,n_features=100,n_informative=2,random_state=1) #生成特征矩阵、目标向量、系数
ols = linear_model.LinearRegression() # 创建线性回归对象
rfecv = RFECV(estimator=ols,step=1,scoring='neg_mean_squared_error')
# step 每次迭代丢弃的特征数量 scoring 交叉验证时评估模型性能的方法
rfecv.fit(features,target)
rfecv.transform(features) # 递归消除特征
rfecv.n_features_ # 最优特征的数量
rfecv.support_ # 哪些特征是最优特征
rfecv.ranking_ # 特征排名
十一、模型评估
# 11.1 交叉验证模型 KFold cross_val_score
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import KFold,cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
digits = datasets.load_digits() # 加载手写数字数据集
features = digits.data # 创建特征矩阵
target = digits.target # 创建目标向量
# 交叉验证
standardizer = StandardScaler() # 创建标准化对象
logit = LogisticRegression() # 创建逻辑回归对象
pipeline = make_pipeline(standardizer,logit) # 创建包含数据标准化和逻辑回归的流水线
kf = KFold(n_splits=10,
shuffle=True, # shuffle=True打乱数据顺序
random_state=1) # 创建k折交叉验证对象
cv_result = cross_val_score(pipeline, # 流水线
features, # 特征矩阵
target, # 目标向量
cv=kf, # 交叉验证方法
scoring='accuracy', # 损失函数
n_jobs=-1) # 使用所有CPU核
cv_result.mean() # 计算得分平均值
#
# 训练集和测试集的交叉验证
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.1, random_state=1) # 创建训练集和测试集
standardizer.fit(features_train) # 使用训练集计算标准化参数
features_train_std = standardizer.transform(features_train)
features_test_std = standardizer.transform(features_test) # 将标准化操作应用到训练集和测试集
# 后续操作接上部分交叉验证
# 11.2 创建一个基准回归模型 DummyRegressor
from sklearn.datasets import load_boston
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
boston = load_boston()
features,target = boston.data,boston.target # 创建特征矩阵和目标向量
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=0) # 数据分为训练集和测试集
dummy = DummyRegressor(strategy='mean') # 创建DummyRegressor对象(预测方法:训练集的均值)
dummy.fit(features_train, target_train) # 训练回归模型
dummy.score(features_test,target_test) # 计算R方得分
# 训练自己模型与基准模型对比
from sklearn.linear_model import LinearRegression
ols = LinearRegression()
ols.fit(features_train, target_train) # 训练简单的线性回归模型
ols.score(features_test,target_test) # 计算R方得分
# 11.3 创建一个基准分类模型 DummyClassifier
from sklearn.datasets import load_iris
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
iris = load_iris()
features,target = iris.data, iris.target
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=0)
dummy = DummyClassifier(strategy='uniform', random_state=1) # 创建DummyClassifier(随机生成均匀的预测)
dummy.fit(features_train, target_train)
dummy.score(features_test,target_test)
# 训练自己模型与基准模型对比
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(features_train, target_train)
classifier.score(features_test,target_test)
# 11.4 评估二元分类器
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
X,y = make_classification(n_samples =10000,n_features = 3,n_informative = 3,n_redundant = 0,n_classes = 2,random_state = 1) # 生成特征矩阵和目标向量
logit = LogisticRegression() # 生成逻辑回归对象
cross_val_score(logit,X,y,scoring='accuracy') # 使用准确率对模型进行交叉验证
cross_val_score(logit,X,y,scoring='precision') # 精确度
cross_val_score(logit,X,y,scoring='recall') # 召回率
cross_val_score(logit,X,y,scoring='f1') # f1分数(精确度和召回率的调和平均)
# 直接计算值
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=1) # 创建训练集和测试集
y_hat = logit.fit(X_train,y_train).predict(X_test) # 对测试集进行预测
accuracy_score(y_test,y_hat) # 计算准确率
# 11.5 评估二元分类器的阈值 roc_curve
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
features,target = make_classification(n_samples=10000,n_features=10,n_classes=2,n_informative=3,random_state=3)
features_train,features_test,target_train,target_test = train_test_split(features,target,test_size=0.1,random_state=1)
logit = LogisticRegression()
logit.fit(features_train,target_train)
target_probabilities = logit.predict_proba(features_test)[:,1] #获取预测的概率
false_positive_rate, true_positive_rate, threshold = roc_curve(target_test,target_probabilities) # 计算真阳性和假阳性概率
# 画出ROC曲线
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0,1], ls='--')
plt.plot([0,0],[1,0], c='.7'), plt.plot([1,1], c='.7')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# 计算ROC曲线下方面积
roc_auc_score(target_test, target_probabilities) # 面积AUCROC越接近1,模型性能越好
# 11.6 评估多元分类器
cross_val_score(logit,X,y,scoring='f1_macro') # 求各分类评估分数的平均值
# weighted: 加权平均值,权重为样本数占总数比例
# micro: 汇总所有分类的TP、FP、FN后整体计算得分
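# 其他平均方式的用法相同(示例):
cross_val_score(logit,X,y,scoring='f1_weighted') # 按各分类样本数加权平均
cross_val_score(logit,X,y,scoring='f1_micro') # 汇总所有样本后整体计算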
# 11.7 分类器性能的可视化 confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd
iris = datasets.load_iris()
features = iris.data
target = iris.target
class_names = iris.target_names # 创建目标分类的名称列表
features_train,features_test,target_train,target_test = train_test_split(features,target,random_state=1)
classifier = LogisticRegression()
target_predicted = classifier.fit(features_train,target_train).predict(features_test)
# 创建混淆矩阵
matrix = confusion_matrix(target_test,target_predicted)
dataframe = pd.DataFrame(matrix, index=class_names, columns=class_names) # 创建DataFrame
# 绘制热力图
sns.heatmap(dataframe, annot=True, cbar=None, cmap='Blues')
plt.title('Confusion Matrix'), plt.tight_layout()
plt.ylabel('True Class'), plt.xlabel('Predicted Class')
plt.show()
# 行表示真实分类,完美模型只有对角线上有数字
# 11.8 评估回归模型
ols = LinearRegression()
cross_val_score(ols,X,y,scoring='neg_mean_squared_error') # MSE对线性回归做交叉验证
cross_val_score(ols,X,y,scoring='r2') # 决定系数对线性回归做交叉验证
# 11.9 评估聚类模型 silhouette_score
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2,random_state=1).fit(X) # KMeans方法聚类
y_predict = model.labels_ # 获取预测分类
silhouette_score(X, y_predict) # 轮廓系数评估模型(-1,1)1表示内部密集分离彻底
# 11.10 创建自定义评估指标 make_scorer
from sklearn.metrics import make_scorer, r2_score
from sklearn.linear_model import Ridge
def custom_metric(y_test, y_predicted): # 创建自定义指标函数
    r2 = r2_score(y_test, y_predicted)
    return r2
score = make_scorer(custom_metric, greater_is_better=True) # 创建评分函数,定义分数越高模型越好
classifier = Ridge() # 创建岭回归对象
model = classifier.fit(X_train, y_train)
score(model, X_test, y_test) # 应用自定义评分器
# 11.11 可视化训练集规模的影响 learning_curve
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
digits = load_digits()
x,y = digits.data, digits.target
# 使用交叉验证为50个不同规模的训练集计算训练和测试得分
train_sizes, train_scores, test_scores = learning_curve(RandomForestClassifier(), x, y, cv=10, scoring='accuracy', n_jobs=-1, train_sizes=np.linspace(0.01,1.0,50))
# 计算均值标准差
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
# 画线
plt.plot(train_sizes, train_mean, '--', color='#111111', label='Training score')
plt.plot(train_sizes, test_mean, color='#111111', label='Cross-validation score')
# 画带状图
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color='#DDDDDD')
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color='#DDDDDD')
# 创建图
plt.title('Learning Curve')
plt.xlabel('Training Set Size'), plt.ylabel('Accuracy Score')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# 11.12 生成对评估指标的报告 classification_report
from sklearn.metrics import classification_report
class_names = iris.target_names # 创建目标分类名的列表
print(classification_report(target_test, target_predicted, target_names=class_names))
# 包括精确度、召回率、F1分数。 support指样本数
# 11.13 可视化超参数值的效果
from sklearn.model_selection import validation_curve
param_range = np.arange(1,250,2) # 创建参数的变化范围
train_scores, test_scores = validation_curve(
RandomForestClassifier(), x, y,
param_name='n_estimators', # 要查看的超参数
param_range=param_range, # 超参数值范围
cv=3, scoring='accuracy', n_jobs=-1)
# 平均值标准差、画线画带状图画图,同11.11
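# 以下为示意代码,与11.11的画图流程相同:
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(param_range, train_mean, '--', color='#111111', label='Training score')
plt.plot(param_range, test_mean, color='#111111', label='Cross-validation score')
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, color='#DDDDDD')
plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, color='#DDDDDD')
plt.title('Validation Curve'), plt.xlabel('Number of Trees'), plt.ylabel('Accuracy Score')
plt.legend(loc='best'), plt.show()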
十二、模型选择
# 12.1 使用穷举搜索选择最佳模型 GridSearchCV
import numpy as np
from sklearn import linear_model, datasets
from sklearn.model_selection import GridSearchCV
# 加载数据
iris = datasets.load_iris()
features = iris.data
target = iris.target
# 创建逻辑回归
logistic = linear_model.LogisticRegression()
# 正则化惩罚的超参数区间
penalty = ['l1','l2']
# 正则化的超参数区间(十个可能的值)
C = np.logspace(0,4,10)
# 创建候选超参数字典
hyperparameters = dict(C=C, penalty=penalty)
# 创建网格搜索对象
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)
# 训练网格搜索
best_model = gridsearch.fit(features, target)
# 查看最佳超参数
print('Best Penalty:',best_model.best_estimator_.get_params()['penalty'])
print('Best C:',best_model.best_estimator_.get_params()['C'])
# 预测目标向量
best_model.predict(features)
# 12.2 使用随机搜索选择最佳模型 RandomizedSearchCV
from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV
iris = datasets.load_iris()
features = iris.data
target = iris.target
# 模型
logistic = linear_model.LogisticRegression()
penalty = ['l1','l2']
C = uniform(loc=0, scale=4) # 均匀分布
hyperparameters = dict(C=C, penalty=penalty)
randomizedsearch = RandomizedSearchCV(logistic, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)
best_model = randomizedsearch.fit(features, target)
# 查看
print('Best Penalty:',best_model.best_estimator_.get_params()['penalty'])
print('Best C:',best_model.best_estimator_.get_params()['C'])
best_model.predict(features)
# 12.3 从多种学习算法中选择最佳模型
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
np.random.seed(0) # 设置随机数种子
iris = datasets.load_iris()
features = iris.data
target = iris.target
# 创建流水线
pipe = Pipeline([('classifier',RandomForestClassifier())])
# 创建候选学习算法及超参数的字典
search_space = [{'classifier':[LogisticRegression()],
                 'classifier__penalty':['l1','l2'],
                 'classifier__C': np.logspace(0,4,10)}, # 逻辑回归
                {'classifier':[RandomForestClassifier()],
                 'classifier__n_estimators':[10,100,1000],
                 'classifier__max_features': [1,2,3]}] # 随机森林
gridsearch = GridSearchCV(pipe, search_space, cv=5, verbose=0)
best_model = gridsearch.fit(features, target)
# 查看最佳模型
best_model.best_estimator_.get_params()['classifier']
best_model.predict(features)
# 12.4 将数据预处理加入模型选择过程
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
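# 12.4 的大致做法如下(示意,参数仅供参考):把标准化和PCA放进流水线,连同模型超参数一起搜索
preprocess = FeatureUnion([('std', StandardScaler()), ('pca', PCA())]) # 预处理:标准化 + PCA
pipe = Pipeline([('preprocess', preprocess),
                 ('classifier', LogisticRegression())]) # 创建流水线
search_space = [{'preprocess__pca__n_components': [1, 2, 3],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(0, 4, 10)}] # 候选超参数(含PCA的成分数)
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)
best_model = clf.fit(features, target)
best_model.best_estimator_.get_params()['preprocess__pca__n_components'] # 查看最佳的成分数量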
# 12.5 用并行化加速模型选择 n_jobs=-1
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=1) # 用所有CPU核
# 12.6 使用针对特定算法的方法加速模型选择 LogisticRegressionCV
from sklearn import linear_model, datasets
# 只能搜索超参数C最优
logit = linear_model.LogisticRegressionCV(Cs=100) # 产生超参数C的100个候选值
logit.fit(features, target)
# 还有 岭回归、套索回归、弹性网络回归 有这种优势
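# 例如套索回归对应的LassoCV(示意):
from sklearn.linear_model import LassoCV
lasso_cv = LassoCV(alphas=[0.1, 1.0, 10.0]) # 交叉验证自动选择正则化强度alpha
model = lasso_cv.fit(features, target)
model.alpha_ # 查看选出的alpha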
# 12.7 模型选择后的性能评估
from sklearn.model_selection import cross_val_score
gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=0)
cross_val_score(gridsearch, features, target).mean() # 嵌套交叉验证,输出平均得分
十三、线性回归
# 13.1 拟合一条直线 LinearRegression
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
model = regression.fit(features,target)
# 查看截距
model.intercept_
# 查看系数
model.coef_
# 预测
model.predict(features)
# 13.2 处理特征之间的影响 PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures
# 创建交互特征
interaction = PolynomialFeatures(degree=3, include_bias=False, interaction_only=True) # interaction_only=True 只返回交互特征
features_interaction = interaction.fit_transform(features)
# 回归
regression = LinearRegression()
model = regression.fit(features_interaction,target)
# 13.3 拟合非线性关系
# 创建多项式特征 x^2 x^3
Polynomial = PolynomialFeatures(degree=3, include_bias=False)
features_Polynomial = Polynomial.fit_transform(features)
# 回归
regression = LinearRegression()
model = regression.fit(features_Polynomial,target)
# 13.4 通过正则化减少方差
# 包含正则化项:岭回归、套索回归
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
# 特征标准化
scaler = StandardScaler()
features_Standardized = scaler.fit_transform(features)
# 岭回归
regression = Ridge(alpha=0.5)
model = regression.fit(features_Standardized, target)
# RidgeCV优化alpha
from sklearn.linear_model import Ridge, RidgeCV
regr_cv = RidgeCV(alphas=[0.1, 1.0, 10.0])
model_cv = regr_cv.fit(features_Standardized, target)
model_cv.alpha_
# 训练模型前确保特征标准化
# 13.5 使用套索回归减少特征 Lasso (将特征系数减小为0)
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
# 特征标准化
scaler = StandardScaler()
features_Standardized = scaler.fit_transform(features)
# 套索回归
regression = Lasso(alpha=0.5)
model = regression.fit(features_Standardized, target)
十四、树和森林
# 14.1 训练决策树分类器 DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
iris = datasets.load_iris()
features= iris.data
target = iris.target
decisiontree = DecisionTreeClassifier(random_state=0) # 创建决策树分类器对象
model = decisiontree.fit(features, target) # 训练模型
observation = [[5,4,3,2]] # 创建新样本
model.predict(observation) # 预测样本的分类
model.predict_proba(observation) # 查看样本分别属于三个分类的概率
# 使用entropy作为不纯度检测方法创建决策树
decisiontree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=0)
model_entropy = decisiontree_entropy.fit(features, target)
# 14.2 训练决策树回归模型 DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
decisiontree = DecisionTreeRegressor(random_state=0)
model = decisiontree.fit(features, target)
observation = [[5,4,3,2]] # 创建新样本(特征数须与训练数据一致)
model.predict(observation)
# 使用MAE创建决策树回归模型
decisiontree_mae = DecisionTreeRegressor(criterion='mae', random_state=0)
model_mae = decisiontree_mae.fit(features, target)
# 14.3 可视化决策树模型 pydotplus
# 将决策树导出为DOT格式并可视化
import pydotplus
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from IPython.display import Image
from sklearn import tree
iris = datasets.load_iris()
features= iris.data
target = iris.target
decisiontree = DecisionTreeClassifier(random_state=0)
model = decisiontree.fit(features, target)
# 创建DOT数据(需要安装graphviz)
dot_data = tree.export_graphviz(decisiontree,
out_file=None,
feature_names=iris.feature_names,
class_names=iris.target_names)
graph = pydotplus.graph_from_dot_data(dot_data) # 绘制图形
Image(graph.create_png()) # 显示图形
graph.write_pdf('iris.pdf') # 存为PDF格式
# 14.4 训练随机森林分类器 RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier # 解决决策树的过拟合
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
model = randomforest.fit(features, target)
# 14.5 训练随机森林回归模型 RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
randomforest = RandomForestRegressor(random_state=0, n_jobs=-1)
model = randomforest.fit(features, target)
# 14.6 识别随机森林中重要特征 model.feature_importances_
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
iris = datasets.load_iris()
features= iris.data
target = iris.target
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
model = randomforest.fit(features, target)
# 重要性
importances = model.feature_importances_ # 查看特征重要程度
indices = np.argsort(importances)[::-1] # 将特征重要性降序排列
names = [iris.feature_names[i] for i in indices] # 按照特征重要性对特征名称重新排序
# 画图
plt.figure()
plt.title('Feature Importance')
plt.bar(range(features.shape[1]), importances[indices])
plt.xticks(range(features.shape[1]), names, rotation=90)
plt.show()
# 14.7 选择随机森林中的重要特征 SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
selector = SelectFromModel(randomforest, threshold=0.3) # 选择重要性大于等于阈值的特征
features_important = selector.fit_transform(features, target) # 使用选择器创建新的特征矩阵
model = randomforest.fit(features_important, target) # 使用重要特征训练
# 14.8 处理不均衡的分类 class_weight='balanced'
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
iris = datasets.load_iris()
features= iris.data
target = iris.target
# 删除前40样本获得高度不均衡数据
features = features[40:,:]
target = target[40:]
target = np.where((target == 0), 0, 1) # 创建目标向量表明分类为0还是1
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight='balanced')
model = randomforest.fit(features, target)
# 14.9 控制决策树的规模
from sklearn.tree import DecisionTreeClassifier
decisiontree = DecisionTreeClassifier(random_state=0,
max_depth=None, # 树的最大深度
min_samples_split=2, # 节点分裂前节点上最小的样本数
min_samples_leaf=1, # 叶子节点需要的最小样本数
min_weight_fraction_leaf=0,
max_leaf_nodes=None, # 最大叶子节点数
min_impurity_decrease=0) # 执行分裂所需的最小不纯度减少量
model = decisiontree.fit(features, target)
# 14.10 通过boosting提高性能 AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
adaboost = AdaBoostClassifier(random_state=0)
model = adaboost.fit(features, target)
# 14.11 使用袋外误差评估随机森林模型 randomforest.oob_score_
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(random_state=0, n_estimators=1000, oob_score=True, n_jobs=-1) # oob_score=True
model = randomforest.fit(features, target)
randomforest.oob_score_ # 查看袋外样本的得分(可替代交叉验证评估)
十五、KNN
# 15.1 找到一个观察值的最近邻 NearestNeighbors
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
iris = datasets.load_iris()
features= iris.data
standarizer = StandardScaler()
features_standardized = standarizer.fit_transform(features) # 特征标准化
nearest_neighbors = NearestNeighbors(n_neighbors=2).fit(features_standardized) # 找两个最近的观察值
new_observation = [1,1,1,1] # 创建一个观察值
distances, indices = nearest_neighbors.kneighbors([new_observation]) # 获取离该观察值最近的两个观察值的索引,以及到这两个点的距离
features_standardized[indices] # 查看最近的两个观察值
# 15.2 创建一个KNN分类器 KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
standardizer = StandardScaler()
X_std = standardizer.fit_transform(X) # 标准化特征
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1).fit(X_std, y) # 训练一个有5个邻居的KNN分类器
new_observation = [[0.75,0.75,0.75,0.75], [1,1,1,1]] # 创建两个观察值
knn.predict(new_observation) # 预测两个观察值的分类
# 15.3 确定最佳的邻域点集的大小 GridSearchCV
# 为knn找最佳的k
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
features= iris.data
target = iris.target
standarizer = StandardScaler()
features_standardized = standarizer.fit_transform(features) # 标准化
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
pipe = Pipeline([('standarizer',standarizer),('knn',knn)]) # 创建一个流水线
search_space = [{'knn__n_neighbors':[1,2,3,4,5,6,7,8,9,10]}] # 创建一个可选值范围
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(features_standardized, target) # 创建grid搜索
classifier.best_estimator_.get_params()['knn__n_neighbors'] # 最佳邻域的大小
# 15.4 创建一个基于半径的最近邻分类器 RadiusNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
standarizer = StandardScaler()
features_standardized = standarizer.fit_transform(features)
rnn = RadiusNeighborsClassifier(radius=.5, n_jobs=-1).fit(features_standardized, target)
new_observation = [[1,1,1,1]] # 创建观察值(二维)
rnn.predict(new_observation)
十六、逻辑回归
# 16.1 训练二元分类器 LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)
logistic_regression = LogisticRegression(random_state=0)
model = logistic_regression.fit(features_standardized, target)
model.predict(new_observation)
model.predict_proba(new_observation) # 查看预测的概率
# 16.2 训练多元分类器 multi_class
logistic_regression = LogisticRegression(random_state=0, multi_class='ovr') # 创建一对多的逻辑回归对象(OVR、MLR)
# 16.3 通过正则化减小方差 LogisticRegressionCV
logistic_regression = LogisticRegressionCV(penalty='l2', Cs=10, random_state=0, n_jobs=-1) # Cs为正则化强度C的候选值数量,由交叉验证自动选择
# 16.4 在超大数据集上训练分类器 solver='sag'
logistic_regression = LogisticRegressionCV(random_state=0, solver='sag')
# 16.5 处理不均衡分类 class_weight='balanced'
logistic_regression = LogisticRegression(random_state=0, class_weight='balanced')
十七、支持向量机
# 17.1 训练一个线性分类器 LinearSVC
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)
svc = LinearSVC(C=1.0)
model = svc.fit(features_standardized, target)
svc.predict(new_observation)
# 画图
from matplotlib import pyplot as plt
color = ['black' if c == 0 else 'lightgrey' for c in target]
plt.scatter(features_standardized[:,0], features_standardized[:,1], c=color) # 画出样本点,并根据分类上色
w = svc.coef_[0]
a = -w[0]/w[1]
xx = np.linspace(-2.5,2.5)
yy = a * xx - (svc.intercept_[0]/w[1]) # 创建超平面
plt.plot(xx, yy)
plt.axis('off'),plt.show()
# 17.2 使用核函数处理线性不可分的数据 SVC
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np
np.random.seed(0)
features = np.random.randn(200,2) # 生成两个特征
target_xor = np.logical_xor(features[:,0] > 0, features[:,1] > 0)
target = np.where(target_xor, 0, 1) # 使用异或门创建线性不可分数据
svc = SVC(kernel='rbf', random_state=0, gamma=1, C=1) # 创建一个有径向基核函数的支持向量机
model = svc.fit(features, target)
# 17.3 计算预测分类的概率 probability=True
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)
svc = SVC(kernel='linear', random_state=0, probability=True)
model = svc.fit(features_standardized, target)
new_observation = [[.4,.4,.4,.4]]
model.predict_proba(new_observation)
# 17.4 识别支持向量 model.support_vectors_
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)
svc = SVC(kernel='linear', random_state=0)
model = svc.fit(features_standardized, target)
model.support_vectors_ # 查看支持向量
model.support_ # 查看支持向量在观察值中的索引值
model.n_support_ # 每个分类有几个支持向量
# 17.5 处理不均衡的分类 class_weight='balanced'
svc = SVC(kernel='linear', class_weight='balanced', C=1.0, random_state=0)
model = svc.fit(features_standardized, target)
十八、朴素贝叶斯
# 18.1 为连续的数据训练分类器 GaussianNB
from sklearn.naive_bayes import GaussianNB
classifer = GaussianNB() # 高斯朴素贝叶斯分类器
model = classifer.fit(features, target)
clf = GaussianNB(priors=[0.25, 0.25, 0.5]) # 给每个分类先验概率
# 18.2 为离散数据和计数数据训练分类器 MultinomialNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
text_data = np.array(['I love Brazil. Brazil!', 'Brazil is best', 'Germany beats both'])
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data) # 创建词袋
features = bag_of_words.toarray()
target = np.array([0,0,1])
classifer = MultinomialNB(class_prior=[0.25, 0.5]) # 给定每个分类的先验概率,创建多项式朴素贝叶斯对象
model = classifer.fit(features, target)
# 18.3 为具有二元特征的数据训练朴素贝叶斯分类器 BernoulliNB
from sklearn.naive_bayes import BernoulliNB
classifer = BernoulliNB(class_prior=[0.25, 0.5])
model = classifer.fit(features, target)
# 18.4 校准预测概率 CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import CalibratedClassifierCV
classifer = GaussianNB()
classifer_sigmoid = CalibratedClassifierCV(classifer, cv=2, method='sigmoid') # 使用sigmoid校准调校过的交叉验证模型
classifer_sigmoid.fit(features, target) # 校准概率
classifer_sigmoid.predict_proba(new_observation) # 查看校准过的概率
十九、聚类
# 19.1 使用K-Means聚类算法 KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
scaler = StandardScaler()
features_std = scaler.fit_transform(features)
cluster = KMeans(n_clusters=3, random_state=0, n_jobs=-1)
model = cluster.fit(features_std)
model.labels_ # 查看预测分类
iris.target # 查看真实分类
model.predict(new_observation)
model.cluster_centers_ # 查看分类的中心点
# 19.2 加速K-Means聚类 MiniBatchKMeans
from sklearn.cluster import MiniBatchKMeans
cluster = MiniBatchKMeans(n_clusters=3, random_state=0, batch_size=100)
model = cluster.fit(features_std)
# 19.3 使用Meanshift聚类算法
from sklearn.cluster import MeanShift
cluster = MeanShift(n_jobs=-1) # 不用设定聚类数量
model = cluster.fit(features_std)
# 19.4 使用DBSCAN聚类算法
from sklearn.cluster import DBSCAN
cluster = DBSCAN(n_jobs=-1)
model = cluster.fit(features_std)
# 19.5 使用层次合并聚类算法
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=3)
model = cluster.fit(features_std)
二十、神经网络
# 20.1 为神经网络预处理数据 StandardScaler
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
features_std = scaler.fit_transform(features) # 标准化
# 20.2 设计一个神经网络
from keras import models
from keras import layers
network = models.Sequential() # 启动神经网络
network.add(layers.Dense(units=16, activation='relu', input_shape=(10,))) # 添加使用RelU激活函数的全连接层
network.add(layers.Dense(units=16, activation='relu')) # 添加使用RelU激活函数的全连接层
network.add(layers.Dense(units=1, activation='sigmoid')) # 添加使用sigmoid激活函数的全连接层
network.compile(loss='binary_crossentropy', # 损失函数:交叉熵
optimizer='rmsprop', # 优化算法:均方根传播
metrics=['accuracy']) # 将准确率作为性能指标
# 20.3 训练一个二元分类器 Tokenizer
import numpy as np
from keras import models
from keras import layers
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
np.random.seed(0)
number_of_features = 1000 # 设定想要的特征数量
(data_train, target_train),(data_test, target_test) = imdb.load_data(num_words=number_of_features) # 加载影评数据
# 将影评数据转化为one-hot编码过的特征矩阵
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode='binary')
features_test = tokenizer.sequences_to_matrix(data_test, mode='binary')
# 创建神经网络
network = models.Sequential()
network.add(layers.Dense(units=16, activation='relu', input_shape=(number_of_features,)))
network.add(layers.Dense(units=16, activation='relu'))
network.add(layers.Dense(units=1, activation='sigmoid'))
network.compile(loss='binary_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
# 训练神经网络
history = network.fit(features_train, # 特征
target_train, # 目标向量
epochs=3, # epoch数量
verbose=1, # 每个epoch之后打印描述
batch_size=100, # 每个批次中观察值的数量
validation_data=(features_test,target_test)) # 测试数据
# 20.4 训练一个多元分类器
import numpy as np
from keras import models
from keras import layers
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
np.random.seed(0)
number_of_features = 5000 # set the desired number of features
(data_train, target_vector_train),(data_test, target_vector_test) = reuters.load_data(num_words=number_of_features) # load the newswire data
# convert the data into a one-hot-encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode='binary')
features_test = tokenizer.sequences_to_matrix(data_test, mode='binary')
# one-hot encode the target vector to create a target matrix
target_train = to_categorical(target_vector_train)
target_test = to_categorical(target_vector_test)
# create the neural network
network = models.Sequential()
network.add(layers.Dense(units=100, activation='relu', input_shape=(number_of_features,)))
network.add(layers.Dense(units=100, activation='relu'))
network.add(layers.Dense(units=46, activation='softmax'))
# compile the neural network
network.compile(loss='categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
# train the neural network
history = network.fit(features_train, # features
target_train, # target vector
epochs=3, # number of epochs
verbose=0, # no output
batch_size=100, # number of observations per batch
validation_data=(features_test,target_test)) # test data
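# A minimal sketch (assumption): turning the softmax output back into class labels.
predicted_classes = network.predict(features_test).argmax(axis=1) # index of the highest predicted probability for each observation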
# 20.5 Training a regression model
network = models.Sequential()
network.add(layers.Dense(units=32, activation='relu', input_shape=(features_train.shape[1],)))
network.add(layers.Dense(units=32, activation='relu'))
network.add(layers.Dense(units=1)) # add a fully connected layer with no activation function
# compile the neural network
network.compile(loss='mse', # mean squared error
optimizer='RMSprop',
metrics=['mse'])
# train the neural network
history = network.fit(features_train, # features
target_train, # target vector
epochs=10, # number of epochs
verbose=0, # no output
batch_size=100, # number of observations per batch
validation_data=(features_test,target_test)) # test data
# 20.6 Making predictions
predicted_target = network.predict(features_test)
predicted_target[0] # view the predicted probability that the first observation belongs to class 1 (binary case)
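# A minimal follow-up sketch (assumption): converting the predicted probabilities into hard class labels with a 0.5 cutoff.
predicted_classes = (predicted_target > 0.5).astype(int) # 1 if the predicted probability exceeds 0.5, otherwise 0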
# 20.7 Visualizing the training history
import matplotlib.pyplot as plt
# get the loss history for the training and test sets
training_loss = history.history['loss']
test_loss = history.history['val_loss']
# create an index for each epoch
epoch_count = range(1, len(training_loss) + 1)
# plot the loss history
plt.plot(epoch_count, training_loss, 'r--')
plt.plot(epoch_count, test_loss, 'b-')
plt.legend(['Training Loss','Test Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()
#
# training and test set accuracy
training_accuracy = history.history['acc'] # in newer Keras versions this key is 'accuracy'
test_accuracy = history.history['val_acc'] # in newer Keras versions this key is 'val_accuracy'
plt.plot(epoch_count, training_accuracy, 'r--')
plt.plot(epoch_count, test_accuracy, 'b-')
plt.legend(['Training Accuracy','Test Accuracy'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy Score')
plt.show()
# 20.8 Reducing overfitting with weight regularization kernel_regularizer=regularizers.l2(0.01)
from keras import regularizers
network.add(layers.Dense(units=16, activation='relu', kernel_regularizer=regularizers.l2(0.01), input_shape=(features_train.shape[1],))) # 0.01 controls how heavily the parameter values are penalized
network.add(layers.Dense(units=16, kernel_regularizer=regularizers.l2(0.01), activation='relu'))
# 20.9 Reducing overfitting with early stopping EarlyStopping
from keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [EarlyStopping(monitor='val_loss',patience=2), # stop training if the test-set loss fails to improve for two consecutive epochs
ModelCheckpoint(filepath='best_model.h5',
monitor='val_loss',
save_best_only=True)] # only save the best model
history = network.fit(features_train, # features
target_train, # target vector
epochs=20, # number of epochs
callbacks=callbacks, # early-stopping callbacks
verbose=1, # print a description after each epoch
batch_size=100, # number of observations per batch
validation_data=(features_test,target_test)) # test data
# 20.10 Reducing overfitting with dropout
network = models.Sequential()
network.add(layers.Dropout(0.2, input_shape=(number_of_features,))) # add a dropout layer for the input layer
network.add(layers.Dense(units=32, activation='relu', input_shape=(features_train.shape[1],)))
network.add(layers.Dropout(0.5)) # add a dropout layer after the previous hidden layer
network.add(layers.Dense(units=32, activation='relu'))
network.add(layers.Dropout(0.5)) # add a dropout layer after the previous hidden layer
network.add(layers.Dense(units=1))
# typical dropout rates: 0.2 for the input layer and 0.5 for hidden layers
# 20.11 Saving model training progress
from keras.callbacks import ModelCheckpoint
checkpoint = [ModelCheckpoint(filepath='model.hdf5')] # set up a checkpoint callback that saves the model after every epoch
history = network.fit(features_train, # features
target_train, # target vector
epochs=20, # number of epochs
callbacks=checkpoint, # checkpoint
verbose=0, # no output
batch_size=100, # number of observations per batch
validation_data=(features_test,target_test)) # test data
# 20.12 Evaluating a neural network with k-fold cross-validation
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
def create_network():
    network = models.Sequential()
    network.add(layers.Dense(units=16, activation='relu', input_shape=(number_of_features,)))
    network.add(layers.Dense(units=16, activation='relu'))
    network.add(layers.Dense(units=1, activation='sigmoid'))
    network.compile(loss='binary_crossentropy',
                    optimizer='rmsprop',
                    metrics=['accuracy'])
    return network
# wrap the Keras model so it can be used by scikit-learn (for regression: KerasRegressor)
neural_network = KerasClassifier(build_fn=create_network,
epochs=10,
batch_size=100,
verbose=0)
# evaluate the neural network with 3-fold cross-validation
cross_val_score(neural_network, features, target, cv=3)
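# A minimal follow-up sketch (assumption): capturing the fold scores and averaging them into a single number.
scores = cross_val_score(neural_network, features, target, cv=3)
scores.mean() # mean cross-validated accuracy across the 3 folds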
# 20.13 Tuning a neural network
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras import models
from keras import layers
def create_network(optimizer='rmsprop'):
    network = models.Sequential()
    network.add(layers.Dense(units=16, activation='relu', input_shape=(number_of_features,)))
    network.add(layers.Dense(units=16, activation='relu'))
    network.add(layers.Dense(units=1, activation='sigmoid'))
    network.compile(loss='binary_crossentropy',
                    optimizer=optimizer,
                    metrics=['accuracy'])
    return network
neural_network = KerasClassifier(build_fn=create_network,verbose=0)
# create the hyperparameter space
epochs = [5,10]
batches = [5,10,100]
optimizers = ['rmsprop','adam']
# create the hyperparameter options
hyperparameters = dict(optimizer=optimizers,epochs=epochs,batch_size=batches) # the key must match create_network's optimizer argument
# create the grid search
grid = GridSearchCV(estimator=neural_network, param_grid=hyperparameters)
# run the grid search
grid_result = grid.fit(features, target)
# view the best hyperparameters
grid_result.best_params_
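# A minimal follow-up sketch (assumption): the mean cross-validated score of the best combination is also available.
grid_result.best_score_ # mean cross-validated accuracy of the best hyperparameter combination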
# 20.14 Visualizing a neural network
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
network = models.Sequential()
network.add(layers.Dense(units=16, activation='relu', input_shape=(10,)))
network.add(layers.Dense(units=16, activation='relu'))
network.add(layers.Dense(units=1, activation='sigmoid'))
# visualize the network
SVG(model_to_dot(network, show_shapes=True).create(prog='dot', format='svg')) # show_shapes: whether to display the input and output shapes
plot_model(network, show_shapes=True, to_file='network.png') # save the plot to a file
# 20.15 Classifying images
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K
K.set_image_data_format('channels_first') # make the color channel come first
np.random.seed(0)
# image information
channels = 1
height = 28
width = 28
(data_train, target_train),(data_test, target_test) = mnist.load_data()
# reshape the image data into features
data_train = data_train.reshape(data_train.shape[0], channels, height, width)
data_test = data_test.reshape(data_test.shape[0], channels, height, width)
# rescale pixel intensities to between 0 and 1
features_train = data_train / 255
features_test = data_test / 255
# one-hot encode the target
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
number_of_classes = target_test.shape[1]
# instantiate the neural network
network = Sequential()
# add a convolutional layer with 64 filters, a 5x5 window, and a ReLU activation function
network.add(Conv2D(filters=64,
kernel_size=(5,5),
input_shape=(channels, width, height),
activation='relu'))
# add a max pooling layer with a 2x2 window
network.add(MaxPooling2D(pool_size=(2,2)))
# add a dropout layer
network.add(Dropout(0.5))
# add a layer to flatten the input
network.add(Flatten())
# add a fully connected layer of 128 neurons with a ReLU activation function
network.add(Dense(128, activation='relu'))
# add a dropout layer
network.add(Dropout(0.5))
# add a fully connected layer with a softmax activation function
network.add(Dense(number_of_classes, activation='softmax'))
# compile the neural network
network.compile(loss='categorical_crossentropy',
optimizer='rmsprop',
metrics=['accuracy'])
# train the neural network
network.fit(features_train, # features
target_train, # target vector
epochs=2, # number of epochs
verbose=0, # no output
batch_size=1000, # number of observations per batch
validation_data=(features_test,target_test))
# 20.16 Improving a convolutional neural network's performance with image augmentation
from keras.preprocessing.image import ImageDataGenerator
# create an image augmentation object
augmentation = ImageDataGenerator(featurewise_center=True, # center each feature (standardization)
zoom_range=0.3, # randomly zoom in on images
width_shift_range=0.2, # randomly shift images horizontally
horizontal_flip=True, # randomly flip images horizontally
rotation_range=90) # randomly rotate images
# process all images in the folder
augment_images = augmentation.flow_from_directory('raw/images', # image folder
batch_size=32, # batch size
class_mode='binary', # classification mode
save_to_dir='processed/images')
# train the neural network
network.fit_generator(augment_images,
steps_per_epoch=2000, # number of times the generator is called in each epoch
epochs=5,
validation_data=augment_images_test, # test data generator
validation_steps=800) # number of times the generator is called in each test epoch
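# A hedged note (not in the original): flow_from_directory infers class labels from subfolders, so a layout like the hypothetical one below is assumed, with one subdirectory per class:
# raw/images/
#     dogs/ dog001.jpg, dog002.jpg, ...
#     cats/ cat001.jpg, cat002.jpg, ...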
# 20.17 Classifying text
import numpy as np
from keras import models
from keras import layers
from keras.datasets import imdb
from keras.preprocessing import sequence
np.random.seed(0)
number_of_features = 1000 # set the desired number of features
(data_train, target_train),(data_test, target_test) = imdb.load_data(num_words=number_of_features)
# pad or truncate each observation so that it has exactly 400 features
features_train = sequence.pad_sequences(data_train, maxlen=400)
features_test = sequence.pad_sequences(data_test, maxlen=400)
# instantiate the neural network
network = models.Sequential()
# add an embedding layer
network.add(layers.Embedding(input_dim=number_of_features, output_dim=128))
# add a long short-term memory (LSTM) layer with 128 units
network.add(layers.LSTM(units=128))
# add a fully connected layer with a sigmoid activation function
network.add(layers.Dense(units=1, activation='sigmoid'))
# compile the neural network
network.compile(loss='binary_crossentropy',
optimizer='Adam',
metrics=['accuracy'])
# train the neural network
network.fit(features_train, # features
target_train, # target vector
epochs=3, # number of epochs
verbose=0, # no output
batch_size=1000, # number of observations per batch
validation_data=(features_test,target_test))
21. Saving and Loading Trained Models
# 21.1 Saving and loading a scikit-learn model
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
classifer = RandomForestClassifier()
model = classifer.fit(features, target)
# save the model as a pickle file
joblib.dump(model,'model.pkl')
# load the model from the file
classifer = joblib.load('model.pkl')
# predict the class of a new observation
classifer.predict(new_observation)
# 21.2 Saving and loading a Keras model
from keras.models import load_model
# save the neural network
network.save('model.h5') # includes the architecture, the trained weights, and the training configuration needed to resume training
# load the neural network
network = load_model('model.h5')
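# A hedged alternative (assumption): in newer scikit-learn releases sklearn.externals.joblib has been removed, so the standalone joblib package is used instead.
import joblib
joblib.dump(model, 'model.pkl') # save the scikit-learn model
model = joblib.load('model.pkl') # load it back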