- Apriori Algorithm
- Linear Regression
- KNN Classification on a UCI Dataset
- Decision Tree
- Naive_Bayes
- K-Means Image Segmentation
Apriori Algorithm
Workflow
- Load the transaction data set and build the candidate 1-itemsets C1
- Keep the candidates whose support reaches min_support, giving the frequent 1-itemsets L1
- Repeatedly join L(k-1) with itself to build the candidates Ck, prune any candidate with an infrequent (k-1)-subset, and filter by min_support to obtain Lk
- From every frequent itemset, generate the association rules whose confidence reaches min_conf
(the two thresholds, support and confidence, are illustrated in the short sketch below)
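A short illustrative snippet, not part of the original script, showing how support is computed on a few transactions:

# support(X)          = (# transactions containing X) / (# transactions)
# confidence(A => B)  = support(A and B together) / support(A)
transactions = [['i1', 'i2', 'i5'], ['i2', 'i4'], ['i2', 'i3']]
support_i2 = sum('i2' in t for t in transactions) / len(transactions)
print(support_i2)   # 1.0 -- all three of these transactions contain 'i2'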
Source code
# coding=utf-8
def load_data_set():
    """
    Return a list of transactions; each transaction is a list of purchased items.
    """
    data_set = [['i1', 'i2', 'i5'], ['i2', 'i4'], ['i2', 'i3'],
                ['i1', 'i2', 'i4'], ['i1', 'i3'], ['i2', 'i3'],
                ['i1', 'i3'], ['i1', 'i2', 'i3', 'i5'], ['i1', 'i2', 'i3'],
                ['i1', 'i4'], ['i2', 'i3']]
    return data_set


def create_C1(data_set):
    """
    Create the candidate 1-itemsets C1 (as frozensets, so they can be used as dict keys).
    """
    C1 = set()
    for t in data_set:
        for item in t:
            item_set = frozenset([item])
            C1.add(item_set)
    return C1


def is_apriori(Ck_item, Lksub1):
    """
    Check whether a candidate itemset satisfies the Apriori property,
    i.e. all of its (k-1)-subsets are frequent.
    """
    for item in Ck_item:
        sub_Ck = Ck_item - frozenset([item])
        if sub_Ck not in Lksub1:
            return False
    return True


def create_Ck(Lksub1, k):
    """
    Create the candidate k-itemsets Ck by joining L(k-1) with itself.
    """
    Ck = set()
    len_Lksub1 = len(Lksub1)
    list_Lksub1 = list(Lksub1)
    for i in range(len_Lksub1):
        for j in range(1, len_Lksub1):
            l1 = list(list_Lksub1[i])
            l2 = list(list_Lksub1[j])
            l1.sort()
            l2.sort()
            if l1[0:k-2] == l2[0:k-2]:
                Ck_item = list_Lksub1[i] | list_Lksub1[j]
                # pruning: keep the candidate only if all its (k-1)-subsets are frequent
                if is_apriori(Ck_item, Lksub1):
                    Ck.add(Ck_item)
    return Ck


def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """
    Select the frequent itemsets Lk from the candidates Ck by deleting
    every candidate whose support is below min_support.
    """
    Lk = set()
    item_count = {}
    # count how many transactions contain each candidate itemset
    for t in data_set:
        for item in Ck:
            if item.issubset(t):
                if item not in item_count:
                    item_count[item] = 1
                else:
                    item_count[item] += 1
    t_num = float(len(data_set))
    for item in item_count:
        if (item_count[item] / t_num) >= min_support:
            Lk.add(item)
            support_data[item] = item_count[item] / t_num
    return Lk


def generate_L(data_set, k, min_support):
    '''
    Generate all frequent itemsets.
    :param data_set: a list of transactions; each transaction contains several items.
    :param k: maximum size of the frequent itemsets to generate.
    :param min_support: minimum support threshold
    :return: L, the list of frequent-itemset levels, and support_data, a dict {frequent itemset: support}
    '''
    support_data = {}
    C1 = create_C1(data_set)
    L1 = generate_Lk_by_Ck(data_set, C1, min_support, support_data)
    Lksub1 = L1.copy()
    L = []
    L.append(Lksub1)
    for i in range(2, k+1):
        Ci = create_Ck(Lksub1, i)
        Li = generate_Lk_by_Ck(data_set, Ci, min_support, support_data)
        Lksub1 = Li.copy()
        L.append(Lksub1)
    return L, support_data


def generate_big_rules(L, support_data, min_conf):
    """
    Generate association rules.
    Args:
        L: list of frequent-itemset levels
        support_data: dict {frequent itemset: support}
        min_conf: minimum confidence
    Returns:
        big_rule_list: a list of 3-tuples (antecedent, consequent, confidence).
    """
    big_rule_list = []
    sub_set_list = []
    for i in range(0, len(L)):
        for freq_set in L[i]:
            for sub_set in sub_set_list:
                if sub_set.issubset(freq_set):
                    # confidence = support(freq_set) / support(antecedent)
                    conf = support_data[freq_set] / support_data[freq_set - sub_set]
                    big_rule = (freq_set - sub_set, sub_set, conf)
                    if conf >= min_conf and big_rule not in big_rule_list:
                        # print(freq_set - sub_set, " => ", sub_set, "conf: ", conf)
                        big_rule_list.append(big_rule)
            sub_set_list.append(freq_set)
    return big_rule_list


if __name__ == "__main__":
    data_set = load_data_set()
    L, support_data = generate_L(data_set, k=3, min_support=0.2)  # minimum support
    big_rules_list = generate_big_rules(L, support_data, min_conf=0.6)  # minimum confidence
    for Lk in L:
        if not Lk:  # stop at the first empty level
            break
        print("=" * 50)
        print("frequent " + str(len(list(Lk)[0])) + "-itemsets\t\tsupport")
        print("=" * 50)
        for freq_set in Lk:
            print(freq_set, support_data[freq_set])
        print()
    print("Rules")
    for item in big_rules_list:
        print(item[0], "=>", item[1], "conf: ", item[2])
Results
==================================================
frequent 1-itemsets support
==================================================
frozenset({'i2'}) 0.7272727272727273
frozenset({'i1'}) 0.6363636363636364
frozenset({'i3'}) 0.6363636363636364
frozenset({'i4'}) 0.2727272727272727
==================================================
frequent 2-itemsets support
==================================================
frozenset({'i1', 'i2'}) 0.36363636363636365
frozenset({'i3', 'i2'}) 0.45454545454545453
frozenset({'i1', 'i3'}) 0.36363636363636365
Rules
frozenset({'i3'}) => frozenset({'i2'}) conf: 0.7142857142857143
frozenset({'i2'}) => frozenset({'i3'}) conf: 0.625
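As a quick check of the printed rules (a worked example using the support values shown above, not part of the original script), confidence(A => B) = support(A and B together) / support(A):

sup_i1, sup_i2, sup_i3 = 7/11, 8/11, 7/11     # single-item supports printed above
sup_i2_i3, sup_i1_i2 = 5/11, 4/11             # pair supports printed above
print(sup_i2_i3 / sup_i3)   # 0.714... = conf({'i3'} => {'i2'}), kept (>= 0.6)
print(sup_i2_i3 / sup_i2)   # 0.625    = conf({'i2'} => {'i3'}), kept (>= 0.6)
print(sup_i1_i2 / sup_i1)   # 0.571... = conf({'i1'} => {'i2'}), pruned (< 0.6)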
Linear Regression
Workflow
- Generate synthetic (study time, score) data from a quadratic function and append a bias column
- Shuffle the data and split it with 5-fold cross-validation
- On each fold, fit the parameters by batch gradient descent on the mean-squared-error cost, stopping when the validation cost converges or the epoch limit is reached (the update step is sketched below)
- Plot the training/validation cost curves and the fitted line for each fold, and return the parameters of the last fold
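The core of train() below is a single batch gradient-descent update on the MSE cost. A minimal sketch of that step (the helper name gradient_step is hypothetical, not from the script):

import numpy as np

def gradient_step(theta, X, y, gamma=1e-3):
    # one update on the MSE cost J(theta) = mean((X @ theta - y) ** 2) / 2
    residual = X @ theta - y            # (n, 1) prediction error
    grad = X.T @ residual / len(X)      # (d, 1) gradient of J w.r.t. theta
    return theta - gamma * grad         # theta <- theta - gamma * grad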
Source code
from matplotlib import pyplot as plt
import numpy as np
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import KFold
import os

os.chdir('./2_LR/')


def generate_data(N):
    import random
    student = []
    delta = (10 - 0.4) / N
    y = lambda x: int(((-5 / 8) * x ** 2) + (25 / 2) * x + 305 / 8)
    lt = []
    for i in range(N):
        learn_time = 0. + delta * i
        lt.append(learn_time)
        # print(learn_time)
        score = y(learn_time)
        # score = random.random(0, 5)
        # print(score)
        student.append(score)
    lt, student = np.array(lt)[:, np.newaxis], np.array(student)[:, np.newaxis]
    return lt, student


def predict(theta, x):
    '''
    theta: (d, 1)
    x: (n, d)
    '''
    # x = x.reshape((len(x), -1))
    # print(x.shape)
    return x.dot(theta).reshape((-1, 1))


def train(X, Y):
    def predict(theta, x):
        '''
        theta: (d, 1)
        x: (n, d)
        '''
        # x = x.reshape((len(x), -1))
        # print(x.shape)
        return x.dot(theta).reshape((-1, 1))

    def const_error(h, y):
        return h - y

    # mean squared error; h is the predicted y
    def cost(h, y, con):
        return (np.mean(con ** 2)) / 2

    def grad(x, con):
        return np.mean(con * x, axis=0, keepdims=True).transpose()

    gamma = 1e-3
    epoch = 10000
    epsilon = 1e-8
    XY = np.concatenate((X, np.ones((len(X), 1)), Y), axis=1)
    np.random.seed(2)
    np.random.shuffle(XY)
    X, Y = XY[:, :2], XY[:, 2:]
    kfold = KFold(5)
    thetas = []  # the parameters learned on each fold
    for j, (train_index, test_index) in enumerate(kfold.split(X)):
        # print(train_index)
        train_x, test_x, train_y, test_y = X[train_index], X[test_index], Y[train_index], Y[test_index]
        sc = []  # training cost per epoch
        vc = []  # validation cost per epoch
        theta = np.random.randn(2, 1)  # random initialization
        # print(theta)
        this_time_con = 10000  # previous validation cost
        # if j != 3:
        #     continue
        # print('?')
        for i in range(epoch):
            # print(i, train_x.shape)
            h = predict(theta, train_x)
            con = const_error(h, train_y)
            g = grad(train_x, con)
            pre_y = predict(theta, test_x)
            if gamma >= 500:  # note: gamma starts at 1e-3, so this decay branch never fires as written
                gamma *= 0.95
            theta = theta - gamma * g  # gradient-descent update
            # record the costs
            valdation_cost = cost(pre_y, test_y, const_error(pre_y, test_y))
            sc.append(cost(h, train_y, con))
            vc.append(valdation_cost)
            if abs(this_time_con - valdation_cost) <= epsilon:
                break
            else:
                this_time_con = valdation_cost
        plt.plot(np.arange(len(sc)), np.array(sc), label="training_cost")
        plt.legend()
        plt.plot(np.arange(len(vc)), np.array(vc), label="validating_cost")
        plt.legend()
        plt.xlabel("epoch")
        plt.ylabel("cost")
        plt.title(str(j) + 'time_cost.png')
        # print(i, valdation_cost)
        plt.text(i, valdation_cost, 'val_cost:\n(%d,%.3f)' % (i, valdation_cost), fontsize=8)
        # plt.show()
        plt.savefig(str(j) + 'time_cost.png')
        plt.clf()
        thetas.append(theta)
        plt.scatter(X[:, 0], Y, marker='x', c='red')
        plt.grid()
        x = np.arange(-0.2, 8, 0.1)[:, np.newaxis]
        x = np.concatenate((x, np.ones((len(x), 1))), axis=1)
        y = predict(theta, x)
        plt.plot(x[:, 0], y, c='blue')
        plt.title("%dtime_predict" % (j))
        plt.savefig("%dtime_predict" % (j))
        plt.clf()
    return thetas[-1]


if __name__ == '__main__':
    learn_time, score = generate_data(100)
    theta = train(learn_time, score)
    plt.plot(learn_time, score)
    x = np.arange(0, 10, 0.1)[:, np.newaxis]
    x = np.concatenate((x, np.ones((len(x), 1))), axis=1)
    plt.plot(x[:, 0], predict(theta, x))
    # plt.show()
    plt.savefig('curve.tiff')
Results
KNN Classification on a UCI Dataset
Workflow
step.1 --- initialize the distance to the maximum value
step.2 --- compute the distance dist between the unknown sample and each training sample
step.3 --- find the maximum distance maxdist among the current K nearest samples
step.4 --- if dist is smaller than maxdist, take that training sample as one of the K nearest neighbours
step.5 --- repeat steps 2, 3 and 4 until the distances to all training samples have been computed
step.6 --- count how often each class label occurs among the K nearest neighbours
step.7 --- take the most frequent class label as the label of the unknown sample
A minimal from-scratch sketch of these steps follows this list; the source code itself uses scikit-learn.
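The sketch below (illustrative only, the helper name knn_predict is hypothetical) computes all distances first and then keeps the K smallest, which has the same effect as maintaining maxdist incrementally as in steps 1 to 5:

import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x_unknown, K=3):
    # steps 1-5: distance from the unknown sample to every training sample
    dists = np.linalg.norm(X_train - x_unknown, axis=1)
    # keep the indices of the K nearest training samples
    nearest = np.argsort(dists)[:K]
    # steps 6-7: majority vote over the class labels of the K neighbours
    return Counter(y_train[nearest]).most_common(1)[0][0]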
Source code
# coding: utf-8
# Author:

# ## Show figures inline (notebook only)
# In[7]:
# get_ipython().magic('matplotlib inline')

# ## Imports
# In[17]:
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# ## Load the data
# In[2]:
iris = load_iris()
X = iris.data
y = iris.target

# ## Visualize two feature dimensions at a time
# In[5]:
X_sepal = X[:, :2]
plt.scatter(X_sepal[:, 0], X_sepal[:, 1], c=y, cmap=plt.cm.gnuplot)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

# In[6]:
X_petal = X[:, 2:4]
plt.scatter(X_petal[:, 0], X_petal[:, 1], c=y, cmap=plt.cm.gnuplot)
plt.xlabel('Petal length')
plt.ylabel('Petal width')

# ## Initialize the classifier
# ### Nearest neighbour (K=1)
# In[18]:
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X, y)
y_pred = knn1.predict(X)
print((metrics.accuracy_score(y, y_pred)))
# Why is the training accuracy 1 when K=1? KNN looks up the nearest observation in the
# training set, so the model finds the very same observation it was trained on. In other
# words, KNN has memorized the training set, because we test on the same data we trained on.

# ### Split X and y into training and test sets
# In[15]:
from sklearn.model_selection import train_test_split  # (sklearn.cross_validation in very old scikit-learn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
# Can we find a better value of K?

# In[19]:
# try K = 1 .. 25 and record the test accuracy
k_range = list(range(1, 26))
test_accuracy = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    test_accuracy.append(metrics.accuracy_score(y_test, y_pred))

# In[20]:
plt.plot(k_range, test_accuracy)
plt.xlabel("Value of K for KNN")
plt.ylabel("Testing Accuracy")
# K = 9 looks like a good choice.

# ## Use cross-validation instead
# In[21]:
from sklearn.model_selection import KFold  # (sklearn.cross_validation in very old scikit-learn)
import numpy as np

def cv_estimate(k, kfold=5):
    cv = KFold(n_splits=kfold)
    clf = KNeighborsClassifier(n_neighbors=k)
    score = 0
    for train, test in cv.split(X):
        clf.fit(X[train], y[train])
        score += clf.score(X[test], y[test])
        # print(clf.score(X[test], y[test]))
    score /= kfold
    return score

# In[22]:
k_range = list(range(1, 26))
test_accuracy = []
for k in k_range:
    test_accuracy.append(cv_estimate(k, 5))

# In[23]:
plt.plot(k_range, test_accuracy)
plt.xlabel("Value of K for KNN")
plt.ylabel("Average Accuracy of Kfold CV")
# So any choice of K up to about 10 gives good results under K-fold cross-validation.
Results
Decision Tree
Workflow
- Build the data set
- Compute the information entropy of the data set (see the worked example after this list)
- Iterate over all features and choose the one that gives the smallest entropy after splitting (i.e. the largest information gain); this is the best splitting feature
- Split the data set on the feature found in the previous step and remove that feature from the feature list
- Recurse back to the third step and keep splitting until the classification is finished
- Classify with the resulting decision tree and return the result
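For reference, a worked entropy calculation on the toy data set used below, which has 8 samples, 3 labelled '男' and 5 labelled '女' (illustrative snippet, not part of the original script):

from math import log
# base entropy before any split: H = -sum(p * log2(p)) over the classes
entropy = -(3/8) * log(3/8, 2) - (5/8) * log(5/8, 2)
print(round(entropy, 3))   # 0.954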
Source code
from math import log
import operator


def calcShannonEnt(dataSet):  # compute the Shannon entropy of the data set
    numEntries = len(dataSet)  # number of samples
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the last field of each row is the class label
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1  # count the classes and the size of each class
    shannonEnt = 0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # probability of a single class
        shannonEnt -= prob * log(prob, 2)  # accumulate -p*log2(p) over the classes
    return shannonEnt


def createDataSet1():  # build the toy data set
    dataSet = [['长', '粗', '男'],
               ['短', '粗', '男'],
               ['短', '粗', '男'],
               ['长', '细', '女'],
               ['短', '细', '女'],
               ['短', '粗', '女'],
               ['长', '粗', '女'],
               ['长', '粗', '女']]
    labels = ['头发', '声音']  # two features: hair length, voice pitch
    return dataSet, labels


def splitDataSet(dataSet, axis, value):  # the subset whose feature `axis` equals `value`
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):  # choose the best splitting feature
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)  # entropy before the split
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)  # entropy after splitting on this feature
        infoGain = baseEntropy - newEntropy  # information gain of the split
        if (infoGain > bestInfoGain):  # the feature with the largest reduction in entropy is the best split
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):  # majority vote: e.g. 2 male and 1 female in a leaf is classified as male
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # class labels: male or female
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # choose the best feature
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # the tree is stored as a nested dict
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


if __name__ == '__main__':
    dataSet, labels = createDataSet1()  # build the toy data
    print(createTree(dataSet, labels))  # print the decision-tree model
Results
{'声音': {'粗': {'头发': {'长': '女', '短': '男'}}, '细': '女'}}
Naive_Bayes
Workflow
- Read the weather/exercise table and estimate the prior P(c) of each class from the label column
- Estimate the likelihood P(f_i | c) of every feature: Gaussian parameters (mean, variance) for numeric columns, Laplace-smoothed frequencies for categorical columns
- To predict a sample, multiply the prior by the per-feature likelihoods for each class and pick the class with the largest product (see the sketch below)
- Report the predicted class with its normalized posterior, and compute the overall accuracy on the data set
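A minimal sketch of the decision rule the class below implements: pick the class c that maximizes P(c) multiplied by the product of the per-feature likelihoods P(f_i | c). The helper names here are hypothetical, and the script itself integrates the Gaussian density over a unit interval around x rather than using the point density:

import math

def gaussian_pdf(x, mean, var):
    # class-conditional Gaussian density for a numeric feature
    return math.exp(-(x - mean) ** 2 / (2 * var)) / math.sqrt(2 * math.pi * var)

def naive_bayes_argmax(priors, likelihoods_per_class):
    # priors: {class: P(class)}; likelihoods_per_class: {class: [P(f_i | class), ...]}
    best_class, best_score = None, 0.0
    for c, p_c in priors.items():
        score = p_c
        for p in likelihoods_per_class[c]:
            score *= p
        if score > best_score:
            best_class, best_score = c, score
    return best_class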
Source code
import pandas as pd
import numpy as np
import math
from functools import reduce
# from scipy import stats

'''
The weather factors include temperature, humidity, wind and so on. Given the data, use the
Bayes rule to learn a classifier that outputs the probability of a person exercising or not
under the given weather.
# prior P(c); there are two classes c here
# evidence (joint probability) P(f1, f2, ...) is the same for every class, so it can be ignored
# likelihood P(f1, f2, ... | c) = P(f1 | c) * P(f2 | c) * ...
## continuous features are assumed to be Gaussian
'''


class Naive_Bayes():
    def __init__(self):
        self.data = pd.read_csv('3_data.csv')
        self.P_category()
        self.likelihood()

    def category_extract(self, values):
        # yield each distinct value together with the row indices where it occurs
        labels = set(values)
        self.__labels_Lenth = len(labels)
        for s in labels:
            indices = np.where(values == s)[0]
            yield s, indices

    def P_category(self):
        # prior probabilities of the classes
        self.prior = dict()
        L = len(self.data['运动'].values)
        for s, indices in self.category_extract(self.data['运动'].values):
            self.prior[s] = len(indices) / L
        # {'不适合': 0.35714285714285715, '适合': 0.6428571428571429}

    def likelihood(self):
        # estimate the per-feature likelihoods for every class
        # two kinds of columns: object (categorical) and int (numeric)
        self.Pfeature = dict()  # likelihood estimates
        for info in self.data._info_axis.values[:-1]:
            self.Pfeature[info] = dict()
            for c, indices in self.category_extract(self.data['运动'].values):
                temp_values = self.data[info].values[indices]  # values of column <info> on the rows of class <c>
                if (self.data[info].dtype == np.int64):
                    # numeric column: assume a Gaussian distribution
                    u = np.mean(temp_values)
                    theta = np.var(temp_values)
                    self.Pfeature[info][c] = [u, theta]  # Gaussian parameters [mean, variance] given class c
                elif (self.data[info].dtype == object):
                    # text column: discrete, estimate Laplace-smoothed frequencies
                    self.Pfeature[info][c] = dict()
                    L = len(temp_values) + self.__labels_Lenth
                    for s, inds in self.category_extract(temp_values):
                        self.Pfeature[info][c][s] = (len(inds) + 1) / L  # likelihood of value s of feature info given class c
                else:
                    print(self.data[info].dtype, 'gg')
                    exit()

    def __gauss(self, x, u, theta):
        # print(x, u, theta, 1 / math.sqrt(2 * math.pi * theta ** 2) * math.exp(-(x - u) ** 2 / (2 * theta ** 2)))
        # a = 1 / math.sqrt(2 * math.pi * theta ** 2) * math.exp(-(x - u) ** 2 / (2 * theta ** 2))
        # b = stats.norm.pdf(x, u, theta)
        # print(a, b)
        # integrate the Gaussian density over a unit interval around x
        x = np.arange(x - 0.5, x + 0.5, 0.1)
        return np.sum(1 / math.sqrt(2 * math.pi * theta ** 2) * np.exp(-(x - u) ** 2 / (2 * theta ** 2)))

    def predict(self, n=-1):
        x = list(self.data[info].values[n] for info in self.data._info_axis.values[:-1])
        y = self.data[self.data._info_axis.values[-1]].values[n]
        max_p = [0, 0, 0]  # [best unnormalized posterior, its class, sum over all classes]
        for c, d in self.prior.items():
            temp_possibility = []  # factors of the product prior * likelihoods
            temp_possibility.append(d)
            for i, info in enumerate(self.data._info_axis.values[:-1]):  # feature columns only
                # print(info, c, x[i])
                if (self.data[info].dtype == np.int64):
                    temp_possibility.append(self.__gauss(x[i], self.Pfeature[info][c][0], self.Pfeature[info][c][1]))
                elif (self.data[info].dtype == object):
                    try:
                        temp_possibility.append(self.Pfeature[info][c][x[i]])
                    except KeyError:
                        temp_possibility.append(0.00001)
            # print(c, temp_possibility)
            # temp = abs(reduce(lambda a, b: a + b, map(math.log, temp_possibility)))
            temp = reduce(lambda a, b: a * b, temp_possibility)
            max_p[2] += temp
            if temp > max_p[0]:
                max_p[:2] = temp, c
        # print(self.Pfeature)
        '''
        {'天气':
            {'不适合': {'有雨': 0.42857142857142855, '晴': 0.5714285714285714},
             '适合': {'有雨': 0.36363636363636365, '多云': 0.45454545454545453, '晴': 0.2727272727272727}},
         '温度':
            {'不适合': [74.599999999999994, 49.839999999999996],
             '适合': [73.0, 33.777777777777779]},
         '湿度':
            {'不适合': [84.0, 74.0],
             '适合': [78.222222222222229, 86.839506172839506]},
         '风况':
            {'不适合': {'有': 0.5714285714285714, '无': 0.42857142857142855},
             '适合': {'有': 0.36363636363636365, '无': 0.6363636363636364}}}
        '''
        print(x, max_p[1], max_p[0] / max_p[2])
        return max_p[1]


if __name__ == '__main__':
    myNB = Naive_Bayes()
    # myNB.predict(-4)
    y = list(map(myNB.predict, list(range(14))))
    # print(y)
    # print(np.where(y == myNB.data['运动'].values)[0])
    print(len(np.where(y == myNB.data['运动'].values)[0]) / len(y))
Results
['晴', 85, 85, '无'] 适合 0.605515228032
['晴', 80, 90, '有'] 不适合 0.597763996605
['多云', 83, 78, '无'] 适合 0.999993280941
['有雨', 70, 96, '无'] 适合 0.738902944045
['有雨', 68, 80, '无'] 适合 0.740164275592
['有雨', 65, 70, '有'] 适合 0.550906281645
['多云', 64, 65, '有'] 适合 0.99998485259
['晴', 72, 95, '无'] 适合 0.614379338515
['晴', 69, 70, '无'] 适合 0.618965026052
['有雨', 75, 80, '无'] 适合 0.74028592022
['晴', 75, 70, '有'] 不适合 0.589771190225
['多云', 72, 90, '有'] 适合 0.999984648989
['多云', 81, 75, '无'] 适合 0.999993369057
['有雨', 71, 80, '有'] 适合 0.550487083686
0.6428571428571429
K-Means Image Segmentation
Workflow
- Read the image, flatten it to an (N, 3) array of pixel values, and initialize C = 3 cluster centers
- Assignment step: label every pixel with its nearest cluster center (Euclidean distance)
- Update step: move every center to the mean of the pixels assigned to it
- Repeat the two steps for a fixed number of iterations (see the sketch below), then recolor each pixel with its center and save the segmented image
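A minimal sketch of one K-Means iteration as applied to the flattened pixel array (the helper name kmeans_step is hypothetical and it assumes no cluster ends up empty):

import numpy as np

def kmeans_step(data, centers):
    # assignment step: each point goes to its nearest center (Euclidean distance)
    dists = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)  # (N, C)
    labels = np.argmin(dists, axis=1)
    # update step: each center moves to the mean of the points assigned to it
    new_centers = np.array([data[labels == k].mean(axis=0) for k in range(len(centers))])
    return labels, new_centers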
Source code
# Image segmentation based on the K-means algorithm
import cv2
import numpy as np

# read the image
start = 0
img0 = cv2.imread('./kmean/' + str(start) + '.tiff')
zz = np.load('./kmean/zz.npy')  # previously saved centers (overridden by the hard-coded centers inside kmean below)
C = 3  # number of clusters


def kmean(img0):
    img = np.copy(img0)
    img = np.array(img, dtype=np.float32)
    shape = img.shape[:2]
    # zz = np.array([[60, 80, 100], [140, 160, 180], [220, 240, 260]])  # cluster centers
    zz = np.array([[107.42873407, 165.75729793, 149.6303364],
                   [51.03919928, 54.9071066, 45.18418758],
                   [304.08849809, 230.56291292, 161.19507833]])  # cluster centers
    # zz = np.tile(z, [1, 3])
    # zz = np.random.randint(0, 255, (3, 3))
    # print(zz)
    L = shape[0] * shape[1]  # number of pixels
    kinds = np.empty((L,), dtype=np.uint8)  # cluster label of every pixel
    data = np.reshape(img, (L, 3))  # flatten the image to an (L, 3) array of BGR values
    # z_last = zz.copy()  # previous centers
    for time in range(start, 5 + start):
        print(time)
        # assignment step: label every pixel with its nearest center
        for i in range(L):
            mdzz = np.linalg.norm(data[i] - zz, axis=1)
            # print(mdzz)
            min_mdzz_index = np.argmin(mdzz)
            # print(min_mdzz_index)
            kinds[i] = min_mdzz_index
        # print(kinds)
        # update step: move each center to the mean of its pixels
        for k in range(C):
            tmp_where = np.where(kinds == k)[0]
            # print('tmp_where', tmp_where)
            # print(data[tmp_where])
            zz[k] = np.mean(data[tmp_where], axis=0)
        print('zz\n', zz)
    # recolor every pixel with its cluster center
    for k in range(C):
        tmp_where = np.where(kinds == k)[0]
        data[tmp_where] = zz[k]
    # reshape back to an image
    temp = np.reshape(data, (shape[0], shape[1], 3))
    temp = np.uint8(temp)
    print(temp.shape)
    print('saving %d.tiff' % (time + 1))
    # temp = cv2.cvtColor(temp, cv2.COLOR_GRAY2BGR)
    cv2.imwrite('./kmean/' + str(time + 1) + '.tiff', temp)
    np.save('./kmean/zz.npy', zz)
    return kinds


if __name__ == '__main__':
    kinds = kmean(img0)
    gray = [0, 178, 255]
    img_gray = cv2.cvtColor(img0, cv2.COLOR_BGR2GRAY)
    shape = img_gray.shape
    img_gray = np.reshape(img_gray, (img_gray.size,))
    for k in range(C):
        tmp_where = np.where(kinds == k)[0]
        img_gray[tmp_where] = gray[k]
    img_gray = np.reshape(img_gray, shape)
    cv2.imwrite('./kmean/gray.tiff', img_gray)