1. Affinity Analysis

# -*- coding: utf-8 -*-

“”"
Created on Tue Sep 18 08:03:55 2018

@author: asus
“”"
# 1 Affinity analysis
import numpy as np
dataset_filename = "affinity_dataset.txt"
x = np.loadtxt(dataset_filename)

print(x[:5])

features = ['bread', 'milk', 'cheese', 'apple', 'bananas']

# By checking the value of sample[3] in each transaction we can tell whether that customer bought apples.
num_apple_purchases = 0
for sample in x:
    if sample[3] == 1:
        num_apple_purchases += 1
print("{0} people bought apples".format(num_apple_purchases))

# Rule statistics: count how often each rule holds or is violated
from collections import defaultdict
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurances = defaultdict(int)

for sample in x:
    for premise in range(4):  # premise: the customer bought a particular product
        if sample[premise] == 0:  # skip individuals that do not satisfy the premise
            continue
        num_occurances[premise] += 1  # premise satisfied, increment its count
        for conclusion in range(len(features)):
            if premise == conclusion:  # skip rules where premise and conclusion are the same
                continue
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1  # the rule holds
            else:
                invalid_rules[(premise, conclusion)] += 1  # the rule is violated

support = valid_rules  # support: the number of times each rule is valid
confidence = defaultdict(float)  # confidence of each rule
for premise, conclusion in valid_rules.keys():
    rule = (premise, conclusion)
    confidence[rule] = valid_rules[rule] / num_occurances[premise]
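# Illustrative sanity check (not part of the book's listing): for any rule, the valid and
# invalid counts add up to the number of times the premise occurred, so the confidence can
# equally be read as valid / (valid + invalid).
example_rule = (1, 3)  # milk -> apple
assert (valid_rules[example_rule] + invalid_rules[example_rule]
        == num_occurances[example_rule[0]])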

# Print each rule together with its support and confidence
def print_rule(premise, conclusion, support, confidence, features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(
        premise_name, conclusion_name))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))

premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

# 1.3.5 Sorting to find the best rules
from operator import itemgetter
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
# Using itemgetter() as the key lets us sort nested structures such as these (key, value) pairs.
# itemgetter(1) sorts by each item's value (here the support); reverse=True gives descending order.
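# A minimal illustration of this sorting idiom on toy data (not part of the dataset):
toy_counts = {'a': 3, 'b': 7, 'c': 1}
print(sorted(toy_counts.items(), key=itemgetter(1), reverse=True))
# -> [('b', 7), ('a', 3), ('c', 1)]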

# Print the five rules with the highest support
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_support[index][0]
    print_rule(premise, conclusion, support, confidence, features)

# The rules with the highest confidence
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print_rule(premise, conclusion, support, confidence, features)

# 1.5.1 Preparing the dataset
# Classification with the Iris dataset

from sklearn.datasets import load_iris
dataset = load_iris()
x = dataset.data
y = dataset.target
print(dataset.DESCR)

# Compute the mean of each feature
attribute_means = x.mean(axis=0)
# Discretise the continuous features: turn each continuous value into a category (0 or 1)
x_d = np.array(x >= attribute_means, dtype='int')
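# A small illustration of this thresholding step on toy values (not the Iris data):
toy = np.array([[1.0, 5.0],
                [3.0, 1.0]])
print(np.array(toy >= toy.mean(axis=0), dtype='int'))
# column means are [2.0, 3.0], so this prints [[0, 1], [1, 0]]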

# 1.5.2 Implementing the OneR algorithm
# It classifies an individual according to which class is most likely among training samples
# that share the same feature value. OneR is short for "One Rule": we pick the single feature
# (out of the four) that classifies best and use it alone as the basis for prediction.
from collections import defaultdict
from operator import itemgetter
# Parameters: the dataset, the array of class labels, the index of the chosen feature, and a feature value
def train_feature_value(x, y_true, feature_index, value):
    """
    Iterate over every sample in the dataset (each representing one individual) and count
    how often individuals with the given feature value appear in each class.
    """
    class_counts = defaultdict(int)
    for sample, y in zip(x, y_true):
        if sample[feature_index] == value:
            class_counts[y] += 1
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1),
                                 reverse=True)
    most_frequent_class = sorted_class_counts[0][0]

    # Compute the error: samples with this feature value that belong to any other class
    incorrect_predictions = [class_count for class_value, class_count
                             in class_counts.items()
                             if class_value != most_frequent_class]
    error = sum(incorrect_predictions)
    return most_frequent_class, error
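# Illustrative call (my own example, assuming x_d and y defined above): for feature 0 and
# value 1, report the most frequent class and how many matching samples disagree with it.
example_class, example_error = train_feature_value(x_d, y, 0, 1)
print(example_class, example_error)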

# For a given feature, iterate over each of its values and use the function above to get the
# prediction and the error for each value; summing those errors gives the feature's total error.
def train_on_feature(x, y_true, feature_index):
    values = set(x[:, feature_index])
    predictors = {}
    errors = []
    for current_value in values:
        most_frequent_class, error = train_feature_value(
            x, y_true, feature_index, current_value)
        predictors[current_value] = most_frequent_class
        errors.append(error)
    total_error = sum(errors)
    return predictors, total_error
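# Illustrative single-feature call (again assuming x_d and y from above): predictors maps
# each discretised value (0 or 1) to its most frequent class, alongside the total error.
example_predictors, example_total_error = train_on_feature(x_d, y, 0)
print(example_predictors, example_total_error)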

# 1.5.3 Testing the algorithm
# Function for splitting the dataset
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older scikit-learn
# This function randomly splits the dataset into two parts (by default 25% of the data becomes
# the test set) so that the evaluation results are trustworthy.
xd_train, xd_test, y_train, y_test = train_test_split(x_d, y, random_state=14)
# xd_train is the training set and xd_test the test set; y_train and y_test hold the class labels for each.
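# Quick illustration of the resulting split sizes (my own check, not from the book's listing):
print(xd_train.shape, xd_test.shape)  # roughly a 75% / 25% split of the 150 Iris samples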

# Using only the training set, iterate over every feature in the dataset, train a predictor
# with the train_on_feature() function defined above, and record its error.
all_predictors = {}
errors = {}
for feature_index in range(xd_train.shape[1]):
    predictors, total_error = train_on_feature(
        xd_train, y_train, feature_index)
    all_predictors[feature_index] = predictors
    errors[feature_index] = total_error
# Find the feature with the lowest error and use it as the single classification rule.
best_feature, best_error = sorted(errors.items(), key=itemgetter(1))[0]
# Sort the predictors to find the best feature and build the model.
model = {'variable': best_feature,
         'predictor': all_predictors[best_feature]}
# The model is a dictionary with two elements: the feature used for classification and the
# predictor. With the model we can classify previously unseen data by its feature value:
#variable = model['variable']
#predictor = model['predictor']
#prediction = predictor[int(sample[variable])]

def predict(x_test, model):
    variable = model['variable']
    predictor = model['predictor']
    y_prediction = np.array([predictor[int(sample[variable])] for sample in x_test])
    return y_prediction

y_predicted = predict(xd_test, model)
# Compare the predictions with the true classes to obtain the accuracy
accuracy = np.mean(y_predicted == y_test) * 100
print("The test accuracy is {:.1f}%".format(accuracy))
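# For context, a naive baseline that always predicts the most frequent training class
# (my own illustrative comparison, not part of the book's listing):
baseline_accuracy = np.mean(y_test == np.bincount(y_train).argmax()) * 100
print("Baseline accuracy is {:.1f}%".format(baseline_accuracy))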
