Python数据挖掘学习——鸢尾花分类、OneR算法

《Python数据挖掘入门与实践》第一章内容,实现一个简单的分类处理,实现OneR算法。

OneR算法的思路很简单,它根据已有的数据中,具有相同特征值的个体最可能属于哪个类别进行分类。OneR也就是One Rule的缩写,即“一条规则”,表示我们只选取特征中分类效果最好的一个作为分类的依据。虽然这个算法十分的简单,但是在很多真实数据集上却有着不凡的表现。算法首先遍历每个特征的每个取值,对于每个特征值,统计它在各个类别中出现的次数,找到它出现次数最多的类别,并统计它在其他类别中出现的次数。接着,我们计算每个特征的错误率——把该特征的各个取值的错误率相加。最后选取错误率最低的特征作为唯一的分类标准,也就是“One Rule”。

《Python数据挖掘入门与实践》一书中的代码在我实验的过程中出现了一些小问题,下面给出的是经过修改整合之后、可以正常运行的代码。

实例中采用鸢尾花数据集,该数据集可以直接从sklearn包中载入(sklearn.datasets.load_iris)。

代码如下:

"""
分类问题的简单实例
使用鸢尾花数据集
实现OneR算法
"""
from sklearn.model_selection import train_test_split #分割数据集成为训练集和测试集(默认是25%)
from sklearn.datasets import load_iris
import numpy as np
from collections import defaultdict
from operator import itemgetter


# Load the iris data: X holds the feature matrix (an ndarray), y the class labels.
dataset = load_iris()
X, y = dataset.data, dataset.target
n_samples, n_features = X.shape
# Uncomment to read the dataset description:
# print(dataset.DESCR)

# Per-feature mean (axis=0 averages down each column); used as the
# threshold for discretization.
attribute_means = X.mean(axis=0)
assert attribute_means.shape == (n_features,)
# Discretize the continuous data: 1 where a value is at or above its
# feature's mean, 0 otherwise.
X_d = (X >= attribute_means).astype('int')
# print(attribute_means)
# print(X_d)

# OneR algorithm implementation follows.
def train_feature_value(X, y_true, feature_index, value):
    """Build the OneR rule for one value of one feature.

    Considers every sample whose entry at ``feature_index`` equals ``value``
    and tallies how often each class label occurs among them.

    Parameters
    ----------
    X : iterable of indexable samples
        The dataset; each item is one sample.
    y_true : iterable
        Class label of each sample, aligned with ``X``.
    feature_index : int
        Index of the feature being examined.
    value
        The feature value this rule applies to.

    Returns
    -------
    most_frequent_class
        The label seen most often among the matching samples. On a tie the
        label encountered first wins, because the sort below is stable.
    error : int
        Number of matching samples that carry any other label.

    Raises
    ------
    IndexError
        If no sample has ``value`` at ``feature_index``.
    """
    # Labels of the samples that carry the requested feature value.
    matching_labels = [label for row, label in zip(X, y_true)
                       if row[feature_index] == value]

    # Count occurrences per label, keeping first-seen order.
    tally = {}
    for label in matching_labels:
        tally[label] = tally.get(label, 0) + 1

    # Rank labels by descending count; the head of the list is the prediction.
    ranked = list(tally.items())
    ranked.sort(key=itemgetter(1), reverse=True)
    most_frequent_class = ranked[0][0]

    # Every matching sample with a different label is a misprediction.
    error = sum(count for label, count in tally.items()
                if label != most_frequent_class)
    return most_frequent_class, error
def train(X, y_true, feature):
    """Fit the OneR rules for one feature and total their training errors.

    Parameters
    ----------
    X : array [n_samples, n_features]
        Two-dimensional dataset; each row is a sample, each column a feature.

    y_true : array [n_samples,]
        Class values aligned with ``X``: ``y_true[i]`` is the class of ``X[i]``.

    feature : int
        Index of the feature to evaluate; must satisfy
        ``0 <= feature < n_features``.

    Returns
    -------
    predictors : dict
        Maps each distinct value found in the feature's column to the class
        predicted for samples having that value.

    total_error : int
        Sum, over all distinct values, of the error counts reported by
        ``train_feature_value`` (i.e. a count of mispredicted training
        samples, not a ratio).
    """
    # Only the column count is needed, to validate the feature index.
    _, n_features = X.shape
    assert 0 <= feature < n_features

    predictors = {}
    total_error = 0
    # One rule per distinct value appearing in this feature's column.
    for current_value in set(X[:, feature]):
        predicted_class, n_wrong = train_feature_value(
            X, y_true, feature, current_value)
        predictors[current_value] = predicted_class
        total_error += n_wrong
    return predictors, total_error

# Split the discretized data into training and testing sets. Fixing
# random_state makes the split reproducible: every run uses the same
# random state and therefore yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=14)
# Report the number of samples. Formatting ``.shape`` directly would print a
# tuple such as "(n,)" rather than the count, so take its first entry.
print("there are {} training samples".format(y_train.shape[0]))
print("there are {} testing samples".format(y_test.shape[0]))

# Train on the training set only: one (predictors, error) pair per feature.
all_predictors = {variable: train(X_train, y_train, variable)
                  for variable in range(X_train.shape[1])}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}
# Pick the feature with the lowest training error; on a tie the feature
# that comes first (lowest index) wins, as with a stable ascending sort.
best_variable, best_error = min(errors.items(), key=itemgetter(1))
print("The best model is based on variable {0} and has error {1:.2f}".format(best_variable, best_error))

# Choose the best model. It stores the feature used for classification and
# the value -> class mapping learned for that feature, and can be used to
# classify unseen data.
model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
# print(model)
# Example: classifying a single sample with this model
#     prediction = model['predictor'][int(sample[model['variable']])]

#多条数据的预测
# Predict the class of several samples at once.
def predict(X_test, model):
    """Classify every sample in ``X_test`` with a trained OneR model.

    Parameters
    ----------
    X_test : iterable of indexable samples
        Samples to classify.
    model : dict
        Must provide ``'variable'`` (index of the feature the rule uses) and
        ``'predictor'`` (mapping from feature value to predicted class).

    Returns
    -------
    numpy.ndarray
        One predicted class per sample, in the order of ``X_test``.

    Raises
    ------
    KeyError
        If a sample's feature value (converted to ``int``) is not a key of
        the predictor mapping.
    """
    feature_idx = model['variable']
    lookup = model['predictor']
    labels = []
    for row in X_test:
        # The feature value is cast to int before being used as the key.
        labels.append(lookup[int(row[feature_idx])])
    return np.array(labels)


# Classify the held-out samples and show the raw predictions.
y_predict = predict(X_test, model)
print(y_predict)
# Accuracy = share of predictions equal to the true label, as a percentage.
is_correct = y_predict == y_test
accuracy = np.mean(is_correct) * 100
print("the test accuracy is {:.1f}%".format(accuracy))


 

你可能感兴趣的:(Python数据挖掘)