decision_tree: source code implementation:
# coding:utf-8
from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn.externals.six import StringIO

# read the csv data
data = open(r'F:\study\code\python\machine_learning\decision_tree\AllElectronics.csv', 'rb')
# csv.reader reads the file line by line;
# each row is returned as a list whose elements are strings.
reader = csv.reader(data)
headers = reader.next()
# print(headers)

# featureList holds the feature information of every row
featureList = []
# labelList holds the value of the last column (the class label)
labelList = []

for row in reader:
    # each row corresponds to one line of the table
    # print(row)
    # the last value of each row is the class label
    labelList.append(row[len(row) - 1])
    # rowDict holds the values from age to credit_rating
    rowDict = {}
    # iterate over the feature values of the row (skip the RID column and the label)
    for i in range(1, len(row) - 1):
        # print(row[i])
        # the key of rowDict is the column header, the value is the cell content
        rowDict[headers[i]] = row[i]
    featureList.append(rowDict)

# featureList is a list of dictionaries; this form is convenient because
# sklearn's DictVectorizer can turn it directly into a 0/1 (dummy variable) matrix
print(featureList)

# instantiate a DictVectorizer object
vec = DictVectorizer()
# convert the list of dicts into the dummy-variable format we need
dummyX = vec.fit_transform(featureList).toarray()
print("dummyX:" + str(dummyX))
print(vec.get_feature_names())
print("labelList:" + str(labelList))

# sklearn also provides a way to turn class labels into 0/1 format
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY:" + str(dummyY))

# build the decision tree with sklearn
# criterion='entropy' means information gain (entropy) is used as the splitting criterion
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(dummyX, dummyY)
print("clf:" + str(clf))

# visualize the decision tree by exporting it to a dot file
with open("allElectronicInformationGainOri.dot", "w") as f:
    f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)

# the first row of the original table
oneRowX = dummyX[0, :]
print("oneRowX:" + str(oneRowX))

# modify the age of the first row (youth -> middle_aged) and use it as a test sample;
# copy() is used so that dummyX itself is not modified
newRowX = oneRowX.copy()
newRowX[0] = 1
newRowX[2] = 0
print("newRowX:" + str(newRowX))

# predict expects a 2D array, so the single sample is wrapped in a list
predictedY = clf.predict([newRowX])
print("predictedY: " + str(predictedY))
csv_file:
dot file:
pdf file:
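The .dot file exported above can be rendered to a PDF with Graphviz. A minimal sketch, assuming the Graphviz `dot` command-line tool is installed and on the PATH:

# coding:utf-8
import subprocess

# render the exported dot file to a PDF with Graphviz's dot tool
subprocess.check_call(["dot", "-Tpdf",
                       "allElectronicInformationGainOri.dot",
                       "-o", "allElectronicInformationGainOri.pdf"])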
K_means algorithm:
Source code (hierarchical clustering, adapted from Programming Collective Intelligence):
# coding:utf-8
from numpy import *

"""
Code for hierarchical clustering, modified from
Programming Collective Intelligence by Toby Segaran
(O'Reilly Media 2007, page 33).
"""


# node of the cluster tree
class cluster_node:
    def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):
        self.left = left
        self.right = right
        self.vec = vec
        self.id = id
        self.distance = distance
        self.count = count  # only used for weighted average


# two different distance measures
def L2dist(v1, v2):
    return sqrt(sum((v1 - v2) ** 2))


def L1dist(v1, v2):
    return sum(abs(v1 - v2))


# def Chi2dist(v1,v2):
#     return sqrt(sum((v1-v2)**2))


def hcluster(features, distance=L2dist):
    # cluster the rows of the "features" matrix
    distances = {}
    currentclustid = -1

    # clusters are initially just the individual rows
    clust = [cluster_node(array(features[i]), id=i) for i in range(len(features))]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)

        # loop through every pair looking for the smallest distance
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                # distances is the cache of distance calculations
                if (clust[i].id, clust[j].id) not in distances:
                    distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
                d = distances[(clust[i].id, clust[j].id)]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)

        # calculate the average of the two clusters
        mergevec = [(clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0
                    for i in range(len(clust[0].vec))]

        # create the new cluster
        newcluster = cluster_node(array(mergevec), left=clust[lowestpair[0]],
                                  right=clust[lowestpair[1]],
                                  distance=closest, id=currentclustid)

        # cluster ids that weren't in the original set are negative
        currentclustid -= 1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]


def extract_clusters(clust, dist):
    # extract list of sub-tree clusters from the hcluster tree with distance < dist
    clusters = {}
    if clust.distance < dist:
        # we have found a cluster subtree
        return [clust]
    else:
        # check the right and left branches
        cl = []
        cr = []
        if clust.left != None:
            cl = extract_clusters(clust.left, dist=dist)
        if clust.right != None:
            cr = extract_clusters(clust.right, dist=dist)
        return cl + cr


def get_cluster_elements(clust):
    # return ids for elements in a cluster sub-tree
    if clust.id >= 0:
        # positive id means that this is a leaf
        return [clust.id]
    else:
        # check the right and left branches
        cl = []
        cr = []
        if clust.left != None:
            cl = get_cluster_elements(clust.left)
        if clust.right != None:
            cr = get_cluster_elements(clust.right)
        return cl + cr


def printclust(clust, labels=None, n=0):
    # indent to make a hierarchy layout
    for i in range(n):
        print ' ',
    if clust.id < 0:
        # negative id means that this is a branch
        print '-'
    else:
        # positive id means that this is an endpoint
        if labels == None:
            print clust.id
        else:
            print labels[clust.id]

    # now print the right and left branches
    if clust.left != None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right != None:
        printclust(clust.right, labels=labels, n=n + 1)


def getheight(clust):
    # Is this an endpoint? Then the height is just 1
    if clust.left == None and clust.right == None:
        return 1
    # Otherwise the height is the sum of the heights of each branch
    return getheight(clust.left) + getheight(clust.right)


def getdepth(clust):
    # The distance of an endpoint is 0.0
    if clust.left == None and clust.right == None:
        return 0
    # The distance of a branch is the greater of its two sides plus its own distance
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance
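A minimal usage sketch for the functions above (appended to the same file; the four sample points are made up for illustration):

# four 2-D points: the first two and the last two are close to each other
features = [[1, 1], [2, 1], [8, 8], [9, 8]]

tree_root = hcluster(features)      # build the cluster tree bottom-up
printclust(tree_root)               # print the hierarchy as an indented layout

# cut the tree at distance 3: points closer than 3 stay in the same sub-tree
for c in extract_clusters(tree_root, dist=3):
    print get_cluster_elements(c)   # expected: [0, 1] and [2, 3] (order may vary)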
K-means source code with a test example:
# coding:utf-8
import numpy as np


def k_means(X, K, maxIt):
    """
    :param X:     data set
    :param K:     number of clusters
    :param maxIt: maximum number of iterations
    """
    # numPoints is the number of rows, numDim the number of columns (dimensions)
    numPoints, numDim = X.shape

    # build a new data set with one extra column that stores the cluster label
    dataSet = np.zeros((numPoints, numDim + 1))
    dataSet[:, :-1] = X

    # randomly pick K rows of the data set as the initial centroids
    centroids = dataSet[np.random.randint(numPoints, size=K), :]
    # centroids = dataSet[0:2, :]
    # label the centroids 1..K in their last column
    centroids[:, -1] = range(1, K + 1)

    # iterations: number of loops so far
    iterations = 0
    oldCentroids = None

    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations, maxIt):
        print "iteration: \n", iterations
        print "dataSet: \n", dataSet
        print "centroids: \n", centroids

        # Save old centroids for convergence test. Book keeping.
        # np.copy() is used so the two variables do not share the same memory
        oldCentroids = np.copy(centroids)
        iterations += 1

        # Assign labels to each datapoint based on centroids
        updateLabels(dataSet, centroids)

        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, K)

    # We can get the labels too by calling getLabels(dataSet, centroids)
    return dataSet


# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations, maxIt):
    if iterations > maxIt:
        return True
    # check whether the two arrays are element-wise equal
    return np.array_equal(oldCentroids, centroids)


# Function: Get Labels
# -------------
# Update a label for each piece of data in the dataset.
def updateLabels(dataSet, centroids):
    # For each element in the dataset, chose the closest centroid.
    # Make that centroid the element's label.
    numPoints, numDim = dataSet.shape
    for i in range(0, numPoints):
        dataSet[i, -1] = getLabelFromClosestCentroid(dataSet[i, :-1], centroids)


def getLabelFromClosestCentroid(dataSetRow, centroids):
    label = centroids[0, -1]
    minDist = np.linalg.norm(dataSetRow - centroids[0, :-1])
    for i in range(1, centroids.shape[0]):
        # np.linalg.norm computes the Euclidean distance
        dist = np.linalg.norm(dataSetRow - centroids[i, :-1])
        if dist < minDist:
            minDist = dist
            label = centroids[i, -1]
    print "minDist:", minDist
    return label


# Function: Get Centroids
# -------------
# Returns k centroids, each of dimension n.
def getCentroids(dataSet, k):
    # Each centroid is the geometric mean of the points that have that
    # centroid's label. Important: If a centroid is empty (no points have
    # that centroid's label) you should randomly re-initialize it.
    result = np.zeros((k, dataSet.shape[1]))
    for i in range(1, k + 1):
        oneCluster = dataSet[dataSet[:, -1] == i, :-1]
        # np.mean: without axis it averages all m*n elements and returns a scalar;
        # axis=0 averages each column (returns a 1 x n row vector);
        # axis=1 averages each row (returns an m x 1 column vector).
        # Here axis=0 gives the centroid of this cluster.
        result[i - 1, :-1] = np.mean(oneCluster, axis=0)
        result[i - 1, -1] = i
    return result


x1 = np.array([1, 1])
x2 = np.array([2, 1])
x3 = np.array([4, 3])
x4 = np.array([5, 4])
# stack the four points vertically into one matrix
testX = np.vstack((x1, x2, x3, x4))

result = k_means(testX, 2, 10)
print "final result:"
print result
Run result:
iteration:
0
dataSet:
[[1. 1. 0.]
[2. 1. 0.]
[4. 3. 0.]
[5. 4. 0.]]
centroids:
[[4. 3. 1.]
[5. 4. 2.]]
minDist: 3.60555127546
minDist: 2.82842712475
minDist: 0.0
minDist: 0.0
iteration:
1
dataSet:
[[1. 1. 1.]
[2. 1. 1.]
[4. 3. 1.]
[5. 4. 2.]]
centroids:
[[2.33333333 1.66666667 1. ]
[5. 4. 2. ]]
minDist: 1.490711985
minDist: 0.7453559925
minDist: 1.41421356237
minDist: 0.0
iteration:
2
dataSet:
[[1. 1. 1.]
[2. 1. 1.]
[4. 3. 2.]
[5. 4. 2.]]
centroids:
[[1.5 1. 1. ]
[4.5 3.5 2. ]]
minDist: 0.5
minDist: 0.5
minDist: 0.707106781187
minDist: 0.707106781187
final result:
[[1. 1. 1.]
[2. 1. 1.]
[4. 3. 2.]
[5. 4. 2.]]
Process finished with exit code 0
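For comparison only (not part of the implementation above), the same four points can be clustered with scikit-learn's built-in KMeans; the cluster indices may be numbered differently between runs:

# coding:utf-8
import numpy as np
from sklearn.cluster import KMeans

testX = np.vstack(([1, 1], [2, 1], [4, 3], [5, 4]))
km = KMeans(n_clusters=2).fit(testX)
print km.labels_           # cluster index of each point, e.g. [0 0 1 1]
print km.cluster_centers_  # e.g. [[1.5 1. ] [4.5 3.5]]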
KNN algorithm:
# coding:utf-8
# read the data
import csv
import random
import math
# operator.itemgetter(i) does not fetch a value directly; it builds a function
# that, when applied to an object, returns the item at the given index.
import operator


# load the data set
def loadDataset(filename, split, trainingSet=[], testSet=[]):
    """
    filename:    path and name of the data set to load
    split:       ratio used to divide the data into trainingSet and testSet
    trainingSet: training set
    testSet:     test set
    """
    with open(filename, "rb") as csvfile:
        # read all lines
        lines = csv.reader(csvfile)
        # convert to a list data structure
        dataset = list(lines)
        # dataset is a list of lists; each inner list is one row of the original data
        # print "dataset:", dataset

        # split the data into a training set and a test set
        # x is the row index
        for x in range(len(dataset) - 1):
            for y in range(4):
                dataset[x][y] = float(dataset[x][y])
            # random.random() returns a value in [0, 1)
            if random.random() < split:
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])
    print "trainingSet", trainingSet
    print "testSet", testSet


# compute the Euclidean distance between two instances over `length` dimensions
# pow() raises to a power, math.sqrt() takes the square root
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)


# return the K points closest to the target point
# testInstance is one instance (row) of the test set
def getNeighbors(trainingSet, testInstance, k):
    # distances collects all computed distances
    distances = []
    # length is the number of feature dimensions of the test instance
    length = len(testInstance) - 1
    # compute the distance from the test instance to every point in the training set
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    # distances now looks like
    # [([5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], 4.459820624195552), ([4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], 4.498888751680798), ...]
    # print "distances:", distances

    # key=operator.itemgetter(1) sorts by the second field (the distance), ascending by default
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    # take the k nearest points
    for x in range(k):
        neighbors.append(distances[x][0])
    # print "neighbors:", neighbors
    # neighbors: [[6.5, 3.2, 5.1, 2.0, 'Iris-virginica'], [6.7, 3.3, 5.7, 2.5, 'Iris-virginica'], [6.5, 3.0, 5.8, 2.2, 'Iris-virginica']]
    return neighbors


# among the K neighbors, classify the target point by majority vote
def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        # neighbors[x][-1] is the species of each nearest flower
        response = neighbors[x][-1]
        # count how many of the nearest flowers belong to each species
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    # reverse=True sorts in descending order (the default, False, sorts ascending)
    sortedNotes = sorted(classVotes.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedNotes[0][0]


# measure the accuracy on the test set
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        # index -1 is the last value, i.e. the flower's class
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0


# main function
def main():
    trainingSet = []
    testSet = []
    # split = 0.67: roughly 2/3 of the data goes to the training set, 1/3 to the test set
    split = 0.67
    loadDataset(r'F:\study\code\python\machine_learning\KNN\irisdata.txt', split, trainingSet, testSet)
    # repr() converts a value to a form the interpreter can read
    print 'TrainSet: ' + repr(len(trainingSet))
    print 'TestSet: ' + repr(len(testSet))

    # generate predictions
    predictions = []
    k = 3
    for x in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[x], k)
        result = getResponse(neighbors)
        predictions.append(result)
        print("> predicted= " + repr(result) + ', actual=' + repr(testSet[x][-1]))
    # accuracy
    accuracy = getAccuracy(testSet, predictions)
    print("Accuracy: " + repr(accuracy) + "%")


if __name__ == '__main__':
    main()
Run result:
trainingSet [[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], [4.6, 3.4, 1.4, 0.3, 'Iris-setosa'], [5.0, 3.4, 1.5, 0.2, 'Iris-setosa'], [4.4, 2.9, 1.4, 0.2, 'Iris-setosa'], [5.4, 3.7, 1.5, 0.2, 'Iris-setosa'], [4.8, 3.4, 1.6, 0.2, 'Iris-setosa'], [4.3, 3.0, 1.1, 0.1, 'Iris-setosa'], [5.8, 4.0, 1.2, 0.2, 'Iris-setosa'], [5.4, 3.9, 1.3, 0.4, 'Iris-setosa'], [5.1, 3.5, 1.4, 0.3, 'Iris-setosa'], [5.7, 3.8, 1.7, 0.3, 'Iris-setosa'], [5.1, 3.8, 1.5, 0.3, 'Iris-setosa'], [4.6, 3.6, 1.0, 0.2, 'Iris-setosa'], [5.1, 3.3, 1.7, 0.5, 'Iris-setosa'], [4.8, 3.4, 1.9, 0.2, 'Iris-setosa'], [5.0, 3.4, 1.6, 0.4, 'Iris-setosa'], [5.2, 3.5, 1.5, 0.2, 'Iris-setosa'], [5.2, 3.4, 1.4, 0.2, 'Iris-setosa'], [4.7, 3.2, 1.6, 0.2, 'Iris-setosa'], [4.8, 3.1, 1.6, 0.2, 'Iris-setosa'], [5.2, 4.1, 1.5, 0.1, 'Iris-setosa'], [5.5, 4.2, 1.4, 0.2, 'Iris-setosa'], [4.9, 3.1, 1.5, 0.1, 'Iris-setosa'], [5.5, 3.5, 1.3, 0.2, 'Iris-setosa'], [4.4, 3.0, 1.3, 0.2, 'Iris-setosa'], [5.1, 3.4, 1.5, 0.2, 'Iris-setosa'], [5.0, 3.5, 1.3, 0.3, 'Iris-setosa'], [4.5, 2.3, 1.3, 0.3, 'Iris-setosa'], [4.4, 3.2, 1.3, 0.2, 'Iris-setosa'], [5.0, 3.5, 1.6, 0.6, 'Iris-setosa'], [5.1, 3.8, 1.9, 0.4, 'Iris-setosa'], [5.1, 3.8, 1.6, 0.2, 'Iris-setosa'], [4.6, 3.2, 1.4, 0.2, 'Iris-setosa'], [5.0, 3.3, 1.4, 0.2, 'Iris-setosa'], [6.9, 3.1, 4.9, 1.5, 'Iris-versicolor'], [5.5, 2.3, 4.0, 1.3, 'Iris-versicolor'], [6.5, 2.8, 4.6, 1.5, 'Iris-versicolor'], [5.7, 2.8, 4.5, 1.3, 'Iris-versicolor'], [4.9, 2.4, 3.3, 1.0, 'Iris-versicolor'], [5.2, 2.7, 3.9, 1.4, 'Iris-versicolor'], [5.0, 2.0, 3.5, 1.0, 'Iris-versicolor'], [5.9, 3.0, 4.2, 1.5, 'Iris-versicolor'], [6.1, 2.9, 4.7, 1.4, 'Iris-versicolor'], [5.6, 2.9, 3.6, 1.3, 'Iris-versicolor'], [6.7, 3.1, 4.4, 1.4, 'Iris-versicolor'], [5.6, 3.0, 4.5, 1.5, 'Iris-versicolor'], [5.8, 2.7, 4.1, 1.0, 'Iris-versicolor'], [6.2, 2.2, 4.5, 1.5, 'Iris-versicolor'], [5.6, 2.5, 3.9, 1.1, 'Iris-versicolor'], [6.1, 2.8, 4.0, 1.3, 'Iris-versicolor'], [6.1, 2.8, 4.7, 1.2, 'Iris-versicolor'], [6.4, 2.9, 4.3, 1.3, 'Iris-versicolor'], [6.6, 3.0, 4.4, 1.4, 'Iris-versicolor'], [6.8, 2.8, 4.8, 1.4, 'Iris-versicolor'], [6.7, 3.0, 5.0, 1.7, 'Iris-versicolor'], [5.7, 2.6, 3.5, 1.0, 'Iris-versicolor'], [5.5, 2.4, 3.8, 1.1, 'Iris-versicolor'], [5.5, 2.4, 3.7, 1.0, 'Iris-versicolor'], [5.4, 3.0, 4.5, 1.5, 'Iris-versicolor'], [6.0, 3.4, 4.5, 1.6, 'Iris-versicolor'], [6.7, 3.1, 4.7, 1.5, 'Iris-versicolor'], [6.3, 2.3, 4.4, 1.3, 'Iris-versicolor'], [5.6, 3.0, 4.1, 1.3, 'Iris-versicolor'], [5.5, 2.5, 4.0, 1.3, 'Iris-versicolor'], [6.1, 3.0, 4.6, 1.4, 'Iris-versicolor'], [5.7, 2.9, 4.2, 1.3, 'Iris-versicolor'], [6.2, 2.9, 4.3, 1.3, 'Iris-versicolor'], [5.1, 2.5, 3.0, 1.1, 'Iris-versicolor'], [5.8, 2.7, 5.1, 1.9, 'Iris-virginica'], [7.1, 3.0, 5.9, 2.1, 'Iris-virginica'], [7.6, 3.0, 6.6, 2.1, 'Iris-virginica'], [4.9, 2.5, 4.5, 1.7, 'Iris-virginica'], [7.3, 2.9, 6.3, 1.8, 'Iris-virginica'], [6.5, 3.2, 5.1, 2.0, 'Iris-virginica'], [6.4, 2.7, 5.3, 1.9, 'Iris-virginica'], [5.7, 2.5, 5.0, 2.0, 'Iris-virginica'], [6.4, 3.2, 5.3, 2.3, 'Iris-virginica'], [7.7, 3.8, 6.7, 2.2, 'Iris-virginica'], [7.7, 2.6, 6.9, 2.3, 'Iris-virginica'], [7.7, 2.8, 6.7, 2.0, 'Iris-virginica'], [6.3, 2.7, 4.9, 1.8, 'Iris-virginica'], [6.7, 3.3, 5.7, 2.1, 'Iris-virginica'], [7.2, 3.2, 6.0, 1.8, 'Iris-virginica'], [6.2, 2.8, 4.8, 1.8, 'Iris-virginica'], [6.4, 2.8, 5.6, 2.1, 'Iris-virginica'], [7.2, 3.0, 5.8, 1.6, 'Iris-virginica'], [7.4, 2.8, 6.1, 1.9, 'Iris-virginica'], [7.9, 3.8, 6.4, 2.0, 'Iris-virginica'], [6.4, 2.8, 5.6, 2.2, 'Iris-virginica'], [7.7, 3.0, 
6.1, 2.3, 'Iris-virginica'], [6.3, 3.4, 5.6, 2.4, 'Iris-virginica'], [6.4, 3.1, 5.5, 1.8, 'Iris-virginica'], [6.0, 3.0, 4.8, 1.8, 'Iris-virginica'], [6.9, 3.1, 5.4, 2.1, 'Iris-virginica'], [6.7, 3.1, 5.6, 2.4, 'Iris-virginica'], [6.8, 3.2, 5.9, 2.3, 'Iris-virginica'], [6.7, 3.3, 5.7, 2.5, 'Iris-virginica'], [6.7, 3.0, 5.2, 2.3, 'Iris-virginica'], [6.3, 2.5, 5.0, 1.9, 'Iris-virginica'], [6.5, 3.0, 5.2, 2.0, 'Iris-virginica'], [6.2, 3.4, 5.4, 2.3, 'Iris-virginica']]
testSet [[4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'], [5.0, 3.6, 1.4, 0.2, 'Iris-setosa'], [5.4, 3.9, 1.7, 0.4, 'Iris-setosa'], [4.9, 3.1, 1.5, 0.1, 'Iris-setosa'], [4.8, 3.0, 1.4, 0.1, 'Iris-setosa'], [5.7, 4.4, 1.5, 0.4, 'Iris-setosa'], [5.4, 3.4, 1.7, 0.2, 'Iris-setosa'], [5.1, 3.7, 1.5, 0.4, 'Iris-setosa'], [5.0, 3.0, 1.6, 0.2, 'Iris-setosa'], [5.4, 3.4, 1.5, 0.4, 'Iris-setosa'], [5.0, 3.2, 1.2, 0.2, 'Iris-setosa'], [4.9, 3.1, 1.5, 0.1, 'Iris-setosa'], [4.8, 3.0, 1.4, 0.3, 'Iris-setosa'], [5.3, 3.7, 1.5, 0.2, 'Iris-setosa'], [7.0, 3.2, 4.7, 1.4, 'Iris-versicolor'], [6.4, 3.2, 4.5, 1.5, 'Iris-versicolor'], [6.3, 3.3, 4.7, 1.6, 'Iris-versicolor'], [6.6, 2.9, 4.6, 1.3, 'Iris-versicolor'], [6.0, 2.2, 4.0, 1.0, 'Iris-versicolor'], [5.9, 3.2, 4.8, 1.8, 'Iris-versicolor'], [6.3, 2.5, 4.9, 1.5, 'Iris-versicolor'], [6.0, 2.9, 4.5, 1.5, 'Iris-versicolor'], [5.8, 2.7, 3.9, 1.2, 'Iris-versicolor'], [6.0, 2.7, 5.1, 1.6, 'Iris-versicolor'], [5.5, 2.6, 4.4, 1.2, 'Iris-versicolor'], [5.8, 2.6, 4.0, 1.2, 'Iris-versicolor'], [5.0, 2.3, 3.3, 1.0, 'Iris-versicolor'], [5.6, 2.7, 4.2, 1.3, 'Iris-versicolor'], [5.7, 3.0, 4.2, 1.2, 'Iris-versicolor'], [5.7, 2.8, 4.1, 1.3, 'Iris-versicolor'], [6.3, 3.3, 6.0, 2.5, 'Iris-virginica'], [6.3, 2.9, 5.6, 1.8, 'Iris-virginica'], [6.5, 3.0, 5.8, 2.2, 'Iris-virginica'], [6.7, 2.5, 5.8, 1.8, 'Iris-virginica'], [7.2, 3.6, 6.1, 2.5, 'Iris-virginica'], [6.8, 3.0, 5.5, 2.1, 'Iris-virginica'], [5.8, 2.8, 5.1, 2.4, 'Iris-virginica'], [6.5, 3.0, 5.5, 1.8, 'Iris-virginica'], [6.0, 2.2, 5.0, 1.5, 'Iris-virginica'], [6.9, 3.2, 5.7, 2.3, 'Iris-virginica'], [5.6, 2.8, 4.9, 2.0, 'Iris-virginica'], [6.1, 3.0, 4.9, 1.8, 'Iris-virginica'], [6.3, 2.8, 5.1, 1.5, 'Iris-virginica'], [6.1, 2.6, 5.6, 1.4, 'Iris-virginica'], [6.9, 3.1, 5.1, 2.3, 'Iris-virginica'], [5.8, 2.7, 5.1, 1.9, 'Iris-virginica']]
Second KNN implementation: using the corresponding methods in sklearn directly:
# coding:utf-8
from sklearn import neighbors
from sklearn import datasets

# create a classifier called knn
knn = neighbors.KNeighborsClassifier()

iris = datasets.load_iris()
print iris

# build the model: the first argument is the feature matrix,
# the second is the class label of each row
knn.fit(iris.data, iris.target)

# predict the species of one flower
predictedLabel = knn.predict([[0.1, 0.2, 0.3, 0.4]])
print predictedLabel
Run result:
{'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='|S10'), 'data': array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2],
[5.4, 3.9, 1.7, 0.4],
[4.6, 3.4, 1.4, 0.3],
[5. , 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1],
[5.4, 3.7, 1.5, 0.2],
[4.8, 3.4, 1.6, 0.2],
[4.8, 3. , 1.4, 0.1],
[4.3, 3. , 1.1, 0.1],
[5.8, 4. , 1.2, 0.2],
[5.7, 4.4, 1.5, 0.4],
[5.4, 3.9, 1.3, 0.4],
[5.1, 3.5, 1.4, 0.3],
[5.7, 3.8, 1.7, 0.3],
[5.1, 3.8, 1.5, 0.3],
[5.4, 3.4, 1.7, 0.2],
[5.1, 3.7, 1.5, 0.4],
[4.6, 3.6, 1. , 0.2],
[5.1, 3.3, 1.7, 0.5],
[4.8, 3.4, 1.9, 0.2],
[5. , 3. , 1.6, 0.2],
[5. , 3.4, 1.6, 0.4],
[5.2, 3.5, 1.5, 0.2],
[5.2, 3.4, 1.4, 0.2],
[4.7, 3.2, 1.6, 0.2],
[4.8, 3.1, 1.6, 0.2],
[5.4, 3.4, 1.5, 0.4],
[5.2, 4.1, 1.5, 0.1],
[5.5, 4.2, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1],
[5. , 3.2, 1.2, 0.2],
[5.5, 3.5, 1.3, 0.2],
[4.9, 3.1, 1.5, 0.1],
[4.4, 3. , 1.3, 0.2],
[5.1, 3.4, 1.5, 0.2],
[5. , 3.5, 1.3, 0.3],
[4.5, 2.3, 1.3, 0.3],
[4.4, 3.2, 1.3, 0.2],
[5. , 3.5, 1.6, 0.6],
[5.1, 3.8, 1.9, 0.4],
[4.8, 3. , 1.4, 0.3],
[5.1, 3.8, 1.6, 0.2],
[4.6, 3.2, 1.4, 0.2],
[5.3, 3.7, 1.5, 0.2],
[5. , 3.3, 1.4, 0.2],
[7. , 3.2, 4.7, 1.4],
[6.4, 3.2, 4.5, 1.5],
[6.9, 3.1, 4.9, 1.5],
[5.5, 2.3, 4. , 1.3],
[6.5, 2.8, 4.6, 1.5],
[5.7, 2.8, 4.5, 1.3],
[6.3, 3.3, 4.7, 1.6],
[4.9, 2.4, 3.3, 1. ],
[6.6, 2.9, 4.6, 1.3],
[5.2, 2.7, 3.9, 1.4],
[5. , 2. , 3.5, 1. ],
[5.9, 3. , 4.2, 1.5],
[6. , 2.2, 4. , 1. ],
[6.1, 2.9, 4.7, 1.4],
[5.6, 2.9, 3.6, 1.3],
[6.7, 3.1, 4.4, 1.4],
[5.6, 3. , 4.5, 1.5],
[5.8, 2.7, 4.1, 1. ],
[6.2, 2.2, 4.5, 1.5],
[5.6, 2.5, 3.9, 1.1],
[5.9, 3.2, 4.8, 1.8],
[6.1, 2.8, 4. , 1.3],
[6.3, 2.5, 4.9, 1.5],
[6.1, 2.8, 4.7, 1.2],
[6.4, 2.9, 4.3, 1.3],
[6.6, 3. , 4.4, 1.4],
[6.8, 2.8, 4.8, 1.4],
[6.7, 3. , 5. , 1.7],
[6. , 2.9, 4.5, 1.5],
[5.7, 2.6, 3.5, 1. ],
[5.5, 2.4, 3.8, 1.1],
[5.5, 2.4, 3.7, 1. ],
[5.8, 2.7, 3.9, 1.2],
[6. , 2.7, 5.1, 1.6],
[5.4, 3. , 4.5, 1.5],
[6. , 3.4, 4.5, 1.6],
[6.7, 3.1, 4.7, 1.5],
[6.3, 2.3, 4.4, 1.3],
[5.6, 3. , 4.1, 1.3],
[5.5, 2.5, 4. , 1.3],
[5.5, 2.6, 4.4, 1.2],
[6.1, 3. , 4.6, 1.4],
[5.8, 2.6, 4. , 1.2],
[5. , 2.3, 3.3, 1. ],
[5.6, 2.7, 4.2, 1.3],
[5.7, 3. , 4.2, 1.2],
[5.7, 2.9, 4.2, 1.3],
[6.2, 2.9, 4.3, 1.3],
[5.1, 2.5, 3. , 1.1],
[5.7, 2.8, 4.1, 1.3],
[6.3, 3.3, 6. , 2.5],
[5.8, 2.7, 5.1, 1.9],
[7.1, 3. , 5.9, 2.1],
[6.3, 2.9, 5.6, 1.8],
[6.5, 3. , 5.8, 2.2],
[7.6, 3. , 6.6, 2.1],
[4.9, 2.5, 4.5, 1.7],
[7.3, 2.9, 6.3, 1.8],
[6.7, 2.5, 5.8, 1.8],
[7.2, 3.6, 6.1, 2.5],
[6.5, 3.2, 5.1, 2. ],
[6.4, 2.7, 5.3, 1.9],
[6.8, 3. , 5.5, 2.1],
[5.7, 2.5, 5. , 2. ],
[5.8, 2.8, 5.1, 2.4],
[6.4, 3.2, 5.3, 2.3],
[6.5, 3. , 5.5, 1.8],
[7.7, 3.8, 6.7, 2.2],
[7.7, 2.6, 6.9, 2.3],
[6. , 2.2, 5. , 1.5],
[6.9, 3.2, 5.7, 2.3],
[5.6, 2.8, 4.9, 2. ],
[7.7, 2.8, 6.7, 2. ],
[6.3, 2.7, 4.9, 1.8],
[6.7, 3.3, 5.7, 2.1],
[7.2, 3.2, 6. , 1.8],
[6.2, 2.8, 4.8, 1.8],
[6.1, 3. , 4.9, 1.8],
[6.4, 2.8, 5.6, 2.1],
[7.2, 3. , 5.8, 1.6],
[7.4, 2.8, 6.1, 1.9],
[7.9, 3.8, 6.4, 2. ],
[6.4, 2.8, 5.6, 2.2],
[6.3, 2.8, 5.1, 1.5],
[6.1, 2.6, 5.6, 1.4],
[7.7, 3. , 6.1, 2.3],
[6.3, 3.4, 5.6, 2.4],
[6.4, 3.1, 5.5, 1.8],
[6. , 3. , 4.8, 1.8],
[6.9, 3.1, 5.4, 2.1],
[6.7, 3.1, 5.6, 2.4],
[6.9, 3.1, 5.1, 2.3],
[5.8, 2.7, 5.1, 1.9],
[6.8, 3.2, 5.9, 2.3],
[6.7, 3.3, 5.7, 2.5],
[6.7, 3. , 5.2, 2.3],
[6.3, 2.5, 5. , 1.9],
[6.5, 3. , 5.2, 2. ],
[6.2, 3.4, 5.4, 2.3],
[5.9, 3. , 5.1, 1.8]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), 'DESCR': 'Iris Plants Database\n====================\n\nNotes\n-----\nData Set Characteristics:\n :Number of Instances: 150 (50 in each of three classes)\n :Number of Attributes: 4 numeric, predictive attributes and the class\n :Attribute Information:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal width in cm\n - class:\n - Iris-Setosa\n - Iris-Versicolour\n - Iris-Virginica\n :Summary Statistics:\n\n ============== ==== ==== ======= ===== ====================\n Min Max Mean SD Class Correlation\n ============== ==== ==== ======= ===== ====================\n sepal length: 4.3 7.9 5.84 0.83 0.7826\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n ============== ==== ==== ======= ===== ====================\n\n :Missing Attribute Values: None\n :Class Distribution: 33.3% for each of 3 classes.\n :Creator: R.A. Fisher\n :Donor: Michael Marshall (MARSHALL%[email protected])\n :Date: July, 1988\n\nThis is a copy of UCI ML iris datasets.\nhttp://archive.ics.uci.edu/ml/datasets/Iris\n\nThe famous Iris database, first used by Sir R.A Fisher\n\nThis is perhaps the best known database to be found in the\npattern recognition literature. Fisher\'s paper is a classic in the field and\nis referenced frequently to this day. (See Duda & Hart, for example.) The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant. One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\nReferences\n----------\n - Fisher,R.A. "The use of multiple measurements in taxonomic problems"\n Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n Mathematical Statistics" (John Wiley, NY, 1950).\n - Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n Structure and Classification Rule for Recognition in Partially Exposed\n Environments". IEEE Transactions on Pattern Analysis and Machine\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\n - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions\n on Information Theory, May 1972, 431-433.\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II\n conceptual clustering system finds 3 classes in the data.\n - Many, many more ...\n', 'feature_names': ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']}
[0]
Process finished with exit code 0
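To mirror the accuracy test of the hand-written version, the sklearn classifier can also be scored on a held-out split. A small sketch (the 1/3 test size and k=3 are chosen to match the manual implementation, not taken from the original code):

# coding:utf-8
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
# hold out roughly one third of the data for testing
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.33)

knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# score() returns the mean accuracy on the test set
print knn.score(X_test, y_test)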
NeuralNetwork algorithm:
Source code:
# coding:utf-8
import numpy as np
# load the handwritten digits data set
from sklearn.datasets import load_digits
# report the results as a confusion matrix and a classification report
from sklearn.metrics import confusion_matrix, classification_report
# convert the digit labels 0-9 into the binary format the network expects
from sklearn.preprocessing import LabelBinarizer
from neuralNetwork import NeuralNetwork
# split the data set into a training set and a test set
from sklearn.model_selection import train_test_split

digits = load_digits()
X = digits.data
y = digits.target
# print X
# print y

# scale the pixel values to the range [0, 1]
X -= X.min()
X /= X.max()
# print X
# print y

nn = NeuralNetwork([64, 100, 10], 'logistic')
X_train, X_test, y_train, y_test = train_test_split(X, y)
label_train = LabelBinarizer().fit_transform(y_train)
label_test = LabelBinarizer().fit_transform(y_test)

print 'start fitting'
nn.fit(X_train, label_train, epochs=3000)

predictions = []
for i in range(X_test.shape[0]):
    o = nn.predict(X_test[i])
    predictions.append(np.argmax(o))

print confusion_matrix(y_test, predictions)
print classification_report(y_test, predictions)
Run result:
start fitting
[[41 0 0 0 2 0 0 0 0 0]
[ 0 39 0 0 0 0 0 0 0 2]
[ 0 0 31 0 0 0 0 0 0 0]
[ 0 1 2 37 0 0 0 0 2 0]
[ 0 4 0 0 47 0 0 0 1 0]
[ 0 0 0 0 1 41 0 0 0 4]
[ 0 2 0 0 0 0 43 0 1 0]
[ 0 0 0 0 1 0 0 49 1 1]
[ 0 3 0 0 0 2 0 0 40 2]
[ 0 0 0 0 0 0 0 1 1 48]]
precision recall f1-score support
0 1.00 0.95 0.98 43
1 0.80 0.95 0.87 41
2 0.94 1.00 0.97 31
3 1.00 0.88 0.94 42
4 0.92 0.90 0.91 52
5 0.95 0.89 0.92 46
6 1.00 0.93 0.97 46
7 0.98 0.94 0.96 52
8 0.87 0.85 0.86 47
9 0.84 0.96 0.90 50
avg / total 0.93 0.92 0.93 450
Process finished with exit code 0
neuralNetwork.py:
# coding:utf-8
# numpy provides matrix-based scientific computing
import numpy as np


# tanh: use numpy's hyperbolic tangent directly
def tanh(x):
    return np.tanh(x)


# first derivative of tanh: 1 - tanh(x)^2
def tan_deriv(x):
    return 1.0 - np.tanh(x) * np.tanh(x)


# logistic function; np.exp(x) returns e to the power x
def logistic(x):
    return 1 / (1 + np.exp(-x))


# derivative of the logistic function: f(x) * (1 - f(x))
def logistic_derivative(x):
    return logistic(x) * (1 - logistic(x))


class NeuralNetwork():
    def __init__(self, layers, activation='tanh'):
        """
        layers:     list giving the number of neurons in each layer
        activation: which activation function to use; the default is 'tanh'
        """
        if activation == 'logistic':
            self.activation = logistic
            self.activation_deriv = logistic_derivative
        else:
            self.activation = tanh
            self.activation_deriv = tan_deriv

        self.weights = []
        # initialize the weights with small random values
        for i in range(1, len(layers) - 1):
            self.weights.append((2 * np.random.random((layers[i - 1] + 1, layers[i] + 1)) - 1) * 0.25)
            self.weights.append((2 * np.random.random((layers[i] + 1, layers[i + 1])) - 1) * 0.25)

    def fit(self, X, y, learning_rate=0.2, epochs=10000):
        """
        X: training set; each row is one instance, each column one feature dimension.
        y: class labels.
        learning_rate: usually around 0.2.
        epochs: number of updates with randomly sampled instances; here 10000 by default.
        """
        # make X a numpy array of at least two dimensions
        X = np.atleast_2d(X)
        # add a bias column of ones: if X is 10 x 100 (X.shape returns (10, 100)), temp is 10 x 101
        temp = np.ones([X.shape[0], X.shape[1] + 1])
        # the colon takes all rows; 0:-1 takes every column except the last
        temp[:, 0:-1] = X
        X = temp
        # y holds the class labels; convert it to a numpy array
        y = np.array(y)

        for k in range(epochs):
            i = np.random.randint(X.shape[0])
            # a holds the activations, starting from one randomly chosen instance
            a = [X[i]]

            # forward pass
            # going forward network, for each layer
            for l in range(len(self.weights)):
                # compute the node values of each layer (O_i) using the activation function
                a.append(self.activation(np.dot(a[l], self.weights[l])))

            # compute the error at the output layer
            error = y[i] - a[-1]
            # for the output layer, delta is the updated error
            deltas = [error * self.activation_deriv(a[-1])]

            # backward pass
            # we need to begin at the second to last layer
            for l in range(len(a) - 2, 0, -1):
                # compute the updated error (deltas) for each node, going from the top layer to the input layer
                deltas.append(deltas[-1].dot(self.weights[l].T) * self.activation_deriv(a[l]))
            deltas.reverse()

            # update the weights
            for i in range(len(self.weights)):
                layer = np.atleast_2d(a[i])
                delta = np.atleast_2d(deltas[i])
                self.weights[i] += learning_rate * layer.T.dot(delta)

    def predict(self, x):
        x = np.array(x)
        # append the bias term to the input
        temp = np.ones(x.shape[0] + 1)
        temp[0:-1] = x
        a = temp
        for l in range(0, len(self.weights)):
            a = self.activation(np.dot(a, self.weights[l]))
        return a
Calling the NeuralNetwork class from neuralNetwork.py (XOR test):
from neuralNetwork import NeuralNetwork
import numpy as np

nn = NeuralNetwork([2, 2, 1], 'tanh')
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])
nn.fit(X, y)
for i in [[0, 0], [0, 1], [1, 0], [1, 1]]:
    print i, nn.predict(i)
Run result:
[0, 0] [-0.00015573]
[0, 1] [0.99842087]
[1, 0] [0.99845495]
[1, 1] [0.03373141]
Process finished with exit code 0
Regression_Problem (regression problems):
Simple_Regression_problem
# coding:utf-8
import numpy as np


# x and y are two lists
def fitSLR(x, y):
    n = len(x)
    # denominator
    dinominator = 0
    # numerator
    numerator = 0
    for i in range(0, n):
        # np.mean() computes the mean
        numerator += (x[i] - np.mean(x)) * (y[i] - np.mean(y))
        dinominator += (x[i] - np.mean(x)) ** 2
    # slope
    b1 = numerator / float(dinominator)
    # intercept: b0 = mean(y) - b1 * mean(x)
    b0 = np.mean(y) - b1 * np.mean(x)
    return b0, b1


def predict(x, b0, b1):
    return b0 + x * b1


x = [1, 3, 2, 1, 3]
y = [14, 24, 18, 17, 27]

b0, b1 = fitSLR(x, y)
print "intercept:", b0, " slope:", b1

x_test = 6
y_test = predict(x_test, b0, b1)
print "y_test:", y_test
Run result:
intercept: 10.0 slope: 5.0
y_test: 40.0
Process finished with exit code 0
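As a quick cross-check (not in the original post), numpy's polyfit recovers the same slope and intercept for this data:

# coding:utf-8
import numpy as np

x = [1, 3, 2, 1, 3]
y = [14, 24, 18, 17, 27]
# a degree-1 fit returns [slope, intercept]
slope, intercept = np.polyfit(x, y, 1)
print "intercept:", intercept, " slope:", slope   # expected: 10.0 and 5.0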
Code to compute the Pearson correlation coefficient:
# coding:utf-8
import numpy as np
import math


def computeCorrelation(X, Y):
    # compute the means
    x_bar = np.mean(X)
    y_bar = np.mean(Y)
    SSR = 0
    varX = 0
    varY = 0
    for i in range(0, len(X)):
        diffXx_bar = X[i] - x_bar
        diffYy_bar = Y[i] - y_bar
        SSR += (diffXx_bar * diffYy_bar)
        varX += diffXx_bar ** 2
        varY += diffYy_bar ** 2
    SST = math.sqrt(varX * varY)
    return SSR / SST


testX = [1, 3, 8, 7, 9]
testY = [10, 12, 24, 21, 34]

print computeCorrelation(testX, testY)
Run result:
0.940310076545
Process finished with exit code 0
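The value can be checked against numpy's built-in Pearson correlation (a sanity check, not part of the original code):

# coding:utf-8
import numpy as np

testX = [1, 3, 8, 7, 9]
testY = [10, 12, 24, 21, 34]
# np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is r
print np.corrcoef(testX, testY)[0, 1]   # expected: 0.9403...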
Code to compute R squared (R^2):
# coding:utf-8
import numpy as np
import math


def computeCorrelation(X, Y):
    # compute the means
    x_bar = np.mean(X)
    y_bar = np.mean(Y)
    SSR = 0
    varX = 0
    varY = 0
    for i in range(0, len(X)):
        diffXx_bar = X[i] - x_bar
        diffYy_bar = Y[i] - y_bar
        SSR += (diffXx_bar * diffYy_bar)
        varX += diffXx_bar ** 2
        varY += diffYy_bar ** 2
    SST = math.sqrt(varX * varY)
    return SSR / SST


def polyfit(x, y, degree):
    results = {}
    # np.polyfit fits the regression polynomial and returns its coefficients
    # x: independent variable, y: dependent variable, degree: highest power of the polynomial
    coeffs = np.polyfit(x, y, degree)
    results['polynormol'] = coeffs.tolist()
    # p is the fitted line as a callable polynomial
    p = np.poly1d(coeffs)
    # compute y_hat
    yhat = p(x)
    ybar = np.sum(y) / len(y)
    ssreg = np.sum((yhat - ybar) ** 2)
    print "ssreg: ", ssreg
    sstot = np.sum((y - ybar) ** 2)
    print "sstot: ", sstot
    results['determination'] = ssreg / sstot
    return results


testX = [1, 3, 8, 7, 9]
testY = [10, 12, 24, 21, 34]

print "r: ", computeCorrelation(testX, testY)
print "R^2: ", str(computeCorrelation(testX, testY) ** 2)
print polyfit(testX, testY, 1)
Run result:
r: 0.940310076545
R^2: 0.8841830400518192
ssreg: 333.360169492
sstot: 377
{'polynormol': [2.65677966101695, 5.322033898305075], 'determination': 0.8842444814098822}
Process finished with exit code 0
multiple_Regression_problem:
# coding:utf-8
from numpy import genfromtxt
import numpy as np
from sklearn import datasets, linear_model

# convert the csv file into a matrix
dataPath = r"F:\study\code\python\machine_learning\Regression_problem\Delivery.csv"
deliveryData = genfromtxt(dataPath, delimiter=',')

print 'data:'
print deliveryData

# the first colon takes all rows; :-1 takes every column except the last
X = deliveryData[:, :-1]
Y = deliveryData[:, -1]
# print "X:", X
# print "Y:", Y

# build the model
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print "coefficients:"
# coefficients of the fitted plane
print regr.coef_
print "intercept:"
# intercept
print regr.intercept_

xPredicted = [[102, 6]]
yPredicted = regr.predict(xPredicted)
print "predicted y:"
print yPredicted
delivery.csv:
Run result:
data:
[[100. 4. 9.3]
[ 50. 3. 4.8]
[100. 4. 8.9]
[100. 2. 6.5]
[ 50. 2. 4.2]
[ 80. 2. 6.2]
[ 75. 3. 7.4]
[ 65. 4. 6. ]
[ 90. 3. 7.6]
[ 90. 2. 6.1]]
coefficients:
[0.0611346 0.92342537]
intercept:
-0.868701466782
predicted y:
[10.90757981]
Process finished with exit code 0
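The predicted value can be reproduced by hand from the printed coefficients and intercept (a quick verification using the numbers shown above):

# coding:utf-8
import numpy as np

coef = np.array([0.0611346, 0.92342537])
intercept = -0.868701466782
x_new = np.array([102, 6])
# y_hat = intercept + coef . x_new
print np.dot(coef, x_new) + intercept   # about 10.9076, matching regr.predict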
multiple_Regression_problem (when the dataset's feature values contain categorical values):
Solution: apply special treatment to those columns, i.e. convert them into dummy (0/1) variables, as shown in the sketch below.
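As an illustration of what the dummy encoding does, the DictVectorizer used in the decision-tree section can expand a categorical feature into one 0/1 column per category; the feature name 'vehicle' and its values below are made up and not taken from Delivery_Dummy.csv:

# coding:utf-8
from sklearn.feature_extraction import DictVectorizer

# a categorical feature with three possible values (hypothetical example)
rows = [{'vehicle': 'sedan'}, {'vehicle': 'van'}, {'vehicle': 'truck'}, {'vehicle': 'sedan'}]
vec = DictVectorizer()
print vec.fit_transform(rows).toarray()   # one 0/1 column per category
print vec.get_feature_names()             # ['vehicle=sedan', 'vehicle=truck', 'vehicle=van']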
# coding:utf-8
from numpy import genfromtxt
import numpy as np
from sklearn import datasets, linear_model

# convert the csv file into a matrix
dataPath = r"F:\study\code\python\machine_learning\Regression_problem\Delivery_Dummy.csv"
deliveryData = genfromtxt(dataPath, delimiter=',')

print 'data:'
print deliveryData

# the first colon takes all rows; :-1 takes every column except the last
X = deliveryData[:, :-1]
Y = deliveryData[:, -1]
# print "X:", X
# print "Y:", Y

# build the model
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print "coefficients:"
# coefficients of the fitted plane
print regr.coef_
print "intercept:"
# intercept
print regr.intercept_

xPredicted = [[102, 6, 1, 0, 0]]
yPredicted = regr.predict(xPredicted)
print "predicted y:"
print yPredicted
Run result:
Delivery_Dummy.csv (already preprocessed): columns C-E are the dummy-encoded columns derived from the original data
data:
SVM algorithm:
# coding:utf-8
from sklearn import svm

# the SVM internals are too involved to cover here; this only shows how to use it
x = [[1, 1], [2, 0], [2, 3]]
# y holds the class labels; in python they are usually written as 0 or 1
y = [0, 0, 1]

clf = svm.SVC(kernel='linear')
clf.fit(x, y)

print(clf)
# the support vectors
print(clf.support_vectors_)
# the indices of the support vectors
print(clf.support_)
# how many support vectors lie on each side of the hyperplane
print(clf.n_support_)
# prediction
print(clf.predict([[2, .1]]))
Run result:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
[[1. 1.]
[2. 3.]]
[0 2]
[1 1]
[0]
Process finished with exit code 0
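For a linear kernel the separating hyperplane is w·x + b = 0 with w = clf.coef_[0] and b = clf.intercept_[0]; the sign of the decision value tells which side a point falls on. A small sketch, continuing from the clf fitted above:

import numpy as np

# w.x + b for the test point; for this two-class problem a negative value means class 0
w = clf.coef_[0]
b = clf.intercept_[0]
print np.dot(w, [2, .1]) + b
# the same value is returned by the built-in decision function
print clf.decision_function([[2, .1]])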
example_1:
# coding:utf-8
# numpy: scientific computing support for python
import numpy as np
# pylab: plotting
import pylab as pl
from sklearn import svm

# create 40 points
np.random.seed(0)
# np.random.randn(20, 2) draws a 20 x 2 matrix from a normal distribution;
# shifting one half by [-2, -2] and the other by [+2, +2] makes the two groups
# separable by a straight line
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
# print X

# class labels of the points
Y = [0] * 20 + [1] * 20
# print Y
# Y = [0, 0, ..., 0, 1, 1, ..., 1]  (20 zeros followed by 20 ones)

# build the svm model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# print clf.support_vectors_
# clf.support_vectors_ holds the support vectors; in this example there are three

w = clf.coef_[0]
# print w
# w = [0.90230696 0.64821811]

# slope of the separating line
a = -w[0] / w[1]
# print a

# np.linspace(-5, 5) generates 50 evenly spaced points between -5 and 5
# (a third argument can be passed to choose the number of points)
xx = np.linspace(-5, 5)
# print xx

# -(clf.intercept_[0]) / w[1] is the intercept of the separating line
yy = a * xx - (clf.intercept_[0]) / w[1]

# the two parallel lines that pass through the support vectors above and below the separating line
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

# plot the three lines
pl.plot(xx, yy, 'k-')
pl.plot(xx, yy_down, 'k--')
pl.plot(xx, yy_up, 'k--')

# plot the points; the support vectors are drawn as larger hollow circles
pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=80, facecolors='none')
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)

pl.axis('tight')
pl.show()
Run result:
example_2:
# coding:utf-8
# for timing
from time import time
# generic logging facility
import logging
# matplotlib is used to plot the recognized faces
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
# downloads the LFW (Labeled Faces in the Wild) face data set
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC

"""
Logging has 5 levels, from low to high: DEBUG, INFO, WARNING, ERROR, CRITICAL.
DEBUG:    detailed information, usually only of interest when diagnosing problems
INFO:     confirmation that things are working as expected
WARNING:  something unexpected happened, or a problem is coming soon
          (e.g. "disk space low"); the software still works as expected
ERROR:    a more serious problem; some function could not be performed
CRITICAL: a serious error; the program itself may be unable to continue running

logging.basicConfig accepts a `format` argument with fields such as:
%(levelno)s:    numeric log level         %(levelname)s:  log level name
%(pathname)s:   path of the running program (sys.argv[0])
%(filename)s:   name of the running program
%(funcName)s:   current function          %(lineno)d:     current line number
%(asctime)s:    timestamp                 %(thread)d:     thread id
%(threadName)s: thread name               %(process)d:    process id
%(message)s:    the log message
"""
# print progress information as the program runs
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

# download the face data set
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# number of images in the data set, plus h and w (used later for plotting)
n_samples, h, w = lfw_people.images.shape

# X is the feature matrix: one row per instance, one column per feature value
X = lfw_people.data
# n_features is the dimensionality: how many feature values are extracted per face
n_features = X.shape[1]

# y holds the target class of each instance, i.e. which person the face belongs to
y = lfw_people.target
target_names = lfw_people.target_names
# number of classes = number of different people
n_classes = target_names.shape[0]

print("Total dataset size:")
# number of instances
print("n_samples:%d" % n_samples)
# dimensionality of the feature vectors
print("n_features:%d" % n_features)
# number of people
print("n_classes:%d" % n_classes)

# split the data with train_test_split into four parts: the training features,
# the test features, the training labels and the test labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# dimensionality reduction, because the feature vectors are still quite high dimensional
n_components = 150

print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
# record the time
t0 = time()
# reduce the high dimensional feature vectors to a lower dimension
pca = PCA(n_components=n_components, whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

# the components extracted from the face images are called eigenfaces
eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
# project the training and test feature vectors into the lower dimensional space
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))
print "*" * 100
# print "X_test_pca", X_test_pca
# print "X_train_pca", X_train_pca

print("Fitting the classifier to the training set")
t0 = time()
# param_grid tries several values for each parameter:
# C is the penalty weight and gamma controls how many feature points are used;
# since we do not know the best values in advance, several combinations are tried
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
# GridSearchCV runs the SVC with every combination of the listed parameters
# and keeps the one that performs best
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
# the modelling itself is simple; most of the work is data preprocessing
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

# predict on the test set to see how accurate the model is
print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))


# visualize the results: plot the requested images in a grid
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    # create a figure as background
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())


# build a title that shows the predicted and the true class label (e.g. Bush)
def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)


# store the predicted names as plot titles
prediction_titles = [title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0])]

plot_gallery(X_test, prediction_titles, h, w)

eigenface_titles = ['eigenface %d' % i for i in range(eigenfaces.shape[0])]
# show what the faces look like after the eigenface (PCA) projection
plot_gallery(eigenfaces, eigenface_titles, h, w)

plt.show()
Run result:
Total dataset size:
n_samples:1288
n_features:1850
n_classes:7
Extracting the top 150 eigenfaces from 966 faces
done in 0.111s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.014s
****************************************************************************************************
Fitting the classifier to the training set
done in 30.601s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.005, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
Predicting people's names on the test set
done in 0.079s
precision recall f1-score support
Ariel Sharon 0.92 0.60 0.73 20
Colin Powell 0.81 0.86 0.84 51
Donald Rumsfeld 0.82 0.72 0.77 25
George W Bush 0.80 0.97 0.88 141
Gerhard Schroeder 0.95 0.56 0.71 32
Hugo Chavez 1.00 0.61 0.76 18
Tony Blair 0.94 0.83 0.88 35
avg / total 0.85 0.84 0.83 322
[[ 12 4 0 4 0 0 0]
[ 0 44 0 7 0 0 0]
[ 0 3 18 4 0 0 0]
[ 0 2 2 137 0 0 0]
[ 1 1 1 10 18 0 1]
[ 0 0 0 5 1 11 1]
[ 0 0 1 5 0 0 29]]