



区分一下 * @ .dot np.multiply:

*: 根据数据类型的不同,可能是做点乘运算,也可能做矩阵乘法运算
@: 只做矩阵乘法运算
.dot: 只做矩阵乘法运算

$$J\left( \theta \right)=\frac{1}{2m}\sum\limits_{i=1}^{m}{{\left( {h}_{\theta }\left( {x}^{(i)} \right)-{y}^{(i)} \right)}^{2}}$$

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 初始化数据的三个函数,,,这里只是为了省事。。还要根据实际的数据形式读取特征啊之类的。。
def get_X(df):  # read the features
    """Return the feature matrix of *df* with a leading column of ones.

    Every column except the last one is taken as a feature (the last column
    is presumably the label -- confirm against the data file in use).

    The bias column is added to a copy, so *df* itself is left untouched and
    the function can be called repeatedly on the same frame.  Inserting into
    *df* directly would raise ``ValueError`` on the second call because the
    ``'ones'`` column would already exist.

    Args:
        df: pandas DataFrame whose last column is not a feature.

    Returns:
        numpy array with one row per row of *df*: a column of ones followed
        by all columns of *df* except the last.
    """
    augmented = df.copy()
    augmented.insert(0, 'ones', 1)  # bias term x0 = 1
    features = augmented.iloc[:, :-1]  # drop the trailing (label) column
    return np.array(features.values)

def get_y(df):  # read the labels
    """Return the last column of *df* as an (m, 1) numpy array.

    Reads from the *df* argument itself rather than from a module-level
    frame, so the result always matches the frame that was passed in.

    Args:
        df: pandas DataFrame whose last column holds the target values.

    Returns:
        numpy array of shape (len(df), 1).
    """
    labels = df.iloc[:, -1:]  # slice (not index) keeps the column 2-D
    return np.array(labels.values)

def normalize_feature(df):  # standardise each column
    """Return *df* with every column rescaled to zero mean and unit std.

    Each column has its own mean subtracted and is divided by its own
    standard deviation (pandas' default sample standard deviation).

    Args:
        df: pandas DataFrame of numeric columns.

    Returns:
        A new DataFrame of the same shape; *df* is not modified.
    """
    return df.apply(lambda column: (column - column.mean()) / column.std())
# 计算的函数
def computercost(X, y, theta):     # cost function
    """Return the squared-error cost sum((X @ theta.T - y)**2) / (2 * len(X)).

    Args:
        X: design matrix; one row per sample.
        y: targets, shaped so that ``X @ theta.T - y`` is well defined.
        theta: parameters as a row (it is transposed before multiplying).

    Returns:
        The scalar cost.
    """
    residual = (X @ theta.T) - y
    squared_residual = np.power(residual, 2)
    sample_count = len(X)
    return np.sum(squared_residual) / (2 * sample_count)

def cost2(theta,X,y):   # alternative formulation of the cost
    """Return the squared-error cost (error.T @ error) / (2 * m).

    ``m`` is the number of rows of *X*; it is derived here instead of being
    read from an undefined global.

    Args:
        theta: parameters, shaped so that ``X @ theta`` is well defined.
        X: design matrix; one row per sample.
        y: targets with the same shape as ``X @ theta``.

    Returns:
        The cost; a scalar for 1-D inputs, a 1x1 array when the error is a
        column vector.
    """
    m = len(X)
    error = X @ theta - y
    square_sum = error.T @ error
    cost = square_sum / (2*m)
    return cost

def gradientDescent(X, y, theta, alpha, iters):  # batch gradient descent
    """Run *iters* steps of batch gradient descent for linear regression.

    Inputs may be ``np.matrix`` or plain ``ndarray``; they are converted to
    2-D float arrays first.  The element-wise per-parameter loop only gave
    the right answer for ``np.matrix`` inputs: with an ``ndarray`` the
    column ``X[:, j]`` is 1-D and broadcasting against the (m, 1) error
    produced an (m, m) product and a wrong gradient.

    Args:
        X: (m, n) design matrix, including the column of ones.
        y: m targets (any shape with m elements).
        theta: n initial parameters (any shape with n elements).  Not
            modified.
        alpha: learning rate.
        iters: number of update steps.

    Returns:
        (theta, cost): the final parameters as a 1 x n ``np.matrix`` and an
        array of length *iters* holding the cost after each update.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    theta = np.asarray(theta, dtype=float).reshape(1, -1)
    m = len(X)
    cost = np.zeros(iters)  # cost after each iteration

    error = X @ theta.T - y  # (m, 1)
    for i in range(iters):
        # error.T @ X sums error * X[:, j] over the samples for every j at
        # once, so all parameters are updated simultaneously.
        theta = theta - (alpha / m) * (error.T @ X)
        error = X @ theta.T - y
        # same formula as computercost: sum of squared errors / (2 * m)
        cost[i] = np.sum(np.power(error, 2)) / (2 * m)
    return np.matrix(theta), cost

# 正规方程
def normalEqn(X, y):
    """Solve linear regression in closed form via the normal equation.

    Computes ``pinv(X.T @ X) @ X.T @ y``.  The pseudo-inverse is used
    instead of a plain inverse so that a singular ``X.T @ X`` (for example
    duplicated or linearly dependent feature columns) yields the
    minimum-norm solution instead of raising ``LinAlgError``.  When
    ``X.T @ X`` is invertible the result is the same up to rounding.

    Args:
        X: (m, n) design matrix, including the column of ones.
        y: targets with m rows.

    Returns:
        The fitted parameters, with n rows.
    """
    theta = np.linalg.pinv(X.T @ X) @ X.T @ y  # X.T @ X is the same as X.T.dot(X)
    return theta

# Compare the effect of different learning rates
data = pd.read_csv('ex1data1.txt',header=None,names=['Population','Profit'])

base = np.logspace(-1, -5, num=4)  # 4 log-spaced learning rates from 1e-1 down to 1e-5
candidate = np.sort(np.concatenate((base, base*3)))  # add 3x each rate, then sort ascending
print(candidate)       # the 8 candidate learning rates
[1.00000000e-05 3.00000000e-05 2.15443469e-04 6.46330407e-04
 4.64158883e-03 1.39247665e-02 1.00000000e-01 3.00000000e-01]  

# Every learning rate must start from the same all-zero parameters.
# Feeding the previous run's result back in would make each curve start
# where the last one stopped, so the curves could not be compared.
# NOTE(review): X and y are not defined in the visible part of this file.
theta0 = np.matrix(np.zeros(X.shape[1]))
fig, ax = plt.subplots(figsize=(12,8))
iters = 100
for i in candidate:
    theta, cost = gradientDescent(X, y, theta0, i, iters)
    ax.plot(np.arange(iters), cost, label=i)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# 可以看出学习率过大,过小都是不适合的




from sklearn import linear_model
# Fit the same data with scikit-learn's LinearRegression and plot the result.
model = linear_model.LinearRegression()
model.fit(X, y)                 
x = np.array(X[:, 1].A1)  # `.A1` is an np.matrix attribute, so X is presumably a matrix here -- TODO confirm
f = model.predict(X).flatten()             # model predictions for X, flattened to 1-D for plotting

fig, ax = plt.subplots(figsize=(12,8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Traning Data')
ax.set_title('Predicted Profit vs. Population Size')

# 这个就图一乐。。。等以后在解决吧。。。
def linear_regression(X_data, y_data, alpha, epoch, optimizer=tf.train.GradientDescentOptimizer):
    """Fit linear regression with a TensorFlow graph-mode optimiser.

    Builds a graph computing ``loss = (1 / 2m) * (XW - y).T @ (XW - y)``,
    runs up to *epoch* optimiser steps and stops early once two consecutive
    losses differ by less than 1e-9.

    NOTE(review): ``tf`` is not imported in the visible part of this file;
    the API used here (placeholder / Session / variable_scope) is the
    TensorFlow 1.x graph API.

    Args:
        X_data: (m, n) array of features.
        y_data: (m, 1) array of targets.
        alpha: learning rate passed to the optimiser.
        epoch: maximum number of optimisation steps.
        optimizer: optimiser class, called as ``optimizer(learning_rate=alpha)``.

    Returns:
        dict with ``'loss'`` (list of the loss recorded at each step) and
        ``'parameters'`` (the weights fetched at the last step, shape (n, 1),
        or None when ``epoch`` is 0).
    """
    # placeholders for graph input
    X = tf.placeholder(tf.float32, shape=X_data.shape)
    y = tf.placeholder(tf.float32, shape=y_data.shape)

    # construct the graph
    with tf.variable_scope('linear-regression'):
        W = tf.get_variable("weights",
                            (X_data.shape[1], 1),
                            initializer=tf.constant_initializer())  # n*1

        y_pred = tf.matmul(X, W)  # m*n @ n*1 -> m*1

        loss = 1 / (2 * len(X_data)) * tf.matmul((y_pred - y), (y_pred - y), transpose_a=True)  # (m*1).T @ m*1 = 1*1

    opt = optimizer(learning_rate=alpha)
    opt_operation = opt.minimize(loss)

    loss_data = []
    W_val = None  # stays None if the loop below never runs

    # run the session
    with tf.Session() as sess:
        # variables must be initialised before the first sess.run on them
        sess.run(tf.global_variables_initializer())

        for _ in range(epoch):
            _, loss_val, W_val = sess.run([opt_operation, loss, W], feed_dict={X: X_data, y: y_data})
            loss_data.append(loss_val[0, 0])  # because every loss_val is 1*1 ndarray

            # early break when it's converged
            if len(loss_data) > 1 and np.abs(loss_data[-1] - loss_data[-2]) < 10 ** -9:
                break

    # clear the graph so a second call can create the 'weights' variable again
    tf.reset_default_graph()
    return {'loss': loss_data, 'parameters': W_val}


就是拟合出一个边界,将数据分为0,1 两类。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the first logistic-regression data set; the file has no header row,
# so the three column names are supplied here.
path = 'ex2data1.txt'
data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
Exam 1	Exam 2	Admitted
0	34.623660	78.024693	0
1	30.286711	43.894998	0
2	35.847409	72.902198	0
3	60.182599	86.308552	1
4	79.032736	75.344376	1

def sigmoid(z):              # the logistic function
    """Return 1 / (1 + exp(-z)), evaluated element-wise for array input."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def cost(theta, X, y):           # logistic-regression cost function
    """Return the mean cross-entropy cost of logistic regression.

    All three inputs are wrapped in ``np.matrix``; theta is used as a row
    (it is transposed before being multiplied by X).

    Args:
        theta: parameters, one per column of X.
        X: design matrix, one row per sample.
        y: 0/1 labels, shaped like ``X * theta.T``.

    Returns:
        sum(-y*log(h) - (1-y)*log(1-h)) / len(X), with h = sigmoid(X * theta.T).
    """
    theta_m = np.matrix(theta)
    X_m = np.matrix(X)
    y_m = np.matrix(y)
    hypothesis = sigmoid(X_m * theta_m.T)
    positive_part = np.multiply(-y_m, np.log(hypothesis))
    negative_part = np.multiply((1 - y_m), np.log(1 - hypothesis))
    total = np.sum(positive_part - negative_part)
    return total / (len(X_m))

def gradient(theta,X,y):        # derivative part of gradient descent
    """Return the gradient of the logistic cost as a 1-D array.

    Inputs are wrapped in ``np.matrix``; theta is used as a row.  Entry j of
    the result is sum((sigmoid(X * theta.T) - y) * X[:, j]) / len(X).

    Args:
        theta: parameters, one per column of X.
        X: design matrix, one row per sample.
        y: labels, shaped like ``X * theta.T``.

    Returns:
        numpy array with one entry per parameter.
    """
    theta_m = np.matrix(theta)
    X_m = np.matrix(X)
    y_m = np.matrix(y)
    n_params = int(theta_m.ravel().shape[1])
    n_samples = len(X_m)
    residual = sigmoid(X_m * theta_m.T) - y_m
    per_param = [np.sum(np.multiply(residual, X_m[:, j])) / n_samples
                 for j in range(n_params)]
    grad = np.zeros(n_params)
    grad[:] = per_param
    return grad
def gradient2(theta, X, y):      # shorter, vectorised alternative
    """Return ((1 / len(X)) * X.T) @ (sigmoid(X @ theta.T) - y).

    Shapes noted by the original author: y is m*1, X is m*n, theta is 1*n.
    """
    sample_count = len(X)
    residual = sigmoid(X @ theta.T) - y
    scaled_features = (1 / sample_count) * X.T
    return scaled_features @ residual

def predict(theta, X):     # classify each row of X as 0 or 1
    """Return a list of 0/1 predictions, one per row of ``X * theta.T``.

    A row is labelled 1 when its sigmoid probability is at least 0.5 and 0
    otherwise.  ``*`` is used for the product, so the inputs are presumably
    ``np.matrix`` objects -- TODO confirm against the caller.
    """
    threshold = 0.5
    probabilities = sigmoid(X * theta.T)
    return [1 if p >= threshold else 0 for p in probabilities]

def predict2(x, theta):
    """Return 0/1 predictions as an integer array.

    Entries where ``sigmoid(x @ theta)`` is at least 0.5 become 1, all
    others become 0.
    """
    is_positive = sigmoid(x @ theta) >= 0.5
    return is_positive.astype(int)

# One zero-initialised parameter per column of X, stored as a 1 x n matrix.
# NOTE(review): X and y are not constructed from `data` in the visible code.
theta = np.matrix(np.zeros(X.shape[1]))
X.shape, theta.shape, y.shape  # notebook-style shape check; has no effect when run as a script
>>> ((100, 3), (1, 3), (100, 1))

import scipy.optimize as opt          # 使用这个科学计算包进行梯度下降
# Minimise `cost` starting from `theta`, supplying `gradient` as its derivative.
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
>>> (array([-25.16131872,   0.20623159,   0.20147149]), 36, 0)

# Same minimisation through the generic `minimize` interface, using Newton-CG.
res = opt.minimize(fun=cost, x0=theta, args=(X, y), method='Newton-CG', jac=gradient)
    fun: 0.2034977018633035
     jac: array([-2.12380382e-05, -1.40885753e-03, -1.27811598e-03])
 message: 'Optimization terminated successfully.'            # 以一种优化,这返回的啥。。
    nfev: 72
    nhev: 0
     nit: 28
    njev: 186
  status: 0
 success: True
       x: array([-25.16007951,   0.20622062,   0.20146256])    # 结果是一样的
from sklearn.metrics import classification_report#这个包是评价报告
def predict(x, theta):
    """Return 0/1 predictions: 1 where ``sigmoid(x @ theta) >= 0.5``, else 0.

    Note the argument order is (x, theta).
    """
    at_or_above_half = sigmoid(x @ theta) >= 0.5
    return at_or_above_half.astype(int)
# Take the optimiser's solution and predict a label for every row of X.
final_theta = res.x
y_pred = predict(X, final_theta)

print(classification_report(y, y_pred))   # per-class precision, recall, f1-score and support
                precision    recall  f1-score   support   # 查准率,召回率,F1值,另一个不知道但以后会知道的

           0       0.87      0.85      0.86        40
           1       0.90      0.92      0.91        60

    accuracy                           0.89       100
   macro avg       0.89      0.88      0.88       100
weighted avg       0.89      0.89      0.89       100

def accuracy(result,X,y):     # training accuracy
    """Print and return the share of correctly classified samples, in percent.

    The percentage is ``100 * correct / total`` rounded to the nearest whole
    number.  (A modulo, ``correct % total``, only looks right when there are
    exactly 100 samples and reports 0 when every sample is correct.)

    The predictions are computed here directly instead of going through
    ``predict``, because this file defines ``predict`` more than once with
    different argument orders.

    Args:
        result: optimiser output whose first element is the parameter
            vector (for example the tuple returned by ``opt.fmin_tnc``).
        X: (m, n) design matrix.
        y: m labels, each 0 or 1 (any shape with m elements).

    Returns:
        The accuracy as an integer percentage (also printed).
    """
    theta_min = np.asarray(result[0], dtype=float).ravel()
    scores = np.asarray(X, dtype=float) @ theta_min
    # sigmoid(z) >= 0.5 exactly when z >= 0, so the sign of the score is the class
    predictions = (scores >= 0).astype(int)
    labels = np.asarray(y).ravel()
    n_correct = int(np.sum(predictions == labels))
    percent = int(round(100 * n_correct / len(labels)))
    print ('accuracy = {0}%'.format(percent))
    return percent

>>>accuracy = 89%        # 所有样本中预测正确的,应该不是查准率或召回率吧。.

那个sigmoid 函数的图像,使用这个进行二分类。

#   看一下决策边界的样子
# Decision boundary: t0 + t1*x1 + t2*x2 = 0  ->  x2 = -(t0/t2) - (t1/t2)*x1,
# so after dividing by -t2, coef[0] is the intercept and coef[1] the slope.
coef = -(res.x / res.x[2])
print(coef)

# Separate names for the boundary line keep the labels `y` used by the
# later fitting and reporting code from being overwritten.
boundary_x = np.arange(130, step=0.1)
boundary_y = coef[0] + coef[1] * boundary_x

# NOTE(review): `sns` (seaborn) is not imported in the visible part of the file.
sns.set(context="notebook", style="ticks", font_scale=1.5)

# Keyword x=/y= are required by current seaborn; fit_reg=False keeps the
# scatter free of per-class regression lines (as draw_boundary does).
sns.lmplot(x='Exam 1', y='Exam 2', hue='Admitted', data=data,
           fit_reg=False,
           scatter_kws={"s": 25})

plt.plot(boundary_x, boundary_y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')



应对这种非线性的数据分类问题,构建的特征方程应该是 多项式特征。。

# Load the second data set; the file has no header row, so the three column
# names are supplied here.
path =  'ex2data2.txt'
data2 = pd.read_csv(path, header=None, names=['Test 1', 'Test 2', 'Accepted'])
	Test 1	Test 2	Accepted       # 这里只有两个特征 x1, x2 再加一个 x0
0	0.051267	0.69956	1
1	-0.092742	0.68494	1               # 二维的数据应该就构建两个特征吧。。
2	-0.213710	0.69225	1
3	-0.375000	0.50219	1
4	-0.513250	0.46564	1

# Build polynomial feature columns from the two raw test scores, in place.
degree = 5
x1 = data2['Test 1']
x2 = data2['Test 2']

data2.insert(3, 'Ones', 1)  # bias column, placed after the three original columns

for i in range(1, degree):   # i = 1, 2, 3, 4
    for j in range(0, i):      # j = 0 .. i-1
        data2['F' + str(i) + str(j)] = np.power(x1, i-j) * np.power(x2, j)   # x1**(i-j) * x2**j
        # Column 'F<i><j>' holds x1**(i-j) * x2**j: F10, F20, F21, F30, F31, F32, ...
        # NOTE(review): j never reaches i, so the pure x2**i terms are not generated --
        # confirm whether that is intended (later code assumes the resulting 11 feature columns).

# The raw scores are no longer needed once the mapped features exist.
data2.drop('Test 1', axis=1, inplace=True)
data2.drop('Test 2', axis=1, inplace=True)

	Accepted	Ones	F10	F20	F21	F30	F31	F32	F40	F41	F42	F43   # 对应特征的次方
0	1	1	0.051267	0.002628	0.035864	0.000135	0.001839	0.025089	0.000007	0.000094	0.001286	0.017551
1	1	1	-0.092742	0.008601	-0.063523	-0.000798	0.005891	-0.043509	0.000074	-0.000546	0.004035	-0.029801
2	1	1	-0.213710	0.045672	-0.147941	-0.009761	0.031616	-0.102412	0.002086	-0.006757	0.021886	-0.070895
3	1	1	-0.375000	0.140625	-0.188321	-0.052734	0.070620	-0.094573	0.019775	-0.026483	0.035465	-0.047494
4	1	1	-0.513250	0.263426	-0.238990	-0.135203	0.122661	-0.111283	0.069393	-0.062956	0.057116	-0.051818


$$J\left( \theta \right)=\frac{1}{m}\sum\limits_{i=1}^{m}{\left[ -{y}^{(i)}\log \left( {h}_{\theta }\left( {x}^{(i)} \right) \right)-\left( 1-{y}^{(i)} \right)\log \left( 1-{h}_{\theta }\left( {x}^{(i)} \right) \right) \right]}+\frac{\lambda }{2m}\sum\limits_{j=1}^{n}{\theta _{j}^{2}}$$

def costReg(theta,X,y,learningRate):   # regularised cost function
    """Return the logistic cross-entropy cost plus an L2 penalty.

    Inputs are wrapped in ``np.matrix``; theta is used as a row.  The
    penalty is ``learningRate / (2 * len(X))`` times the sum of squared
    parameters, skipping the first one.  Despite its name, ``learningRate``
    is used here as the weight of the regularisation term.

    Args:
        theta: parameters, one per column of X.
        X: design matrix, one row per sample.
        y: 0/1 labels, shaped like ``X * theta.T``.
        learningRate: weight of the penalty term.

    Returns:
        The scalar regularised cost.
    """
    theta_m = np.matrix(theta)
    y_m = np.matrix(y)
    X_m = np.matrix(X)
    sample_count = len(X_m)
    hypothesis = sigmoid(X_m * theta_m.T)
    positive_part = np.multiply(-y_m, np.log(hypothesis))
    negative_part = np.multiply((1 - y_m), np.log(1 - hypothesis))
    # the first parameter is left out of the penalty
    penalty = (learningRate/(2*(sample_count))) * np.sum(np.power(theta_m[:,1:],2))
    return np.sum(positive_part - negative_part) / sample_count + penalty

def gradientReg(theta, X, y, learningRate):  # gradient of the regularised cost
    """Return the gradient of the regularised logistic cost as a 1-D array.

    Inputs are wrapped in ``np.matrix``; theta is used as a row.  Entry 0
    (the bias) is the plain gradient; every other entry j additionally gets
    ``(learningRate / len(X)) * theta[j]``.  Despite its name,
    ``learningRate`` is used here as the weight of the regularisation term.

    Args:
        theta: parameters, one per column of X.
        X: design matrix, one row per sample.
        y: 0/1 labels, shaped like ``X * theta.T``.
        learningRate: weight of the penalty term.

    Returns:
        numpy array with one gradient entry per parameter.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(theta.ravel().shape[1])  # n
    grad = np.zeros(parameters)
    error = sigmoid(X * theta.T) - y   # (m, 1) column of per-sample residuals
    for i in range(parameters):
        term = np.multiply(error, X[:,i])
        if (i == 0):
            grad[i] = np.sum(term) / len(X)   # theta0 is not regularised
        else:
            # theta[0, i] is a scalar; theta[:, i] would be a 1x1 matrix
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[0, i])
    return grad    # one entry per theta

# Split data2 by hand: column 0 ('Accepted') is the label, the rest are features.
cols = data2.shape[1]
X2 = data2.iloc[:,1:cols]
y2 = data2.iloc[:,0:1]
learningRate = 1  # passed to costReg/gradientReg, where it weights the regularisation term
X2 = np.array(X2.values)
y2 = np.array(y2.values)
theta2 = np.zeros(11)  # one parameter per feature column: 'Ones' plus the ten F.. columns
>>> ((118, 11), (118, 1), (11,))

# Minimise the regularised cost; fprime supplies its gradient.
result2 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, learningRate))
result2              # notebook-style display of the optimiser output
>>>(array([ 0.53010248,  0.29075567, -1.60725764, -0.58213819,  0.01781027,
        -0.21329508, -0.40024142, -1.37144139,  0.02264304, -0.9503358 ,
         0.0344085 ]),
>>> accuracy = 78%


# Reload the same file with lower-case column names for the second implementation.
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
   test1	test2	accepted
0	0.051267	0.69956	1
1	-0.092742	0.68494	1
2	-0.213710	0.69225	1
3	-0.375000	0.50219	1
4	-0.513250	0.46564	1

def feature_mapping(x, y, power, as_ndarray=False):     # build polynomial features
    """Map two feature vectors to all monomials x**a * y**b with a + b <= power.

    Column ``f<a><b>`` holds ``x**a * y**b``.  Columns are ordered by total
    degree and, within a degree, by increasing power of *y*
    (f00, f10, f01, f20, f11, f02, ...).

    Args:
        x: first feature, array-like of length m.
        y: second feature, array-like of length m.
        power: highest total degree to generate.
        as_ndarray: when True return a plain numpy array instead of a
            DataFrame.

    Returns:
        pandas DataFrame with one column per monomial, or its ``.values``
        array when *as_ndarray* is True.
    """
    data = {"f{}{}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in range(power + 1)
            for p in range(i + 1)}

    mapped = pd.DataFrame(data)
    if as_ndarray:
        return mapped.values
    return mapped
# Map the two raw scores to polynomial features up to total degree 6.
x1 = np.array(df.test1)
x2 = np.array(df.test2)
data = feature_mapping(x1, x2, power=6)  # note: rebinds the module-level name `data`
data.head()      # power=6 gives 28 terms (1 + 2 + ... + 7)
	f00	f10	f01	f20	f11	f02	f30	f21	f12	f03	...	f23	f14	f05	f60	f51	f42	f33	f24	f15	f06
0	1.0	0.051267	0.69956	0.002628	0.035864	0.489384	0.000135	0.001839	0.025089	0.342354	...	0.000900	0.012278	0.167542	1.815630e-08	2.477505e-07	0.000003	0.000046	0.000629	0.008589	0.117206
1	1.0	-0.092742	0.68494	0.008601	-0.063523	0.469143	-0.000798	0.005891	-0.043509	0.321335	...	0.002764	-0.020412	0.150752	6.362953e-07	-4.699318e-06	0.000035	-0.000256	0.001893	-0.013981	0.103256
2	1.0	-0.213710	0.69225	0.045672	-0.147941	0.479210	-0.009761	0.031616	-0.102412	0.331733	...	0.015151	-0.049077	0.158970	9.526844e-05	-3.085938e-04	0.001000	-0.003238	0.010488	-0.033973	0.110047
3	1.0	-0.375000	0.50219	0.140625	-0.188321	0.252195	-0.052734	0.070620	-0.094573	0.126650	...	0.017810	-0.023851	0.031940	2.780914e-03	-3.724126e-03	0.004987	-0.006679	0.008944	-0.011978	0.016040
4	1.0	-0.513250	0.46564	0.263426	-0.238990	0.216821	-0.135203	0.122661	-0.111283	0.100960	...	0.026596	-0.024128	0.021890	1.827990e-02	-1.658422e-02	0.015046	-0.013650	0.012384	-0.011235	0.010193
5 rows × 28 columns

theta = np.zeros(data.shape[1])   # one zero-initialised parameter per mapped feature
X = feature_mapping(x1, x2, power=6, as_ndarray=True)     # plain ndarray from here on, not np.matrix
(118, 28)

def regularized_cost(theta, X, y, learningRate=1):
    """Return ``cost(theta, X, y)`` plus an L2 penalty on theta[1:].

    The penalty is ``learningRate / (2 * len(X))`` times the sum of squared
    parameters, with theta[0] left out.  Despite its name, ``learningRate``
    is used here as the weight of the regularisation term.
    """
    penalty_weight = learningRate / (2 * len(X))
    penalty = penalty_weight * np.power(theta[1:], 2).sum()  # theta[0] is not penalised
    return cost(theta, X, y) + penalty
# 正则项前面的代价函数,在上面的逻辑回归中定义的,,就是简洁
def cost(theta, X, y):
    """Return the mean cross-entropy cost, using ``@`` on plain arrays.

    Computes mean(-y*log(h) - (1-y)*log(1-h)) with h = sigmoid(X @ theta).
    """
    hypothesis = sigmoid(X @ theta)
    per_sample = -y * np.log(hypothesis) - (1 - y) * np.log(1 - hypothesis)
    return np.mean(per_sample)
# Evaluate the regularised cost at the current theta.
# NOTE(review): the assignment of the labels `y` for this section is not visible here.
regularized_cost(theta, X, y, learningRate=1)
>>> 0.6931471805599454

def regularized_gradient(theta, X, y, l=1):  # gradient of the regularised cost
    """Return ``gradient(theta, X, y)`` plus the gradient of the L2 penalty.

    The penalty contributes ``(l / len(X)) * theta[j]`` for every j >= 1 and
    0 for j == 0.
    """
    penalty_tail = (l / len(X)) * theta[1:]
    # prepend 0 so that theta[0] receives no penalty
    penalty = np.concatenate([np.array([0]), penalty_tail])
    data_gradient = gradient(theta, X, y).T
    return np.array(data_gradient + penalty)

def gradient(theta, X, y):
    """Return ((1 / len(X)) * X.T) @ (sigmoid(X @ theta.T) - y.T)."""
    residual = sigmoid(X @ theta.T) - y.T
    return (1 / len(X)) * X.T @ residual
# Evaluate the regularised gradient at the current theta (default l=1).
regularized_gradient(theta, X, y)
array([[8.47457627e-03, 1.87880932e-02, 7.77711864e-05, 5.03446395e-02,
        1.15013308e-02, 3.76648474e-02, 1.83559872e-02, 7.32393391e-03,
        8.19244468e-03, 2.34764889e-02, 3.93486234e-02, 2.23923907e-03,
        1.28600503e-02, 3.09593720e-03, 3.93028171e-02, 1.99707467e-02,
        4.32983232e-03, 3.38643902e-03, 5.83822078e-03, 4.47629067e-03,
        3.10079849e-02, 3.10312442e-02, 1.09740238e-03, 6.31570797e-03,
        4.08503006e-04, 7.26504316e-03, 1.37646175e-03, 3.87936363e-02]])

import scipy.optimize as opt
# Report the cost at the starting point.
print('init cost = {}'.format(regularized_cost(theta, X, y)))

res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=regularized_gradient)                 # minimise the regularised cost with Newton-CG
>>>                                   # 这返回的啥
init cost = 0.6931471805599454
     fun: 0.529002729712739
     jac: array([ 7.26089191e-08,  4.22913232e-09,  8.15815876e-09,  6.15699190e-08,
        7.74567232e-09, -3.09360466e-08,  2.12821347e-08,  1.22156735e-08,
        1.96058084e-08, -3.19108791e-08, -4.39405717e-09, -2.76847096e-09,
       -2.77934021e-08,  1.23592858e-08, -7.14474161e-08,  8.98276579e-09,
        1.45962365e-08, -1.00120216e-08, -7.32796823e-09,  1.43317535e-08,
       -4.38679455e-08, -4.85023121e-09, -3.40732357e-10, -1.11668147e-08,
       -5.01047274e-09, -1.44326742e-08,  8.78794915e-09, -5.71951122e-08])
 message: 'Optimization terminated successfully.'
    nfev: 7
    nhev: 0
     nit: 6
    njev: 57
  status: 0
 success: True
       x: array([ 1.27273909,  0.62527214,  1.18108783, -2.01995993, -0.91742426,
       -1.43166279,  0.12400726, -0.36553444, -0.35723901, -0.17513021,
       -1.45815774, -0.05098947, -0.61555653, -0.27470644, -1.19281683,
       -0.24218793, -0.20600565, -0.04473137, -0.27778488, -0.2953778 ,
       -0.45635711, -1.04320321,  0.02777158, -0.29243198,  0.01556636,
       -0.32738013, -0.14388704, -0.92465213])    # 这应该就是最优化的theta了。。

def predict(x, theta):
    """Return 0/1 predictions: 1 where ``sigmoid(x @ theta) >= 0.5``, else 0."""
    probabilities = sigmoid(x @ theta)
    meets_threshold = probabilities >= 0.5
    return meets_threshold.astype(int)
from sklearn.metrics import classification_report#这个包是评价报告
# Take the optimiser's solution and predict a label for every row of X.
final_theta = res.x
y_pred = predict(X, final_theta)

# Per-class precision, recall, f1-score and support.
print(classification_report(y, y_pred))
                  precision    recall  f1-score   support      # 好长啊。。。。

           0       0.90      0.75      0.82        60
           1       0.78      0.91      0.84        58

    accuracy                           0.83       118
   macro avg       0.84      0.83      0.83       118
weighted avg       0.84      0.83      0.83       118

看看不同 $\lambda$ 的决策边界

def draw_boundary(power, l):   # plot the data and the fitted decision boundary
    """Fit the feature-mapped model and draw its decision boundary.

    Reads 'ex2data2.txt', scatters the samples coloured by 'accepted' and
    overlays the grid points returned by ``find_decision_boundary``.

    NOTE(review): `sns` (seaborn) is not imported in the visible part of
    the file.

    Args:
        power: highest polynomial degree used for the feature mapping.
        l: regularisation weight passed to the fit.
    """
    density = 1000
    threshhold = 2 * 10**-3

    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshhold)

    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    # x=/y= must be keywords and `size` is called `height` in current seaborn
    sns.lmplot(x='test1', y='test2', hue='accepted', data=df, height=6, fit_reg=False, scatter_kws={"s": 100})

    # single-letter colour codes are lower-case in matplotlib
    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
def feature_mapped_logistic_regression(power, l):  # fit and return the optimal theta
    """Fit regularised logistic regression on polynomial features of ex2data2.

    Reads 'ex2data2.txt', maps the two test scores to polynomial features of
    total degree up to *power*, and minimises ``regularized_cost`` from an
    all-zero start, using ``regularized_gradient`` as the Jacobian.  The
    optimiser settings mirror the module-level Newton-CG fit in this file.

    Args:
        power: highest polynomial degree used for the feature mapping.
        l: regularisation weight, forwarded to the cost and the gradient.

    Returns:
        The parameter vector found by the optimiser (``res.x``).
    """
    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    x1 = np.array(df.test1)
    x2 = np.array(df.test2)
    y = np.array(df.iloc[:, -1])

    X = feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])

    res = opt.minimize(fun=regularized_cost,
                       x0=theta,
                       args=(X, y, l),
                       method='Newton-CG',
                       jac=regularized_gradient)
    final_theta = res.x

    return final_theta

def find_decision_boundary(density, power, theta, threshhold):
    """Return the grid points lying (almost) on the decision boundary.

    A density x density grid over [-1, 1.5] in both coordinates is mapped
    with ``feature_mapping``; a point is kept when the absolute value of its
    inner product with *theta* is below *threshhold*.

    Args:
        density: number of grid points per axis.
        power: polynomial degree passed to ``feature_mapping``.
        theta: fitted parameter vector, one entry per mapped feature.
        threshhold: how close to zero the inner product must be.

    Returns:
        (f10, f01): the two first-degree columns of the kept rows, i.e. the
        coordinates of the kept grid points.
    """
    axis_1 = np.linspace(-1, 1.5, density)
    axis_2 = np.linspace(-1, 1.5, density)   # two evenly spaced axes

    # every (axis_1, axis_2) combination, i.e. a filled square of grid points
    grid_points = [(gx, gy) for gx in axis_1 for gy in axis_2]
    x_cord, y_cord = zip(*grid_points)
    mapped_cord = feature_mapping(x_cord, y_cord, power)  # this is a dataframe

    scores = mapped_cord.values @ theta
    near_boundary = np.abs(scores) < threshhold
    decision = mapped_cord[near_boundary]

    return decision.f10, decision.f01

draw_boundary(power=6, l=1)  # degree-6 feature mapping with regularisation weight 1



from sklearn import linear_model#调用sklearn的逻辑回归包
# Fit scikit-learn's L2-penalised logistic regression on the hand-built features.
model = linear_model.LogisticRegression(penalty='l2', C=1.0)
model.fit(X2, y2.ravel())  # ravel() turns the (m, 1) label column into a 1-D array
model.score(X2, y2)  # score on the same data the model was fitted on
>>> 0.6610169491525424    # 不会调参,所以正确率不高。。。

