sklearn PLS DecisionTree EnsembleMethods

PLS dimensionality reduction forms each component from the weight vector estimated by a sequence of simple (univariate) least-squares regressions, one component per extracted dimension; after each component is extracted the data are deflated, and by the properties of least-squares fitting the successive components obtained this way are orthogonal.
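
As a sanity check on this description, here is a minimal sketch (not from the original notes; the toy data X_toy, Y_toy are only illustrative) showing that the first pair of PLS weight vectors can be recovered as the leading singular vectors of the centered cross-product matrix X^T Y; later components come from repeating this after deflating X and Y.
import numpy as np
from sklearn.cross_decomposition import PLSCanonical

rng = np.random.RandomState(0)
X_toy = rng.normal(size = (100, 4))
Y_toy = X_toy[:, :2] + rng.normal(size = (100, 2))

# center the columns (sklearn does this internally as well)
Xc = X_toy - X_toy.mean(axis = 0)
Yc = Y_toy - Y_toy.mean(axis = 0)

# the first PLS weight pair is the leading singular vector pair of X^T Y
U, s, Vt = np.linalg.svd(np.dot(Xc.T, Yc))
w = U[:, 0]

pls = PLSCanonical(n_components = 1, scale = False).fit(X_toy, Y_toy)
# the two directions agree up to an arbitrary sign flip
print(abs(np.dot(w, pls.x_weights_.ravel())))  # close to 1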

plt.subplot takes (numrows, numcols, index) to position a subplot within the figure grid.
The transform method of a fitted PLS model returns the scores, i.e. the projections of X (and Y) onto the extracted components.

The distinction to keep in mind between PLS and CCA dimensionality reduction is what they maximize: PLS components maximize the covariance between the X and Y scores, while CCA components maximize their correlation, i.e. the covariance normalized by the score variances.

import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA

n = 500 
l1 = np.random.normal(size = n)
l2 = np.random.normal(size = n)

latents = np.array([l1, l1, l2, l2]).T 
X = latents + np.random.normal(size = 4 * n).reshape((n, 4))
Y = latents + np.random.normal(size = 4 * n).reshape((n, 4))

X_train = X[:n // 2]
Y_train = Y[:n // 2]
X_test = X[n // 2:]
Y_test = Y[n // 2:]

print "Corr(X)"
print np.round(np.corrcoef(X.T), 2)
print "Corr(Y)"
print np.round(np.corrcoef(Y.T), 2)

plsca = PLSCanonical(n_components = 2)
plsca.fit(X_train, Y_train)
X_train_r, Y_train_r = plsca.transform(X_train, Y_train)
X_test_r, Y_test_r = plsca.transform(X_test, Y_test)

plt.figure(figsize = (12, 8))
plt.subplot(221)
plt.plot(X_train_r[:,0], Y_train_r[:,0], "ob", label = "train")
plt.plot(X_test_r[:,0], Y_test_r[:,0], "or", label = "test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title("Comp. 1: X vs Y (test corr = %.2f)" % np.corrcoef(X_test_r[:,0], Y_test_r[:,0])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc = "best")

plt.subplot(224)
plt.plot(X_train_r[:,1], Y_train_r[:,1], "ob", label = "train")
plt.plot(X_test_r[:,1], Y_test_r[:,1], "or", label = "test")
plt.xlabel("x scores")
plt.ylabel("y scores")
plt.title("Comp. 2: X vs Y (test corr = %.2f)" % np.corrcoef(X_test_r[:, 1], Y_test_r[:, 1])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc = "best")

plt.subplot(222)
plt.plot(X_train_r[:,0], X_train_r[:,1], "*b", label = "train")
plt.plot(X_test_r[:,0], X_test_r[:,1], "*r", label = "test")
plt.xlabel("X comp. 1")
plt.ylabel("X comp. 2")
plt.title("X comp. 1 vs X comp. 2 (test corr = %.2f)" % np.corrcoef(X_test_r[:,0], X_test_r[:,1])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc = "best")

plt.subplot(223)
plt.plot(Y_train_r[:,0], Y_train_r[:,1], "*b", label = "train")
plt.plot(Y_test_r[:,0], Y_test_r[:,1], "*r", label = "test")
plt.xlabel("Y comp. 1")
plt.ylabel("Y comp. 2")
plt.title("Y comp. 1 vs Y comp. 2 , (test corr = %.2f)" % np.corrcoef(Y_test_r[:,0], Y_test_r[:, 1])[0, 1])
plt.xticks(())
plt.yticks(())
plt.legend(loc = "best")
plt.show()

n = 1000
q = 3
p = 10
X = np.random.normal(size = n * p).reshape((n, p))
B = np.array([[1, 2] + [0] * (p - 2)] * q).T 
Y = np.dot(X, B) + np.random.normal(size = n * q).reshape((n, q)) + 5

pls2 = PLSRegression(n_components = 3)
pls2.fit(X, Y)
print "True B (sucj that: Y = XB + Err)"
print B
print "Estimated B"
print np.round(pls2.coef_, 1)
pls2.predict(X)

n = 1000
p = 10 
X = np.random.normal(size = n * p).reshape((n, p))
y = X[:, 0] + 2 * X[:, 1] + np.random.normal(size = n * 1) + 5
pls1 = PLSRegression(n_components = 3)
pls1.fit(X, y) 
print "Estimated betas"
print np.round(pls1.coef_, 1)

cca = CCA(n_components = 2)
cca.fit(X_train, Y_train)
X_train_r, Y_train_r = cca.transform(X_train, Y_train)
X_test_r, Y_test_r = cca.transform(X_test, Y_test)






The general idea of decision trees is simple. Below is an example that fits a noisy sine curve with decision tree regression, using different tree depths to see how overfitting shows up.
import numpy as np 
from sklearn.tree import DecisionTreeRegressor 
import matplotlib.pyplot as plt 

rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis = 0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))

regr_1 = DecisionTreeRegressor(max_depth = 2)
regr_2 = DecisionTreeRegressor(max_depth = 5)
regr_1.fit(X, y)
regr_2.fit(X, y)

X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
y_1 = regr_1.predict(X_test)
y_2 = regr_2.predict(X_test)

plt.figure()
plt.scatter(X, y, c = "k", label = "data")
plt.plot(X_test, y_1, c = "g", label = "max_depth = 2", linewidth = 2)
plt.plot(X_test, y_2, c = "r", label = "max_depth = 5", linewidth = 2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()







For Extremely Randomized Trees (ExtraTreesClassifier, ExtraTreesRegressor):
As in random forests, a random subset of candidate features is used,
but instead of looking for the most discriminative thresholds, thresholds
are drawn at random for each candidate feature and the best of these
randomly-generated thresholds is picked as the splitting rule.
This usually allows the variance of the model to be reduced a bit more,
at the expense of a slightly greater increase in bias.

The make_blobs function in sklearn.datasets generates multi-class samples drawn from Gaussian blobs. n_samples and n_features give the shape of the sample matrix X, centers sets how many cluster centers the distribution has, and the returned y labels each sample with the index of the center it was drawn from.
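
A minimal sketch of make_blobs (the shapes and variable names here are only illustrative):
import numpy as np
from sklearn.datasets import make_blobs

# 200 samples with 2 features, drawn around 3 Gaussian centers
X_demo, y_demo = make_blobs(n_samples = 200, n_features = 2, centers = 3, random_state = 0)
print(X_demo.shape)        # (200, 2)
print(np.unique(y_demo))   # [0 1 2]: the center each sample was drawn from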

The example below shows that Extremely Randomized Trees can sometimes outperform random forests.
(The individual trees here fit the data well, i.e. they are strong, low-bias models, and for such
models a bagging-style averaging of independently built estimators is a reasonable choice.)

from sklearn.model_selection import cross_val_score 
from sklearn.datasets import make_blobs 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.tree import DecisionTreeClassifier 

X, y = make_blobs(n_samples = 10000, n_features = 10, centers = 100, random_state = 0)
clf = DecisionTreeClassifier(max_depth = None, min_samples_split = 2, random_state = 0)
scores = cross_val_score(clf, X, y)
print "the mean score of Decision Tree :"
print scores.mean()

clf = RandomForestClassifier(n_estimators = 10, min_samples_split = 2, max_depth = None, random_state = 0)
scores = cross_val_score(clf, X, y)
print "the mean score of Random Forest :"
print scores.mean()

clf = ExtraTreesClassifier(n_estimators = 10, max_depth = None, min_samples_split = 2, random_state = 0)
scores = cross_val_score(clf, X, y)
print "the mean score of Extra random trees :"
print scores.mean()







For the iterative derivation of AdaBoost, see Wikipedia.

Because RandomForests and ExtraTrees combine independently built base estimators, their training can be spread over multiple cores; AdaBoost, being a sequential reweighting (boosting) algorithm in which each stage depends on the previous one, can only run its boosting loop on a single core.
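
A small sketch of the corresponding option (n_jobs is a standard scikit-learn parameter; -1 means use all available cores):
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# the independent trees of a forest can be grown in parallel
forest = RandomForestClassifier(n_estimators = 100, n_jobs = -1, random_state = 0)
# AdaBoost's stages depend on one another, so the class exposes no n_jobs option for the boosting loop
boost = AdaBoostClassifier(n_estimators = 100, random_state = 0)
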
sklearn.base.clone:
Construct a new estimator with the same parameters.
The difference between clone and an ordinary copy can be seen in the sketch below: clone returns a fresh, unfitted estimator with the same hyper-parameters, without carrying over any state learned by the original model.
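
A minimal sketch of that behaviour:
from sklearn.base import clone
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X_iris, y_iris = load_iris(return_X_y = True)
fitted = DecisionTreeClassifier(max_depth = 3).fit(X_iris, y_iris)

fresh = clone(fitted)
print(fresh.get_params() == fitted.get_params())  # True: same hyper-parameters
print(hasattr(fresh, "tree_"))                    # False: the fitted tree is not copied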

For ensembles with an n_estimators parameter, the estimators_ attribute of the fitted model gives the list of fitted sub-estimators.
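
Continuing the sketch above (a fitted RandomForestClassifier on X_iris, y_iris serves as the example ensemble):
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 5, random_state = 0).fit(X_iris, y_iris)
print(len(forest.estimators_))   # 5 fitted DecisionTreeClassifier objects
print(forest.estimators_[0])
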
np.meshgrid produces two coordinate matrices; ravelling them and pairing the entries gives the full grid of (x, y) points (the Cartesian product of the two input vectors).
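
A tiny sketch (np.c_ stacks the ravelled coordinates column-wise, which is how the iris plotting example further below builds the input for predict):
import numpy as np

xs = np.array([0.0, 1.0, 2.0])
ys = np.array([10.0, 20.0])
xx, yy = np.meshgrid(xs, ys)            # both have shape (2, 3)
grid = np.c_[xx.ravel(), yy.ravel()]    # the 6 (x, y) grid points
print(grid)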

plt.contourf draws filled contour lines of a function. Here is an example:
import numpy as np 
import matplotlib.pyplot as plt 

y,x = np.ogrid[-2:2:200j, -3:3:300j]
z = x * np.exp(-x**2-y**2)

extent = [np.min(x), np.max(x), np.min(y), np.max(y)]

plt.figure(figsize = (10, 4))
plt.subplot(121)
cs = plt.contour(z, 10, extent = extent)
plt.clabel(cs)

plt.subplot(122)
plt.contourf(x.reshape(-1), y.reshape(-1), z, 20)
plt.show()



In matplotlib, the alpha parameter generally sets transparency.

Below is an example that classifies the iris data with DecisionTree, RandomForest, ExtraTrees and AdaBoost (with DecisionTree as the base estimator) and visualizes the decision surfaces:
import numpy as np 
import matplotlib.pyplot as plt 

from sklearn import clone 
from sklearn.datasets import load_iris 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

n_classes = 3
n_estimators = 30 
plot_colors = "ryb"
cmap = plt.cm.RdYlBu 
plot_step = 0.02
plot_step_coarser = 0.5 
RANDOM_SEED = 13 

iris = load_iris()
plot_idx = 1 
models = [DecisionTreeClassifier(max_depth = None),
   RandomForestClassifier(n_estimators = n_estimators),
   ExtraTreesClassifier(n_estimators = n_estimators),
   AdaBoostClassifier(DecisionTreeClassifier(max_depth = 3), n_estimators = n_estimators)]

for pair in ([0, 1], [0, 2], [2, 3]):
 for model in models:
  X = iris.data[:, pair]
  y = iris.target 

  idx = np.arange(X.shape[0])
  np.random.seed(RANDOM_SEED)
  np.random.shuffle(idx)
  X = X[idx]
  y = y[idx]

  mean = X.mean(axis = 0)
  std = X.std(axis = 0)
  X = (X - mean) / std

  clf = clone(model)
  clf = clf.fit(X, y)  # fit the clone rather than the template model

  scores = clf.score(X, y)
  model_title = str(type(model)).split(".")[-1][:-2][:-len("Classifier")]
  model_details = model_title
  if hasattr(clf, "estimators_"):
   model_details += " with {} estimators".format(len(clf.estimators_))
  print(model_details + " with features", pair, "has a score of", scores)

  plt.subplot(3, 4, plot_idx)
  if plot_idx <= len(models):
   plt.title(model_title)

  x_min, x_max = X[:,0].min() - 1, X[:,0].max() + 1
  y_min, y_max = X[:,1].min() - 1, X[:,1].max() + 1
  xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step))

  if isinstance(model, DecisionTreeClassifier):
   Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
   Z = Z.reshape(xx.shape)
   cs = plt.contourf(xx, yy, Z, cmap = cmap)
  else:
   estimator_alpha = 1.0 / len(clf.estimators_)
   for tree in clf.estimators_:
    Z = tree.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, alpha = estimator_alpha, cmap = cmap)

  xx_coarser, yy_coarser = np.meshgrid(np.arange(x_min, x_max, plot_step_coarser), np.arange(y_min, y_max, plot_step_coarser))
  Z_points_coarser = clf.predict(np.c_[xx_coarser.ravel(), yy_coarser.ravel()]).reshape(xx_coarser.shape)
  cs_points = plt.scatter(xx_coarser, yy_coarser, s = 15, c = Z_points_coarser, cmap = cmap, edgecolors = "none")

  for i, c in zip(range(n_classes), plot_colors):
   idx = np.where(y == i)
   plt.scatter(X[idx, 0], X[idx, 1], c = c, label = iris.target_names[i], cmap = cmap)

  plot_idx += 1

plt.suptitle("Classifier on feature subsets of Iris dataset")
plt.axis("tight")

plt.show()







When one dimension passed to numpy's ndarray.reshape is -1, its length is inferred from the other dimensions and the total size.
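
A quick sketch:
import numpy as np

a = np.arange(12)
print(a.reshape(3, -1).shape)   # (3, 4): the -1 is inferred from the total size
print(a.reshape(-1, 6).shape)   # (2, 6)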

An ExtraTrees ensemble can also be used for feature importance evaluation: the fitted model's
feature_importances_ attribute returns a vector with one entry per feature, and larger values mean more important features.
Here is a simple example:
from time import time 
import matplotlib.pyplot as plt 

from sklearn.datasets import fetch_olivetti_faces 
from sklearn.ensemble import ExtraTreesClassifier 

n_jobs = 1
data = fetch_olivetti_faces()
X = data.images.reshape((len(data.images), -1))
y = data.target 

mask = y < 5 
X = X[mask]
y = y[mask]

print "Fitting ExtraTreesClassifier on faces data with %d cores..." % n_jobs
t0 = time() 
forest = ExtraTreesClassifier(n_estimators = 1000, max_features = 128, n_jobs = n_jobs, random_state = 0)
forest.fit(X, y)
print "done in %.3fs" % (time() - t0)
importances = forest.feature_importances_
importances = importances.reshape(data.images[0].shape)

plt.matshow(importances, cmap = plt.cm.hot)
plt.title("Pixel importances with forests of trees")
plt.show()














