sklearn random forest实验

《机器学习技法》作业

sklearn random forest实验_第1张图片

这一题的思路,先bagging,再产生决策树,再平均。

1. 不知道bagging怎么取,用一棵树的随机森林替代。

from __future__ import division 
from sklearn.ensemble import RandomForestClassifier
import numpy as np

data = np.loadtxt('hw3_train.dat') #直接读取成numpy.ndarray的形式
train_x = data[:,:-1]
train_y = data[:,-1]
total = 0

for i in range(30000):
    clf = RandomForestClassifier(n_estimators=1, max_features=None)
    clf = clf.fit(train_x, train_y)
    err_in = 1 - clf.score(train_x, train_y)
    total = total + err_in
    print i, err_in
print total/30000

2. 大爷提醒,可以一次产生N个随机数(作为下标),bootstrap样本就取出来了。

from __future__ import division 
from sklearn import tree
import numpy as np
import random

data = np.loadtxt('hw3_train.dat') #直接读取成numpy.ndarray的形式
train_x = data[:,:-1]
train_y = data[:,-1]
N = len(train_y)

total = 0
for i in range(30000):
    r = np.random.randint(0, N, N)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_x[r,:], train_y[r])
    err_in = 1 - clf.score(train_x, train_y)
    total = total + err_in
    print i, err_in
print total/30000

sklearn random forest实验_第2张图片

17、18算forest就好算多了。

from __future__ import division 
from sklearn.ensemble import RandomForestClassifier
import numpy as np

data = np.loadtxt('hw3_train.dat') #直接读取成numpy.ndarray的形式
train_x = data[:,:-1]
train_y = data[:,-1]

data = np.loadtxt('hw3_test.dat') #直接读取成numpy.ndarray的形式
test_x = data[:,:-1]
test_y = data[:,-1]
total = 0

for i in range(100):
    clf = RandomForestClassifier(n_estimators=300, max_features=None)
    clf = clf.fit(train_x, train_y)
    err_in = 1 - clf.score(test_x, test_y)
    total = total + err_in
    print i, err_in
print total/100
sklearn random forest实验_第3张图片
这个剪一下枝就好了。

from __future__ import division 
from sklearn.ensemble import RandomForestClassifier
import numpy as np

data = np.loadtxt('hw3_train.dat') #直接读取成numpy.ndarray的形式
train_x = data[:,:-1]
train_y = data[:,-1]

data = np.loadtxt('hw3_test.dat') #直接读取成numpy.ndarray的形式
test_x = data[:,:-1]
test_y = data[:,-1]

total_in = 0
total_out = 0

for i in range(100):
    clf = RandomForestClassifier(n_estimators=300, max_depth=1, max_features=None)
    clf = clf.fit(train_x, train_y)
    err_in = 1 - clf.score(train_x, train_y)
    err_out = 1 - clf.score(test_x, test_y)
    total_in = total_in + err_in
    total_out = total_out + err_out
    print i, "err_in:",err_in, "err_out:",err_out
    
print total_in/100
print total_out/100



你可能感兴趣的:(sklearn random forest实验)