Python-随机森林实例

用类把随机森林的常用操作封装起来，以后用起来会方便很多。

import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt


class MYRandomForestClassifier:
    """Convenience wrapper that pairs a scikit-learn style estimator with a dataset.

    The dataset is a 2-D array whose last column is used as the target and
    whose remaining columns are used as features (see ``FitModule``).

    Attributes are plain and public, as in the original code:
    ``modul`` is the wrapped estimator and ``dataset`` is the encoded data.
    """

    def __init__(self, Modul=None, dataset=None):
        """Create the wrapper.

        Args:
            Modul: Estimator to wrap. May be an estimator instance or an
                estimator class (a class is instantiated with no arguments).
                When omitted, a default ``RandomForestClassifier()`` is used.
            dataset: 2-D array with the target in the last column. When
                omitted, a fresh uninitialised 3x3 placeholder is allocated
                per instance; call ``ReadFile`` to load real data.
        """
        # Defaults are resolved here rather than in the signature so that
        # instances never share one array object created at definition time.
        if Modul is None:
            Modul = RandomForestClassifier
        self.modul = self._as_estimator(Modul)
        self.dataset = np.empty((3, 3)) if dataset is None else dataset

    @staticmethod
    def _as_estimator(Modul):
        """Return *Modul* itself, or ``Modul()`` when a class was passed."""
        return Modul() if isinstance(Modul, type) else Modul

    def ReadFile(self, input_file):
        """Load a comma-separated text file and label-encode every column.

        Each non-blank line becomes one row. Every column is encoded
        independently: its distinct string values are sorted and replaced by
        their integer rank (0, 1, 2, ...). The result is stored in
        ``self.dataset`` as an integer array.

        Args:
            input_file: Path of the file to read.

        Raises:
            ValueError: If the file holds no data rows, or if rows do not all
                have the same number of fields.
        """
        rows = []
        with open(input_file, 'r') as f:
            for line in f:
                # Strip only the line terminator; slicing off the last
                # character would damage a final line without a newline.
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue  # ignore blank lines instead of creating ragged rows
                rows.append(line.split(','))

        if not rows:
            raise ValueError("no data rows found in " + repr(input_file))
        width = len(rows[0])
        if any(len(row) != width for row in rows):
            raise ValueError("rows in " + repr(input_file)
                             + " do not all have the same number of fields")

        X = np.array(rows)
        # Convert string data to numerical data, one column at a time.
        # np.unique sorts the distinct values and return_inverse yields each
        # entry's index into that sorted list, i.e. its integer label.
        X_encoded = np.empty(X.shape, dtype=int)
        for col in range(X.shape[1]):
            _, codes = np.unique(X[:, col], return_inverse=True)
            X_encoded[:, col] = codes
        self.dataset = X_encoded

    def SetModule(self, Modul):
        """Replace the wrapped estimator (an instance, or a class to instantiate)."""
        self.modul = self._as_estimator(Modul)

    def FitModule(self):
        """Fit the estimator on the whole dataset and report 3-fold CV accuracy.

        Prints the mean cross-validated accuracy as a percentage.

        Returns:
            The array of per-fold accuracy scores from ``cross_val_score``.
        """
        from sklearn import model_selection

        X = self.dataset[:, :-1]
        y = self.dataset[:, -1]
        self.modul.fit(X, y)
        accuracy = model_selection.cross_val_score(
            self.modul, X, y, scoring='accuracy', cv=3)
        print("Accuracy of the classifier: " + str(round(100*accuracy.mean(), 2)) + "%")
        return accuracy

    def PlotModule(self, parameter_grid, args):
        """Plot the training score of a validation curve for one hyperparameter.

        Args:
            parameter_grid: Values of the hyperparameter to evaluate.
            args: Name of the estimator hyperparameter to vary
                (for example ``"n_estimators"`` or ``"max_depth"``).

        Returns:
            Tuple ``(train_scores, validation_scores)`` as returned by
            ``validation_curve`` (5-fold cross-validation).
        """
        from sklearn.model_selection import validation_curve

        # param_name / param_range are passed by keyword: recent scikit-learn
        # releases no longer accept them positionally.
        train_scores, validation_scores = validation_curve(
            self.modul, self.dataset[:, :-1], self.dataset[:, -1],
            param_name=args, param_range=parameter_grid, cv=5)

        # Plot the curve
        plt.figure()
        plt.plot(parameter_grid, 100*np.average(train_scores, axis=1), color='black')
        plt.title('Training curve')
        # Keep the historical label for n_estimators; otherwise label the axis
        # with the parameter that was actually varied.
        plt.xlabel('Number of estimators' if args == 'n_estimators' else args)
        plt.ylabel('Accuracy')
        plt.show()
        return train_scores, validation_scores

    def PlotLearingCurve(self, parameter_grid):
        """Print and plot learning-curve scores for the given training sizes.

        Args:
            parameter_grid: Training-set sizes passed to ``learning_curve``
                as ``train_sizes``.

        Returns:
            Tuple ``(train_sizes, train_scores, validation_scores)`` as
            returned by ``learning_curve`` (5-fold cross-validation).
        """
        # learning_curve lives in sklearn.model_selection, the same module the
        # other methods of this class import from.
        from sklearn.model_selection import learning_curve

        train_sizes, train_scores, validation_scores = learning_curve(
            self.modul, self.dataset[:, :-1], self.dataset[:, -1],
            train_sizes=parameter_grid, cv=5)
        print("\n##### LEARNING CURVES #####")
        print("\nTraining scores:\n", train_scores)
        print("\nValidation scores:\n", validation_scores)

        # Plot the curve against the sizes learning_curve actually used, so
        # the x values always line up with the rows of train_scores.
        plt.figure()
        plt.plot(train_sizes, 100*np.average(train_scores, axis=1), color='black')
        plt.title('Learning curve')
        plt.xlabel('Number of training samples')
        plt.ylabel('Accuracy')
        plt.show()
        return train_sizes, train_scores, validation_scores

    # Correctly spelled alias; the original misspelled name is kept for callers.
    PlotLearningCurve = PlotLearingCurve
# Baseline model: a 200-tree forest with depth capped at 8 and a fixed
# random_state, fitted and cross-validated on the encoded data file.
params = dict(n_estimators=200, max_depth=8, random_state=7)
classifier = RandomForestClassifier(**params)
MY = MYRandomForestClassifier(Modul=classifier)
MY.ReadFile("d:\\car.data.txt")
MY.FitModule()

# Validation curve over the number of trees (25, 50, ..., 200) at max_depth=4.
parameter_grid = np.arange(25, 201, 25)
MY.SetModule(RandomForestClassifier(max_depth=4, random_state=7))
MY.PlotModule(parameter_grid, "n_estimators")


# Validation curve over tree depth (2, 4, ..., 10) with 20 trees.
parameter_grid = np.arange(2, 11, 2)
MY.SetModule(RandomForestClassifier(n_estimators=20, random_state=7))
MY.PlotModule(parameter_grid, "max_depth")


# Learning curve for a default-sized forest at four training-set sizes.
parameter_grid = np.asarray([200, 500, 800, 1100])
MY.SetModule(RandomForestClassifier(random_state=7))
MY.PlotLearingCurve(parameter_grid)

你可能感兴趣的:(机器学习,学习随记)