# Wrapping the workflow in a class makes it much more convenient to reuse later.
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
class MYRandomForestClassifier:
    """Convenience wrapper that pairs an estimator with a label-encoded dataset.

    The dataset is a 2-D integer array whose last column is the target and
    whose remaining columns are the features. The wrapped estimator is stored
    in ``modul`` and may be swapped at any time (directly or via
    ``SetModule``).
    """

    def __init__(self, Modul=RandomForestClassifier, dataset=np.empty((3, 3))):
        """Store the estimator and an initial dataset.

        Args:
            Modul: An estimator instance, or an estimator class. A class is
                instantiated with its default arguments so that the default
                value of this parameter yields a usable model.
            dataset: 2-D array; the last column is treated as the target.
                The default is an uninitialised placeholder that is meant to
                be replaced by ``ReadFile``. It is shared between instances
                but never modified in place by this class.
        """
        self.modul = self._as_estimator(Modul)
        self.dataset = dataset
        # One fitted LabelEncoder per column, filled in by ReadFile.
        self.label_encoders = []

    @staticmethod
    def _as_estimator(Modul):
        """Return *Modul* itself, or a default-constructed instance if it is a class."""
        return Modul() if isinstance(Modul, type) else Modul

    def ReadFile(self, input_file):
        """Load a comma-separated text file and label-encode every column.

        Each non-blank line becomes one row. Every column (features and
        target alike) is encoded independently to integer codes, and the
        result is stored in ``self.dataset``. The fitted encoders are kept in
        ``self.label_encoders`` (index == column index).

        Args:
            input_file: Path of the text file to read.

        Raises:
            ValueError: If the file has no data rows or the rows do not all
                have the same number of fields.
        """
        rows = []
        with open(input_file, 'r') as f:
            for line in f:
                # Strip only the line terminator: slicing off the last
                # character would eat real data when the final line has no
                # trailing newline, and would leave '\r' on CRLF files.
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue  # blank lines would otherwise make rows ragged
                rows.append(line.split(','))

        if not rows:
            raise ValueError("No data rows found in " + repr(input_file))
        if len(set(len(row) for row in rows)) != 1:
            raise ValueError("Rows in " + repr(input_file)
                             + " do not all have the same number of fields")

        X = np.array(rows)

        # Convert string data to numerical data, one encoder per column.
        encoders = []
        X_encoded = np.empty(X.shape, dtype=int)
        for i in range(X.shape[1]):
            encoder = preprocessing.LabelEncoder()
            X_encoded[:, i] = encoder.fit_transform(X[:, i])
            encoders.append(encoder)

        self.label_encoders = encoders
        self.dataset = X_encoded

    def SetModule(self, Modul):
        """Replace the wrapped estimator (a class is instantiated with defaults)."""
        self.modul = self._as_estimator(Modul)

    def FitModule(self):
        """Fit the estimator on the whole dataset and print its CV accuracy.

        The estimator is fitted on all rows, then a 3-fold cross-validation
        with ``scoring='accuracy'`` is run on the same data and the mean
        score is printed as a percentage.
        """
        from sklearn import model_selection

        X = self.dataset[:, :-1]
        y = self.dataset[:, -1]
        self.modul.fit(X, y)

        accuracy = model_selection.cross_val_score(
            self.modul, X, y, scoring='accuracy', cv=3)
        print("Accuracy of the classifier: "
              + str(round(100 * accuracy.mean(), 2)) + "%")

    def PlotModule(self, parameter_grid, args):
        """Plot the mean training score over a range of one hyper-parameter.

        Runs a 5-fold ``validation_curve`` and plots the fold-averaged
        training score (as a percentage). The validation scores are computed
        by ``validation_curve`` but are not plotted.

        Args:
            parameter_grid: Sequence of values to try for the parameter.
            args: Name of the estimator parameter being varied, e.g.
                ``"n_estimators"`` or ``"max_depth"``.
        """
        from sklearn.model_selection import validation_curve

        X = self.dataset[:, :-1]
        y = self.dataset[:, -1]
        # param_name / param_range are keyword-only in current scikit-learn.
        train_scores, validation_scores = validation_curve(
            self.modul, X, y,
            param_name=args, param_range=parameter_grid, cv=5)

        # Keep the original caption for the estimator sweep; for any other
        # parameter, label the axis with the parameter that was varied.
        x_label = 'Number of estimators' if args == 'n_estimators' else args

        # Plot the curve
        plt.figure()
        plt.plot(parameter_grid, 100 * np.average(train_scores, axis=1),
                 color='black')
        plt.title('Training curve')
        plt.xlabel(x_label)
        plt.ylabel('Accuracy')
        plt.show()

    def PlotLearingCurve(self, parameter_grid):
        """Print and plot learning-curve scores for the given training sizes.

        Runs a 5-fold ``learning_curve``, prints the raw training and
        validation scores, and plots the fold-averaged training score (as a
        percentage) against the training-set sizes that were actually used.

        Args:
            parameter_grid: Training-set sizes passed to ``learning_curve``
                as ``train_sizes``.
        """
        # sklearn.learning_curve was removed from scikit-learn; the function
        # lives in sklearn.model_selection.
        from sklearn.model_selection import learning_curve

        train_sizes, train_scores, validation_scores = learning_curve(
            self.modul, self.dataset[:, :-1], self.dataset[:, -1],
            train_sizes=parameter_grid, cv=5)
        print("\n##### LEARNING CURVES #####")
        print("\nTraining scores:\n", train_scores)
        print("\nValidation scores:\n", validation_scores)

        # Plot the curve. Use the sizes returned by learning_curve: they are
        # the sample counts that match the rows of train_scores, which may
        # differ from the request (e.g. fractional sizes or duplicates).
        plt.figure()
        plt.plot(train_sizes, 100 * np.average(train_scores, axis=1),
                 color='black')
        plt.title('Learning curve')
        plt.xlabel('Number of training samples')
        plt.ylabel('Accuracy')
        plt.show()
# Baseline model: build the forest from an explicit parameter set, load the
# data file and report the cross-validated accuracy.
params = dict(n_estimators=200, max_depth=8, random_state=7)
classifier = RandomForestClassifier(**params)
MY = MYRandomForestClassifier(Modul=classifier)
MY.ReadFile("d:\\car.data.txt")
MY.FitModule()

# Validation curve over n_estimators, with max_depth held at 4.
MY.SetModule(RandomForestClassifier(max_depth=4, random_state=7))
parameter_grid = np.linspace(25, 200, 8).astype(int)
MY.PlotModule(parameter_grid, "n_estimators")

# Validation curve over max_depth, with n_estimators held at 20.
MY.SetModule(RandomForestClassifier(n_estimators=20, random_state=7))
parameter_grid = np.linspace(2, 10, 5).astype(int)
MY.PlotModule(parameter_grid, "max_depth")

# Learning curve for an otherwise default forest at four training-set sizes.
MY.SetModule(RandomForestClassifier(random_state=7))
parameter_grid = np.array([200, 500, 800, 1100])
MY.PlotLearingCurve(parameter_grid)