Python实现随机森林(RF)

代码如下:

#coding:utf-8


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder


# Load the activity sheet without parsing a header row, so row 0 of the frame
# is the sheet's first row and the data rows are taken from row 1 onward.
df_wine_y = pd.read_excel(".//ERα_activity.xlsx", header=None)

# Select the target as a 1-D Series (column position 2).  LabelEncoder expects
# a 1-D array; a 2-D slice such as iloc[1:, 2:] triggers a DataConversionWarning
# for one column and a ValueError for several.
y = df_wine_y.iloc[1:, 2]
# NOTE: despite its name, this encoder is applied to the target y, not to X.
# The name is kept so any later code referring to it keeps working.
labelencoder_X = LabelEncoder()
# Map each distinct target value to an integer class id.
# NOTE(review): if this column holds a continuous activity value, every
# distinct value becomes its own class — confirm classification is intended.
y = labelencoder_X.fit_transform(y)
# Wrap the encoded labels in a DataFrame and shift the index to start at 1
# instead of 0.
y = pd.DataFrame(y)
y.index = y.index + 1
print("y值数据:", y)
#df_wine.columns = []

# print(df_wine['Class label'])
# print('Class labels', np.unique(df_wine['Class label']))
# print(df_wine.head())

# Read the descriptor sheet with no header row parsed (columns are labelled
# by integer position) and echo the raw frame.
df_wine_x = pd.read_excel(".//Molecular_Descriptor.xlsx", header=None)
print(df_wine_x)

# Feature matrix: skip the first row and the first column, then convert the
# remaining cells to a float ndarray.
x = df_wine_x.iloc[1:, 1:].to_numpy().astype('float')
print("自变量数据:", x)

# Hold out 30% of the samples for testing; random_state makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# The sheet was read with header=None, so df_wine_x.columns is just integer
# positions.  Row 0 is the row skipped when building x — presumably the header
# row holding the descriptor names — so take the labels from there, skipping
# the first column exactly as x does.
feat_labels = pd.Index(df_wine_x.iloc[0, 1:].astype(str).to_numpy())
print("自变量label:", feat_labels)
# n_estimators: number of trees in the forest.
# n_jobs=-1: run fitting/prediction in parallel on all available cores.
forest = RandomForestClassifier(n_estimators=20, random_state=0, n_jobs=-1)
# fit() expects a 1-D target; y_train may be a single-column frame, so flatten
# it to avoid scikit-learn's column-vector DataConversionWarning.
forest.fit(x_train, np.ravel(y_train))

# Evaluate feature importance on the fitted forest.
# feature_importances_ holds one importance score per input feature.
importances = forest.feature_importances_
print("重要性:", importances)
x_columns = feat_labels
print(x_columns)
# argsort returns ascending order; reversing it ranks features from most to
# least important.
indices = np.argsort(importances)[::-1]
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, feat_labels[idx], importances[idx]))

# Feature selection: keep only the columns whose importance exceeds the
# threshold.
threshold = 0.15
x_selected = x_train[:, importances > threshold]

# Visualisation
# Configure a CJK-capable font (and ASCII minus signs) before any text is
# created, so the Chinese title is rendered with it.
plt.rcParams['font.sans-serif'] = ["SimHei"]
plt.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(10, 6))
plt.title("分子描述符对生物活性影响的重要性排序", fontsize=18)
plt.ylabel("import level", fontsize=15, rotation=90)

# Bars are drawn in descending-importance order, so the tick labels must be
# reordered with the same `indices`; passing the labels in their original
# column order would attach the wrong name to every bar.
positions = np.arange(len(x_columns))
plt.bar(positions, importances[indices], color='orange', align='center')
# Set the ticks once for the whole axis instead of once per bar.
plt.xticks(positions, np.asarray(x_columns)[indices], rotation=90, fontsize=15)
plt.show()

你可能感兴趣的:(Python机器学习算法,python,随机森林)