Continued from <US Census Income Prediction with sklearn, Part 1: Understanding the Data and Preprocessing>
# One-hot encode every column except the target 'predclass'
one_hot_cols = dataset_bin.columns.drop('predclass')
dataset_bin_enc = pd.get_dummies(dataset_bin, columns=one_hot_cols)
print(dataset_bin_enc.shape)
dataset_bin_enc.head()
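For intuition, here is a minimal toy sketch (made-up column, not the census data) of what get_dummies produces:

import pandas as pd
toy = pd.DataFrame({'sex': ['Male', 'Female', 'Male']})
# the 'sex' column is replaced by one indicator column per category: sex_Female, sex_Male
print(pd.get_dummies(toy, columns=['sex']))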
Purpose: get rid of NaN values so the data can be label-encoded (factorize maps NaN to -1).
dataset_con_test = dataset_con  # note: an alias, not a copy -- dataset_con is modified in place
# factorize() encodes each category as an integer in order of first appearance; NaN becomes -1
dataset_con_test['workclass'] = dataset_con['workclass'].factorize()[0]
dataset_con_test['occupation'] = dataset_con['occupation'].factorize()[0]
dataset_con_test['country'] = dataset_con['country'].factorize()[0]
# Rows whose 'workclass' was NaN before encoding
dataset_con_test[dataset_con_test['workclass']==-1]
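Why factorize() rather than LabelEncoder straight away? factorize() maps missing values to the sentinel code -1 instead of failing, which is exactly what the -1 filter above relies on. A standalone sketch with made-up data:

import numpy as np
import pandas as pd
s = pd.Series(['Private', np.nan, 'State-gov', 'Private'])
codes, uniques = s.factorize()
print(codes)    # [ 0 -1  1  0] -- the NaN entry is encoded as -1
print(uniques)  # Index(['Private', 'State-gov'], dtype='object')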
from sklearn.preprocessing import LabelEncoder
# Column-by-column version, kept for reference:
# encoder_cols = dataset_con_test.columns
# for feature in encoder_cols:
#     dataset_con_test[feature] = LabelEncoder().fit_transform(dataset_con_test[feature])
# dataset_con_test.head()
# One-pass equivalent: apply() calls fit_transform on each column separately
dataset_con_enc = dataset_con_test.apply(LabelEncoder().fit_transform)
dataset_con_enc.head()
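Note that LabelEncoder numbers classes in sorted order, unlike factorize(), which numbers them in order of first appearance; a toy sketch:

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
print(le.fit_transform(['b', 'a', 'b', 'c']))  # [1 0 1 2] -- classes are sorted alphabetically
print(le.classes_)                             # ['a' 'b' 'c']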
plt.figure(figsize = (30,14))
plt.subplot(1,2,1)
# Mask the upper triangle so each correlation appears only once
mask = np.zeros_like(dataset_bin_enc.corr(), dtype=bool)  # np.bool was removed in newer NumPy; use the builtin bool
mask[np.triu_indices_from(mask)] = True
sns.heatmap(dataset_bin_enc.corr(), vmin=-1, vmax=1, square=True, mask=mask,
            cmap=sns.color_palette('RdBu_r', 100), linewidth=0.5)
plt.subplot(1,2,2)
mask = np.zeros_like(dataset_con_enc.corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(dataset_con_enc.corr(), vmin=-1, vmax=1, square=True, mask=mask,
            cmap=sns.color_palette('RdBu_r', 100), linewidth=0.5)
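The heatmaps are dense, so a complementary sketch (assuming pandas >= 1.1 for the key argument) that simply lists the features most correlated with the target may help:

corr_with_target = dataset_con_enc.corr()['predclass'].drop('predclass')
print(corr_with_target.sort_values(key=abs, ascending=False).head(10))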
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest and rank features by impurity-based importance
rfc = RandomForestClassifier()
rfc.fit(dataset_con_enc.drop('predclass', axis=1), dataset_con_enc['predclass'])
importance = rfc.feature_importances_
importance = pd.DataFrame(importance, index=dataset_con_enc.drop('predclass', axis=1).columns, columns=['Importance'])
importance = importance.sort_values(by='Importance', ascending=False)
importance.plot(kind='barh', figsize=(20, len(importance)/1.5))
cumulative_importances = np.cumsum(importance['Importance'])
plt.figure(figsize = (20, 6))
plt.plot(list(range(len(importance.index))), cumulative_importances.values, 'b-')
plt.hlines(y=0.95, xmin=0, xmax=importance.shape[0], color='r', linestyles='dashed')
plt.xticks(list(range(len(importance.index))), importance.index, rotation=25)
plt.xlabel('Feature')
plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances')
print('Number of features for 95% importance:',np.where(cumulative_importances>0.95)[0][0]+1)
Number of features for 95% importance: 12
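A possible follow-up, not part of the original notebook: keep just the features that together account for 95% of the importance (dataset_top is a hypothetical name):

n_keep = np.where(cumulative_importances > 0.95)[0][0] + 1
top_features = list(importance.index[:n_keep])
dataset_top = dataset_con_enc[top_features + ['predclass']]
print(dataset_top.shape)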
Parameter involved: n_components, the number of principal components PCA keeps.
from sklearn import preprocessing
from sklearn.decomposition import PCA
# Standardize each encoded dataset, then fit PCA
X_bin = preprocessing.StandardScaler().fit_transform(dataset_bin_enc.drop('predclass', axis=1))
pca_bin = PCA(n_components = 80)
fit_bin = pca_bin.fit(X_bin)
X_con = preprocessing.StandardScaler().fit_transform(dataset_con_enc.drop('predclass', axis=1))
pca_con = PCA(n_components = 13)
fit_con = pca_con.fit(X_con)
plt.figure(figsize = (25, 7))
plt.subplot(1,2,1)
plt.bar(range(0,fit_bin.explained_variance_ratio_.size),fit_bin.explained_variance_ratio_)
plt.xlabel('Bin PCA Feature'); plt.ylabel('Variance'); plt.title('PCA for Discretised Dataset')
plt.subplot(1,2,2)
plt.bar(range(0,fit_con.explained_variance_ratio_.size),fit_con.explained_variance_ratio_)
plt.xlabel('Con PCA Feature'); plt.ylabel('Variance'); plt.title('PCA for Continuous Dataset')
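Rather than hand-picking 80 and 13 components, PCA also accepts a float in (0, 1) for n_components and keeps the smallest number of components reaching that fraction of explained variance; a sketch:

pca_95 = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance
X_con_95 = pca_95.fit_transform(X_con)
print(pca_95.n_components_)                    # number of components actually kept
print(pca_95.explained_variance_ratio_.sum())  # >= 0.95 by construction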
y = dataset_con_enc['predclass']
target_names = ['<$50K','>$50K']
colors = ['navy', 'darkorange']
linewidth = 2
alpha = 0.3
from mpl_toolkits.mplot3d import Axes3D
plt.figure(figsize = (20,8))
plt.subplot(1, 2, 1)
pca = PCA(n_components = 2)
X_2 = pca.fit_transform(X_con)
for color, i, target_name in zip(colors, [0,1], target_names):
    plt.scatter(X_2[y==i,0], X_2[y==i,1], color=color, alpha=alpha, lw=linewidth, label=target_name)
plt.legend(loc = 'best', shadow=False, scatterpoints=1)
plt.title('First two PCA directions')
ax = plt.subplot(1, 2, 2, projection='3d')
pca = PCA(n_components = 3)
X_3 = pca.fit_transform(X_con)
for color, i, target_name in zip(colors, [0,1], target_names):
    ax.scatter(X_3[y==i,0], X_3[y==i,1], X_3[y==i,2], color=color, alpha=alpha,
               linewidth=linewidth, label=target_name)
plt.legend(loc = 'best', shadow=False, scatterpoints=1)
ax.set_title('First three PCA directions')
ax.set_xlabel('1st eigenvector')
ax.set_ylabel('2nd eigenvector')
ax.set_zlabel('3rd eigenvector')
ax.view_init(30, 10)
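A quick sanity check on how much variance the plotted directions actually capture (pca here is the 3-component fit from just above):

print(pca.explained_variance_ratio_)        # variance share of each plotted direction
print(pca.explained_variance_ratio_.sum())  # total variance the 3D view captures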
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination with 5-fold cross-validation, logistic regression as the base estimator
selector = RFECV(LogisticRegression(), step=1, cv=5, n_jobs=-1)
selector = selector.fit(dataset_con_enc.drop('predclass', axis=1).values, dataset_con_enc['predclass'].values)
print('Feature Ranking For Non-Discretised: %s' % selector.ranking_)
print('Optimal number of features: %d' % selector.n_features_)
plt.figure(figsize = (20, 6))
# note: grid_scores_ was removed in scikit-learn 1.2; there, use selector.cv_results_['mean_test_score']
plt.plot(range(1, len(selector.grid_scores_)+1), selector.grid_scores_)
plt.xlabel('Number of features selected - Non-Discretised')
plt.ylabel('Cross validation score')
Feature Ranking For Non-Discretised: [1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1]
Optimal number of features: 15
# Number of features selected, matching the result printed above
print(selector.n_features_)
# Ranking of the feature at each position (1 = selected)
print(selector.ranking_)
# Boolean mask of the selection: True for kept features, False for eliminated ones
print(selector.support_)
# Ranking of the feature at index 1 (0-based, i.e. the second feature)
print(selector.ranking_[1])
# The estimator fitted on the selected features
print(selector.estimator_)
15
[1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1]
[ True True True True True True False True True True True True True True True True]
1
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False)
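ranking_ and support_ are positional, so it helps to map them back to column names. A sketch, to be run before the reselection below while dataset_con_enc still holds all 16 feature columns:

feature_names = dataset_con_enc.drop('predclass', axis=1).columns
print(feature_names[selector.support_])   # the 15 features RFECV kept
print(feature_names[~selector.support_])  # the single feature it eliminated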
# Keep the selected feature columns; np.insert prepends True so that 'predclass'
# (assumed to be the first column) is retained alongside them
dataset_con_enc = dataset_con_enc[dataset_con_enc.columns[np.insert(selector.support_, 0, True)]]
dataset_con_enc.head()
selected_dataset = dataset_con_enc
# Drop any columns that still contain NaN
selected_dataset = selected_dataset.dropna(axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    selected_dataset.drop('predclass', axis=1), selected_dataset['predclass'],
    test_size=0.3, random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
X_train.head()
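One caveat: the split above is not stratified, and the census income classes are imbalanced. A stratified variant (an assumption about intent, not in the original) would be:

X_train, X_test, y_train, y_test = train_test_split(
    selected_dataset.drop('predclass', axis=1), selected_dataset['predclass'],
    test_size=0.3, random_state=42, stratify=selected_dataset['predclass'])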