This article covers statistical methods for feature selection in machine learning. They are used routinely in industrial development and are an essential skill for any machine learning engineer.
The underlying theory is described in the following post:
机器学习之-统计方法做特征选择的原理_&永恒的星河&的博客-CSDN博客
1. Remove columns whose null-value ratio exceeds a given threshold. Implementation:
import pandas as pd

c1 = []
df = pd.read_csv("xxxx.csv", sep='\t', header=0)
v = 0.8
# Check each column's null ratio against the 80% threshold; keep the columns below it
for c in df.columns:
    c_rate = df[c].isna().sum() / len(df[c])
    if c_rate < v:
        c1.append(c)
df_c1 = df[c1]
print(df_c1.columns)
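The column-wise loop above can also be written as a vectorized pandas expression; the following is a minimal sketch, assuming the same df and threshold v as above:
null_ratio = df.isna().mean()        # fraction of NaN values per column
df_c1 = df.loc[:, null_ratio < v]    # keep the columns whose null ratio is below the threshold
print(df_c1.columns)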
2. Remove columns in which a single value accounts for more than 90% of rows
c2 = []
v = 0.9
df = pd.read_csv("xxxx.csv", sep='\t', header=0)
for c in df.columns:
    # Columns prefixed with "my_" are always kept
    if c.startswith("my_"):
        c2.append(c)
    else:
        # Ratio of the column's most frequent value
        c_rate = df[c].value_counts(ascending=False).values[0] / len(df[c])
        if c_rate < v:
            c2.append(c)
df = df[c2]
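The same rule can also be expressed without the explicit loop. A rough sketch, assuming df is the freshly loaded DataFrame and v and the "my_" prefix are as above (like the loop, it assumes no column is entirely NaN):
top_ratio = df.apply(lambda s: s.value_counts(normalize=True).iloc[0])  # ratio of each column's most frequent value
keep = [c for c in df.columns if c.startswith("my_") or top_ratio[c] < v]
df = df[keep]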
3. Handling categorical variables
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("xxxx.csv", sep='\t', header=0)
# Numeric columns
number_columns = []
# Categorical columns
class_columns = []
for c in df.columns:
    if df[c].dtype == "object":
        class_columns.append(c)
    else:
        number_columns.append(c)
# Replace NaN and sentinel values in the categorical columns with "", then encode them as integers
for c in class_columns:
    df[c] = df[c].fillna("")
    df.replace({c: {"-9999.0": "", "-9999": "", -9999.0: "", -9999: ""}}, inplace=True)
    ce = LabelEncoder()
    df.loc[:, c] = ce.fit_transform(df.loc[:, c])
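To make the encoding step concrete, here is a self-contained toy example (the column values are made up purely for illustration):
import pandas as pd
from sklearn.preprocessing import LabelEncoder

s = pd.Series(["red", "blue", None, "red", "-9999"])
s = s.fillna("").replace({"-9999": ""})   # same NaN / sentinel cleanup as above
print(LabelEncoder().fit_transform(s))    # [2 1 0 2 0]: "", "blue", "red" are mapped to 0, 1, 2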
4. Remove columns whose variance is below a threshold
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Construct sample data
np.random.seed(666)
a = [[1, 1, 2, 4], [0, 2, 4, 6], [np.nan, np.nan, 2, 1]]
a = np.array(a)
var_list = []
r, c = a.shape
# Compute each column's variance after dropping its NaN entries
for i in range(c):
    a_i = a[:, i]
    a_i = a_i[~np.isnan(a_i)]
    var_list.append(a_i.var())
print(var_list)
# Drop the columns whose variance is below 1.0
var = VarianceThreshold(threshold=1.0)
a_var = var.fit_transform(a)
# Print the reduced data
print(a_var)
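fit_transform returns a plain array without column indices, so it is often useful to ask which columns survived; a short sketch continuing from the fitted selector above:
kept_mask = var.get_support()       # boolean mask over the original columns
print(np.where(kept_mask)[0])       # indices of the columns whose variance passed the threshold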
5. Select features with the chi-square statistic (for classification problems; the features must be discrete)
# coding:utf-8
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pandas as pd
import numpy as np

# Load the iris data
iris = load_iris()
print(iris.feature_names)
# Get the features and labels
X = iris.data
y = iris.target
# Convert the continuous features to discrete (integer) values
X = X.astype(int)
# Score each feature against the label with the chi-square statistic and keep the top 2
chi2_selector = SelectKBest(chi2, k=2)
# Get the selected feature matrix
X_c = chi2_selector.fit_transform(X, y)
# Inspect each feature's chi-square score and p-value
chi2_scores = pd.DataFrame(list(zip(iris.feature_names, chi2_selector.scores_, chi2_selector.pvalues_)), columns=['ftr', 'score', 'pval'])
print(chi2_scores)
# Print the names of the selected features
kbest = np.asarray(iris.feature_names)[chi2_selector.get_support()]
print(kbest)
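The astype(int) cast above is only a crude way to discretize the iris measurements. A common alternative is equal-width binning; the sketch below continues from the code above and uses 3 bins per feature (an arbitrary choice for illustration):
# Bin each feature into 3 equal-width buckets coded 0, 1, 2 (chi2 requires non-negative values)
X_binned = np.column_stack([pd.cut(iris.data[:, i], bins=3, labels=False) for i in range(iris.data.shape[1])])
X_c2 = SelectKBest(chi2, k=2).fit_transform(X_binned, y)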