import pandas as pd
# Toy patient table: two 0/1 gene-segment columns plus two Y/N diagnosis columns.
patient_columns = {
    "gene_segA": [1, 0, 0, 1, 1, 1, 0, 0, 1, 0],
    "gene_segB": [1, 0, 1, 0, 1, 1, 0, 0, 1, 0],
    "hypertension": ["Y", "N", "N", "N", "N", "N", "Y", "N", "Y", "N"],
    "Gallstones": ["Y", "N", "N", "N", "Y", "Y", "Y", "N", "N", "Y"],
}
df = pd.DataFrame(patient_columns)
df

# Show the table with N -> 0 and Y -> 1. The result is not assigned,
# so df itself keeps its original Y/N strings.
yes_no_codes = {"N": 0, "Y": 1}
df.replace(yes_no_codes)
from sklearn.preprocessing import LabelEncoder
# Encode colour names as integers with LabelEncoder.
colour_samples = ['white', 'green', 'red', 'green', 'white']
colour_queries = ["green", 'green', 'green', 'white']

le = LabelEncoder()  # ① create the encoder
le.fit(colour_samples)  # ② learn the set of labels from the samples
le.classes_  # ③ inspect the labels the encoder learned
le.transform(colour_queries)  # ④ convert labels to their integer codes
补充:下面链接中介绍的工具包封装了多种编码方式,可作为参考:
https://mattzheng.blog.csdn.net/article/details/107851162
import numpy as np
# Flag each row: 1 when "Exposed days" is strictly above the column mean, else 0.
exposed_days = pm25["Exposed days"]
pm25['bdays'] = np.where(exposed_days > exposed_days.mean(), 1, 0)
pm25.sample(10)
Binarizer 根据阈值将数值型特征转换为二值(0/1)型,阈值可以自行设定。它只能处理数值型数据,并且传入的参数必须是二维数组,即 shape 为 (m, n) 而不是 (n,) 的数组,因此不能直接传入 Series。下面看一个例子:
# Demo frame for the Binarizer example: the integers 0..11 as 4 rows x 3 columns.
# Goes through the `pd` alias because the bare name `DataFrame` is not imported
# anywhere in the visible part of this file.
df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['A', 'B', 'C'])
df
第一列为索引值
A B C
0 0 1 2
1 3 4 5
2 6 7 8
3 9 10 11
将小于等于5的数值转为0,大于5的转为1
# Imported here because this example appears before the file's later
# `from sklearn.preprocessing import Binarizer` line.
from sklearn.preprocessing import Binarizer

# Binarize every column of df around a threshold of 5.
binarize = Binarizer(threshold=5)
binarize.fit_transform(df)
array([[0, 0, 0],
[0, 0, 0],
[1, 1, 1],
[1, 1, 1]])
也可以传入df[['A','B']]来对两列进行转换,注意,不可以是df['A']或者df.A,因为df.A是Series不是二维的
from sklearn.preprocessing import Binarizer
# Binarize "Exposed days" around its own mean using scikit-learn.
mean_exposure = pm25["Exposed days"].mean()
bn = Binarizer(threshold=mean_exposure)  # ① the threshold is the column mean

# Double brackets select a one-column DataFrame rather than a Series.
exposure_frame = pm25[["Exposed days"]]
result = bn.fit_transform(exposure_frame)  # ②

pm25['sk-bdays'] = result
pm25.sample(10)
补充知识点:
reshape 函数在不改变数组数据的情况下,改变数组的形状(维度)。
reshape(m, -1):变为 m 行,列数由元素总数自动推算
reshape(-1, m):变为 m 列,行数由元素总数自动推算
# Small garment table, one tuple per row.
garment_rows = [
    ('green', 'M', 29.9, 'class1'),
    ('red', 'L', 69.9, 'class2'),
    ('blue', 'XL', 99.9, 'class1'),
    ('red', 'L', 59.9, 'class1'),
]
df = pd.DataFrame(garment_rows, columns=["color", "size", "price", "classlabel"])
df

# Replace the size codes with integers that keep their order (M < L < XL).
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
size_codes = df['size'].map(size_mapping)  # ②
df['size'] = size_codes
df
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the colour column and append the indicator columns to df.
ohe = OneHotEncoder()
fs = ohe.fit_transform(df[['color']])

# Convert to a dense array, then drop the first indicator column; the two
# columns that remain are labelled as the green and red indicators.
dense_codes = fs.toarray()
fs_ohe = pd.DataFrame(dense_codes[:, 1:], columns=["color_green", 'color_red'])

df = pd.concat([df, fs_ohe], axis="columns")
df
%matplotlib inline
import seaborn as sns
# Scatter plot of the raw 'location' column against the raw 'time' column.
ax = sns.scatterplot(data=data, x='time', y='location')
import numpy as np
# Remove the row labelled 0 in place so that log10(0) is never computed
# (translated from the original note; presumably that row holds zeros — verify).
data.drop(index=[0], inplace=True)

# Add base-10 logarithms of both columns: ① time -> logtime, ② location -> logloc.
for log_column, raw_column in (('logtime', 'time'), ('logloc', 'location')):
    data[log_column] = np.log10(data[raw_column])
data.head()

# Same scatter plot as before, now on the log-log scale.
ax2 = sns.scatterplot(data=data, x='logtime', y='logloc')
from sklearn.linear_model import LinearRegression
# Fit a linear regression to the log-transformed columns.
# Both inputs are reshaped into single-column 2-D arrays before fitting.
log_time_column = data['logtime'].values.reshape(-1, 1)
log_loc_column = data['logloc'].values.reshape(-1, 1)

reg = LinearRegression()
reg.fit(log_time_column, log_loc_column)
reg.coef_, reg.intercept_
sklearn包的PolynomialFeatures
import numpy as np
# Three samples with two features each: [[0, 1], [2, 3], [4, 5]].
X = np.arange(6).reshape((3, 2))
X

from sklearn.preprocessing import PolynomialFeatures  # ③

# Expand the two features into polynomial terms of degree 2.
poly = PolynomialFeatures(degree=2)  # ④
poly.fit_transform(X)
综合案例
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
# Load the sample points and draw them as the training scatter.
df = pd.read_csv("/home/aistudio/data/data20514/xsin.csv")
colors = ['teal', 'yellowgreen', 'gold']
plt.scatter(df['x'], df['y'], label="training points",
            color='navy', marker='o', s=30)

# Fit one polynomial + ridge pipeline per degree and overlay its predictions,
# pairing each degree with its own line colour.
for curve_color, degree in zip(colors, [3, 4, 5]):
    model = make_pipeline(PolynomialFeatures(degree), Ridge())  # ③
    model.fit(df[['x']], df[['y']])
    y_pre = model.predict(df[['x']])
    plt.plot(df['x'], y_pre, linewidth=2, color=curve_color,
             label="degree %d" % degree)

plt.legend()
利用pandas的cut()函数将属性分组
# Equal-width binning: split the observed range of 'years' into three intervals.
age_table = {
    'years': [10, 14, 30, 53, 300, 32, 45],
    'name': ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
}
ages2 = pd.DataFrame(age_table)

# The single value 300 stretches the range so far that every other row
# falls into the first interval.
stage_names = ['Young', 'Middle', 'Senior']
klass2 = pd.cut(ages2['years'], bins=3, labels=stage_names)  # ②
ages2['label'] = klass2
ages2
# Explicit-edge binning: the intervals are (9, 30], (30, 50] and (50, 300].
age_table = {
    'years': [10, 14, 30, 53, 300, 32, 45],
    'name': ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
}
ages2 = pd.DataFrame(age_table)

bin_edges = [9, 30, 50, 300]
stage_names = ['Young', 'Middle', 'Senior']
klass2 = pd.cut(ages2['years'], bins=bin_edges, labels=stage_names)  # ③
ages2['label'] = klass2
ages2
调用sklearn的KBinsDiscretizer实现
from sklearn.preprocessing import KBinsDiscretizer
# Discretize the 'years' column into three uniform bins with scikit-learn.
# NOTE(review): `ages` is not defined in the visible part of this file
# (only `ages2` is) — confirm it is created earlier.
kbd = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3)  # ④

years_frame = ages[['years']]  # double brackets give a one-column DataFrame
trans = kbd.fit_transform(years_frame)  # ⑤

ages['kbd'] = trans[:, 0]  # ⑥ first column of the result as a 1-D array
ages
KBinsDiscretizer介绍
https://scikit-learn.org.cn/view/722.html
Sklearn官方例子
该示例比较了带有或不带有离散化实值特征的线性回归(线性模型)和决策树(基于树的模型)的预测结果。
如离散化之前的结果所示,线性模型的建立速度很快,解释起来也相对简单,但是只能建模线性关系,而决策树则可以构建更为复杂的数据模型。让线性模型在连续数据上更强大的一种方法是使用离散化(也称为分箱)。在示例中,我们对特征进行了离散化,并对转换后的数据进行了独热编码(one-hot encoding)。请注意,如果分箱的宽度不合理(例如过窄),过拟合的风险似乎会大大增加,因此通常应在交叉验证下调整离散化器的参数。
离散化之后,线性回归和决策树做出完全相同的预测。由于每个分箱内的特征值都是恒定的,因此任何模型都必须为同一分箱中的所有点预测相同的值。与离散化之前的结果相比,线性模型变得更加灵活,而决策树的灵活性则大大降低。请注意,对特征进行分箱通常不会给基于树的模型带来任何好处,因为这些模型本来就可以学习在任意位置对数据进行切分。
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor
print(__doc__)

# Build the data set: 100 uniform samples on [-3, 3) with a noisy sine target.
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
noise = rnd.normal(size=len(X)) / 3
y = np.sin(X) + noise
X = X.reshape(-1, 1)

# Transform the data set with KBinsDiscretizer (10 bins, one-hot encoded).
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)

# Predict with the original data set (left panel).
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)

raw_models = [
    (LinearRegression(), 'green', "linear regression"),
    (DecisionTreeRegressor(min_samples_split=3, random_state=0), 'red',
     "decision tree"),
]
for estimator, colour, caption in raw_models:
    reg = estimator.fit(X, y)
    ax1.plot(line, reg.predict(line), linewidth=2, color=colour, label=caption)

ax1.plot(X[:, 0], y, 'o', c='k')
ax1.legend(loc="best")
ax1.set_ylabel("Regression output")
ax1.set_xlabel("Input feature")
ax1.set_title("Result before discretization")

# Predict with the transformed data (right panel).
line_binned = enc.transform(line)

binned_models = [
    (LinearRegression(), 'green', '-', 'linear regression'),
    (DecisionTreeRegressor(min_samples_split=3, random_state=0), 'red', ':',
     'decision tree'),
]
for estimator, colour, dash, caption in binned_models:
    reg = estimator.fit(X_binned, y)
    ax2.plot(line, reg.predict(line_binned), linewidth=2, color=colour,
             linestyle=dash, label=caption)

ax2.plot(X[:, 0], y, 'o', c='k')

# Mark the bin edges with faint vertical lines spanning the current y-limits.
y_low, y_high = plt.gca().get_ylim()
ax2.vlines(enc.bin_edges_[0], y_low, y_high, linewidth=1, alpha=.2)
ax2.legend(loc="best")
ax2.set_xlabel("Input feature")
ax2.set_title("Result after discretization")

plt.tight_layout()
plt.show()
![[KBinsDiscretizer.png]]
调用entropy_based_binning包
import entropy_based_binning as ebb
# Two integer rows, each binned into two bins along axis 1.
A = np.array([
    [1, 1, 2, 3, 3],
    [1, 1, 0, 1, 0],
])
ebb.bin_array(A, nbins=2, axis=1)
用法介绍
Docstring:
Find and apply the maximum entropy binning to an integer array,
given the number of target bins.
Convenience wrapper around bin_sequence().
Arguments:
----------
A: (N, M) ndarray
input array; must be integer
nbins: int
number of bins
axis: None or int (default None)
axis along which to bin;
if None, the optimal binning is chosen based on all values in the array;
Returns:
--------
B: (N, M) ndarray
binned array
MDLP介绍
https://github.com/hlin117/mdlp-discretization
这是 Usama Fayyad 的基于熵的专家分箱方法的实现
示例
from mdlp.discretization import MDLP
from sklearn.datasets import load_iris
# Load iris and split it into the feature matrix and the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# MDLP is fitted with the labels as well as the features.
transformer = MDLP()
X_disc = transformer.fit_transform(X, y)
X_disc
# Scale the iris features with three different scikit-learn scalers.
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
iris = datasets.load_iris()
iris_std = StandardScaler().fit_transform(iris.data)  # ① standardization
from sklearn.preprocessing import MinMaxScaler
iris_mm = MinMaxScaler().fit_transform(iris.data)
from sklearn.preprocessing import RobustScaler
# Stored under its own name: the original assigned this to iris_mm as well,
# which silently discarded the Min-Max result computed just above.
iris_robust = RobustScaler().fit_transform(iris.data)
示例1
'''
np.random.normal()
第一个参数是均值
第二个参数是标准差
第三个参数是个数
np.concatenate()
将数组拼接起来
'''
# Build the data: each column is 1000 draws around one mean followed by
# 25 draws around a distant mean.
import pandas as pd

x1_main = np.random.normal(20, 1, 1000)
x1_far = np.random.normal(1, 1, 25)
x2_main = np.random.normal(30, 1, 1000)
x2_far = np.random.normal(50, 1, 25)

X = pd.DataFrame({
    'x1': np.concatenate([x1_main, x1_far]),
    'x2': np.concatenate([x2_main, x2_far]),
})
X.sample(10)
# Create the RobustScaler and MinMaxScaler models and scale X with each,
# wrapping both results back into DataFrames with the original column names.
from sklearn.preprocessing import RobustScaler, MinMaxScaler

feature_names = ['x1', 'x2']

robust = RobustScaler()
robust_scaled = pd.DataFrame(robust.fit_transform(X), columns=feature_names)

minmax = MinMaxScaler()
minmax_scaled = pd.DataFrame(minmax.fit_transform(X), columns=feature_names)
#绘图
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Three side-by-side panels: density of x1 and x2 before scaling,
# after robust scaling, and after min-max scaling.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(9, 5))

panels = [
    (ax1, 'Before Scaling', X),
    (ax2, 'After Robust Scaling', robust_scaled),
    (ax3, 'After Min-Max Scaling', minmax_scaled),
]
for axis, heading, frame in panels:
    axis.set_title(heading)
    for column in ('x1', 'x2'):
        sns.kdeplot(frame[column], ax=axis)
示例2
from sklearn.preprocessing import Normalizer
from mpl_toolkits.mplot3d import Axes3D
# 1000 random integer-valued points stored as floats; each axis has its own
# half-open range [low, high).
axis_ranges = {'x1': (-100, 100), 'y1': (-80, 80), 'z1': (-150, 150)}
df = pd.DataFrame({
    axis_name: np.random.randint(low, high, 1000).astype(float)
    for axis_name, (low, high) in axis_ranges.items()
})
# Show the effect of Normalizer() and then of MinMaxScaler() on the same points.
# Each pass opens a new figure: original points on the left, scaled on the right.
for scaler in (Normalizer(), MinMaxScaler()):
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    fig = plt.figure(figsize=(9, 5))
    ax1 = fig.add_subplot(121, projection='3d')
    ax2 = fig.add_subplot(122, projection='3d')

    ax1.scatter(df['x1'], df['y1'], df['z1'])
    ax2.scatter(scaled_df['x1'], scaled_df['y1'], scaled_df['z1'])