一、Getting data via datasets
- datasets
▪ Small datasets — bundled with the sklearn package — load with datasets.load_*
▪ Large datasets — must be fetched from the network with datasets.fetch_*
from sklearn import datasets
boston = datasets.load_boston()  # NOTE: removed in scikit-learn 1.2
print(boston.DESCR)  # short description of the dataset
housing = datasets.fetch_california_housing()  # downloaded on first call
print(housing.DESCR)
- Datasets come back as Bunch objects; the features and labels are plain NumPy arrays
X, y = boston.data, boston.target
- scikit-learn can also generate toy data with the datasets.make_* helpers, as sketched below
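A minimal sketch of two of the make_* generators (the functions are real sklearn API; the parameter values here are arbitrary):
from sklearn import datasets
# a regression problem: 1000 samples, 10 features, noisy linear target
reg_X, reg_y = datasets.make_regression(n_samples=1000, n_features=10, noise=1.0)
# a 3-class classification problem with 4 informative features
clf_X, clf_y = datasets.make_classification(n_samples=1000, n_features=4, n_informative=4, n_redundant=0, n_classes=3)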
二、Data transformations — scaling, binarization, etc.
1. Scaling data to the standard normal
# Scale data to the standard normal (zero mean, unit variance)
from sklearn import preprocessing
X = boston.data  # the Boston features loaded in section 一
# the scale function: one-off scaling
X_2 = preprocessing.scale(X[:,:3])
X_2.mean(axis=0)
# array([ 6.34099712e-17, -6.34319123e-16, -2.68291099e-15])  (effectively zero)
X_2.std(axis=0)
# array([ 1., 1., 1.])
# the StandardScaler class: remembers the fitted mean/std for reuse
my_scaler = preprocessing.StandardScaler()
my_scaler.fit(X[:,:3])
my_scaler.transform(X[:,:3]).mean(axis=0)
# MinMaxScaler: rescale each feature to [0, 1]
my_minmax_scaler = preprocessing.MinMaxScaler()
my_minmax_scaler.fit(X[:,:3])
my_minmax_scaler.transform(X[:,:3]).max(axis=0)
# MinMaxScaler with a custom output range
my_odd_scaler = preprocessing.MinMaxScaler(feature_range=(-3.14, 3.14))
my_odd_scaler.fit(X[:,:3])
my_odd_scaler.transform(X[:,:3]).max(axis=0)
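The point of the Scaler classes over the plain functions is that a fitted scaler can be reapplied to new data; a minimal sketch, assuming an illustrative train/test split:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(X[:,:3], test_size=0.3)
train_scaler = preprocessing.StandardScaler().fit(X_train)  # statistics learned from the training rows only
train_scaler.transform(X_test)  # the same shift/scale applied to unseen rows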
2. Normalization
import numpy as np
normalized_X = preprocessing.normalize(X[:,:3])  # scales each sample (row) to unit L2 norm
# a deliberately useless StandardScaler: with both options off it is the identity transform
my_useless_scaler = preprocessing.StandardScaler(with_mean=False, with_std=False)
transformed_sd = my_useless_scaler.fit_transform(X[:,:3]).std(axis=0)
original_sd = X[:,:3].std(axis=0)
np.array_equal(transformed_sd, original_sd)  # True
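A quick check (my addition) that normalize worked per row rather than per column: every sample should now have unit L2 norm.
np.linalg.norm(normalized_X, axis=1)[:5]
# array([ 1., 1., 1., 1., 1.])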
3. Creating binary features through thresholding
# binarize expects a 2D array in current scikit-learn, hence the reshape
new_target = preprocessing.binarize(boston.target.reshape(-1, 1),
                                    threshold=boston.target.mean())
new_target[:5]
# check against a manual threshold
(boston.target[:5] > boston.target.mean()).astype(int)
# the Binarizer class keeps the threshold for reuse
binarizer = preprocessing.Binarizer(threshold=boston.target.mean())
new_target = binarizer.fit_transform(boston.target.reshape(-1, 1))
new_target[:5]
三、Handling categorical variables
1. OneHotEncoder
iris = datasets.load_iris()
X = iris.data
y = iris.target
# stack X and y into a single array, target in the last column
d=np.column_stack((X,y))
text_encoder = preprocessing.OneHotEncoder()  # one output column per category
text_encoder.fit_transform(d[:,-1:]).toarray()[:5]
text_encoder.transform(np.ones((3,1))).toarray()  # three samples of category 1
#array([[ 0., 1., 0.],
# [ 0., 1., 0.],
# [ 0., 1., 0.]])
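By default the encoder raises an error when it meets a category it did not see during fit; handle_unknown='ignore' (a real OneHotEncoder parameter) encodes unknowns as an all-zero row instead — a sketch:
safe_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
safe_encoder.fit(d[:,-1:])
safe_encoder.transform(np.array([[99.]])).toarray()  # 99 was never seen during fit
# array([[ 0., 0., 0.]])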
2. DictVectorizer
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
my_dict = [{'species': iris.target_names[i]} for i in y]  # one dict per sample, class index mapped to its name
dv.fit_transform(my_dict).toarray()[:5]
# array([[ 1., 0., 0.],
# [ 1., 0., 0.],
# [ 1., 0., 0.],
# [ 1., 0., 0.],
# [ 1., 0., 0.]])
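The fitted vectorizer keeps the column-to-feature mapping in its feature_names_ attribute:
dv.feature_names_
# ['species=setosa', 'species=versicolor', 'species=virginica']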
3. Binarizing Label Features
target = iris.target
from sklearn.preprocessing import LabelBinarizer
label_binarizer = LabelBinarizer()
new_target = label_binarizer.fit_transform(target)
new_target.shape
new_target[:5]
# array([[1, 0, 0],
# [1, 0, 0],
# [1, 0, 0],
# [1, 0, 0],
# [1, 0, 0]])
label_binarizer.classes_
#array([0, 1, 2])
label_binarizer = LabelBinarizer(neg_label=-1000, pos_label=1000)  # custom off/on values
label_binarizer.fit_transform(target)[:5]
#array([[ 1000, -1000, -1000],
# [ 1000, -1000, -1000],
# [ 1000, -1000, -1000],
# [ 1000, -1000, -1000],
# [ 1000, -1000, -1000]])
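inverse_transform maps the binarized rows back to the original class labels:
label_binarizer.inverse_transform(label_binarizer.fit_transform(target))[:5]
# array([0, 0, 0, 0, 0])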
四、Handling missing values
from sklearn import datasets
import numpy as np
iris=datasets.load_iris()
iris_X = iris.data
masking_array = np.random.binomial(1, .25, iris_X.shape).astype(bool)  # mask ~25% of entries at random
iris_X[masking_array] = np.nan
from sklearn import preprocessing
impute = preprocessing.Imputer()  # fills NaN with the column mean by default
iris_X_prime = impute.fit_transform(iris_X)
iris_X_prime[:5]
iris_X_prime[3,0]
iris_X[3,0]
impute2 = preprocessing.Imputer(strategy='median')
iris_X_prime2 = impute2.fit_transform(iris_X)
iris_X_prime2[:5]
iris_X[np.isnan(iris_X)] = -1  # recode the missing entries as -1
iris_X[:5]
impute3 = preprocessing.Imputer(missing_values=-1)  # treat -1 as the missing marker
iris_X_prime = impute3.fit_transform(iris_X)
iris_X_prime[:5]
import pandas as pd
iris_X[masking_array] = np.nan  # restore the NaNs for the pandas example
iris_df = pd.DataFrame(iris_X,columns=iris.feature_names)
iris_df.fillna(iris_df.mean())['sepal length (cm)'].head(5)
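Note that preprocessing.Imputer was removed in scikit-learn 0.22; the equivalent in current versions lives in sklearn.impute:
from sklearn.impute import SimpleImputer
impute = SimpleImputer(strategy='mean')  # 'mean' is the default
iris_X_prime = impute.fit_transform(iris_X)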
五、Using pipelines for multiple preprocessing steps
1. Pipeline
from sklearn import datasets
import numpy as np
mat = datasets.make_spd_matrix(10)  # a 10x10 symmetric positive-definite matrix
masking_array = np.random.binomial(1, .1, mat.shape).astype(bool)
mat[masking_array] = np.nan  # knock out ~10% of the entries
mat[:4,:4]
from sklearn import pipeline
impute = preprocessing.Imputer()
scaler = preprocessing.StandardScaler()
pipe = pipeline.Pipeline([('impute', impute), ('scaler', scaler)])  # steps run in the listed order
new_mat = pipe.fit_transform(mat)
new_mat[:4,:4]
2. Pipeline methods
• fit
• transform
• fit_transform
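The individual steps of a fitted pipeline stay accessible through its named_steps attribute, e.g. to inspect the statistics the imputer learned:
pipe.named_steps['impute'].statistics_[:4]  # per-column means used to fill the NaNs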
六、Reducing dimensionality with PCA
1. PCA
from sklearn import decomposition
iris_X = datasets.load_iris().data  # reload the clean iris features
# reduce to 2 dimensions
pca2 = decomposition.PCA(n_components=2)
iris_X_prime = pca2.fit_transform(iris_X)
iris_X_prime.shape
# (150, 2)
pca2.explained_variance_ratio_.sum()
#0.97763177502480336
# a float n_components keeps enough components to explain that fraction of the variance, e.g. 98%
pca3 = decomposition.PCA(n_components=.98)
iris_X_prime2 = pca3.fit_transform(iris_X)
pca3.explained_variance_ratio_.sum()
#0.99481691454981014
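PCA is sensitive to feature scale, so it is common to standardize first; a minimal sketch chaining both steps with the Pipeline from section 五:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
pca_pipe = Pipeline([('scale', StandardScaler()), ('pca', PCA(n_components=2))])
iris_scaled_2d = pca_pipe.fit_transform(iris_X)
iris_scaled_2d.shape
# (150, 2)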
2. Using Factor Analysis for decomposition
from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components=2)
iris_two_dim = fa.fit_transform(iris.data)
iris_two_dim[:5]
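Unlike PCA, FactorAnalysis models independent per-feature noise, so it exposes a noise_variance_ attribute rather than explained_variance_ratio_; the output shape matches the PCA result:
iris_two_dim.shape
# (150, 2)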