Scikit-learn Cookbook (一) --- Premodel Workflow

一、Getting data with datasets

  1. datasets
    ▪ Small datasets: ship inside the sklearn package; load with datasets.load_*
    ▪ Large datasets: must be fetched from the network with datasets.fetch_*
    from sklearn import datasets
    boston = datasets.load_boston()
    print(boston.DESCR) # brief description of the dataset
    housing = datasets.fetch_california_housing() # fetch a dataset
    print(housing.DESCR)
  2. Datasets are returned as Bunch objects
      X, y = DataSet.data, DataSet.target
  3. Using scikit-learn to create toy data, as sketched below
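
A minimal sketch of the datasets.make_* helpers (the parameter values here are only illustrative):

    from sklearn import datasets
    # synthetic regression problem: 100 samples, 3 features
    reg_X, reg_y = datasets.make_regression(n_samples=100, n_features=3, noise=1.0)
    # synthetic classification problem: 100 samples, 4 features, 2 classes
    clf_X, clf_y = datasets.make_classification(n_samples=100, n_features=4, n_classes=2)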

二、Data transformations: scaling, binarization, and more

1. Scaling data to the standard normal

# Scale data to the standard normal (zero mean, unit variance)
from sklearn import preprocessing
X = boston.data
# scale function
X_2 = preprocessing.scale(X[:,:3])
X_2.mean(axis=0)
#array([  6.34099712e-17,  -6.34319123e-16,  -2.68291099e-15])
X_2.std(axis=0)
# array([ 1.,  1.,  1.])
# StandardScaler class: remembers the fitted mean and std for reuse
my_scaler = preprocessing.StandardScaler()
my_scaler.fit(X[:,:3])
my_scaler.transform(X[:,:3]).mean(axis=0)
# MinMaxScaler: scale each feature to [0, 1]
my_minmax_scaler = preprocessing.MinMaxScaler()
my_minmax_scaler.fit(X[:,:3])
my_minmax_scaler.transform(X[:,:3]).max(axis=0)
# MinMaxScaler with a custom feature_range
my_odd_scaler = preprocessing.MinMaxScaler(feature_range=(-3.14, 3.14))
my_odd_scaler.fit(X[:,:3])
my_odd_scaler.transform(X[:,:3]).max(axis=0)
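
The class-based scalers matter when the same transformation must be reapplied to unseen data; a sketch with a hypothetical train/test split:

# fit on training rows, then reuse the learned statistics on test rows
train, test = X[:400, :3], X[400:, :3] # hypothetical split
scaler = preprocessing.StandardScaler().fit(train)
test_scaled = scaler.transform(test) # uses the training mean/std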

2. Normalization

import numpy as np
# normalize rescales each sample (row) to unit norm (l2 by default)
normalized_X = preprocessing.normalize(X[:,:3])
# a StandardScaler with both options disabled is an identity transform
my_useless_scaler = preprocessing.StandardScaler(with_mean=False, with_std=False)
transformed_sd = my_useless_scaler.fit_transform(X[:,:3]).std(axis=0)
original_sd = X[:,:3].std(axis=0)
np.array_equal(transformed_sd, original_sd)
# True
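
Each row of normalized_X now has unit l2 norm, which we can verify:

np.linalg.norm(normalized_X, axis=1)[:5]
# array([ 1.,  1.,  1.,  1.,  1.])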

3. Creating binary features through thresholding

# binarize expects a 2D array, so reshape the 1D target first
new_target = preprocessing.binarize(boston.target.reshape(-1, 1),
             threshold=boston.target.mean())
new_target[:5]
# check against a manual comparison
(boston.target[:5] > boston.target.mean()).astype(int)
# Using the Binarizer class
binarizer = preprocessing.Binarizer(threshold=boston.target.mean())
new_target = binarizer.fit_transform(boston.target.reshape(-1, 1))
new_target[:5]

三、Working with categorical variables

1. OneHotEncoder

iris = datasets.load_iris()
X = iris.data
y = iris.target
# stack X and y into a single array
d = np.column_stack((X, y))
text_encoder = preprocessing.OneHotEncoder()
# one-hot encode the target column (the last column of d)
text_encoder.fit_transform(d[:,-1:]).toarray()[:5]
# a column of 1s maps every row to class 1 (the second category)
text_encoder.transform(np.ones((3,1))).toarray()
#array([[ 0.,  1.,  0.],
#       [ 0.,  1.,  0.],
#       [ 0.,  1.,  0.]])
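
By default the encoder raises an error on categories not seen during fit; passing handle_unknown='ignore' encodes them as all zeros instead (a quick sketch):

robust_encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
robust_encoder.fit(d[:,-1:])
robust_encoder.transform([[7.0]]).toarray()
# array([[ 0.,  0.,  0.]])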

2. DictVectorizer

from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
# map each integer label to its species name, then one-hot encode the dicts
my_dict = [{'species': iris.target_names[i]} for i in y]
dv.fit_transform(my_dict).toarray()[:5]
# array([[ 1.,  0.,  0.],
#        [ 1.,  0.,  0.],
#        [ 1.,  0.,  0.],
#        [ 1.,  0.,  0.],
#        [ 1.,  0.,  0.]])
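
The fitted vectorizer records the derived column names:

dv.feature_names_
# ['species=setosa', 'species=versicolor', 'species=virginica']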

3. Binarizing Label Features

target = iris.target
from sklearn.preprocessing import LabelBinarizer
label_binarizer = LabelBinarizer()

new_target = label_binarizer.fit_transform(target)
new_target.shape
new_target[:5]
# array([[1, 0, 0],
#        [1, 0, 0],
#        [1, 0, 0],
#        [1, 0, 0],
#        [1, 0, 0]])
label_binarizer.classes_
#array([0, 1, 2])

label_binarizer = LabelBinarizer(neg_label=-1000, pos_label=1000)
label_binarizer.fit_transform(target)[:5]
#array([[ 1000, -1000, -1000],
#       [ 1000, -1000, -1000],
#       [ 1000, -1000, -1000],
#       [ 1000, -1000, -1000],
#       [ 1000, -1000, -1000]])
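
LabelBinarizer can also invert the encoding back to the original labels:

label_binarizer.inverse_transform(label_binarizer.fit_transform(target))[:5]
# array([0, 0, 0, 0, 0])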

四、Handling missing values

from sklearn import datasets
import numpy as np
iris=datasets.load_iris()
iris_X = iris.data
masking_array = np.random.binomial(1, .25, iris_X.shape).astype(bool)
iris_X[masking_array] = np.nan # inject missing values

from sklearn import preprocessing
impute = preprocessing.Imputer() # the strategy argument controls how missing values are filled (default: 'mean')
iris_X_prime = impute.fit_transform(iris_X)
iris_X_prime[:5]
iris_X_prime[3,0]
iris_X[3,0]

impute2 = preprocessing.Imputer(strategy='median')
iris_X_prime2 = impute2.fit_transform(iris_X)
iris_X_prime2[:5]

# use -1 as the missing-value marker instead of NaN
iris_X[np.isnan(iris_X)] = -1
iris_X[:5]

impute3 = preprocessing.Imputer(missing_values=-1)
iris_X_prime = impute3.fit_transform(iris_X)
iris_X_prime[:5]

# fill missing values with pandas via DataFrame.fillna()
import pandas as pd
iris_X[masking_array] = np.nan # restore the NaN markers
iris_df = pd.DataFrame(iris_X, columns=iris.feature_names)
iris_df.fillna(iris_df.mean())['sepal length (cm)'].head(5)
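
Note that in scikit-learn 0.20+, preprocessing.Imputer was deprecated (and later removed) in favor of sklearn.impute.SimpleImputer; a minimal equivalent sketch:

from sklearn.impute import SimpleImputer
impute_modern = SimpleImputer(strategy='mean') # same default behaviour as Imputer()
iris_X_modern = impute_modern.fit_transform(iris_X)
iris_X_modern[:5]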

五、Using pipelines for multiple preprocessing steps

1. Pipeline

    from sklearn import datasets
    import numpy as np
    mat = datasets.make_spd_matrix(10)
    masking_array = np.random.binomial(1, .1, mat.shape).astype(bool)
    mat[masking_array] = np.nan
    mat[:4,:4]

    from sklearn import pipeline
    impute = preprocessing.Imputer()
    scaler = preprocessing.StandardScaler()
    pipe = pipeline.Pipeline([('impute',impute),('scaler',scaler)])
    new_mat = pipe.fit_transform(mat)
    new_mat[:4,:4]

2. Pipeline methods

• fit
• transform
• fit_transform (equivalent to fitting and transforming each step in order, as sketched below)
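
A minimal sketch of that equivalence, reusing the impute and scaler objects from above:

    # chaining the steps manually gives the same result as the pipeline
    imputed = impute.fit_transform(mat)
    manual_mat = scaler.fit_transform(imputed)
    np.allclose(manual_mat, new_mat)
    # True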

六、Reducing dimensionality with PCA

1. PCA

from sklearn import decomposition
iris_X = datasets.load_iris().data # reload a clean copy (iris_X was modified above)
# reduce to 2 dimensions
pca2 = decomposition.PCA(n_components=2)
iris_X_prime = pca2.fit_transform(iris_X)
iris_X_prime.shape
# (150, 2)
pca2.explained_variance_ratio_.sum()
#0.97763177502480336

# n_components as a fraction: keep enough components to explain e.g. 98% of the variance
pca3 = decomposition.PCA(n_components=.98)
iris_X_prime2 = pca3.fit_transform(iris_X)
pca3.explained_variance_ratio_.sum()
#0.99481691454981014
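
The fractional setting keeps the smallest number of components whose cumulative ratio meets the threshold; for iris that is 3, consistent with the 0.9948 output above:

pca3.n_components_
# 3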

2. Using Factor Analysis for decomposition

from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components=2)
iris_two_dim = fa.fit_transform(iris.data)
iris_two_dim[:5]
