# Runtime environment: Windows 10 64-bit, Python 3.6, PyCharm 2018.1.1
from sklearn.preprocessing import Binarizer
# Binarizer demo: print a small integer matrix before and after
# binarization with a threshold of 2.5.
X = [
    [1, 2, 3, 4, 5],
    [5, 4, 3, 2, 1],
    [3, 3, 3, 3, 3],
    [1, 1, 1, 1, 1],
]
print("before transform:", X)

# transform() is invoked directly here; no fit() call precedes it.
binarizer = Binarizer(threshold=2.5)
binarized = binarizer.transform(X)
print('after transform:', binarized)
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder demo: fit on a small integer matrix, print the fitted
# attributes, then encode a single sample row.
X = [
    [1, 2, 3, 4, 5],
    [5, 4, 3, 2, 1],
    [3, 3, 3, 3, 3],
    [1, 1, 1, 1, 1],
]
print("before transform:", X)

encoder = OneHotEncoder(sparse=False)
encoder.fit(X)

# NOTE(review): `sparse=` and the three attributes printed below appear to
# belong to the older OneHotEncoder API -- confirm they exist in the
# installed scikit-learn version.
for label, attribute in (
    ('active_features_:', 'active_features_'),
    ('featuer_indices_:', 'feature_indices_'),
    ('n_values_:', 'n_values_'),
):
    print(label, getattr(encoder, attribute))

sample = [[1, 2, 3, 4, 5]]
print('after transform:', encoder.transform(sample))
from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler demo: fit with feature_range=(0, 2), print the fitted
# attributes, then print the transformed matrix.
X = [
    [1, 2, 3, 4, 5],
    [5, 4, 3, 2, 1],
    [3, 3, 3, 3, 3],
    [1, 1, 1, 1, 1],
]
print("before transform:", X)

scalar = MinMaxScaler(feature_range=(0, 2))
scalar.fit(X)

# Each entry pairs the printed label with the fitted attribute it reports.
for label, attribute in (
    ('min_is:', 'min_'),
    ('scale_is:', 'scale_'),
    ('data_max_is:', 'data_max_'),
    ('data_min_is:', 'data_min_'),
    ('data_range_is:', 'data_range_'),
):
    print(label, getattr(scalar, attribute))

print('after transform:', scalar.transform(X))
from sklearn.preprocessing import MaxAbsScaler
# MaxAbsScaler demo: fit on a small matrix, print the fitted attributes and
# the transformed result.
X = [
    [1, 5, 1, 2, 10],
    [2, 6, 3, 2, 7],
    [3, 7, 5, 6, 4],
    [4, 8, 7, 8, 1],
]
print("before transform:", X)

scaler = MaxAbsScaler()
scaler.fit(X)
print("scale_is:", scaler.scale_)
print("max_abs_is:", scaler.max_abs_)

scaled = scaler.transform(X)
print('after transform:', scaled)
from sklearn.preprocessing import StandardScaler
# StandardScaler demo: fit on a small matrix, print scale/mean/variance and
# the transformed result.
X = [
    [1, 5, 1, 2, 10],
    [2, 6, 3, 2, 7],
    [3, 7, 5, 6, 4],
    [4, 8, 7, 8, 1],
]
print("before transform:", X)

scaler = StandardScaler()
scaler.fit(X)

# Each entry pairs the printed label with the fitted attribute it reports.
for label, attribute in (
    ('scale_is:', 'scale_'),
    ('mean_is:', 'mean_'),
    ('var_is:', 'var_'),
):
    print(label, getattr(scaler, attribute))

print('after transform:', scaler.transform(X))
from sklearn.preprocessing import Normalizer
# Normalizer demo: print a small matrix before and after normalization with
# norm='l2'.
X = [
    [1, 2, 3, 4, 5],
    [5, 4, 3, 2, 1],
    [1, 3, 5, 2, 4],
    [2, 4, 1, 3, 5],
]
print("before transform:", X)

# transform() is invoked directly here; no fit() call precedes it.
normalizer = Normalizer(norm='l2')
normalized = normalizer.transform(X)
print('after transform:', normalized)
from sklearn.feature_selection import VarianceThreshold
# VarianceThreshold demo: fit a selector with threshold 1, then print the
# per-feature variances, the reduced matrix, the kept column indices and the
# result of mapping the reduced matrix back with inverse_transform().
X = [
    [100, 1, 2, 3],
    [100, 4, 5, 6],
    [100, 7, 8, 9],
    [101, 11, 12, 13],
]

selector = VarianceThreshold(threshold=1)
selector.fit(X)
print("variances is %s:" % (selector.variances_,))

reduced = selector.transform(X)
print('After transform is %s:' % (reduced,))

kept_indices = selector.get_support(indices=True)
print('The surport is %s' % (kept_indices,))

restored = selector.inverse_transform(selector.transform(X))
print('After reverse transform is %s:' % (restored,))
from sklearn.feature_selection import SelectKBest,f_classif
# SelectKBest demo: score features with f_classif, keep k=3 of them and
# print the scores, p-values, selected indices and reduced matrix.
X = [
    [1, 2, 3, 4, 5],
    [5, 4, 3, 2, 1],
    [3, 3, 3, 3, 3],
    [1, 1, 1, 1, 1],
]
y = [0, 1, 0, 1]
print("before transform:", X)

selector = SelectKBest(score_func=f_classif, k=3)
selector.fit(X, y)
print('scores_:', selector.scores_)
print('pvalues_:', selector.pvalues_)

selected = selector.get_support(indices=True)
print('selected index:', selected)
print('after transform:', selector.transform(X))
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
# RFE demo: recursive feature elimination on the iris data with a LinearSVC
# estimator, keeping two features.
iris = load_iris()
X, y = iris.data, iris.target

estimator = LinearSVC()
selector = RFE(estimator=estimator, n_features_to_select=2)
selector.fit(X, y)

print('N_features %s' % (selector.n_features_,))
print('Support is %s' % (selector.support_,))
print('Ranking %s' % (selector.ranking_,))
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Compare LinearSVC test accuracy on the full iris feature set against the
# two features kept by RFE.
iris = load_iris()
X, y = iris.data, iris.target

estimator = LinearSVC()
selector = RFE(estimator=estimator, n_features_to_select=2)
# NOTE: the selector is fitted on the whole dataset, i.e. before the
# train/test split below, so feature selection also sees the test rows.
X_t = selector.fit_transform(X, y)

# Both splits use identical parameters (including random_state), so the
# original and the reduced matrices are partitioned with the same settings.
split_options = dict(test_size=0.25, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    X_t, y, **split_options
)

clf = LinearSVC()
clf_t = LinearSVC()
clf.fit(X_train, y_train)
clf_t.fit(X_train_t, y_train_t)
print('Original DataSet:test score=%s' % (clf.score(X_test, y_test)))
print('Selected DataSet: test score=%s' % (clf_t.score(X_test_t, y_test_t)))
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
# RFECV demo: recursive feature elimination with 3-fold cross-validation on
# the iris data, using a LinearSVC estimator.
iris = load_iris()
X = iris.data
y = iris.target

estimator = LinearSVC()
selector = RFECV(estimator=estimator, cv=3)
selector.fit(X, y)

print('N_features %s' % (selector.n_features_,))
print('Support is %s' % (selector.support_,))
print('Ranking %s' % (selector.ranking_,))

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `cv_results_`. Prefer the new attribute when it exists and fall
# back to the old one so the script runs on both old and new releases.
if hasattr(selector, 'cv_results_'):
    grid_scores = selector.cv_results_['mean_test_score']
else:
    grid_scores = selector.grid_scores_
print('Grid Scores %s' % (grid_scores,))
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.datasets import load_digits
# SelectFromModel demo: select digits features using an L1-penalised
# LinearSVC and a 'mean' threshold.
digits = load_digits()
X, y = digits.data, digits.target

estimator = LinearSVC(penalty='l1', dual=False)
selector = SelectFromModel(estimator=estimator, threshold='mean')
selector.fit(X, y)

# The transformed matrix is computed but not stored or printed.
selector.transform(X)

print('Threshold %s' % (selector.threshold_,))
print('Support is %s' % (selector.get_support(indices=True),))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.datasets import load_digits,load_diabetes
from sklearn.linear_model import Lasso
def test_Lasso(*data):
    """Plot the number of near-zero Lasso coefficients against alpha.

    :param data: exactly two positional arguments, the feature matrix ``X``
        and the target ``y``. ``X`` must expose ``shape`` because the y-axis
        limit is derived from ``X.shape[1]``.
    :return: None; the figure is displayed with ``plt.show()``.
    """
    X, y = data
    alphas = np.logspace(-2, 2)

    zeros = []
    for alpha in alphas:
        model = Lasso(alpha=alpha)
        model.fit(X, y)
        # A coefficient counts as zero when its magnitude is below 1e-5.
        zeros.append(sum(1 for coef in model.coef_ if abs(coef) < 1e-5))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, zeros)
    ax.set_xscale('log')
    ax.set_ylim(0, X.shape[1] + 1)
    ax.set_xlabel(r'$\alpha$')
    ax.set_ylabel('zeros in coef')
    ax.set_title('Sparsity In Lasso')
    plt.show()
def test_LinearSVC(*data):
    """Plot the number of near-zero L1-LinearSVC coefficients against C.

    :param data: exactly two positional arguments, the feature matrix ``X``
        and the target ``y``.
    :return: None; the figure is displayed with ``plt.show()``.
    """
    X, y = data
    Cs = np.logspace(-2, 2)

    zeros = []
    for C in Cs:
        clf = LinearSVC(C=C, penalty='l1', dual=False)
        clf.fit(X, y)
        # coef_ is iterated row by row; every entry whose magnitude is
        # below 1e-5 counts as zero.
        zeros.append(
            sum(1 for row in clf.coef_ for coef in row if abs(coef) < 1e-5)
        )

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(Cs, zeros)
    ax.set_xscale("log")
    ax.set_xlabel("C")
    ax.set_ylabel("zeros in coef")
    ax.set_title("Sparsity In SVM")
    plt.show()
if __name__ == '__main__':
    # Lasso sparsity sweep on the dataset returned by load_diabetes().
    data = load_diabetes()
    test_Lasso(data.data, data.target)

    # LinearSVC sparsity sweep on the dataset returned by load_digits().
    data = load_digits()
    test_LinearSVC(data.data, data.target)
from sklearn.svm import LinearSVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectFromModel
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


def test_Pipeline(data):
    """Fit a selection + LogisticRegression pipeline and print its test score.

    :param data: a sequence of four items unpacked as
        ``(X_train, X_test, y_train, y_test)``.
    :return: None; the named steps and the test score are printed.
    """
    X_train, X_test, y_train, y_test = data
    steps = [
        # Intermediate pipeline steps must implement transform(). A bare
        # LinearSVC no longer does (scikit-learn >= 0.19), so it is wrapped
        # in SelectFromModel to act as an L1-based feature selector.
        ('LinearSVC',
         SelectFromModel(LinearSVC(C=1, penalty='l1', dual=False))),
        ('LogisticRegression', LogisticRegression(C=1)),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)
    print('Name steps:', pipeline.named_steps)
    print('Pipeline Score:', pipeline.score(X_test, y_test))


if __name__ == '__main__':
    data = load_digits()
    X = data.data
    y = data.target
    # The original guard loaded the data but never ran the demo. The split
    # parameters mirror the other train/test split used in this file.
    test_Pipeline(
        train_test_split(X, y, test_size=0.25, random_state=0, stratify=y)
    )
from sklearn.decomposition import DictionaryLearning
# DictionaryLearning demo: learn a 3-component dictionary from a small
# matrix, then print the components and the transformed data.
X = [
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
    [10, 9, 8, 7, 6],
    [5, 4, 3, 2, 1],
]
print('before transform:', X)

dct = DictionaryLearning(n_components=3)
dct.fit(X)
print('components is :', dct.components_)

encoded = dct.transform(X)
print('after transform:', encoded)