# SMS spam classification with TF-IDF features and logistic regression.
#
# Part 1 grid-searches a TfidfVectorizer + LogisticRegression pipeline and
# reports accuracy / precision / recall on the held-out test split.
# Part 2 fits an untuned baseline on the same split and reports 5-fold
# cross-validated accuracy, precision, recall and F1 on the training data.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public package: the private ``sklearn.linear_model.logistic``
# module was deprecated and later removed from scikit-learn.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# ---------------------------------------------------------------------------
# Part 1: hyper-parameter search
# ---------------------------------------------------------------------------
pipeline = Pipeline([
    ('vect', TfidfVectorizer(stop_words='english')),
    # 'liblinear' supports both penalties searched below; the current default
    # solver ('lbfgs') rejects penalty='l1', which would fail half of the grid.
    ('clf', LogisticRegression(solver='liblinear')),
])

# Keys follow the ``<step name>__<parameter>`` convention used by Pipeline.
parameters = {
    'vect__max_df': (0.25, 0.5, 0.75),
    'vect__stop_words': ('english', None),
    'vect__max_features': (2500, 5000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}

df = pd.read_csv('./sms.csv')
X = df['message']
y = df['label']

# Encode the labels as integers so the binary metrics below (which treat
# label 1 as the positive class) work regardless of how the CSV stores them.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Fixed seed: the tuned model and the baseline in part 2 are evaluated on the
# same train/test split, and reruns are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

grid_search = GridSearchCV(
    pipeline,
    parameters,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X_train, y_train)

print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

# GridSearchCV predicts with the best pipeline found during the search.
predictions = grid_search.predict(X_test)
print('Accuracy: %s' % accuracy_score(y_test, predictions))
print('Precision: %s' % precision_score(y_test, predictions))
print('Recall: %s' % recall_score(y_test, predictions))

# ---------------------------------------------------------------------------
# Part 2: untuned baseline
# ---------------------------------------------------------------------------
# Reuses the messages and encoded labels loaded above instead of reading the
# CSV a second time. NOTE: X_train / X_test are rebound to TF-IDF matrices here.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=11)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

scores = cross_val_score(classifier, X_train, y_train, cv=5)
print('Accuracies: %s' % scores)
print('Mean accuracy: %s' % np.mean(scores))

precisions = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision')
print('Precision: %s' % np.mean(precisions))

recalls = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall')
print('Recall: %s' % np.mean(recalls))

f1s = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
print('F1 score: %s' % np.mean(f1s))
调参后(GridSearchCV 搜索得到的最优模型)的输出:
Best score: 0.983
Best parameters set:
clf__C: 10
clf__penalty: 'l2'
vect__max_df: 0.5
vect__max_features: None
vect__ngram_range: (1, 2)
vect__stop_words: None
vect__use_idf: True
Accuracy: 0.9863701578192252
Precision: 0.994535519125683
Recall: 0.91
调参前(默认参数的基线模型,训练集上 5 折交叉验证)的输出:
Accuracies: [0.95221027 0.95454545 0.96172249 0.96052632 0.95209581]
Mean accuracy: 0.9562200683094717
Precision: 0.992542742398164
Recall: 0.6836050302748021
F1 score: 0.8090678466269784
可以看到,调参后 Recall 从约 0.68 大幅提升到 0.91,模型效果明显改善(注意:前者是训练集上 5 折交叉验证的均值,后者是测试集上的结果,两者并非严格可比)。GridSearchCV 本质上就是对参数网格的穷举(暴力)搜索:该方法在小数据集上很实用,数据集变大后计算开销迅速增加,就不太适用了。
在大数据集的情况下,网格搜索容易造成内存溢出。可以试试下面 GridSearchCV + SVM 的代码,看看是否会溢出。
# Grid search over an RBF-kernel SVM on MNIST.
#
# Downloads the 'mnist_784' dataset, previews 15 sample digits, then tunes
# gamma and C on the first 10000 training rows and prints a classification
# report for the held-out test split.
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# fetch_mldata is no longer imported: it was removed from scikit-learn, which
# made this import line fail, and nothing in this script uses it.
from sklearn.datasets import fetch_openml
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# as_frame=False returns NumPy arrays; without it current scikit-learn may
# return a pandas DataFrame, on which the integer row indexing and
# ``.reshape`` used below do not work.
mnist = fetch_openml('mnist_784', as_frame=False)

# Preview a 3 x 5 grid of samples; each row is reshaped to a 28 x 28 image.
counter = 1
for i in range(1, 4):
    for j in range(1, 6):
        plt.subplot(3, 5, counter)
        plt.imshow(
            mnist.data[(i - 1) * 8000 + j * 200].reshape((28, 28)),
            cmap=cm.Greys_r,
        )
        plt.axis('off')
        counter += 1
plt.show()

if __name__ == '__main__':
    X, y = mnist.data, mnist.target
    # Linear rescale that maps 0 -> -1 and 255 -> 1.
    X = X / 255.0 * 2 - 1
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11)

    pipeline = Pipeline([
        # gamma and C given here are only starting values; the grid below
        # overrides both during the search.
        ('clf', SVC(kernel='rbf', gamma=0.01, C=100)),
    ])
    parameters = {
        'clf__gamma': (0.01, 0.03),
        'clf__C': (0.1, 0.3),
    }

    grid_search = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=2,
        verbose=1,
        scoring='accuracy',
    )
    # Only the first 10000 training rows are used for the search.
    grid_search.fit(X_train[:10000], y_train[:10000])

    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    predictions = grid_search.predict(X_test)
    print(classification_report(y_test, predictions))