波士顿房价:回归
# Boston housing dataset: regression task.
# NOTE(review): load_boston was removed in scikit-learn >= 1.2;
# on modern versions use fetch_openml(name="boston", version=1) instead — confirm installed version.
from sklearn.datasets import load_boston

data = load_boston()
X, y = data.data, data.target.reshape(-1, 1)  # X: shape=[506, 13], y: shape=[506, 1]
手写数字:10类
# Handwritten digits dataset: 10-class classification.
from sklearn.datasets import load_digits

data = load_digits()
X = data.data  # flat features, shape=[1797, 64]
# Alternative: X = data.images gives the raw 8x8 images, shape=[1797, 8, 8]
y = data.target.reshape(-1, 1)  # shape=[1797, 1]
鸢尾花:3类
# Iris dataset: 3-class classification.
from sklearn.datasets import load_iris

data = load_iris()
X, y = data.data, data.target.reshape(-1, 1)  # X: shape=[150, 4], y: shape=[150, 1]
# Split the data into train/test subsets (default test fraction is 0.25).
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)
# Standardize features and targets to zero mean / unit variance.
# fit_transform learns mean/var from the data; transform reuses the learned stats,
# so fit only on the training split to avoid test-set leakage.
from sklearn.preprocessing import StandardScaler

SS_X = StandardScaler()
SS_y = StandardScaler()
X_train = SS_X.fit_transform(X_train)
X_test = SS_X.transform(X_test)
y_train = SS_y.fit_transform(y_train)
y_test = SS_y.transform(y_test)
# Inspect learned statistics via the fitted attributes (note the trailing underscore):
# SS_X.mean_, SS_X.var_ and SS_y.mean_, SS_y.var_
# Hyper-parameter grid search over a Pipeline (logistic regression example).
from sklearn.linear_model import LogisticRegression  # was missing in the original snippet
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

# liblinear supports both the l1 and l2 penalties searched below
pipeline = Pipeline([("clf", LogisticRegression(solver="liblinear"))])
# Pipeline parameter names use "<step>__<param>" (double underscore);
# the original "clfpenalty"/"clfC" would raise an invalid-parameter error.
parameters = {"clf__penalty": ("l1", "l2"), "clf__C": (0.01, 0.1, 1, 10)}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=0, scoring="accuracy", cv=3)
grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_estimator_.get_params()
best_score = grid_search.best_score_
print("最佳效果:%0.3f" % best_score)
print("最优参数组合:", [[para_name, best_parameters[para_name]] for para_name in parameters.keys()])
# Persist a fitted model to disk, then restore it.
import joblib

joblib.dump(model, 'model.pkl')   # save
model = joblib.load('model.pkl')  # load
评估score
# Built-in evaluation via estimator.score(X_test, y_test).
model.score(X_test, y_test)
# Default scorers by estimator:
#   sklearn.linear_model.LinearRegression   -> R^2 (r2_score)
#   sklearn.linear_model.SGDRegressor       -> R^2 (its training *loss* defaults to squared error)
#   sklearn.linear_model.LogisticRegression -> accuracy
#   sklearn.tree.DecisionTreeClassifier     -> accuracy
交叉验证
# k-fold cross-validation.
from sklearn.model_selection import cross_val_score

# scoring=None uses the model's default scorer; valid string values include
# "neg_mean_squared_error", "accuracy", "precision", "recall", "f1", etc.
scores = cross_val_score(model, X, y, scoring=None, cv=k)
混淆矩阵
# Confusion matrix: rows are true labels, columns are predictions.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
准确率
# Accuracy: fraction of correctly classified samples.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
精确率
# Precision: TP / (TP + FP).
from sklearn.metrics import precision_score

precision = precision_score(y_test, y_pred)
召回率
# Recall: TP / (TP + FN).
from sklearn.metrics import recall_score

recall = recall_score(y_test, y_pred)
F1
# F1 score: harmonic mean of precision and recall.
# Fix: the original assigned recall_score(...) to f1 by mistake.
from sklearn.metrics import f1_score

f1 = f1_score(y_test, y_pred)
ROC
# ROC curve. roc_curve expects scores for the POSITIVE class, which is
# column 1 of predict_proba (assuming binary classes ordered [0, 1]);
# the original passed column 0 (negative class), inverting the curve.
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
AUC
# Area under the ROC curve, from the fpr/tpr arrays produced by roc_curve.
from sklearn.metrics import auc

auc_val = auc(fpr, tpr)
打印报告
# Per-class precision/recall/F1 summary.
# Fix: print the report — the original evaluated the string and discarded it.
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))
普通线性回归
# Ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
model.predict(X_test)
多项式回归 (k) 阶
# Degree-k polynomial regression: expand features, then fit a linear model.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

PF = PolynomialFeatures(degree=k)
X_train_k, y_train_k = PF.fit_transform(X_train), y_train  # fit the expansion on train only
X_test_k, y_test_k = PF.transform(X_test), y_test
model = LinearRegression()
model.fit(X_train_k, y_train_k)
model.predict(X_test_k)
# Stochastic-gradient-descent regressor.
# Fix: the loss name "squared_loss" was renamed to "squared_error"
# (deprecated in scikit-learn 1.0, removed in 1.2).
from sklearn.linear_model import SGDRegressor

model = SGDRegressor(loss="squared_error")  # squared error is also the default
model.fit(X_train, y_train)
model.predict(X_test)
# Logistic regression (classification).
# Fix: the original imported LogisticRegression but instantiated LinearRegression.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
model.predict(X_test)  # predicted class labels
# model.predict_proba(X_test)  # alternative: per-class probabilities
# Support-vector classifier with an RBF kernel.
from sklearn.svm import SVC

model = SVC(kernel="rbf", C=2.0, gamma=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Decision-tree classifier using the entropy split criterion.
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion="entropy", max_depth=10,
                               min_samples_split=2, min_samples_leaf=3)
model.fit(X_train, y_train)
model.predict(X_test)
# Random-forest classifier: an ensemble of 40 entropy-based trees.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(criterion="entropy", n_estimators=40, max_depth=16,
                               min_samples_split=2, min_samples_leaf=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Gradient-boosted regression trees.
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# XGBoost regressor (third-party gradient boosting).
from xgboost import XGBRegressor

model = XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# LightGBM regressor (third-party gradient boosting).
from lightgbm import LGBMRegressor

model = LGBMRegressor(boosting_type="gbdt", learning_rate=0.1,
                      n_estimators=100, max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# k-nearest-neighbors classifier (k = 3).
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
K-means
# K-means clustering into k clusters.
# Fix: fitted attributes carry a trailing underscore; the original's
# model.labels / model.cluster_centers would raise AttributeError.
from sklearn.cluster import KMeans

model = KMeans(n_clusters=k)
model.fit(X)
labels, centers = model.labels_, model.cluster_centers_
高斯混合模型GMM
# Gaussian mixture model with k components.
# Fix: the original used np without importing numpy.
import numpy as np
from sklearn.mixture import GaussianMixture

model = GaussianMixture(n_components=k).fit(X)
labels = model.predict(X)
# Probability of each sample's assigned component, rounded to 3 decimals.
probas = np.max(model.predict_proba(X), axis=1).round(3)
PCA
# PCA: project the data onto its top 2 principal components.
from sklearn.decomposition import PCA

model = PCA(n_components=2)
Xr = model.fit_transform(X)
朴素贝叶斯NB
# Multinomial naive Bayes (suited to count/frequency features).
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Single-layer perceptron classifier.
from sklearn.linear_model import Perceptron

model = Perceptron(max_iter=1000, eta0=0.1)  # eta0 is the learning rate
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Multi-layer perceptron classifier, two hidden layers, trained with SGD.
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(32, 16), activation="logistic", solver="sgd",
                      batch_size=8, learning_rate_init=0.01, max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)