For reference only.
estimator.fit(x_train, y_train) trains the model; once the call returns, the fitted model is available. estimator.score(x_test, y_test) returns the accuracy on the test set.
KNN API: sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, algorithm='auto'), where n_neighbors is the number of neighbors K (default 5) and algorithm chooses the neighbor-search method ('auto', 'ball_tree', 'kd_tree', 'brute').
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def knn_iris():
    # 1. Load the data
    iris = load_iris()
    print(iris)
    # 2. Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=6)
    print(x_train)
    print(x_test)
    print(y_train)
    print(y_test)
    # 3. Feature engineering: standardization
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    # The test set is standardized with the training set's statistics (mean, etc.)
    x_test = transfer.transform(x_test)
    # 4. KNN estimator
    estimator = KNeighborsClassifier(n_neighbors=3)
    estimator.fit(x_train, y_train)
    # 5. Model evaluation
    # Method 1: compare predictions with the true labels directly
    y_predict = estimator.predict(x_test)
    print("y_predict:\n", y_predict)
    print("y_test:\n", y_test)
    print("predictions == true labels:\n", y_predict == y_test)
    # Method 2: compute the accuracy
    score = estimator.score(x_test, y_test)
    print("accuracy:\n", score)
    return None
Grid search with cross-validation API: sklearn.model_selection.GridSearchCV(estimator, param_grid=None, cv=None), where estimator is the estimator to tune, param_grid is a dict of candidate parameter values (e.g. {"n_neighbors": [3, 5, 7, 9]}), and cv is the number of cross-validation folds. After fitting, the object exposes best_params_, best_score_, best_estimator_, and cv_results_.
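The Facebook check-in example below depends on a local Kaggle CSV, so here is a minimal, self-contained sketch of the same grid-search workflow on the iris data (the candidate list for n_neighbors is illustrative):

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=6)
# Try several K values with 3-fold cross-validation
estimator = GridSearchCV(KNeighborsClassifier(), param_grid={"n_neighbors": [3, 5, 7, 9]}, cv=3)
estimator.fit(x_train, y_train)
print("best params:", estimator.best_params_)
print("test accuracy:", estimator.score(x_test, y_test))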
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def knncls():
    # I. Data processing and feature engineering
    # 1. Read the data
    data = pd.read_csv("../resources/p01_machine_learning_sklearn/FBlocation/train.csv")
    # 2. Preprocessing
    # (1) Narrow the data range with a logical filter, df.query()
    data = data.query("x > 1.0 & x < 1.25 & y > 2.5 & y < 2.75")
    # (2) Derive time features
    time_value = pd.to_datetime(data["time"], unit="s")
    date = pd.DatetimeIndex(time_value)
    data["day"] = date.day
    data["weekday"] = date.weekday
    data["hour"] = date.hour
    # (3) Keep only places checked into more than three times
    place_count = data.groupby('place_id').count()["row_id"]
    tf = place_count[place_count > 3].index.values
    data_final = data[data['place_id'].isin(tf)]
    # 3. Separate features and target
    y = data_final['place_id']
    # y = data[['place_id']]
    x = data_final[["x", "y", "accuracy", "day", "weekday", "hour"]]
    # 4. Split and standardize
    # (1) Train/test split
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    # (2) Standardization: the test set reuses the training set's statistics (mean, etc.)
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # (3) KNN estimator wrapped in grid search with cross-validation
    estimator = KNeighborsClassifier()
    param_dict = {"n_neighbors": [3, 5, 7, 9]}
    estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
    estimator.fit(x_train, y_train)
    # (4) Model evaluation
    # Method 1: compare predictions with the true labels directly
    y_predict = estimator.predict(x_test)
    print("y_predict:\n", y_predict)
    print("y_test:\n", y_test)
    print("predictions == true labels:\n", y_predict == y_test)
    # Method 2: compute the accuracy
    score = estimator.score(x_test, y_test)
    print("accuracy:\n", score)
    # Best parameters: best_params_
    print("best params:\n", estimator.best_params_)
    # Best cross-validation score: best_score_
    print("best CV score:\n", estimator.best_score_)
    # Best estimator: best_estimator_
    print("best estimator:\n", estimator.best_estimator_)
    # Full cross-validation results: cv_results_
    print("CV results:\n", estimator.cv_results_)
    return None
Bayes' theorem:
$P(C|W)=\frac{P(W|C)P(C)}{P(W)}$
Applied to features $F_1, F_2, \dots$:
$P(C|F_1,F_2,\dots)=\frac{P(F_1,F_2,\dots|C)P(C)}{P(F_1,F_2,\dots)}$
Laplace smoothing, where $\alpha$ is the smoothing coefficient (usually 1), $N_i$ is the number of times feature $F_1$ appears in samples of class $C$, $N$ is the total feature count in class $C$, and $m$ is the number of distinct feature values:
$P(F_1|C)=\frac{N_i+\alpha}{N+\alpha m}$
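A quick worked example with toy numbers: if feature $F_1$ occurs $N_i=3$ times among the $N=10$ feature occurrences observed in class $C$, there are $m=4$ distinct feature values, and $\alpha=1$, then
$P(F_1|C)=\frac{3+1}{10+1\times 4}=\frac{4}{14}\approx 0.29$
Without smoothing, a feature never seen in class $C$ would get probability 0 and zero out the whole product.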
Naive Bayes API: sklearn.naive_bayes.MultinomialNB(alpha=1.0), where alpha is the Laplace smoothing coefficient.
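Before the full newsgroups example, a tiny self-contained sketch (toy word-count matrix and made-up labels) showing the input format MultinomialNB expects:

import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.array([[2, 1, 0],   # word counts per document
              [0, 1, 3],
              [1, 0, 2]])
y = np.array([0, 1, 1])    # made-up class labels
clf = MultinomialNB(alpha=1.0).fit(X, y)
print(clf.predict(np.array([[1, 1, 0]])))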
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def nb_cls():
    # 1. Fetch the 20 newsgroups data
    news = fetch_20newsgroups(subset='all')
    # 2. Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.3)
    # 3. Feature extraction for the text data: TF-IDF
    transfer = TfidfVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # 4. Estimator workflow: naive Bayes
    mlb = MultinomialNB(alpha=1.0)
    mlb.fit(x_train, y_train)
    # 5. Predict
    y_predict = mlb.predict(x_test)
    print("predicted class of each article:", y_predict[:100])
    print("true classes:", y_test[:100])
    print("accuracy:", mlb.score(x_test, y_test))
    return None
Information entropy of a random variable $X$:
$H(X)=-\sum_{i=1}^{n}P(x_i)\log_b P(x_i)$
Entropy of a dataset $D$ with classes $C_k$:
$H(D)=-\sum_{k=1}^{K}\frac{|C_k|}{|D|}\log\frac{|C_k|}{|D|}$
Information gain of feature $A$:
$g(D,A)=H(D)-H(D|A)$
where the conditional entropy is
$H(D|A)=\sum_{i=1}^{n}\frac{|D_i|}{|D|}H(D_i)=-\sum_{i=1}^{n}\frac{|D_i|}{|D|}\sum_{k=1}^{K}\frac{|D_{ik}|}{|D_i|}\log\frac{|D_{ik}|}{|D_i|}$
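A minimal sketch of these formulas in Python (toy arrays; the helper names entropy and info_gain are my own, and log base 2 is assumed):

import numpy as np

def entropy(labels):
    # H(D) = -sum_k (|C_k|/|D|) * log2(|C_k|/|D|)
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def info_gain(feature, labels):
    # g(D, A) = H(D) - H(D|A)
    h_d_a = 0.0
    for v in np.unique(feature):
        mask = feature == v
        h_d_a += mask.mean() * entropy(labels[mask])  # (|D_i|/|D|) * H(D_i)
    return entropy(labels) - h_d_a

weather = np.array(["sunny", "sunny", "rain", "rain", "rain"])
play = np.array([0, 0, 1, 1, 0])
print(entropy(play))             # H(D)
print(info_gain(weather, play))  # g(D, weather)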
Three classic decision tree algorithms: ID3 (information gain), C4.5 (information gain ratio), and CART (Gini index); sklearn implements an optimized CART.
API: class sklearn.tree.DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=None)
Notable hyperparameters: criterion, the split criterion ('gini' or 'entropy'); max_depth, the maximum depth of the tree.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

def decision_iris():
    # 1. Load the dataset
    iris = load_iris()
    # 2. Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=6)
    # 3. Decision tree estimator
    estimator = DecisionTreeClassifier(criterion="entropy")
    estimator.fit(x_train, y_train)
    # 4. Model evaluation
    # Method 1: compare predictions with the true labels directly
    y_predict = estimator.predict(x_test)
    print("y_predict:\n", y_predict)
    print("y_test:\n", y_test)
    print("predictions == true labels:\n", y_predict == y_test)
    # Method 2: compute the accuracy
    score = estimator.score(x_test, y_test)
    print("accuracy:\n", score)
    return None
sklearn.tree.export_graphviz() exports the fitted tree in DOT format:
tree.export_graphviz(estimator, out_file='tree.dot', feature_names=['', ''])
For the iris estimator above:
export_graphviz(estimator, out_file="iris_tree.dot", feature_names=iris.feature_names)
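The generated .dot file is plain text; it can be rendered with Graphviz, for example dot -Tpng iris_tree.dot -o iris_tree.png, or pasted into an online DOT viewer to inspect the tree.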
Random forest API: class sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, bootstrap=True, random_state=None, min_samples_split=2), where n_estimators is the number of trees in the forest and bootstrap controls whether bootstrap samples are drawn when building the trees.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

def random_iris():
    # 1. Load the dataset
    iris = load_iris()
    # 2. Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=6)
    # 3. Random forest estimator
    estimator = RandomForestClassifier(criterion="entropy", max_depth=8)
    estimator.fit(x_train, y_train)
    # 4. Model evaluation
    # Method 1: compare predictions with the true labels directly
    y_predict = estimator.predict(x_test)
    print("y_predict:\n", y_predict)
    print("y_test:\n", y_test)
    print("predictions == true labels:\n", y_predict == y_test)
    # Method 2: compute the accuracy
    score = estimator.score(x_test, y_test)
    print("accuracy:\n", score)
    return None
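In practice n_estimators and max_depth are usually tuned with the GridSearchCV workflow shown earlier; a minimal sketch on the same iris split (the candidate grids are illustrative):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=6)
param_dict = {"n_estimators": [10, 50, 100], "max_depth": [4, 8, 16]}
estimator = GridSearchCV(RandomForestClassifier(), param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)
print("best params:", estimator.best_params_)
print("test accuracy:", estimator.score(x_test, y_test))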