更多pandas操作请参考添加链接描述pandas对于文件数据基本操作
导入的包sklearn
pip3 install --index-url https://pypi.douban.com/simple scikit-learn
#缺失值查看
df.replace('NaN ', np.nan, inplace=True)#将数据中一些None替换为NULL
df.isnull().sum()
df.fillna(method='pad', inplace=True) # 填充前一条数据的值
#类型转换
df[a] = df[a].apply(lambda x: float(x))
字符串编码处理,LabelEncoder
from sklearn.preprocessing import LabelEncoder
#使用LabelEncoder对数据集进行编码
le = LabelEncoder()
cat_data = ['需要编码的列名']
for i in cat_data:
df[i] = le.fit_transform(df[i])
TfidfVectorizer结合TruncatedSVD
#向量转换
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import joblib
# raw documents to tf-idf matrix:
vectorizer = TfidfVectorizer(stop_words='english',
use_idf=True,
smooth_idf=True)
# SVD to reduce dimensionality: 采用了随机SVD算法,迭代次数为10次,将维度降至5
svd_model = TruncatedSVD(n_components=5,
algorithm='randomized',
n_iter=10)
# pipeline of tf-idf + SVD, fit to and applied to documents:流水线
svd_transformer = Pipeline([('tfidf', vectorizer),
('svd', svd_model)])
# fit and transform the data:
dc_matrix = svd_transformer.fit_transform(data['分词描述'])
# save the models to disk:
joblib.dump(svd_transformer, 'svd_transformer.joblib')
# load the models from disk:
svd_transformer = joblib.load('svd_transformer.joblib')
#转换
dc_matrix = svd_transformer.transform(data['分词描述'])
dc_matrix.shape
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
x_data = df.iloc[:, 1:-1] # 特征值
y_data = df.iloc[:, -1] # labels
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)
# 使用ANOVA F-value作为评分函数选择最佳的10个特征
selector = SelectKBest(f_classif, k=10)
selector.fit(X_train, y_train)
# 获取选择的特征的索引
selected_features_indices = selector.get_support(indices=True)
# 获取选择的特征的名称
selected_features_names = x_data.columns[selected_features_indices]
# 打印选择的特征名称
print("Selected features:", selected_features_names)
# 绘制特征的评分图表
plt.bar(range(len(selector.scores_)), selector.scores_)
plt.xticks(range(len(selector.scores_)), x_data.columns, rotation=90)
plt.show()
from sklearn.model_selection import train_test_split
x_data = df.iloc[:, 0:-1] # 特征值0--2列
y_data = df.iloc[:, -1] # labels最后一列
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)
排除某一列,例如
x_data = df.drop(df.columns[5], axis=1)
from sklearn import metrics
#划分数据集,输入最佳参数
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
#需要调优的参数
#请尝试将L1正则和L2正则分开,并配合合适的优化求解算法(solver)
#tuned_parameters={'penalth':['l1','l2'],'C':[0.001,0.01,0.1,1,10,100,
# 1000]}
#参数的搜索范围
penaltys=['l1','l2']
Cs=[0.1,1,10,100,1000]
#调优的参数集合,搜索网格为2x5,在网格上的交叉点进行搜索
tuned_parameters=dict(penalty=penaltys,C=Cs)
lr_penalty=LogisticRegression(solver='liblinear')
grid=GridSearchCV(lr_penalty,tuned_parameters,cv=3,scoring='neg_log_loss',
n_jobs=4)
grid.fit(x_train,y_train)
#预测
lr_y_predict = grid.predict(x_test)
# 打印最佳参数和准确率
print('Best parameters:', grid.best_params_)
print('Best accuracy:', grid.best_score_)
#评估
score_lr = metrics.accuracy_score (y_test,lr_y_predict)
#绘制预测效果
print('accuracy_score 评估逻辑回归模型:',score_lr)
如果需要用到回归则换成RandomForestRegressor
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
#再对RandomForestClassifier进行调优
k = []
v = []
for i in tqdm(range(1,10)):
#数据划分
model= RandomForestClassifier(max_depth=i, random_state=2)
#加载模型训练
model.fit(x_train, y_train)
#预测
lr_y_predict = model.predict(x_test)
#评估
score_lr = accuracy_score(y_test,lr_y_predict)
#记录结果
k.append(score_lr)
v.append(i)
#展示每次调整的结果
print('best max_depth',v[k.index(max(k))])
val1 = v[k.index(max(k))]
#再对RandomForestClassifier进行调优
k = []
v = []
for i in tqdm(range(1,10)):
#数据划分
model= RandomForestClassifier(max_depth=val1, n_estimators=i)
#加载模型训练
model.fit(x_train, y_train)
#预测
lr_y_predict = model.predict(x_test)
#评估
score_lr = accuracy_score(y_test,lr_y_predict)
#记录结果
k.append(score_lr)
v.append(i)
print('best random_state',v[k.index(max(k))])
print('Accuracy 评估随机森林:',max(k))
from sklearn import svm
from sklearn.metrics import accuracy_score
# 创建 SVM 分类器并拟合训练数据
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
# 预测测试集并计算准确率
y_pred = clf.predict(x_test)
SVMaccuracy = accuracy_score(y_test, y_pred)
print('Accuracy SVM:', SVMaccuracy)
数据在dc_matrix里面
from sklearn.model_selection import GridSearchCV
# 定义参数范围
param_grid = {
'n_clusters': [5, 10, 15, 20],
'init': ['k-means++', 'random'],
'max_iter': [300, 500, 1000],
'tol': [1e-4, 1e-5, 1e-6],
'random_state': [42]
}
# 创建KMeans对象
kmeans = KMeans()
# 创建GridSearchCV对象
grid_search = GridSearchCV(kmeans, param_grid, n_jobs=-1)
# 对数据进行聚类和搜索最佳超参数
grid_search.fit(dc_matrix)
# 输出最佳超参数
best_params = grid_search.best_params_
print(best_params)
# 使用最佳超参数对数据进行聚类
best_kmeans = KMeans(**best_params)
kmeans_results = best_kmeans.fit_predict(dc_matrix)
plt.rcParams["figure.figsize"] = (12, 10)
plt.scatter(x = document_concept_matrix[0], y = document_concept_matrix[1], c=kmeans_results)
绘制距离
# Get distances to cluster centers
distances = best_kmeans.transform(dc_matrix)
# Plot the distances using a boxplot
sns.boxplot(data=distances)
plt.show()
#加入到表里面
dfco['分类结果']= pd.DataFrame([i for i in kmeans_results])[0]
dfco['到分类中心的距离']= pd.DataFrame([i for i in distances])[0]
dfco
师范,保存文本模型,使用其转换,调用聚类,预测
#保存聚类模型
import joblib
# save the models to disk:
joblib.dump(svd_transformer, 'svd_transformer.joblib')
# save the model to disk
filename = 'kmeans_model.sav'
joblib.dump(best_kmeans, filename)
#向量转换
import joblib
# load the models from disk:
svd_transformer = joblib.load('svd_transformer.joblib')
#转换为向量
test_dc_matrix_pf = svd_transformer.transform(dfco['分词描述2'])
# save the model to disk
filename = 'kmeans_model.sav'
# load the model from disk
loaded_model = joblib.load(filename)
# use the loaded model to make predictions
new_results_pf = loaded_model.predict(test_dc_matrix_pf)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
# 定义参数范围
n_neighbors = list(range(1, 11))
weights = ['uniform', 'distance']
algorithms = ['ball_tree', 'kd_tree', 'brute']
# 初始化空列表存储结果
results = []
# 循环遍历所有参数组合
for n in n_neighbors:
for w in weights:
for a in algorithms:
# 实例化模型
knn = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=a)
# 拟合数据
knn.fit(x_train, y_train)
# 预测测试集
res_y = knn.predict(x_test)
# 计算r2_score
score_knn = r2_score(y_test, res_y)
# 将参数组合和得分添加到结果列表
results.append((n, w, a, score_knn))
# 将结果转换为NumPy数组并按得分进行排序
results = np.array(results)
results = results[np.argsort(results[:, 3])[::-1]]
# 打印最佳参数和得分
best_params = {'n_neighbors': results[0][0], 'weights': results[0][1], 'algorithm': results[0][2]}
best_score = results[0][3]
print('Best parameters: ', best_params)
print('Best score: ', best_score)
# 绘制折线图,显示不同参数组合的得分
fig, ax = plt.subplots(figsize=(12, 6))
for i, weight in enumerate(weights):
mask = results[:, 1] == weight
x = results[mask][:, 0]
y = results[mask][:, 3]
ax.plot(x, y, marker='o', label=weight)
ax.set_xlabel('n_neighbors')
ax.set_ylabel('r2_score')
ax.set_title('KNeighborsClassifier parameter tuning')
ax.legend()
plt.show()
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
reg = Lasso()
param_grid = {'alpha': np.linspace(0, 1, 100)}
###STEP3###
#问题一:创建参数优化器GridSearchCV,将参数model,param_grid传入
grid = GridSearchCV(reg, param_grid, cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.score(x_test, y_test))
###STEP4###
#使用最优参数(grid.best_estimator_)的Lasso模型进行预测
ridge_model=grid.best_estimator_
y_hat = ridge_model.predict(x_test)
plt.scatter(y_test, y_hat)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Predicted Vs Actual Prices", fontsize=15)
plt.show()
from sklearn.ensemble import RandomForestRegressor
lr = LinearRegression(fit_intercept=True, normalize=False)
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)
score_lr = r2_score(y_test,lr_y_predict)
plt.scatter(y_test, lr_y_predict)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Predicted Vs Actual Prices", fontsize=15)
plt.show()
print('简单评估线性回归模型:',score_lr)
from sklearn.linear_model import RidgeCV
#加载岭回归模型
rr = RidgeCV(alphas=np.array([.1, .2, .3, .4]))
rr.fit(x_train,y_train)
rr_y_predict = rr.predict(x_test)
score_rr = r2_score(y_test,rr_y_predict)
print('简单评估岭回归模型:',score_rr)
plt.scatter(y_test, rr_y_predict)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Predicted Vs Actual Prices", fontsize=15)
plt.show()