# Tabular data
pd.read_csv(path)
pd.read_excel(path)
# pandas has no `read_txt`; delimited text files are read with `read_table`
# (tab-separated by default, pass `sep=` for any other delimiter).
pd.read_table(path)
# Images
plt.imread(path)
# MATLAB .mat files
from scipy.io import loadmat
data1 = loadmat(path)
# Read a pickled object in binary mode.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
with open(file_path, mode='rb') as file:
    data = pickle.load(file, encoding='ISO-8859-1')
# Write a DataFrame to an Excel file
df.to_excel(path)
# Import the module
import sqlite3
# Open a connection to the database
connection = sqlite3.connect('dbpath')
# Read the result of a query into a DataFrame
data = pd.read_sql("SQL语句", connection)
# Run a statement against the database (a string placeholder, like the query passed to read_sql)
connection.execute("SQL语句")
# Commit so that any change made by the statement is persisted rather than left in an open transaction
connection.commit()
# Write the DataFrame to the database; the first argument is the target table name, not the database path
data.to_sql('table_name', connection)
# Split the data into a training set and a test set (20% held out for testing)
from sklearn.model_selection import train_test_split
# The training target is bound as `y_train`, the name the model-fitting code in this file reads.
X_train, x_test, y_train, y_true = train_test_split(data, target, test_size=0.2)
# Define the encoding function
def transform(items):
    """Replace the values of column ``items`` of ``datas`` with integer codes.

    Each distinct value is mapped to its position in ``datas[items].unique()``.
    Missing values (None/NaN) are mapped to the position of the first missing
    entry in that array; comparing NaN with ``==`` never matches, so an
    equality-based lookup would fail on them.

    The module-level DataFrame ``datas`` is modified in place; nothing is
    returned.
    """
    unique_list = datas[items].unique()

    # Build the value -> code table once instead of scanning the array for every row.
    code_by_value = {}
    missing_code = None
    for code, value in enumerate(unique_list):
        if pd.isna(value):
            if missing_code is None:
                missing_code = code
        else:
            code_by_value[value] = code

    def change_string(item):
        if pd.isna(item):
            return missing_code
        return code_by_value[item]

    datas[items] = datas[items].map(change_string)
# Walk over the columns and apply transform() to every column whose dtype is 'object'
is_object_column = datas.dtypes == 'object'
for column_name in datas.columns[is_object_column]:
    transform(column_name)
# Divide each selected column by its own sum.
# NOTE(review): iterating the selection is used to obtain column labels, which
# presumes `col` selects several columns (a DataFrame) — confirm against the caller.
for column_name in datas.loc[:, col]:
    column_total = datas[column_name].sum()
    datas[column_name] = datas[column_name] / column_total
# Keep only the rows in which every column lies within 2 standard deviations of that column's mean
deviation = np.abs(datas - datas.mean(axis=0))
limit = 2 * (datas.std(axis=0))
within_limit = (deviation <= limit).all(axis=1)
samples = datas[within_limit]
# Predict with several machine-learning models
sklearn_model_dic = dict(
    KNN=KNeighborsRegressor(),
    Ridge=Ridge(),
    Lasso=Lasso(),
    Linear=LinearRegression(),
)
# Train each model and predict on the test data; the predictions are stored in
# a dictionary keyed by the algorithm's name.
predict_y_dic = {}
for name, estimator in sklearn_model_dic.items():
    # scikit-learn's fit() returns the estimator itself, so fit and predict can be chained
    predict_y_dic[name] = estimator.fit(X_train, y_train).predict(x_test)
# Show up to 100 test samples on a 10x10 grid; each title lists the KNN
# prediction, the logistic-regression prediction and the true label.
plt.figure(figsize=(12, 18))
# Stop at the number of available samples so that fewer than 100 test rows
# does not raise an IndexError.
for i in range(min(100, len(x_test))):
    plt.subplot(10, 10, i + 1)
    # Each sample is reshaped to 8x8 for display
    plt.imshow(x_test[i].reshape(8, 8))
    plt.axis('off')
    title = f'KNN:{knn_y_[i]}\nLOGIC:{logistic_y_[i]}\nTrue:{y_true[i]}'
    plt.title(title)
# Value range of the x and y axes: the first two columns of `train`, padded by 0.5 on each side
first_feature = train[:, 0]
second_feature = train[:, 1]
xmin, xmax = first_feature.min() - 0.5, first_feature.max() + 0.5
ymin, ymax = second_feature.min() - 0.5, second_feature.max() + 0.5
# Build the two axes with a step of 0.1 and expand them into a grid
x = np.arange(xmin, xmax, 0.1)
y = np.arange(ymin, ymax, 0.1)
xx, yy = np.meshgrid(x, y)
# One row per grid point: (x coordinate, y coordinate). Note this rebinds x_test.
x_test = np.column_stack((xx.ravel(), yy.ravel()))
# Score a regression model: R^2 between the true and the predicted values
from sklearn.metrics import r2_score
r2_score(y_true=y_true, y_pred=y_pre)
# Score an estimator directly on the test data
knn.score(x_test, y_true)
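# PCA reduces the dimensionality of the data, which shortens computation time and helps avoid overfitting.
# n_components sets how many components (features) to keep; a float between 0 and 1 instead means the fraction of the total variance to retain.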
# Principal component analysis (PCA), used mainly for dimensionality reduction
from sklearn.decomposition import PCA
# Keep 150 whitened components; fit() returns the fitted estimator, so construction and fitting are chained
pca = PCA(n_components=150, whiten=True).fit(X_train, y_train)
# Project the training and the test data with the same fitted model
X_train_pca, x_test_pca = pca.transform(X_train), pca.transform(x_test)
# Result: the original 784 features are reduced to 150.
# Candidate values for the two hyper-parameters being tuned
C = list(range(1, 10, 2))
gamma = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]
# Create the GridSearchCV object; `estimator` is the model whose parameters are tuned
search_space = {'C': C, 'gamma': gamma}
clf = GridSearchCV(estimator=svc, param_grid=search_space)
# Run the search on the PCA-reduced training data
clf.fit(X_train_pca, y_train)
# best_params_ holds the selected parameter combination
clf.best_params_
# Predict on the test data and inspect the score
y_pre = clf.predict(x_test_pca)
clf_score = clf.score(x_test_pca, y_true)
clf_score