"""Preprocessing script for building train/val/test splits from the CWRU
(Case Western Reserve University) bearing data set.

Adapted and improved from open-source code.
"""
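# A sketch of the on-disk layout this script assumes (the folder names come
# from `datasetname` below; the exact tree depends on how the CWRU archive was
# downloaded, so adjust `data_dir` accordingly):
#
#   ../cwru/
#       Normal Baseline Data/97.mat, 98.mat, ...
#       12k Drive End Bearing Fault Data/105.mat, 118.mat, ...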
import os
from scipy.io import loadmat
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets.SequenceDatasets import dataset
from datasets.sequence_aug import *
from tqdm import tqdm
def get_files(root, N):
    '''
    Collect raw signal segments and labels for the requested working conditions.
    root: root directory of the CWRU data set
    N: list of working-condition keys into `dataname` (0-3, one per motor speed)
    '''
    data = []
    lab = []
    for k in range(len(N)):
        for n in tqdm(range(len(dataname[N[k]]))):
            # The first file of each condition is the healthy baseline and lives
            # under "Normal Baseline Data"; the rest are 12k drive-end fault
            # recordings.
            if n == 0:
                path1 = os.path.join(root, datasetname[3], dataname[N[k]][n]).replace("\\", "/")
            else:
                path1 = os.path.join(root, datasetname[0], dataname[N[k]][n]).replace("\\", "/")
            data1, lab1 = data_load(path1, dataname[N[k]][n], label=label[n])
            data += data1
            lab += lab1
    return [data, lab]
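# A minimal usage sketch (assuming the folder layout noted at the top of this
# file): collect all ten classes recorded at 1797 rpm, i.e. working condition 0.
#     data, labels = get_files("../cwru", N=[0])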
def data_load(filename, axisname, label):
    '''
    Load one .mat file and slice it into non-overlapping segments of
    `signal_size` samples.
    filename: path to the .mat file
    axisname: file name such as "105.mat"; its number selects the MAT variable
    label: integer class label attached to every segment
    The channel is chosen via `axis`: "_DE_time" (drive end), "_FE_time"
    (fan end) or "_BA_time" (base plate).
    '''
    datanumber = axisname.split(".")
    # MAT variable names are zero-padded to three digits.
    if int(datanumber[0]) < 100:
        realaxis = "X0" + datanumber[0] + axis[0]
    else:
        realaxis = "X" + datanumber[0] + axis[0]
    fl = loadmat(filename)[realaxis]
    data = []
    lab = []
    # Slide a non-overlapping window of signal_size samples over the recording.
    start, end = 0, signal_size
    while end <= fl.shape[0]:
        data.append(fl[start:end])
        lab.append(label)
        start += signal_size
        end += signal_size
    return data, lab
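# For reference, the key construction above maps file names to MAT variable
# names as follows (drive-end channel, axis[0]):
#     "97.mat"  -> "X097_DE_time"
#     "105.mat" -> "X105_DE_time"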
def data_split(data_dir, transfer_task, normlizetype="0-1", transfer_learning=True):
    '''
    Split the data for the working conditions in transfer_task.
    transfer_task: [source_N, target_N], e.g. [[0], [3]]
    transfer_learning=True returns wrapped dataset objects for the source and
    target domains; False returns raw numpy train/val/test arrays.
    '''
    source_N = transfer_task[0]
    target_N = transfer_task[1]
    data_transforms = {
        'train': Compose([
            Reshape(),
            Normalize(normlizetype),
            # Optional augmentations from datasets.sequence_aug:
            # RandomAddGaussian(),
            # RandomScale(),
            # RandomStretch(),
            # RandomCrop(),
            Retype(),
        ]),
        'val': Compose([
            Reshape(),
            Normalize(normlizetype),
            Retype(),
        ])
    }
    if transfer_learning:
        # Source domain: stratified 80/20 train/val split.
        list_data = get_files(data_dir, source_N)
        data_pd = pd.DataFrame({"data": list_data[0], "label": list_data[1]})
        train_pd, val_pd = train_test_split(data_pd, test_size=0.2, random_state=40, stratify=data_pd["label"])
        source_train = dataset(list_data=train_pd, transform=data_transforms['train'])
        source_val = dataset(list_data=val_pd, transform=data_transforms['val'])
        # Target domain: same split on the target working conditions.
        list_data = get_files(data_dir, target_N)
        data_pd = pd.DataFrame({"data": list_data[0], "label": list_data[1]})
        train_pd, val_pd = train_test_split(data_pd, test_size=0.2, random_state=40, stratify=data_pd["label"])
        target_train = dataset(list_data=train_pd, transform=data_transforms['train'])
        target_val = dataset(list_data=val_pd, transform=data_transforms['val'])
        return source_train, source_val, target_train, target_val
    else:
        # Source domain only: 80/20 train+val vs. test, then split the 80%
        # half-and-half, giving a 40/40/20 train/val/test split.
        list_data = get_files(data_dir, source_N)
        data_pd = pd.DataFrame({"data": list_data[0], "label": list_data[1]})
        trval_pd, test_pd = train_test_split(data_pd, test_size=0.2, random_state=40)
        train_pd, val_pd = train_test_split(trval_pd, test_size=0.5, random_state=40)
        xtrain = train_pd['data'].values
        ytrain = train_pd['label'].values
        xval = val_pd['data'].values
        yval = val_pd['label'].values
        xtest = test_pd['data'].values
        ytest = test_pd['label'].values
        return xtrain, ytrain, xval, yval, xtest, ytest
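# For the transfer-learning path (not exercised in __main__ below), the call
# would look like this, returning wrapped dataset objects for both domains:
#     src_train, src_val, tgt_train, tgt_val = data_split(
#         '../cwru', [[0], [3]], normlizetype='0-1', transfer_learning=True)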
if __name__ == '__main__':
    # The helper functions above read these names as module-level globals, so
    # they must be defined before data_split() is called.
    # Digital data was collected at 12,000 samples per second.
    signal_size = 1024
    # .mat files per working condition; each row is one motor speed, and the
    # first file of each row is the healthy baseline.
    dataname = {0: ["97.mat", "105.mat", "118.mat", "130.mat", "169.mat", "185.mat", "197.mat", "209.mat", "222.mat", "234.mat"],  # 1797 rpm
                1: ["98.mat", "106.mat", "119.mat", "131.mat", "170.mat", "186.mat", "198.mat", "210.mat", "223.mat", "235.mat"],  # 1772 rpm
                2: ["99.mat", "107.mat", "120.mat", "132.mat", "171.mat", "187.mat", "199.mat", "211.mat", "224.mat", "236.mat"],  # 1750 rpm
                3: ["100.mat", "108.mat", "121.mat", "133.mat", "172.mat", "188.mat", "200.mat", "212.mat", "225.mat", "237.mat"]}  # 1730 rpm
    datasetname = ["12k Drive End Bearing Fault Data", "12k Fan End Bearing Fault Data",
                   "48k Drive End Bearing Fault Data", "Normal Baseline Data"]
    axis = ["_DE_time", "_FE_time", "_BA_time"]
    label = list(range(10))  # ten classes: healthy baseline plus nine fault conditions
    data_dir = '../cwru'
    output_dir = '../../data/CWRU'
    transfer_task = [[0], [3]]
    # Normalization mode string for Normalize (e.g. "0-1" or "mean-std"). Note
    # that with transfer_learning=False the transforms are built but never
    # applied, so normalization is left to downstream code.
    normlizetype = 'mean-std'
    X_train, y_train, X_val, y_val, X_test, y_test = data_split(
        data_dir, transfer_task, normlizetype, transfer_learning=False)
    print(f"train/val/test sizes: {len(X_train)}/{len(X_val)}/{len(X_test)}")
    # Save each split as a dict of channel-first float samples,
    # shape (num_samples, 1, signal_size), plus integer labels.
    os.makedirs(output_dir, exist_ok=True)
    for split_name, X, y in [("train", X_train, y_train),
                             ("val", X_val, y_val),
                             ("test", X_test, y_test)]:
        dat_dict = dict()
        # X has object dtype (one (signal_size, 1) array per sample), so stack
        # it into one numeric array before converting to a tensor.
        dat_dict["samples"] = torch.tensor(np.stack(X)).permute(0, 2, 1)
        dat_dict["labels"] = torch.from_numpy(y.astype(np.int64))
        torch.save(dat_dict, os.path.join(output_dir, f"{split_name}.pt"))
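
    # A minimal sanity-check sketch: reload the split just written and wrap it
    # in standard torch.utils.data containers (batch_size=64 is an arbitrary
    # illustrative choice, not part of the original pipeline).
    from torch.utils.data import TensorDataset, DataLoader
    ckpt = torch.load(os.path.join(output_dir, "train.pt"))
    train_ds = TensorDataset(ckpt["samples"].float(), ckpt["labels"].long())
    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
    print(f"train batches: {len(train_loader)}, sample shape: {tuple(train_ds[0][0].shape)}")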