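# The date columns in the four spreadsheets need fairly involved clean-up, so this
# script was written to process all four data sets with the same code.
# Detailed usage notes are in the main section at the bottom of the file; copy and paste to use.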
import datetime
import pandas as pd
def check(string, df, i, num, error_list):
    """Parse a normalised date string and store it in the frame if it is in range.

    Args:
        string: Date text expected in ``YYYY/MM/DD`` form.
        df: DataFrame whose cell is overwritten on success.
        i: Positional row index of the cell (also the value recorded on failure).
        num: Positional column index of the cell.
        error_list: List that ``is_valid`` appends ``i`` to when the date is rejected.
    """
    # errors='coerce' yields NaT instead of raising when the text does not match.
    parsed = pd.to_datetime(string, errors='coerce', format='%Y/%m/%d')
    if not is_valid(parsed, error_list, i):
        return
    df.iloc[i, num] = parsed
# Remove space characters from every piece of a split date string.
def data_process(data):
    """Return a new list with all ``" "`` characters removed from each item.

    Args:
        data: Iterable of strings (the parts of a split date).

    Returns:
        A list of the same length; only the plain space character is stripped,
        other whitespace is left as is.
    """
    return [piece.replace(" ", "") for piece in data]
# Recognise mis-formatted three-part dates and rewrite them as YYYY/MM/DD.
def judge(df, data, i, num, date, error_list):
    """Rebuild a ``YYYY/MM/DD`` string from three date parts and hand it to ``check``.

    The layout is recognised purely from the lengths of the three parts:

    * day-month-year: lengths (2, 2, 4) or (2, 1, 4);
    * year-month-day: a 4-character year with a 1- or 2-character month and day;
    * year-month-day with a 5-character first part, whose first character is
      dropped, for (5, 1, 2), (5, 1, 1) and (5, 2, 1) only.

    Any other combination is printed as abnormal and ``i`` is appended to
    ``error_list``.

    Args:
        df: DataFrame passed through to ``check``.
        data: Sequence whose first three items are the split date parts.
        i: Positional row index of the cell.
        num: Positional column index of the cell.
        date: Original cell value, used only in the diagnostic message.
        error_list: List collecting row indices of bad cells.
    """
    first, middle, last = data[0], data[1], data[2]
    shape = (len(first), len(middle), len(last))

    day_month_year = {(2, 2, 4), (2, 1, 4)}
    year_month_day = {
        (4, 2, 2), (4, 1, 1), (4, 2, 1), (4, 1, 2),
        (5, 1, 2), (5, 1, 1), (5, 2, 1),
    }

    if shape in day_month_year:
        year, month, day = last, middle, first
    elif shape in year_month_day:
        # For a 5-character first part this keeps characters 1..4, i.e. drops
        # the leading character; a 4-character year is kept whole.
        year, month, day = first[-4:], middle, last
    else:
        print("第{}行{}列异常的数据:{}".format(i + 1, num + 1, date))
        error_list.append(i)
        return

    # Left-pad single-character month/day with '0'; two-character parts are unchanged.
    string = year + '/' + month.rjust(2, '0') + '/' + day.rjust(2, '0')
    check(string, df, i, num, error_list)
# Handle dates written as one undelimited year-month-day string.
def str_process(df, date, i, num, error_list):
    """Split ``date`` as ``YYYYMMDD``, parse it, and store it if it is in range.

    Args:
        df: DataFrame whose cell is overwritten on success.
        date: Text sliced as characters 0-3 (year), 4-5 (month), 6-7 (day).
        i: Positional row index of the cell.
        num: Positional column index of the cell.
        error_list: List that ``is_valid`` appends ``i`` to when the date is rejected.
    """
    year, month, day = date[0:4], date[4:6], date[6:8]
    # errors='coerce' yields NaT instead of raising when the text does not match.
    parsed = pd.to_datetime('/'.join((year, month, day)), errors='coerce', format='%Y/%m/%d')
    if is_valid(parsed, error_list, i):
        df.iloc[i, num] = parsed
# Check whether a date is acceptable. To change the accepted range, edit the
# (2010, 12, 31) bound inside the function.
def is_valid(date, error_list, i):
    """Return True when ``date`` lies within 2007-01-01 .. 2010-12-31 inclusive.

    Both bounds are midnight datetimes, so a value later than 00:00 on
    2010-12-31 is rejected. A rejected value is printed with its 1-based row
    number and ``i`` is appended to ``error_list``.

    Args:
        date: Value to compare against the two datetime bounds.
        error_list: List collecting row indices of rejected values.
        i: Positional row index of the value.

    Returns:
        True if the value is in range, otherwise False.
    """
    earliest = datetime.datetime(2007, 1, 1)
    latest = datetime.datetime(2010, 12, 31)

    if date >= earliest and date <= latest:
        return True

    print("第{}行异常的数据:{}".format(i + 1, date))
    error_list.append(i)
    return False
# Helper: report one bad cell.
def _report_bad_cell(i, col, value, error_list):
    """Print the 1-based row/column of an abnormal cell and record its row index."""
    print("第{}行{}列异常的数据:{}".format(i + 1, col + 1, value))
    error_list.append(i)


# Helper: validate or normalise one cell.
def _clean_cell(df, i, col, flag, error_list):
    """Validate or normalise the cell at positional ``(i, col)`` of ``df``."""
    date = df.iloc[i, col]

    # isinstance rather than an exact type match, so datetime subclasses such
    # as pandas.Timestamp are range-checked as well. Missing values (NaT) are
    # excluded here so they keep following the text path below, where `flag`
    # decides whether they count as errors.
    if isinstance(date, datetime.datetime) and not pd.isna(date):
        is_valid(date, error_list, i)
        return

    # A bare integer is never accepted as a date.
    if isinstance(date, int):
        _report_bad_cell(i, col, date, error_list)
        return

    try:
        data = data_process(date.split('-'))
        if len(data) == 3:
            judge(df, data, i, col, date, error_list)
        elif len(data) == 1:
            # No '-' separator: try '/' next, then an undelimited 8-character string.
            data = data_process(date.split('/'))
            if len(data) == 3:
                judge(df, data, i, col, date, error_list)
            elif len(data) == 1 and len(date) == 8:
                str_process(df, date, i, col, error_list)
            else:
                _report_bad_cell(i, col, date, error_list)
        else:
            _report_bad_cell(i, col, date, error_list)
    except Exception:
        # Reached when the cell cannot be handled as text (for example a blank
        # cell, which has no .split). `flag` decides whether that is an error.
        if flag:
            _report_bad_cell(i, col, date, error_list)


# Data-cleaning pipeline.
def date_process(file_path, sheet_name, new_excel, new_sheet, flag, is_delete):
    """Clean the date cells in positional columns 3 and 4 of one sheet and save it.

    Each cell is either range-checked (datetime values) or parsed from text
    (``-`` separated, ``/`` separated, or an undelimited 8-character string)
    and rewritten in place. Rows containing a cell that fails are collected
    and optionally dropped before the frame is written out.

    Args:
        file_path: Path of the Excel workbook to read.
        sheet_name: Name of the sheet to read.
        new_excel: Path of the Excel file to write.
        new_sheet: Sheet name used in the output file.
        flag: When true, a cell that cannot be processed as text (such as a
            blank cell) is reported as an error; when false it is left as is.
        is_delete: When true, every row with at least one reported cell is
            removed from the output.
    """
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    error_list = []
    row_count = df.shape[0]

    for col in range(3, 5):
        for i in range(row_count):
            try:
                _clean_cell(df, i, col, flag, error_list)
            except Exception:
                # Keep going with the next cell, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare except would.
                print("出错{} {}".format(i, row_count))

    if is_delete:
        # A row can be reported once per column; drop each position only once.
        df = df.drop(df.index[sorted(set(error_list))])
    df.to_excel(new_excel, index=False, sheet_name=new_sheet)
# Script entry point: edit the settings below, then run the file.
if __name__ == '__main__':
    # Whether blank cells count as errors: True if they do, False if not.
    NULL_Error = True
    # Whether rows containing errors are deleted: True to delete, False to keep.
    is_delete = True
    # Path of the workbook to clean (an absolute path may be used).
    file_path = "Data/data1.xls"
    # Name of the sheet to clean.
    sheet_name = "脑卒中"
    # File name the cleaned data is saved to.
    new_excel = 'data6.xls'
    # Sheet name the cleaned data is saved under.
    new_sheet = 'Sheet1'

    date_process(
        file_path=file_path,
        sheet_name=sheet_name,
        new_excel=new_excel,
        new_sheet=new_sheet,
        flag=NULL_Error,
        is_delete=is_delete,
    )