后来又遇到同样需要切割 txt 文档的需求,这次改用了正则表达式,发现正则更方便。还没时间整理,先直接放源码。这段代码是刚接触 Python 时写的,当时对很多包还不熟悉,写法也不成熟,大家凑合着看。
#-*- coding: utf-8 -*-
# Marker words used as delimiters by the regexes in readTXT.
w1 = '新增' # start marker; the text between w1 and w2 is captured
w2 = '负责人' # the text between w2 and w3 is captured
w3 = '截止日期' # the text between w3 and w4 is captured
# Alternative end marker, currently disabled: w4 = '号'
w4 = '日' # end marker that closes the last captured field
import re
import xlsxwriter
txt_url = r"D:\文本记录1.txt" # path of the input txt file
sava_url = r"D:\文本纪录2.xlsx" # path where the Excel workbook is saved
def _replace_each(text, targets, replacement):
    """Return *text* with every string in *targets* replaced, in list order."""
    for target in targets:
        text = text.replace(target, replacement)
    return text


def readTXT(path=None, markers=None):
    """Read a UTF-8 text file and extract the fields found between markers.

    Three lists are collected with non-greedy matches that may span lines:
    the text between markers 1 and 2, between markers 2 and 3, and between
    markers 3 and 4.

    Args:
        path: File to read. Defaults to the module-level ``txt_url``.
        markers: Sequence of four delimiter strings. Defaults to the
            module-level ``(w1, w2, w3, w4)``. The markers are matched
            literally, so they may contain regex metacharacters.

    Returns:
        A tuple ``(first, second, third)`` of lists of strings. Entries of
        the second list have the punctuation marks listed below removed;
        entries of the third list have "年" and "月" replaced by "/".

    Raises:
        ValueError: If ``markers`` does not contain exactly four items.
        OSError: If the file cannot be opened.
    """
    if path is None:
        path = txt_url
    if markers is None:
        markers = (w1, w2, w3, w4)
    # Escape the markers so characters such as "(", "[" or "+" inside a
    # marker are matched literally instead of being read as regex syntax.
    first, second, third, last = (re.escape(marker) for marker in markers)

    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    # re.S lets "." match newlines, so a field may span several lines.
    plans = re.findall(first + '(.*?)' + second, content, re.S)
    owners = re.findall(second + '(.*?)' + third, content, re.S)
    dates = re.findall(third + '(.*?)' + last, content, re.S)

    # Strip punctuation from the second field (the names).
    punctuation = [",", "《", "。", "》", "、","?",";",":","’","“","【","{","】","}","·","~","!","@","#","¥","%","……","&","*","(",")",".","-","/","+"]
    owners = [_replace_each(owner, punctuation, "") for owner in owners]

    # Turn the year and month characters in the third field into "/".
    dates = [_replace_each(date, ["年", "月"], "/") for date in dates]

    return plans, owners, dates
# NOTE(review): the return value of this call is discarded, and readTXT() is
# called again further down where its result is actually used. This call looks
# redundant (it only re-reads the file) -- confirm before removing.
readTXT()
def saveExcel(str1, str2, str3=None):
    """Write the extracted columns to an Excel workbook at ``sava_url``.

    Row 0 holds the column titles. Data starts at row 1 and each list is
    written top-down into its own column, independently of the others:
    ``str1`` into "Issue" (column 2), ``str2`` into "PIC" (column 5) and
    ``str3`` into "Due Day" (column 6). For every ``str1`` entry the row
    number is written to "NO", "-" to "Dept"/"Action"/"Link" and
    "Ongoing" to "Status".

    Rows are numbered continuously. The previous implementation reset its
    row counter every 1000 items, so entry 1001 onward overwrote rows
    starting at row 1.

    Args:
        str1: Sequence of values for the "Issue" column.
        str2: Sequence of values for the "PIC" column.
        str3: Sequence of values for the "Due Day" column. When omitted,
            the module-level ``str3`` is used if it exists (this is what
            the function body used to read implicitly); otherwise the
            column is left empty.
    """
    if str3 is None:
        # Backward compatibility for two-argument callers: fall back to the
        # module-level name the old body relied on.
        str3 = globals().get('str3', ())

    headers = ('NO', 'Dept', 'Issue', 'Action', 'Link', 'PIC', 'Due Day', 'Status')

    # The context manager closes (and thereby saves) the workbook even if a
    # write raises part-way through.
    with xlsxwriter.Workbook(sava_url) as xl:
        sheet = xl.add_worksheet('sheet1')

        # Title row: write(row, col, value).
        for col, title in enumerate(headers):
            sheet.write(0, col, title)

        for row, issue in enumerate(str1, start=1):
            sheet.write(row, 0, row)        # running number
            sheet.write(row, 1, '-')        # Dept placeholder
            sheet.write(row, 2, issue)      # Issue text
            sheet.write(row, 3, '-')        # Action placeholder
            sheet.write(row, 4, '-')        # Link placeholder
            sheet.write(row, 7, 'Ongoing')  # Status

        for row, owner in enumerate(str2, start=1):
            sheet.write(row, 5, owner)

        for row, due_day in enumerate(str3, start=1):
            sheet.write(row, 6, due_day)
# Run the extraction and keep the three result lists as module-level names.
# Only the first two are passed to saveExcel below; it picks up str3 from
# this module scope.
str1, str2, str3 = readTXT()
# Echo each extracted list on its own line before exporting.
print(str1, str2, str3, sep="\n")
saveExcel(str1, str2)
要提取的是【新增】与【负责人】之间、【负责人】与【截止日期】之间、【截止日期】与【日】之间的内容。正则表达式中括号里的‘(.*?)’就是要提取的部分。