import re
import difflib
import os
from datetime import datetime, timedelta
def extract_snippets_no_duplicates(input_file, output_file, window=150):
    """
    Keep only the lines of a text file that lie near a target token.

    Target tokens are the single characters A, B, C, D and the words
    "开始" / "结束". Around every occurrence a character span of `window`
    characters on each side is taken, overlapping or touching spans are
    merged, and every line that intersects a merged span is written to the
    output file. A line whose text was already written is not written again;
    the first-seen order is preserved.

    Args:
        input_file (str): Path of the UTF-8 text file to read.
        output_file (str): Path of the UTF-8 text file to write.
        window (int, optional): Characters kept on each side of a target.
            Defaults to 150.
    """
    with open(input_file, 'r', encoding='utf-8') as src:
        text = src.read()
    total = len(text)

    single_targets = {'A', 'B', 'C', 'D'}
    word_targets = {'开始', '结束'}

    # One span per single-character hit.
    spans = [
        (max(0, pos - window), min(total, pos + window + 1))
        for pos, ch in enumerate(text)
        if ch in single_targets
    ]
    # One span per (non-overlapping) word hit.
    for word in word_targets:
        pos = text.find(word)
        while pos != -1:
            spans.append((max(0, pos - window), min(total, pos + len(word) + window)))
            pos = text.find(word, pos + len(word))
    spans.sort(key=lambda span: span[0])

    # Merge spans that overlap or touch.
    merged = []
    for lo, hi in spans:
        if merged and lo <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], hi))
        else:
            merged.append((lo, hi))

    # Character extent of every line (line endings included).
    rows = []
    cursor = 0
    for row in text.splitlines(keepends=True):
        rows.append((cursor, cursor + len(row), row))
        cursor += len(row)

    kept = []
    seen = set()
    for lo, hi in merged:
        for row_lo, row_hi, row in rows:
            if row_hi <= lo or row_lo >= hi:
                continue  # line does not intersect this span
            if row in seen:
                continue  # identical line text already emitted
            seen.add(row)
            kept.append(row)

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.writelines(kept)
    print("处理完成,结果已保存到", output_file)
def replace_say_and_dianer_in_file(filename):
    """
    Rewrite a text file in place, replacing "SAY" with "C" and "点儿" with "点".

    :param filename: path of the UTF-8 text file to modify
    """
    substitutions = (("SAY", "C"), ("点儿", "点"))
    with open(filename, "r", encoding="utf-8") as handle:
        content = handle.read()
    # Applied one after another, in this order.
    for old, new in substitutions:
        content = content.replace(old, new)
    with open(filename, "w", encoding="utf-8") as handle:
        handle.write(content)
    print(f"替换完成,已保存到 {filename}")
def mark_time_gaps(file_path):
    """
    Rewrite a transcript file in place, marking pauses between lines.

    Every line is stripped and gets a trailing "-". If a line starts with a
    "[H:M:S - H:M:S]" range and the following line does too, and the next
    start is more than 5 seconds after this line's end, one "|" per full
    5 seconds of gap is inserted before the "-". Lines are re-joined with
    "\\n" (no trailing newline is written).

    :param file_path: path of the UTF-8 text file to rewrite
    """
    def to_seconds(stamp):
        # Colon-separated integers weighted as hours, minutes, seconds.
        return sum(weight * int(part) for weight, part in zip((3600, 60, 1), stamp.split(':')))

    def split_range(row):
        # Text before the first "]" must be exactly "<start> - <end>".
        head = row.split(']')[0] + ']'
        first, second = head.strip('[]').split(' - ')
        return first, second

    with open(file_path, 'r', encoding='utf-8') as f:
        rows = f.readlines()

    last = len(rows) - 1
    marked = []
    for idx, raw in enumerate(rows):
        row = raw.strip()
        pipes = ""
        if row and idx != last:
            try:
                start_str, end_str = split_range(row)
                to_seconds(start_str)  # parsed only to validate the start stamp
                end_sec = to_seconds(end_str)
                next_start_str, _ = split_range(rows[idx + 1].strip())
                gap = to_seconds(next_start_str) - end_sec
            except (IndexError, ValueError):
                gap = 0  # unparsable timestamps: no gap marker
            if gap > 5:
                pipes = "|" * (gap // 5)
        marked.append(row + pipes + "-")

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(marked))
def correct_yao_in_text(input_text_1, output_filename_1):
    """
    Replace "yao"-sounding characters with "一" when they sit next to a trigger.

    A character from the homophone set is replaced only if one of the three
    characters before it or the three characters after it (measured in the
    original text) is a trigger: A/B/C/D/Y or a Chinese numeral. Each
    replacement is logged to stdout with ten characters of context.

    :param input_text_1: path of the UTF-8 text file to read
    :param output_filename_1: path the corrected text is written to
    :return: the corrected text
    """
    homophones = {"幺", "要", "腰", "邀", "瑶", "钥", "谣"}
    triggers = {"A", "B", "C", "D", "Y", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "零"}

    with open(input_text_1, "r", encoding="utf-8") as src:
        text = src.read()

    def _fix(found):
        original = found.group(0)
        lo, hi = found.span()
        neighbourhood = text[max(0, lo - 3):lo] + text[hi:hi + 3]
        if not any(ch in triggers for ch in neighbourhood):
            return original
        corrected = "一"
        left = text[max(0, lo - 10):lo]
        right = text[hi:hi + 10]
        print(f"修正: {original} -> {corrected}")
        print("修正后上下文: " + left + " | " + corrected + " | " + right)
        return corrected

    corrected_text = re.sub("|".join(homophones), _fix, text)
    with open(output_filename_1, "w", encoding="utf-8") as dst:
        dst.write(corrected_text)
    print(f"修正完成,已保存到 {output_filename_1}")
    return corrected_text
def replace_ai_with_AI(input_filename, output_filename):
    """
    Replace "ai"-sounding characters with the letter "A" next to a trigger.

    A character from the homophone set is replaced only if one of the three
    characters before or after it (in the original text) is a trigger:
    A/B/C/D/Y or a Chinese numeral. Each replacement is logged to stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :return: the processed text
    """
    homophones = {"哀", "埃", "挨", "癌", "矮", "爱", "碍", "艾", "隘", "蔼", "霭", "嗳", "诶"}
    triggers = {"A", "B", "C", "D", "Y", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "零"}

    with open(input_filename, "r", encoding="utf-8") as src:
        text = src.read()

    def _fix(found):
        original = found.group(0)
        lo, hi = found.span()
        neighbourhood = text[max(0, lo - 3):lo] + text[hi:hi + 3]
        if not any(ch in triggers for ch in neighbourhood):
            return original
        left = text[max(0, lo - 10):lo]
        right = text[hi:hi + 10]
        print(f"修正前上下文: {left} | {original} | {right}")
        print(f"修正: {original} -> A")
        print(f"修正后上下文: {left} | A | {right}\n")
        return "A"

    corrected_text = re.sub("|".join(homophones), _fix, text)
    with open(output_filename, "w", encoding="utf-8") as dst:
        dst.write(corrected_text)
    print(f"修正完成,已保存到 {output_filename}")
    return corrected_text
def replace_bi_with_B(input_filename, output_filename):
    """
    Replace "bi"-sounding characters with the letter "B" next to a trigger.

    A character from the homophone set is replaced only if one of the three
    characters before or after it (in the original text) is a trigger:
    A/B/C/D/Y or a Chinese numeral. Each replacement is logged to stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :return: the processed text
    """
    homophones = {"比", "必", "笔", "币", "闭", "碧", "避"}
    triggers = {"A", "B", "C", "D", "Y", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "零"}

    with open(input_filename, "r", encoding="utf-8") as src:
        text = src.read()

    def _fix(found):
        original = found.group(0)
        lo, hi = found.span()
        neighbourhood = text[max(0, lo - 3):lo] + text[hi:hi + 3]
        if not any(ch in triggers for ch in neighbourhood):
            return original
        left = text[max(0, lo - 10):lo]
        right = text[hi:hi + 10]
        print(f"修正: {original} -> B")
        print("修正后上下文: " + left + " | B | " + right)
        return "B"

    corrected_text = re.sub("|".join(homophones), _fix, text)
    with open(output_filename, "w", encoding="utf-8") as dst:
        dst.write(corrected_text)
    print(f"修正完成,已保存到 {output_filename}")
    return corrected_text
def replace_di_with_D(input_filename, output_filename):
    """
    Replace "di"-sounding characters with the letter "D" next to a trigger.

    The homophone set is the one defined in the body (it does not include
    "的", "得" or "德"). A character is replaced only if one of the three
    characters before or after it (in the original text) is a trigger:
    A/B/C/D/Y or a Chinese numeral. Each replacement is logged to stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :return: the processed text
    """
    homophones = {
        "地", "底", "敌", "蒂", "第", "递", "堤", "笛",
        "狄", "涤", "翟", "帝", "嫡", "滴", "低",
    }
    triggers = {"A", "B", "C", "D", "Y", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "零"}

    with open(input_filename, "r", encoding="utf-8") as src:
        text = src.read()

    def _fix(found):
        original = found.group(0)
        lo, hi = found.span()
        neighbourhood = text[max(0, lo - 3):lo] + text[hi:hi + 3]
        if not any(ch in triggers for ch in neighbourhood):
            return original
        left = text[max(0, lo - 10):lo]
        right = text[hi:hi + 10]
        print(f"修正前上下文: {left} | {original} | {right}")
        print(f"修正: {original} -> D")
        print(f"修正后上下文: {left} | D | {right}\n")
        return "D"

    corrected_text = re.sub("|".join(homophones), _fix, text)
    with open(output_filename, "w", encoding="utf-8") as dst:
        dst.write(corrected_text)
    print(f"修正完成,已保存到 {output_filename}")
    return corrected_text
def replace_ling_with_target(input_filename, output_filename, target="零"):
    """
    Replace "ling"-sounding characters with `target` next to a trigger.

    A character from the homophone set is replaced only if one of the three
    characters before or after it (in the original text) is a trigger:
    A/B/C/D/Y or one of the numerals 一 to 十 ("零" itself is not a trigger
    here). Each replacement is logged to stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :param target: replacement string, "零" by default
    :return: the processed text
    """
    homophones = {
        "龄", "铃", "凌", "菱", "羚", "翎", "伶", "苓", "泠", "棂", "瓴", "绫",
    }
    triggers = {"A", "B", "C", "D", "Y", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十"}

    with open(input_filename, "r", encoding="utf-8") as src:
        text = src.read()

    def _fix(found):
        original = found.group(0)
        lo, hi = found.span()
        neighbourhood = text[max(0, lo - 3):lo] + text[hi:hi + 3]
        if not any(ch in triggers for ch in neighbourhood):
            return original
        left = text[max(0, lo - 10):lo]
        right = text[hi:hi + 10]
        print(f"修正前上下文: {left} | {original} | {right}")
        print(f"修正: {original} -> {target}")
        print(f"修正后上下文: {left} | {target} | {right}\n")
        return target

    corrected_text = re.sub("|".join(homophones), _fix, text)
    with open(output_filename, "w", encoding="utf-8") as dst:
        dst.write(corrected_text)
    print(f"修正完成,已保存到 {output_filename}")
    return corrected_text
def replace_shi_with_target(input_filename, output_filename, target="十"):
    """
    Replace "shi"-sounding characters with `target` directly beside a trigger.

    Unlike the sibling helpers, only the single character immediately before
    and the single character immediately after the match (in the original
    text) are inspected. Triggers are A/B/C/D/Y and the Chinese numerals.
    Each replacement is logged to stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :param target: replacement string, "十" by default
    :return: the processed text
    """
    homophones = {
        "诗", "师", "失", "尸", "施", "湿", "狮", "虱", "柿",
        "时", "识", "实", "石", "食", "蚀", "拾",
        "事", "式", "示", "世", "试", "势", "释", "适", "视", "氏", "室", "侍", "誓", "嗜", "饰",
    }
    triggers = {"A", "B", "C", "D", "Y", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "零"}

    with open(input_filename, "r", encoding="utf-8") as src:
        text = src.read()

    def _fix(found):
        original = found.group(0)
        lo, hi = found.span()
        # Adjacent characters only (window of 1 on each side).
        neighbourhood = text[max(0, lo - 1):lo] + text[hi:hi + 1]
        if not any(ch in triggers for ch in neighbourhood):
            return original
        left = text[max(0, lo - 10):lo]
        right = text[hi:hi + 10]
        print(f"修正前上下文: {left} | {original} | {right}")
        print(f"修正: {original} -> {target}")
        print(f"修正后上下文: {left} | {target} | {right}\n")
        return target

    corrected_text = re.sub("|".join(homophones), _fix, text)
    with open(output_filename, "w", encoding="utf-8") as dst:
        dst.write(corrected_text)
    print(f"修正完成,已保存到 {output_filename}")
    return corrected_text
def replace_ba_with_eight(input_filename, output_filename):
    """
    Replace "ba"-sounding characters with "八" next to a trigger.

    A character from the homophone set is replaced only if one of the three
    characters before or after it (in the original text) is a trigger:
    A/B/C/D/Y or a Chinese numeral. Each replacement is logged to stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :return: the processed text
    """
    homophones = {
        "吧", "巴", "叭", "芭", "捌", "笆", "疤", "扒",
        "拔", "跋", "茇", "菝", "魃", "把", "钯",
        "爸", "罢", "霸", "坝",
    }
    triggers = {"A", "B", "C", "D", "Y", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "零"}

    with open(input_filename, "r", encoding="utf-8") as src:
        text = src.read()

    def _fix(found):
        original = found.group(0)
        lo, hi = found.span()
        neighbourhood = text[max(0, lo - 3):lo] + text[hi:hi + 3]
        if not any(ch in triggers for ch in neighbourhood):
            return original
        left = text[max(0, lo - 10):lo]
        right = text[hi:hi + 10]
        print(f"修正前上下文: {left} | {original} | {right}")
        print(f"修正: {original} -> 八")
        print(f"修正后上下文: {left} | 八 | {right}\n")
        return "八"

    corrected_text = re.sub("|".join(homophones), _fix, text)
    with open(output_filename, "w", encoding="utf-8") as dst:
        dst.write(corrected_text)
    print(f"修正完成,已保存到 {output_filename}")
    return corrected_text
def process_bd_correction(input_file, output_file, context_print_window=20):
    """
    Fix B/D mix-ups in a text file and report every change on stdout.

    :param input_file: path of the UTF-8 text file to read
    :param output_file: path the corrected text is written to
    :param context_print_window: characters of context printed on each side
        of a correction (20 by default)
    """
    def correct_bd_mixup(text):
        """
        Swap B and D where the surrounding A/C markers show they are misplaced.

        * a B whose nearest preceding A/C is C and nearest following A/C is A
          becomes D;
        * a D whose nearest preceding A/C is A and nearest following A/C is C
          becomes B.

        Returns the corrected text and a list of
        (position, old char, new char, previous A/C, next A/C) tuples.
        """
        chars = list(text)
        size = len(chars)

        # Right-to-left pass: nearest A/C strictly after each position.
        following = [None] * size
        nearest = None
        for pos in range(size - 1, -1, -1):
            following[pos] = nearest
            if chars[pos] in ('A', 'C'):
                nearest = chars[pos]

        # Left-to-right pass: track nearest A/C strictly before the position.
        fixes = []
        nearest = None
        for pos, ch in enumerate(chars):
            if ch == 'B' and nearest == 'C' and following[pos] == 'A':
                fixes.append((pos, 'B', 'D', nearest, following[pos]))
                chars[pos] = 'D'
            elif ch == 'D' and nearest == 'A' and following[pos] == 'C':
                fixes.append((pos, 'D', 'B', nearest, following[pos]))
                chars[pos] = 'B'
            elif ch in ('A', 'C'):
                nearest = ch
        return ''.join(chars), fixes

    with open(input_file, 'r', encoding='utf-8') as src:
        original_text = src.read()
    corrected_text, correction_events = correct_bd_mixup(original_text)
    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.write(corrected_text)

    print("已经保存为: " + output_file)
    print("修正记录:")
    for pos, orig_char, new_char, prev_ac, next_ac in correction_events:
        lo = max(0, pos - context_print_window)
        hi = min(len(original_text), pos + context_print_window + 1)
        print(f"位置 {pos}: {orig_char} -> {new_char} (前最近 A/C: {prev_ac}, 后最近 A/C: {next_ac})")
        print("原始上下文:")
        print(original_text[lo:hi])
        print("修正后上下文:")
        print(corrected_text[lo:hi])
        print("-" * 60)
# Phrases searched for (fuzzily) in each transcript line. Order matters to the
# code that iterates this list, so entries must stay in this sequence.
keywords = [
    # procedure / event phrases
    "开始装机", "装机开始", "开始CPB装机", "CPB装机开始",
    "开始预充排气", "预充排气开始", "开始安全检查", "安全检查开始",
    "开始主动脉插管", "主动脉插管开始", "开始体外循环", "体外循环开始",
    "CPB ON",
    "开始双管降温", "双管降温开始", "停降温", "开始停降温", "停降温开始",
    "开始复温", "复温开始", "开始超滤", "超滤开始",
    "钳闭下腔静脉", "开始减流量", "减流量开始",
    "CPB OF",
    "结束体外循环", "体外循环结束",
    "开始改良超滤", "改良超滤开始", "结束改良超滤", "改良超滤结束",
    "开始记录参数", "记录参数开始",
    "血气分析", "ACT",
    "上下腔静脉阻断", "升主动脉阻断", "心肌保护液", "升主动脉开放",
    "开始复跳", "复跳开始", "上下腔静脉开放", "双主动脉灌注",
    "拔出降主动脉插管", "脑灌注开始", "脑灌开始",
    "降主动脉阻断", "降主动脉开放", "脑灌注结束", "脑灌结束",
    "开始用冰帽", "用冰帽", "开始停循环", "停循环开始",
    "恢复循环", "开始恢复循环", "恢复循环开始", "缓慢复温", "开始缓慢复温",
    # ordinal markers
    "序号一", "序号二", "序号三", "序号四", "序号五", "序号六", "序号七", "序号八", "序号九",
    # drug / fluid names
    "复方电解质", "红细胞悬液", "人血白蛋白", "碳酸氢钠", "甘露醇",
    "乙基淀粉", "甲基强的松龙", "磷酸肌酸钠", "地塞米松", "利多卡因",
    "葡萄糖", "酚妥拉明", "呋塞米注射液", "氯化钾", "氯化钙",
    "去甲肾上腺素", "肝素钠", "血液回收血", "乌司他丁",
]
def fuzzy_match(text, keyword, replaced_ranges):
    """
    Find the window of `text` (same length as `keyword`) most similar to it.

    Windows whose start index falls inside any half-open (start, end) range
    of `replaced_ranges` are ignored. On ties the earliest window wins.

    Returns (similarity, matched window, start index); when no window
    qualifies the result is (0, "", -1).
    """
    width = len(keyword)
    top_score, top_window, top_pos = 0, "", -1
    for pos in range(len(text) - width + 1):
        if any(lo <= pos < hi for lo, hi in replaced_ranges):
            continue
        candidate = text[pos:pos + width]
        score = difflib.SequenceMatcher(None, candidate, keyword).ratio()
        if score > top_score:
            top_score, top_window, top_pos = score, candidate, pos
    return top_score, top_window, top_pos
def replace_keywords(text, SIMILARITY_THRESHOLD = 0.6):
    """
    Wrap the best fuzzy match of each keyword in "关键词*…*关键词" markers.

    For every entry of the module-level `keywords` list the most similar
    window of `text` is located with `fuzzy_match`; matches scoring at least
    `SIMILARITY_THRESHOLD` are applied in descending similarity order. A match
    that overlaps a region already replaced is skipped. Details of every
    replacement are printed.

    Positions are tracked in the coordinates of the original `text`: the
    insertion point of each replacement is shifted only by the replacements
    that were applied to its left, so a higher-scoring match located later in
    the line no longer displaces an earlier one.

    :param text: one line of transcript text
    :param SIMILARITY_THRESHOLD: minimum SequenceMatcher ratio to accept
    :return: the text with markers inserted
    """
    matches = []
    for keyword in keywords:
        # Nothing has been replaced yet while candidates are being collected.
        similarity, matched_text, index = fuzzy_match(text, keyword, [])
        if similarity >= SIMILARITY_THRESHOLD and index != -1:
            matches.append((similarity, keyword, matched_text, index))
    matches.sort(reverse=True, key=lambda x: x[0])

    modified_text = text
    # (original start, original end, length change) of each applied replacement.
    applied = []
    for similarity, keyword, matched_text, index in matches:
        span_end = index + len(matched_text)
        overlaps = any(
            index < done_end and done_start < span_end
            for done_start, done_end, _ in applied
        )
        if overlaps:
            continue
        new_text = f"关键词*{keyword}*关键词"
        # Only replacements to the left move this match's position.
        actual_index = index + sum(
            delta for done_start, _, delta in applied if done_start < index
        )
        print(f"\n **替换详情** ")
        print(f"原始文本片段: {modified_text[max(0, actual_index-10):actual_index+len(matched_text)+10]}")
        print(f"匹配关键词: {keyword}")
        print(f"相似度: {similarity:.2f}")
        print(f"替换前: {matched_text}")
        print(f"替换后: {new_text}")
        modified_text = (
            modified_text[:actual_index]
            + new_text
            + modified_text[actual_index + len(matched_text):]
        )
        applied.append((index, span_end, len(new_text) - len(matched_text)))
    return modified_text
def process_file_replace_keywords(file_path, output_path):
    """
    Run `replace_keywords` over every line of a file.

    Each input line is stripped of surrounding whitespace before matching and
    written back followed by a single newline.

    :param file_path: path of the UTF-8 text file to read
    :param output_path: path of the UTF-8 text file to write
    """
    with open(file_path, "r", encoding="utf-8") as src:
        rows = src.readlines()
    # All lines are processed before the output file is opened.
    rewritten = []
    for row in rows:
        rewritten.append(replace_keywords(row.strip()) + "\n")
    with open(output_path, "w", encoding="utf-8") as dst:
        dst.writelines(rewritten)
def remove_timestamps_and_indexes(input_file, output_file):
    """
    Strip the timestamp and chunk label from every line of a transcript.

    For each line containing "]": the text between the first and second "]"
    (or the end of the line) is split on tabs and the last tab-separated
    field, stripped of whitespace, is written followed by a newline.
    Lines without "]" are dropped.

    Args:
        input_file (str): Path of the UTF-8 text file to read.
        output_file (str): Path of the UTF-8 text file to write.

    Returns:
        None
    """
    with open(input_file, "r", encoding="utf-8") as src:
        rows = src.readlines()

    cleaned = []
    for row in rows:
        pieces = row.split("]")
        if len(pieces) <= 1:
            continue  # no timestamp bracket on this line
        payload = pieces[1].split("\t")[-1].strip()
        cleaned.append(payload + "\n")

    with open(output_file, "w", encoding="utf-8") as dst:
        dst.writelines(cleaned)
    print(f"✅ 处理完成,结果已保存到 {output_file}")
def convert_chinese_number_to_arabic(input_file, output_file):
    """
    Convert Chinese numerals in a text file to Arabic digits.

    The characters 零 to 九 are mapped one-to-one to 0-9 first; the remaining
    "十" is then resolved from its neighbours:

    * no digit on either side  -> "10"   (十   -> 10)
    * digit before, none after -> "<d>0" (2十  -> 20)
    * none before, digit after -> "1<d>" (十5  -> 15)
    * digit on both sides      -> dropped (2十3 -> 23)

    NOTE(review): the regular expressions in the source this was taken from
    were truncated (everything after "(?" was missing, which made the file a
    syntax error). The patterns below are reconstructed from the surviving
    replacement templates — confirm against the original author's intent.

    Errors are reported on stdout instead of being raised.

    :param input_file: path of the UTF-8 text file to read
    :param output_file: path of the UTF-8 text file to write
    """
    chinese_numbers = {'零': '0', '一': '1', '二': '2', '三': '3', '四': '4',
                       '五': '5', '六': '6', '七': '7', '八': '8', '九': '9'}
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            content = f.read()
        for zh_num, ar_num in chinese_numbers.items():
            content = content.replace(zh_num, ar_num)
        # Stand-alone 十.
        content = re.sub(r'(?<![0-9])十(?![0-9])', '10', content)
        # Tens with no unit digit: 2十 -> 20.
        content = re.sub(r'([0-9])十(?![0-9])', r'\g<1>0', content)
        # Teens: 十5 -> 15.
        content = re.sub(r'(?<![0-9])十([0-9])', r'1\g<1>', content)
        # Tens and units: 2十3 -> 23.
        content = re.sub(r'([0-9])十([0-9])', r'\g<1>\g<2>', content)
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"转换完成!请查看 {output_file}")
    except FileNotFoundError:
        print(f"错误:文件 {input_file} 未找到。")
    except Exception as e:
        # Best-effort step of a batch pipeline: report and carry on.
        print(f"发生错误:{e}")
def replace_act(file_path):
    """Rewrite a UTF-8 text file in place, expanding "ACT" to "激活全血凝固时间"."""
    with open(file_path, "r", encoding="utf-8") as handle:
        expanded = handle.read().replace("ACT", "激活全血凝固时间")
    with open(file_path, "w", encoding="utf-8") as handle:
        handle.write(expanded)
def remove_spaces_from_file(filename):
    """Rewrite a UTF-8 text file in place with every ASCII space removed."""
    with open(filename, 'r', encoding='utf-8') as handle:
        content = handle.read()
    # Only " " is removed; tabs and line breaks are kept.
    with open(filename, 'w', encoding='utf-8') as handle:
        handle.write(content.replace(" ", ""))
def extract_time_from_filename(file_path):
    """
    Parse a "YYYYMMDD_HHMMSS_..." prefix of a file name into a datetime.

    The name is split on "_"; the first field must be 8 characters long and
    the second 6, otherwise None is returned. `datetime.strptime` raises
    ValueError if the two fields have the right lengths but are not a valid
    date/time.
    """
    pieces = file_path.split("_")
    if len(pieces) < 2 or len(pieces[0]) != 8 or len(pieces[1]) != 6:
        return None
    return datetime.strptime(pieces[0] + pieces[1], "%Y%m%d%H%M%S")
def calculate_current_time(start_time: str, elapsed_time: str) -> str:
    """
    Add an elapsed duration to a wall-clock start time.

    :param start_time: start time formatted "HH:MM:SS"
    :param elapsed_time: elapsed duration as three colon-separated integers
    :return: resulting time of day formatted "HH:MM:SS" (the date part is
        discarded, so the result wraps around midnight)
    """
    origin = datetime.strptime(start_time, "%H:%M:%S")
    hours, minutes, seconds = (int(part) for part in elapsed_time.split(":"))
    moment = origin + timedelta(hours=hours, minutes=minutes, seconds=seconds)
    return moment.strftime("%H:%M:%S")
def extract_number_after_keys(text):
    """
    Extract the number that follows each of the key letters A, B, C, D.

    For every key the first occurrence in `text` is located and the scan
    moves right to the first digit, "点" or ".". If another key letter or "*"
    is met before any number, the same key is looked for again within the
    next 30 characters and the scan restarts there; if it is not found, the
    key gets no value. The run of digits / "点" / "." is read with "点"
    treated as a decimal point.

    A run that is not a valid number (for example "1.2.3" or a lone "点")
    leaves the key at None instead of raising ValueError.

    :param text: text to scan
    :return: dict mapping "A", "B", "C", "D" to a float or None
    """
    keys = ["A", "B", "C", "D"]
    numeric_marks = ["点", "."]
    result = {key: None for key in keys}
    for key in keys:
        index = text.find(key)
        while index != -1:
            # Skip forward to the first numeric character after the key.
            num_start = index + 1
            has_other_key = False
            while num_start < len(text) and not (text[num_start].isdigit() or text[num_start] in numeric_marks):
                if text[num_start] in keys or text[num_start] == "*":
                    has_other_key = True
                    break
                num_start += 1
            if has_other_key:
                # Retry from a later occurrence of the same key, if one is near.
                new_index = text[index + 1 : index + 31].find(key)
                if new_index != -1:
                    index += new_index + 1
                    continue
            num_str = ""
            while num_start < len(text) and (text[num_start].isdigit() or text[num_start] in numeric_marks):
                num_str += "." if text[num_start] == "点" else text[num_start]
                num_start += 1
            if num_str:
                try:
                    result[key] = float(num_str)
                except ValueError:
                    # Malformed run such as "1.2.3": treat as "no value".
                    pass
            break
    return result
def find_and_extract_contextual_event_info(file_path):
    """
    Locate "A<number> ... C<number>" parameter groups and describe each one.

    A group starts at an "A" that has a digit within the next 5 characters
    and, within the 20 characters after it, a "C" that also has a digit within
    its next 5 characters. For each group the surrounding lines are collected,
    the event name is read from between "*" markers, and the values after
    A/B/C/D are read with `extract_number_after_keys`.

    :param file_path: path of the UTF-8 text file to scan
    :return: list of dicts with keys "line_range" ("first - last" matched
        line numbers, 1-based), "event" (text between the "*" markers of the
        last context line containing one, or None) and "A".."D" (stringified
        values). An empty list is returned when nothing matches, so callers
        can iterate the result unconditionally.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    full_text = "".join(lines)

    # line_indices[k] is the 1-based line number of character k of full_text.
    line_indices = []
    for line_no, line in enumerate(lines, start=1):
        line_indices.extend([line_no] * len(line))

    results = []
    text_length = len(full_text)
    for i in range(text_length):
        if full_text[i] != 'A':
            continue
        if not any(char.isdigit() for char in full_text[i + 1:min(i + 6, text_length)]):
            continue
        search_range_end = min(i + 21, text_length)
        for j in range(i, search_range_end):
            if full_text[j] != 'C':
                continue
            if not any(char.isdigit() for char in full_text[j + 1:min(j + 6, text_length)]):
                continue
            start_line = line_indices[i]
            end_line = line_indices[min(search_range_end - 1, text_length - 1)]
            a_start_line = line_indices[max(i - 25, 0)]
            # NOTE(review): this slice ends at start_line inclusive, so the
            # first matched line also appears in the "before" section even
            # though the header says "start_line - 1" — confirm intent.
            before_a_lines = lines[a_start_line - 1:start_line]
            c_end_line = line_indices[min(j + 26, text_length) - 1]
            after_c_lines = lines[end_line:c_end_line]
            full_matched_lines = lines[start_line - 1:end_line]
            results.append(
                f"————上文所在行: ({a_start_line} - {start_line - 1})————\n"
                f"{''.join(before_a_lines).strip()}\n"
                f"————匹配所在行:({start_line} - {end_line})————\n"
                f"{''.join(full_matched_lines).strip()}\n"
                f"————下文所在行:({end_line + 1} - {c_end_line})————\n"
                f"{''.join(after_c_lines).strip()}\n"
            )
            break  # one report per qualifying "A"

    if not results:
        print("未找到符合条件的匹配内容。")
        return []

    events_list = []
    for result in results:
        print("\n")
        print("-"*60)
        print(result)
        print("-"*60)
        print("\n")
        event = {"line_range": None, "event": None}
        result_text = []
        for text in result.split("\n"):
            if "———上文所在行" in text or "———下文所在行" in text:
                continue
            if "———匹配所在行" in text:
                # The matched line range sits between the parentheses.
                event["line_range"] = text[text.find('(') + 1:text.find(')')]
                continue
            result_text.append(text)
            if '*' in text:
                start_pos = text.find('*') + 1
                end_pos = text.find('*', start_pos)
                event["event"] = text[start_pos:end_pos]
        print("".join(result_text))
        parameter_result = extract_number_after_keys("".join(result_text))
        for one_parameter in parameter_result:
            event[one_parameter] = str(parameter_result[one_parameter])
        for one_parameter in event:
            print(one_parameter + " : " + str(event[one_parameter]))
        events_list.append(event)
    return events_list
def extract_time_stamps_from_txt(file_path):
    """
    Map 1-based line numbers of a transcript to their leading time range.

    Blank lines are skipped (but still counted). For each other line the
    text before the first tab — or, when there is no tab, before the first
    "chunk" — is taken, cut again at "chunk", and stripped of spaces and of
    the surrounding "[" / "]".

    :param file_path: path of the UTF-8 text file to read
    :return: dict {line number: time-range string}
    """
    stamps = {}
    with open(file_path, "r", encoding="utf-8") as handle:
        for line_number, raw in enumerate(handle, start=1):
            row = raw.strip()
            if not row:
                continue
            fields = row.split("\t", 1)
            if len(fields) < 2:
                fields = row.split("chunk", 1)
            head = fields[0].split("chunk")[0]
            stamps[line_number] = head.strip(" ").strip("[").strip("]")
    return stamps
# ---------------------------------------------------------------------------
# Pipeline driver: runs every processing stage on one recording transcript.
# Each stage writes a new file whose name extends the previous one
# (<base>_0.txt, <base>_0_1.txt, ...).
# ---------------------------------------------------------------------------

# Stage 0: keep only the lines near A/B/C/D and start/end words.
print("-"*60)
input_text_0 = "20250312_104423.txt"
output_filename_1 = input_text_0.split(".")[0] + "_0.txt"
extract_snippets_no_duplicates(input_text_0, output_filename_1)
replace_say_and_dianer_in_file(output_filename_1)
print("已完成精简(保留 A B C D 附近字符)")
print("-"*60)

# Stage 1: homophone corrections, all applied in place to the "_1" file.
print("-"*60)
input_text_1 = output_filename_1
output_filename_1 = input_text_1.split(".")[0] + "_1.txt"
corrected_text = correct_yao_in_text(input_text_1, output_filename_1)
print("已完成 yao 修正")
print("-"*60)
print("-"*60)
replace_ai_with_AI(output_filename_1, output_filename_1)
corrected_text = replace_bi_with_B(output_filename_1, output_filename_1)
print("-"*60)
print("-"*60)
corrected_text = replace_di_with_D(output_filename_1, output_filename_1)
print("-"*60)
print("-"*60)
corrected_text = replace_ling_with_target(output_filename_1, output_filename_1)
print("-"*60)
print("-"*60)
corrected_text = replace_shi_with_target(output_filename_1, output_filename_1)
print("-"*60)
print("-"*60)
corrected_text = replace_ba_with_eight(output_filename_1, output_filename_1)
mark_time_gaps(output_filename_1)
print("-"*60)

# Stage 2: B/D mix-up correction.
print("-"*60)
input_file_2 = output_filename_1
output_file_2 = input_file_2.split(".")[0] + "_2.txt"
process_bd_correction(input_file_2, output_file_2, context_print_window=20)
print("已完成 BD混淆 修正")
print("-"*60)

# Stage 3: fuzzy keyword marking. This file still carries timestamps and is
# re-read later to recover the time of each event.
print("-"*60)
input_file_3 = output_file_2
output_file_3 = input_file_3.split(".")[0] + "_3.txt"
process_file_replace_keywords(input_file_3, output_file_3)
remove_timestamps_and_indexes_before_file = output_file_3
print("\n✅ 处理完成,结果已保存到:", output_file_3)
print("已完成 关键词匹配 修正")
print("-"*60)

# Stage 4: drop timestamps / chunk labels, then repeat the homophone passes.
# Some passes run several times because each pass decides from the text as it
# was when the pass started; a replacement can create a new trigger for the
# next pass.
print("-"*60)
input_file_4 = output_file_3
output_file_4 = input_file_4.split(".")[0] + "_4.txt"
remove_timestamps_and_indexes(input_file_4, output_file_4)
print("-"*60)
print("-"*60)
corrected_text = correct_yao_in_text(output_file_4, output_file_4)
corrected_text = correct_yao_in_text(output_file_4, output_file_4)
corrected_text = correct_yao_in_text(output_file_4, output_file_4)
print("已完成 yao 修正")
print("-"*60)
print("-"*60)
replace_ai_with_AI(output_file_4, output_file_4)
corrected_text = replace_bi_with_B(output_file_4, output_file_4)
print("-"*60)
print("-"*60)
corrected_text = replace_di_with_D(output_file_4, output_file_4)
print("-"*60)
print("-"*60)
corrected_text = replace_ling_with_target(output_file_4, output_file_4)
corrected_text = replace_ling_with_target(output_file_4, output_file_4)
print("-"*60)
print("-"*60)
corrected_text = replace_shi_with_target(output_file_4, output_file_4)
print("-"*60)
print("-"*60)
corrected_text = replace_ba_with_eight(output_file_4, output_file_4)
corrected_text = replace_ba_with_eight(output_file_4, output_file_4)
corrected_text = replace_ba_with_eight(output_file_4, output_file_4)
print("-"*60)

# Stage 5: Chinese numerals -> Arabic digits, expand ACT, remove spaces.
print("-"*60)
input_file_5 = output_file_4
output_file_5 = input_file_5.split(".")[0] + "_5.txt"
convert_chinese_number_to_arabic(input_file_5, output_file_5)
print("阿拉伯转换完成!请查看 " + output_file_5)
print("-"*60)
input_file_5 = output_file_5
replace_act(input_file_5)
remove_spaces_from_file(input_file_5)
print("-"*60)

# Stage 6: extract events and their A/B/C/D parameters.
print("-"*60)
file_path = input_file_5
# The extractor may return None when nothing matches; normalise to a list so
# the loops below can run unconditionally.
events_list = find_and_extract_contextual_event_info(file_path) or []
print("提取关键事件+时间+各个参数完成!")
print("-"*60)

# Stage 7: turn matched line numbers into wall-clock times.
print("-"*60)
file_path = remove_timestamps_and_indexes_before_file
time_dict = extract_time_stamps_from_txt(file_path)
file_name = os.path.basename(file_path)
time_in_beijing = extract_time_from_filename(file_name)
if time_in_beijing:
    # Format only after the None check; formatting first made the else
    # branch unreachable (AttributeError on None).
    start_time = time_in_beijing.strftime("%H:%M:%S")
    print("录音时间(北京时间):", start_time)
else:
    print("未找到时间信息")
    # Without a recording start time no event time can be computed.
    raise SystemExit(1)
events_list_1 = []
for event in events_list:
    # NOTE(review): row_num is a line number in the "_5" file while time_dict
    # is keyed by line numbers of the "_3" file; this presumably relies on
    # both files having the same line layout — confirm.
    row_num = int(event['line_range'].split("-")[0])
    elapsed_time = time_dict[row_num].split("-")[0].strip()
    current_time = calculate_current_time(start_time, elapsed_time)
    print("Current time:", current_time)
    event['line_range'] = current_time
    events_list_1.append(event.copy())
print("-"*60)

# Stage 8: report on stdout.
print("-"*60)
for event in events_list:
    print("-"*60)
    for one_parameter in event:
        if one_parameter == "line_range":
            print("time : " + str(event[one_parameter]))
        else:
            print(one_parameter + " : " + str(event[one_parameter]))
print("-"*60)

# Stage 9: write the report file.
print("-"*60)
output_txt_file = input_text_0.split(".")[0] + "_time_event_ABCD.txt"
with open(output_txt_file, "w", encoding="utf-8") as file:
    file.write("-" * 60 + "\n")
    for index, event in enumerate(events_list, start=1):
        # NOTE(review): numbering starts at 8; the meaning of the offset 7 is
        # not visible in this file — confirm.
        file.write(f"Event {7+index}\n")
        for one_parameter in event:
            if one_parameter == "line_range":
                file.write("time : " + str(event[one_parameter]) + "\n")
            else:
                file.write(one_parameter + " : " + str(event[one_parameter]) + "\n")
        file.write("-" * 60 + "\n")
    file.write("-" * 60 + "\n")
import os
from datetime import datetime, timedelta
# Keyword subset named "NO_ABCD"; entries keep their original order.
keywords_NO_ABCD = [
    # procedure / event phrases
    "开始装机", "装机开始", "开始CPB装机", "CPB装机开始",
    "开始预充排气", "预充排气开始", "开始安全检查", "安全检查开始",
    "开始主动脉插管", "主动脉插管开始",
    "CPB OF",
    "结束体外循环", "体外循环结束",
    "开始改良超滤", "改良超滤开始", "结束改良超滤", "改良超滤结束",
    "心肌保护液",
    "开始停循环", "停循环开始",
    # drug / fluid names
    "复方电解质", "红细胞悬液", "人血白蛋白", "碳酸氢钠", "甘露醇",
    "乙基淀粉", "甲基强的松龙", "磷酸肌酸钠", "地塞米松", "利多卡因",
    "葡萄糖", "酚妥拉明", "呋塞米注射液", "氯化钾", "氯化钙",
    "去甲肾上腺素", "肝素钠", "血液回收血", "乌司他丁",
]
# Drug / fluid names; entries keep their original order.
keyswords_Medication = [
    "复方电解质", "红细胞悬液", "人血白蛋白", "碳酸氢钠", "甘露醇",
    "乙基淀粉", "甲基强的松龙", "磷酸肌酸钠", "地塞米松", "利多卡因",
    "葡萄糖", "酚妥拉明", "呋塞米注射液", "氯化钾", "氯化钙",
    "去甲肾上腺素", "肝素钠", "血液回收血", "乌司他丁",
]
def extract_number_after_keys(text):
    """
    Extract the number that follows each of the key letters A, B, C, D, Y.

    If `text` contains "*", everything up to and including the first "*" is
    discarded first. For every key the first occurrence is located and the
    scan moves right to the first digit, "点" or ".":

    * if another key letter or "*" is met before any number, the same key is
      looked for again within the next 30 characters and the scan restarts
      there (if not found, the key gets no value);
    * otherwise, if the same key occurs again within the next 14 characters,
      the scan restarts at that later occurrence, so the later of two nearby
      mentions is the one that is read.

    The run of digits / "点" / "." is read with "点" treated as a decimal
    point. A run that is not a valid number (for example "1.2.3" or a lone
    "点") leaves the key at None instead of raising ValueError.

    :param text: text to scan
    :return: dict mapping "A", "B", "C", "D", "Y" to a float or None
    """
    keys = ["A", "B", "C", "D", "Y"]
    numeric_marks = ["点", "."]
    result = {key: None for key in keys}
    if "*" in text:
        text = text.split('*', 1)[1]
    for key in keys:
        index = text.find(key)
        while index != -1:
            # Skip forward to the first numeric character after the key.
            num_start = index + 1
            has_other_key = False
            while num_start < len(text) and not (text[num_start].isdigit() or text[num_start] in numeric_marks):
                if text[num_start] in keys or text[num_start] == "*":
                    has_other_key = True
                    break
                num_start += 1
            # Look-ahead distance depends on whether another key interrupted.
            lookahead = 31 if has_other_key else 15
            new_index = text[index + 1 : index + lookahead].find(key)
            if new_index != -1:
                index += new_index + 1
                continue
            num_str = ""
            while num_start < len(text) and (text[num_start].isdigit() or text[num_start] in numeric_marks):
                num_str += "." if text[num_start] == "点" else text[num_start]
                num_start += 1
            if num_str:
                try:
                    result[key] = float(num_str)
                except ValueError:
                    # Malformed run such as "1.2.3": treat as "no value".
                    pass
            break
    return result
def find_time(s):
    """
    Find the last "<hour>点<minute>" clock expression in `s`.

    The string is scanned from the right for "点". The hour is the run of
    digits / "十" directly before it (only its last two characters are kept);
    a "点" with no such run is skipped. The minute is the run of digits / "刻"
    directly after it (it may be empty).

    If any of the two characters on either side of the expression is one of
    the key letters A, B, C, D, Y, the "点" is taken to be a decimal point of
    a parameter value and None is returned. Positions outside the string are
    simply ignored — they neither wrap around to the other end of the string
    nor cancel the remaining checks.

    :param s: text to search
    :return: {'time': "<hour>点<minute>", 'start': index of the first
        character of the full digit run before "点", 'end': index just past
        the minute part}, or None when no clock expression is found
    """
    key_letters = ("A", "B", "C", "D", "Y")

    def _is_key_at(pos):
        # Bounds-checked lookup: out-of-range positions are never a key.
        return 0 <= pos < len(s) and s[pos] in key_letters

    for i in range(len(s) - 1, -1, -1):
        if s[i] != '点':
            continue
        j = i - 1
        while j >= 0 and (s[j].isdigit() or s[j] in '十'):
            j -= 1
        hour_part = s[j + 1:i]
        if not hour_part:
            continue
        hour_part = hour_part[-2:]
        k = i + 1
        while k < len(s) and (s[k].isdigit() or s[k] == '刻'):
            k += 1
        minute_part = s[i + 1:k]
        if any(_is_key_at(pos) for pos in (j, k, j - 1, k + 1)):
            return None
        # NOTE(review): 'start' marks the beginning of the whole digit run
        # even when hour_part was truncated to two characters — confirm this
        # is what callers expect.
        return {
            'time': f"{hour_part}点{minute_part}",
            'start': j + 1,
            'end': k
        }
    return None
def find_continuous_numbers(text):
    """
    List the numeric runs in the part of `text` before the first "|".

    A run is a maximal sequence of digits and "点"; each "点" is rendered as
    ".". Runs are returned as strings in order of appearance.
    """
    head = text.split('|')[0]
    found = []
    pending = []
    for ch in head:
        if ch.isdigit():
            pending.append(ch)
        elif ch == '点':
            pending.append(".")
        elif pending:
            # Any other character closes the current run.
            found.append("".join(pending))
            pending = []
    if pending:
        found.append("".join(pending))
    return found
def _collect_context(text, position, step, limit):
    """Collect up to *limit* non-newline characters walking from *position*.

    Walks in direction *step* (``-1`` or ``+1``), skips newlines without
    counting them, and stops at the first ``*`` or at either end of *text*.
    The characters are returned in reading order.
    """
    collected = []
    while len(collected) < limit and 0 <= position < len(text):
        ch = text[position]
        if ch == '*':
            break
        if ch != '\n':
            collected.append(ch)
        position += step
    if step < 0:
        collected.reverse()
    return "".join(collected)


def find_and_extract_contextual_event_info(file_path):
    """Extract keyword-delimited events and their surrounding context from a text file.

    An event is any text wrapped as ``关键词*<name>*关键词``. For every event
    the function collects up to 15 characters of preceding text and up to 50
    characters of following text (newlines skipped, stopping at the next
    ``*``), prints a report, and builds a dict with these entries:

    * ``line_range``: ``"<start line> - <end line>"`` using 1-based lines,
    * ``event``: the text between the two markers,
    * ``time``: the time string found in the preceding text by
      ``find_time``, or ``None``,
    * ``num_results_list``: the result of ``find_continuous_numbers`` on the
      following text when the event is ``激活全血凝固时间`` or ``心肌保护液``,
      otherwise ``None``,
    * one string entry per key returned by ``extract_number_after_keys`` for
      the following text; the ``B`` and ``D`` entries are additionally cut
      down to their last two or three integer digits.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A list of event dicts in file order. The list is empty (and a
        message is printed) when the file contains no complete marker pair.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    full_text = "".join(lines)
    text_length = len(full_text)

    # line_indices[k] is the 1-based line number of character k of full_text.
    line_indices = []
    for line_number, line in enumerate(lines, start=1):
        line_indices.extend([line_number] * len(line))

    open_marker = '关键词*'
    close_marker = '*关键词'

    matches = []
    search_from = 0
    while True:
        start_i = full_text.find(open_marker, search_from)
        if start_i == -1:
            break
        start = start_i + len(open_marker)
        end = full_text.find(close_marker, start)
        if end == -1:
            # No closing marker is left in the file, so no later opening
            # marker can be completed either; stop instead of spinning here.
            break
        end_i = end + len(close_marker)
        start_line = line_indices[start_i]
        # end_i points just past the closing marker, which is one past the
        # last character when the marker ends the file.
        end_line = line_indices[min(end_i, text_length - 1)]
        matches.append({
            # Taken from this match's own marker pair so that several events
            # on one line keep distinct names.
            "event_name": full_text[start:end].strip(),
            "start_line": start_line,
            "end_line": end_line,
            "context_line": lines[start_line - 1],
            # The backward scan starts on the first character of the opening
            # marker, so that character is part of the collected text.
            "pre_info": _collect_context(full_text, start_i, -1, 15),
            "post_info": _collect_context(full_text, end_i, +1, 50),
        })
        search_from = start

    if not matches:
        print("未找到符合条件的匹配内容。")
        return []

    events_list = []
    for match in matches:
        line_range = f"{match['start_line']} - {match['end_line']}"
        report = (
            f"————上文———— {match['pre_info']}\n"
            f"————匹配所在行:({line_range})匹配所在行———— "
            f"{match['context_line']}\n"
            f"————下文———— {match['post_info']}\n"
        )
        print("#"*60)
        print("\n")
        print("-"*60)
        print(report)
        print("-"*60)
        print("\n")

        pre_text = match["pre_info"].strip()
        post_text = match["post_info"].strip()

        time_info = find_time(pre_text)
        # A fresh dict per event: nothing can leak from the previous event.
        event = {
            "line_range": line_range,
            "event": match["event_name"],
            "time": None if time_info is None else time_info["time"],
        }
        if event["event"] == "激活全血凝固时间" or event["event"] == "心肌保护液":
            event["num_results_list"] = find_continuous_numbers(post_text)
        else:
            event["num_results_list"] = None

        parameter_result = extract_number_after_keys(post_text)
        for one_parameter in parameter_result:
            event[one_parameter] = str(parameter_result[one_parameter])

        # Only values of existing keys are replaced below, so iterating the
        # dict while updating it is safe.
        for one_parameter in event:
            if one_parameter == "B":
                if event[one_parameter] == "None":
                    print(one_parameter + " : " + str(event[one_parameter]))
                elif "00" in str(int(float(event[one_parameter]))):
                    event[one_parameter] = str(int(float(event[one_parameter])))[-3:]
                    print(one_parameter + " : " + str(int(float(event[one_parameter])))[-3:])
                else:
                    event[one_parameter] = str(int(float(event[one_parameter])))[-2:]
                    print(one_parameter + " : " + str(int(float(event[one_parameter])))[-2:])
            elif one_parameter == "D":
                if event[one_parameter] == "None":
                    print(one_parameter + " : " + str(event[one_parameter]))
                else:
                    event[one_parameter] = str(int(float(event[one_parameter])))[-3:]
                    print(one_parameter + " : " + str(int(float(event[one_parameter])))[-3:])
            else:
                print(one_parameter + " : " + str(event[one_parameter]))
        print("-"*60)
        print("-"*60)
        print("#"*60)
        events_list.append(event)
    return events_list
def extract_time_stamps_from_txt(file_path):
    """Map each non-blank line number of a text file to its leading time stamp.

    For every non-blank line the text before the first tab is taken, then
    everything from the first ``chunk`` onwards is dropped, and finally
    surrounding spaces and square brackets are stripped.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A dict keyed by 1-based line number. Blank lines have no entry.
    """
    stamps = {}
    with open(file_path, "r", encoding="utf-8") as handle:
        for number, raw in enumerate(handle, start=1):
            content = raw.strip()
            if content:
                # Text before the first tab, cut again at the first "chunk".
                prefix = content.partition("\t")[0].partition("chunk")[0]
                stamps[number] = prefix.strip(" ").strip("[").strip("]")
    return stamps
def extract_time_from_filename(file_path):
    """Parse the ``YYYYMMDD_HHMMSS`` prefix of a file name into a datetime.

    Only the final path component is examined, so a path that includes
    directories is handled the same way as a bare file name.

    Args:
        file_path: File name or path whose name starts with an 8-character
            date, an underscore, a 6-character time and another underscore.

    Returns:
        The parsed ``datetime``, or ``None`` when the name does not start
        with such a prefix or the prefix is not a valid date and time.
    """
    parts = os.path.basename(file_path).split("_")
    if len(parts) < 2 or len(parts[0]) != 8 or len(parts[1]) != 6:
        return None
    try:
        return datetime.strptime(parts[0] + parts[1], "%Y%m%d%H%M%S")
    except ValueError:
        # Right lengths but not a real date/time: same as "no time found".
        return None
def calculate_current_time(start_time: str, elapsed_time: str) -> str:
    """Add an elapsed duration to a clock time.

    :param start_time: Clock time formatted as "HH:MM:SS".
    :param elapsed_time: Duration formatted as "H:M:S" (three integer fields).
    :return: The resulting clock time as "HH:MM:SS"; the date part is
        discarded, so results past midnight wrap around.
    """
    origin = datetime.strptime(start_time, "%H:%M:%S")
    hours, minutes, seconds = (int(field) for field in elapsed_time.split(":"))
    moment = origin + timedelta(hours=hours, minutes=minutes, seconds=seconds)
    return moment.strftime("%H:%M:%S")
def convert_time(time_str):
    """Normalise a spoken or numeric time string to zero-padded ``HH:MM:SS``.

    ``点`` becomes the hour/minute separator, ``1刻`` / ``2刻`` / ``3刻``
    become 15 / 30 / 45 minutes and ``半`` becomes 30 minutes. Missing
    minute and second fields are filled with ``00``. A field consisting of
    the single character ``十`` is read as 10. Strings that already look
    like ``HH:MM:SS`` are only zero-padded.

    Args:
        time_str: For example ``"10点3刻"``, ``"9点"`` or ``"11:00:03"``.

    Returns:
        The time as ``"HH:MM:SS"``.

    Raises:
        ValueError: If a field is not an integer after the replacements.
    """
    replacements = (
        ('点', ':'),
        ('1刻', '15'),
        ('2刻', '30'),
        ('3刻', '45'),  # three quarters of an hour
        ('半', '30'),
    )
    for spoken, numeric in replacements:
        time_str = time_str.replace(spoken, numeric)

    if time_str.endswith(':'):
        time_str += '00'
    if time_str.count(':') == 1:
        time_str += ':00'

    fields = ('10' if field == '十' else field for field in time_str.split(':'))
    return ':'.join(f'{int(field):02d}' for field in fields)
# ---------------------------------------------------------------------------
# Script section: extract the events, attach a wall-clock time to each one,
# carry missing parameters forward, sort by time and write the report file.
# ---------------------------------------------------------------------------
print("-"*60)
file_path = "20250312_104423_0_1_2_3_4_5.txt"
# Treat "nothing found" (None or empty) as an empty list so the loops below
# simply do nothing.
events_list = find_and_extract_contextual_event_info(file_path) or []
print("提取关键事件+时间+各个参数完成!")
print("-"*60)
print("-"*60)
file_path = "20250312_104423_0_1_2_3.txt"
time_dict = extract_time_stamps_from_txt(file_path)
file_name = os.path.basename(file_path)
time_in_beijing = extract_time_from_filename(file_name)
if time_in_beijing is None:
    # Every later step needs the recording start time, so stop here instead
    # of failing on the missing value.
    print("未找到时间信息")
    raise SystemExit(1)
start_time = time_in_beijing.strftime("%H:%M:%S")
print("录音时间(北京时间):", start_time)

# Replace each event's line range by the clock time of its first line:
# recording start time plus the elapsed time stamped on that line.
events_list_1 = []
for event in events_list:
    row_num = int(event['line_range'].split("-")[0])
    elapsed_time = time_dict[row_num].split("-")[0].strip()
    current_time = calculate_current_time(start_time, elapsed_time)
    print("Current time:", current_time)
    event['line_range'] = current_time
    events_list_1.append(event.copy())
print("-"*60)

# Fill gaps: an event without a spoken time falls back to its computed clock
# time, and a missing parameter inherits the previous event's value (which
# has itself already been filled).
for i, event in enumerate(events_list_1):
    print("-"*60)
    if event["time"] in ("None", None):
        event["time"] = event['line_range']
    if i >= 1:
        previous_event = events_list_1[i - 1]
        for parameter_name in ("A", "B", "C", "D", "Y"):
            if event[parameter_name] in ("None", None):
                event[parameter_name] = previous_event[parameter_name]

for event in events_list_1:
    event['time'] = convert_time(event['time'])

# Zero-padded "HH:MM:SS" strings sort chronologically as plain strings.
sorted_events = sorted(events_list_1, key=lambda x: x['time'])
for event in sorted_events:
    print(event)

sorted_events_1 = [
    event for event in sorted_events
    if event["event"] not in ("激活全血凝固时间", "心肌保护液")
]

output_txt_file = os.path.splitext(file_path)[0] + "_time_event_ABCD.txt"
# NOTE(review): one separator line is written after every event and one more
# after the last event — confirm this matches the intended report layout.
with open(output_txt_file, "w", encoding="utf-8") as file:
    file.write("-" * 60 + "\n")
    index_num = 1
    for event in sorted_events_1:
        if "序号" in event["event"]:
            continue
        file.write(f"Event {index_num}\n")
        index_num += 1
        for one_parameter in event:
            if one_parameter == "line_range":
                file.write("line_range_time : " + str(event[one_parameter]) + "\n")
            else:
                file.write(one_parameter + " : " + str(event[one_parameter]) + "\n")
        file.write("-" * 60 + "\n")
    file.write("-" * 60 + "\n")