# Post-processing code for speech-recognition (ASR) transcripts

import re
import difflib
import os
from datetime import datetime, timedelta

def extract_snippets_no_duplicates(input_file, output_file, window=150):
    """
    Copy every line that lies near a marker from ``input_file`` to ``output_file``.

    A marker is one of the single characters A, B, C, D or one of the words
    "开始" / "结束". Each marker is widened by ``window`` characters on both
    sides, overlapping ranges are merged, and every line that touches a merged
    range is written out whole. A line whose text was already written is not
    written again; the remaining lines keep the order in which they were found.

    Args:
        input_file (str): Path of the UTF-8 text file to scan.
        output_file (str): Path of the UTF-8 text file to write.
        window (int, optional): Characters of context kept on each side of a
            marker. Defaults to 150.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        text = source.read()
    total = len(text)

    marker_chars = {'A', 'B', 'C', 'D'}
    marker_words = ('开始', '结束')

    # Context ranges around the single-letter markers.
    spans = [
        (max(0, pos - window), min(total, pos + window + 1))
        for pos, ch in enumerate(text)
        if ch in marker_chars
    ]

    # Context ranges around the keyword markers; the search resumes after
    # each hit so the same occurrence is never reported twice.
    for word in marker_words:
        pos = text.find(word)
        while pos != -1:
            spans.append((max(0, pos - window), min(total, pos + len(word) + window)))
            pos = text.find(word, pos + len(word))

    # Merge ranges that touch or overlap, walking them by ascending start.
    merged = []
    for lo, hi in sorted(spans, key=lambda span: span[0]):
        if merged and lo <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], hi)
        else:
            merged.append([lo, hi])

    # Character offsets [line_lo, line_hi) of every line, newline included.
    line_table = []
    offset = 0
    for line in text.splitlines(keepends=True):
        line_table.append((offset, offset + len(line), line))
        offset += len(line)

    # Keep each line that intersects a merged range, first occurrence only.
    kept = []
    emitted = set()
    for lo, hi in merged:
        for line_lo, line_hi, line in line_table:
            if line_hi <= lo or line_lo >= hi or line in emitted:
                continue
            emitted.add(line)
            kept.append(line)

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(kept)

    print("处理完成,结果已保存到", output_file)

def replace_say_and_dianer_in_file(filename):
    """
    Rewrite a text file in place, turning "SAY" into "C" and "点儿" into "点".

    :param filename: path of the UTF-8 text file to modify
    """
    with open(filename, "r", encoding="utf-8") as handle:
        content = handle.read()

    # Apply the substitutions one after another, in this fixed order.
    for old, new in (("SAY", "C"), ("点儿", "点")):
        content = content.replace(old, new)

    # Overwrite the source file with the result.
    with open(filename, "w", encoding="utf-8") as handle:
        handle.write(content)

    print(f"替换完成,已保存到 {filename}")

def mark_time_gaps(file_path):
    """
    Mark pauses between consecutive timestamped lines, rewriting the file in place.

    Every line is stripped and written back with a trailing "-". When a line
    and the line after it both begin with a ``[HH:MM:SS - HH:MM:SS]`` stamp and
    the next line starts more than 5 seconds after this one ends, one "|" per
    full 5 seconds of gap is inserted before that trailing "-". Lines are
    joined with a newline; no newline follows the last line.

    :param file_path: path of the UTF-8 text file to rewrite
    """
    def split_stamp(entry):
        # Return the two time strings of the leading "[start - end]" part.
        # Raises ValueError unless that part holds exactly one " - ".
        stamp = entry.split(']')[0] + ']'
        begin, finish = stamp.strip('[]').split(' - ')
        return begin, finish

    def to_seconds(clock):
        # "HH:MM:SS" -> seconds; raises ValueError on a non-integer field.
        return sum(unit * int(field) for unit, field in zip([3600, 60, 1], clock.split(':')))

    with open(file_path, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    last = len(raw_lines) - 1
    marked = []
    for idx, raw in enumerate(raw_lines):
        entry = raw.strip()
        bars = ""
        # Blank lines and the final line never receive gap markers.
        if entry and idx < last:
            try:
                begin, finish = split_stamp(entry)
                to_seconds(begin)  # parsed only to validate the start field
                end_seconds = to_seconds(finish)
                next_begin, _ = split_stamp(raw_lines[idx + 1].strip())
                gap = to_seconds(next_begin) - end_seconds
            except (IndexError, ValueError):
                # Either stamp is unparsable: keep the line without markers.
                gap = 0
            if gap > 5:
                bars = "|" * (gap // 5)
        marked.append(entry + bars + "-")

    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.write('\n'.join(marked))

def correct_yao_in_text(input_text_1, output_filename_1):
    """
    Replace characters misheard for "yao" with "一" and save the result.

    A candidate character is replaced only when at least one trigger character
    (A-D, Y or a Chinese numeral) occurs within the three characters before or
    after it in the original text. Every replacement is reported on stdout.

    :param input_text_1: path of the UTF-8 text file to read
    :param output_filename_1: path the corrected text is written to
    :return: the corrected text
    """
    with open(input_text_1, "r", encoding="utf-8") as file:
        text = file.read()

    # Characters the recogniser may produce in place of "yao" (extendable).
    yao_variants = set("幺要腰邀瑶钥谣")

    # Characters whose presence nearby enables the correction.
    context_triggers = set("ABCDY一二三四五六七八九十零")

    def replace_yao(match):
        word = match.group(0)
        start, end = match.span()

        # Three characters on each side, taken from the unmodified text.
        nearby = text[max(0, start - 3):start] + text[end:end + 3]
        if context_triggers.isdisjoint(nearby):
            return word  # no trigger nearby: leave the character alone

        corrected = "一"
        # Ten characters of context on each side, for the log only.
        context_before = text[max(0, start - 10):start]
        context_after = text[end:end + 10]
        print(f"修正: {word} -> {corrected}")
        print("修正后上下文:  "  + context_before + " | " + corrected + " | " + context_after)
        return corrected

    # One alternation over all candidate characters.
    pattern = "|".join(yao_variants)
    corrected_text = re.sub(pattern, replace_yao, text)

    with open(output_filename_1, "w", encoding="utf-8") as file:
        file.write(corrected_text)
    print(f"修正完成,已保存到 {output_filename_1}")

    return corrected_text

def replace_ai_with_AI(input_filename, output_filename):
    """
    Replace characters pronounced "ai" with the letter "A" and save the result.

    A candidate character is replaced only when at least one trigger character
    (A-D, Y or a Chinese numeral) occurs within the three characters before or
    after it in the original text. Every replacement is reported on stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :return: the processed text
    """
    with open(input_filename, "r", encoding="utf-8") as file:
        text = file.read()

    # Characters the recogniser may produce in place of the letter A.
    ai_variants = set("哀埃挨癌矮爱碍艾隘蔼霭嗳诶")

    # Characters whose presence nearby enables the replacement.
    context_triggers = set("ABCDY一二三四五六七八九十零")

    def replace_ai(match):
        word = match.group(0)
        start, end = match.span()

        # Three characters on each side, taken from the unmodified text.
        nearby = text[max(0, start - 3):start] + text[end:end + 3]
        if context_triggers.isdisjoint(nearby):
            return word  # no trigger nearby: leave the character alone

        # Ten characters of context on each side, for the log only.
        context_before = text[max(0, start - 10):start]
        context_after = text[end:end + 10]
        print(f"修正前上下文: {context_before} | {word} | {context_after}")
        print(f"修正: {word} -> A")
        print(f"修正后上下文: {context_before} | A | {context_after}\n")
        return "A"

    pattern = "|".join(ai_variants)
    corrected_text = re.sub(pattern, replace_ai, text)

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(corrected_text)

    print(f"修正完成,已保存到 {output_filename}")
    return corrected_text

def replace_bi_with_B(input_filename, output_filename):
    """
    Replace "比", "必", "笔", "币", "闭", "碧", "避" with the letter "B" and save.

    A candidate character is replaced only when at least one trigger character
    (A-D, Y or a Chinese numeral) occurs within the three characters before or
    after it in the original text. Every replacement is reported on stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :return: the processed text
    """
    with open(input_filename, "r", encoding="utf-8") as file:
        text = file.read()

    # Characters the recogniser may produce in place of the letter B.
    bi_variants = set("比必笔币闭碧避")

    # Characters whose presence nearby enables the replacement.
    context_triggers = set("ABCDY一二三四五六七八九十零")

    def replace_bi(match):
        word = match.group(0)
        start, end = match.span()

        # Three characters on each side, taken from the unmodified text.
        nearby = text[max(0, start - 3):start] + text[end:end + 3]
        if context_triggers.isdisjoint(nearby):
            return word  # no trigger nearby: leave the character alone

        # Ten characters of context on each side, for the log only.
        context_before = text[max(0, start - 10):start]
        context_after = text[end:end + 10]
        print(f"修正: {word} -> B")
        print("修正后上下文: " + context_before + " | B | " + context_after)
        return "B"

    pattern = "|".join(bi_variants)
    corrected_text = re.sub(pattern, replace_bi, text)

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(corrected_text)

    print(f"修正完成,已保存到 {output_filename}")

    return corrected_text

def replace_di_with_D(input_filename, output_filename):
    """
    Replace characters pronounced "di" with the letter "D" and save the result.

    The candidates are "地", "底", "敌", "蒂", "第", "递", "堤", "笛", "狄", "涤",
    "翟", "帝", "嫡", "滴" and "低". A candidate is replaced only when at least
    one trigger character (A-D, Y or a Chinese numeral) occurs within the three
    characters before or after it in the original text. Every replacement is
    reported on stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :return: the processed text
    """
    with open(input_filename, "r", encoding="utf-8") as file:
        text = file.read()

    # Characters the recogniser may produce in place of the letter D.
    di_variants = set("地底敌蒂第递堤笛狄涤翟帝嫡滴低")

    # Characters whose presence nearby enables the replacement.
    context_triggers = set("ABCDY一二三四五六七八九十零")

    def replace_di(match):
        word = match.group(0)
        start, end = match.span()

        # Three characters on each side, taken from the unmodified text.
        nearby = text[max(0, start - 3):start] + text[end:end + 3]
        if context_triggers.isdisjoint(nearby):
            return word  # no trigger nearby: leave the character alone

        # Ten characters of context on each side, for the log only.
        context_before = text[max(0, start - 10):start]
        context_after = text[end:end + 10]
        print(f"修正前上下文: {context_before} | {word} | {context_after}")
        print(f"修正: {word} -> D")
        print(f"修正后上下文: {context_before} | D | {context_after}\n")
        return "D"

    pattern = "|".join(di_variants)
    corrected_text = re.sub(pattern, replace_di, text)

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(corrected_text)

    print(f"修正完成,已保存到 {output_filename}")

    return corrected_text

def replace_ling_with_target(input_filename, output_filename, target="零"):
    """
    Replace characters pronounced "ling" with ``target`` and save the result.

    A candidate character is replaced only when at least one trigger character
    (A-D, Y or a Chinese numeral from 一 to 十) occurs within the three
    characters before or after it in the original text. Every replacement is
    reported on stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :param target: replacement string, "零" by default
    :return: the processed text
    """
    with open(input_filename, "r", encoding="utf-8") as file:
        text = file.read()

    # Characters the recogniser may produce in place of "零".
    ling_variants = set("龄铃凌菱羚翎伶苓泠棂瓴绫")

    # Characters whose presence nearby enables the replacement
    # ("零" itself is not part of this set).
    context_triggers = set("ABCDY一二三四五六七八九十")

    def replace_ling(match):
        word = match.group(0)
        start, end = match.span()

        # Three characters on each side, taken from the unmodified text.
        nearby = text[max(0, start - 3):start] + text[end:end + 3]
        if context_triggers.isdisjoint(nearby):
            return word  # no trigger nearby: leave the character alone

        # Ten characters of context on each side, for the log only.
        context_before = text[max(0, start - 10):start]
        context_after = text[end:end + 10]
        print(f"修正前上下文: {context_before} | {word} | {context_after}")
        print(f"修正: {word} -> {target}")
        print(f"修正后上下文: {context_before} | {target} | {context_after}\n")
        return target

    pattern = "|".join(ling_variants)
    corrected_text = re.sub(pattern, replace_ling, text)

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(corrected_text)

    print(f"修正完成,已保存到 {output_filename}")

    return corrected_text

def replace_shi_with_target(input_filename, output_filename, target="十"):
    """
    Replace characters pronounced "shi" with ``target`` and save the result.

    A candidate character is replaced only when the character immediately
    before it or immediately after it in the original text is a trigger
    character (A-D, Y or a Chinese numeral). Every replacement is reported on
    stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :param target: replacement string, "十" by default
    :return: the processed text
    """
    with open(input_filename, "r", encoding="utf-8") as file:
        text = file.read()

    # Characters the recogniser may produce in place of "十"; the third-tone
    # group (史 使 始 驶 矢 屎) is deliberately not part of the set.
    shi_variants = set(
        "诗师失尸施湿狮虱柿"
        "时识实石食蚀拾"
        "事式示世试势释适视氏室侍誓嗜饰"
    )

    # Characters whose presence next to the candidate enables the replacement.
    context_triggers = set("ABCDY一二三四五六七八九十零")

    def replace_shi(match):
        word = match.group(0)
        start, end = match.span()

        # Only the single adjacent character on each side is inspected here.
        nearby = text[max(0, start - 1):start] + text[end:end + 1]
        if context_triggers.isdisjoint(nearby):
            return word  # no adjacent trigger: leave the character alone

        # Ten characters of context on each side, for the log only.
        context_before = text[max(0, start - 10):start]
        context_after = text[end:end + 10]
        print(f"修正前上下文: {context_before} | {word} | {context_after}")
        print(f"修正: {word} -> {target}")
        print(f"修正后上下文: {context_before} | {target} | {context_after}\n")
        return target

    pattern = "|".join(shi_variants)
    corrected_text = re.sub(pattern, replace_shi, text)

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(corrected_text)

    print(f"修正完成,已保存到 {output_filename}")

    return corrected_text

def replace_ba_with_eight(input_filename, output_filename):
    """
    Replace characters pronounced "ba" with "八" and save the result.

    A candidate character is replaced only when at least one trigger character
    (A-D, Y or a Chinese numeral) occurs within the three characters before or
    after it in the original text. Every replacement is reported on stdout.

    :param input_filename: path of the UTF-8 text file to read
    :param output_filename: path the processed text is written to
    :return: the processed text
    """
    with open(input_filename, "r", encoding="utf-8") as file:
        text = file.read()

    # Characters the recogniser may produce in place of "八", grouped by tone.
    ba_variants = set(
        "吧巴叭芭捌笆疤扒"
        "拔跋茇菝魃把"
        "钯"
        "爸罢霸坝"
    )

    # Characters whose presence nearby enables the replacement.
    context_triggers = set("ABCDY一二三四五六七八九十零")

    def replace_ba(match):
        word = match.group(0)
        start, end = match.span()

        # Three characters on each side, taken from the unmodified text.
        nearby = text[max(0, start - 3):start] + text[end:end + 3]
        if context_triggers.isdisjoint(nearby):
            return word  # no trigger nearby: leave the character alone

        # Ten characters of context on each side, for the log only.
        context_before = text[max(0, start - 10):start]
        context_after = text[end:end + 10]
        print(f"修正前上下文: {context_before} | {word} | {context_after}")
        print(f"修正: {word} -> 八")
        print(f"修正后上下文: {context_before} | 八 | {context_after}\n")
        return "八"

    pattern = "|".join(ba_variants)
    corrected_text = re.sub(pattern, replace_ba, text)

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(corrected_text)
    print(f"修正完成,已保存到 {output_filename}")
    return corrected_text

def process_bd_correction(input_file, output_file, context_print_window=20):
    """
    Fix B/D mix-ups in a text file, save the result and print what changed.

    :param input_file: path of the UTF-8 text file to read
    :param output_file: path the corrected text is written to
    :param context_print_window: characters of context printed on each side of
        a correction (20 by default)
    """
    def correct_bd_mixup(text):
        """
        Swap B and D where the surrounding A/C letters contradict them.

        (1) A "B" whose nearest preceding A/C is "C" and whose nearest
            following A/C is "A" becomes "D".
        (2) A "D" whose nearest preceding A/C is "A" and whose nearest
            following A/C is "C" becomes "B".

        Returns the corrected text and a list of correction records
        ``(position, original, replacement, nearest_before, nearest_after)``.
        """
        def nearest_before(pos):
            # Closest A or C strictly left of pos, or None.
            hit = max(text.rfind('A', 0, pos), text.rfind('C', 0, pos))
            return text[hit] if hit != -1 else None

        def nearest_after(pos):
            # Closest A or C strictly right of pos, or None.
            hits = [h for h in (text.find('A', pos + 1), text.find('C', pos + 1)) if h != -1]
            return text[min(hits)] if hits else None

        # (letter, nearest before, nearest after) -> corrected letter
        swaps = {('B', 'C', 'A'): 'D', ('D', 'A', 'C'): 'B'}

        chars = list(text)
        log = []
        # Only B and D are ever rewritten, so the A/C lookups can safely run
        # against the unmodified text.
        for pos, letter in enumerate(text):
            if letter not in ('B', 'D'):
                continue
            before, after = nearest_before(pos), nearest_after(pos)
            fixed = swaps.get((letter, before, after))
            if fixed is not None:
                log.append((pos, letter, fixed, before, after))
                chars[pos] = fixed

        return ''.join(chars), log

    with open(input_file, 'r', encoding='utf-8') as source:
        original_text = source.read()

    corrected_text, correction_events = correct_bd_mixup(original_text)

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.write(corrected_text)
    print("已经保存为: " + output_file)

    # Print each correction with its context so the result can be checked.
    print("修正记录:")
    text_end = len(original_text)
    for pos, orig_char, new_char, prev_ac, next_ac in correction_events:
        lo = max(0, pos - context_print_window)
        hi = min(text_end, pos + context_print_window + 1)

        print(f"位置 {pos}: {orig_char} -> {new_char} (前最近 A/C: {prev_ac}, 后最近 A/C: {next_ac})")
        print("原始上下文:")
        print(original_text[lo:hi])
        print("修正后上下文:")
        print(corrected_text[lo:hi])
        print("-" * 60)


# Keyword list: the canonical phrases that replace_keywords() fuzzy-matches
# against each transcript line.
keywords = [

    # CPB parallel-circulation template
    "开始装机",
    "装机开始",
    "开始CPB装机",
    "CPB装机开始",

    "开始预充排气",
    "预充排气开始",

    "开始安全检查",
    "安全检查开始",

    "开始主动脉插管",
    "主动脉插管开始",

    "开始体外循环",
    "体外循环开始",
    "CPB ON",

    "开始双管降温",
    "双管降温开始",

    "停降温",
    "开始停降温",
    "停降温开始",

    "开始复温",
    "复温开始",

    "开始超滤",
    "超滤开始",

    "钳闭下腔静脉",

    "开始减流量",
    "减流量开始",

    "CPB OF",
    "结束体外循环",
    "体外循环结束",


    "开始改良超滤",
    "改良超滤开始",

    "结束改良超滤",
    "改良超滤结束",

    "开始记录参数",
    "记录参数开始",

    "血气分析",
    "ACT",

    # Conventional extracorporeal-circulation template
    "上下腔静脉阻断",

    "升主动脉阻断",

    "心肌保护液", 

    "升主动脉开放",

    "开始复跳",
    "复跳开始",

    "上下腔静脉开放",

    # Selective cerebral perfusion template

    "双主动脉灌注",

    "拔出降主动脉插管",

    "脑灌注开始",

    "脑灌开始",

    "降主动脉阻断",

    "降主动脉开放",

    "脑灌注结束",

    "脑灌结束",

    # DHCA template

    "开始用冰帽",
    "用冰帽",

    "开始停循环",
    "停循环开始",

    "恢复循环",
    "开始恢复循环",
    "恢复循环开始",

    "缓慢复温",
    "开始缓慢复温",

    # Sequence-number markers ("序号" followed by a numeral from one to nine)
    "序号一","序号二","序号三","序号四","序号五","序号六","序号七","序号八","序号九",

    # NOTE(review): the entries below look like drug / fluid names - confirm
    "复方电解质",
    "红细胞悬液",
    "人血白蛋白",
    "碳酸氢钠",
    "甘露醇",
    "乙基淀粉",
    "甲基强的松龙",
    "磷酸肌酸钠",
    "地塞米松",
    "利多卡因",
    "葡萄糖",
    "酚妥拉明",
    "呋塞米注射液",
    "氯化钾",
    "氯化钙",
    "去甲肾上腺素",
    "肝素钠",
    "血液回收血",
    "乌司他丁"
]

def fuzzy_match(text, keyword, replaced_ranges):
    """
    Find the window of ``text`` that is most similar to ``keyword``.

    Every window has the same length as ``keyword``. Windows whose start index
    falls inside one of ``replaced_ranges`` are ignored. Similarity is
    ``difflib.SequenceMatcher.ratio()``; on a tie the leftmost window wins.

    :param text: string to search
    :param keyword: phrase to look for
    :param replaced_ranges: iterable of ``(start, end)`` half-open index ranges
        whose start positions must not be matched
    :return: ``(similarity, window, index)``; ``(0, "", -1)`` when no window
        has a similarity above zero
    """
    max_similarity = 0
    best_match = ""
    best_index = -1
    keyword_length = len(keyword)

    # SequenceMatcher caches its analysis of the second sequence, so one
    # matcher is reused and only the first sequence changes per window.
    matcher = difflib.SequenceMatcher(None, "", keyword)

    for i in range(len(text) - keyword_length + 1):
        # Skip excluded positions before paying for the similarity ratio.
        if any(start <= i < end for start, end in replaced_ranges):
            continue

        window = text[i:i + keyword_length]
        matcher.set_seq1(window)
        similarity = matcher.ratio()

        if similarity > max_similarity:
            max_similarity = similarity
            best_match = window
            best_index = i

    return max_similarity, best_match, best_index

def replace_keywords(text, SIMILARITY_THRESHOLD = 0.6):
    """
    Tag the best fuzzy match of every known keyword inside ``text``.

    For each entry of the module-level ``keywords`` list the most similar
    window of ``text`` is located with ``fuzzy_match``. Matches whose
    similarity reaches ``SIMILARITY_THRESHOLD`` are considered from most to
    least similar; a match is dropped when its span overlaps one that was
    already accepted. Every accepted span is replaced by
    ``关键词*<keyword>*关键词`` and the details are printed.

    All positions are tracked in the coordinates of the original ``text`` and
    the result is assembled in one left-to-right pass, so an accepted match is
    always spliced where it was found, whatever order the matches were
    accepted in.

    :param text: one line of transcript text
    :param SIMILARITY_THRESHOLD: minimum similarity ratio (0..1) for a match
    :return: ``text`` with the accepted matches replaced
    """
    matches = []
    for keyword in keywords:
        similarity, matched_text, index = fuzzy_match(text, keyword, [])
        if similarity >= SIMILARITY_THRESHOLD and index != -1:
            matches.append((similarity, keyword, matched_text, index))

    # Highest similarity first; the stable sort keeps list order on ties.
    matches.sort(reverse=True, key=lambda x: x[0])

    occupied = []  # accepted (start, end) spans, original coordinates
    accepted = []  # (start, end, replacement) for the rebuild pass
    for similarity, keyword, matched_text, index in matches:
        span_end = index + len(matched_text)

        # Reject any span that overlaps an already accepted one.
        if any(index < end and start < span_end for start, end in occupied):
            continue

        new_text = f"关键词*{keyword}*关键词"

        print(f"\n **替换详情** ")
        print(f"原始文本片段: {text[max(0, index-10):span_end+10]}")
        print(f"匹配关键词: {keyword}")
        print(f"相似度: {similarity:.2f}")
        print(f"替换前: {matched_text}")
        print(f"替换后: {new_text}")

        occupied.append((index, span_end))
        accepted.append((index, span_end, new_text))

    # Rebuild the line left to right from untouched pieces and replacements.
    pieces = []
    cursor = 0
    for start, end, new_text in sorted(accepted):
        pieces.append(text[cursor:start])
        pieces.append(new_text)
        cursor = end
    pieces.append(text[cursor:])

    return "".join(pieces)

def process_file_replace_keywords(file_path, output_path):
    """
    Run ``replace_keywords`` over every line of a file and save the result.

    Each line is stripped of surrounding whitespace before it is processed and
    is written back followed by a single newline.

    :param file_path: path of the UTF-8 text file to read
    :param output_path: path of the UTF-8 text file to write
    """
    with open(file_path, "r", encoding="utf-8") as source:
        raw_lines = source.readlines()

    tagged_lines = []
    for raw in raw_lines:
        tagged_lines.append(replace_keywords(raw.strip()) + "\n")

    with open(output_path, "w", encoding="utf-8") as sink:
        sink.writelines(tagged_lines)


def remove_timestamps_and_indexes(input_file, output_file):
    """
    Strip the leading timestamp and chunk index from every line of a file.

    For each input line:

    1. Everything up to and including the first "]" is discarded. The line is
       split only once, so a "]" that appears later in the spoken text is
       kept instead of cutting the text short.
    2. The remainder is split on tab characters and only the last field is
       kept (this drops a ``chunk_x`` field placed before the text).
    3. The field is stripped and written followed by a newline.

    Lines that contain no "]" are omitted from the output.

    Args:
        input_file (str): Path of the UTF-8 file holding the raw lines.
        output_file (str): Path of the UTF-8 file to write.

    Returns:
        None
    """
    cleaned_lines = []

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            head, bracket, remainder = line.partition("]")
            if not bracket:
                continue  # no timestamp bracket on this line: skip it
            last_part = remainder.split("\t")[-1].strip()
            cleaned_lines.append(last_part + "\n")

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(cleaned_lines)

    print(f"✅ 处理完成,结果已保存到 {output_file}")


def convert_chinese_number_to_arabic(input_file, output_file):
    """
    Convert Chinese numerals in a text file to Arabic digits.

    The characters 零 to 九 are replaced by 0 to 9 one for one. Each "十" is
    then folded into its neighbouring digits:

    * "十" on its own  -> "10"
    * "X十"            -> "X0"  (so "一十" -> "10")
    * "十Y"            -> "1Y"
    * "X十Y"           -> "XY"

    Larger units such as 百 or 千 are not handled. Errors are reported on
    stdout instead of being raised.

    :param input_file: path of the UTF-8 text file to read
    :param output_file: path of the UTF-8 text file to write
    """
    chinese_numbers = {'零': '0', '一': '1', '二': '2', '三': '3', '四': '4',
                       '五': '5', '六': '6', '七': '7', '八': '8', '九': '9'}

    def expand_ten(match):
        # A missing tens digit means one ten; a missing units digit means zero.
        tens = match.group(1) or '1'
        ones = match.group(2) or '0'
        return tens + ones

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            content = f.read()

        for zh_num, ar_num in chinese_numbers.items():
            content = content.replace(zh_num, ar_num)

        # One pass covers every "十" case: an optional ASCII digit on either
        # side is captured and combined with the implied 1 / 0.
        content = re.sub(r'([0-9]?)十([0-9]?)', expand_ten, content)

        with open(output_file, "w", encoding="utf-8") as f:
            f.write(content)

        print(f"转换完成!请查看 {output_file}")
    except FileNotFoundError:
        print(f"错误:文件 {input_file} 未找到。")
    except Exception as e:
        # Best-effort tool: report any other failure without raising.
        print(f"发生错误:{e}")


def replace_act(file_path):
    """
    Expand every "ACT" in a text file to "激活全血凝固时间", in place.

    :param file_path: path of the UTF-8 text file to rewrite
    """
    with open(file_path, "r", encoding="utf-8") as source:
        expanded = source.read().replace("ACT", "激活全血凝固时间")

    with open(file_path, "w", encoding="utf-8") as sink:
        sink.write(expanded)

def remove_spaces_from_file(filename):
    """
    Delete every ASCII space character from a text file, in place.

    Only " " is removed; tabs, newlines and other whitespace are kept.

    :param filename: path of the UTF-8 text file to rewrite
    """
    with open(filename, 'r', encoding='utf-8') as source:
        compact = source.read().replace(" ", "")

    with open(filename, 'w', encoding='utf-8') as sink:
        sink.write(compact)

def extract_time_from_filename(file_path):
    """
    Parse the ``YYYYMMDD_HHMMSS`` prefix of a file name into a datetime.

    The name is split on "_"; the first field must be an 8-character date and
    the second a 6-character time. The directory part and the extension are
    ignored, so ``data/20250306_105401.txt`` is handled as well as
    ``20250306_105401_rec.txt``. The raw path is tried as a fallback so that
    every input that was accepted before is still accepted.

    :param file_path: file name or path
    :return: the parsed ``datetime``, or ``None`` when no valid
        date/time prefix is found
    """
    stem = os.path.splitext(os.path.basename(file_path))[0]

    for candidate in (stem, file_path):
        parts = candidate.split("_")
        # The first two fields must have the shape of a date and a time.
        if len(parts) < 2 or len(parts[0]) != 8 or len(parts[1]) != 6:
            continue
        try:
            return datetime.strptime(parts[0] + parts[1], "%Y%m%d%H%M%S")
        except ValueError:
            # Right lengths but not a real date/time (e.g. "meetings_agenda").
            continue
    return None

def calculate_current_time(start_time: str, elapsed_time: str) -> str:
    """
    Add an elapsed duration to a wall-clock start time.

    Only the time of day is returned, so a result that passes midnight wraps
    around (for example "23:30:00" plus "01:00:00" gives "00:30:00").

    :param start_time: Start time in "HH:MM:SS" format.
    :param elapsed_time: Elapsed time in "HH:MM:SS" format.
    :return: Resulting time in "HH:MM:SS" format.
    """
    origin = datetime.strptime(start_time, "%H:%M:%S")

    # The three colon-separated fields are hours, minutes and seconds.
    hours, minutes, seconds = (int(field) for field in elapsed_time.split(":"))
    moment = origin + timedelta(hours=hours, minutes=minutes, seconds=seconds)

    return moment.strftime("%H:%M:%S")

def extract_number_after_keys(text):
    """
    Extract the number that follows each of the keys "A", "B", "C" and "D".

    For every key, scanning starts at its first occurrence and moves right to
    the first numeric character (a digit, "." or "点"). If another key or a
    "*" is met before any numeric character, the search restarts from the next
    occurrence of the same key within the following 30 characters, when there
    is one. The run of numeric characters found is converted to a float, with
    "点" read as a decimal point.

    A key keeps the value ``None`` when it is absent, when no number follows
    it, or when the numeric run is not a valid number (for example a lone "."
    or "1.2.3").

    :param text: text to scan
    :return: dict mapping "A", "B", "C", "D" to a float or ``None``
    """
    keys = ["A", "B", "C", "D"]
    result = {key: None for key in keys}
    text_length = len(text)

    def is_numeric(ch):
        # Digits plus the two accepted spellings of the decimal point.
        return ch.isdigit() or ch in ("点", ".")

    for key in keys:
        index = text.find(key)
        while index != -1:
            num_start = index + 1

            # Skip filler up to the number, watching for another key or "*".
            has_other_key = False
            while num_start < text_length and not is_numeric(text[num_start]):
                if text[num_start] in keys or text[num_start] == "*":
                    has_other_key = True
                    break
                num_start += 1

            # Blocked by another key: retry from the next occurrence of this
            # key, provided it lies within the next 30 characters.
            if has_other_key:
                new_index = text[index + 1:index + 31].find(key)
                if new_index != -1:
                    index += new_index + 1
                    continue

            # Collect the run of numeric characters.
            num_end = num_start
            while num_end < text_length and is_numeric(text[num_end]):
                num_end += 1
            num_str = text[num_start:num_end].replace("点", ".")

            if num_str:
                try:
                    result[key] = float(num_str)
                except ValueError:
                    # Stray punctuation or a malformed number: treat the key
                    # as having no value rather than aborting the extraction.
                    result[key] = None
            break  # this key is done

    return result

def find_and_extract_contextual_event_info(file_path):
    """
    Locate "A<number> ... C<number>" groups in a file and describe each one.

    The file is treated as one continuous string. A hit is an "A" with a digit
    among the 5 characters after it, followed within 20 characters by a "C"
    that also has a digit among the 5 characters after it. For every hit the
    lines around it are collected, printed, and reduced to a dict with the
    keys "line_range", "event" and "A"/"B"/"C"/"D" (values extracted by
    ``extract_number_after_keys`` and stored as strings).

    :param file_path: path of the UTF-8 text file to scan
    :return: list of event dicts; ``None`` when nothing matched
    """
    # Read the file, join it into one string and remember which line every
    # character belongs to.
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    full_text = "".join(lines)  # all lines concatenated
    line_indices = []  # 1-based line number of every character in full_text
    current_line = 1  # running line counter

    for line in lines:
        line_indices.extend([current_line] * len(line))  # one entry per character
        current_line += 1

    results = []
    text_length = len(full_text)
    i = 0  # scan position in full_text

    while i < text_length:
        if full_text[i] == 'A':
            # The 5 characters after A (may cross line boundaries)
            end_index = min(i + 6, text_length)
            next_5_chars = full_text[i + 1:end_index]
            contains_digit = any(char.isdigit() for char in next_5_chars)

            if contains_digit:
                # A plus the 20 characters after it (may cross line boundaries).
                # Despite its name, next_15_chars covers up to 21 characters; it
                # is only referenced by the commented-out line further down.
                search_range_end = min(i + 21, text_length)
                next_15_chars = full_text[i:search_range_end]

                # Look for 'C' inside that range
                for j in range(i, search_range_end):
                    if full_text[j] == 'C':
                        # The 5 characters after C (may cross line boundaries)
                        c_end_index = min(j + 6, text_length)
                        c_next_5_chars = full_text[j + 1:c_end_index]
                        if any(char.isdigit() for char in c_next_5_chars):
                            # Line range covered by the match
                            start_line = line_indices[i]  # line containing A
                            end_line = line_indices[min(search_range_end - 1, text_length - 1)]  # line at the end of the search range

                            # Start index of the 25 characters before A
                            a_start_index = max(i - 25, 0)
                            a_start_line = line_indices[a_start_index]  # line where that context starts
                            # NOTE(review): this slice ends at index start_line, so it
                            # includes the line containing A, which is also part of
                            # full_matched_lines below - confirm that is intended.
                            before_a_lines = lines[a_start_line - 1:start_line]

                            # End index of the 25 characters after C
                            c_end_index = min(j + 26, text_length)  # +1 so the 25th character after C is included
                            c_end_line = line_indices[c_end_index - 1]  # line where that context ends
                            after_c_lines = lines[end_line:c_end_line]  # lines after the matched lines

                            # The complete lines spanned by the match
                            full_matched_lines = lines[start_line - 1:end_line]

                            # Record the match as one text block with three headed sections
                            results.append(
                                # f"匹配内容: {next_15_chars.strip()} (行 {start_line} - {end_line})\n"
                                f"————上文所在行: ({a_start_line} - {start_line - 1})————\n"
                                f"{''.join(before_a_lines).strip()}\n"
                                f"————匹配所在行:({start_line} - {end_line})————\n"
                                f"{''.join(full_matched_lines).strip()}\n"
                                f"————下文所在行:({end_line + 1} - {c_end_line})————\n"
                                f"{''.join(after_c_lines).strip()}\n"
                            )
                            break  # first qualifying C ends the inner loop

        i += 1  # move on to the next character

    # Report the results
    if results:
        events_list = []
        # One dict is reused for every result; a copy is appended each time.
        event = {}
        for result in results:
            print("\n")
            print("-"*60)
            print(result)
            print("-"*60)
            print("\n")
            result_text = []
            event["line_range"] = None
            event["event"] = None

            for text in result.split("\n"):
                # The three section headers are recognised by substring.
                if "———上文所在行" in text:
                    continue
                elif "———下文所在行" in text:
                    continue
                elif "———匹配所在行" in text:
                    # Locate the parentheses
                    start_pos = text.find('(') + 1  # index just after the first '('
                    end_pos = text.find(')')        # index of the first ')'
                    # Keep the text between them, e.g. "12 - 13"
                    line_range = text[start_pos:end_pos]
                    event["line_range"] = line_range
                    # Split into start line and end line
                    # start_line, end_line = line_range.split(' - ')
                else:
                    result_text.append(text)
                    if '*' in text:
                        # Locate the two '*' characters
                        start_pos = text.find('*') + 1  # just after the first '*'
                        end_pos = text.find('*', start_pos)  # position of the second '*'
                        # Keep the text between them as the event name
                        substring = text[start_pos:end_pos]
                        event["event"] = substring

            # Print the collected text and extract the A/B/C/D values from it
            print("".join(result_text))
            parameter_result = extract_number_after_keys("".join(result_text))

            for one_parameter in parameter_result:
                # Values are stored as strings; a missing value becomes "None".
                event[one_parameter] = str(parameter_result[one_parameter])
            for one_parameter in event:
                print(one_parameter + " : " + str(event[one_parameter]))
            events_list.append(event.copy())

        return events_list
    else:
        # No match: a message is printed and the function returns None.
        print("未找到符合条件的匹配内容。")

def extract_time_stamps_from_txt(file_path):
    """Map 1-based line numbers of a transcript file to their timestamp text.

    Blank lines are skipped but still counted, so the keys are the real line
    numbers in the file. For every other line the text before the first tab
    is taken (the whole line when it has no tab), cut at the first ``chunk``
    and stripped of surrounding spaces and square brackets. A line such as
    ``[00:00:32 - 00:00:34]<TAB>text`` is therefore stored as
    ``00:00:32 - 00:00:34``.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping line number (int) to the cleaned timestamp string.
    """
    stamps = {}
    with open(file_path, "r", encoding="utf-8") as handle:
        for number, raw_line in enumerate(handle, start=1):
            content = raw_line.strip()
            if content == "":
                continue
            # Text before the first tab, then before the first "chunk" marker.
            prefix = content.partition("\t")[0].split("chunk")[0]
            stamps[number] = prefix.strip(" ").strip("[").strip("]")
    return stamps


# ---------------------------------------------------------------------------
# Script body: each stage reads the previous stage's file and writes the next
# one; the file name grows by one "_<n>" suffix per stage
# (20250312_104423.txt -> ..._0.txt -> ..._0_1.txt -> ... -> ..._0_1_2_3_4_5.txt).
# The helpers called here are defined elsewhere in this file; what each one
# does is only summarised from its name unless stated otherwise.
# ---------------------------------------------------------------------------
print("-"*60)
# Stage 0: keep only the text around the target characters (A, B, C, D),
# expanded to whole lines and kept in the original order.
input_text_0 = "20250312_104423.txt"
output_filename_1 = input_text_0.split(".")[0] + "_0.txt"
extract_snippets_no_duplicates(input_text_0, output_filename_1)
replace_say_and_dianer_in_file(output_filename_1)
print("已完成精简(保留 A B C D 附近字符)")
print("-"*60)


print("-"*60)
# Stage 1: output_filename_1 is rebound to the "_1" file; every correction
# below rewrites that file in place (same path as input and output).
input_text_1 = output_filename_1
output_filename_1 = input_text_1.split(".")[0] + "_1.txt"
# NOTE(review): corrected_text is assigned repeatedly but never read in the
# visible part of the script.
corrected_text = correct_yao_in_text(input_text_1, output_filename_1)
print("已完成 yao 修正")
print("-"*60)

print("-"*60)
replace_ai_with_AI(output_filename_1, output_filename_1)
corrected_text = replace_bi_with_B(output_filename_1, output_filename_1)
print("-"*60)

print("-"*60)
corrected_text = replace_di_with_D(output_filename_1, output_filename_1)
print("-"*60)

print("-"*60)
corrected_text = replace_ling_with_target(output_filename_1, output_filename_1)
print("-"*60)

print("-"*60)
corrected_text = replace_shi_with_target(output_filename_1, output_filename_1)
print("-"*60)

print("-"*60)
corrected_text = replace_ba_with_eight(output_filename_1, output_filename_1)
mark_time_gaps(output_filename_1)
print("-"*60)

print("-"*60)
# Stage 2: B/D confusion correction, written to the "_2" file.
input_file_2 = output_filename_1
output_file_2 = input_file_2.split(".")[0] + "_2.txt"
process_bd_correction(input_file_2, output_file_2, context_print_window=20)
print("已完成 BD混淆 修正")
print("-"*60)


print("-"*60)
# Stage 3: keyword replacement, written to the "_3" file.
input_file_3 = output_file_2  # input file
output_file_3 = input_file_3.split(".")[0] + "_3.txt"  # output file
process_file_replace_keywords(input_file_3, output_file_3)
# Remember the last file produced before remove_timestamps_and_indexes runs;
# the script later reads the per-line timestamps back from this path.
remove_timestamps_and_indexes_before_file = output_file_3
print("\n✅ 处理完成,结果已保存到:", output_file_3)
print("已完成 关键词匹配 修正")
print("-"*60)


print("-"*60)
# Stage 4: strip timestamps and indexes into the "_4" file.
input_file_4 = output_file_3
output_file_4 = input_file_4.split(".")[0] + "_4.txt"
remove_timestamps_and_indexes(input_file_4, output_file_4)
print("-"*60)

print("-"*60)
# The corrections are run again on the "_4" file, some of them several times
# in a row. NOTE(review): presumably one pass can expose new matches for the
# next one - confirm before collapsing the repeats.
corrected_text = correct_yao_in_text(output_file_4, output_file_4)
corrected_text = correct_yao_in_text(output_file_4, output_file_4)
corrected_text = correct_yao_in_text(output_file_4, output_file_4)
print("已完成 yao 修正")
print("-"*60)

print("-"*60)
replace_ai_with_AI(output_file_4, output_file_4)
corrected_text = replace_bi_with_B(output_file_4, output_file_4)
print("-"*60)

print("-"*60)
corrected_text = replace_di_with_D(output_file_4, output_file_4)
print("-"*60)

print("-"*60)
corrected_text = replace_ling_with_target(output_file_4, output_file_4)
corrected_text = replace_ling_with_target(output_file_4, output_file_4)
print("-"*60)

print("-"*60)
corrected_text = replace_shi_with_target(output_file_4, output_file_4)
print("-"*60)

print("-"*60)
corrected_text = replace_ba_with_eight(output_file_4, output_file_4)
corrected_text = replace_ba_with_eight(output_file_4, output_file_4)
corrected_text = replace_ba_with_eight(output_file_4, output_file_4)
print("-"*60)

# Disabled second B/D confusion pass on the "_4" file, kept for reference:
# print("-"*60)
# process_bd_correction(output_file_4, output_file_4, context_print_window=20)
# print("已完成 BD混淆 修正")
# print("-"*60)


print("-"*60)
# Stage 5: read the "_4" file and convert Chinese numerals to Arabic digits,
# written to the "_5" file.
input_file_5 = output_file_4
output_file_5 = input_file_5.split(".")[0] + "_5.txt"
convert_chinese_number_to_arabic(input_file_5, output_file_5)
print("阿拉伯转换完成!请查看 " + output_file_5)

print("-"*60)
# input_file_5 now points at the "_5" file, which the next two helpers are
# given as their only argument.
input_file_5 = output_file_5
replace_act(input_file_5)
remove_spaces_from_file(input_file_5)
print("-"*60)

print("-"*60)
# Extract the marked events from the final "_5" file. Depending on which
# definition is in effect the result may be None when nothing matched.
file_path = input_file_5
events_list = find_and_extract_contextual_event_info(file_path)
print("提取关键事件+时间+各个参数完成!")
print("-"*60)

print("-"*60)
# Per-line timestamps are read from the file recorded in
# remove_timestamps_and_indexes_before_file.
file_path = remove_timestamps_and_indexes_before_file
time_dict = extract_time_stamps_from_txt(file_path)
file_name = os.path.basename(file_path)
time_in_beijing = extract_time_from_filename(file_name)
# The formatting call used to run before the None check, so a file name
# without a recording time crashed with AttributeError and the else branch
# could never execute.
if time_in_beijing:
    start_time = time_in_beijing.strftime("%H:%M:%S")
    print("录音时间(北京时间):", start_time)
else:
    # Without a recording start the offsets cannot be turned into wall-clock
    # times; fall back to midnight so the reported times are plain offsets.
    start_time = "00:00:00"
    print("未找到时间信息")

# The extractor may return None when nothing matched; treat that as "no
# events" so the loops below do not raise TypeError.
if events_list is None:
    events_list = []

# Replace each event's "start - end" line range by the wall-clock time of its
# first line (recording start + elapsed time of that line).
events_list_1 = []
for event in events_list:
    row_num = int(event['line_range'].split("-")[0])
    elapsed_time = time_dict[row_num].split("-")[0].strip()
    current_time = calculate_current_time(start_time, elapsed_time)
    print("Current time:", current_time)
    event['line_range'] = current_time
    events_list_1.append(event.copy())
print("-"*60)

print("-"*60)
for event in events_list:
    print("-"*60)
    for one_parameter, value in event.items():
        # "line_range" now holds the resolved time, so it is labelled "time".
        label = "time" if one_parameter == "line_range" else one_parameter
        print(label + " : " + str(value))
    print("-"*60)
print("-"*60)


output_txt_file = input_text_0.split(".")[0] + "_time_event_ABCD.txt"
separator_line = "-" * 60 + "\n"
with open(output_txt_file, "w", encoding="utf-8") as file:
    file.write(separator_line)
    for index, event in enumerate(events_list, start=1):
        # NOTE(review): numbering starts at 8 because of the hard-coded
        # offset of 7 - confirm this is intended.
        file.write(f"Event {7+index}\n")
        for one_parameter, value in event.items():
            label = "time" if one_parameter == "line_range" else one_parameter
            file.write(label + " : " + str(value) + "\n")
        file.write(separator_line)
    file.write(separator_line)
import os
from datetime import datetime, timedelta

# Medication / fluid names. They are also the tail of keywords_NO_ABCD below,
# so they are written out once here and appended there.
keyswords_Medication = [
    "复方电解质",
    "红细胞悬液",
    "人血白蛋白",
    "碳酸氢钠",
    "甘露醇",
    "乙基淀粉",
    "甲基强的松龙",
    "磷酸肌酸钠",
    "地塞米松",
    "利多卡因",
    "葡萄糖",
    "酚妥拉明",
    "呋塞米注射液",
    "氯化钾",
    "氯化钙",
    "去甲肾上腺素",
    "肝素钠",
    "血液回收血",
    "乌司他丁",
]

# Event keywords, grouped by template, followed by every medication name.
# The result is a separate list object, not an alias of keyswords_Medication.
keywords_NO_ABCD = [
    # CPB parallel-circulation template
    "开始装机",
    "装机开始",
    "开始CPB装机",
    "CPB装机开始",
    "开始预充排气",
    "预充排气开始",
    "开始安全检查",
    "安全检查开始",
    "开始主动脉插管",
    "主动脉插管开始",
    "CPB OF",
    "结束体外循环",
    "体外循环结束",
    "开始改良超滤",
    "改良超滤开始",
    "结束改良超滤",
    "改良超滤结束",
    # Conventional extracorporeal-circulation template
    "心肌保护液",
    # Selective cerebral perfusion template (no entries)
    # DHCA template
    "开始停循环",
    "停循环开始",
] + keyswords_Medication
def extract_number_after_keys(text):
    """Extract the number that follows each parameter letter in ``text``.

    The letters looked for are ``A``, ``B``, ``C``, ``D`` and ``Y``. When the
    text contains ``*``, only the part after the first ``*`` is searched.

    For each letter, starting at its first occurrence:

    1. Skip forward to the first digit, ``点`` or ``.``. Meeting another
       parameter letter or ``*`` on the way means this occurrence has no
       number of its own.
    2. If the same letter occurs again shortly afterwards (within 30
       characters when step 1 was interrupted, within 14 otherwise), restart
       from that later occurrence.
    3. Otherwise read the run of digits / ``点`` / ``.`` (``点`` becomes
       ``.``) and convert it to ``float``.

    A run that is not a valid number (for example a lone ``点`` or
    ``1.2.3``) leaves the letter at ``None`` instead of raising
    ``ValueError``.

    Args:
        text: The text to search.

    Returns:
        dict with the keys ``A``, ``B``, ``C``, ``D``, ``Y``; each value is a
        ``float`` or ``None`` when no usable number was found.
    """
    keys = ["A", "B", "C", "D", "Y"]
    result = {key: None for key in keys}
    if "*" in text:
        text = text.split('*', 1)[1]  # keep only what follows the first '*'

    number_marks = ("点", ".")

    for key in keys:
        index = text.find(key)
        while index != -1:
            num_start = index + 1

            # Advance to the number, noting whether another letter or '*'
            # sits between this letter and the number.
            has_other_key = False
            while num_start < len(text) and not (
                text[num_start].isdigit() or text[num_start] in number_marks
            ):
                if text[num_start] in keys or text[num_start] == "*":
                    has_other_key = True
                    break
                num_start += 1

            # Prefer a later occurrence of the same letter when one is close.
            lookahead = 30 if has_other_key else 14
            repeat_offset = text[index + 1:index + 1 + lookahead].find(key)
            if repeat_offset != -1:
                index += repeat_offset + 1
                continue

            # Read the numeric run; when step 1 was interrupted num_start
            # points at a non-numeric character and the run is empty.
            pieces = []
            while num_start < len(text) and (
                text[num_start].isdigit() or text[num_start] in number_marks
            ):
                pieces.append("." if text[num_start] == "点" else text[num_start])
                num_start += 1
            num_str = "".join(pieces)

            if num_str:
                try:
                    result[key] = float(num_str)
                except ValueError:
                    # Malformed run such as "." or "1.2.3": no value.
                    result[key] = None
            break

    return result

def find_time(s):
    """Find the last spoken clock time of the form ``<hour>点<minutes>`` in ``s``.

    The string is scanned from the end for ``点``. The hour is the run of
    digits / ``十`` directly before it (only its last two characters are
    kept); the minutes are the run of digits / ``刻`` directly after it. A
    ``点`` with no hour in front of it is skipped and the scan continues with
    earlier ones.

    If one of the two characters before the hour run or one of the two
    characters after the minutes is a parameter letter (``A``, ``B``, ``C``,
    ``D``, ``Y``), the number is treated as a parameter value such as
    ``A36点5`` and ``None`` is returned. Only positions that really exist in
    ``s`` are inspected: negative indices are no longer allowed to wrap to
    the end of the string, and a position past the end no longer cancels the
    remaining checks.

    Args:
        s: Text to search.

    Returns:
        ``None`` when no time is found or the candidate is rejected,
        otherwise a dict with

        * ``'time'``: ``"<hour>点<minutes>"``; the ``点`` separator is kept
          so the value can be split into hour and minutes again,
        * ``'start'``: index of the first character of the hour run,
        * ``'end'``: index just past the minutes.
    """
    parameter_letters = "ABCDY"
    length = len(s)

    for i in range(length - 1, -1, -1):
        if s[i] != '点':
            continue

        # Hour: digits / '十' immediately to the left of '点'.
        j = i - 1
        while j >= 0 and (s[j].isdigit() or s[j] == '十'):
            j -= 1
        hour_part = s[j + 1:i]
        if not hour_part:
            continue
        hour_part = hour_part[-2:]

        # Minutes: digits / '刻' immediately to the right of '点'.
        k = i + 1
        while k < length and (s[k].isdigit() or s[k] == '刻'):
            k += 1
        minute_part = s[i + 1:k]

        # Reject numbers that belong to a neighbouring parameter letter.
        for position in (j, j - 1, k, k + 1):
            if 0 <= position < length and s[position] in parameter_letters:
                return None

        return {
            'time': f"{hour_part}点{minute_part}",
            'start': j + 1,
            'end': k,
        }
    return None

def find_continuous_numbers(text):
    """Return the numeric runs found in ``text`` before the first ``|``.

    A run is a maximal sequence of digit characters and ``点``; every ``点``
    is written as ``.``. The runs are returned as strings in the order they
    appear and are not validated, so a lone ``点`` yields ``"."``.

    Args:
        text: Text to scan; everything from the first ``|`` on is ignored.

    Returns:
        list of str, one entry per run.
    """
    head = text.partition('|')[0]

    runs = []
    pending = []
    for symbol in head:
        if symbol == '点':
            pending.append('.')
            continue
        if symbol.isdigit():
            pending.append(symbol)
            continue
        # Any other character closes the run that is being collected.
        if pending:
            runs.append("".join(pending))
            pending = []

    if pending:
        runs.append("".join(pending))
    return runs

def _collect_context(full_text, position, step, limit):
    """Collect up to ``limit`` non-newline characters next to a marker.

    Starts at ``position`` and moves by ``step`` (``1`` forward, ``-1``
    backward). Newlines are skipped without being counted; the walk stops at
    the first ``*`` or at either end of the text. The characters are returned
    in reading order.
    """
    collected = []
    while limit > 0 and 0 <= position < len(full_text):
        char = full_text[position]
        if char == '*':
            break
        if char != '\n':
            collected.append(char)
            limit -= 1
        position += step
    if step < 0:
        collected.reverse()
    return "".join(collected)


def _keep_last_digits(value, count):
    """Return the last ``count`` digits of the integer part of ``value``."""
    return str(int(float(value)))[-count:]


def find_and_extract_contextual_event_info(file_path):
    """Extract every ``关键词*<event>*关键词`` marker together with its context.

    For each marker the function gathers

    * the 15 non-newline characters before the opening marker (stopping at a
      ``*``), which are searched for a spoken time with ``find_time``;
    * the 50 non-newline characters after the closing marker (stopping at a
      ``*``), which are searched for the A/B/C/D/Y values with
      ``extract_number_after_keys`` and, for the events ``激活全血凝固时间``
      and ``心肌保护液``, for plain numbers with ``find_continuous_numbers``.

    A report for every marker is printed to stdout.

    Fixes compared with the previous implementation:

    * an opening marker without a closing one ended the scan instead of
      looping forever;
    * a closing marker at the very end of the file no longer raises
      ``IndexError``;
    * the event name is the text of this marker, so a second marker on the
      same line no longer gets the first marker's name;
    * the preceding context starts before the marker instead of on its first
      character;
    * printing a ``B`` value that contains ``00`` no longer raises
      ``TypeError``;
    * an empty list (instead of ``None``) is returned when nothing matches,
      so callers can iterate the result unconditionally.

    Args:
        file_path: Path of the UTF-8 text file to scan.

    Returns:
        list of dict, one per marker in file order, with the keys (in this
        order) ``line_range`` (``"<start line> - <end line>"``, 1-based),
        ``event``, ``time`` (str or ``None``), ``num_results_list`` (list of
        str or ``None``) and ``A``, ``B``, ``C``, ``D``, ``Y`` (strings;
        ``"None"`` when absent). ``B`` is cut to its last two digits (three
        when its integer part contains ``00``) and ``D`` to its last three.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    full_text = "".join(lines)
    text_length = len(full_text)

    # line_indices[k] is the 1-based line number of character k of full_text.
    line_indices = []
    for line_number, line in enumerate(lines, start=1):
        line_indices.extend([line_number] * len(line))

    opener = '关键词*'
    closer = '*关键词'

    # ---- pass 1: locate the markers and cut out their context -------------
    matches = []
    search_from = 0
    while True:
        opener_pos = full_text.find(opener, search_from)
        if opener_pos == -1:
            break
        name_start = opener_pos + len(opener)
        name_end = full_text.find(closer, name_start)
        if name_end == -1:
            # Nothing closes this marker, so nothing can close a later one.
            break
        after_closer = name_end + len(closer)

        start_line = line_indices[opener_pos]
        # Clamp: the closing marker may be the last thing in the file.
        end_line = line_indices[min(after_closer, text_length - 1)]

        matches.append((
            full_text[name_start:name_end].strip(),
            start_line,
            end_line,
            lines[start_line - 1],
            _collect_context(full_text, opener_pos - 1, -1, 15),
            _collect_context(full_text, after_closer, 1, 50),
        ))
        search_from = opener_pos + 1

    if not matches:
        print("未找到符合条件的匹配内容。")
        return []

    # ---- pass 2: report each marker and derive its parameters -------------
    events_list = []
    for event_name, start_line, end_line, line_text, pre_info, post_info in matches:
        line_range = f"{start_line} - {end_line}"
        report = (
            "————上文———— " + pre_info + "\n"
            + "————匹配所在行:(" + line_range + ")匹配所在行———— "
            + " " * 16 + line_text + "\n"
            + "————下文———— " + post_info + "\n"
        )
        print("#"*60)
        print("\n")
        print("-"*60)
        print(report)
        print("-"*60)
        print("\n")

        pre_text = pre_info.strip()
        post_text = post_info.strip()

        time_info = find_time(pre_text)

        if event_name == "激活全血凝固时间" or event_name == "心肌保护液":
            num_results_list = find_continuous_numbers(post_text)
        else:
            num_results_list = None

        # Insertion order matters: callers write the keys out in this order.
        event = {
            "line_range": line_range,
            "event": event_name,
            "time": None if time_info is None else time_info["time"],
            "num_results_list": num_results_list,
        }
        for key, value in extract_number_after_keys(post_text).items():
            event[key] = str(value)

        for key in list(event):
            value = event[key]
            if key == "B" and value != "None":
                # Keep three digits for values such as 100, otherwise two.
                keep = 3 if "00" in str(int(float(value))) else 2
                event[key] = _keep_last_digits(value, keep)
                print(key + " : " + _keep_last_digits(event[key], keep))
            elif key == "D" and value != "None":
                event[key] = _keep_last_digits(value, 3)
                print(key + " : " + _keep_last_digits(event[key], 3))
            else:
                print(key + " : " + str(value))

        print("-"*60)
        print("-"*60)
        print("#"*60)
        events_list.append(event)

    return events_list

def extract_time_stamps_from_txt(file_path):
    """Return ``{line number: timestamp text}`` for the non-blank lines of a file.

    Line numbers are 1-based and count blank lines, which are left out of the
    result. The timestamp text is what precedes the first tab of the line
    (the whole line when it has no tab), cut at the first ``chunk`` and
    stripped of surrounding spaces and square brackets; for example
    ``[00:00:32 - 00:00:34]`` becomes ``00:00:32 - 00:00:34``.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        dict mapping int line numbers to timestamp strings.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        numbered = [(idx, text.strip()) for idx, text in enumerate(source, start=1)]

    return {
        idx: text.split("\t", 1)[0].split("chunk")[0].strip(" ").strip("[").strip("]")
        for idx, text in numbered
        if text  # blank lines carry no timestamp
    }

def extract_time_from_filename(file_path):
    """Parse the recording date and time from a ``YYYYMMDD_HHMMSS_...`` name.

    The name is split on ``_``; the first part must be 8 characters long and
    the second 6. Both are then parsed together as ``%Y%m%d%H%M%S``.

    Args:
        file_path: File name (without directories) to parse.

    Returns:
        A ``datetime`` on success, otherwise ``None``. ``None`` is also
        returned when the two parts have the right length but are not a valid
        date/time (for example letters or month 13); this used to escape as
        ``ValueError``.
    """
    parts = file_path.split("_")
    if len(parts) < 2:
        return None

    date_part, time_part = parts[0], parts[1]  # e.g. "20250306", "105401"
    if len(date_part) != 8 or len(time_part) != 6:
        return None

    try:
        return datetime.strptime(date_part + time_part, "%Y%m%d%H%M%S")
    except ValueError:
        # Right shape but not a real date/time: same result as "not found".
        return None

def calculate_current_time(start_time: str, elapsed_time: str) -> str:
    """Add an elapsed duration to a clock time.

    Only the time of day is returned, so a sum that passes midnight wraps
    around (``23:59:30`` + ``00:01:00`` gives ``00:00:30``).

    :param start_time: Start time in "HH:MM:SS" format.
    :param elapsed_time: Elapsed time as three ``:``-separated integers
        (hours, minutes, seconds).
    :return: The resulting time of day in "HH:MM:SS" format.
    """
    clock_format = "%H:%M:%S"
    base = datetime.strptime(start_time, clock_format)
    hours, minutes, seconds = (int(field) for field in elapsed_time.split(":"))
    shifted = base + timedelta(hours=hours, minutes=minutes, seconds=seconds)
    return shifted.strftime(clock_format)

def convert_time(time_str):
    """Normalise a spoken-style or clock-style time to zero-padded fields.

    ``点`` becomes ``:``, quarter-hour words become minutes (``1刻`` -> 15,
    ``2刻`` -> 30, ``3刻`` -> 45) and ``半`` becomes 30. Missing minutes are
    filled with ``00`` and an ``HH:MM`` value gets ``:00`` seconds appended.
    Every ``:``-separated field is then formatted as a two-digit integer, so
    ``9点`` becomes ``09:00:00`` and ``10:44:23`` is returned unchanged.

    ``3刻`` used to be mapped to 40 minutes; three quarters of an hour are
    45 minutes.

    Args:
        time_str: Time text such as ``9点3刻``, ``10点半`` or ``10:44:23``.

    Returns:
        The normalised string.

    Raises:
        ValueError: If a field is not an integer after the replacements.
    """
    # Applied in this order; the quarter-hour entries must run before any
    # digit is padded.
    replacements = (
        ('点', ':'),
        ('1刻', '15'),
        ('2刻', '30'),
        ('3刻', '45'),
        ('半', '30'),
    )
    for spoken, digits in replacements:
        time_str = time_str.replace(spoken, digits)

    # A trailing colon means the minutes were not spoken ("9点").
    if time_str.endswith(':'):
        time_str += '00'

    # "HH:MM" -> "HH:MM:00"
    if len(time_str.split(':')) == 2:
        time_str += ':00'

    return ':'.join(f'{int(part):02d}' for part in time_str.split(':'))

print("-"*60)
# Events are extracted from the fully processed "_5" file.
file_path = "20250312_104423_0_1_2_3_4_5.txt"
events_list = find_and_extract_contextual_event_info(file_path)
print("提取关键事件+时间+各个参数完成!")
print("-"*60)



print("-"*60)
# Per-line timestamps are read from the "_3" file.
file_path = "20250312_104423_0_1_2_3.txt"  # path of the TXT file
time_dict = extract_time_stamps_from_txt(file_path)
file_name = os.path.basename(file_path)
time_in_beijing = extract_time_from_filename(file_name)
# The formatting call used to run before the None check, so a file name
# without a recording time crashed with AttributeError and the else branch
# could never execute.
if time_in_beijing:
    start_time = time_in_beijing.strftime("%H:%M:%S")
    print("录音时间(北京时间):", start_time)
else:
    # Without a recording start the offsets cannot be turned into wall-clock
    # times; fall back to midnight so the reported times are plain offsets.
    start_time = "00:00:00"
    print("未找到时间信息")

# Replace each event's "start - end" line range by the wall-clock time of its
# first line (recording start + elapsed time of that line). The extractor may
# return None when nothing matched; that is treated as "no events".
events_list_1 = []
for event in events_list or []:
    row_num = int(event['line_range'].split("-")[0])
    elapsed_time = time_dict[row_num].split("-")[0].strip()
    current_time = calculate_current_time(start_time, elapsed_time)
    print("Current time:", current_time)
    event['line_range'] = current_time
    events_list_1.append(event.copy())
print("-"*60)


# Fill the gaps of every event: a missing spoken time falls back to the
# time derived from the recording, and a missing A/B/C/D/Y value is carried
# over from the event just before it in the list.
for position, event in enumerate(events_list_1):
    print("-"*60)
    if event["time"] in ("None", None):
        event["time"] = event['line_range']
    if position == 0:
        continue  # the first event has no predecessor to copy from
    previous_event = events_list_1[position - 1]
    for parameter in ("A", "B", "C", "D", "Y"):
        if event[parameter] in ("None", None):
            event[parameter] = previous_event[parameter]


# Normalise every time, then order the events by that string.
for event in events_list_1:
    event['time'] = convert_time(event['time'])
sorted_events = list(events_list_1)
sorted_events.sort(key=lambda item: item['time'])
for event in sorted_events:
    print(event)


# These two events are left out of the written report.
sorted_events_1 = [
    event
    for event in sorted_events
    if event["event"] not in ("激活全血凝固时间", "心肌保护液")
]


output_txt_file = file_path.split(".")[0] + "_time_event_ABCD.txt"
separator_line = "-" * 60 + "\n"
with open(output_txt_file, "w", encoding="utf-8") as file:
    file.write(separator_line)
    index_num = 1  # running number of the events actually written
    for event in sorted_events_1:
        if "序号" in event["event"]:
            continue
        file.write(f"Event {index_num}\n")
        index_num += 1
        for one_parameter, value in event.items():
            label = "line_range_time" if one_parameter == "line_range" else one_parameter
            file.write(label + " : " + str(value) + "\n")
        file.write(separator_line)
    file.write(separator_line)

你可能感兴趣的:(语音识别后处理)