用python搜索文件名并在其内容中查找指定字符串,txt,word,PPT,Excel,pdf格式均可实现

电脑自带的搜索功能无法搜索文件内容,因此写了一个可以同时搜索文件名和文件内容的 Python 脚本,支持正则表达式匹配。

先针对文本文件、Word、PPT、PDF、Excel 分别构建了一个函数,最后在整合函数中调用它们,进行搜索匹配。

我的测试目录下有这么几个文件:


微信图片_20230420130404.png

搜索文本文件

构建一个文本文件内容查找函数

import os
import re
#构建一个文本文件内容查找函数
def _scan_lines(file_path, search_str, encoding):
    """Return (first matching 1-based line number or None, matching line count)."""
    total_count = 0
    first_line = None
    with open(file_path, 'r', encoding=encoding) as f:
        # Iterate the file object directly so the whole file is never held
        # in memory at once.
        for line_no, line in enumerate(f, start=1):
            if re.search(search_str, line):
                total_count += 1
                if first_line is None:
                    first_line = line_no
    return first_line, total_count


def search_text(file_path, search_str):
    """Report which lines of a text file match a regular expression.

    The file is decoded as UTF-8 first; if that raises
    ``UnicodeDecodeError`` it is scanned again from the start as ISO-8859-1.
    When at least one line matches, one summary line is printed containing
    the file path, the 1-based number of the first matching line and the
    number of matching lines (a line with several hits counts once).

    Args:
        file_path: Path of the file to scan.
        search_str: Pattern string or compiled pattern handed to
            ``re.search``.

    Returns:
        None. The result goes to standard output; nothing is printed when
        no line matches.
    """
    try:
        first_line, total_count = _scan_lines(file_path, search_str, 'utf-8')
    except UnicodeDecodeError:
        # Partial UTF-8 results are discarded; the fallback rescans the file.
        first_line, total_count = _scan_lines(file_path, search_str, 'ISO-8859-1')
    if total_count > 0:
        print(f'{file_path}: 首次行: {first_line},  共匹配:{total_count}')

测试一下search_text

# Demo run of search_text. The Windows path is written as a raw string so
# that its backslashes are never interpreted as escape sequences.
file_path = r"E:\shell\python\search_file_and_content\example\Tsub_defined_marker_heatmap.r"
search_str = re.compile("mark.*", re.IGNORECASE)
search_text(file_path, search_str)

#E:\shell\python\search_file_and_content\example\Tsub_defined_marker_heatmap.r: 首次行: 1,  共匹配:17

搜索word文件

构建一个word文件内容查找函数

import re
import docx

def search_docx(file_path, search_str):
    """Search the paragraphs of a .docx file for a regular expression.

    Each paragraph of the document opened with ``docx.Document`` is scanned
    with ``findall``. If anything matched, one summary line is printed: the
    file path, the 1-based index of the first paragraph containing a match,
    and the total number of matches over all paragraphs.

    Args:
        file_path: Path of the Word document to open.
        search_str: Pattern string or compiled pattern; it is passed
            through ``re.compile`` before use.

    Returns:
        None. Nothing is printed when there is no match.
    """
    document = docx.Document(file_path)
    regex = re.compile(search_str)

    # Number of matches found in each paragraph, in document order.
    hits_per_paragraph = [
        len(regex.findall(paragraph.text)) for paragraph in document.paragraphs
    ]
    total_count = sum(hits_per_paragraph)
    if total_count == 0:
        return

    first_para = next(
        position
        for position, hits in enumerate(hits_per_paragraph, start=1)
        if hits
    )
    print(f"{file_path}  首次行: {first_para}  共匹配:{total_count} 次。")

测试一下search_docx

# Demo run of search_docx. The Windows path is written as a raw string so
# that its backslashes are never interpreted as escape sequences.
file_path = r"E:\shell\python\search_file_and_content\example\cellphoneDB可视化进阶-详细画图.docx"
search_str = re.compile("li.*nd", re.IGNORECASE)
search_docx(file_path, search_str)

#E:\shell\python\search_file_and_content\example\cellphoneDB可视化进阶-详细画图.docx  首次行: 13  共匹配:10 次。

搜索ppt文件

构建PPT文件内容查找函数

import os
import re
from pptx import Presentation

def search_pptx_file(file_path, search_str):
    """Search the text frames of a .pptx file for a pattern.

    Every shape that has a text frame is scanned with ``re.findall``. When
    something matches, a single line is printed with the file's base name,
    the 1-based number of the first slide containing a match and the total
    number of matches. ``search_str`` may be a regular expression.

    Args:
        file_path: Path of the presentation to open.
        search_str: Pattern string or compiled pattern. A falsy value
            (such as an empty string) disables the search entirely.

    Returns:
        None. Nothing is printed when there is no match.
    """
    presentation = Presentation(file_path)
    total_hits = 0
    first_slide = None

    for slide_no, slide in enumerate(presentation.slides, start=1):
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            frame = shape.text_frame
            if not search_str:
                continue
            hits = len(re.findall(search_str, frame.text))
            if hits == 0:
                continue
            total_hits += hits
            if first_slide is None:
                first_slide = slide_no

    if total_hits > 0:
        # Only the base name is reported, not the full path.
        short_name = os.path.basename(file_path)
        print(f"{short_name}  首次页码: {first_slide}  共匹配: {total_hits} 次")

测试一下search_pptx_file

# Demo run of search_pptx_file. The Windows path is written as a raw string
# so that its backslashes are never interpreted as escape sequences.
file_path = r"E:\shell\python\search_file_and_content\example\paper_PPT文献分享.pptx"
search_str = re.compile("single.*cell", re.IGNORECASE)
search_pptx_file(file_path, search_str)

## paper_PPT文献分享.pptx  首次页码: 1  共匹配: 1 次

搜索pdf文件

构建pdf文件内容查找函数

import re
import PyPDF2
#要求PyPDF2 3.0.0或以上,因为好几个函数名与低版本都不同了
def search_pdf_file(file_path, search_str):
    """Search the extracted text of each PDF page for a pattern.

    The file is read through ``PyPDF2.PdfReader``. A page counts as one
    match if ``re.search`` finds the pattern anywhere in its extracted
    text, so the reported total is the number of matching pages. When at
    least one page matches, one line is printed with the file path, the
    1-based number of the first matching page and that total.

    Args:
        file_path: Path of the PDF file to open.
        search_str: Pattern string or compiled pattern for ``re.search``.

    Returns:
        None. Nothing is printed when no page matches.
    """
    with open(file_path, 'rb') as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        # 1-based numbers of all pages whose text contains the pattern.
        matching_pages = [
            page_no
            for page_no, page in enumerate(reader.pages, start=1)
            if re.search(search_str, page.extract_text())
        ]
        if matching_pages:
            first_page = matching_pages[0]
            matches = len(matching_pages)
            print(f"{file_path}  首次页码: {first_page}  共匹配: {matches} 次")

测试一下search_pdf_file

# Demo run of search_pdf_file.
# The file name starts with a digit, so the path contains "\2". In a normal
# string literal that would be read as an octal escape sequence; the r prefix
# keeps the backslash and the digit as they are.
file_path = r"E:\shell\python\search_file_and_content\example\2.Dotplot_TSubtype_marker.pdf"

search_str = re.compile(".*FOXP3", re.IGNORECASE)
search_pdf_file(file_path, search_str)


## E:\shell\python\search_file_and_content\example\2.Dotplot_TSubtype_marker.pdf  首次页码: 1  共匹配: 1 次

搜索Excel文件

构建Excel文件内容查找函数

import openpyxl 
def search_xls_file(file_path, search_str):
    """Search the string cells of every sheet in a workbook for a pattern.

    The workbook is opened with ``openpyxl.load_workbook`` and each sheet is
    scanned row by row. Only cells whose value is a ``str`` are tested. For
    every sheet with at least one matching cell, one line is printed with
    the file path, the coordinate of the first matching cell and the number
    of matching cells in that sheet.

    Args:
        file_path: Path of the workbook to open.
        search_str: Pattern string or compiled pattern for ``re.search``.

    Returns:
        None. Sheets without a match produce no output.
    """
    wb = openpyxl.load_workbook(file_path)
    for sheet_name in wb.sheetnames:
        sheet = wb[sheet_name]
        # Counters are per sheet, so each sheet gets its own summary line.
        total_count = 0
        first_cell = None
        for row in sheet.rows:
            for cell in row:
                value = cell.value
                # The pattern is evaluated once per cell.
                if isinstance(value, str) and re.search(search_str, value):
                    total_count += 1
                    if first_cell is None:
                        first_cell = cell.coordinate
        if total_count > 0:
            print(f'{file_path}: 首次单元格: {first_cell}  共匹配:{total_count} 次')

测试一下search_xls_file

# Demo run of search_xls_file with a case-insensitive pattern.
search_str = re.compile(".*FOXP3", re.IGNORECASE)
file_path = r"E:\shell\python\search_file_and_content\example\Tsub_Marker_for_heatmap.xlsx"
search_xls_file(file_path, search_str)

## E:\shell\python\search_file_and_content\example\Tsub_Marker_for_heatmap.xlsx: 首次单元格: B18  共匹配:1 次

整合为一个函数

将上面的函数整合,实现文件查找,并可以对内容进行搜索

def search_files(directory, keyword, search_str):
    """Walk a directory tree and search the content of name-matching files.

    Every file under ``directory`` whose name matches ``keyword`` is handed
    to the searcher for its type: ``.docx``, ``.pptx``, ``.pdf`` and
    ``.xlsx`` files go to the dedicated functions, everything else is
    scanned as plain text. Both patterns are compiled case-insensitively,
    and the extension is also compared case-insensitively, so ``A.PDF`` is
    handled like ``a.pdf``.

    Args:
        directory: Root directory to walk recursively.
        keyword: Regular expression (string) matched against file names.
        search_str: Regular expression (string) searched for in the content.

    Returns:
        None. Results are printed by the individual search functions.
    """
    keyword = re.compile(keyword, re.IGNORECASE)
    search_str = re.compile(search_str, re.IGNORECASE)

    # Extension -> searcher; files with any other name are treated as text.
    handlers = (
        ('.docx', search_docx),
        ('.pptx', search_pptx_file),
        ('.pdf', search_pdf_file),
        ('.xlsx', search_xls_file),
    )

    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not keyword.search(file):
                continue
            file_path = os.path.join(root, file)
            # Lower-case the name so upper- or mixed-case extensions are
            # still routed to the right searcher.
            lowered = file.lower()
            for suffix, handler in handlers:
                if lowered.endswith(suffix):
                    handler(file_path, search_str)
                    break
            else:
                search_text(file_path, search_str)

测试整合函数search_files

# Demo run of search_files: the first pattern selects files by name, the
# second is searched for inside each selected file.
search_str = "Seu.*t"
keyword = ".*marker.*heatmap"
directory = r"E:\shell\python\search_file_and_content\example"
search_files(directory=directory, keyword=keyword, search_str=search_str)

## E:\shell\python\search_file_and_content\example\Tsub_defined_marker_heatmap.r: 首次行: 2,  共匹配:1
## E:\shell\python\search_file_and_content\example\subdir1\3.4.4.celltype_marker_heatmap_dotplot.r: 首次行: 2,  共匹配:2
## E:\shell\python\search_file_and_content\example\subdir1\subdir2\2_marker heatmap_in_single_cell.r: 首次行: 23,  共匹配:1

你可能感兴趣的:(用python搜索文件名并在其内容中查找指定字符串,txt,word,PPT,Excel,pdf格式均可实现)