电脑自带的搜索功能无法对文件内容进行搜索,于是写了一个可以同时搜索文件名和文件内容的 Python 脚本,支持正则表达式匹配。
先针对文本文件、Word、PPT、PDF、Excel 分别构建了一个函数,最后在一个整合函数中统一调用,进行搜索匹配。
我的测试目录下有这么几个文件:
搜索文本文件
构建一个文本文件内容查找函数
import os
import re
#构建一个文本文件内容查找函数
def _scan_text_lines(file_path, search_str, encoding):
    """Return (first matching line number, matching line count) for one decoding attempt."""
    total_count = 0
    first_line = None
    with open(file_path, 'r', encoding=encoding) as f:
        # Iterate the file lazily; a decode error may surface mid-file, but
        # nothing has been reported yet, so the caller can simply retry.
        for line_no, line in enumerate(f, start=1):
            if re.search(search_str, line):
                total_count += 1
                if first_line is None:
                    first_line = line_no
    return first_line, total_count


def search_text(file_path, search_str):
    """Report how many lines of a text file match a regular expression.

    The file is decoded as UTF-8 first; if that raises ``UnicodeDecodeError``
    the whole file is scanned again as ISO-8859-1. When at least one line
    matches, one summary line is printed containing the path, the 1-based
    number of the first matching line and the number of matching lines
    (lines are counted, not individual matches within a line).

    Args:
        file_path: Path of the file to scan.
        search_str: Pattern string or compiled pattern handed to ``re.search``.

    Returns:
        None. The result is written to standard output.
    """
    try:
        first_line, total_count = _scan_text_lines(file_path, search_str, 'utf-8')
    except UnicodeDecodeError:
        first_line, total_count = _scan_text_lines(file_path, search_str, 'ISO-8859-1')
    if total_count > 0:
        print(f'{file_path}: 首次行: {first_line}, 共匹配:{total_count}')
测试一下search_text
# Quick check of search_text.
# The path is a raw string so the Windows backslashes stay literal; in a
# normal literal sequences such as "\s" or "\p" are invalid escapes.
file_path = r"E:\shell\python\search_file_and_content\example\Tsub_defined_marker_heatmap.r"
search_str = "mark.*"
search_str = re.compile(search_str, re.IGNORECASE)
search_text(file_path, search_str)
# Output:
#E:\shell\python\search_file_and_content\example\Tsub_defined_marker_heatmap.r: 首次行: 1, 共匹配:17
搜索word文件
构建一个word文件内容查找函数
import re
import docx
def search_docx(file_path, search_str):
    """Count regex matches in the body paragraphs of a .docx document.

    Every paragraph returned by ``doc.paragraphs`` is searched. If anything
    matches, one summary line is printed with the path, the 1-based index of
    the first paragraph containing a match, and the total number of matches
    over all paragraphs.

    Args:
        file_path: Path of the .docx file.
        search_str: Pattern string or compiled pattern.

    Returns:
        None. The result is written to standard output.
    """
    document = docx.Document(file_path)
    regex = re.compile(search_str)

    # Number of matches per paragraph, in document order.
    hits_per_paragraph = [
        sum(1 for _ in regex.finditer(paragraph.text))
        for paragraph in document.paragraphs
    ]
    hit_total = sum(hits_per_paragraph)
    if hit_total <= 0:
        return

    first_hit = next(
        position
        for position, hits in enumerate(hits_per_paragraph, start=1)
        if hits
    )
    print(f"{file_path} 首次行: {first_hit} 共匹配:{hit_total} 次。")
测试一下search_docx
# Quick check of search_docx.
# The path is a raw string so the Windows backslashes stay literal; in a
# normal literal sequences such as "\s" or "\c" are invalid escapes.
file_path = r"E:\shell\python\search_file_and_content\example\cellphoneDB可视化进阶-详细画图.docx"
search_str = "li.*nd"
search_str = re.compile(search_str, re.IGNORECASE)
search_docx(file_path, search_str)
# Output:
#E:\shell\python\search_file_and_content\example\cellphoneDB可视化进阶-详细画图.docx 首次行: 13 共匹配:10 次。
搜索ppt文件
构建PPT文件内容查找函数
import os
import re
from pptx import Presentation
def search_pptx_file(file_path, search_str):
    """Search the text frames of a .pptx presentation for a pattern.

    When the pattern is found, a single line is printed with the file's base
    name, the 1-based number of the first slide containing a match, and the
    total number of matches. The pattern may be a regular expression.

    Args:
        file_path: Path of the .pptx file.
        search_str: Pattern string or compiled pattern; a falsy value
            (e.g. an empty string) matches nothing.

    Returns:
        None. The result is written to standard output.
    """
    presentation = Presentation(file_path)
    hit_total = 0
    first_slide = -1  # -1 means "no match seen yet"

    for slide_no, slide in enumerate(presentation.slides, start=1):
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            frame = shape.text_frame
            if not search_str:
                continue
            found = re.findall(search_str, frame.text)
            if not found:
                continue
            hit_total += len(found)
            if first_slide == -1:
                first_slide = slide_no

    if hit_total <= 0:
        return
    # Only the base name is reported, not the full path.
    file_path = os.path.basename(file_path)
    print(f"{file_path} 首次页码: {first_slide} 共匹配: {hit_total} 次")
测试一下search_pptx_file
# Quick check of search_pptx_file.
# The path is a raw string so the Windows backslashes stay literal; in a
# normal literal sequences such as "\s" or "\p" are invalid escapes.
file_path = r"E:\shell\python\search_file_and_content\example\paper_PPT文献分享.pptx"
search_str = "single.*cell"
search_str = re.compile(search_str, re.IGNORECASE)
search_pptx_file(file_path, search_str)
# Output:
## paper_PPT文献分享.pptx 首次页码: 1 共匹配: 1 次
搜索pdf文件
构建pdf文件内容查找函数
import re
import PyPDF2
#要求PyPDF2 3.0.0或以上,因为好几个函数名与低版本都不同了
def search_pdf_file(file_path, search_str):
    """Report which pages of a PDF contain text matching a pattern.

    Each page's extracted text is tested once with ``re.search``, so the
    reported total is the number of matching pages, not the number of
    individual matches. If any page matches, one summary line is printed with
    the path, the 1-based number of the first matching page and that total.

    Args:
        file_path: Path of the PDF file.
        search_str: Pattern string or compiled pattern.

    Returns:
        None. The result is written to standard output.
    """
    matches = 0
    first_page = None
    # The reader pulls page data from the stream on demand, so the file must
    # stay open for the whole loop.
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page_num, page_obj in enumerate(pdf_reader.pages, start=1):
            # Guard against a page yielding no text object at all
            # (e.g. image-only pages) so re.search always gets a string.
            text = page_obj.extract_text() or ""
            if re.search(search_str, text):
                matches += 1
                if first_page is None:
                    first_page = page_num
    if matches > 0:
        print(f"{file_path} 首次页码: {first_page} 共匹配: {matches} 次")
测试一下search_pdf_file
# Quick check of search_pdf_file.
# The file name starts with a digit: in a normal string literal the sequence
# "\2" is read as an octal escape, which corrupts the path. The r prefix
# keeps every backslash literal.
file_path = r"E:\shell\python\search_file_and_content\example\2.Dotplot_TSubtype_marker.pdf"
search_str = re.compile(".*FOXP3", re.IGNORECASE)
search_pdf_file(file_path, search_str)
# Output:
## E:\shell\python\search_file_and_content\example\2.Dotplot_TSubtype_marker.pdf 首次页码: 1 共匹配: 1 次
搜索Excel文件
构建Excel文件内容查找函数
import openpyxl
def search_xls_file(file_path, search_str):
    """Search the string cells of every worksheet in an .xlsx workbook.

    For each worksheet that contains at least one matching cell, one summary
    line is printed with the workbook path, the coordinate of the first
    matching cell (row-major order) and the number of matching cells in that
    worksheet. Only cells whose value is a ``str`` are tested.

    Args:
        file_path: Path of the .xlsx file.
        search_str: Pattern string or compiled pattern.

    Returns:
        None. The result is written to standard output.
    """
    wb = openpyxl.load_workbook(file_path)
    # wb.worksheets skips chart sheets, which have no cell grid to iterate.
    for sheet in wb.worksheets:
        total_count = 0
        first_cell = None
        for row in sheet.rows:
            for cell in row:
                value = cell.value
                if isinstance(value, str) and re.search(search_str, value):
                    total_count += 1
                    if first_cell is None:
                        first_cell = cell.coordinate
        if total_count > 0:
            print(f'{file_path}: 首次单元格: {first_cell} 共匹配:{total_count} 次')
测试一下search_xls_file
# Quick check of search_xls_file on a sample workbook.
file_path = r"E:\shell\python\search_file_and_content\example\Tsub_Marker_for_heatmap.xlsx"
search_str = re.compile(".*FOXP3", re.IGNORECASE)
search_xls_file(file_path, search_str)
# Output:
## E:\shell\python\search_file_and_content\example\Tsub_Marker_for_heatmap.xlsx: 首次单元格: B18 共匹配:1 次
整合为一个函数
将上面的函数整合,实现文件查找,并可以对内容进行搜索
def search_files(directory, keyword, search_str):
    """Walk a directory tree and search the contents of files whose names match.

    Both patterns are compiled case-insensitively. Every file under
    ``directory`` whose name matches ``keyword`` is handed to the searcher
    for its type: .docx, .pptx, .pdf and .xlsx go to their dedicated
    functions, anything else is scanned as plain text.

    Args:
        directory: Root directory to walk recursively.
        keyword: Regular expression matched against each file name.
        search_str: Regular expression searched for inside matching files.

    Returns:
        None. Each searcher prints its own result line.
    """
    keyword = re.compile(keyword, re.IGNORECASE)
    search_str = re.compile(search_str, re.IGNORECASE)
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not keyword.search(file):
                continue
            file_path = os.path.join(root, file)
            # Compare extensions case-insensitively so that e.g. "REPORT.PDF"
            # is parsed as a PDF instead of being scanned as raw text.
            lowered = file.lower()
            if lowered.endswith('.docx'):
                search_docx(file_path, search_str)
            elif lowered.endswith('.pptx'):
                search_pptx_file(file_path, search_str)
            elif lowered.endswith('.pdf'):
                search_pdf_file(file_path, search_str)
            elif lowered.endswith('.xlsx'):
                search_xls_file(file_path, search_str)
            else:
                search_text(file_path, search_str)
测试整合函数search_files
# Quick check of the combined search_files entry point: the file-name filter
# and the content pattern are both regular expressions.
directory = r"E:\shell\python\search_file_and_content\example"
keyword = ".*marker.*heatmap"
search_str = "Seu.*t"
search_files(directory=directory, keyword=keyword, search_str=search_str)
# Output:
## E:\shell\python\search_file_and_content\example\Tsub_defined_marker_heatmap.r: 首次行: 2, 共匹配:1
## E:\shell\python\search_file_and_content\example\subdir1\3.4.4.celltype_marker_heatmap_dotplot.r: 首次行: 2, 共匹配:2
## E:\shell\python\search_file_and_content\example\subdir1\subdir2\2_marker heatmap_in_single_cell.r: 首次行: 23, 共匹配:1