参考资料:
1). pdfminer3k 是 pdfminer 的 Python 3 版本, 主要用于读取 PDF 中的文本。
2). python-docx 模块是读取和写入 Word 文档的工具。
from io import StringIO
import os
from docx import Document
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
import threading
# 多线程实现批量 PDF 转换器工具
# 提取PDF文件中的文字
def read_from_pdf(file_path):
    """Extract the text of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Everything the text converter wrote while the document was
        processed, as a single string.
    """
    resource_manager = PDFResourceManager()
    # The converter writes the extracted text into the in-memory buffer.
    with open(file_path, 'rb') as file, StringIO() as return_str:
        device = TextConverter(resource_manager, return_str,
                               laparams=LAParams())
        try:
            process_pdf(resource_manager, device, file)
        finally:
            # Close the converter even when parsing fails part-way through.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return return_str.getvalue()
def save_text_to_word(content, file_path):
    """Write text into a new Word document, one paragraph per line.

    Args:
        content: Text to store; it is split on newline characters.
        file_path: Destination path handed to the document's ``save``.
    """
    document = Document()
    for raw_line in content.split('\n'):
        # Control characters are stripped from each line before it is added.
        document.add_paragraph().add_run(remove_control_characters(raw_line))
    document.save(file_path)
def remove_control_characters(content):
    """Return ``content`` without any character whose code point is below 32.

    Args:
        content: The string to clean.

    Returns:
        A copy of ``content`` with code points 0 through 31 removed.
    """
    # Mapping a code point to None makes str.translate drop that character.
    deletions = {code_point: None for code_point in range(32)}
    return content.translate(deletions)
def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file into a Word document holding its text.

    Args:
        pdf_file_path: Path of the PDF to read.
        word_file_path: Path of the Word document to write.
    """
    # Extract the text first, then hand it straight to the Word writer.
    save_text_to_word(read_from_pdf(pdf_file_path), word_file_path)
    print('pdf to doc success')
def find_pdf():
    """Collect every ``.pdf`` file below the current working directory.

    Returns:
        A list of paths relative to the current directory. A file directly
        in the current directory is returned as its bare name; a file in a
        subdirectory keeps its directory part so it can still be opened.
    """
    file_name_list = []
    for dir_path, _dir_names, file_names in os.walk('.'):
        for file_name in file_names:
            if file_name.endswith('.pdf'):
                # os.walk descends into subdirectories, so the bare file
                # name alone would not locate files found below the top
                # level. normpath drops the leading './'.
                relative_path = os.path.normpath(
                    os.path.join(dir_path, file_name))
                file_name_list.append(relative_path)
    print('find pdf success')
    return file_name_list
if __name__ == '__main__':
    # Convert every PDF that find_pdf() reports, one thread per file.
    sourceFiles = find_pdf()
    thread = []
    for sourceFile in sourceFiles:
        # splitext removes only the extension. str.rstrip('.pdf') would
        # strip any trailing '.', 'p', 'd' or 'f' characters, turning
        # 'app.pdf' into 'a'.
        targetFile = os.path.splitext(sourceFile)[0] + '.docx'
        t = threading.Thread(target=pdf_to_word, args=(sourceFile, targetFile))
        thread.append(t)
        t.start()
    # Wait until every conversion thread has finished.
    for t in thread:
        t.join()
注意: 使用创建子类的方式实现多线程任务
from threading import Thread
import json
import requests
class GetHostAliveThread(Thread):
    """Thread that looks up and prints the city and country of one IP.

    The lookup requests ``http://ip-api.com/json/<ip>`` and reads the
    ``city`` and ``country`` fields of the JSON reply.
    """

    def __init__(self, ip):
        """Remember the IP address this thread will look up.

        Args:
            ip: The IP address to query.
        """
        super(GetHostAliveThread, self).__init__()
        self.ip = ip

    def _build_url(self):
        """Return the lookup URL for the IP stored on this thread."""
        return 'http://ip-api.com/json/%s' % (self.ip,)

    def run(self):
        """Query the service and print ``ip city country``.

        A failed request or an unparsable reply is reported on stdout and
        the thread ends without raising.
        """
        # Use the address stored on the instance: a module-level ``ip``
        # changes while the starting loop is still running.
        url = self._build_url()
        try:
            # The timeout keeps a stalled connection from blocking the
            # thread forever.
            response = requests.get(url, timeout=10)
            # The body is a JSON string; turn it into a dictionary.
            # Example reply:
            # {"city":"Beijing","country":"China","countryCode":"CN",
            #  "query":"114.114.114.114","status":"success", ...}
            data_dict = json.loads(response.text)
        except (requests.RequestException, ValueError) as e:
            print("网页获取错误:", e)
            return
        # Fall back to the string 'null' when a field is missing.
        city = data_dict.get('city', 'null')
        country = data_dict.get('country', 'null')
        print(self.ip, city, country)
if __name__ == '__main__':
    # Start one lookup thread for each address from 1.1.1.1 to 1.1.1.254.
    for last_octet in range(1, 255):
        ip = '1.1.1.{}'.format(last_octet)
        thread = GetHostAliveThread(ip)
        thread.start()
'''
注意: 使用实例化对象的方式实现多线程任务
项目描述: 如果要在本地网络中确定哪些地址处于活动状态或哪些计算机处于活动状态,
则可以使用此脚本。我们将依次 ping 每个地址, 每次都要等几秒钟才能得到返回值。这可以在 Python
中编程实现: 在 IP 地址范围内使用一个 for 循环, 并调用 os.popen("ping -q -c2 " + ip)。
项目瓶颈: 没有线程的解决方案效率非常低, 因为脚本必须等待每次 ping 完成。
'''
from threading import Thread
def task(ip):
    """Ping one host once and report it when the ping succeeds.

    Args:
        ip: Address handed to the ``ping`` command.

    Prints ``<ip>主机ping通`` when ``ping`` exits with status 0 and prints
    nothing otherwise.
    """
    import subprocess

    # Run ping directly with an argument list instead of through a shell.
    # The '&>' redirection is a bash extension; a POSIX /bin/sh runs the
    # command in the background and reports status 0 for every host.
    # Options as understood by Linux iputils ping: one request (-c1) with
    # a one-second deadline (-w1).
    command = ['ping', '-c1', '-w1', str(ip)]
    try:
        result = subprocess.run(command,
                                stdout=subprocess.DEVNULL,
                                stderr=subprocess.DEVNULL).returncode
    except OSError:
        # ping could not be started, so the host cannot be confirmed.
        return
    # Exit status 0 means the command ran without error; anything else
    # is a failure.
    if result == 0:
        print("%s主机ping通" % (ip,))
if __name__ == '__main__':
    print("打印172.25.254.0网段没有使用的IP地址".center(50, '*'))
    # Ping 172.25.254.1 through 172.25.254.10, one thread per address.
    threads = []
    for host_number in range(1, 11):
        ip = '172.25.254.' + str(host_number)
        worker = Thread(target=task, args=(ip,))
        # Start the thread right away, then keep it so it can be joined.
        worker.start()
        threads.append(worker)
    # Block until every ping thread has finished.
    for worker in threads:
        worker.join()
    print("任务执行结束.........")