"""
从目标网址下载付费文档,并保存为 Word(.docx)文档。
目标网址:见文件末尾 YuanLC(url=...) 调用处的示例链接。
环境:python3.6 + selenium + requests + python-docx + pywin32
安装(推荐使用清华源):
    pip install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple/
    pip install requests -i https://pypi.tuna.tsinghua.edu.cn/simple/
    pip install python-docx -i https://pypi.tuna.tsinghua.edu.cn/simple/
    pip install pywin32 -i https://pypi.tuna.tsinghua.edu.cn/simple/
"""
import os
import time

import requests
from docx import Document
from docx.shared import Inches
from selenium import webdriver
from selenium.webdriver.common import keys
from selenium.webdriver.common.by import By
from win32com.client import constants, gencache
class YuanLC:
    """Scrape the page images of a book118 document preview into a .docx file.

    Constructing an instance performs the whole job: it opens the page in
    Chrome, expands the preview, scrolls so the page images get loaded,
    downloads every image and appends it to a Word document that is saved
    under a name derived from the page heading.
    """

    # Scroll distance treated as "one page" by the loading loops
    # (presumably the rendered height of one preview page — TODO confirm).
    _PAGE_SCROLL_PX = 1260
    # Distance moved per scroll step while loading images.
    _SCROLL_STEP_PX = 50

    def __init__(self, url):
        """Prepare the HTTP session, document and browser, then run the job.

        Args:
            url: Address of the document page to download.
        """
        self.filename = None
        # Target path used by createpdf(); None means "next to the .docx,
        # same base name".
        self.pdfPath = None
        self.url = url

        # Headers captured from a real browser session.
        # NOTE(review): the Cookie value is a hard-coded snapshot and the
        # Host header pins every request to view-cache.book118.com — confirm
        # both are still valid for the image URLs being fetched.
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,a"
                      "pplication/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "CLIENT_SYS_UN_ID=3rvgCl9XYO1u41DVBzG/Ag==; s_v=cdh%3D%3E27a30245%7C%7C%7Cvid%3D%3E1599561"
                      "968279953439%7C%7C%7Cfsts%3D%3E1599561968%7C%7C%7Cdsfs%3D%3E0%7C%7C%7Cnps%3D%3E1; s_s=cdh%3"
                      "D%3E27a30245%7C%7C%7Clast_req%3D%3E1599561968%7C%7C%7Csid%3D%3E1599561968685697441%7C%7C%7Cd"
                      "sps%3D%3E0; __cfduid=dcce463c0931f0014f9ed1b030e9c47981599561968",
            "Host": "view-cache.book118.com",
            "Pragma": "no-cache",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/8"
                          "3.0.4103.106 Safari/537.36",
        }
        # Session used to fetch the page images.
        self.session = requests.session()
        self.session.headers = headers

        # Word document that collects one picture per downloaded image.
        self.doc = Document()

        # Browser driver used to render the page and reveal the image URLs.
        self.driver = webdriver.Chrome()
        try:
            self.driver.implicitly_wait(10)
            self.run()
        finally:
            # Always close Chrome, even when scraping fails half-way.
            self.driver.quit()

    @staticmethod
    def _docx_name(title):
        """Build the output file name from the page heading text.

        Drops the last seven characters of the heading and appends 'docx'
        (presumably replacing a trailing suffix after the dot — TODO confirm
        against the live page).
        """
        return title[:-7] + 'docx'

    def _scroll_through(self, start, stop, pause):
        """Scroll the window from *start* to *stop* in fixed steps, sleeping *pause* seconds per step."""
        for offset in range(start, stop, self._SCROLL_STEP_PX):
            self.driver.execute_script("window.scrollTo(0, %s)" % str(offset))
            time.sleep(pause)

    def get_src(self):
        """Open the page, expand the preview, load the images and collect the page containers.

        Also sets ``self.filename`` from the page heading.

        Returns:
            The list of elements matching ``div.webpreview-item``; each one is
            expected to contain the ``img`` read by download().
        """
        self.driver.get(self.url)
        title = self.driver.find_element(By.XPATH, '//*[@id="main"]/div[1]/div[1]/h1').text
        self.filename = self._docx_name(title)

        # Keep clicking the button area until its label reads '下载文档'.
        while True:
            tag = self.driver.find_element(By.XPATH, '//div[@class="btns"]')
            if tag.text == '下载文档':
                break
            # Bring the button into view before clicking it.
            self.driver.execute_script("arguments[0].scrollIntoView();", tag)
            tag.click()

        # Page counter text with its last character removed gives the page count.
        total_pages = int(self.driver.find_element(By.ID, 'pagenumber').text[:-1])
        bottom = total_pages * self._PAGE_SCROLL_PX

        # Back to the top, then scroll down so the page images get loaded.
        self.driver.execute_script("var q=document.documentElement.scrollTop=0")
        # Increase the pauses on a slow connection.
        self._scroll_through(self._PAGE_SCROLL_PX, bottom, 0.02)
        # Second, slower pass starting six scroll units down.
        self._scroll_through(self._PAGE_SCROLL_PX * 6, bottom, 0.035)

        return self.driver.find_elements(By.XPATH, '//div[@class="webpreview-item"]')

    def download(self, src):
        """Fetch the image shown inside one page container.

        Args:
            src: Element containing an ``img`` tag.

        Returns:
            The ``requests`` response for the image, or ``None`` when the
            ``img`` has no ``src`` value.
        """
        url = src.find_element(By.CSS_SELECTOR, 'img').get_attribute('src')
        print(url)
        if not url:
            return None
        return self.session.get(url)

    def createword(self, res):
        """Append one downloaded image to the document and save it.

        Args:
            res: Response returned by download(); ``None`` is ignored.
        """
        if res is None:
            return
        # Best effort: a page that cannot be written is reported and skipped
        # so the remaining pages still end up in the document.
        try:
            # The image goes through a scratch file that is overwritten each time.
            with open('1.png', 'wb') as f:
                f.write(res.content)
            # Fixed 6 x 8 inch picture, intended to fill one page.
            self.doc.add_picture("1.png", width=Inches(6), height=Inches(8))
        except Exception as e:
            print('1.png 写入失败,原因是:%s' % str(e))
        # Saved after every page, so an interrupted run keeps what it has.
        self.doc.save(self.filename)

    def createpdf(self):
        """Export the saved .docx as PDF through Word COM automation.

        The PDF is written to ``self.pdfPath`` when set, otherwise next to
        the .docx with a ``.pdf`` extension.

        Raises:
            ValueError: If no document has been saved yet (``self.filename``
                is unset).
        """
        if not self.filename:
            raise ValueError('no document has been saved yet; filename is not set')
        # Word resolves relative paths against its own default directory,
        # so hand it absolute paths only.
        docx_path = os.path.abspath(self.filename)
        pdf_path = os.path.abspath(self.pdfPath or os.path.splitext(docx_path)[0] + '.pdf')

        word = gencache.EnsureDispatch('Word.Application')
        try:
            # Early-bound COM wrappers and constants are case-sensitive.
            # A local variable keeps self.doc (the python-docx document) intact.
            word_doc = word.Documents.Open(docx_path, ReadOnly=1)
            word_doc.ExportAsFixedFormat(
                pdf_path,
                constants.wdExportFormatPDF,
                Item=constants.wdExportDocumentWithMarkup,
                CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
            )
        finally:
            # Never leave a WINWORD process behind.
            word.Quit(constants.wdDoNotSaveChanges)

    def run(self):
        """Download every page image and add it to the Word document."""
        for src in self.get_src():
            self.createword(self.download(src))
        # NOTE: PDF export is not part of the default run; call createpdf()
        # explicitly after the .docx has been saved if a PDF is wanted.
# Script entry: constructing the downloader runs the whole job for this sample document.
YuanLC(url='https://max.book118.com/html/2021/0106/6232223004003045.shtm')
# ^_^
# 有帮助就给个赞吧~~~