Python Scraping in Practice: Downloading Paid Documents from 原创力文档 (book118), Full-Screen Preview Mode


I. Project Requirements

Download a paid document from the target URL and save it as a Word file.
Target URL: the page passed as `url` in the code below (https://max.book118.com/html/2015/0504/16361645.shtm).

II. Approach

  • 1. Use selenium to drive the page, trigger its asynchronous (lazy) image loading, and collect the image URLs (a minimal scrolling sketch follows this list)
  • 2. Download the images
  • 3. Write the images into a Word document
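Step 1 works because the preview page only renders each page image once it scrolls into view, so the script has to scroll the window down in small increments before the img nodes receive a real src. A minimal sketch of that idea, using a placeholder URL and a generic //img locator rather than the real page structure:

import time
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://example.com/lazy-loaded-page")   # placeholder URL, not the real document page

# Scroll down in small steps so the page's JavaScript keeps loading new images
for offset in range(0, 12000, 50):
    driver.execute_script("window.scrollTo(0, %s)" % offset)
    time.sleep(0.02)

# After scrolling, the img nodes should carry usable src attributes
for img in driver.find_elements_by_xpath('//img'):
    print(img.get_attribute('src'))

driver.quit()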

III. Technical Points

  • 1. python + selenium automation
  • 2. python + docx

IV. Environment

python3.6 + selenium + docx

Installation (the Tsinghua PyPI mirror is recommended; a quick install check follows the commands):
   pip install selenium -i https://pypi.tuna.tsinghua.edu.cn/simple/
   pip install python-docx -i https://pypi.tuna.tsinghua.edu.cn/simple/
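Before running the full script, it can help to confirm that python-docx imports under the name docx and can write a file. A minimal check (the file names here are arbitrary examples):

from docx import Document
from docx.shared import Inches

doc = Document()
doc.add_paragraph("python-docx is working")
# doc.add_picture("1.png", width=Inches(6), height=Inches(8))  # same call the script below uses per page image
doc.save("test.docx")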

V. Code

import time

import requests
from selenium import webdriver
from docx import Document
from docx.shared import Inches


class YuanLC:
    def __init__(self, url, filename):

        # Create a requests session for downloading the page images
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,a"
                      "pplication/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "CLIENT_SYS_UN_ID=3rvgCl9XYO1u41DVBzG/Ag==; s_v=cdh%3D%3E27a30245%7C%7C%7Cvid%3D%3E1599561"
                      "968279953439%7C%7C%7Cfsts%3D%3E1599561968%7C%7C%7Cdsfs%3D%3E0%7C%7C%7Cnps%3D%3E1; s_s=cdh%3"
                      "D%3E27a30245%7C%7C%7Clast_req%3D%3E1599561968%7C%7C%7Csid%3D%3E1599561968685697441%7C%7C%7Cd"
                      "sps%3D%3E0; __cfduid=dcce463c0931f0014f9ed1b030e9c47981599561968",
            "Host": "view-cache.book118.com",
            "Pragma": "no-cache",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/8"
                          "3.0.4103.106 Safari/537.36",
        }
        self.session = requests.session()
        self.session.headers = headers

        # Create the web driver object ---> browser automation
        self.driver = webdriver.Chrome()
        self.driver.implicitly_wait(10)
        self.url = url

        # Create the Word document object
        self.doc = Document()
        self.wordPath = filename

        self.run()

        self.driver.quit()

    def get_src(self):
        """Collect the img elements that hold the page images"""

        # Open the target URL so the preview page can be parsed
        self.driver.get(self.url)

        # Click "full-screen preview"
        self.driver.find_element_by_id('full').click()
        # Switch into the preview iframe
        self.driver.switch_to.frame('layer_new_view_iframe')

        # Total number of pages (retry until the page counter text is rendered)
        while True:
            page = self.driver.find_element_by_xpath('//*[@id="newView"]/div[1]/div/span').text[2:]
            if page:
                break

        # Scroll through the document twice so every image gets lazy-loaded
        for j in range(2):
            for i in range(1000, int(page) * 1200, 50):
                self.driver.execute_script("window.scrollTo(0, %s)" % str(i))
                time.sleep(0.02)

        srcs = self.driver.find_elements_by_xpath('//*[@id="newView"]/div[2]/div/img')

        return srcs

    def download(self, src):
        """Download one page image"""

        # Read the src attribute from the img element
        url = src.get_attribute('src')

        # Fetch the image
        if url:
            res = self.session.get(url)
            return res

    def createword(self, res):
        """Write the image into the Word document"""

        if res is None:
            return

        # Catch write failures so one bad page does not abort the whole run
        try:

            # Save the image to a temporary file
            with open('1.png', 'wb') as f:
                f.write(res.content)

            # width=Inches(6), height=Inches(8) scales the inserted image to roughly A4
            self.doc.add_picture("1.png", width=Inches(6), height=Inches(8))

        except Exception as e:
            print('Failed to write 1.png, reason: %s' % str(e))

        # Save the Word document
        self.doc.save(self.wordPath)

    def run(self):
        for src in self.get_src():
            res = self.download(src)
            self.createword(res)


YuanLC(
    url='https://max.book118.com/html/2015/0504/16361645.shtm',
    filename="光电检测技术论文.docx"
)
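One caveat: the script uses the Selenium 3 style find_element_by_* helpers, which were removed in Selenium 4. If your installed selenium is version 4 or later, the equivalent locator calls look like this; the snippet below only demonstrates the call syntax against a placeholder page, it is not a change to the script above:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://example.com')   # placeholder page, just to have a live driver

# Selenium 4 equivalents of the calls used above:
#   find_element_by_id('full')        -> find_element(By.ID, 'full')
#   find_elements_by_xpath('//img')   -> find_elements(By.XPATH, '//img')
images = driver.find_elements(By.XPATH, '//img')
print(len(images))

driver.quit()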


^_^ If this helped, give it a like~~~

For the scrolling-preview version, see the companion post.
