macOS 平台使用 Python 爬虫自动下载巨潮资讯网文件

环境配置
选择 Python + Selenium + wget + Safari 的环境来下载文件。本来期望使用 PhantomJS,但使用时点击链接后打开的网页为空白页,无法下载文件。
使用 Safari 时遇到的错误:selenium.common.exceptions.WebDriverException: Message: Could not create a session: You must enable the 'Allow Remote Automation' option in Safari's Develop menu to control Safari via WebDriver. 解决方式:在 Safari 的"开发"菜单中勾选"允许远程自动化"。
原始代码
#!/usr/bin/python

# -*- coding: utf-8 -*- 
# Python 2 idiom that makes classes defined in this module new-style by
# default; the assignment has no effect when the file runs under Python 3.
__metaclass__ = type

import io
from selenium import webdriver
import time
import sys
import re
import os
from selenium.webdriver.common.keys import Keys
import wget
import urllib
from urllib import request
import shutil
#from selenium.webdriver import ActionChains
#from selenium.webdriver.common.keys import Keys

'''class: DownloadFromCninfo'''
class DownloadFromCninfo(object):
    """Download announcement PDFs for one stock from cninfo.com.cn.

    The class drives Safari through Selenium: it opens a disclosure listing
    page, searches for the stock code, expands the announcement list by
    clicking its 'show-more' link and saves every linked PDF with ``wget``
    into ``./download/<stockNumber>/``.

    Safari must have "Allow Remote Automation" enabled in its Develop menu,
    otherwise the WebDriver session cannot be created.
    """

    # Stock codes at or above this value use the 'sse' page, others 'szse'.
    _SSE_FIRST_CODE = 600000
    _SSE_URL = 'http://www.cninfo.com.cn/cninfo-new/disclosure/sse'
    _SZSE_URL = 'http://www.cninfo.com.cn/cninfo-new/disclosure/szse'

    _STAT_XPATH = "//div[@id='con-div-his-fulltext']/div[@class='stat-right']"
    _MORE_XPATH = "//div[@id='con-div-his-fulltext']/div[@class='show-more']/a"
    _NAME_XPATH = "//div[@id='plus-tag-div']/a/span"
    _DATE_XPATH = "//ul[@id='ul_his_fulltext']/li[%d]/div[@class='t3']/dd/span[@class='d3']"
    _LINK_XPATH = "//ul[@id='ul_his_fulltext']/li[%d]/div[@class='t3']/dd/span/a"

    def __init__(self, stockNumberStr, maxNumber=10000):
        """Prepare the target URL, the output directory and a Safari session.

        Args:
            stockNumberStr: Stock code as a string of digits, e.g. "600000".
            maxNumber: Stop clicking 'show-more' once the counter reports at
                least this many entries. All entries shown at that point are
                downloaded, so the final count can exceed this value.

        Raises:
            ValueError: If ``stockNumberStr`` cannot be parsed as an integer.
            OSError: If the output directory cannot be created.
        """
        self.stockNumber = stockNumberStr
        self.RecordDownloadIndex = 1
        self.maxDownloadNumber = maxNumber
        self.dst_url = self._listing_url(stockNumberStr)

        # makedirs also creates the parent "./download/" on a first run,
        # where a plain mkdir would raise FileNotFoundError.
        self.prefixpathname = "./download/" + self.stockNumber + "/"
        os.makedirs(self.prefixpathname, exist_ok=True)

        # Started last so that a bad stock code or an unwritable directory
        # does not leave a browser session behind.
        self.driver = webdriver.Safari()

    @staticmethod
    def _listing_url(stock_number):
        """Return the disclosure listing URL used for ``stock_number``."""
        if int(stock_number) >= DownloadFromCninfo._SSE_FIRST_CODE:
            return DownloadFromCninfo._SSE_URL
        return DownloadFromCninfo._SZSE_URL

    @staticmethod
    def _parse_counts(text):
        """Return every run of digits found in ``text`` as a list of strings."""
        return re.findall(r'\d+', text)

    @staticmethod
    def _safe_filename(name):
        """Replace '/' and control characters so ``name`` is one path component."""
        return re.sub(r'[/\x00-\x1f]+', '_', name)

    def _find_by_xpath(self, xpath):
        """Return the first element matching ``xpath`` in the current window."""
        # find_element(by, value) replaces the find_element_by_* shortcuts,
        # which Selenium 4 removed; this form also works on Selenium 3.
        from selenium.webdriver.common.by import By
        return self.driver.find_element(By.XPATH, xpath)

    def _switch_to_window(self, *markers, **options):
        """Focus the first window whose URL contains one of ``markers``.

        Every visited URL is printed using ``log_format`` (keyword option,
        default ``'current url:%s'``).

        Returns:
            bool: True if a matching window was found. When False, the driver
            is left on the last window handle.
        """
        log_format = options.get('log_format', 'current url:%s')
        for handle in self.driver.window_handles:
            self.driver.switch_to.window(handle)
            current = self.driver.current_url
            print(log_format % current)
            if any(marker in current for marker in markers):
                return True
        return False

    def _close_new_windows(self, known_handles):
        """Close every window whose handle is not in ``known_handles``."""
        for handle in self.driver.window_handles:
            if handle not in known_handles:
                self.driver.switch_to.window(handle)
                self.driver.close()

    def _read_counts(self):
        """Print the list counter text and return the numbers it contains."""
        text = self._find_by_xpath(self._STAT_XPATH).text
        print('%s' % text)
        return self._parse_counts(text)

    def _open_announcement_list(self):
        """Open the listing page, search for the stock and focus the result."""
        from selenium.webdriver.common.by import By

        # Without a timeout an unreachable page can block forever.
        self.driver.set_page_load_timeout(10)
        self.driver.get(self.dst_url)
        # Maximize only here, before any extra window exists: doing it later
        # can change the order of the window handles.
        self.driver.maximize_window()
        time.sleep(2)
        print('%s' % self.driver.current_url)

        self.driver.find_element(By.CLASS_NAME, "input-stock").send_keys(self.stockNumber)
        # ENTER is sent instead of click() because clicking does not always
        # take effect in Safari.
        self._find_by_xpath("//ul[@id='stock_list']/li[1]/a").send_keys(Keys.ENTER)

        # The result opens in a new window; wait for it, then focus it.
        time.sleep(5)
        self._switch_to_window("show")
        time.sleep(1)

    def _expand_list(self):
        """Click 'show-more' until the list is complete or the cap is reached.

        Returns:
            int: The second number of the list counter, used as the number of
            entries to download; 0 if the counter holds fewer than two numbers.
        """
        counts = self._read_counts()
        print('%s' % self._find_by_xpath(self._NAME_XPATH).text)
        while (len(counts) >= 2 and counts[0] != counts[1]
               and int(counts[1]) < self.maxDownloadNumber):
            self._find_by_xpath(self._MORE_XPATH).click()
            # Give the page time to load the additional entries.
            time.sleep(1)
            counts = self._read_counts()
        if len(counts) < 2:
            print('Cannot read the announcement counter; nothing to download')
            return 0
        return int(counts[1])

    def _download_entry(self, index):
        """Download the PDF linked from list entry ``index`` (1-based)."""
        self._switch_to_window("show")
        time.sleep(1)

        date_text = self._find_by_xpath(self._DATE_XPATH % index).text
        print('timestr %s' % date_text)

        link_xpath = self._LINK_XPATH % index
        print('%s' % link_xpath)
        link = self._find_by_xpath(link_xpath)
        title = link.text
        print('%s' % title)

        # Remember the open windows so stray ones can be closed afterwards.
        known_handles = set(self.driver.window_handles)
        link.click()
        time.sleep(5)

        if not self._switch_to_window("pdf", "PDF", log_format='%s'):
            # No PDF window appeared: skip the entry instead of downloading
            # and closing whichever window happens to be last.
            print('无效链接!ignore')
            self._close_new_windows(known_handles)
            return

        time.sleep(1)
        pdf_url = self.driver.current_url
        print('%s' % pdf_url)
        file_name = self._safe_filename(date_text.strip() + title)
        wget.download(pdf_url, '%s%s.pdf' % (self.prefixpathname, file_name))
        # Close the PDF window; the next entry re-focuses the list window.
        self.driver.close()

    def downloadPDF(self):
        """Download every listed announcement PDF for ``self.stockNumber``.

        The browser session is shut down when this method returns or raises,
        so a failed run does not block the next one.
        """
        # Restart the browser so the run begins with a fresh session.
        self.driver.quit()
        self.driver = webdriver.Safari()
        try:
            self._open_announcement_list()
            for index in range(1, self._expand_list() + 1):
                self._download_entry(index)
        finally:
            self.driver.quit()
        
if __name__ == "__main__":
    # Usage: <script> STOCK_NUMBER [MAX_NUMBER]
    if len(sys.argv) < 2:
        print("Input stock number error!")
        print(sys.argv[0])
        # Non-zero status so shells and calling scripts can detect the error.
        sys.exit(1)
    # An optional second argument overrides the default cap of 20 entries.
    maxNumber = int(sys.argv[2]) if len(sys.argv) > 2 else 20
    downloadHandle = DownloadFromCninfo(sys.argv[1], maxNumber)
    downloadHandle.downloadPDF()
遗留问题
不清楚 PhantomJS 不能使用的具体原因是什么;另外发现从 PhantomJS 切换到 Safari 后,有时单击(click)不起作用,需要改用发送 Enter 键的方式来触发。


你可能感兴趣的:(工具使用,python知识)