Selenium&&PhantomJS获取网站中的JS返回的数据

一、安装Selenium模块

pip install selenium
       Selenium 是一套完整的Web应用程序测试系统,包含了测试的录制、编写及运行和测试的并行处理。


二、安装PhantomJS(官网下载:http://phantomjs.org/)

        下载后将phantomjs.exe放在Python安装目录下,与python.exe位于同一文件夹。Selenium与PhantomJS配合,可以模拟浏览器访问页面,从而获取由JavaScript动态生成的数据。


三、运行环境

     (1) win7 

    (2) python 2.7

    (3) pycharm


四、获取百度搜索结果

from selenium import webdriver

# Drive a headless PhantomJS browser so the page's JavaScript runs before
# the DOM is read.
browser = webdriver.PhantomJS()
try:
    # The implicit wait governs the element lookup below; set it up front.
    browser.implicitly_wait(10)
    browser.get('https://www.baidu.com')
    # '/*' selects the document's root element.
    data = browser.find_element_by_xpath('/*')
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(browser.title)
    print(data.text)
    # The page source is encoded to UTF-8 bytes, so write it in binary mode.
    with open('2.html', 'wb') as fp:
        fp.write(browser.page_source.encode('utf8'))
finally:
    # Always shut the browser down, even when a step above raised;
    # otherwise the browser process may be left running.
    browser.quit()

五、获取ip代理

(1)mylog.py

import logging
import getpass
import sys


#### Definition of the MyLog class
class MyLog(object):
	"""Small wrapper around ``logging.Logger`` that logs to a file and to the console.

	The logger is named after the current user (``getpass.getuser()``) and
	the log file is the running script's path with its extension replaced
	by ``.log``.  Both handlers emit every level from DEBUG upwards.
	"""

	# (formatter, file handler, stream handler) already attached, keyed by
	# logger name.  ``logging.getLogger(name)`` returns the same logger for
	# the same name, so attaching fresh handlers on every instantiation
	# would make each record appear several times.
	_configured = {}

	def __init__(self):
		"""Set up the logger, attaching the two handlers only once per logger name."""
		import os  # local import: only needed to derive the log file name

		self.user = getpass.getuser()
		self.logger = logging.getLogger(self.user)
		self.logger.setLevel(logging.DEBUG)

		# Log file name: strip whatever extension the script has (not just
		# a three-character '.py') and append '.log'.
		self.logFile = os.path.splitext(sys.argv[0])[0] + '.log'

		shared = MyLog._configured.get(self.logger.name)
		if shared is not None:
			# Handlers already exist for this logger; reuse them.
			self.formatter, self.logHand, self.logHandSt = shared
			return

		self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')

		# Records go both to the log file and to the console.
		self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
		self.logHand.setFormatter(self.formatter)
		self.logHand.setLevel(logging.DEBUG)

		self.logHandSt = logging.StreamHandler()
		self.logHandSt.setFormatter(self.formatter)
		self.logHandSt.setLevel(logging.DEBUG)

		self.logger.addHandler(self.logHand)
		self.logger.addHandler(self.logHandSt)
		MyLog._configured[self.logger.name] = (self.formatter, self.logHand, self.logHandSt)

	# One method per logging level; each forwards ``msg`` to the logger.
	def debug(self,msg):
		"""Log ``msg`` at DEBUG level."""
		self.logger.debug(msg)

	def info(self,msg):
		"""Log ``msg`` at INFO level."""
		self.logger.info(msg)

	def warn(self,msg):
		"""Log ``msg`` at WARNING level."""
		# Logger.warn is a deprecated alias; Logger.warning is the supported name.
		self.logger.warning(msg)

	def error(self,msg):
		"""Log ``msg`` at ERROR level."""
		self.logger.error(msg)

	def critical(self,msg):
		"""Log ``msg`` at CRITICAL level."""
		self.logger.critical(msg)
(2)ip.py

from selenium import webdriver
from mylog import MyLog as mylog


class Item(object):
	"""Record holding the fields of one proxy entry; every field defaults to ``None``.

	Fields:
		ip        -- proxy IP address
		port      -- proxy port
		anonymous -- whether the proxy is anonymous
		type      -- proxy type
		support   -- supported protocols
		local     -- physical location
		speed     -- proxy speed
	"""
	# Address of the proxy.
	ip = port = None
	# Descriptive properties of the proxy.
	anonymous = type = support = None
	# Where it is and how fast it responds.
	local = speed = None

class GetProxy(object):
	"""Scrape proxy entries from the kuaidaili.com list pages and save them to a file.

	Instantiating the class runs the whole job: build the page URLs, read
	the table rows of each page with PhantomJS, and write the collected
	entries to ``proxy.txt`` as tab-separated lines.
	"""

	def __init__(self):
		self.startUrl = 'http://www.kuaidaili.com/proxylist/'
		self.log = mylog()
		self.urls = self.getUrls()
		self.proxyList = self.getProxyList(self.urls)
		self.fileName = 'proxy.txt'
		self.saveFile(self.fileName, self.proxyList)

	def getUrls(self):
		"""Return the URLs of list pages 1 through 10, logging each one."""
		urls = []
		# range() rather than xrange(): available on both Python 2 and 3.
		for page in range(1, 11):
			url = self.startUrl + str(page)
			urls.append(url)
			self.log.info('get url %s to urls' %url)
		return urls

	def getProxyList(self, urls):
		"""Visit every URL in ``urls`` and return one ``Item`` per table row.

		Each field is the UTF-8 encoded text of the row's 1st to 7th ``<td>``.
		The browser is always quit, even if loading or parsing a page raises.
		"""
		browser = webdriver.PhantomJS()
		proxyList = []
		try:
			# The implicit wait only needs to be configured once, not per page.
			browser.implicitly_wait(5)
			for url in urls:
				browser.get(url)
				for row in browser.find_elements_by_xpath('//tbody/tr'):
					# A fresh Item per row: reusing one instance would make
					# every list entry alias the same object, so all entries
					# would end up showing the last row's values.
					item = Item()
					item.ip = self._cellText(row, 1)
					item.port = self._cellText(row, 2)
					item.anonymous = self._cellText(row, 3)
					item.type = self._cellText(row, 4)
					item.support = self._cellText(row, 5)
					item.local = self._cellText(row, 6)
					item.speed = self._cellText(row, 7)
					proxyList.append(item)
					self.log.info('add proxy %s:%s to list' %(item.ip, item.port))
		finally:
			browser.quit()
		return proxyList

	@staticmethod
	def _cellText(row, index):
		"""Return the UTF-8 encoded text of the ``index``-th ``<td>`` of ``row``."""
		# NOTE(review): on Python 3 .encode() yields bytes, which saveFile's
		# text-mode write would reject -- confirm the target interpreter.
		return row.find_element_by_xpath('./td[%d]' % index).text.encode('utf8')

	def saveFile(self, fileName, proxyList):
		"""Write ``proxyList`` to ``fileName``, one tab-separated line per item.

		Column order: ip, port, anonymous, type, support, local, speed.
		"""
		self.log.info('add all proxy to %s' %fileName)
		with open(fileName, 'w') as fp:
			for item in proxyList:
				fields = (item.ip, item.port, item.anonymous, item.type,
					item.support, item.local, item.speed)
				fp.write('\t'.join(fields) + '\n')
				

# Script entry point: constructing GetProxy builds the URLs, scrapes the
# proxies and writes them to the output file.
if __name__ == '__main__':
	GP = GetProxy()

六、获取漫画截图
from selenium import webdriver
from mylog import MyLog as mylog
import os
import time

class GetCartoon(object):
    """Save a screenshot of every page of one comic chapter on 1kkk.com.

    Instantiating the class runs the whole job: open the chapter in
    PhantomJS, create a directory named after the chapter title, and save
    one PNG screenshot per page into it.
    """

    def __init__(self):
        # Alternative chapter: http://www.1kkk.com/ch1-406302/
        self.startUrl = u'http://www.1kkk.com/ch1-397573/'
        self.log = mylog()
        self.browser = self.getBrowser()
        try:
            self.saveCartoon(self.browser)
        finally:
            # Quit the browser even when saving fails part-way through.
            self.browser.quit()

    def getBrowser(self):
        """Start PhantomJS, try to open ``self.startUrl`` and return the browser.

        A failure to open the page is logged, not raised; the browser is
        returned either way.
        """
        browser = webdriver.PhantomJS()
        try:
            browser.get(self.startUrl)
        except Exception:
            # Log through the instance's logger; calling error() on the
            # mylog class itself has no instance to log with.
            self.log.error('open the %s failed' %self.startUrl)
        browser.implicitly_wait(20)
        return browser

    def saveCartoon(self, browser):
        """Screenshot each page shown by ``browser`` into a per-chapter directory.

        The directory name is the part of the page title before the first
        ``'_'``.  Files are named ``1.png`` ... ``N.png``, where N is read
        from the page-count element on the page.

        Raises:
            OSError: if the target directory does not exist after the
                attempt to create it.
        """
        cartoonTitle = browser.title.split('_')[0]
        self.createDir(cartoonTitle)
        if not os.path.isdir(cartoonTitle):
            raise OSError('%s is not a directory' % cartoonTitle)
        sumPage = int(browser.find_element_by_xpath('//font[@class="zf40"]/span[2]').text)
        for page in range(1, sumPage + 1):
            imgName = str(page) + '.png'
            # Build the path explicitly instead of chdir()-ing into the
            # directory, so the process working directory is left alone.
            if browser.get_screenshot_as_file(os.path.join(cartoonTitle, imgName)):
                self.log.info('save img %s' %imgName)
            else:
                self.log.error('save img %s failed' %imgName)
            if page < sumPage:
                # Only advance while pages remain; there is nothing to
                # load after the last one.
                browser.find_element_by_id('next').click()
                # Fixed pause after the click, presumably to let the next
                # page render before it is captured.
                time.sleep(5)
        self.log.info('save img sccess')

    def createDir(self, dirName):
        """Create ``dirName`` if nothing with that name exists; log the outcome."""
        if os.path.exists(dirName):
            self.log.error('create directory %s failed, hava a same name file or directory' %dirName)
            return
        try:
            os.makedirs(dirName)
        except OSError:
            self.log.error('create directory %s failed' %dirName)
        else:
            self.log.info('create directory %s success' %dirName)
                
            
# Script entry point: constructing GetCartoon opens the chapter and saves
# its page screenshots.
if __name__ == '__main__':
    GC = GetCartoon()
Selenium&&PhantomJS获取网站中的JS返回的数据_第1张图片

  

你可能感兴趣的:(python爬虫,selenium,python爬虫,python,PhantomJS)