一、安装Selenium模块
pip install selenium
Selenium 是一套完整的Web应用程序测试系统,包含了测试的录制、编写及运行和测试的并行处理。
二、安装PhantomJS(官网下载:http://phantomjs.org/)
下载后放在python安装目录,和python.exe在一个文件夹下。Selenium和PhantomJS配合使用,可以模拟浏览器访问页面,获取包括由JavaScript动态生成的数据。
三、运行环境
(1) win7
(2) python 2.7
(3) pycharm
四、获取百度首页内容
from selenium import webdriver
# Load the Baidu home page in a PhantomJS browser, print the page title and
# the text of the root element, and save the page source to 2.html.
browser = webdriver.PhantomJS()
try:
    # Configure the implicit wait before navigating so it already applies to
    # the first element lookup below.
    browser.implicitly_wait(10)
    browser.get('https://www.baidu.com')
    data = browser.find_element_by_xpath('/*')
    print(browser.title)
    print(data.text)
    # The page source is encoded to UTF-8 bytes here, so the file is opened
    # in binary mode and the bytes are written unchanged.
    with open('2.html', 'wb') as fp:
        fp.write(browser.page_source.encode('utf8'))
finally:
    # Always shut the browser down, even when one of the steps above failed.
    browser.quit()
五、获取ip代理
(1)mylog.py
import logging
import getpass
import sys
#### Definition of the MyLog class
class MyLog(object):
    """Logging wrapper that sends every record to a log file and to the console.

    The underlying logger is named after the current user (getpass.getuser()).
    The log file is the running script's path (sys.argv[0]) with its extension
    replaced by ``.log``.
    """

    #### Constructor of MyLog
    def __init__(self):
        """Set up the per-user logger with one file handler and one stream handler."""
        import os  # local import: only needed to derive the log file name

        self.user = getpass.getuser()
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)
        #### Log file name: script path without its extension, plus ".log"
        self.logFile = os.path.splitext(sys.argv[0])[0] + '.log'
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n')
        #### Records go both to the log file and to the screen.
        # logging.getLogger() hands back the same logger object for the same
        # name, so handlers are attached only once; later MyLog instances
        # reuse them instead of stacking duplicates (which would repeat every
        # log line).
        self.logHand = self._findHandler(True)
        if self.logHand is None:
            self.logHand = logging.FileHandler(self.logFile, encoding='utf8')
            self.logHand.setFormatter(self.formatter)
            self.logHand.setLevel(logging.DEBUG)
            self.logger.addHandler(self.logHand)
        self.logHandSt = self._findHandler(False)
        if self.logHandSt is None:
            self.logHandSt = logging.StreamHandler()
            self.logHandSt.setFormatter(self.formatter)
            self.logHandSt.setLevel(logging.DEBUG)
            self.logger.addHandler(self.logHandSt)

    def _findHandler(self, wantFile):
        """Return an already attached file (or non-file stream) handler, else None."""
        for handler in self.logger.handlers:
            isFile = isinstance(handler, logging.FileHandler)
            if isinstance(handler, logging.StreamHandler) and isFile == wantFile:
                return handler
        return None

    #### The five log levels map onto the five methods below.
    def debug(self, msg):
        """Log msg at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log msg at INFO level."""
        self.logger.info(msg)

    def warn(self, msg):
        """Log msg at WARNING level."""
        # Logger.warn is a deprecated alias; Logger.warning is the supported name.
        self.logger.warning(msg)

    def error(self, msg):
        """Log msg at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log msg at CRITICAL level."""
        self.logger.critical(msg)
(2)ip.py
from selenium import webdriver
from mylog import MyLog as mylog
class Item(object):
    """Plain record for one proxy entry; every field starts out as None."""

    # ip        - proxy IP address
    # port      - proxy port
    # anonymous - whether the proxy is anonymous
    # type      - proxy type
    # support   - supported protocols
    # local     - physical location
    # speed     - proxy speed
    ip = port = anonymous = type = support = local = speed = None
class GetProxy(object):
    """Scrape proxy rows from the kuaidaili proxy list pages and save them.

    All work is done by the constructor: it builds the page URLs, scrapes
    every page with PhantomJS and writes the collected rows to ``proxy.txt``.
    """

    def __init__(self):
        """Run the whole job: build URLs, scrape them, save the result."""
        self.startUrl = 'http://www.kuaidaili.com/proxylist/'
        self.log = mylog()
        self.urls = self.getUrls()
        self.proxyList = self.getProxyList(self.urls)
        self.fileName = 'proxy.txt'
        self.saveFile(self.fileName, self.proxyList)

    def getUrls(self):
        """Return the URLs of list pages 1 through 10 (startUrl + page number)."""
        urls = []
        for page in range(1, 11):
            url = self.startUrl + str(page)
            urls.append(url)
            self.log.info('get url %s to urls' % url)
        return urls

    def getProxyList(self, urls):
        """Open every URL and return one Item per ``//tbody/tr`` table row.

        Columns 1-7 of a row are stored, UTF-8 encoded, as ip, port,
        anonymous, type, support, local and speed.
        """
        browser = webdriver.PhantomJS()
        proxyList = []
        try:
            # The implicit wait is a session-wide setting, so set it once
            # before the first page is loaded.
            browser.implicitly_wait(5)
            for url in urls:
                browser.get(url)
                for element in browser.find_elements_by_xpath('//tbody/tr'):
                    # A fresh Item per row: reusing a single instance would
                    # make every list entry alias the last row scraped.
                    item = Item()
                    item.ip = self._cellText(element, 1)
                    item.port = self._cellText(element, 2)
                    item.anonymous = self._cellText(element, 3)
                    item.type = self._cellText(element, 4)
                    item.support = self._cellText(element, 5)
                    item.local = self._cellText(element, 6)
                    item.speed = self._cellText(element, 7)
                    proxyList.append(item)
                    self.log.info('add proxy %s:%s to list' % (item.ip, item.port))
        finally:
            # Shut the browser down even if a page or a row lookup failed.
            browser.quit()
        return proxyList

    @staticmethod
    def _cellText(row, index):
        """Return the UTF-8 encoded text of the index-th ``td`` cell of row."""
        return row.find_element_by_xpath('./td[%d]' % index).text.encode('utf8')

    def saveFile(self, fileName, proxyList):
        """Write each item as one tab-separated line to fileName."""
        self.log.info('add all proxy to %s' % fileName)
        with open(fileName, 'w') as fp:
            for item in proxyList:
                fields = (item.ip, item.port, item.anonymous, item.type,
                          item.support, item.local, item.speed)
                fp.write('\t'.join(fields) + '\n')
# Script entry point: constructing GetProxy runs the whole scrape-and-save job
# from its constructor.
if __name__ == '__main__':
    GP = GetProxy()
from selenium import webdriver
from mylog import MyLog as mylog
import os
import time
class GetCartoon(object):
    """Save each page of one comic chapter as a PNG screenshot.

    All work is done by the constructor: it opens the chapter in PhantomJS,
    screenshots every page into a directory named after the page title, and
    then quits the browser.
    """

    def __init__(self):
        """Open the chapter, save all of its pages, and quit the browser."""
        # Alternative chapter: u'http://www.1kkk.com/ch1-406302/'
        self.startUrl = u'http://www.1kkk.com/ch1-397573/'
        self.log = mylog()
        self.browser = self.getBrowser()
        try:
            self.saveCartoon(self.browser)
        finally:
            # Quit the browser even if saving failed part-way through.
            self.browser.quit()

    def getBrowser(self):
        """Return a PhantomJS browser that has tried to load startUrl.

        A failure to open the page is logged and the browser is still
        returned (best effort).
        """
        browser = webdriver.PhantomJS()
        # Set the implicit wait before navigating so it applies from the
        # first element lookup onwards.
        browser.implicitly_wait(20)
        try:
            browser.get(self.startUrl)
        except Exception:
            # Log through the instance logger; calling error() on the MyLog
            # class itself would raise instead of logging.
            self.log.error('open the %s failed' % self.startUrl)
        return browser

    def saveCartoon(self, browser):
        """Screenshot every page of the chapter currently open in browser.

        The part of the page title before the first '_' names the target
        directory. The process working directory is changed into it, and
        pages are saved there as 1.png, 2.png, ...
        """
        cartoonTitle = browser.title.split('_')[0]
        self.createDir(cartoonTitle)
        os.chdir(cartoonTitle)
        sumPage = int(browser.find_element_by_xpath('//font[@class="zf40"]/span[2]').text)
        for page in range(1, sumPage + 1):
            imgName = str(page) + '.png'
            browser.get_screenshot_as_file(imgName)
            self.log.info('save img %s' % imgName)
            # Only advance while pages remain: clicking "next" after the last
            # page would leave the chapter and waste a 5 second sleep.
            if page < sumPage:
                browser.find_element_by_id('next').click()
                # Fixed pause before the next screenshot.
                time.sleep(5)
        self.log.info('save img sccess')

    def createDir(self, dirName):
        """Create dirName, logging the outcome; never raises on OSError."""
        if os.path.exists(dirName):
            self.log.error('create directory %s failed, hava a same name file or directory' % dirName)
            return
        try:
            os.makedirs(dirName)
        except OSError:
            self.log.error('create directory %s failed' % dirName)
        else:
            self.log.info('create directory %s success' % dirName)
# Script entry point: constructing GetCartoon opens the chapter, saves its
# pages and quits the browser, all from its constructor.
if __name__ == '__main__':
    GC = GetCartoon()