1. Using Selenium to drive a browser and crawl a page
import os
import platform

from selenium import webdriver


def selenium_crawl_goubanjia_proxy_ip():
    parent_dir = os.path.dirname(__file__)
    current_operation_system = platform.system()
    if current_operation_system == 'Windows':
        driver_file_path = os.path.join(parent_dir, 'driver', 'chromedriver.exe')
    elif current_operation_system == 'Linux':
        driver_file_path = os.path.join(parent_dir, 'driver', 'chromedriver')
    print driver_file_path
    chrome_driver = os.path.abspath(driver_file_path)
    os.environ['webdriver.chrome.driver'] = chrome_driver
    if current_operation_system == 'Windows':
        browser = webdriver.Chrome(chrome_driver)
    elif current_operation_system == 'Linux':
        # write chromedriver's verbose log next to the driver binary
        service_log_path = "{}/chromedriver.log".format(os.path.dirname(chrome_driver))
        service_args = ['--verbose']
        browser = webdriver.Chrome(chrome_driver, service_args=service_args, service_log_path=service_log_path)
    browser.get("http://www.goubanjia.com/")
    # collect the ip and port columns of the proxy table
    ips = []
    ip_elements = browser.find_elements_by_css_selector('table.table tr td.ip')
    for ip_element in ip_elements:
        ips.append(ip_element.text)
    ports = []
    port_elements = browser.find_elements_by_css_selector('table.table tr td.port')
    for port_element in port_elements:
        ports.append(port_element.text)
    # pair each ip with its port as "ip:port"
    proxies = []
    for i in xrange(len(ips)):
        proxy = {}
        proxy['ip_port'] = ips[i] + ':' + ports[i]
        proxy['user_pass'] = ''
        proxies.append(proxy)
    browser.close()
    browser.quit()
    return proxies
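For reference, a minimal usage sketch (not part of the original code; picking a random entry and the Scrapy meta key are just one way to consume the list): call the function once and attach one of the returned proxies to an outgoing request.

# Usage sketch: pick one proxy from the scraped list (illustrative only)
import random

proxies = selenium_crawl_goubanjia_proxy_ip()
if proxies:
    chosen = random.choice(proxies)
    print 'using proxy %s' % chosen['ip_port']
    # e.g. hand it to a Scrapy request:  request.meta['proxy'] = 'http://' + chosen['ip_port']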
2. Using PhantomJS to simulate a login and obtain cookies
# -*- coding:utf-8 -*-
import sys

from selenium import webdriver

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    print sys.getdefaultencoding()
    reload(sys)
    sys.setdefaultencoding(default_encoding)


def csdn_login(username, password):
    # PhantomJS headless browser; adjust executable_path to the local install
    driver = webdriver.PhantomJS(executable_path='F:\\develop\\crawler\\phantomjs-2.1.1\\bin\\phantomjs.exe')
    driver.get('http://passport.csdn.net/account/login')
    driver.find_element_by_id('username').clear()
    driver.find_element_by_id('password').clear()
    driver.find_element_by_id('username').send_keys(username)
    driver.find_element_by_id('password').send_keys(password)
    driver.find_element_by_class_name('logging').click()
    # cookies issued after the login form is submitted
    for item in driver.get_cookies():
        print item


def iteye_login(username, password):
    driver = webdriver.PhantomJS(executable_path='F:\\develop\\crawler\\phantomjs-2.1.1\\bin\\phantomjs.exe')
    driver.get('http://www.iteye.com/login')
    driver.find_element_by_id('user_name').clear()
    driver.find_element_by_id('password').clear()
    driver.find_element_by_id('user_name').send_keys(username)
    driver.find_element_by_id('password').send_keys(password)
    driver.find_element_by_id('button').click()
    for item in driver.get_cookies():
        print item


def qqqun_login(username, password):
    driver = webdriver.PhantomJS(executable_path='F:\\develop\\crawler\\phantomjs-2.1.1\\bin\\phantomjs.exe')
    driver.get('http://qqun.qq.com/group/login.html')
    # switch from QR-code login to account/password login
    driver.find_element_by_id('switcher_plogin').click()
    driver.find_element_by_id('u').clear()
    driver.find_element_by_id('p').clear()
    driver.find_element_by_id('u').send_keys(username)
    driver.find_element_by_id('p').send_keys(password)
    driver.find_element_by_id('login_button').click()
    for item in driver.get_cookies():
        print item
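The cookies are only printed above; a small sketch of how they might be reused follows (not from the original post; cookies_to_session is a hypothetical helper and assumes the requests library is available): copy the PhantomJS cookies into a requests.Session so that plain HTTP calls carry the logged-in state.

# Hypothetical helper: reuse the browser's login cookies outside Selenium
import requests

def cookies_to_session(driver):
    session = requests.Session()
    for item in driver.get_cookies():
        # each Selenium cookie dict carries at least name, value and domain
        session.cookies.set(item['name'], item['value'], domain=item.get('domain', ''))
    return session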
3. Submitting a form request with Scrapy
# -*- coding:utf-8 -*-
import sys

import scrapy
from scrapy import Selector

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    print sys.getdefaultencoding()
    reload(sys)
    sys.setdefaultencoding(default_encoding)


class CSDNSpider(scrapy.Spider):
    name = "csdn_spider"
    allowed_domains = ["csdn.net"]
    start_urls = ["http://geek.csdn.net/"]

    def parse(self, response):
        selector = Selector(response)
        print selector.xpath('//*[@id="geek_list"]/dl/dd/span[2]/a/text()').extract()

    def start_requests(self):
        # fetch the login page first; the 'cookiejar' meta key tells CookiesMiddleware which jar to use
        login_url = 'http://passport.csdn.net/account/login'
        heads = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
        }
        return [scrapy.Request(
            url=login_url,
            headers=heads,
            meta={'cookiejar': 1},
            callback=self.post_login
        )]
    def post_login(self, response):
        print 'preparing login'
        heads = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
        }
        # hidden form fields that must be echoed back along with the credentials
        lt = Selector(response).xpath('//*[@id="fm1"]/input[3]/@value').extract()[0]
        print lt
        execution = Selector(response).xpath('//*[@id="fm1"]/input[4]/@value').extract()[0]
        print execution
        _eventId = Selector(response).xpath('//*[@id="fm1"]/input[5]/@value').extract()[0]
        print _eventId
        return [scrapy.FormRequest.from_response(response,
            headers=heads,
            meta={'cookiejar': response.meta['cookiejar']},
            formdata={
                'username': '[email protected]',
                'password': 'xxxxxxxxxxxxxxxx',
                'rememberMe': 'true',
                'lt': lt,
                'execution': execution,
                '_eventId': _eventId
            },
            callback=self.after_login
        )]

    def after_login(self, response):
        # re-issue the start_urls requests with the logged-in cookie jar
        for url in self.start_urls:
            yield scrapy.Request(url, meta={'cookiejar': response.meta['cookiejar']}, callback=self.parse)
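The spider above is run like any other Scrapy spider (scrapy crawl csdn_spider from inside the project). When the login flow misbehaves, Scrapy's stock cookie settings make it easier to see what is happening; a settings.py sketch (these are standard Scrapy settings, not from the original post):

# settings.py sketch: debug the cookie-based login session
COOKIES_ENABLED = True   # let CookiesMiddleware manage the 'cookiejar' used above
COOKIES_DEBUG = True     # log every Cookie / Set-Cookie header exchanged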
4. Controlling the crawl
Stopping a crawl: Ctrl + C
Scrapy's CloseSpider extension stops the spider automatically once a condition is met. You can set
CLOSESPIDER_TIMEOUT (seconds),
CLOSESPIDER_ITEMCOUNT,
CLOSESPIDER_PAGECOUNT, and
CLOSESPIDER_ERRORCOUNT,
which stop the spider after the given amount of time has elapsed, after the given number of items has been scraped, after the given number of responses has been received, or after the given number of errors has occurred, respectively. These are usually set on the command line:
scrapy crawl spider_name -s CLOSESPIDER_ITEMCOUNT=10
scrapy crawl spider_name -s CLOSESPIDER_PAGECOUNT=10
scrapy crawl spider_name -s CLOSESPIDER_TIMEOUT=10
scrapy crawl spider_name -s CLOSESPIDER_ERRORCOUNT=10
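The same limits can also be placed in settings.py instead of being passed with -s; a minimal sketch (the values are illustrative):

# settings.py sketch: read by the CloseSpider extension
CLOSESPIDER_TIMEOUT = 3600      # stop after one hour
CLOSESPIDER_ITEMCOUNT = 1000    # stop after 1000 scraped items
CLOSESPIDER_PAGECOUNT = 5000    # stop after 5000 downloaded responses
CLOSESPIDER_ERRORCOUNT = 10     # stop after 10 errors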
Resuming a crawl:
scrapy crawl spider_name -s JOBDIR=crawler/spider_name
Alternatively, add JOBDIR='spider_name.com' to settings.py and run scrapy crawl spider_name; a directory named spider_name.com is created automatically and the crawl state (the pending-request queue) is stored there.
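Worth noting (implicit in the above): resuming only works if the spider was shut down gracefully, so press Ctrl + C once and wait for Scrapy to finish persisting its state, then start it again with the same JOBDIR:

scrapy crawl spider_name -s JOBDIR=crawler/spider_name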
5. Crawl logging
scrapy crawl spider_name -s LOG_FILE=spider_name.log
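The same logging configuration can live in settings.py; a minimal sketch (file name and level are illustrative):

# settings.py sketch: standard Scrapy logging settings
LOG_FILE = 'spider_name.log'   # write log output to this file instead of stderr
LOG_LEVEL = 'INFO'             # one of CRITICAL, ERROR, WARNING, INFO, DEBUG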