mac . scrapy

phantomjs 配置

macweb =  webdriver.PhantomJS('/Users/apple/xinjiang/phantomjs-2.1.1-macosx/bin/phantomjs')

firefox 配置

browser = webdriver.Firefox('/Users/apple/xinjiang/phantomjs-2.1.1-macosx/bin/')

火狐浏览器需要 geckodriver 放在同一文件夹下。

driver = webdriver.Chrome('/Users/apple/xinjiang/phantomjs-2.1.1-macosx/bin/chromedriver')  # 谷歌浏览器

新疆首页

http://xj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml

============================================

>>> from selenium import webdriver

>>> from selenium.webdriver.common.action_chains import ActionChains

>>> from selenium.webdriver.common.keys import Keys

>>> browser = webdriver.Firefox('/Users/apple/xinjiang/phantomjs-2.1.1-macosx/bin/')

>>> browser = webdriver.Firefox('/Users/apple/xinjiang/phantomjs-2.1.1-macosx/bin/')

>>> browser.get('http://xj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml')

>>> yy=browser.find_element_by_css_selector('#keyword_qycx')

>>> yy.click()

>>> yy.send_keys("新疆建设大厦".decode('utf-8'))

=====================

middleware

class middle(object):
    """Downloader middleware that loads each request URL in PhantomJS.

    The page is opened in a headless PhantomJS browser and its page source
    is handed back as an ``HtmlResponse``.

    Expects ``webdriver``, ``time``, ``logger`` and ``HtmlResponse`` to be
    available at module level.
    """

    def process_request(self, request, spider):
        """Load ``request.url`` in PhantomJS and return the page source.

        Args:
            request: The request being processed; only ``request.url`` is
                read.
            spider: Unused here; part of the middleware method signature.

        Returns:
            An ``HtmlResponse`` with status 200 and the UTF-8 encoded page
            source when the page has content, or an ``HtmlResponse`` with
            status 503 and an empty body when the page source is empty or
            any exception occurs while loading it.
        """
        driver = None
        try:
            driver = webdriver.PhantomJS()
            driver.get(request.url)
            # Fixed pause before reading the page source.
            # NOTE(review): presumably to let page scripts finish -- confirm.
            time.sleep(10)
            content = driver.page_source.encode('utf-8')

            # Truthiness works for both Python 2 str and Python 3 bytes.
            if not content:
                logger.info('content is empty :503')
                return HtmlResponse(request.url, encoding='utf-8',
                                    status=503, body='')

            logger.info('content get success:200')
            return HtmlResponse(request.url, encoding='utf-8',
                                status=200, body=content)
        except Exception as e:
            # Best effort: any failure is reported as a 503 with an empty
            # body instead of being raised to the caller.
            logger.warning(e)
            logger.info('Exception content is empty :503')
            return HtmlResponse(request.url, encoding='utf-8',
                                status=503, body='')
        finally:
            # Always shut the browser down so a PhantomJS process is not
            # left running after every request.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    logger.warning(e)



# Current local time formatted as "YYYY-MM-DD HH:MM:SS".
time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

你可能感兴趣的:(mac . scrapy)