pip install selenium
谷歌浏览器驱动下载地址:https://registry.npmmirror.com/binary.html?path=chromedriver/ (原 npm.taobao.org 镜像已停止服务)
火狐浏览器驱动下载地址:https://registry.npmmirror.com/binary.html?path=geckodriver/
查看谷歌浏览器版本:帮助 --> 关于 Google Chrome
进入到今日头条主页,点击 科技 ,到对应的页面爬取标题和url,但是这个页面需要将滚轮滑到底部,才能加载下面的内容,所以需要执行js代码
#!/usr/bin/env python
#-*- coding:utf-8 -*-
"""Scrape article titles and URLs from Toutiao's 科技 channel into an .xls file.

The channel lazy-loads items as the user scrolls, so JavaScript is executed
to scroll the page in steps before grabbing the rendered source.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import xlwt

BASE_URL = "https://www.toutiao.com"


def fetch_page_source(base_url):
    """Open the homepage, click the 科技 tab, scroll to trigger lazy loading,
    and return the rendered HTML. The browser is always closed, even on error
    (the original leaked the driver if anything raised before quit())."""
    driver = webdriver.Chrome()  # start a fresh browser instance
    try:
        driver.get(url=base_url)  # issue a GET for the homepage
        driver.implicitly_wait(10)  # implicit wait so nodes finish loading
        # find_element_by_link_text was removed in Selenium 4; use By.LINK_TEXT.
        driver.find_element(By.LINK_TEXT, '科技').click()
        driver.implicitly_wait(10)
        # Scroll down in three steps so lazy-loaded items get rendered.
        for step in range(3):
            js = 'var q = document.documentElement.scrollTop=' + str(step * 3000)
            driver.execute_script(js)
            time.sleep(2)  # give the page time to load the new batch
        return driver.page_source
    finally:
        driver.quit()


def parse_articles(html_doc, base_url):
    """Extract [title, absolute_url] pairs from the channel page HTML.

    Relative hrefs (no 'http' in them) are prefixed with base_url,
    matching the original script's behavior.
    """
    soup = BeautifulSoup(html_doc, 'html.parser')
    rows = []
    for a in soup.select('.wcommonFeed ul li .rbox-inner .title-box a'):
        href = a['href']
        rows.append([a.text, href if 'http' in href else base_url + href])
    return rows


def save_to_excel(rows, filename):
    """Write a header row plus the [title, url] rows into an .xls workbook."""
    book = xlwt.Workbook(encoding='utf-8')  # create the workbook
    sheet = book.add_sheet('今日头条')  # add the worksheet
    head_data = ['标题', 'url地址']
    # Header row (row 0), two columns.
    for colnum in range(2):
        sheet.write(0, colnum, head_data[colnum])
    # Data rows start at row 1.
    for row_idx, row in enumerate(rows, start=1):
        for colnum in range(2):
            sheet.write(row_idx, colnum, row[colnum])
    book.save(filename)


if __name__ == '__main__':
    html = fetch_page_source(BASE_URL)
    data = parse_articles(html, BASE_URL)
    save_to_excel(data, 'test2018.xls')
输入用户名和密码后爬取对应的页面内容
#!/usr/bin/env python
#-*- coding:utf-8 -*-
"""Log in to Sina with a username/password, then fetch a page that requires
the authenticated session and print its HTML."""
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

browser = webdriver.Chrome()  # start a fresh browser instance
# try/finally guarantees the browser is closed even if a locator fails
# (the original leaked the browser on any exception before quit()).
try:
    browser.get('http://login.sina.com.cn')  # open the login page
    # find_element_by_name/_by_xpath were removed in Selenium 4; use By locators.
    elem_user = browser.find_element(By.NAME, 'username')  # username input
    elem_user.send_keys('username')  # account username (placeholder)
    elem_pwd = browser.find_element(By.NAME, 'password')  # password input
    elem_pwd.send_keys('password')  # account password (placeholder)
    elem_sub = browser.find_element(By.XPATH, "//input[@class='W_btn_a btn_34px']")  # submit button
    elem_sub.click()  # click to log in
    time.sleep(10)  # wait for the login redirect to finish
    current_url = browser.current_url  # URL after login
    print(current_url)
    # Navigate to the page that needs the logged-in session.
    browser.get('http://k.sina.com.cn/article_5807684875_m15a2a3d0b00100i7hw.html?cre=mysinapc&mod=f&loc=11&r=15&doct=0&rfunc=47')
    time.sleep(10)
    html = browser.page_source  # HTML string of the current page
    print(html)
finally:
    browser.quit()
详情请参考官网地址:https://selenium-python.readthedocs.io
#-*- coding:utf-8 -*-
# 下面是一个中间件,后期直接在settings里面调用即可。
# 实现的功能是将web页面上的内容通过selenium爬取,不使用scrapy自带异步框架爬取
import time

from scrapy.http.response.html import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
class SeleniumDownloadMiddleware(object):
    """Scrapy downloader middleware that fetches pages with Selenium instead
    of Scrapy's built-in async downloader, so JS-rendered content is present.

    Enable it via DOWNLOADER_MIDDLEWARES in settings.py.
    """

    def __init__(self):
        # One shared browser instance reused for every request in the crawl.
        self.driver = webdriver.Chrome(executable_path=r'chromedriver.exe')

    def process_request(self, request, spider):
        """Load request.url in the browser, click 'show-more' until the button
        disappears, and return the fully rendered page as an HtmlResponse
        (returning a response here short-circuits Scrapy's own download).
        """
        self.driver.get(request.url)
        time.sleep(1)  # let the initial page render
        # The original looped with a bare `except: pass` and a dead
        # `if not showMore: break` (a found element is always truthy);
        # the loop only ever terminated via the swallowed exception.
        # Catch the specific exception instead and break cleanly.
        while True:
            try:
                show_more = self.driver.find_element_by_class_name('show-more')
            except NoSuchElementException:
                break  # button gone: all content has been loaded
            show_more.click()
            time.sleep(0.3)  # brief pause for the next batch to render
        source = self.driver.page_source
        return HtmlResponse(url=self.driver.current_url, body=source,
                            request=request, encoding='utf-8')