爬取动态加载页面 & selenium & 无头浏览器(Chrome / Firefox)
实例:使用无头 Firefox 抓取花瓣网(huaban.com)画板中的图片
import requests
import os
import re
from selenium import webdriver
from time import sleep
from lxml import etree
from multiprocessing.dummy import Pool
def get_date(num):
    """Return the board page's HTML after scrolling the window ``num`` times.

    Starts a headless Firefox session, opens the huaban.com board whose
    code is held in the module-level name ``c`` (assigned in the
    ``__main__`` section), scrolls ``num`` times with a one-second pause
    after each scroll, and returns the resulting page source.

    Args:
        num: Number of scroll steps; ``0`` returns the page as first loaded.

    Returns:
        The page source (HTML text) reported by the browser.
    """
    option = webdriver.FirefoxOptions()
    option.add_argument('--headless')
    browser = webdriver.Firefox(options=option)
    try:
        browser.get('https://huaban.com/boards/' + str(c) + '/')
        # range(0) is empty, so num == 0 needs no separate branch.
        for _ in range(num):
            # NOTE(review): this always scrolls to the fixed offset 10240;
            # confirm that is enough once the page grows past that height.
            browser.execute_script('window.scrollTo(0,10240)')
            sleep(1)
        return browser.page_source
    finally:
        # Always shut the browser down, even when loading or scrolling fails,
        # so no headless Firefox process is left behind.
        browser.quit()
def get_url(url):
    """Fetch one pin page, extract its image key and download the image.

    Args:
        url: Site-relative path (an ``href`` collected from the board page);
            it is appended to ``https://huaban.com``.

    Returns:
        None. The image is written to disk by ``save_img``. A page whose
        source contains no matching JPEG ``"key"`` entry is reported and
        skipped instead of raising ``IndexError``, so one unexpected page
        does not abort the whole ``pool.map`` batch.
    """
    url = 'https://huaban.com' + url
    req = requests.get(url)
    keys = re.findall('"key":"(.*?)", "type":"image/jpeg", "height":', req.text)
    if not keys:
        print('no JPEG key found, skipped: ' + url)
        return
    # Only the first key on the page is used.
    img_url = 'http://hbimg.huabanimg.com/' + keys[0]
    save_img(img_url)
def save_img(url):
    """Download ``url`` and write the response body to a ``.jpg`` file.

    The file name is the slice ``url[-30:-13]`` and the file is placed in
    the directory named by the module-level name ``b`` (assigned in the
    ``__main__`` section). The URL is printed before it is requested.

    Args:
        url: Absolute URL of the image to download.
    """
    stem = url[-30:-13]
    print(url)
    response = requests.get(url)
    target = b + '/' + stem + '.jpg'
    with open(target, 'wb') as out:
        out.write(response.content)
if __name__ == '__main__':
    # c (board code) and b (output directory) are read as module globals by
    # get_date() and save_img(), so these names must stay as they are.
    c = int(input('输入链接码: '))
    a = int(input('输入下载的页数: '))
    b = input('创建目录名: ')
    # exist_ok avoids a FileExistsError when re-running into the same folder.
    os.makedirs(b, exist_ok=True)

    # Collect pin links from the page after 0, 1, ..., a-1 scroll steps.
    hrefs = []
    for i in range(a):
        date = get_date(i)
        soup = etree.HTML(date)
        hrefs += soup.xpath('//*[@id="waterfall"]/div/a/@href')

    # Successive snapshots overlap; drop duplicates, keeping first-seen order.
    unique_hrefs = list(dict.fromkeys(hrefs))

    # Thread pool (multiprocessing.dummy) with 4 workers for the downloads.
    pool = Pool(4)
    try:
        pool.map(get_url, unique_hrefs)
    finally:
        pool.close()
        pool.join()
    print('\n'+'....下载完成....')
谷歌浏览器的驱动下载地址:
http://chromedriver.storage.googleapis.com/index.html
使用selenium打开百度
from selenium import webdriver
from time import sleep
# Open Baidu in Chrome, type a query into the element with id 'kw', then
# click the element with id 'su', pausing two seconds after each step.
# NOTE: the positional driver path and find_element_by_id are the Selenium 3
# API; newer Selenium releases replace them with Service(...) and
# find_element(By.ID, ...).
driver = webdriver.Chrome('./chromedriver.exe')
try:
    driver.get('http://www.baidu.com')
    driver.find_element_by_id('kw').send_keys('人民币')
    sleep(2)
    driver.find_element_by_id('su').click()
    sleep(2)
finally:
    # Close the browser even if a lookup or click above fails.
    driver.quit()
find_element_by_id 根据 id 查找节点(返回单个节点)
find_elements_by_name 根据 name 属性查找节点(返回节点列表)
find_elements_by_xpath 根据 xpath 表达式查找节点(返回节点列表)
find_elements_by_tag_name 根据标签名查找节点(返回节点列表)
find_elements_by_class_name 根据 class 名查找节点(返回节点列表)
无头Chrome
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Run Chrome without a visible window, load Baidu, wait three seconds and
# save a screenshot of the page to 'baid.png'.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
path = './chromedriver.exe'
# NOTE: executable_path= is the Selenium 3 way of passing the driver
# location; newer Selenium releases expect a Service object instead.
browser = webdriver.Chrome(executable_path=path,options=chrome_options)
try:
    url ='http://www.baidu.com/'
    browser.get(url)
    time.sleep(3)
    browser.save_screenshot('baid.png')
finally:
    # Close the headless browser even if loading or the screenshot fails.
    browser.quit()