之前对 Python 感兴趣,主要是平常喜欢爬一些小黄图……首先,我的环境是 Windows;selenium 使用 pip install selenium
安装即可,PhantomJS 不需要安装,直接下载压缩包解压即可。
先看下主要的效果图
先看下一个简单的demo_1.py
from selenium import webdriver
# Start a PhantomJS session from the locally unpacked binary.
driver = webdriver.PhantomJS(executable_path="C:/D-Dir/phantomjs-2.1.1-windows/bin/phantomjs.exe")
try:
    driver.get("http://www.csdn.net")
    data = driver.title
    # Save a screenshot of the loaded page to csdn.png.
    driver.save_screenshot('csdn.png')
    print(data)
finally:
    # Always shut the browser process down, even if loading the page fails,
    # so no PhantomJS process is left running.
    driver.quit()
这段代码很简单:访问 http://www.csdn.net ,打印页面标题,
然后把网页截图保存到本地(csdn.png)。
看下一般的demo_2.py
import time
from selenium import webdriver
# User-agent string identifying a desktop Chrome 57 browser.
UA = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/57.0.2987.110 Safari/537.36'
)
# Header mapping carrying the user agent; not referenced by the code shown here.
header_info = {'user-agent': UA}
chromedriver_path = 'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe'
# Shared Chrome session used by test_2() and test_3().
driver = webdriver.Chrome(executable_path=chromedriver_path)
# Search-results page that test_2() opens first.
url = 'https://www.duitang.com/search/?kw=文豪野犬&type=feed'
def test_2():
    """Print image URLs from the first result page, then from every later page.

    Opens the module-level ``url`` in the shared ``driver``, scrolls to the
    bottom ten times, writes the page source to ``test.html``, prints the
    ``src`` of the ``img`` inside each ``mbpho`` element, reads the highest
    page number from the links of the ``woo-pager`` element, and calls
    ``test_3`` for pages 2 through that number.
    """
    driver.get(url=url)
    time.sleep(1)
    # Scroll to the bottom repeatedly, pausing one second after each scroll.
    for _ in range(10):
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(1)
    print('driver.title===%s' % driver.title)
    with open('test.html', 'w', encoding='utf-8') as f:
        f.write(driver.page_source)

    entries = driver.find_elements_by_class_name('mbpho')
    for count, mbp in enumerate(entries, 1):
        src = mbp.find_element_by_tag_name('img').get_attribute('src')
        print('当前是第%d页,,,%d张图片,地址是%s' % (1, count, src))

    page_total = 1
    # find_elements returns an empty list when the page has no pager, whereas
    # find_element raises NoSuchElementException and would abort the run.
    pagers = driver.find_elements_by_class_name('woo-pager')
    if pagers:
        for a in pagers[0].find_elements_by_tag_name('a'):
            try:
                page_total = max(page_total, int(a.text))
            except ValueError as e:
                # Link text that is not a number carries no page index.
                print(e)
    print('获取到的最大页数是==%d' % page_total)

    # Page 1 was handled above; the range is empty when there is only one page.
    for page_num in range(2, page_total + 1):
        temp_url = 'https://www.duitang.com/search/?kw=%E6%96%87%E8%B1%AA%E9%87%8E%E7%8A%AC&type=feed#!s-p' + str(
            page_num)
        test_3(page_num, temp_url)
def test_3(page_num, url):
    """Load one result page and print the image URL of every ``mbpho`` entry.

    page_num: page number shown in each printed line.
    url: address opened in the shared module-level ``driver``.
    """
    driver.get(url=url)
    time.sleep(1)
    scroll_to_bottom = 'window.scrollTo(0,document.body.scrollHeight)'
    remaining = 10
    # Scroll to the bottom ten times, waiting one second after each scroll.
    while remaining:
        driver.execute_script(scroll_to_bottom)
        time.sleep(1)
        remaining -= 1
    entries = driver.find_elements_by_class_name('mbpho')
    for index, entry in enumerate(entries, start=1):
        image = entry.find_element_by_tag_name('img')
        print('当前是第%d页,,,%d张图片,地址是%s' % (page_num, index, image.get_attribute('src')))
if __name__ == '__main__':
    try:
        test_2()
    finally:
        # Close the Chrome session started at module level, even if the
        # scrape raises, so the browser and chromedriver are not left running.
        driver.quit()
说下运行结果:demo 会读取所访问网址里所有图片的地址,然后打印出来。这里需要注意的是使用了下面这段代码:
for i in range(10):
driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
time.sleep(1)
循环 10 次滚动到页面底部,每次滚动后等待 1 秒。
最后看下有问题的demo_3.py
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# User-agent string identifying a desktop Chrome 57 browser.
UA = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/57.0.2987.110 Safari/537.36'
)
# Header mapping carrying the user agent; not referenced by the code shown here.
header_info = {'user-agent': UA}
PhantomJS = 'C:/D-Dir/phantomjs-2.1.1-windows/bin/phantomjs.exe'
service_args = [
    '--load-images=no',  # disable image loading
    '--disk-cache=yes',  # enable the disk cache
    '--ignore-ssl-errors=true',  # ignore HTTPS errors
]
# Shared PhantomJS session used by test_2() and is_ele_exist().
browser = webdriver.PhantomJS(executable_path=PhantomJS, service_args=service_args)
# Search-results page that test_2() opens.
url = 'https://www.duitang.com/search/?kw=文豪野犬&type=feed'
def test_2():
    """Open the search page in PhantomJS and report whether results appeared.

    Loads the module-level ``url`` in the shared ``browser``, waits up to ten
    seconds for an element with class ``mbpho``, prints the page source, and
    prints a success or failure message. The browser is always quit before
    returning. Returns ``None``.
    """
    # Imported here so the fix does not depend on the file-level import block.
    from selenium.common.exceptions import TimeoutException

    try:
        browser.get(url=url)
        # Fixed pause kept from the original before the explicit wait below.
        time.sleep(5)
        try:
            # Wait until at least one 'mbpho' element is present in the DOM.
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'mbpho')))
        except TimeoutException:
            # until() raises on timeout; treat that as "content not loaded"
            # instead of letting the script die with a traceback.
            loaded = False
        else:
            loaded = bool(is_ele_exist(browser))
        # Dump the source after waiting so it reflects the final page state.
        print(browser.page_source)
        if not loaded:
            # Failure means the element is absent; the original test was inverted.
            print("登录失败")
            return None
        print("登陆成功")
    finally:
        # Shut PhantomJS down on every path so no process is left behind.
        browser.quit()
def is_ele_exist(browser):
    """Return True if the page has at least one element with class ``mbpho``.

    browser: object exposing ``find_elements_by_class_name``.

    ``find_elements_*`` returns an empty list when nothing matches instead of
    raising, so the result must be checked for emptiness. Any error raised by
    the lookup itself is treated as "not present", as in the original.
    """
    try:
        return bool(browser.find_elements_by_class_name('mbpho'))
    except Exception:
        # Best effort: a failed lookup counts as the element being absent.
        return False
# Run the PhantomJS demo only when this file is executed as a script.
if "__main__" == __name__:
    test_2()
这里获取出来的 browser.page_source
并没有图片数据,用了 WebDriverWait(...).until
还是不行;如果有大神知道,请告知我怎么解决一下,谢谢……