Scraping Jandan Meizitu with Python

Prerequisites: Python 3.5 + Requests + BeautifulSoup
This is a practice crawler project. It only grabs the images and pays no attention to performance. Straight to the code:

import requests
from bs4 import BeautifulSoup
import re

Directory = 'E:/meizitu/'
base_url = "http://jandan.net/ooxx/page-"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
}

def getMeiziTu(page):
    meizi_url = base_url + str(page) + "#comments"
    response = requests.get(meizi_url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html5lib")
    # '屏蔽' ("blocked") appears on pages Jandan refuses to serve; skip those
    if soup.find(text=re.compile('屏蔽')) is None:
        print('=============================')
        print('Downloading page ' + str(page))
        # collect the <img> tags that carry the image addresses
        img = []
        imgall = soup.body('li', id=re.compile("comment-"))
        for tmp in imgall:
            img += tmp.div.find(
                'div', class_='row').find(
                'div', class_='text').find_all(
                'img', src=True)
        for n, girl in enumerate(img):
            print('    image ' + str(n), end='')
            # prefer the full-size 'org_src' address when the tag has one
            url = girl['org_src'] if girl.has_attr('org_src') else girl['src']
            with open(Directory + 'meizitu' + str(page) + '-' + str(n)
                              + url[-4:], 'wb') as f:
                f.write(requests.get(url, headers=headers).content)
            print('...OK!')
        print('Page ' + str(page) + ' done!')
        return True
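The script defines getMeiziTu but never calls it; a minimal driver loop (the page range here is just an arbitrary example) could look like this:

if __name__ == "__main__":
    # download a handful of pages; adjust the range as needed
    for page in range(400, 410):
        getMeiziTu(page)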

The idea is to collect the image URLs from each Meizitu page and save the files locally. But there is a catch: Jandan has since added an anti-crawler mechanism that obfuscates the image URLs. You can see the final URLs in the F12 DevTools, but they never appear in the raw HTML, so BeautifulSoup cannot parse them out.
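You can verify this by comparing the static HTML with what DevTools shows. A quick check along these lines (page 400 is an arbitrary example, and the User-Agent is just a placeholder) either prints nothing or only placeholder tags, never the real image URLs:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA string
resp = requests.get("http://jandan.net/ooxx/page-400#comments", headers=headers)
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, "html.parser")
# In the raw HTML the real image URLs never appear; they are filled in
# later by the page's JavaScript, which is why the parser finds nothing usable.
for img in soup.select("div.text img"):
    print(img)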

I originally set out to find a way to decode them, but while searching I stumbled on Selenium. Selenium is a web automation testing tool that drives a real browser the way a user would, so the page's JavaScript actually runs and the image URLs can be read straight from the rendered DOM.
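Selenium itself installs with pip install selenium, but the browser driver is a separate download: either the PhantomJS binary or a chromedriver build matching your Chrome version. The driver's path is then passed to the constructor, as in the script below.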

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

Directory = 'E:/meizitu/'
base_url = "http://jandan.net/ooxx/page-"
path = r"F:\chrome\chromedriver.exe"
driver = webdriver.PhantomJS(executable_path=r'F:\phantomjs-2.1.1-windows\phantomjs-2.1.1-windows\bin\phantomjs.exe')
# driver = webdriver.Chrome(executable_path=path)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
}
img_url = []
urls = [base_url + str(i) + "#comments" for i in range(400, 500)]

def getImg():
    for n, url in enumerate(img_url, 1):
        print('Image ' + str(n), end='')
        with open(Directory + url[-15:], 'wb') as f:
            f.write(requests.get(url, headers=headers).content)
        print('...OK!')

def getImgUrl(url):
    driver.get(url)                            # let the browser execute the page's JavaScript
    data = driver.page_source                  # grab the rendered HTML
    soup = BeautifulSoup(data, "html.parser")  # parse the page
    images = soup.select("a.view_img_link")    # locate the full-size image links
    for i in images:
        z = i.get('href')
        if 'gif' in z:                         # skip animated gifs
            continue
        http_url = "http:" + z                 # the hrefs are protocol-relative
        img_url.append(http_url)
        print(http_url)


if __name__ == "__main__":
    for url in urls:
        getImgUrl(url)
    driver.quit()
    getImg()

PS: Go easy on Jandan; heavy crawling puts real load on their servers. Once you've got the hang of it, practice on bigger sites instead.
