Python爬虫实战笔记_1-4 爬动态加载页面

练习如何爬一个动态加载的网页,并将图片下载到本地。

美女图片网址打开不容易,换成爬knewone的图片。加载原理相同。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# filter out specifical info from a dynamic webpage
#
import os
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup


def getemtext(element):
    """Return the element's text content stripped of leading/trailing whitespace."""
    raw = element.get_text()
    return raw.strip()

def get_target_info(url):
    """Scrape one listing page and return a list of {'title', 'img'} dicts.

    Each image URL has the upyun thumbnail suffix (everything from '!'
    onwards) removed so the full-size image is referenced.  Each scraped
    entry is also printed, mirroring the notebook-style progress output.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    title_tags = soup.select('section.content > h4 > a')
    img_tags = soup.select('header.cover > a > img')
    results = []
    for title_tag, img_tag in zip(title_tags, img_tags):
        entry = {
            "title": getemtext(title_tag),
            "img": img_tag.get('src').split('!')[0],
        }
        print(entry)
        results.append(entry)
    return results
    #downimg(data['img'], data['title'])

def downimg(url, filename=''):
    """Download *url* into the 'knewonepic' folder using urllib.

    If *filename* is empty the basename of the URL is used; otherwise the
    URL's last dot-suffix is appended to the supplied name as an extension.

    NOTE(review): assumes the 'knewonepic' directory already exists (the
    __main__ block creates it) — confirm before calling from elsewhere.
    """
    # BUG FIX: a plain `import urllib` at module level does not guarantee
    # the `urllib.request` submodule is loaded; import it explicitly here
    # so urlretrieve is always available.
    import urllib.request

    if not filename:
        filename = os.path.basename(url)
    else:
        filename = filename + '.' + url.split('.')[-1]
    filepath = os.path.join('knewonepic', filename)
    urllib.request.urlretrieve(url, filepath)

def downimgproxy(url, filename=''):
    """Download *url* through an HTTP proxy and save it under 'knewonepic'.

    Returns silently (writing nothing) when the server does not answer
    200 OK.  Naming follows the same rule as downimg(): an empty
    *filename* means "use the URL basename", otherwise the URL's last
    dot-suffix is appended to the supplied name.
    """
    # BUG FIX: the original mapping only covered 'http', so https:// image
    # URLs (which is what knewone actually serves) bypassed the proxy
    # entirely.  Route both schemes through the same proxy address.
    proxy_addr = "207.62.234.53:8118"
    proxies = {'http': proxy_addr, 'https': proxy_addr}
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, proxies=proxies, headers=headers)
    if response.status_code != 200:
        return

    if not filename:
        filename = os.path.basename(url)
    else:
        filename = filename + '.' + url.split('.')[-1]
    filepath = os.path.join('knewonepic', filename)

    # response.content is the raw (undecoded) body — write it as bytes.
    with open(filepath, 'wb') as f:
        f.write(response.content)

if __name__ == "__main__":
    # Make sure the download target directory exists before fetching.
    folder = 'knewonepic'
    if not os.path.exists(folder):
        os.mkdir(folder)

    base_url = 'https://knewone.com/things/categories/sheng-huo'
    # Walk the first nine pages of the category listing and download
    # every scraped image, named after its scraped title.
    for pageid in range(1, 10):
        page_url = base_url + "?page={}".format(pageid)
        for item in get_target_info(page_url):
            downimgproxy(item['img'], item['title'])

截取部分运行结果

{'img': 'https://making-photos.b0.upaiyun.com/photos/2b70f6cd1b3f54f693a04746c697dc4c.jpg', 'title': 'Humanscale World Chair 工作椅'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/abe59bbfa954ba252bcdb69d21893246.jpg', 'title': 'Lithe Clock'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/93a1293213d0c97322f457dedd484576.jpg', 'title': '日式取暖桌こたつ'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/503d0ec2e327089a8b05b1b94f0a1611.jpg', 'title': '磁力沙漏'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/fcddd7860eb8d3a80ab6c4c7676ea899.jpg', 'title': 'Anglepoise Original 1227 台灯'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/5aa04046f2a53d4a3eb5a74d92fc0981.jpg', 'title': 'Starry Light'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/549b66529d656c3dd1194926a8d1b71e.png', 'title': 'The Swiss Musical Starship'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/91ed4e9eab2d3969ae6fae753a731a92.jpg', 'title': 'Philips Hue LED 灯泡'}
{'img': 'https://making-photos.b0.upaiyun.com/photos/cc2f649e5ff5990492ebd32931ce90f5.jpg', 'title': 'Slimline'}

下载到本地文件夹中的图片,用从页面中爬出来的title为图片命名。


Python爬虫实战笔记_1-4 爬动态加载页面_第1张图片
Screen Shot 2016-06-27 at 9.28.00 PM.png
总结
  • 下载图片的方法
    (1) downimg() 用urllib.request.urlretrieve方法实现图片下载
    (2) downimgproxy() 用proxy实现图片下载
  • 使用代理的方法还得再研究研究。 照视频里老师的例子把代码运行成功了,但还没有真正想清楚

你可能感兴趣的:(Python爬虫实战笔记_1-4 爬动态加载页面)