网络爬虫:依据搜索词爬取各网页上图片

转载请注明出处!

目录

环境配置

首先定义请求头(headers),用于模仿浏览器搜索关键词

依据关键词搜索并获取各词条链接

进入每一个词条抓取该网页中图片


环境配置

环境:anaconda3

python包:urllib、sys、re、requests、BeautifulSoup(bs4,使用lxml解析器)

首先定义请求头(headers),用于模仿浏览器搜索关键词

# HTTP request headers sent when resolving search-result links, so that the
# request looks like it comes from a desktop Firefox browser.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, compress",
    "Accept-Language": "en-us;q=0.5,en;q=0.3",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) "
        "Gecko/20100101 Firefox/22.0"
    ),
}

依据关键词搜索并获取各词条链接

def search(key,page_num):
    """Run one Baidu search page for ``key`` and log the real result URLs.

    Each result heading (``<h3>``) links to a Baidu redirect address. That
    address is requested without following redirects, and the ``Location``
    header of the reply is taken as the real URL, which is appended to
    ``search_test.txt`` (one URL per line).

    Args:
        key: Search keyword.
        page_num: Value for Baidu's ``pn`` paging parameter.
    """
    # wd is the search keyword; pn is the parameter Baidu uses for paging.
    url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(key) + '&pn=' + str(page_num)

    # Close the search response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        page = response.read()

    soup = BeautifulSoup(page, 'lxml')
    with open('search_test.txt', 'a') as url_log:
        for h3 in soup.find_all('h3'):
            # Not every heading wraps a link, and a link may lack an absolute
            # href; skip those instead of failing on them.
            anchor = h3.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href or not href.startswith('http'):
                continue
            baidu_url = requests.get(url=href, headers=headers, allow_redirects=False)
            # A reply that is not a redirect carries no Location header.
            real_url = baidu_url.headers.get('Location')
            if real_url and real_url.startswith('http'):
                url_log.write(real_url + '\n')

进入每一个词条抓取该网页中图片

def getHtml(url):
    """Open ``url`` and return the raw response body.

    The response is used as a context manager so it is closed as soon as the
    body has been read, rather than being left open until garbage collection.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The undecoded response body as ``bytes``.
    """
    with urllib.request.urlopen(url) as page:
        return page.read()

def getImg(html):
    """Download every ``.jpg`` URL found in ``html`` into the ``image`` folder.

    Files are named ``<n>.jpg`` with an increasing number. Numbers already
    present in the folder are skipped, so images saved by an earlier call are
    not overwritten. A download that fails is reported and skipped.

    Args:
        html: Page source, either raw ``bytes`` (decoded as UTF-8) or ``str``.
    """
    import os  # local import: only this function needs it

    # Pages are not always valid UTF-8 (GBK is common); replace undecodable
    # bytes instead of aborting. The URL pattern below only needs ASCII text.
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='replace')

    # Match one URL at a time: stop at whitespace, quotes and angle brackets
    # so a match cannot run from one link across markup to a later ".jpg".
    imgre = re.compile(r'https?://[^\s"\'<>]+?\.jpg')
    imglist = imgre.findall(html)
    print(imglist)
    if not imglist:
        return

    out_dir = 'image'
    os.makedirs(out_dir, exist_ok=True)

    x = 0
    for imgurl in imglist:
        # Advance to the next file name that is still free.
        while os.path.exists(os.path.join(out_dir, '%s.jpg' % x)):
            x += 1
        target = os.path.join(out_dir, '%s.jpg' % x)
        try:
            urllib.request.urlretrieve(imgurl, target)
        except (OSError, ValueError) as err:
            # One unreachable or malformed image URL should not stop the rest.
            print('skip %s: %s' % (imgurl, err))
            continue
        x += 1

 

完整代码

# -*- coding: utf-8 -*-

'''
author:Elijah Lee
desc: kiwiSpider
'''
import urllib
import sys
import re 
import urllib.request
import urllib3.request
import requests

from bs4 import BeautifulSoup
from urllib import parse

# import downloadImg

def getHtml(url):
    """Open ``url`` and return the raw response body.

    The response is used as a context manager so it is closed as soon as the
    body has been read, rather than being left open until garbage collection.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The undecoded response body as ``bytes``.
    """
    with urllib.request.urlopen(url) as page:
        return page.read()

def getImg(html):
    """Download every ``.jpg`` URL found in ``html`` into the ``image`` folder.

    Files are named ``<n>.jpg`` with an increasing number. Numbers already
    present in the folder are skipped, so images saved by an earlier call are
    not overwritten. A download that fails is reported and skipped.

    Args:
        html: Page source, either raw ``bytes`` (decoded as UTF-8) or ``str``.
    """
    import os  # local import: only this function needs it

    # Pages are not always valid UTF-8 (GBK is common); replace undecodable
    # bytes instead of aborting. The URL pattern below only needs ASCII text.
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='replace')

    # Match one URL at a time: stop at whitespace, quotes and angle brackets
    # so a match cannot run from one link across markup to a later ".jpg".
    imgre = re.compile(r'https?://[^\s"\'<>]+?\.jpg')
    imglist = imgre.findall(html)
    print(imglist)
    if not imglist:
        return

    out_dir = 'image'
    os.makedirs(out_dir, exist_ok=True)

    x = 0
    for imgurl in imglist:
        # Advance to the next file name that is still free.
        while os.path.exists(os.path.join(out_dir, '%s.jpg' % x)):
            x += 1
        target = os.path.join(out_dir, '%s.jpg' % x)
        try:
            urllib.request.urlretrieve(imgurl, target)
        except (OSError, ValueError) as err:
            # One unreachable or malformed image URL should not stop the rest.
            print('skip %s: %s' % (imgurl, err))
            continue
        x += 1

def search(key,page_num):
    """Run one Baidu search page for ``key``; log and crawl each result.

    Each result heading (``<h3>``) links to a Baidu redirect address. That
    address is requested without following redirects, and the ``Location``
    header of the reply is taken as the real URL. The real URL is appended to
    ``search_test.txt`` (one per line) and then passed to ``download``.

    Args:
        key: Search keyword.
        page_num: Value for Baidu's ``pn`` paging parameter.
    """
    # Browser-like request headers used when resolving the redirect links.
    headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }

    # wd is the search keyword; pn is the parameter Baidu uses for paging.
    url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(key) + '&pn=' + str(page_num)

    # Close the search response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        page = response.read()

    soup = BeautifulSoup(page, 'lxml')
    with open('search_test.txt', 'a') as url_log:
        for h3 in soup.find_all('h3'):
            # Not every heading wraps a link, and a link may lack an absolute
            # href; skip those instead of failing on them.
            anchor = h3.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href or not href.startswith('http'):
                continue
            baidu_url = requests.get(url=href, headers=headers, allow_redirects=False)
            # A reply that is not a redirect carries no Location header.
            real_url = baidu_url.headers.get('Location')
            if not real_url or not real_url.startswith('http'):
                continue
            url_log.write(real_url + '\n')
            try:
                download(real_url)
            except (OSError, ValueError) as err:
                # A page that cannot be fetched or decoded should not end the
                # crawl of the remaining results.
                print('skip %s: %s' % (real_url, err))
#download imgs
def download(url):
    """Fetch the page at ``url`` and save the images found in it."""
    # getHtml returns the page source; getImg extracts and stores its images.
    getImg(getHtml(url))
                
if __name__ == '__main__':
    # Ask for the keyword once, then crawl the result pages at pn = 0, 10, 20.
    key = input('input key word:')
    for page_num in (0, 10, 20):
        search(key, page_num)
    print("over!")

参考:https://www.cnblogs.com/fnng/p/3576154.html

 

 

你可能感兴趣的:(网络爬虫:依据搜索词爬取各网页上图片)