Please credit the source when reposting!
Contents
Environment setup
Define request headers to mimic a browser for the keyword search
Search by keyword and collect the link of each result entry
Visit each result entry and scrape the images on that page
Environment: anaconda3
Python packages: urllib, sys, re, requests, BeautifulSoup (bs4, using the lxml parser)
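As a quick sanity check (my addition, not part of the original post), the imports below should all succeed in the anaconda3 environment before running the crawler; the PyPI package names are requests, beautifulsoup4 and lxml.

# Environment check (illustrative only)
import re, sys, urllib.request
import requests
import bs4
from lxml import etree
print('requests', requests.__version__)
print('beautifulsoup4', bs4.__version__)
print('lxml', etree.LXML_VERSION)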
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
}
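Note that the script below only passes these headers to requests.get when resolving redirect links; the search page itself is fetched with urllib.request.urlopen and no headers. If Baidu rejects the bare request, a common workaround (a sketch of my own, not in the original code) is to wrap the URL in urllib.request.Request so urlopen sends the same browser-like headers:

# Optional hardening: send the headers defined above with urlopen as well
import urllib.request
search_url = 'http://www.baidu.com.cn/s?wd=python&pn=0'  # example search URL
req = urllib.request.Request(search_url, headers=headers)
page = urllib.request.urlopen(req).read()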
def search(key, page_num):  # key: search keyword; page_num: result offset used by Baidu for paging
    url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(key) + '&pn=' + str(page_num)  # wd is the keyword, pn is Baidu's paging offset
    response = urllib.request.urlopen(url)  # fetch the whole result page for this URL
    page = response.read()
    with open('search_test.txt', 'a') as all:  # append each result link found on the page to this file
        soup = BeautifulSoup(page, 'lxml')
        tagh3 = soup.find_all('h3')  # each result title sits in an <h3> tag
        for h3 in tagh3:
            href = h3.find('a').get('href')
            baidu_url = requests.get(url=href, headers=headers, allow_redirects=False)
            real_url = baidu_url.headers.get('Location', '')  # recover the original URL behind Baidu's redirect link
            if real_url.startswith('http'):
                all.write(real_url + '\n')
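For example (an illustrative call, not in the original post), pn steps by 10 per result page, and urllib.parse.quote percent-encodes a non-ASCII keyword before it goes into the URL:

# Example: fetch the first three result pages for a keyword
from urllib import parse
print(parse.quote('深度学习'))   # -> %E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0
for page_num in (0, 10, 20):    # Baidu's pn parameter: 0 = page 1, 10 = page 2, ...
    search('深度学习', page_num)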
def getHtml(url):  # fetch the page behind each result link
    # open a url address
    page = urllib.request.urlopen(url)
    html = page.read()
    return html

def getImg(html):
    # image regular expression (only jpg images are matched here)
    reg = r'http.*?\.jpg'
    # compile the regular expression into an object
    imgre = re.compile(reg)
    html = html.decode('utf-8', errors='ignore')  # python3: bytes -> str, skipping bytes that are not valid utf-8
    print(type(html))
    imglist = re.findall(imgre, html)
    print(imglist)
    # the core of this method: download each remote image directly and name the files with an increasing counter
    x = 0
    for imgurl in imglist:
        urllib.request.urlretrieve(imgurl, 'image/%s.jpg' % x)  # the image/ folder must already exist
        x += 1
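urlretrieve fails if the image/ folder does not exist or if a single URL is unreachable, so a slightly hardened variant (my sketch, not the original code) creates the folder first and skips bad URLs:

import os
import urllib.request

def download_images(imglist, folder='image'):
    # make sure the target folder exists, then download each image in turn
    os.makedirs(folder, exist_ok=True)
    for x, imgurl in enumerate(imglist):
        try:
            urllib.request.urlretrieve(imgurl, os.path.join(folder, '%s.jpg' % x))
        except Exception as e:
            print('skip', imgurl, e)  # skip unreachable images instead of aborting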
# -*- coding: utf-8 -*-
'''
author:Elijah Lee
desc: kiwiSpider
'''
import urllib
import sys
import re
import urllib.request
import requests
from bs4 import BeautifulSoup
from urllib import parse
# import downloadImg

def getHtml(url):
    # open a url address
    page = urllib.request.urlopen(url)
    html = page.read()
    return html

def getImg(html):
    # regular expression for jpg image URLs
    reg = r'http.*?\.jpg'
    # compile the regular expression into an object
    imgre = re.compile(reg)
    html = html.decode('utf-8', errors='ignore')  # python3: bytes -> str, skipping bytes that are not valid utf-8
    print(type(html))
    imglist = re.findall(imgre, html)
    print(imglist)
    # download each remote image directly and name the files with an increasing counter
    x = 0
    for imgurl in imglist:
        urllib.request.urlretrieve(imgurl, 'image/%s.jpg' % x)  # the image/ folder must already exist
        x += 1

def search(key, page_num):
    # define request headers so the request looks like a browser
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, compress',
        'Accept-Language': 'en-us;q=0.5,en;q=0.3',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }
    url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(key) + '&pn=' + str(page_num)  # wd is the keyword, pn is Baidu's paging offset
    response = urllib.request.urlopen(url)
    page = response.read()
    with open('search_test.txt', 'a') as all:
        soup = BeautifulSoup(page, 'lxml')
        tagh3 = soup.find_all('h3')
        for h3 in tagh3:
            href = h3.find('a').get('href')
            baidu_url = requests.get(url=href, headers=headers, allow_redirects=False)
            real_url = baidu_url.headers.get('Location', '')  # get the original url behind Baidu's redirect
            if real_url.startswith('http'):
                all.write(real_url + '\n')
                download(real_url)

# download imgs
def download(url):
    # fetch the page behind the result link
    html = getHtml(url)
    # download the images on that page into the image folder
    getImg(html)

if __name__ == '__main__':
    key = input('input key word:')
    for page_num in range(0, 30, 10):
        search(key, page_num)
    print("over!")
Reference: https://www.cnblogs.com/fnng/p/3576154.html