无意中看到一篇爬取煎蛋网美女照片的博客,感觉很有趣,就仿照着写了几个爬虫,把过程记录下来。
一、安装必要的库
1.1 beautifulsoup
pip install beautifulsoup4
1.2 requests
pip install requests
1.3 lxml
pip install lxml
几个很重要的函数
1. request.urlretrieve(url,local) 把远程数据下载到本地
2. req = requests.get(url = url,headers = headers)
req.encoding = 'utf-8'
xhtml = req.text
获取网页的文本信息
二、Python 正则表达式
获取图片网址
reg = '[^"]*?\.jpg' #表示获取jpg结尾的文本
imgre = re.compile(reg)
imgList = imgre.findall(html)
浏览器的请求头(headers)
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}
模拟浏览器访问,防止被服务器禁止下载
opener=request.build_opener()
opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36')]
request.install_opener(opener)
request.urlretrieve(imgurl,'F:/python3/webImage/%s_%s.jpg'%(page,x))
三、写爬虫
import re
import requests
import time
from urllib import request
from urllib.request import urlretrieve ,HTTPError ,urlopen,URLError
url = ""
# Fetch the text of a web page.
def getHtml(url, timeout=None):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request carries a desktop-browser ``User-Agent`` header so that
    it looks like an ordinary browser visit.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``.  The default ``None`` keeps the original
            behaviour of waiting indefinitely; pass a number to make a
            stalled connection raise instead of hanging the crawl.

    Returns:
        The page content as a ``str``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    req = requests.get(url=url, headers=headers, timeout=timeout)
    # Force UTF-8 rather than trusting the encoding guessed from the headers.
    req.encoding = 'utf-8'
    return req.text
# Parse the page and collect the image URLs.
def getImageList(html):
    """Return the distinct ``.jpg`` URLs found in *html*, in page order.

    Every run of non-quote characters ending in ``.jpg`` is treated as an
    image address.  Duplicates are dropped, keeping the first occurrence.
    Addresses that do not contain ``http`` (for example protocol-relative
    ``//host/a.jpg``) get an ``http://`` prefix.

    Args:
        html: Page source as text.

    Returns:
        A list of URL strings; empty when the page has no matches.
    """
    # Raw string: '\.' in a normal literal is an invalid escape sequence.
    matches = re.findall(r'[^"]*?\.jpg', html)

    urls = []
    # dict.fromkeys drops duplicates in one pass while keeping first-seen order.
    for address in dict.fromkeys(matches):
        if 'http' not in address:
            # 'http://' + '//host/...' yields four slashes; collapse them.
            urls.append(('http://' + address).replace('////', '//'))
        else:
            urls.append(address)
    return urls
# Download the images.
def download(imgList, page, save_dir='F:/python3/webImage'):
    """Download every URL in *imgList* to ``<save_dir>/<page>_<n>.jpg``.

    ``n`` counts the images from 1 in list order.  A failed download is
    reported on stdout and skipped; its number is still consumed.  The
    function pauses one second after each image.

    Args:
        imgList: Iterable of image URLs; JSON-escaped slashes (``\\/``)
            are accepted.
        page: Page number used as the file-name prefix.
        save_dir: Target directory.  It must already exist; it is not
            created here.
    """
    # Install a browser-like User-Agent once, so the server does not
    # refuse the plain urllib client.
    opener = request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36')]
    request.install_opener(opener)

    for x, imgurl in enumerate(imgList, start=1):
        # URLs lifted from embedded JSON carry escaped slashes.
        imgurl = imgurl.replace('\\/', '/')
        print(imgurl)
        try:
            request.urlretrieve(imgurl, '%s/%s_%s.jpg' % (save_dir, page, x))
        except HTTPError as e:
            print('e.value HTTPError')
            print(e.code)
            print(e.read())
        except URLError as e:
            print('e.value URLError')
            print(e.reason)
        except TimeoutError as e:
            print('e.value TimeoutError')
            print(e)
        time.sleep(1)  # pause one second between images
def downImageNum(pagenum):
    """Download the images from listing pages 1 through *pagenum*.

    Each page's address is built here, so every page is fetched exactly
    once and its images are saved under its own page number.

    Args:
        pagenum: Number of the last page to crawl (inclusive).
    """
    for page in range(1, pagenum + 1):
        # Page 1 is the bare listing address; later pages are numbered.
        if page == 1:
            page_url = 'http://jandan.net/ooxx'
        else:
            page_url = 'http://jandan.net/ooxx/page-%d#comments' % page
        html = getHtml(page_url)  # HTML text of the page
        imageList = getImageList(html)  # image addresses found on it
        download(imageList, page)  # save them all


if __name__ == '__main__':
    # Crawl pages 1..87 once each.
    downImageNum(87)
参考
python3网络爬虫:爬取煎蛋网美女照片
参照上文,又写了下面这个爬虫,把写的记录下来。
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import os
import time
import re
if __name__ == '__main__':
    # Crawl listing pages 1-49 of rs05.com.  For every movie linked from
    # a listing page, open its detail page and save each "edui-img"
    # image into save_dir (which must already exist).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    save_dir = 'E:/cache-work/python3/images1/'

    for num in range(1, 50):
        url = 'http://www.rs05.com/movie/?p=%d' % num

        # Step 1: collect the detail-page links on this listing page.
        try:
            req = requests.get(url=url, headers=headers, timeout=30)
            req.encoding = 'utf-8'
            bf = BeautifulSoup(req.text, 'html.parser')
            detail_urls = [
                tag.get('href')
                for tag in bf.find_all(class_="movie-thumbnails")
                if tag.get('href')
            ]
        except requests.RequestException as e:
            # requests raises its own exception types, not TimeoutError.
            print(e)
            detail_urls = []

        for detail_url in detail_urls:
            # Step 2: collect the image addresses on the detail page.
            try:
                req = requests.get(url=detail_url, headers=headers, timeout=30)
                req.encoding = 'utf-8'
                bf = BeautifulSoup(req.text, 'html.parser')
                image_urls = [
                    tag.get('src')
                    for tag in bf.find_all(class_="edui-img")
                    if tag.get('src')
                ]
            except requests.RequestException as e:
                print(e)
                continue

            # Step 3: download each image once, sending the browser headers.
            for index, image_url in enumerate(image_urls, start=1):
                # Timestamp plus index gives a readable file name.
                filename = '%d-%s-%d.jpg' % (
                    num, time.strftime('%Y%m%d%H%M%S'), index)
                try:
                    img_req = requests.get(
                        url=image_url, headers=headers, timeout=30)
                    img_req.raise_for_status()
                    with open(save_dir + filename, 'wb') as f:
                        f.write(img_req.content)
                except (requests.RequestException, OSError) as e:
                    print(e)
                time.sleep(1)

        # Long pause between listing pages; longer still every 7th page.
        if num % 7 == 0:
            time.sleep(500)
        else:
            time.sleep(100)
    print('下载完成!')
四、总结
新手上路,总是遇到很多bug,感谢发达的信息时代,遇到问题,很快就能查到结果
1、TabError。原来回车换行的时候,编辑器是按照 Tab 缩进的,要用四个空格代替;
2、有些网站,图片下载时返回 403 错误,下载的时候添加请求头,模拟浏览器就 OK 了。
3、开始的时候,没加 try…except 异常处理环节,程序很容易异常终止。
4、目前下载的图片存在重复,因为不同网址 对应相同的图片,正在想办法解决。