1. 分析网站
一张照片经过三级链接获取
加载链接->网页链接->图片链接
图片集合链接
图片链接的两种方式
2. 获取链接前准备
import requests
import re
from urllib.parse import urlencode
# Base endpoint of Toutiao's search-content API; query params are appended by urlencode.
base_url = 'https://www.toutiao.com/api/search/content/?'
# Request headers that mimic a browser XHR request so the API returns data.
headers = {
#'Host':'www.toutiao.com',##Better to omit Host: it differs per page, so leaving it out lets the request behave like a plain browser request
'Referer':'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
'X-Requested-With':'XMLHttpRequest',
# NOTE(review): this Accept value looks like a copy/paste of a User-Agent string, not a MIME type — confirm; the server apparently tolerates it.
'Accept':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
'Accept-encoding':'gzip, deflate, br',
'Accept-language':'zh-CN,zh;q=0.9',
# Session cookie captured from a browser; expires, so it must be refreshed when scraping stops working.
'Cookie':'csrftoken=d32c8983fb8bc9e7566efe97b76ffeff; tt_webid=6812819376333932046;'
' ttcid=b4fe85f9f6f643cb948966693aaf3f7828; s_v_web_id=verify_k8pl0hop_i212c1x'
'w_rdGt_4Lcn_9h2l_X2078LwpUan2; SLARDAR_WEB_ID=66d03bfb-6755-43b0-98ba-0ada7f04c'
'7e6; __tasessionId=fq76pgib81586268405348; tt_scid=8y1zq72aj9SgZ86dD90RgIG2c8Ojl'
'lw6p1ScXO9ZNetTUBzh8It8ZDmUETGTr7HK3c1f'
}
3. 获取加载链接
def get_index_page(offset):
    """Fetch one page of search results from the Toutiao content API.

    Args:
        offset: pagination offset (multiple of 20; the API loads 20 items per page).

    Returns:
        The parsed JSON dict on HTTP 200, or None on a non-200 response or a
        connection error. (A wrong/missing header set makes the server answer 400.)
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': str(offset),
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis'
    }
    url = base_url + urlencode(params)
    print(url)
    try:
        # The request itself must be inside try: this is where ConnectionError
        # is actually raised (the original code requested outside the handler).
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print("Error:", e.args)
    # Explicit None on failure so callers can test the result.
    return None
4. 加载链接获取网页链接
def parse_index_page(Json):
    """Yield {'url': ..., 'title': ...} for each article entry in one API page.

    Args:
        Json: the dict returned by get_index_page, or None on fetch failure.

    Yields:
        Dicts with the article page URL and its title, only for entries whose
        'article_url' starts with 'http://toutiao.com/group' (all second-level
        article links share that prefix; video/other results are skipped).
    """
    # get_index_page returns None on failure — guard so we don't crash on .get().
    if not Json:
        return
    items = Json.get('data')
    if not items:
        return
    pattern = re.compile('http://toutiao.com/group')
    for item in items:
        # str() because article_url may be absent (None) for non-article results.
        if re.match(pattern, str(item.get('article_url'))):
            yield {
                'url': item.get('article_url'),
                'title': item.get('title'),
            }
5. 由网页链接获取图片链接
def get_image_url(detail_url):
    """Yield {'url': ...} for every image found on one article page.

    Args:
        detail_url: an article page URL of the http://toutiao.com/group... kind.

    Yields:
        Dicts with a single decoded image URL each. Pages embed their image
        lists in one of two inline-JS forms, so two patterns are tried:
        a 'gallery: JSON.parse(...)' blob or a 'content: ...' blob.
    """
    print(detail_url)
    pattern_content = re.compile('content: \'"(.*)"')
    pattern_gallery = re.compile('gallery: JSON.parse(.*)')
    try:
        # Fetch the page ONCE (the original fetched it twice: once for the
        # text and again just to print the status code).
        response = requests.get(detail_url, headers=headers)
        print(response.status_code)
        html_text = response.text
    except requests.RequestException as e:
        print('error', e.args)
        return
    gallery_match = re.search(pattern_gallery, html_text)
    if gallery_match:
        # Raw escaped page fragment; not yet decoded or split into URLs.
        gallery_url = gallery_match.group(1)
        print("原始gallery_url:", gallery_url)
        # Strip the doubled-up backslashes before extracting URLs.
        gallery_url = re.sub('\\\\\\\\', '', gallery_url)
        print("gallery_url:", gallery_url)
        pattern = re.compile('url_list.*?(http:.*?)\\\\"}')
        for image_url in re.findall(pattern, gallery_url):
            # unicode_escape turns \\uXXXX sequences into real characters.
            yield {'url': image_url.encode('utf-8').decode('unicode_escape')}
    else:
        content_match = re.search(pattern_content, html_text)
        if content_match:
            content_url = content_match.group(1)
            print("content_url:", content_url)
            pattern = re.compile('"(http.*?)\\\\"')
            images_url = re.findall(pattern, content_url)
            print(images_url)
            for image_url in images_url:
                yield {'url': image_url.encode('utf-8').decode('unicode_escape')}
6. 下载图片
def save_image(title, image_url):
    """Download one image and write it under the fixed download directory.

    Args:
        title: file name to save as (caller appends a counter and '.jpg').
        image_url: direct image URL.

    Prints 'error' on any non-200 response instead of raising.
    """
    # Fetch ONCE (the original issued the request twice: once for the status
    # check and again for the bytes) and close the file via a context manager.
    response = requests.get(image_url, headers=headers)
    if response.status_code == 200:
        filepath = '/Users/mac/Downloads/今日头条/' + title
        with open(filepath, 'wb') as imagefile:
            imagefile.write(response.content)
    else:
        print('error')
由此每个加载项的所有图片都可以获取,还要看看有多少加载项,每个加载项链接只有一个偏移参数offset不一样,可求
def get_offset():
    """Probe successive result pages to count how many non-empty pages exist.

    Each page differs only in its offset (a multiple of 20); keep requesting
    until a page comes back without a 'data' payload, then report how many
    pages were seen.

    Returns:
        The number of pages that contained data.
    """
    page_count = 0
    while True:
        payload = get_index_page(page_count * 20)
        if not (payload and payload.get('data')):
            return page_count
        page_count += 1
主函数:
if __name__ == '__main__':
    # Walk every loaded page, every article on the page, and every image in
    # the article; images of one article are numbered 1, 2, 3, ... in order.
    total_pages = get_offset()
    for page in range(total_pages):
        page_json = get_index_page(page * 20)
        for entry in parse_index_page(page_json):
            for index, image in enumerate(get_image_url(entry['url']), start=1):
                filename = entry['title'] + str(index) + '.jpg'
                print(filename, image['url'])
                save_image(filename, image['url'])
结果展示:
⚠️:这样的爬虫很简陋,导致容易被网站封掉,爬虫过段时间就不能打开网页了,会出现这样的情况。http://toutiao.com/group/类的页面打开是空白,用浏览器打开也是一样,这时候就等会再爬就好了,或者加代理。还有就是链接为视频链接,被过滤了。
但有时候爬取的图片不能打开,是因为本身网站的图片也是损坏状态。如下图