# 05 爬取约会吧美女全部照片

# 爬虫思路:首先找到约会吧的链接地址,
# 然后获取网页,从中提取出每个发消息用户的详情页,找到存放图片的详情页链接,
# 根据地址爬取图片
import os
from urllib.parse import urljoin, urlsplit

import parsel
import requests

def get_yuehuiba_url(url, headers, timeout=10):
    """Fetch the forum index page and return the thread links found on it.

    Args:
        url: Address of the forum's thread-list page.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list with the ``href`` value of every thread-title anchor matched
        on the page, exactly as written in the HTML.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the timeout expires.
    """
    # Download the HTML of the index page.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Parse it and pull the href of each thread title.
    html = parsel.Selector(response.text)
    title_url = html.xpath(
        '//div[@class="threadlist_lz clearfix"]/div/a/@href'
    ).getall()
    # Echo the links so progress is visible on the console.
    print(title_url)
    return title_url
def get_image(headers, title_url, timeout=10):
    """Visit every thread and collect the image addresses posted in it.

    Args:
        headers: HTTP headers sent with each request.
        title_url: Iterable of thread links. Site-relative links are
            resolved against ``https://tieba.baidu.com``; absolute links
            are used as they are.
        timeout: Seconds to wait for each thread page before giving up.

    Returns:
        A list of lists: one inner list of image ``src`` values for every
        thread that was fetched successfully. Threads whose request failed
        are reported on the console and contribute no entry.
    """
    # The links scraped from the index page need the site root in front.
    base_url = "https://tieba.baidu.com"
    result_lists = []
    for url in title_url:
        # urljoin also copes with hrefs that lack a leading slash or that
        # are already absolute, where plain concatenation builds a bad URL.
        all_url = urljoin(base_url, url)
        print("当前的贴子链接:", all_url)
        try:
            # Second request: open the thread itself.
            response_2 = requests.get(
                url=all_url, headers=headers, timeout=timeout
            )
            response_2.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable thread must not throw away the links that
            # were already collected from the others.
            print("请求失败,跳过:", all_url, exc)
            continue
        # Second parse: extract the address of every posted image.
        response_2_data = parsel.Selector(response_2.text)
        result_list = response_2_data.xpath(
            '//cc/div/img[@class="BDE_Image"]/@src'
        ).getall()
        result_lists.append(result_list)
        print(result_list)
    return result_lists
def _image_file_name(url):
    """Return the file name to store *url* under.

    Only the path component of the URL is used, so a query string or
    fragment never ends up in the name. ``.jpg`` is appended only when the
    last path segment has no extension of its own.
    """
    name = urlsplit(url).path.rsplit("/", 1)[-1]
    if os.path.splitext(name)[1]:
        return name
    return name + ".jpg"


def save_image(result_lists, headers, save_dir=r"D:\python\贴吧美女", timeout=10):
    """Download every collected image and write it into *save_dir*.

    Args:
        result_lists: Iterable of lists of image URLs (one list per thread).
        headers: HTTP headers sent with each request.
        save_dir: Directory the images are written to. It is created when
            missing; the default is the location the script always used.
        timeout: Seconds to wait for each image before giving up.

    Returns:
        None. Images that cannot be downloaded are reported on the console
        and skipped.
    """
    # Opening a file inside a directory that does not exist fails, so make
    # sure the target directory is there before the first write.
    os.makedirs(save_dir, exist_ok=True)
    for result_list in result_lists:
        for li in result_list:
            print(li)
            try:
                # Images are binary, so the raw bytes are needed, not text.
                response = requests.get(
                    url=li, headers=headers, timeout=timeout
                )
                # Never store an HTTP error page as if it were a picture.
                response.raise_for_status()
            except requests.RequestException as exc:
                print("下载失败,跳过:", li, exc)
                continue
            file_name = os.path.join(save_dir, _image_file_name(li))
            with open(file_name, mode="wb") as f:
                f.write(response.content)
            print("正在保存", file_name)



if __name__ == "__main__":
    # Thread-list page of the forum that is being crawled.
    start_page = "https://tieba.baidu.com/f?ie=utf-8&kw=%E7%BA%A6%E4%BC%9A%E5%90%A7&fr=search"
    # The same browser-like User-Agent is sent with every request.
    request_headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    # Pipeline: index page -> thread links -> image links -> files on disk.
    thread_links = get_yuehuiba_url(start_page, request_headers)
    image_links = get_image(request_headers, thread_links)
    save_image(image_links, request_headers)


# 你可能感兴趣的:(05爬取约会吧美女全部照片)