I've had a bit of free time recently, so I've been digging into Python crawlers again. I've actually written several over the past few days and also experimented with fetching pages using pyspider. Building on that experience, today I'd like to share a meme-pack crawler.
I'm sure many of you enjoy meme battles (斗图). Today's crawler scrapes the image packs from the Adoutu site (http://www.adoutu.com), which has a rich and plentiful collection of material.
The site consists mostly of static pages and its structure is simple, so the plan is:

1. Request a list page (starting from http://www.adoutu.com/article/list/1) and parse out each pack's title, detail-page URL, upload date, image count, popularity, and keywords, plus the link to the next list page.
2. Request each pack's detail page and collect the image URLs it contains.
3. Save each record as one line of JSON, then follow the next-page link and repeat until no more pages can be fetched.
Below is the full Python code. Note that it does not download the images themselves; it only writes the image URLs and metadata to a local file. If you want the actual files, you can define an extra download function (a sketch is given after the main code). That's all, thanks for reading.
import json

import requests
from bs4 import BeautifulSoup as bs


class Doutu:
    def __init__(self):
        self.start_url = 'http://www.adoutu.com/article/list/1'
        self.part_url = 'http://www.adoutu.com'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}

    def get_page(self, url):
        # Fetch a page and return the decoded HTML, or None on any request error.
        try:
            response = requests.get(url, headers=self.headers)
            return response.content.decode()
        except requests.RequestException:
            return None

    def parse_page(self, html):
        # Parse one list page: collect each pack's metadata and the next-page link.
        soup = bs(html, 'lxml')
        contents = soup.select('div.article-part-list > div.list-group')
        # The last pagination link points to the next list page.
        next_url = self.part_url + soup.select('li.page-item > a.page-link')[-1]['href']
        content_list = []
        for content in contents:
            result = {
                'title': content.select_one('.title-content').get_text(),
                'detail_url': self.part_url + content.select_one('div.list-group-item > a')['href'],
                'date': content.select('.title-property')[0].get_text().replace('上传时间:', ''),
                'nums': content.select('.title-property')[1].get_text().replace('数量:', ''),
                'hot': content.select('.title-property')[2].get_text().replace('热度:', ''),
                'keywords': [i.get_text() for i in content.select('.detail-keyword-item')]
            }
            content_list.append(result)
        return next_url, content_list

    def get_detail_page(self, url):
        # Parse a pack's detail page and return the image URLs it contains.
        content = self.get_page(url)
        s = bs(content, 'lxml')
        results = s.select('div.detail-content > div.detail-picture')
        pics_list = [i.find('img')['src'] for i in results]
        # pic_title = [i.find('img')['title'] for i in results]
        return pics_list  # , pic_title

    def run(self):
        # Walk the list pages, enrich each record with its image URLs, and save.
        # Fetching each page exactly once; stop as soon as a page can't be fetched.
        next_url = self.start_url
        while True:
            html = self.get_page(next_url)
            if not html:
                break
            next_url, content_list = self.parse_page(html)
            for i in content_list:
                i['pics_list'] = self.get_detail_page(i['detail_url'])
                self.on_save(i)

    def on_save(self, content):
        # Append each record as one line of JSON.
        if content:
            with open('E:/spiders/doutu/doutu.txt', 'a', encoding='utf-8') as f:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')
            print(content['title'], 'Done')


if __name__ == '__main__':
    dt = Doutu()
    dt.run()
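As mentioned above, the script only stores image URLs. If you do want the files on disk, a minimal download helper could look like the sketch below. This is just one way to do it under my own assumptions: the method name download_pics, the save_dir default, and the filename scheme are all hypothetical, not part of the original script.

import os
import re

def download_pics(self, content, save_dir='E:/spiders/doutu/pics'):
    # Hypothetical helper (add it as a method of Doutu): downloads every
    # image URL collected for one pack. save_dir and the filename scheme
    # are assumptions, not from the original script.
    os.makedirs(save_dir, exist_ok=True)
    for index, pic_url in enumerate(content['pics_list']):
        try:
            response = requests.get(pic_url, headers=self.headers, timeout=10)
        except requests.RequestException:
            continue  # skip images that fail to download
        # Strip characters Windows forbids in filenames, keep the URL's
        # extension if it has one, and fall back to .jpg otherwise.
        title = re.sub(r'[\\/:*?"<>|]', '', content['title']).strip()
        ext = os.path.splitext(pic_url)[1] or '.jpg'
        with open(os.path.join(save_dir, '{}_{}{}'.format(title, index, ext)), 'wb') as f:
            f.write(response.content)

With that in place, you could call self.download_pics(i) right after self.on_save(i) inside run().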