1-Get the Weibo hot search URL
weibo_url = 'https://s.weibo.com/top/summary?cate=realtimehot'
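The complete code in step 5 fetches this URL with requests and a browser User-Agent header. A minimal sketch of just that request is shown below; the shortened User-Agent string and the timeout are illustrative choices, not part of the original script:
import requests

weibo_url = 'https://s.weibo.com/top/summary?cate=realtimehot'
headers = {'User-Agent': 'Mozilla/5.0'}  # any common browser UA string can go here
res = requests.get(weibo_url, headers=headers, timeout=10)
res.encoding = 'utf-8'
print(res.status_code)  # 200 means the page was fetched successfully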
2-Create the directory for storing the hot search results if it does not already exist
if not os.path.exists(r'd:/新浪新闻'):
    os.mkdir(r'd:/新浪新闻')
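An equivalent variant, shown below as an optional alternative, uses os.makedirs with exist_ok=True, which also creates any missing parent directories and skips the separate existence check:
import os

# optional alternative: create the directory (and any parents) in one call
os.makedirs(r'd:/新浪新闻', exist_ok=True)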
3-Extract the required fields
# selector is the lxml tree parsed from the page (built in the complete code in step 5)
eles = selector.cssselect('tbody>tr')
ls = []
for index, ele in enumerate(eles):
    title = ele.xpath('./td[@class="td-02"]/a/text()')[0]
    url = ele.xpath('./td[@class="td-02"]/a/@href')[0]
    # some rows (such as the pinned entry) may have no hot count, so fall back to an empty string
    hot = ele.xpath('./td[@class="td-02"]/span/text()')
    hot = hot[0] if hot else ''
    crawl_time = str(datetime.now())
    weibo_dict = {}
    weibo_dict['title'] = title
    weibo_dict['url'] = url
    weibo_dict['hot'] = hot
    weibo_dict['crawl_time'] = crawl_time
    ls.append(weibo_dict)
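The href extracted from td-02 is usually a relative path such as '/weibo?q=...'; if an absolute link is preferred, it can be joined to the site root with urllib.parse.urljoin. This is an optional addition that is not part of the original script:
from urllib.parse import urljoin

# optional: convert the relative href into an absolute URL
absolute_url = urljoin('https://s.weibo.com', url)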
4-Save the extracted field values to the local folder
    # still inside the for loop above: write one txt file per hot search entry
    with open(r'd:/新浪新闻/%d.txt' % (index + 1), 'w', encoding='utf-8') as f:
        f.write('title:' + title + '\n')
        f.write('url:' + url + '\n')
        f.write('hot:' + hot + '\n')
        f.write('crawl_time:' + crawl_time + '\n')
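Because every entry is also appended to the ls list, the whole run can alternatively be saved as a single JSON file with the standard json module; the file name hot_search.json below is only an example:
import json

# optional alternative: dump all collected records into one JSON file
with open(r'd:/新浪新闻/hot_search.json', 'w', encoding='utf-8') as f:
    json.dump(ls, f, ensure_ascii=False, indent=2)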
5-Complete working code
'''Purpose: crawl the Sina Weibo hot search list'''
import os
from datetime import datetime

import requests
from lxml import etree  # .cssselect() below also requires the cssselect package to be installed

# create the output directory if it does not exist yet
if not os.path.exists(r'd:/新浪新闻'):
    os.mkdir(r'd:/新浪新闻')

weibo_url = 'https://s.weibo.com/top/summary?cate=realtimehot'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

res = requests.get(weibo_url, headers=headers)
res.encoding = 'utf-8'
selector = etree.HTML(res.text)

eles = selector.cssselect('tbody>tr')
ls = []
for index, ele in enumerate(eles):
    title = ele.xpath('./td[@class="td-02"]/a/text()')[0]
    url = ele.xpath('./td[@class="td-02"]/a/@href')[0]
    # some rows (such as the pinned entry) may have no hot count, so fall back to an empty string
    hot = ele.xpath('./td[@class="td-02"]/span/text()')
    hot = hot[0] if hot else ''
    crawl_time = str(datetime.now())
    weibo_dict = {}
    weibo_dict['title'] = title
    weibo_dict['url'] = url
    weibo_dict['hot'] = hot
    weibo_dict['crawl_time'] = crawl_time
    ls.append(weibo_dict)
    # one txt file per hot search entry
    with open(r'd:/新浪新闻/%d.txt' % (index + 1), 'w', encoding='utf-8') as f:
        f.write('title:' + title + '\n')
        f.write('url:' + url + '\n')
        f.write('hot:' + hot + '\n')
        f.write('crawl_time:' + crawl_time + '\n')
print('Weibo hot search crawl finished!')
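If a spreadsheet-friendly export is needed, the collected ls list can also be written to a CSV file with the standard csv module. This sketch is an optional extension, and the file name hot_search.csv is only an example:
import csv

# optional: export the collected records to a single CSV file
# utf-8-sig writes a BOM so Excel displays the Chinese titles correctly
with open(r'd:/新浪新闻/hot_search.csv', 'w', encoding='utf-8-sig', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'url', 'hot', 'crawl_time'])
    writer.writeheader()
    writer.writerows(ls)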
6-Screenshot of the results