爬虫实例3:爬取微博热搜

1-获取微博热搜url

# Address of the Weibo hot-search summary page that the crawler requests.
# NOTE(review): the `cate=realtimehot` query presumably selects the real-time
# ranking -- confirm against the live site.
weibo_url = 'https://s.weibo.com/top/summary?cate=realtimehot'

2-检查存放微博热搜的目录是否存在,不存在就创建

# Make sure the output directory exists. exist_ok=True turns the call into a
# no-op when the directory is already there, which avoids the race between a
# separate "exists?" check and the actual creation.
os.makedirs(r'd:/新浪新闻', exist_ok=True)

3-获取所需要的字段值

# Select every <tr> under <tbody> and collect one dict per row in `ls`.
eles = selector.cssselect('tbody>tr')
ls = []
for index, ele in enumerate(eles):
    # XPath queries always return lists; a row without a link in its
    # "td-02" cell yields empty lists, so check before indexing to avoid
    # an IndexError.
    titles = ele.xpath('./td[@class="td-02"]/a/text()')
    urls = ele.xpath('./td[@class="td-02"]/a/@href')
    if not titles or not urls:
        continue
    title = titles[0]
    url = urls[0]
    # `hot` is deliberately kept as a list: it is simply empty for rows
    # whose "td-02" cell has no <span>.
    hot = ele.xpath('./td[@class="td-02"]/span/text()')
    # Timestamp of when this row was processed (local time, as a string).
    cwawl_time = str(datetime.now())
    weibo_dict = {
        'title': title,
        'url': url,
        'hot': hot,
        'cwawl_time': cwawl_time,
    }
    ls.append(weibo_dict)

4-将获取到的字段值存到本地文件夹中

# Write the fields of the current row to "<row number>.txt" (1-based).
# In the complete script this statement sits inside the `for` loop body, so
# one file is produced per row.
# encoding='utf-8' is passed explicitly: without it open() uses the platform's
# locale encoding, which can raise UnicodeEncodeError on titles containing
# characters that encoding cannot represent.
with open(r'd:/新浪新闻/%d.txt'%(index+1), 'w', encoding='utf-8') as f:
    f.write('title:'+title+'\n')
    f.write('url:'+url+'\n')
    f.write('hot:'+str(hot)+'\n')
    f.write('cwawl_time:'+cwawl_time+'\n')

5-完整代码实现功能

'''Scrape the Weibo hot-search page and save every entry to its own text file.

The script downloads the page at `weibo_url`, walks the rows of the result
table, collects title / link / hot value / crawl time for each row in `ls`,
and writes the same fields to d:/新浪新闻/<row number>.txt.
'''
import requests
# lxml's .cssselect() relies on the cssselect package; importing it here makes
# the script fail immediately if it is not installed.
import cssselect
from datetime import datetime
import os
from lxml import etree

# Make sure the output directory exists. exist_ok=True turns the call into a
# no-op when the directory is already there.
os.makedirs(r'd:/新浪新闻', exist_ok=True)

weibo_url = 'https://s.weibo.com/top/summary?cate=realtimehot'
# Send a browser User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops early on an HTTP error instead of parsing an
# error page and reporting success with zero files written.
res = requests.get(weibo_url, headers=headers, timeout=10)
res.raise_for_status()
res.encoding = 'utf-8'
selector = etree.HTML(res.text)

# Select every <tr> under <tbody> and collect one dict per row in `ls`.
eles = selector.cssselect('tbody>tr')
ls = []
for index, ele in enumerate(eles):
    # XPath queries always return lists; a row without a link in its
    # "td-02" cell yields empty lists, so check before indexing to avoid
    # an IndexError.
    titles = ele.xpath('./td[@class="td-02"]/a/text()')
    urls = ele.xpath('./td[@class="td-02"]/a/@href')
    if not titles or not urls:
        continue
    title = titles[0]
    url = urls[0]
    # `hot` is deliberately kept as a list: it is simply empty for rows
    # whose "td-02" cell has no <span>.
    hot = ele.xpath('./td[@class="td-02"]/span/text()')
    # Timestamp of when this row was processed (local time, as a string).
    cwawl_time = str(datetime.now())
    # The 'cwawl_time' key spelling is kept as-is so the output format does
    # not change.
    weibo_dict = {
        'title': title,
        'url': url,
        'hot': hot,
        'cwawl_time': cwawl_time,
    }
    ls.append(weibo_dict)

    # One file per row, named after the 1-based row number. The encoding is
    # passed explicitly: without it open() uses the platform's locale
    # encoding, which can raise UnicodeEncodeError on titles containing
    # characters that encoding cannot represent.
    with open(r'd:/新浪新闻/%d.txt' % (index + 1), 'w', encoding='utf-8') as f:
        f.write('title:' + title + '\n')
        f.write('url:' + url + '\n')
        f.write('hot:' + str(hot) + '\n')
        f.write('cwawl_time:' + cwawl_time + '\n')
print('微博热搜爬取完成!')


6-效果截图展示

爬虫实例3:爬取微博热搜_第1张图片

你可能感兴趣的:(python,爬虫)