本文实现爬取知乎发现页面的热门话题部分,并将问题和答案统一保存为文本格式。
使用 requests 库进行爬取,注意要将请求头(headers)中的 User-Agent 设置为 Mozilla。
# Send a browser-like User-Agent instead of the requests default
# (presumably the site rejects the default one; confirm if this changes).
headers = {'User-Agent': 'Mozilla'}
url = "https://www.zhihu.com/explore"
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the feed.
r.raise_for_status()
html = r.text
使用pyquery进行文本解析
doc = pq(html)
# Select every element that carries both the explore-feed and feed-item classes.
items = doc('.explore-feed.feed-item').items()
# Open the output file once in append mode; `with` closes it even if parsing
# of some item raises part-way through the loop.
with open('explore3.txt', 'a', encoding='utf-8') as file:
    for item in items:
        question = item.find('h2').text()
        author = item.find('.author-link-line').text()
        # html() gives None when no .content node matched (and may be empty);
        # re-parsing such a value with pq() would raise, so fall back to ''.
        content_html = item.find('.content').html()
        anwser = pq(content_html).text() if content_html else ''
        # One record: question, author and answer on separate lines,
        # followed by a 50-character '=' separator line.
        file.write('\n'.join([question, author, anwser]))
        file.write('\n' + '=' * 50 + '\n')
from pyquery import PyQuery as pq
import requests
# Send a browser-like User-Agent instead of the requests default
# (presumably the site rejects the default one; confirm if this changes).
headers = {'User-Agent': 'Mozilla'}
url = "https://www.zhihu.com/explore"
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the feed.
r.raise_for_status()
html = r.text

doc = pq(html)
# Select every element that carries both the explore-feed and feed-item classes.
items = doc('.explore-feed.feed-item').items()
# Open the output file once in append mode; `with` closes it even if parsing
# of some item raises part-way through the loop.
with open('explore3.txt', 'a', encoding='utf-8') as file:
    for item in items:
        question = item.find('h2').text()
        author = item.find('.author-link-line').text()
        # html() gives None when no .content node matched (and may be empty);
        # re-parsing such a value with pq() would raise, so fall back to ''.
        content_html = item.find('.content').html()
        anwser = pq(content_html).text() if content_html else ''
        # One record: question, author and answer on separate lines,
        # followed by a 50-character '=' separator line.
        file.write('\n'.join([question, author, anwser]))
        file.write('\n' + '=' * 50 + '\n')