# 静态网页在浏览器中展示的内容都在 HTML 源码中;但主流网页大量使用 JavaScript 时,很多内容并不出现在 HTML 源代码中,
# 此时仍然只用 requests + BeautifulSoup 是抓取不到这些内容的。
# 例如下面这段爬取动态加载评论的代码就不会成功:
# import requests
# from bs4 import BeautifulSoup
# url = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery112406954584941688864_1592120544800&limit=10&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592120544802'
# headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'}
# html = requests.get(url, headers= headers)
# bs=BeautifulSoup(html.text,'html.parser')
# comments_tags=bs.find_all('div',{'class':'reply-content-wrapper'})
# for comment in comments_tags:
#     print(comment.attrs['data-content'])
# Ajax(Asynchronous JavaScript And XML,异步 JavaScript 和 XML):在不重新加载整个网页的情况下对网页的某一部分进行更新,节省流量,速度快。
# 但这也加大了爬虫的难度。为解决这个问题,可以采用两种技术:1)通过浏览器审查元素解析真实网页的地址;2)使用 Selenium 模拟浏览器的方法。
# 本节内容:通过浏览器审查元素解析真实网页的地址:
# 真实网址:
# 第一页: https://api-zero.livere.com/v1/comments/list?callback=jQuery112406954584941688864_1592120544800&limit=10&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592120544802
# 第二页: https://api-zero.livere.com/v1/comments/list?callback=jQuery112408983696804040213_1592128123614&limit=10&offset=2&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592128123621
# 重新刷新第二页: https://api-zero.livere.com/v1/comments/list?callback=jQuery1124042695935490813275_1592128347126&limit=10&offset=2&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592128347133
# 第一页和第二页最明显的区别在于:
# offset (虽然有其他地方也不一样,但不影响,只有 offset起决定作用),所以可以通过控制 offset来翻页。
# 请求头: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362
# 根据上面信息,我们将代码设计为:
import requests
# Request the comment API endpoint directly (the "real" address located with
# the browser's inspector) and dump the raw response body.
url = (
    'https://api-zero.livere.com/v1/comments/list'
    '?callback=jQuery112406954584941688864_1592120544800'
    '&limit=10'
    '&repSeq=4547710'
    '&requestPath=%2Fv1%2Fcomments%2Flist'
    '&consumerSeq=1020'
    '&livereSeq=28583'
    '&smartloginSeq=5154'
    '&code=1afecb1fc5912d454d80ffc6'
    '&_=1592120544802'
)
# Send a browser-like User-Agent with the request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'
    ),
}
r = requests.get(url, headers=headers)
print(r.text)
/**/ typeof jQuery112406954584941688864_1592120544800 === 'function' && jQuery112406954584941688864_1592120544800({"results":{"parents":[{"replySeq":42003685,"name":"奔跑的苹果树","memberId":"oBVoaxMyiTIYdTYmbPxXxNVrAxz4","memberIcon":"http://thirdwx.qlogo.cn/mmopen/vi_32/2CBNK5cDVstrL3W33VXJSCic8Pu3jczS4UNQtf04ZhdpVtk1PlRc8slz1lzJCakwKeFLtdGO0cqj9dDBosicWq6w/132","memberUrl":"http://www.wechat.com","memberDomain":"wechat","good":0,"bad":0,"police":0,"parentSeq":42003685,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"112.102.211.149","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-06-14T07:35:53.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"真实地址怎么获取?点击右键检查了也没发现啊。","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32374754,"memberSeq":32926179,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41888279,"name":"Creep","memberId":"oBVoaxAxqLr16sfwz1GXm9UaHVF4","memberIcon":"http://thirdwx.qlogo.cn/mmopen/vi_32/62cLVFreHtJN80DNyHnEGqrC9v42QWErXr20KB2icDCSQuNAPuYibpO7yAYTb5FY90MSpl1gLIabf7KktQibia4nNA/132","memberUrl":"http://www.wechat.com","memberDomain":"wechat","good":0,"bad":0,"police":0,"parentSeq":41888279,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + 
selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"58.62.87.37","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.159 Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-06-01T12:20:08.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"学习中","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32349986,"memberSeq":32901188,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41882866,"name":"余非鱼^*^","memberId":"oBVoaxHwTIri5lNP36JXwSK2NMzg","memberIcon":"http://thirdwx.qlogo.cn/mmopen/vi_32/Q0j4TwGTfTIl3ibbP9gC9ES0zN5LIhvfzPB4zICW123JG2PawaXS9c0oiaoFDQp4RJrupZf8AolXZQH3tNI2QwWA/132","memberUrl":"http://www.wechat.com","memberDomain":"wechat","good":0,"bad":0,"police":0,"parentSeq":41882866,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"171.34.101.38","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-06-01T02:35:00.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"一起学习","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32348903,"memberSeq":32900097,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41458240,"name":"無","memberId":"UID_43B3E8679B3B9880BEB734882BCE59B3","memberIcon":"http://thirdqq.qlogo.cn/g?b=oidb&k=zuYsrwicH5EvoOeKJibGVaaQ&s=100&t=1584881994","memberUrl":"https://qq.com/","memberDomain":"qq","good":0,"bad":0,"police":0,"parentSeq":41458240,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"117.166.113.250","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 10.0; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-04-22T04:29:49.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"一句话,给我爬!!!!","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32277925,"memberSeq":32828481,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085166,"name":"astin2020","memberId":"[email protected]","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085166,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:13:25.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"为什么不多放几个回帖","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085164,"name":"astin2020","memberId":"[email 
protected]","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085164,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:13:01.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"哎,还要多少啊。","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085162,"name":"astin2020","memberId":"[email protected]","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085162,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:12:40.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"我不知道要多少帖子才能翻篇啊,你们没有买他的书吗","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085159,"name":"astin2020","memberId":"[email protected]","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085159,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:11:49.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"我要疯了。作者拜托你能不能改一下啊","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085152,"name":"astin2020","memberId":"[email protected]","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085152,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:11:22.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"一页到底能装多少回帖啊?","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085150,"name":"astin2020","memberId":"[email 
protected]","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085150,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:10:59.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"好累啊","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null}],"children":[],"quotations":[]},"resultCode":200,"resultMessage":"Okay, livere"});
# 只获取第一页评论:
# 解析得到的字符串r.text(即 json字符串)可以使用json库来完成解析:
import json
import requests
# Fetch the first page of comments and print the text of each one.
url = (
    'https://api-zero.livere.com/v1/comments/list'
    '?callback=jQuery112406954584941688864_1592120544800'
    '&limit=10'
    '&repSeq=4547710'
    '&requestPath=%2Fv1%2Fcomments%2Flist'
    '&consumerSeq=1020'
    '&livereSeq=28583'
    '&smartloginSeq=5154'
    '&code=1afecb1fc5912d454d80ffc6'
    '&_=1592120544802'
)
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'
    ),
}
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=10)
# The response is JSONP: the JSON object is wrapped in a callback call that
# ends with ');'. Slice from the first '{' to the last '}' so the wrapper is
# removed even when the body carries trailing whitespace or a newline.
json_data_dict = json.loads(r.text[r.text.find('{'):r.text.rfind('}') + 1])
# json_data_dict is a dict of dicts: its 'results' key maps to a dict whose
# 'parents' key maps to a list, and every element of that list is a dict
# describing one comment.
comments_list = json_data_dict['results']['parents']
for comment_dict in comments_list:
    print(comment_dict['content'])
# 或 :
import json
import requests
import jsonpath
# Alternative: extract the comment texts with a JSONPath expression.
url = (
    'https://api-zero.livere.com/v1/comments/list'
    '?callback=jQuery112406954584941688864_1592120544800'
    '&limit=10'
    '&repSeq=4547710'
    '&requestPath=%2Fv1%2Fcomments%2Flist'
    '&consumerSeq=1020'
    '&livereSeq=28583'
    '&smartloginSeq=5154'
    '&code=1afecb1fc5912d454d80ffc6'
    '&_=1592120544802'
)
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'
    ),
}
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=10)
# Strip the JSONP wrapper: keep the text from the first '{' to the last '}'.
json_data_dict = json.loads(r.text[r.text.find('{'):r.text.rfind('}') + 1])
# jsonpath.jsonpath() returns False (not an empty list) when nothing matches,
# so fall back to [] to keep the loop below from raising TypeError.
comments_list = jsonpath.jsonpath(json_data_dict, '$.results.parents[*].content') or []
for comment in comments_list:
    print(comment)
# 真实地址怎么获取?点击右键检查了也没发现啊。
# 学习中
# 一起学习
# 一句话,给我爬!!!!
# 为什么不多放几个回帖
# 哎,还要多少啊。
# 我不知道要多少帖子才能翻篇啊,你们没有买他的书吗
# 我要疯了。作者拜托你能不能改一下啊
# 一页到底能装多少回帖啊?
# 好累啊
# 获取两页评论:
import json
import requests
def get_comments(page_num):
    """Fetch one page of comments and append it to the global ``comments_list``.

    Args:
        page_num: Value for the ``offset`` query parameter, which selects the
            page. Accepts a string or an int; it is converted with ``str()``.

    Side effects:
        Extends the module-level ``comments_list`` (which must already exist)
        with the comment dicts found under ``results -> parents``.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
        ValueError: if the response body does not contain valid JSON.
        KeyError: if the JSON lacks the ``results`` / ``parents`` keys.
    """
    # No ``global`` statement is needed: the list is mutated in place, never
    # rebound, so the name resolves to the module-level object anyway.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'
        ),
    }
    url = (
        'https://api-zero.livere.com/v1/comments/list'
        '?callback=jQuery1124042695935490813275_1592128347126'
        '&limit=10'
        '&offset=' + str(page_num) +
        '&repSeq=4547710'
        '&requestPath=%2Fv1%2Fcomments%2Flist'
        '&consumerSeq=1020'
        '&livereSeq=28583'
        '&smartloginSeq=5154'
        '&code=1afecb1fc5912d454d80ffc6'
        '&_=1592128347133'
    )
    # A timeout keeps the call from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=10)
    # The response is JSONP: the JSON object is wrapped in a callback call that
    # ends with ');'. Slice from the first '{' to the last '}' so the wrapper
    # is removed even when the body carries trailing whitespace.
    json_data_dict = json.loads(r.text[r.text.find('{'):r.text.rfind('}') + 1])
    # json_data_dict is a dict of dicts: 'results' maps to a dict whose
    # 'parents' key maps to a list of per-comment dicts.
    comments_list.extend(json_data_dict['results']['parents'])
if __name__ == '__main__':
    # Shared accumulator that get_comments() extends in place.
    comments_list = []
    # Fetch with offset 1 and then offset 2 (two pages of comments).
    for page_num in range(1, 3):
        get_comments(str(page_num))
    # Print every collected comment once, after all pages are fetched.
    for comment_dict in comments_list:
        print(comment_dict['content'])
# 真实地址怎么获取?点击右键检查了也没发现啊。
# 学习中
# 一起学习
# 一句话,给我爬!!!!
# 为什么不多放几个回帖
# 哎,还要多少啊。
# 我不知道要多少帖子才能翻篇啊,你们没有买他的书吗
# 我要疯了。作者拜托你能不能改一下啊
# 一页到底能装多少回帖啊?
# 好累啊
# 还不够哦
# 如果这样违反了你的规定,请原谅,我也是没有办法,只能帮你把水灌上
# 不然好多代码我没有办法去按照你书上的内容操作。很郁闷
# 主人可能忘记爬虫的跟帖必须要翻过两页才能测试啊
# 是不是要10页才翻篇
# 我要追加多少评论才够两页呢
# 为什么我能看到评论呢??
# 学习
# 不是
# 我是第一个来的吗?
# 回顾:
# 1) Breaking a line in the source code only (the value is unaffected):
# a backslash at the end of a line inside a string literal joins it with the
# next line, so no newline ends up in the string. The continuation line must
# start at column 0, otherwise its leading spaces become part of the string.
a = 'aaaaaaaaaaaaaaaaaaaaabbbbbbccc\
ggggg'
print(a)  # aaaaaaaaaaaaaaaaaaaaabbbbbbcccggggg
# A backslash outside a string joins physical lines into one statement.
b = 'aaaaaaaaaaaaaaaaaaaaabbbbbbccc' \
    + \
    'ggggg'
print(b)  # aaaaaaaaaaaaaaaaaaaaabbbbbbcccggggg
# 2) Breaking a line in the output: here the newline character is part of the
# string itself.
c = 'aaaaaaaaaaaaaaaaaaaaabbbbbbccc\nggggg'
print(c)
# aaaaaaaaaaaaaaaaaaaaabbbbbbccc
# ggggg
i = True
# Explicit line joining also works between a keyword and its condition.
if \
        i:
    print('haha')