爬取静态网页案例:
from bs4 import BeautifulSoup
import requests
# Request headers for the static-page example: only a desktop-browser
# User-Agent string is set.
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
# Page fetched by the static-page example.
url = 'http://news.baidu.com/'
# Get the news titles.
def craw2(url):
    """Download a page and print the link texts found inside each ``<div>``.

    For every ``<div>`` in the parsed document, one list is printed holding
    the text of each ``<a>`` element inside that div. ``find_all`` searches
    all descendants, so a link nested in several divs is printed once per
    enclosing div, and a div without links prints an empty list.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to download.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    for div in soup.find_all('div'):
        print([anchor.get_text() for anchor in div.find_all('a')])
craw2(url)
上面的案例可以得到网页代码中的内容,即使不设置请求头(headers)也能获取到。
但是,相同代码应用于获取动态网站就失效了。
在动态页面中,所显示的内容往往并不直接写在HTML页面里,而是通过调用js等方式从后端(数据库)获取数据,再回显到网页上。
爬取动态网页案例:
以爬取 https://www.infoq.cn/ 为例:如果直接爬取该网站的HTML页面,是无法获取想要的文章内容的;在页面上右键查看网页源代码,可以看到想要的文章内容并不在HTML中,所以可以确定它是动态网站。
下面的操作方法来源于此文章。
通过模拟客户端发送post请求来爬取,首先需要找到post请求url:
import requests
from bs4 import BeautifulSoup
def test0():
    """POST to the InfoQ ``getAdList`` endpoint and print the response text.

    Sends a form-encoded body ``start=0`` together with browser-like request
    headers, then prints whatever text the server returns. Nothing is
    returned to the caller.
    """
    url = "https://www.infoq.cn/public/v1/config/getAdList"
    headers = {
        'Accept': '*/*',
        # 'br' is deliberately not advertised: a Brotli-compressed reply can
        # only be decoded when a Brotli package is installed, otherwise
        # ``response.text`` comes back as unreadable bytes.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        # Content-Length is left out on purpose: requests derives it from the
        # encoded form body, and the previous fixed value ('0') did not match
        # the body actually sent.
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # Placeholder (it says "use your own value here"): replace it with
        # the Cookie copied from your own browser session before running.
        'Cookie': '这里的内容用自己',
        'Host': 'www.infoq.cn',
        'Origin': 'https://www.infoq.cn',
        'Referer': 'https://www.infoq.cn/public/v1/config/getAdList',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    }
    # Named ``response`` rather than ``re`` so it is not mistaken for the
    # stdlib regex module; the timeout prevents an indefinite hang.
    response = requests.post(url, headers=headers, data={'start': 0}, timeout=10)
    print(response.text)
def test1():
    """POST to the InfoQ ``recommond`` endpoint and print the response text.

    Sends a form-encoded body ``start=33&offset=33`` together with
    browser-like request headers, then prints whatever text the server
    returns. Nothing is returned to the caller.
    """
    url = "https://www.infoq.cn/public/v1/my/recommond"
    headers = {
        'Accept': '*/*',
        # 'br' is deliberately not advertised: a Brotli-compressed reply can
        # only be decoded when a Brotli package is installed, otherwise
        # ``response.text`` comes back as unreadable bytes.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        # Content-Length is left out on purpose: requests derives it from the
        # encoded form body, and the previous fixed value ('11') did not match
        # the body actually sent.
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # Placeholder (it says "use your own value here"): replace it with
        # the Cookie copied from your own browser session before running.
        'Cookie': '这里的内容用自己',
        'Host': 'www.infoq.cn',
        'Origin': 'https://www.infoq.cn',
        'Referer': 'https://www.infoq.cn/public/v1/my/recommond',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    }
    # Named ``response`` rather than ``re`` so it is not mistaken for the
    # stdlib regex module; the timeout prevents an indefinite hang.
    response = requests.post(
        url,
        headers=headers,
        data={'start': 33, 'offset': 33},
        timeout=10,
    )
    print(response.text)
test0()
#获取到的内容
{'code': 0,
'data':
{'list':
[{'name': 'QCon',
'link': 'https://2019.qconbeijing.com/track?utm_source=infoq&utm_medium=banner&term=lusu',
'image': 'https://static001.geekbang.org/resource/image/97/70/97c80c4d51d01cbebebcf397e5e63f70.png'},
{'name': 'QCon广州',
'link': 'https://qconguangzhou.geekbang.org/?utm_source=infoq&utm_medium=banner',
'image': 'https://static001.infoq.cn/resource/image/0a/60/0a7516e6e8ed699d14fcfaf412475960.jpg'},
{'name': '西云数据',
'link': 'https://www.bagevent.com/event/2356848?bag_track=Banner3',
'image': 'https://static001.geekbang.org/resource/image/0c/fd/0c3a140ba910744136d9fe0e726945fd.jpg'},
{'name': 'GMTC',
'link': 'https://gmtc2019.geekbang.org/?utm_source=infoq&utm_medium=banner&utm_campaign=7',
'image': 'https://static001.geekbang.org/resource/image/72/fa/726dfd8346cdf7a0de843b32df3ccbfa.jpg'},
{'name': 'GTLC',
'link': 'https://gtlc2019.geekbang.org/?utm_source=infoq&utm_medium=guanwang&utm_campaign=banner',
'image': 'https://static001.geekbang.org/resource/image/b6/87/b6375457793da7b7b5fbc9eba8530a87.jpg'},
{'name': '极客大学',
'link': 'https://time.geekbang.org/special/arithmetic?utm_source=infoq_web&utm_medium=banner',
'image': 'https://static001.geekbang.org/resource/image/53/dd/53c9dca735254ffa687984ffef0f93dd.jpg'},
{'name': '企业账号',
'link': 'https://service.geekbang.org/goods/list?category=7&page=1#utm_source=website&utm_medium=infoq&utm_campaign=banner&utm_term=0221',
'image': 'https://static001.infoq.cn/resource/image/37/a7/375a5847c1f0d397d2280f23e077c6a7.jpg'}],
'offsets': 0},
'error': {},
'extra': {'cost': 0.001510902,
'request-id': '396b8defe24e713f8610038737671eba@2@infoq'}}
test1()
#获取到的内容
{'code': 0,
'data': [{'aid': 22233,
'article_cover': 'https://static001.geekbang.org/resource/image/e4/71/e4a978422d6c045c7a541b78bbca1f71.jpeg',
'article_cover_point': '{"big":{"point":{"x":0,"y":189,"w":2815,"h":1451}},"small":{"point":{"x":0,"y":0,"w":2816,"h":2091}},"width":2816,"height":2112}',
'article_sharetitle': '北大AI公开课2019 | 商汤科技沈徽:AI创新与落地',
'article_subtitle': '',
'article_summary': '北大AI公开课第四讲如期开讲,商汤科技集团副总裁、商业与数据洞察事业群总裁、工程院院长沈徽带来了《AI创新与落地》的分享',
'article_title': '北大AI公开课2019 | 商汤科技沈徽:AI创新与落地',
'author':
[{'uid': 1277332,
'nickname': '蔡芳芳',
'avatar': ''}],
'ctime': 1552714233280,
'is_collect': False,
'no_author': '',
'publish_time': 1552716000000,
'score': 1552716000000,
'sub_author': [],
'sub_topic': [],
'topic':
[{'id': 31,
'name': 'AI'},
{'id': 3,
'name': '文化 & 方法'},
{'id': 127,
'name': '计算机视觉'}],
'type': 1,
'utime': 1552716005424,
'uuid': 'GoNSrsxpC0AT6a3V-6NQ',
'views': 0},
{'aid': 22232,
'article_cover': 'https://static001.geekbang.org/resource/image/ee/47/ee719a85e81ae073f79039ba9da3df47.jpg',
'article_cover_point': '{"big":{"point":{"x":0,"y":309,"w":5184,"h":2672}},"small":{"point":{"x":506,"y":332,"w":4040,"h":3001}},"width":5184,"height":3456}',
'article_sharetitle': '3·15曝光丨智能机器人一年拨打40亿个骚扰电话,6亿人信息已遭泄露!',
'article_subtitle': '',
'article_summary': '在昨晚的315晚会上,一条探针盒子+数据匹配+智能外呼机器人的灰色产业链遭到曝光。据报道,智能外呼机器人一年拨打电话可达40多亿个,探针盒子公司收集有全国6亿用户的各类信息!',
'article_title': '3·15曝光丨智能机器人一年拨打40亿个骚扰电话,6亿人信息已遭泄露!',
'author':
[{'uid': 1000106,
'nickname': '小智',
'avatar': 'https://static001.geekbang.org/account/avatar/00/0f/42/aa/b9a67c2e.jpg'}],
'ctime': 1552698015145,
'is_collect': False,
'no_author': '',
'publish_time': 1552698000000,
'score': 1552698000000,
'sub_author': [],
'sub_topic': [],
'topic': [{'id': 21,
'name': '安全'},
{'id': 15,
'name': '大数据'},
{'id': 148,
'name': '信息泄露'}],
'type': 1,
'utime': 1552698015145,
'uuid': 'NgG*uSOwwI2OVhA80o0G',
'views': 0},
{'aid': 22230,
'article_cover': 'https://static001.geekbang.org/resource/image/30/6f/3087f86b2cbe7b3222c3bb5d7557126f.jpg',
'article_cover_point': '{"big":{"point":{"x":0,"y":0,"w":1279,"h":659}},"small":{"point":{"x":51,"y":0,"w":1148,"h":852}},"width":1280,"height":853}',
'article_sharetitle': '这可能是人工智能领域覆盖最全的一份技术趋势报告',
'article_subtitle': '',
'article_summary': '这份报告对AI领域的技术预测可谓面面俱到,无论是对于AI企业、研究者,还是AI学习者来说都有一定参考价值',
'article_title': '这可能是人工智能领域覆盖最全的一份技术趋势报告',
'author': [{'uid': 1462160,
'nickname': '未来今日研究所',
'avatar': ''}],
'ctime': 1552698005153,
'is_collect': False,
'no_author': '',
'publish_time': 1552698000000,
'score': 1552698000000,
'sub_author': [],
'sub_topic': [],
'topic':
[{'id': 31,
'name': 'AI'},
{'id': 1,
'name': '语言 & 开发'},
{'id': 45,
'name': '物联网'}],
'translator': [{'uid': 1282296,
'nickname': 'Debra',
'avatar': ''}],
'type': 1,
'utime': 1552698005153,
'uuid': 'A315uodoMbWrNrZh*MzP',
'views': 0},
......
{'aid': 22225,
'article_cover': 'https://static001.geekbang.org/resource/image/6f/a2/6f3b4ff25e9e2e926e8e2bd4436f96a2.jpg',
'article_cover_point': '{"big":{"point":{"x":0,"y":10,"w":1000,"h":515}},"small":{"point":{"x":136,"y":12,"w":761,"h":565}},"width":1000,"height":600}',
'article_sharetitle': 'Google 和 Facebook 披露全球范围宕机原因',
'article_subtitle': '',
'article_summary': '昨日,Google、Facebook两巨头在同一天相继发生全球大规模宕机,其中Facebook的断电时常更是超过10小时之久。',
'article_title': 'Google 和 Facebook 披露全球范围宕机原因',
'author':
[{'uid': 1278039,
'nickname': '张婵',
'avatar': ''}],
'ctime': 1552642208366,
'is_collect': False,
'no_author': '',
'publish_time': 1552642207254,
'score': 1552642207254,
'sub_author': [],
'sub_topic': [],
'topic':
[{'id': 3,
'name': '文化 & 方法'},
{'id': 147,
'name': '企业动态'},
{'id': 48,
'name': '方法论'}],
'type': 1,
'utime': 1552642208366,
'uuid': 'e-NCah5RTmJMrvmrmbCU',
'views': 0}],
'error': {},
'extra':
{'cost': 0.028648169,
'request-id': '1ceebf53fd24003586f1272cc881b7fd@2@infoq'}}