Python基础学习20

爬取静态网页案例:

from bs4 import BeautifulSoup
import requests

# Address of the page that the static-crawl example downloads.
url = "http://news.baidu.com/"

# Request headers sent along with the download; the only entry is a
# User-Agent string in the format a desktop Chrome browser sends.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.121 Safari/537.36"
    ),
}

# Get the news titles
def craw2(url):
    """Download *url* and print the link texts found inside each ``<div>``.

    The page is fetched with the module-level ``headers`` and parsed with
    BeautifulSoup using the ``lxml`` parser. For every ``<div>`` element one
    list is printed, holding the text of each ``<a>`` tag found in that div.

    Args:
        url: Address of the page to download.

    Returns:
        None. The result is written to standard output.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within the timeout.
    """
    # Without a timeout requests waits indefinitely on an unresponsive
    # server, which would hang the whole script.
    response = requests.get(url, headers=headers, timeout=10)

    soup = BeautifulSoup(response.text, 'lxml')

    for title_href in soup.find_all('div'):
        # One printed list per div: the visible text of each link it holds.
        print([title.get_text() for title in title_href.find_all('a')])

# Run the static-page example against the address stored in `url`.
craw2(url)

上面的案例可以获取到网页源代码中的内容,即使不设置请求头(headers)也能获取到。
但是,相同代码应用于获取动态网站就失效了。
在动态页面中,所显示的内容往往不是通过HTML页面呈现的,而是通过调用js等方式从数据库中得到数据,回显到网页上。
爬取动态网页案例:
以爬取 https://www.infoq.cn/ 为例:如果直接爬取网站的HTML页面,是无法获取想要的文章内容的。在页面上右键选择“显示网页源代码”,可以看到想要的文章内容并不在HTML中,所以可以确定这是一个动态网站。
下面的操作方法来源于此文章。

Python基础学习20_第1张图片
找到内容对应url

Python基础学习20_第2张图片
捕捉信息发现通过post请求获取内容

通过模拟客户端发送post请求来爬取,首先需要找到post请求url:

import requests
from bs4 import BeautifulSoup

def test0():
    """POST to InfoQ's ``getAdList`` endpoint and print the response body.

    The request imitates a browser XHR call: it sends browser-style headers
    and the form field ``start=0``. Nothing is returned; the response text is
    written to standard output.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within the timeout.
    """
    url = "https://www.infoq.cn/public/v1/config/getAdList"
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        # NOTE(review): requests appears to recompute Content-Length from the
        # encoded body, so this hard-coded value is probably redundant — confirm.
        'Content-Length': '0',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # Placeholder ("use your own value here"): replace it with the Cookie
        # copied from your own browser session before running.
        'Cookie': '这里的内容用自己',
        'Host': 'www.infoq.cn',
        'Origin': 'https://www.infoq.cn',
        'Referer': 'https://www.infoq.cn/public/v1/config/getAdList',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    }

    # A timeout prevents the call from blocking forever on a stalled server.
    # The result is named `response` rather than `re` so it does not shadow
    # the name of the standard-library regex module.
    response = requests.post(url, headers=headers, data={'start': 0}, timeout=10)
    print(response.text)

def test1():
    """POST to InfoQ's ``my/recommond`` endpoint and print the response body.

    The request imitates a browser XHR call: it sends browser-style headers
    and the form fields ``start=33`` and ``offset=33``. Nothing is returned;
    the response text is written to standard output.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within the timeout.
    """
    # "recommond" is the spelling used by the endpoint itself; keep it as is.
    url = "https://www.infoq.cn/public/v1/my/recommond"
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        # NOTE(review): requests appears to recompute Content-Length from the
        # encoded body, so this hard-coded value is probably redundant — confirm.
        'Content-Length': '11',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # Placeholder ("use your own value here"): replace it with the Cookie
        # copied from your own browser session before running.
        'Cookie': '这里的内容用自己',
        'Host': 'www.infoq.cn',
        'Origin': 'https://www.infoq.cn',
        'Referer': 'https://www.infoq.cn/public/v1/my/recommond',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    }

    # A timeout prevents the call from blocking forever on a stalled server.
    # The result is named `response` rather than `re` so it does not shadow
    # the name of the standard-library regex module.
    response = requests.post(
        url,
        headers=headers,
        data={'start': 33, 'offset': 33},
        timeout=10,
    )
    print(response.text)

test0()
# Content returned by the call (sample output follows)
{'code': 0,
 'data':
     {'list':
          [{'name': 'QCon',
            'link': 'https://2019.qconbeijing.com/track?utm_source=infoq&utm_medium=banner&term=lusu',
            'image': 'https://static001.geekbang.org/resource/image/97/70/97c80c4d51d01cbebebcf397e5e63f70.png'},
           {'name': 'QCon广州',
            'link': 'https://qconguangzhou.geekbang.org/?utm_source=infoq&utm_medium=banner',
            'image': 'https://static001.infoq.cn/resource/image/0a/60/0a7516e6e8ed699d14fcfaf412475960.jpg'},
           {'name': '西云数据',
            'link': 'https://www.bagevent.com/event/2356848?bag_track=Banner3',
            'image': 'https://static001.geekbang.org/resource/image/0c/fd/0c3a140ba910744136d9fe0e726945fd.jpg'},
           {'name': 'GMTC',
            'link': 'https://gmtc2019.geekbang.org/?utm_source=infoq&utm_medium=banner&utm_campaign=7',
            'image': 'https://static001.geekbang.org/resource/image/72/fa/726dfd8346cdf7a0de843b32df3ccbfa.jpg'},
           {'name': 'GTLC',
            'link': 'https://gtlc2019.geekbang.org/?utm_source=infoq&utm_medium=guanwang&utm_campaign=banner',
            'image': 'https://static001.geekbang.org/resource/image/b6/87/b6375457793da7b7b5fbc9eba8530a87.jpg'},
           {'name': '极客大学',
            'link': 'https://time.geekbang.org/special/arithmetic?utm_source=infoq_web&utm_medium=banner',
            'image': 'https://static001.geekbang.org/resource/image/53/dd/53c9dca735254ffa687984ffef0f93dd.jpg'},
           {'name': '企业账号',
            'link': 'https://service.geekbang.org/goods/list?category=7&page=1#utm_source=website&utm_medium=infoq&utm_campaign=banner&utm_term=0221',
            'image': 'https://static001.infoq.cn/resource/image/37/a7/375a5847c1f0d397d2280f23e077c6a7.jpg'}],
      'offsets': 0},
 'error': {},
 'extra': {'cost': 0.001510902,
           'request-id': '396b8defe24e713f8610038737671eba@2@infoq'}}

test1()
# Content returned by the call (sample output follows)
{'code': 0,
 'data': [{'aid': 22233,
           'article_cover': 'https://static001.geekbang.org/resource/image/e4/71/e4a978422d6c045c7a541b78bbca1f71.jpeg',
           'article_cover_point': '{"big":{"point":{"x":0,"y":189,"w":2815,"h":1451}},"small":{"point":{"x":0,"y":0,"w":2816,"h":2091}},"width":2816,"height":2112}',
           'article_sharetitle': '北大AI公开课2019 | 商汤科技沈徽:AI创新与落地',
           'article_subtitle': '',
           'article_summary': '北大AI公开课第四讲如期开讲,商汤科技集团副总裁、商业与数据洞察事业群总裁、工程院院长沈徽带来了《AI创新与落地》的分享',
           'article_title': '北大AI公开课2019 | 商汤科技沈徽:AI创新与落地',
           'author':
               [{'uid': 1277332,
                 'nickname': '蔡芳芳',
                 'avatar': ''}],
           'ctime': 1552714233280,
           'is_collect': False,
           'no_author': '',
           'publish_time': 1552716000000,
           'score': 1552716000000,
           'sub_author': [],
           'sub_topic': [],
           'topic':
               [{'id': 31,
                 'name': 'AI'},
                {'id': 3,
                 'name': '文化 & 方法'},
                {'id': 127,
                 'name': '计算机视觉'}],
           'type': 1,
           'utime': 1552716005424,
           'uuid': 'GoNSrsxpC0AT6a3V-6NQ',
           'views': 0},
          {'aid': 22232,
           'article_cover': 'https://static001.geekbang.org/resource/image/ee/47/ee719a85e81ae073f79039ba9da3df47.jpg',
           'article_cover_point': '{"big":{"point":{"x":0,"y":309,"w":5184,"h":2672}},"small":{"point":{"x":506,"y":332,"w":4040,"h":3001}},"width":5184,"height":3456}',
           'article_sharetitle': '3·15曝光丨智能机器人一年拨打40亿个骚扰电话,6亿人信息已遭泄露!',
           'article_subtitle': '',
           'article_summary': '在昨晚的315晚会上,一条探针盒子+数据匹配+智能外呼机器人的灰色产业链遭到曝光。据报道,智能外呼机器人一年拨打电话可达40多亿个,探针盒子公司收集有全国6亿用户的各类信息!',
           'article_title': '3·15曝光丨智能机器人一年拨打40亿个骚扰电话,6亿人信息已遭泄露!',
           'author':
               [{'uid': 1000106,
                 'nickname': '小智',
                 'avatar': 'https://static001.geekbang.org/account/avatar/00/0f/42/aa/b9a67c2e.jpg'}],
           'ctime': 1552698015145,
           'is_collect': False,
           'no_author': '',
           'publish_time': 1552698000000,
           'score': 1552698000000,
           'sub_author': [],
           'sub_topic': [],
           'topic': [{'id': 21,
                      'name': '安全'},
                     {'id': 15,
                      'name': '大数据'},
                     {'id': 148,
                      'name': '信息泄露'}],
           'type': 1,
           'utime': 1552698015145,
           'uuid': 'NgG*uSOwwI2OVhA80o0G',
           'views': 0},
          {'aid': 22230,
           'article_cover': 'https://static001.geekbang.org/resource/image/30/6f/3087f86b2cbe7b3222c3bb5d7557126f.jpg',
           'article_cover_point': '{"big":{"point":{"x":0,"y":0,"w":1279,"h":659}},"small":{"point":{"x":51,"y":0,"w":1148,"h":852}},"width":1280,"height":853}',
           'article_sharetitle': '这可能是人工智能领域覆盖最全的一份技术趋势报告',
           'article_subtitle': '',
           'article_summary': '这份报告对AI领域的技术预测可谓面面俱到,无论是对于AI企业、研究者,还是AI学习者来说都有一定参考价值',
           'article_title': '这可能是人工智能领域覆盖最全的一份技术趋势报告',
           'author': [{'uid': 1462160,
                       'nickname': '未来今日研究所',
                       'avatar': ''}],
           'ctime': 1552698005153,
           'is_collect': False,
           'no_author': '',
           'publish_time': 1552698000000,
           'score': 1552698000000,
           'sub_author': [],
           'sub_topic': [],
           'topic':
               [{'id': 31,
                 'name': 'AI'},
                {'id': 1,
                 'name': '语言 & 开发'},
                {'id': 45,
                 'name': '物联网'}],
           'translator': [{'uid': 1282296,
                           'nickname': 'Debra',
                           'avatar': ''}],
           'type': 1,
           'utime': 1552698005153,
           'uuid': 'A315uodoMbWrNrZh*MzP',
           'views': 0},
          ......
          {'aid': 22225,
           'article_cover': 'https://static001.geekbang.org/resource/image/6f/a2/6f3b4ff25e9e2e926e8e2bd4436f96a2.jpg',
           'article_cover_point': '{"big":{"point":{"x":0,"y":10,"w":1000,"h":515}},"small":{"point":{"x":136,"y":12,"w":761,"h":565}},"width":1000,"height":600}',
           'article_sharetitle': 'Google 和 Facebook 披露全球范围宕机原因',
           'article_subtitle': '',
           'article_summary': '昨日,Google、Facebook两巨头在同一天相继发生全球大规模宕机,其中Facebook的断电时常更是超过10小时之久。',
           'article_title': 'Google 和 Facebook 披露全球范围宕机原因',
           'author':
               [{'uid': 1278039,
                 'nickname': '张婵',
                 'avatar': ''}],
           'ctime': 1552642208366,
           'is_collect': False,
           'no_author': '',
           'publish_time': 1552642207254,
           'score': 1552642207254,
           'sub_author': [],
           'sub_topic': [],
           'topic':
               [{'id': 3,
                 'name': '文化 & 方法'},
                {'id': 147,
                 'name': '企业动态'},
                {'id': 48,
                 'name': '方法论'}],
           'type': 1,
           'utime': 1552642208366,
           'uuid': 'e-NCah5RTmJMrvmrmbCU',
           'views': 0}],
 'error': {},
 'extra':
     {'cost': 0.028648169,
      'request-id': '1ceebf53fd24003586f1272cc881b7fd@2@infoq'}}

你可能感兴趣的:(Python基础学习20)