python 爬虫小例子2-知乎 热榜

知乎热榜摘要

image.png

1、正则表达式匹配方式

import requests
from bs4 import BeautifulSoup
from lxml import etree  #首先导入lxml库的etree模块
import re
### Request headers: add these if the site rejects or garbles plain requests.
import os

# A browser-like User-Agent so the request is not served as an obvious bot.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

# The logged-in session cookie is a credential, so it is read from the
# ZHIHU_COOKIE environment variable instead of being hard-coded in the
# source (copy the Cookie request header from your own browser session).
# The key is only added when a value is present, so no empty Cookie header
# is ever sent.
_cookie = os.environ.get('ZHIHU_COOKIE', '').strip()
if _cookie:
    header['Cookie'] = _cookie

#### Target URL: the Zhihu hot-list page.
zhihu_url = 'https://www.zhihu.com/hot'

# requests applies no timeout by default; without one a stalled connection
# would block the script forever.
html = requests.get(url=zhihu_url, headers=header, timeout=10)
# Stop on 4xx/5xx responses instead of silently matching nothing against an
# error page.
html.raise_for_status()

### Regex approach: the excerpts are pulled straight out of the raw response
### text, so no HTML parse is needed here. The pattern captures the "text"
### value of every "excerptArea" object; re.S lets '.' span newlines and
### re.I makes the match case-insensitive.
int_re = re.compile(r'"excerptArea":{"text":"(.*?)"}', re.S | re.I)
int_results = int_re.findall(html.text)
for int_r in int_results:
    # The lazy group can match an empty string; skip those entries.
    if not int_r:
        continue
    print(int_r)
    print('-' * 20)

2、select 选择器

import requests
from bs4 import BeautifulSoup
from lxml import etree  #首先导入lxml库的etree模块
import re
### Request headers: add these if the site rejects or garbles plain requests.
import os

# A browser-like User-Agent so the request is not served as an obvious bot.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

# The logged-in session cookie is a credential, so it is read from the
# ZHIHU_COOKIE environment variable instead of being hard-coded in the
# source (copy the Cookie request header from your own browser session).
# The key is only added when a value is present, so no empty Cookie header
# is ever sent.
_cookie = os.environ.get('ZHIHU_COOKIE', '').strip()
if _cookie:
    header['Cookie'] = _cookie
#### Target URL: the Zhihu hot-list page.

zhihu_url = 'https://www.zhihu.com/hot'

# requests applies no timeout by default; without one a stalled connection
# would block the script forever.
html = requests.get(url=zhihu_url, headers=header, timeout=10)
# Stop on 4xx/5xx responses instead of parsing an error page.
html.raise_for_status()

#### Parse the response body with the lxml parser.

soup = BeautifulSoup(html.content, 'lxml')


def _text_of(node, selector):
    """Return the text of the first match for *selector* under *node*, or ''.

    Indexing ``select(...)[0]`` raises IndexError as soon as one entry lacks
    the element; ``select_one`` returns None instead, which is mapped to an
    empty string here.
    """
    match = node.select_one(selector)
    return match.text if match is not None else ''


### Select every element with the HotItem class. The class names are taken
### from the page markup and must be adjusted if the page changes.

content = soup.select('.HotItem')
for k in content:
    title = _text_of(k, '.HotItem-content .HotItem-title')
    hot = _text_of(k, '.HotItem-content .HotItem-metrics')
    print(title, hot)
    tro = _text_of(k, '.HotItem-content .HotItem-excerpt')
    print(tro)

3、xpath方式

import requests
from bs4 import BeautifulSoup
from lxml import etree  #首先导入lxml库的etree模块
import re
### Request headers: add these if the site rejects or garbles plain requests.
import os

# A browser-like User-Agent so the request is not served as an obvious bot.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}

# The logged-in session cookie is a credential, so it is read from the
# ZHIHU_COOKIE environment variable instead of being hard-coded in the
# source (copy the Cookie request header from your own browser session).
# The key is only added when a value is present, so no empty Cookie header
# is ever sent.
_cookie = os.environ.get('ZHIHU_COOKIE', '').strip()
if _cookie:
    header['Cookie'] = _cookie
#### Target URL: the Zhihu hot-list page.

zhihu_url = 'https://www.zhihu.com/hot'

# requests applies no timeout by default; without one a stalled connection
# would block the script forever.
html = requests.get(url=zhihu_url, headers=header, timeout=10)
# Stop on 4xx/5xx responses instead of parsing an error page.
html.raise_for_status()

#### Parse the response body with the lxml parser.

soup = BeautifulSoup(html.content, 'lxml')

# Re-parse the serialized soup with lxml.etree so XPath queries can be used.
tree = etree.HTML(str(soup))
# In a predicate, "=" requires the attribute value to match exactly.
# Walk down level by level: the HotList-list div contains the <section>
# entries; div[2] below selects the second <div> child of each section.
sections = tree.xpath('//div[@class="HotList-list"]/section')
for i in sections:
    # Each xpath() call returns a list of text nodes, which is empty when
    # the element is absent, so a missing excerpt or title prints as [].
    tro = i.xpath('./div[2]/a/p/text()')
    title = i.xpath('./div[2]/a/h2/text()')
    print(title, tro)

你可能感兴趣的:(python 爬虫小例子2-知乎 热榜)