Python 爬虫:如果爬取解析到的文本带有 style="display: none"、没有找到相关内容,该怎样解决——修改文本属性获取新的内容

在爬取网页内容的时候,有时会遇到 text 的内容为空的情况,如下图所示。这应该就是 display: none 的问题:遇到这样的问题,要改变 CSS 的 style 中的内容,这需要用到 pyquery 这个库。

python 如果遇到爬取解析到得文本为 style=

下图是浏览器 F12 开发者工具中原始的 display: block 状态下显示的内容。所以爬取的时候,内容一律以实际得到的内容为准,也就是 requests.get(url=" ", headers=" ").text 返回的文本。

python 如果遇到爬取解析到得文本为 style=

不多说,直接上代码,下面有详细代码。本文是用正则表达式提取得到结果的,代码看不懂的可以在评论区留言。最终结果如图所示: python 如果遇到爬取解析到得文本为 style=

import requests
import re
from pyquery import PyQuery as pq


""""
        爬取sopu信息
        url = http://www.soupu.com/pinpai/list.aspx?byt=6&syt=606&pptype=0
"""

class SoPu(object):
    """Scraper for the brand listing page of soupu.com.

    ``get_url_list`` downloads the listing page and ``get_content`` pulls
    the per-record fields out of the returned HTML with regular
    expressions, printing each field as it goes.
    """

    # NOTE(review): the HTML tags that used to sit inside every pattern
    # below were stripped when this script was extracted from the article
    # it was published in. Only the surviving fragments are kept here; the
    # tag text must be restored against the live page markup before these
    # patterns can extract anything meaningful.

    # Isolates one listing record; the closing delimiter after the group
    # is missing (see the note above).
    _RECORD_PATTERN = re.compile(r'class="table_style2">(.*?)', re.M | re.S)

    # (printed label, pattern) pairs, in the order the fields are printed.
    _FIELD_PATTERNS = (
        ('Shop_name', re.compile(r"", re.M | re.S)),
        ('Shop_img_url', re.compile(r"", re.M | re.S)),
        ('Compay_namme', re.compile(r" (.*?)", re.M | re.S)),
        ('Compay_adress', re.compile(r"(.*?)")),
        ('Compay_type', re.compile(r".*? (.*?)")),
        ('Compay_eara', re.compile(r".*? .*?(.*?)")),
        ('Compay_extend', re.compile(r".*?(.*?)")),
        ('Compay_data', re.compile(r".*?(.*?)")),
        ('Compay_updata_data', re.compile(r"(.*?)")),
        ('Compay_follow', re.compile(r"(.*?)")),
    )

    def __init__(self) -> None:
        """Store the listing URL and the request headers."""
        self.url = "http://www.soupu.com/pinpai/list.aspx?byt=6&syt=606&pptype=0"
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        }

    def get_url_list(self):
        """Download the listing page and return its HTML text.

        Returns:
            The response body as ``str``, or ``None`` when the request
            fails (a message is printed in that case).
        """
        # ``display: none`` is only applied by a browser when rendering;
        # the raw response text already contains the hidden markup, so a
        # single request is enough. The previous version tested the
        # ``.attr`` accessor without calling it (always truthy), called
        # ``.attr("display: block")`` as a getter, and then fetched the
        # same URL a second time, so its parsed document was never used.
        try:
            response = requests.get(url=self.url, headers=self.header, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            print("无法获取本页面内容")
            return None
        return response.text

    @staticmethod
    def _first_group(pattern, text):
        """Return group 1 of the first match of *pattern*, else '未知'."""
        match = pattern.search(text)
        # A pattern without a capture group would raise IndexError on
        # ``group(1)``; treat it the same as "no match".
        if match is None or pattern.groups < 1:
            return '未知'
        return match.group(1)

    def get_content(self, html):
        """Extract and print the fields of every record found in *html*.

        Args:
            html: Page source as returned by ``get_url_list``; ``None`` or
                an empty string yields no records.

        Returns:
            A list with one dict per record, mapping each printed label
            to the extracted value ('未知' when a field is not found).
        """
        if not html:
            # get_url_list returns None on failure; nothing to parse.
            return []

        records = []
        for each in self._RECORD_PATTERN.findall(html):
            record = {}
            for label, pattern in self._FIELD_PATTERNS:
                value = self._first_group(pattern, each)
                record[label] = value
                print(label + ':', value)
            records.append(record)
        return records

    def net_page(self):
        """Placeholder for pagination; not implemented."""
        pass


if __name__ == '__main__':
    sopu = SoPu()
    html = sopu.get_url_list()
    sopu.get_content(html)

你可能感兴趣的:(爬虫)