import requests
import re
from pyquery import PyQuery as pq
""""
Scrape information from soupu.com.
url = http://www.soupu.com/pinpai/list.aspx?byt=6&syt=606&pptype=0
"""
class SoPu(object):
    """Scraper for the listing page on soupu.com.

    ``get_url_list`` downloads the listing HTML; ``get_content`` splits that
    HTML into records and prints the fields extracted from each record.

    NOTE(review): the regular expressions this class was written with were
    lost from this file (the HTML tags inside them were stripped, leaving only
    fragments such as ``class="table_style2">(.*?)``). They cannot be rebuilt
    from those fragments, so ``ROW_PATTERN`` and every entry of
    ``FIELD_PATTERNS`` are ``None`` and must be restored from the page markup
    before ``get_content`` can extract anything.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    # Value reported for a field whose pattern is missing or does not match.
    UNKNOWN = '未知'

    # Pattern (string or compiled) whose single capture group holds the text
    # of one record. NOTE(review): lost, see the class docstring.
    ROW_PATTERN = None

    # (label, pattern) pairs in the order the fields are printed. Each pattern
    # needs one capture group holding the field value; ``None`` marks a
    # pattern that still has to be restored.
    FIELD_PATTERNS = (
        ('Shop_name', None),
        ('Shop_img_url', None),
        ('Compay_namme', None),
        ('Compay_adress', None),
        ('Compay_type', None),
        # Previously printed under a copy-pasted 'Compay_type:' label.
        ('Compay_eara', None),
        ('Compay_extend', None),
        ('Compay_data', None),
        ('Compay_updata_data', None),
        ('Compay_follow', None),
    )

    def __init__(self) -> None:
        """Set the listing URL and the HTTP headers sent with each request."""
        self.url = "http://www.soupu.com/pinpai/list.aspx?byt=6&syt=606&pptype=0"
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        }

    def get_url_list(self):
        """Download the listing page.

        Returns:
            The response body as text, or ``None`` when the request fails or
            the server answers with an HTTP error status (a message is
            printed in that case).
        """
        # The earlier version tested ``doc('.ctl00_main_NoDataPanel').attr``,
        # a bound method that is always truthy, so its failure branch could
        # never run, and it downloaded the page twice. A single checked
        # request replaces both.
        # NOTE(review): if an empty-result panel must be detected, test the
        # length of the pyquery selection rather than its ``attr`` method.
        try:
            response = requests.get(
                url=self.url,
                headers=self.header,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            print("无法获取本页面内容", exc)
            return None
        return response.text

    def get_content(self, html):
        """Print and return the fields of every record found in ``html``.

        Args:
            html: Page source as returned by ``get_url_list``; may be ``None``
                or empty when the download failed.

        Returns:
            A list with one dict per record, mapping each label in
            ``FIELD_PATTERNS`` to the captured text, or to ``UNKNOWN`` when
            the field's pattern is missing or does not match. The list is
            empty when there is no HTML or ``ROW_PATTERN`` is not set.
        """
        if not html or self.ROW_PATTERN is None:
            return []

        # Compile once, outside the loops; re.compile accepts both pattern
        # strings and already-compiled patterns.
        row_re = re.compile(self.ROW_PATTERN)
        field_res = [
            (label, None if pattern is None else re.compile(pattern))
            for label, pattern in self.FIELD_PATTERNS
        ]

        records = []
        for row in row_re.findall(html):
            record = {}
            for label, field_re in field_res:
                match = None if field_re is None else field_re.search(row)
                value = self.UNKNOWN if match is None else match.group(1)
                print(label + ':', value)
                record[label] = value
            records.append(record)
        return records

    def net_page(self):
        """Placeholder; not implemented yet."""
        pass


if __name__ == '__main__':
    sopu = SoPu()
    html = sopu.get_url_list()
    sopu.get_content(html)