爬取【京客隆超市】店铺信息

  1. 导入所需库
import requests
import pandas as pd
from lxml import etree
  2. 爬取各区链接
# Index page that lists every district ("区") of the Jingkelong shop directory.
url = 'http://www.jkl.com.cn/cn/shop.aspx'
# Desktop Chrome User-Agent so the site serves the regular HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
response = requests.get(url,headers=headers).text
html = etree.HTML(response)
# District display names (e.g. 朝阳区); raw text carries surrounding whitespace.
city_name = html.xpath('//div[@class="infoLis"]//a/text()')
city_name = [i.strip() for i in city_name]
# Hrefs are relative ('shopLis.aspx?id=...') — prefix the site base to make them absolute.
city_url = html.xpath('//div[@class="infoLis"]//a/@href')
city_url = ['http://www.jkl.com.cn/cn/' + i  for i in city_url]

  3. 爬取各区店铺信息(其中仅有一个大区需要翻页,其余大区只有一页)

# Scrape every district's shop list and append it to one CSV.
# The district with id=865 spans several pages (ASP.NET AspNetPager postback);
# every other district fits on a single page.
OUTPUT_CSV = "e:/爬取【京客隆超市】店铺信息.csv"

def _save_shops(page_html):
    """Extract the four shop columns from a parsed page and append them to the CSV."""
    shop_name = [s.strip() for s in page_html.xpath('//span[@class="con01"]/text()')]
    shop_dis = page_html.xpath('//span[@class="con02"]/text()')
    shop_phone = page_html.xpath('//span[@class="con03"]/text()')
    shop_time = page_html.xpath('//span[@class="con04"]/text()')
    print(shop_name)
    df = pd.DataFrame({"店铺名称": shop_name, "店铺地址": shop_dis,
                       "联系方式": shop_phone, "营业时间": shop_time})
    # Append without header row; NOTE(review): "ANSI" is a Windows-only codec
    # alias (the local code page, GBK on Chinese Windows) — confirm target OS.
    df.to_csv(OUTPUT_CSV, index=False, header=0, mode="a", encoding="ANSI")

for district_url in city_url:
    if district_url == 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865':
        # Paginated district: drive the AspNetPager postback for pages 1-3.
        for page in range(1, 4):
            form_data = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': page
            }
            response3 = requests.post(url=district_url, data=form_data, headers=headers).text
            _save_shops(etree.HTML(response3))
            print('*' * 30)
    else:
        # Single-page district: a plain POST returns the whole list.
        # BUGFIX: the save now happens inside this branch — the original saved at
        # loop level, re-appending stale single-page data after the paginated
        # branch (and raising NameError if that branch ran first).
        response1 = requests.post(url=district_url, headers=headers).text
        _save_shops(etree.HTML(response1))

完整代码

#爬取【京客隆超市】店铺信息
import requests
import pandas as pd
from lxml import etree
# Fetch the shop index page and collect every district's name and link.
url = 'http://www.jkl.com.cn/cn/shop.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
response = requests.get(url, headers=headers).text
html = etree.HTML(response)
# District display names, stripped of surrounding whitespace.
city_name = list(map(str.strip, html.xpath('//div[@class="infoLis"]//a/text()')))
# Relative hrefs ('shopLis.aspx?id=...') turned into absolute URLs.
city_url = ['http://www.jkl.com.cn/cn/{}'.format(href)
            for href in html.xpath('//div[@class="infoLis"]//a/@href')]
# Scrape every district's shop list and append it to one CSV.
# The district with id=865 spans several pages (ASP.NET AspNetPager postback);
# every other district fits on a single page.
OUTPUT_CSV = "e:/爬取【京客隆超市】店铺信息.csv"

def _save_shops(page_html):
    """Extract the four shop columns from a parsed page and append them to the CSV."""
    shop_name = [s.strip() for s in page_html.xpath('//span[@class="con01"]/text()')]
    shop_dis = page_html.xpath('//span[@class="con02"]/text()')
    shop_phone = page_html.xpath('//span[@class="con03"]/text()')
    shop_time = page_html.xpath('//span[@class="con04"]/text()')
    print(shop_name)
    df = pd.DataFrame({"店铺名称": shop_name, "店铺地址": shop_dis,
                       "联系方式": shop_phone, "营业时间": shop_time})
    # Append without header row; NOTE(review): "ANSI" is a Windows-only codec
    # alias (the local code page, GBK on Chinese Windows) — confirm target OS.
    df.to_csv(OUTPUT_CSV, index=False, header=0, mode="a", encoding="ANSI")

for district_url in city_url:
    if district_url == 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865':
        # Paginated district: drive the AspNetPager postback for pages 1-3.
        for page in range(1, 4):
            form_data = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': page
            }
            response3 = requests.post(url=district_url, data=form_data, headers=headers).text
            _save_shops(etree.HTML(response3))
            print('*' * 30)
    else:
        # Single-page district: a plain POST returns the whole list.
        # BUGFIX: the save now happens inside this branch — the original saved at
        # loop level, re-appending stale single-page data after the paginated
        # branch (and raising NameError if that branch ran first). Also renamed
        # the POST payload dict, which previously shadowed the DataFrame as `date`.
        response1 = requests.post(url=district_url, headers=headers).text
        _save_shops(etree.HTML(response1))
#以上循环同时处理了店铺不止一页(需翻页)和只有一页的两种大区情况



        
    

你可能感兴趣的:(python,网页爬虫)