1. 导入所需库
import requests
import pandas as pd
from lxml import etree
2. 爬取各区链接
# Fetch the store-locator landing page and extract each district's name and
# listing URL from the sidebar (div.infoLis).
url = 'http://www.jkl.com.cn/cn/shop.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
# timeout= keeps the script from hanging forever on a stalled connection
response = requests.get(url, headers=headers, timeout=10).text
html = etree.HTML(response)
# District display names, whitespace-stripped.
city_name = [i.strip() for i in html.xpath('//div[@class="infoLis"]//a/text()')]
# The hrefs are relative ("shopLis.aspx?id=..."); prefix the site root.
city_url = ['http://www.jkl.com.cn/cn/' + i for i in html.xpath('//div[@class="infoLis"]//a/@href')]
3. 当只存在一个大区需要翻页时
# Walk every district page and append its store rows to one CSV.
# Only district id=865 paginates (an ASP.NET AspNetPager postback, 3 pages);
# every other district fits on a single page.
for i in city_url:
    if i == 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865':
        for page in range(1, 4):
            # ASP.NET postback fields that select page `page` of the pager.
            form_data = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': page,
            }
            page_text = requests.post(url=i, data=form_data, headers=headers, timeout=10).text
            page_html = etree.HTML(page_text)
            shop_name = [d.strip() for d in page_html.xpath('//span[@class="con01"]/text()')]
            shop_dis = page_html.xpath('//span[@class="con02"]/text()')
            shop_phone = page_html.xpath('//span[@class="con03"]/text()')
            shop_time = page_html.xpath('//span[@class="con04"]/text()')
            print(shop_name)
            print('*' * 30)
            # Append this page's rows; header=0 keeps the CSV header-free so
            # repeated appends don't interleave header lines.
            # NOTE: encoding="ANSI" is a Windows-only alias (the active code page).
            df = pd.DataFrame({"店铺名称": shop_name, "店铺地址": shop_dis, "联系方式": shop_phone, "营业时间": shop_time})
            df.to_csv("e:/爬取【京客隆超市】店铺信息.csv", index=False, header=0, mode="a", encoding="ANSI")
    else:
        # Single-page district: one request, same extraction and append.
        page_text = requests.post(url=i, headers=headers, timeout=10).text
        page_html = etree.HTML(page_text)
        shop_name = [c.strip() for c in page_html.xpath('//span[@class="con01"]/text()')]
        shop_dis = page_html.xpath('//span[@class="con02"]/text()')
        shop_phone = page_html.xpath('//span[@class="con03"]/text()')
        shop_time = page_html.xpath('//span[@class="con04"]/text()')
        print(shop_name)
        # Persist the rows for this district.
        df = pd.DataFrame({"店铺名称": shop_name, "店铺地址": shop_dis, "联系方式": shop_phone, "营业时间": shop_time})
        df.to_csv("e:/爬取【京客隆超市】店铺信息.csv", index=False, header=0, mode="a", encoding="ANSI")
完整代码
#爬取【京客隆超市】店铺信息
import requests
import pandas as pd
from lxml import etree
# Fetch the store-locator landing page and extract each district's name and
# listing URL from the sidebar (div.infoLis).
url = 'http://www.jkl.com.cn/cn/shop.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'}
# timeout= keeps the script from hanging forever on a stalled connection
response = requests.get(url, headers=headers, timeout=10).text
html = etree.HTML(response)
# District display names, whitespace-stripped.
city_name = [i.strip() for i in html.xpath('//div[@class="infoLis"]//a/text()')]
# The hrefs are relative ("shopLis.aspx?id=..."); prefix the site root.
city_url = ['http://www.jkl.com.cn/cn/' + i for i in html.xpath('//div[@class="infoLis"]//a/@href')]
# Walk every district page and append its store rows to one CSV.
# Only district id=865 paginates (an ASP.NET AspNetPager postback, 3 pages);
# every other district fits on a single page.
for i in city_url:
    if i == 'http://www.jkl.com.cn/cn/shopLis.aspx?id=865':
        for page in range(1, 4):
            # ASP.NET postback fields that select page `page` of the pager.
            form_data = {
                '__EVENTTARGET': 'AspNetPager1',
                '__EVENTARGUMENT': page,
            }
            page_text = requests.post(url=i, data=form_data, headers=headers, timeout=10).text
            page_html = etree.HTML(page_text)
            shop_name = [d.strip() for d in page_html.xpath('//span[@class="con01"]/text()')]
            shop_dis = page_html.xpath('//span[@class="con02"]/text()')
            shop_phone = page_html.xpath('//span[@class="con03"]/text()')
            shop_time = page_html.xpath('//span[@class="con04"]/text()')
            print(shop_name)
            print('*' * 30)
            # Append this page's rows; header=0 keeps the CSV header-free so
            # repeated appends don't interleave header lines.
            # NOTE: encoding="ANSI" is a Windows-only alias (the active code page).
            df = pd.DataFrame({"店铺名称": shop_name, "店铺地址": shop_dis, "联系方式": shop_phone, "营业时间": shop_time})
            df.to_csv("e:/爬取【京客隆超市】店铺信息.csv", index=False, header=0, mode="a", encoding="ANSI")
    else:
        # Single-page district: one request, same extraction and append.
        page_text = requests.post(url=i, headers=headers, timeout=10).text
        page_html = etree.HTML(page_text)
        shop_name = [c.strip() for c in page_html.xpath('//span[@class="con01"]/text()')]
        shop_dis = page_html.xpath('//span[@class="con02"]/text()')
        shop_phone = page_html.xpath('//span[@class="con03"]/text()')
        shop_time = page_html.xpath('//span[@class="con04"]/text()')
        print(shop_name)
        # Persist the rows for this district.
        df = pd.DataFrame({"店铺名称": shop_name, "店铺地址": shop_dis, "联系方式": shop_phone, "营业时间": shop_time})
        df.to_csv("e:/爬取【京客隆超市】店铺信息.csv", index=False, header=0, mode="a", encoding="ANSI")
# 以上代码分别处理了区域内店铺不止一页(需翻页)与只有一页两种情况