最近有个需求,要爬取某个物流公司的官网信息。我看了下官网,基本上都是静态页面,比较好抓取,不像资讯类、电子商务类网站那样结构复杂、反爬严格、AJAX众多,内心还暗自庆幸。但进一步分析后才发现,它并非普通的静态页面。
例如这个URL界面,我要获取全中国各大城市的物流园区分布信息,并且要获取详情信息,
这个页面里面嵌入了一个地图,每个城市的物流信息都需要单独点击地图上对应的信息才能显示。
https://www.glprop.com.cn/our...
我刚开始想,这种会不会是AJAX请求呢?但通过Chrome抓包并没有发现相关请求,然后我查看了网页源代码,
发现所有城市信息都在一个<script>标签里面。
如图:
然后各个园区的信息存放在一个名为parks的JavaScript变量里(var parks = {...})。
原来都在这里面,直接获取源代码,正则匹配,开干。
item:
# GLP (Puluosi) news entry
class PuluosiNewsItem(scrapy.Item):
    """One news entry scraped from a news listing table.

    The field names are used as keys by the spider and the pipeline, so
    they are kept exactly as declared (including the ``newtiems`` spelling).
    """

    newstitle = scrapy.Field()  # headline text
    newtiems = scrapy.Field()   # publication time text
    newslink = scrapy.Field()   # absolute URL of the news article
class PuluosiItem(scrapy.Item):
    """One asset (logistics park) record scraped from the network page."""

    assetstitle = scrapy.Field()    # asset title
    assetaddress = scrapy.Field()   # asset address (the spider stores the city name here)
    assetgaikuang = scrapy.Field()  # asset overview text
    assetpeople = scrapy.Field()    # other information (description with HTML tags removed)
    asseturl = scrapy.Field()       # URL associated with the asset
pipelines:
class PuluosiNewsPipeline(object):
    """Collect scraped items into two Excel workbooks.

    ``PuluosiNewsItem`` rows are written to ``PuluosiNews.xlsx`` and
    ``PuluosiItem`` rows to ``PuluosiAsset.xlsx``.

    Rows are buffered in memory and each workbook is written once, in
    ``close_spider``. Saving after every single item re-serialises the whole
    workbook each time, so the total cost grows quadratically with the
    number of items.
    """

    def __init__(self):
        # News workbook with its header row.
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['普洛斯新闻标题', '新闻发布时间', '新闻URL'])

        # Asset workbook with its header row.
        self.wb2 = Workbook()
        self.ws2 = self.wb2.active
        self.ws2.append(['资产标题', '资产地址', '资产概况', '其他信息', 'URL'])

        # Track whether any data row was added, so that a workbook which
        # received no items is not written to disk at all.
        self._news_dirty = False
        self._asset_dirty = False

    def process_item(self, item, spider):
        """Append *item* to the matching worksheet and pass it on.

        Items of any other type are returned untouched.
        """
        if isinstance(item, PuluosiNewsItem):
            self.ws.append(
                [item['newstitle'], item['newtiems'], item['newslink']]
            )
            self._news_dirty = True
        elif isinstance(item, PuluosiItem):
            self.ws2.append([
                item['assetstitle'],
                item['assetaddress'],
                item['assetgaikuang'],
                item['assetpeople'],
                item['asseturl'],
            ])
            self._asset_dirty = True
        return item

    def close_spider(self, spider):
        """Write each workbook that received at least one row."""
        if self._news_dirty:
            self.wb.save('PuluosiNews.xlsx')
        if self._asset_dirty:
            self.wb2.save('PuluosiAsset.xlsx')
spider:
# -*- coding: utf-8 -*-
import scrapy,re,json
from news.items import PuluosiNewsItem,PuluosiItem
from scrapy.linkextractors import LinkExtractor
class PuluosiSpider(scrapy.Spider):
    """Scrape news listings and logistics-park data from glprop.com.cn.

    Three news listing pages produce ``PuluosiNewsItem`` objects; the
    network-detail page produces one ``PuluosiItem`` per park.
    """

    name = 'puluosi'
    allowed_domains = ['glprop.com.cn']

    # The network-detail page carries its data inline in the HTML as
    # ``var cities =...;`` and ``var parks =...;`` JavaScript assignments.
    _CITIES_RE = re.compile(r'var cities =(.*);')
    _PARKS_RE = re.compile(r'var parks =(.*);')
    _TAG_RE = re.compile(r'<.*?>')

    _ROWS_XPATH = '//tbody/tr'
    _NEXT_PAGE_XPATH = (
        '//div[@class="page"]/a[contains(text(),"下一页")]/@href'
    )
    # Presumably the per-year archive links of a news listing -- confirm
    # against the live page.
    _TIME_LIST_XPATH = '//ul[@class="timeList"]/li/a/@href'

    def start_requests(self):
        """Schedule the three news listings and the network-detail page."""
        yield scrapy.Request(
            'https://www.glprop.com.cn/press-releases.html',
            callback=self.parse1,
        )
        yield scrapy.Request(
            'https://www.glprop.com.cn/in-the-news.html',
            callback=self.parse2,
        )
        yield scrapy.Request(
            'https://www.glprop.com.cn/proposed-privatization.html',
            callback=self.parse3,
        )
        yield scrapy.Request(
            'https://www.glprop.com.cn/our-network/network-detail.html',
            callback=self.parse4,
        )

    def _news_items(self, response, rows):
        """Yield one ``PuluosiNewsItem`` for each usable table row in *rows*.

        A new item is created per row: re-using a single item object would
        let later rows overwrite items that were already yielded. The link
        is read from the row itself rather than from the whole row list,
        which would return the first row's link for every item.
        """
        for row in rows:
            title = row.xpath('.//a/text()').extract_first()
            published = row.xpath('.//td/text()').extract_first()
            href = row.xpath('.//a/@href').extract_first()
            if title is None or published is None or href is None:
                # Rows without a link or a text cell cannot form an item.
                self.logger.debug(
                    'Skipping incomplete news row on %s', response.url
                )
                continue

            item = PuluosiNewsItem()
            item['newstitle'] = title.strip()
            item['newtiems'] = published.strip()
            # urljoin resolves a relative href against the page URL.
            item['newslink'] = response.urljoin(href)
            self.logger.debug(
                'news row: %s | %s | %s',
                item['newstitle'], item['newtiems'], item['newslink'],
            )
            yield item

    def _follow_listing_links(self, response, callback):
        """Yield requests for the next page and for the ``timeList`` links."""
        next_href = response.xpath(self._NEXT_PAGE_XPATH).extract_first()
        if next_href:
            yield scrapy.Request(
                response.urljoin(next_href), callback=callback
            )
        else:
            self.logger.debug("当前页面没有下一页")

        for href in response.xpath(self._TIME_LIST_XPATH).extract():
            if href:
                yield scrapy.Request(
                    response.urljoin(href), callback=callback
                )

    def _parse_paged_listing(self, response, callback):
        """Shared body of ``parse1`` and ``parse2``.

        Skips the first table row, yields the news items, then follows the
        pagination and ``timeList`` links with *callback*.
        """
        # Slicing (instead of pop(0)) also copes with an empty table.
        rows = response.xpath(self._ROWS_XPATH)[1:]
        for item in self._news_items(response, rows):
            yield item
        for request in self._follow_listing_links(response, callback):
            yield request

    def parse1(self, response):
        """Parse a page of the press-releases listing."""
        self.logger.info('此时启动的爬虫为:puluosi')
        for result in self._parse_paged_listing(response, self.parse1):
            yield result

    def parse2(self, response):
        """Parse a page of the in-the-news listing."""
        for result in self._parse_paged_listing(response, self.parse2):
            yield result

    def parse3(self, response):
        """Parse the proposed-privatization listing.

        Unlike the other listings, the last table row is dropped (not the
        first) and no further pages are followed.
        """
        rows = response.xpath(self._ROWS_XPATH)[:-1]
        for item in self._news_items(response, rows):
            yield item

    @staticmethod
    def _load_js_value(pattern, text):
        """Return the JSON value captured by *pattern* in *text*, or None.

        None is returned when the pattern does not match or when the
        captured text is not valid JSON.
        """
        match = pattern.search(text)
        if match is None:
            return None
        try:
            return json.loads(match.group(1))
        except ValueError:
            return None

    # NOTE: ``parse4`` used to be defined twice in this class. The later
    # definition replaced the earlier one when the class was created, and
    # the earlier one scheduled a ``parse5`` callback that does not exist,
    # so only the effective definition is kept.
    def parse4(self, response):
        """Yield one ``PuluosiItem`` per park embedded in the network page."""
        cities = self._load_js_value(self._CITIES_RE, response.text)
        parks = self._load_js_value(self._PARKS_RE, response.text)
        if cities is None or parks is None:
            self.logger.error(
                'Could not read the embedded cities/parks data from %s',
                response.url,
            )
            return

        # Map each city id to its display name.
        city_names = {city['id']: city['name'] for city in cities}

        # ``parks`` is a two-level mapping; every inner value is one park.
        for group in parks.values():
            for park in group.values():
                city_id = park['city_id']
                city_name = city_names.get(city_id)
                if city_name is None:
                    self.logger.warning(
                        'Unknown city_id %r for park %r',
                        city_id, park.get('name'),
                    )
                    city_name = ''

                item = PuluosiItem()
                item['assetaddress'] = city_name
                item['assetstitle'] = park['name']
                item['assetgaikuang'] = (
                    park['detail_single'].strip().replace(' ', '')
                )
                # Drop the HTML tags from the description, then the spaces.
                description = self._TAG_RE.sub(
                    '', park['description'].strip()
                )
                item['assetpeople'] = description.replace(' ', '')
                item['asseturl'] = (
                    'https://www.glprop.com.cn/network-city-detail.html?city='
                    + city_name
                )
                yield item
然后我顺便把页面的新闻信息也爬取了。