This project scrapes property listings from Anjuke (anjuke.com). Without further ado, straight to the code!
I. Spider section:
1. Runner script: run.py
from scrapy import cmdline

# Equivalent to running `scrapy crawl anjuke_shanghai` in a terminal
cmdline.execute('scrapy crawl anjuke_shanghai'.split())
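This file simply hands the usual command line to Scrapy, so the crawl can be started with one click from an IDE instead of a terminal.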
2. Page parsing: anjuke_shanghai.py
import scrapy
import time

from anjuke.items import AnjukeItem


class AnjukeShanghaiSpider(scrapy.Spider):
    name = 'anjuke_shanghai'
    allowed_domains = ['anjuke.com']
    start_urls = ['https://shanghai.anjuke.com/sale/p11/#filtersort']
    next_page_id = 12

    def parse(self, response):
        for ajk in response.xpath("//ul[@id='houselist-mod-new']/li"):
            # Crude throttling. Note that time.sleep() blocks all of Scrapy,
            # so this holds the whole crawl to roughly one listing per 5 s.
            time.sleep(5)
            item = AnjukeItem()

            # Listing title
            title = ajk.xpath(".//div[@class='house-title']/a/text()").extract_first(default="")
            item['title'] = title.strip()

            # Total price
            item['price'] = ajk.xpath(".//span[@class='price-det']/strong/text()").extract_first(default="")

            # Unit price, with the trailing "元/m²" removed
            unit_price = ajk.xpath(".//span[@class='unit-price']/text()").extract_first(default="")
            item['unit_price'] = unit_price.replace("元/m²", "")

            # Location: collapse runs of whitespace into single spaces
            site = ajk.xpath(".//span[@class='comm-address']/text()").extract()
            item['site'] = " ".join(site[0].split()) if site else ""

            # Layout, e.g. "2室1厅"
            house_type = ajk.xpath(".//div[@class='details-item']/span[1]/text()").extract()
            item['house_type'] = house_type[0] if house_type else ""

            # Floor area, with the trailing "m²" removed
            area = ajk.xpath(".//div[@class='details-item']/span[2]/text()").extract_first(default="")
            item['area'] = area.replace("m²", "")

            # Link to the detail page
            item['house_url'] = ajk.xpath(".//div[@class='house-title']/a/@href").extract_first(default="")

            yield item

        # Queue the next results page, up to p49
        if self.next_page_id < 50:
            url = "https://shanghai.anjuke.com/sale/p{}/#filtersort".format(self.next_page_id)
            time.sleep(5)
            yield scrapy.Request(url=url, dont_filter=True, callback=self.parse)
            self.next_page_id += 1
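Before letting the full crawl run, the XPath selectors are worth sanity-checking in an interactive scrapy shell session. A quick check might look like this (run the shell from inside the project directory so the Cookie header from settings.py is applied):

# In a terminal: scrapy shell "https://shanghai.anjuke.com/sale/p11/#filtersort"
# Then, at the prompt, try the selectors used in parse():
lis = response.xpath("//ul[@id='houselist-mod-new']/li")   # one node per listing
lis[0].xpath(".//div[@class='house-title']/a/text()").extract_first()
lis[0].xpath(".//span[@class='price-det']/strong/text()").extract_first()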
3. items.py
import scrapy


class AnjukeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()       # listing title
    price = scrapy.Field()       # total price
    unit_price = scrapy.Field()  # unit price (per m²)
    site = scrapy.Field()        # location
    house_type = scrapy.Field()  # layout
    area = scrapy.Field()        # floor area
    house_url = scrapy.Field()   # detail-page URL
4. middlewares.py
from scrapy import signals
from itemadapter import is_item, ItemAdapter


class AnjukeSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class AnjukeDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
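Both classes above are the unmodified stubs generated by scrapy startproject; nothing in this crawl actually uses them. If Anjuke starts rejecting requests, process_request() in a downloader middleware is the natural hook for rotating User-Agent headers. A minimal sketch, assuming a hand-maintained list of UA strings (RandomUserAgentMiddleware is not part of the original project):

import random

# Illustrative pool of desktop User-Agent strings; extend as needed.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
]

class RandomUserAgentMiddleware:
    def process_request(self, request, spider):
        # Overwrite the User-Agent header before the request is downloaded.
        request.headers['User-Agent'] = random.choice(USER_AGENTS)
        return None

It would then be switched on in settings.py with DOWNLOADER_MIDDLEWARES = {'anjuke.middlewares.RandomUserAgentMiddleware': 543}.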
5. pipelines.py
from itemadapter import ItemAdapter
import pymysql


class AnjukePipeline:
    def __init__(self):
        # charset="utf8mb4" so Chinese titles and addresses round-trip intact
        self.connect = pymysql.connect(host="localhost", user="root",
                                       passwd="1234", db="anjuke",
                                       charset="utf8mb4")
        self.cursor = self.connect.cursor()
        print("Connected to the database")

    def process_item(self, item, spider):
        print("Saving item")
        insql = ("insert into anjuke_shanghai"
                 "(title,price,unit_price,site,house_type,area,house_url)"
                 " values (%s,%s,%s,%s,%s,%s,%s)")
        self.cursor.execute(insql, (
            item['title'], item['price'], item['unit_price'], item['site'],
            item['house_type'], item['area'], item['house_url']))
        self.connect.commit()
        print("Item saved")
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider() when the crawl ends; the original
        # "parse_close" name is never invoked by Scrapy.
        self.cursor.close()
        self.connect.close()
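Two details worth noting: the values are passed to cursor.execute() as a tuple rather than interpolated into the SQL string, so pymysql escapes any quotes in listing titles and the query is safe from injection; and committing per item keeps the table readable while the crawl is still running.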
6. settings.py
BOT_NAME = 'anjuke'

SPIDER_MODULES = ['anjuke.spiders']
NEWSPIDER_MODULE = 'anjuke.spiders'

ROBOTSTXT_OBEY = False
COOKIES_ENABLED = True

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # Session cookie copied from a logged-in browser; it expires, so paste in
    # a fresh one before crawling.
    'Cookie': 'aQQ_ajkguid=8E3DD02F-E811-A2DA-DA53-C1B88CD60608; id58=e87rkF/lNzIYHcjBD+SdAg==; _ga=GA1.2.93190540.1608857396; _gid=GA1.2.334371282.1608857396; 58tj_uuid=6fc5ade0-bfd0-4187-bd4e-9686d7082817; new_uv=1; als=0; sessid=B70FA124-E42F-8DAD-3813-6C91C72B7A20; ctid=11; twe=2; obtain_by=2; ajk_member_verify=QUbPDLTnm9FWHSOd33buoCZE2z1wm%2FVudTO6LdSsWYs%3D; ajk_member_verify2=MTYwMDA4MTUwfFUxNTU3Mjk4NzEwNDM3NXwx; xxzl_cid=7380c6b8f44840bea607d5323fb011f4; xzuid=a8fd56b1-e885-46cd-b255-5dcd8fa79dc4; ajkAuthTicket=TT=f841c95d589fd9118d083c3ba68b97a3&TS=1608895520230&PBODY=VcG9Y6AtpZbA4ERSDzm8x-gaGSpJliB6sqdOLZ5r43ZgbMtoUuIQ3_UEzjH93WSEcM1W26Q_96d7T9tcmKpasHOQN42asUK9WLXeGZ4ssbi9u2MLY5aKXbsVALuXFkdG1gu6vlvjxUMNOn_EEGoo7fk8RHanQCv-vKtjgHmzDBk&VER=2'
}

ITEM_PIPELINES = {
    'anjuke.pipelines.AnjukePipeline': 300,
}

# AutoThrottle is disabled by default; without this line the three
# AUTOTHROTTLE_* settings below have no effect.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
II. Data section:
1. Database contents:
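The pipeline above expects an anjuke_shanghai table to exist. A one-off setup script along these lines would create a compatible schema (only the column names are fixed by the INSERT; the types are my assumptions):

import pymysql

connect = pymysql.connect(host="localhost", user="root", passwd="1234",
                          charset="utf8mb4")
cursor = connect.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS anjuke DEFAULT CHARACTER SET utf8mb4")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS anjuke.anjuke_shanghai (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        price VARCHAR(32),
        unit_price VARCHAR(32),
        site VARCHAR(255),
        house_type VARCHAR(64),
        area VARCHAR(32),
        house_url VARCHAR(512)
    )
""")
connect.commit()
cursor.close()
connect.close()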
2. pyecharts analysis screenshots:
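The charts were rendered with pyecharts. A minimal sketch of the kind of script behind them, assuming one particular chart (average unit price per layout as a bar chart) rather than reproducing the originals:

import pymysql
from pyecharts import options as opts
from pyecharts.charts import Bar

connect = pymysql.connect(host="localhost", user="root", passwd="1234",
                          db="anjuke", charset="utf8mb4")
cursor = connect.cursor()
# Average unit price (CNY/m²) per layout, e.g. "2室1厅", top 10
cursor.execute("""
    SELECT house_type, AVG(CAST(unit_price AS DECIMAL(10,0)))
    FROM anjuke_shanghai
    WHERE unit_price != ''
    GROUP BY house_type
    ORDER BY 2 DESC
    LIMIT 10
""")
rows = cursor.fetchall()
cursor.close()
connect.close()

bar = (
    Bar()
    .add_xaxis([r[0] for r in rows])
    .add_yaxis("Avg unit price (CNY/m²)", [float(r[1]) for r in rows])
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Anjuke Shanghai listings"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=30)),
    )
)
bar.render("anjuke_unit_price.html")  # writes an interactive HTML chart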