Goal: use the Scrapy framework to crawl house listings for a Chengdu district on Anjuke and save them to MySQL.
Environment: PyCharm, Python 3.6, Scrapy, bs4, MySQL
Step 1: create the Scrapy project. In cmd, run scrapy startproject anjukespider to create a project named anjukespider, then cd into the project and run scrapy genspider anjuke_house anjuke.com to generate a spider named anjuke_house. You should then see the following files in the project:
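(A rough sketch of the standard layout created by scrapy startproject; anjuke_house.py is added by the genspider step.)

anjukespider/
    scrapy.cfg               # deploy configuration
    anjukespider/
        __init__.py
        items.py             # item field definitions
        middlewares.py
        pipelines.py         # item pipelines (the MySQL writing goes here)
        settings.py          # project settings
        spiders/
            __init__.py
            anjuke_house.py  # the spider generated above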
Next, configure the framework in the settings.py file. Note that the uncommented statements are the active configuration.
# -*- coding: utf-8 -*-
# Scrapy settings for anjukespider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'anjukespider'
SPIDER_MODULES = ['anjukespider.spiders']
NEWSPIDER_MODULE = 'anjukespider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'anjukespider (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'anjukespider.middlewares.AnjukespiderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'anjukespider.middlewares.AnjukespiderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'anjukespider.pipelines.AnjukespiderPipeline': 300,
}
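# Custom MySQL connection settings for this project (these are not built-in Scrapy
# settings; the sample pipeline below hardcodes the same values, but they could be
# read from here instead, as sketched after the pipeline code)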
MYSQL_HOST='localhost'
MYSQL_DB='anjuke'
MYSQL_USER='root'
MYSQL_PASSWD='123456'
MYSQL_PORT=3306
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Then define the fields you need in items.py, as follows:
import scrapy


class AnjukespiderItem(scrapy.Item):
    # note: 'titile' is spelled this way consistently in the spider, the SQL and the table column
    titile = scrapy.Field()      # listing title
    house_type = scrapy.Field()  # layout
    build_time = scrapy.Field()  # year built
    area = scrapy.Field()        # floor area
    address = scrapy.Field()     # community and address
    price = scrapy.Field()       # total price
    unit_price = scrapy.Field()  # price per square metre
With the fields defined, the crawling logic can be written in anjuke_house.py. Here is the code I wrote:
import scrapy
import bs4

from anjukespider.items import AnjukespiderItem


class AnjukeHouseSpider(scrapy.Spider):
    name = 'anjuke_house'
    # allowed_domains takes bare domains, not full URLs
    allowed_domains = ['anjuke.com']
    # the listing pages are paginated as /sale/p1/, /sale/p2/, ...
    start_urls = ['https://chengdu.anjuke.com/sale/p{}/'.format(i) for i in range(1, 51)]

    def parse(self, response):
        # parse the listing page with BeautifulSoup
        bsoup = bs4.BeautifulSoup(response.text, 'lxml')
        house_list = bsoup.find_all('li', class_="list-item")
        for house in house_list:
            try:
                titile = house.find('a').text.strip()
                house_type = house.find('div', class_='details-item').span.text
                build_time = house.find('div', class_='details-item').contents[7].text
                area = house.find('div', class_='details-item').contents[3].text
                address = house.find('span', class_='comm-address').text.strip()
                price = house.find('span', class_='price-det').text.strip()
                unit_price = house.find('span', class_='unit-price').text.strip()
                item = AnjukespiderItem()
                item['titile'] = titile
                item['house_type'] = house_type
                item['build_time'] = build_time
                item['area'] = area
                item['address'] = address
                item['price'] = price
                item['unit_price'] = unit_price
                yield item
            except Exception as e:
                # skip listings whose markup doesn't match the expected structure
                print(e)
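Before wiring up MySQL, the spider can be spot-checked on its own with Scrapy's feed export, e.g. scrapy crawl anjuke_house -o test.json, which dumps the yielded items to a JSON file.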
Since the data needs to be stored in MySQL, import pymysql to connect to the database.
I created the table in the database ahead of time; you can also create it from a SQL statement in pipelines.py, as sketched below.
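For reference, a minimal sketch of creating that table with pymysql; the column types and lengths here are my assumptions, only the column names need to match the INSERT statement in the pipeline.

import pymysql

# assumed schema -- all scraped values are kept as plain text
connect = pymysql.connect(host="localhost", user="root", passwd="123456",
                          db="anjuke", port=3306, charset="utf8")
cursor = connect.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS anjuke_house_info (
        id INT AUTO_INCREMENT PRIMARY KEY,
        titile VARCHAR(255),
        house_type VARCHAR(50),
        build_time VARCHAR(50),
        area VARCHAR(50),
        address VARCHAR(255),
        price VARCHAR(50),
        unit_price VARCHAR(50)
    ) DEFAULT CHARSET=utf8
""")
connect.commit()
cursor.close()
connect.close()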
In pipelines.py, Scrapy's pipeline file, set up the database connection and execute the INSERT statement:
import pymysql


class AnjukespiderPipeline(object):
    def __init__(self):
        # open the MySQL connection when the pipeline is instantiated
        self.connect = pymysql.connect(
            host="localhost",
            db="anjuke",
            user="root",
            port=3306,
            passwd="123456",
            charset="utf8",
            use_unicode=True,
            cursorclass=pymysql.cursors.DictCursor
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # insert one listing per item and commit immediately
        self.cursor.execute(
            'INSERT INTO anjuke_house_info(titile,house_type,build_time,area,address,price,unit_price) '
            'VALUES(%s,%s,%s,%s,%s,%s,%s)',
            (item['titile'], item['house_type'], item['build_time'],
             item['area'], item['address'], item['price'], item['unit_price']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        print('closing database resources................')
        # close the cursor
        self.cursor.close()
        # close the connection
        self.connect.close()
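Incidentally, the MYSQL_* values defined in settings.py are not actually used by the pipeline above, which hardcodes the connection details. A possible variant (not the original code; the class name MySQLSettingsPipeline is mine) that reads them from the settings via Scrapy's from_crawler hook could look roughly like this:

import pymysql


class MySQLSettingsPipeline(object):
    """Variant of the pipeline above that reads MYSQL_* from settings.py."""

    def __init__(self, settings):
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this with the running crawler, giving access to settings.py
        return cls(crawler.settings)

    def open_spider(self, spider):
        s = self.settings
        self.connect = pymysql.connect(
            host=s.get('MYSQL_HOST'),
            db=s.get('MYSQL_DB'),
            user=s.get('MYSQL_USER'),
            passwd=s.get('MYSQL_PASSWD'),
            port=s.getint('MYSQL_PORT'),
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        self.cursor.execute(
            'INSERT INTO anjuke_house_info(titile,house_type,build_time,area,address,price,unit_price) '
            'VALUES(%s,%s,%s,%s,%s,%s,%s)',
            (item['titile'], item['house_type'], item['build_time'],
             item['area'], item['address'], item['price'], item['unit_price']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()

If you switch to this class, the ITEM_PIPELINES entry in settings.py would need to point at it instead.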
Finally, run scrapy crawl anjuke_house in cmd to start the spider. Part of the data stored in the database looks like this: