使用python3+scrapy爬虫,并将结果保存到MYSQL数据库中(附代码)

使用python3+scrapy爬虫,并将结果保存到MYSQL数据库中(附代码)

    • python+scrapy的安装
    • 第一步:创建爬虫项目
    • 第二步:修改item.py
    • 第三步:写爬虫文件(spiders文件夹内创建一个空的py文件,修改成自己想要的名称),我的文件名为hospital_spider.py
    • 第四步:Pipeline.py文件
    • 第五步:setting文件

python+scrapy的安装

安装教程网上有很多,这里不再赘述。下面的内容假定已经安装好python和scrapy;此外还需要提前安装MYSQL驱动包PyMySQL:
进入cmd终端下执行:

pip install PyMySQL

第一步:创建爬虫项目

在cmd终端下输入:

scrapy startproject Hospital

结果如下:
使用python3+scrapy爬虫,并将结果保存到MYSQL数据库中(附代码)_第1张图片

第二步:修改items.py

class HospitalItem(scrapy.Item):
    """Container for the data scraped from one hospital detail page.

    Every attribute is a plain ``scrapy.Field()``; the spider fills them with
    text and the pipeline writes them to one MySQL row.
    """

    # Basic information shown in the page header.
    name = scrapy.Field()  # hospital name
    organization = scrapy.Field()  # type of institution
    address = scrapy.Field()  # street address
    phone = scrapy.Field()  # telephone number

    # Visitor-guide sections of the detail page.
    guahao = scrapy.Field()  # registration
    inspect = scrapy.Field()  # medical examination
    check = scrapy.Field()  # physical check-up
    transport = scrapy.Field()  # transport
    yinshi = scrapy.Field()  # food
    zhuxiu = scrapy.Field()  # accommodation
    zhuyuan = scrapy.Field()  # hospitalisation
    carparking = scrapy.Field()  # parking
    charge = scrapy.Field()  # fees
    notice = scrapy.Field()  # admission requirements
    drug = scrapy.Field()  # collecting medicine

第三步:写爬虫文件(spiders文件夹内创建一个空的py文件,修改成自己想要的名称),我的文件名为hospital_spider.py

使用python3+scrapy爬虫,并将结果保存到MYSQL数据库中(附代码)_第2张图片
创建后打开进入写爬虫环节
首先,导入相关的库:我最终将爬取的数据保存在了mysql数据库中,故需要导入MySQL驱动模块。python3所用的与python2有所不同:python2为import MySQLdb,python3为import pymysql

# -*- coding: utf-8 -*-
import scrapy
from Hospital.items import HospitalItem
from scrapy.conf import settings 						# scrapy的设置函数
from datetime import datetime							# 日期模块
import pymysql as mdb 										#数据库模块
其次,是主要代码块
class YanglaoSpider(scrapy.Spider):
    """Crawl the hospital listing on 360jk.com and yield one item per hospital.

    ``parse`` walks the paginated listing, ``parse_page`` scrapes a single
    hospital detail page into a ``HospitalItem``.  Creating the spider also
    (re)creates today's ``hospitals_YYYYMMDD`` table in MySQL.
    """

    name = "hospital"  # spider name
    allowed_domains = ["360jk.com"]  # only URLs under this domain are followed
    start_urls = (
        'http://www.360jk.com/jibing/gxy/yiyuan?&level_id=0',
    )

    # Every field of the item; all of them are initialised to '' so that the
    # pipeline can read any of them without a KeyError.
    ITEM_FIELDS = (
        'name', 'organization', 'address', 'phone', 'guahao', 'inspect',
        'check', 'transport', 'yinshi', 'zhuxiu', 'zhuyuan', 'carparking',
        'charge', 'notice', 'drug',
    )

    # Maps the ``id`` attribute of a guide section on the detail page to the
    # item field(s) that receive its text.
    SECTION_FIELDS = {
        'guahao': ('guahao',),
        'jiancha': ('inspect',),
        'tijian': ('check',),
        'jiaotong': ('transport',),
        'tingche': ('carparking',),
        'yinshi': ('yinshi',),
        'zhuxiu': ('zhuxiu',),
        # NOTE(review): the original dict literal listed 'zhuyuan' twice
        # (-> 'zhuyuan' and -> 'notice'), so the second entry silently
        # replaced the first.  Both fields are filled here; confirm which
        # section id really belongs to 'notice'.
        'zhuyuan': ('zhuyuan', 'notice'),
        'jiaofei': ('charge',),
        'quyao': ('drug',),
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and recreate today's result table.

        Any positional/keyword arguments are forwarded to ``scrapy.Spider``.
        The table ``hospitals_<YYYYMMDD>`` is dropped if it exists and created
        again, so running the spider twice on one day starts from scratch.
        """
        super().__init__(*args, **kwargs)

        # Imported here so the block does not need a new file-level import.
        # The crawler is attached only after __init__ returns, hence the
        # project settings are loaded explicitly.
        from scrapy.utils.project import get_project_settings
        settings = get_project_settings()

        today = datetime.now().strftime('%Y%m%d')  # e.g. 20160101
        con = mdb.connect(
            host=settings['MYSQL_HOST'],
            port=int(settings.get('MYSQL_PORT') or 3306),
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASS'],
            db=settings['MYSQL_DB'],
            charset='utf8',
        )
        try:
            with con.cursor() as cur:
                # Drop today's table if it is already there ...
                cur.execute("""DROP TABLE IF EXISTS `hospitals_%s`;""" % today)
                # ... and create it again, empty.
                cur.execute("""
		CREATE TABLE `hospitals_%s` (
		    `id` int(11) NOT NULL AUTO_INCREMENT,
		    `名称` varchar(15) DEFAULT NULL,
		    `机构类型` varchar(20) DEFAULT NULL,
		    `地址` varchar(50) DEFAULT NULL,
		    `phone` varchar(30) DEFAULT NULL,
		    `挂号` varchar(1000) DEFAULT NULL,
		    `检查` varchar(1000) DEFAULT NULL,
		    `体检` varchar(1000) DEFAULT NULL,
		    `交通` varchar(100) DEFAULT NULL,
		    `饮食` varchar(1000) DEFAULT NULL,
		    `住宿` varchar(1000) DEFAULT NULL,
		    `住院` varchar(1000) DEFAULT NULL,
		    `停车` varchar(1000) DEFAULT NULL,
		    `收费` varchar(1000) DEFAULT NULL,
		    `入住须知` varchar(2000) DEFAULT NULL,
		    `取药` varchar(1000) DEFAULT NULL,
		    PRIMARY KEY (`id`)
		) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET='utf8';
		""" % today)
        finally:
            # Always release the connection, even when the DDL fails.
            con.close()

    @staticmethod
    def _joined_text(selection):
        """Return all extracted strings of *selection*, stripped and joined."""
        return ''.join(part.strip() for part in selection.extract())

    def parse(self, response):
        """Handle one listing page.

        Yields a request for every hospital detail page (handled by
        ``parse_page``) and then one request for the next listing page
        (handled by ``parse`` again).
        """
        detail_links = response.css('body > div.main > div > div.clearfix > div.main_part > div:nth-child(1) > div.box_body > ul > li > div.info_area > figure > div > a').xpath('@href').extract()
        for url in detail_links:
            yield scrapy.Request(response.urljoin(url), callback=self.parse_page)

        pager_links = response.css('body > div.main > div > div.clearfix > div.main_part > div:nth-child(1) > div.box_body > div.paginations > div > b.page_num > a').xpath('@href').extract()
        if not pager_links:
            # No pagination links on this page: nothing more to follow.
            return
        # The "next page" link is the second-to-last pager link; with a single
        # link there is no second-to-last one, so that link is used.
        next_url = pager_links[-2] if len(pager_links) > 1 else pager_links[0]
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    def parse_page(self, response):
        """Scrape one hospital detail page and return the filled item."""
        item = HospitalItem()
        for field in self.ITEM_FIELDS:
            item[field] = ''  # default every field to an empty string

        item['name'] = self._joined_text(response.css('#container_max > div.main_max > div:nth-child(2) > div.head_hospital_name > span.title::text'))
        item['organization'] = self._joined_text(response.css('#container_max > div.main_max > div:nth-child(2) > div.head_hospital_name > span.tag::text'))
        item['address'] = self._joined_text(response.css('#container_max > div.main_max > div:nth-child(2) > div.hospital_box > dl > dd:nth-child(4)::text'))
        item['phone'] = self._joined_text(response.css('#container_max > div.main_max > div:nth-child(2) > div.hospital_box > dl > dd:nth-child(6)::text'))

        for section in response.css('#container_max > div.main_max > div.article.bottom_20>div.hospital_jygl.fold_box'):
            # Join the section text and split it at the full-width colon.
            parts = self._joined_text(section.css('div.jygl_content.fold_body> p > span::text')).split(u'\uff1a')
            # Exactly two parts means "label：value" -> keep only the value;
            # otherwise keep the whole text.
            text = parts[1] if len(parts) == 2 else ''.join(parts)

            # The id is compared as text; sections with an unknown id are skipped.
            section_id = section.xpath('@id').extract_first(default='')
            for field in self.SECTION_FIELDS.get(section_id, ()):
                item[field] = text

        return item

第四步:pipelines.py文件

from twisted.enterprise import adbapi
import re, time
import pymysql.cursors
from scrapy import log
from scrapy.conf import settings
from datetime import datetime
from hashlib import md5
# from scrapy import log
from scrapy.exceptions import DropItem


class HospitalPipeline(object):
    """Item pipeline that stores every scraped hospital in MySQL.

    Rows are inserted into the table ``hospitals_<YYYYMMDD>`` through a
    Twisted ``adbapi`` connection pool, so the insert runs in a pool thread.
    """

    # Item fields, in the same order as the columns of the INSERT statement.
    FIELDS = (
        'name', 'organization', 'address', 'phone', 'guahao', 'inspect',
        'check', 'transport', 'yinshi', 'zhuxiu', 'zhuyuan', 'carparking',
        'charge', 'notice', 'drug',
    )

    def __init__(self, dbpool):
        """Keep the connection pool and compute today's table suffix.

        :param dbpool: object offering ``runInteraction`` (an adbapi pool).
        """
        self.dbpool = dbpool
        self.add_date = datetime.now().strftime('%Y%m%d')  # e.g. 20160101

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the MYSQL_* values in the project settings.

        :param settings: settings object read with ``settings[...]`` / ``.get``.
        :return: a ``HospitalPipeline`` bound to a new connection pool.
        """
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            port=int(settings.get('MYSQL_PORT') or 3306),
            db=settings['MYSQL_DB'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASS'],
            charset='utf8',  # store text as UTF-8
            use_unicode=True,
        )
        # A pool (not a single connection) is required because process_item
        # relies on runInteraction; the pipeline instance itself is returned.
        dbpool = adbapi.ConnectionPool('pymysql', **dbargs)
        return cls(dbpool)

    def close_spider(self, spider):
        """Close the connection pool when the spider finishes."""
        self.dbpool.close()

    def process_item(self, item, spider):
        """Schedule the insert of *item* and hand the item on.

        Items without a name are passed through untouched.  For all others a
        Deferred is returned that fires with the item once the insert has
        finished (successfully or not), so the engine waits for the database.
        """
        if not item.get('name'):
            return item
        d = self.dbpool.runInteraction(self._do_upsert, item, spider)
        # Whatever the outcome, the next pipeline stage receives the item.
        d.addBoth(lambda _: item)
        return d

    def _build_insert(self, item):
        """Return ``(query, data)`` for inserting *item* into today's table."""
        query = "insert IGNORE into `hospitals_"+self.add_date+"` (名称,机构类型,地址,phone,挂号,检查,体检,交通,饮食,住宿,住院,停车,收费,入住须知,取药) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        data = tuple(item[field] for field in self.FIELDS)
        return query, data

    def _do_upsert(self, conn, item, spider):
        """Run the INSERT for *item*; called inside a pool thread.

        The statement is built from the item passed in rather than from
        attributes on ``self``, because several items may be in flight at the
        same time.  Failures are logged and do not stop the crawl.
        """
        query, data = self._build_insert(item)
        try:
            conn.execute(query, data)
        except Exception as exc:
            spider.logger.error('MySQL insert failed for %r: %s', item.get('name'), exc)

第五步:settings.py文件

这里主要是添加本地MYSQL数据库信息

BOT_NAME = 'Hospital'

SPIDER_MODULES = ['Hospital.spiders']
NEWSPIDER_MODULE = 'Hospital.spiders'

# Cookies are left enabled with this value; set it to False to turn them off.
COOKIES_ENABLED = True

# Local MySQL connection details.
MYSQL_HOST = '127.0.0.1'  # server host
MYSQL_PORT = '3306'  # server port
MYSQL_USER = 'root'  # user name
MYSQL_PASS = '******'  # password
MYSQL_DB = 'gaoxueya'  # database name

# Enable the pipeline that stores items in MySQL.
ITEM_PIPELINES = {
    'Hospital.pipelines.HospitalPipeline': 100,
}

最终入库结果如下:
使用python3+scrapy爬虫,并将结果保存到MYSQL数据库中(附代码)_第3张图片

你可能感兴趣的:(python爬虫)