There are plenty of tutorials for that if you search online, so they are not repeated here. Everything below assumes Python and Scrapy are already installed. The MySQL driver also needs to be installed in advance; open a cmd terminal and run:
pip install PyMySQL
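If the install succeeded, importing the module from a Python one-liner should finish without any error message:
python -c "import pymysql"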
Then, still in the cmd terminal, create the project:
scrapy startproject Hospital
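Running startproject generates the usual Scrapy skeleton, roughly the following (the exact files vary a little between Scrapy versions); the item class below goes into Hospital/items.py:

Hospital/
    scrapy.cfg
    Hospital/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py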
import scrapy

class HospitalItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()           # hospital name
    organization = scrapy.Field()   # type of institution
    address = scrapy.Field()        # address
    phone = scrapy.Field()          # phone number
    # (an introduction/简介 field was planned here but left out)
    guahao = scrapy.Field()         # registration
    inspect = scrapy.Field()        # examinations
    check = scrapy.Field()          # physical check-ups
    transport = scrapy.Field()      # transport
    yinshi = scrapy.Field()         # food / dining
    zhuxiu = scrapy.Field()         # accommodation
    zhuyuan = scrapy.Field()        # hospitalization
    carparking = scrapy.Field()     # parking
    charge = scrapy.Field()         # fees
    notice = scrapy.Field()         # admission requirements
    drug = scrapy.Field()           # collecting medicine
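Scrapy items behave like dictionaries, which is how the spider below fills them in. A minimal usage sketch (the values here are made up for illustration):

from Hospital.items import HospitalItem

item = HospitalItem()
item['name'] = '某某医院'          # fields are assigned with dict-style access
item['phone'] = '010-00000000'    # made-up example value
print(dict(item))                 # an item converts back to a plain dict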
With the project and the item defined, the next step is writing the spider itself (under Hospital/spiders/).
First, import the relevant libraries. The scraped data is ultimately saved into a MySQL database, so a MySQL driver has to be imported. The module differs between Python versions: Python 2 uses MySQLdb, while Python 3 uses import pymysql (a small compatibility sketch follows the import block below).
# -*- coding: utf-8 -*-
import scrapy
from Hospital.items import HospitalItem
from scrapy.conf import settings   # Scrapy settings (scrapy.conf is the import path used by older Scrapy versions)
from datetime import datetime      # date handling
import pymysql as mdb              # MySQL driver
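If the same file also has to run under Python 2, the driver import can be made version-agnostic. This shim is optional and not part of the original code:

# optional: fall back to MySQLdb when PyMySQL is not available (Python 2 setups)
try:
    import pymysql as mdb      # Python 3: PyMySQL
except ImportError:
    import MySQLdb as mdb      # Python 2: MySQL-python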
Next comes the main body of the spider:
class YanglaoSpider(scrapy.Spider):        # class name apparently kept from an earlier nursing-home (yanglao) spider
    name = "hospital"                      # spider name used by "scrapy crawl"
    allowed_domains = ["360jk.com"]        # allowed domains: the spider only follows URLs under this domain
    start_urls = (
        'http://www.360jk.com/jibing/gxy/yiyuan?&level_id=0',
    )

    def __init__(self):
        # ========== initialisation: (re)create today's result table
        now = datetime.now()   # current date
        today = str(now.year) + str(now.month).zfill(2) + str(now.day).zfill(2)  # date as a string, e.g. 20160101
        con = mdb.connect(host=settings['MYSQL_HOST'], user=settings['MYSQL_USER'],
                          passwd=settings['MYSQL_PASS'], db=settings['MYSQL_DB'], charset='utf8')
        cur = con.cursor()     # open the connection and get a cursor
        cur.execute("""DROP TABLE IF EXISTS `hospitals_%s`;""" % today)  # drop today's table (e.g. hospitals_20160101) if it already exists
        cur.execute("""
        CREATE TABLE `hospitals_%s` (
          `id` int(11) NOT NULL AUTO_INCREMENT,
          `名称` varchar(15) DEFAULT NULL,
          `机构类型` varchar(20) DEFAULT NULL,
          `地址` varchar(50) DEFAULT NULL,
          `phone` varchar(30) DEFAULT NULL,
          `挂号` varchar(1000) DEFAULT NULL,
          `检查` varchar(1000) DEFAULT NULL,
          `体检` varchar(1000) DEFAULT NULL,
          `交通` varchar(100) DEFAULT NULL,
          `饮食` varchar(1000) DEFAULT NULL,
          `住宿` varchar(1000) DEFAULT NULL,
          `住院` varchar(1000) DEFAULT NULL,
          `停车` varchar(1000) DEFAULT NULL,
          `收费` varchar(1000) DEFAULT NULL,
          `入住须知` varchar(2000) DEFAULT NULL,
          `取药` varchar(1000) DEFAULT NULL,
          PRIMARY KEY (`id`)
        ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
        """ % today)           # create today's table, e.g. hospitals_20160101
        # a `简介` varchar(6000) introduction column was commented out of the schema
        con.close()
    def parse(self, response):
        # ========== collect every hospital link on the listing page and send each one
        # to parse_page as a new Request
        for url in response.css('body > div.main > div > div.clearfix > div.main_part > div:nth-child(1) > div.box_body > ul > li > div.info_area > figure > div > a').xpath('@href').extract():
            yield scrapy.Request(response.urljoin(url), callback=self.parse_page)
            # break
        # ========== grab the "next page" link at the bottom and feed it back into parse
        # to keep paginating
        next_url = response.css('body > div.main > div > div.clearfix > div.main_part > div:nth-child(1) > div.box_body > div.paginations > div > b.page_num > a').xpath('@href').extract()  # the "next page" link is the second-to-last one extracted, hence [-2]
        next_url = next_url[-2] if len(next_url) > 1 else next_url[0]
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
    def parse_page(self, response):
        # ========== parse one hospital detail page into a HospitalItem
        item = HospitalItem()   # build the item (an 'intro' field from an earlier version is no longer filled)
        # pre-fill every field with an empty string so that missing sections cannot break the pipeline later
        for field in ('name', 'organization', 'address', 'phone', 'guahao', 'inspect', 'check',
                      'transport', 'yinshi', 'zhuxiu', 'zhuyuan', 'carparking', 'charge', 'notice', 'drug'):
            item[field] = ''
        # ========== mapping from the section id used on the page to the item field name
        dic = {
            'guahao': 'guahao',
            'jiancha': 'inspect',
            'tijian': 'check',
            'jiaotong': 'transport',
            'tingche': 'carparking',
            'yinshi': 'yinshi',
            'zhuxiu': 'zhuxiu',
            'zhuyuan': 'zhuyuan',
            'jiaofei': 'charge',
            # NOTE: the original dict listed 'zhuyuan' a second time, mapped to 'notice'; that
            # duplicate key silently overrode the entry above, so the real page id of the
            # admission-notice section still needs to be looked up and added here.
            'quyao': 'drug',
        }
        item['name'] = ''.join([i.strip() for i in response.css('#container_max > div.main_max > div:nth-child(2) > div.head_hospital_name > span.title::text').extract()])
        item['organization'] = ''.join([i.strip() for i in response.css('#container_max > div.main_max > div:nth-child(2) > div.head_hospital_name > span.tag::text').extract()])
        item['address'] = ''.join([i.strip() for i in response.css('#container_max > div.main_max > div:nth-child(2) > div.hospital_box > dl > dd:nth-child(4)::text').extract()])
        item['phone'] = ''.join([i.strip() for i in response.css('#container_max > div.main_max > div:nth-child(2) > div.hospital_box > dl > dd:nth-child(6)::text').extract()])
        for i in response.css('#container_max > div.main_max > div.article.bottom_20>div.hospital_jygl.fold_box'):
            text = ''.join([j.strip() for j in i.css('div.jygl_content.fold_body> p > span::text').extract()]).split(u'\uff1a')  # join the section text and split it at the full-width ":"
            text = text[1] if len(text) == 2 else ''.join(text)  # if the split gives two parts, keep the value part, e.g. "名称:养老院" -> "养老院"
            try:
                # the section's id attribute (e.g. "guahao") selects the item field to fill
                # (no .encode('utf-8') here: the dict keys are str, which also keeps this working on Python 3)
                key = dic[i.xpath('@id').extract_first(default='')]
                item[key] = text
            except Exception:
                pass   # sections that are missing or not listed in the mapping are simply skipped
        return item
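Before running the whole crawl it is worth checking the long CSS selectors interactively; this is not part of the original write-up, but Scrapy's shell makes it easy. From the project directory:

scrapy shell "http://www.360jk.com/jibing/gxy/yiyuan?&level_id=0"

and then paste the selector from parse() to see what it actually extracts:

response.css('body > div.main > div > div.clearfix > div.main_part > div:nth-child(1) > div.box_body > ul > li > div.info_area > figure > div > a').xpath('@href').extract()

The item returned by parse_page is then handed to the pipeline, which lives in Hospital/pipelines.py: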
from twisted.enterprise import adbapi   # Twisted's asynchronous database connection pool
import re, time
import pymysql.cursors
from scrapy import log
from scrapy.conf import settings
from datetime import datetime
from hashlib import md5
from scrapy.exceptions import DropItem
class HospitalPipeline(object):
    def __init__(self, dbpool):
        # ========== initialisation
        self.dbpool = dbpool   # Twisted connection pool used to run the inserts off the main thread
        now = datetime.now()   # today's date
        self.add_date = str(now.year) + str(now.month).zfill(2) + str(now.day).zfill(2)  # date as a string, e.g. 20160101

    @classmethod
    def from_settings(cls, settings):
        # ========== read the MySQL connection info from settings.py; force utf-8 so that
        # Chinese text can be inserted without encoding errors
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DB'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASS'],
            charset='utf8',
            use_unicode=True,
        )
        # build the adbapi connection pool (process_item calls dbpool.runInteraction, so a
        # plain pymysql connection is not enough) and return the pipeline instance
        dbpool = adbapi.ConnectionPool('pymysql', **dbargs)
        return cls(dbpool)
    def process_item(self, item, spider):
        # ========== handle one scraped item
        if item.get('name', '') != '':   # only store items that actually have a name
            self.query = "insert IGNORE into `hospitals_"+self.add_date+"` (名称,机构类型,地址,phone,挂号,检查,体检,交通,饮食,住宿,住院,停车,收费,入住须知,取药) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            self.data = (item['name'], item['organization'], item['address'], item['phone'], item['guahao'], item['inspect'], item['check'], item['transport'], item['yinshi'], item['zhuxiu'], item['zhuyuan'], item['carparking'], item['charge'], item['notice'], item['drug'])
            # run the db query in the thread pool
            d = self.dbpool.runInteraction(self._do_upsert, item, spider)
            # d.addErrback(self._handle_error, item, spider)
            # at the end return the item in case of success or failure
            d.addBoth(lambda _: item)
            # return the deferred instead of the item: this lets the engine process the next
            # item (according to the CONCURRENT_ITEMS setting) after this operation has finished
            return d
        return item   # items without a name are passed through unchanged
    def _do_upsert(self, conn, item, spider):
        """Perform an insert or update."""
        try:
            conn.execute(self.query, self.data)   # conn is the transaction cursor supplied by runInteraction
        except Exception as e:
            print('error========================================', e)
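DropItem is imported above but never used. A hedged alternative sketch (not what the original pipeline does) would raise it for records without a name, so that Scrapy logs and counts the dropped items instead of silently passing them through:

from scrapy.exceptions import DropItem

class HospitalPipeline(object):            # sketch: only the changed part is shown
    def process_item(self, item, spider):
        if not item.get('name'):
            raise DropItem('hospital record without a name: %r' % item)
        # ... the insert logic shown above would follow here ...
        return item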
The last piece is settings.py, where the local MySQL connection details are added:
BOT_NAME = 'Hospital'

SPIDER_MODULES = ['Hospital.spiders']
NEWSPIDER_MODULE = 'Hospital.spiders'

COOKIES_ENABLED = True    # set this to False to disable cookies and reduce the risk of being banned

MYSQL_HOST = '127.0.0.1'  # MySQL host
MYSQL_PORT = '3306'       # MySQL port
MYSQL_USER = 'root'       # MySQL user name
MYSQL_PASS = '******'     # MySQL password
MYSQL_DB = 'gaoxueya'     # MySQL database name

ITEM_PIPELINES = {
    'Hospital.pipelines.HospitalPipeline': 100,   # enable HospitalPipeline
}
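With the settings in place, the crawl is started from the project root using the spider name defined earlier ("hospital"):

scrapy crawl hospital

After the run, a short pymysql script can confirm that rows were stored. This is only a sketch that reuses the connection values from settings.py (replace the masked password) and today's table suffix:

import pymysql
from datetime import datetime

con = pymysql.connect(host='127.0.0.1', user='root', passwd='******',  # '******' is the placeholder from settings.py
                      db='gaoxueya', charset='utf8')
cur = con.cursor()
cur.execute("SELECT COUNT(*) FROM `hospitals_%s`" % datetime.now().strftime('%Y%m%d'))
print(cur.fetchone()[0])   # number of hospital records stored today
con.close()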