Create a project
scrapy startproject <project_name>
Create a spider
scrapy genspider <spider_name> <allowed_domain>
For example: scrapy genspider coursespider www.xxx.cn
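These two commands generate a project layout roughly like the following (a sketch; exact files can vary slightly between Scrapy versions, and the names follow the example above):

coursespider/                 # created by startproject
    scrapy.cfg                # project configuration file
    coursespider/
        __init__.py
        items.py              # Item definitions (shown further below)
        middlewares.py
        pipelines.py          # Item pipelines (shown further below)
        settings.py
        spiders/
            __init__.py
            <spider_name>.py  # one file per spider created by genspider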
The spider
(1) start_urls is a list; edit it directly to the URL(s) you want to crawl (a minimal spider skeleton is sketched below).
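For reference, a minimal spider skeleton; the class name and URL are placeholders and the spider name follows the genspider example above:

import scrapy

class CourseSpider(scrapy.Spider):
    name = "coursespider"               # the name used with `scrapy crawl`
    allowed_domains = ["www.xxx.cn"]    # domains the spider is allowed to follow
    start_urls = ["https://www.xxx.cn/"]  # edit this list to the pages you want to crawl

    def parse(self, response):
        pass  # default callback for the responses of start_urls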
(2) Setting request headers helps get past anti-scraping checks.
from fake_useragent import UserAgent

ua = UserAgent()
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "User-Agent": ua.random,  # fake_useragent picks a random browser UA
}
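Defining this dict by itself does not apply it to any request. One way to attach it, sketched below under the assumption that fake_useragent is installed and the dict is built as above, is to override start_requests inside the spider class and pass headers= to each Request; static headers could also go into DEFAULT_REQUEST_HEADERS in settings.py.

    def start_requests(self):
        ua = UserAgent()
        headers = {
            "Accept": "*/*",
            "User-Agent": ua.random,  # a fresh random User-Agent for this run
        }
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)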
(3) The parse function
When a request does not specify a callback, parse is the default callback of the Request object; it handles the response returned by the page and yields Item or further Request objects.
How it works:
1. Because parse uses yield rather than return, it behaves as a generator. Scrapy iterates over whatever it yields and checks the type of each result.
2. If the result is a Request, it is added to the crawl queue; if it is an Item, it is handed to the pipelines; any other type raises an error.
3. When Scrapy pulls a Request out of the generator, it does not send it immediately; it puts the Request into the queue and keeps consuming the generator.
4. After the Requests have been queued, the Items produced by the generator are taken and passed to the corresponding pipeline.
5. parse is attached to a Request as its callback, e.g. scrapy.Request(url, callback=self.parse), so parse() handles those responses.
6. Requests go through the scheduler and produce scrapy.http.Response objects, which are fed back into parse(), until the scheduler has no Requests left (a recursive pattern).
7. Once everything has been consumed, parse() finishes and the engine carries out the remaining work in the queue and in the pipelines.
8. Before extracting the items of each page, the program first finishes all pending requests in the request queue, and only then extracts the items.
import scrapy
from copy import deepcopy

def parse(self, response):
    # Every <a> under the div.row list
    a_data = response.xpath("//div[@class='row']/ul//a")
    item = dict()
    for a in a_data:
        item["a1_text"] = a.xpath("./text()").extract_first()
        a1_href = a.xpath("./@href")
        if a1_href and not a1_href.extract_first().startswith("#"):
            item["a1_href"] = self.start_urls[0] + a1_href.extract_first()
            yield scrapy.Request(
                item["a1_href"],
                callback=self.parse_detail_course,
                # deepcopy so later loop iterations do not overwrite the meta payload
                meta={"item": deepcopy(item)}
            )
We issue a request by yielding it and attach a callback through the callback parameter; once the request completes, the response is passed to that callback. Scrapy decides what to do based on the type of the yielded object: if it is a scrapy.Request, Scrapy fetches the link it points to and calls the object's callback when the request finishes; if it is a scrapy.Item, Scrapy hands the object to pipelines.py for further processing.
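A condensed sketch of this dispatch in a single callback (the XPaths here are hypothetical; the real spider splits the work across parse and parse_detail_course):

def parse(self, response):
    # Yielding an Item: it is handed to the pipelines
    item = CoursespiderItem()
    item["course_title"] = response.xpath("//h1/text()").extract_first()
    yield item
    # Yielding a Request: it goes back to the scheduler, and its response is passed to the callback
    next_page = response.xpath("//a[@rel='next']/@href").extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)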
The callback function
from coursespider.items import CoursespiderItem

def parse_detail_course(self, response):
    item = response.meta["item"]  # the dict passed via meta from parse()
    detail_course = response.xpath("//table[@bgcolor='#ffcccc']")
    for course in detail_course:
        course_title = course.xpath(".//td//a[1]/text()").extract_first()
        i = 1
        while True:
            # Each following sibling table holds one section row of this course
            course_list = course.xpath("./following-sibling::table[{}]//pre/a/text()".format(i)).extract()
            if not course_list:
                break
            choose_number = course_list[0]
            academic_building = course_list[1] if len(course_list) > 1 else None
            data = course.xpath("./following-sibling::table[{}]//pre/text()".format(i)).extract()
            data1 = [x.strip() for x in data[1].split(' ') if x.strip() != '']
            section, week, classing_time = data1[0] + ' ' + data1[1], data1[2], data1[3]
            if len(data) > 2:
                data2 = [x.strip() for x in data[2].split(' ') if x.strip() != '']
                classroom, professor_name = data2[0], data2[1]
            else:
                classroom, professor_name = None, None
            i += 1
            item = CoursespiderItem()  # create the item object for this section row
            item['course_title'] = course_title
            item['choose_number'] = choose_number
            item['section'] = section
            item['week'] = week
            item['classing_time'] = classing_time
            item['academic_building'] = academic_building
            item['classroom'] = classroom
            item['professor_name'] = professor_name
            yield item
items.py: field definitions for the scraped data.
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class CoursespiderItem(scrapy.Item):
    # define the fields for your item here like:
    course_title = scrapy.Field()
    choose_number = scrapy.Field()
    section = scrapy.Field()
    week = scrapy.Field()
    classing_time = scrapy.Field()
    academic_building = scrapy.Field()
    classroom = scrapy.Field()
    professor_name = scrapy.Field()


class DetailCoursespiderItem(scrapy.Item):
    # define the fields for your item here like:
    tag = scrapy.Field()
    course_name = scrapy.Field()
    course_introduction = scrapy.Field()


class MajorsspiderItem(scrapy.Item):
    # define the fields for your item here like:
    school_name = scrapy.Field()
    professional_categories = scrapy.Field()
    professional_name = scrapy.Field()
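scrapy.Item supports dict-style access (item['field'], keys(), dict(item)), which is why the pipeline below can unpack an item straight into a SQLAlchemy model with Model(**item). A quick illustration, with made-up values:

item = CoursespiderItem()
item["course_title"] = "Example Course"
item["choose_number"] = "12345"
print(dict(item))        # {'course_title': 'Example Course', 'choose_number': '12345'}
print(item.get("week"))  # None: fields that were never set are simply absent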
pipelines.py: store the scraped items into MySQL via SQLAlchemy.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from coursespider.items import CoursespiderItem, DetailCoursespiderItem, MajorsspiderItem
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from coursespider.db import Base, Model, DetailCourseModel, MajorsModel
# class CoursespiderPipeline(object):
# def process_item(self, item, spider):
# return item
class MysqlPipeline(object):
    def __init__(self):
        # Establish the database connection
        self.conn = create_engine("mysql+pymysql://root:[email protected]/spider_course?charset=utf8",
                                  pool_size=100, max_overflow=20)
        Base.metadata.create_all(bind=self.conn)  # create the tables if they do not exist yet

    def _get_session(self):
        # Create a new session bound to the engine
        Session = sessionmaker(bind=self.conn)
        return Session()

    def process_item(self, item, spider):
        session = self._get_session()  # get a session
        if isinstance(item, CoursespiderItem):
            if item['week'] == 'to' and item['classing_time'] == 'be':
                item['week'], item['classing_time'] = 'to be arranged', None
            # Discard placeholder values such as 'Open'/'Closed' or leading digits
            if item['professor_name'] in ('Open', 'Closed') or \
                    (item['professor_name'] and item['professor_name'][:1].isdigit()):
                item['professor_name'] = None
            if not session.query(Model).filter_by(course_title=item['course_title'],
                                                  choose_number=item['choose_number']).all():
                obj = Model(**item)
                session.add(obj)
                session.commit()
                session.close()
                return item
            else:
                session.close()
        if isinstance(item, DetailCoursespiderItem):
            if not session.query(DetailCourseModel).filter_by(course_name=item['course_name']).all():
                obj = DetailCourseModel(**item)
                session.add(obj)
                session.commit()
                session.close()
                return item
            else:
                session.close()
        if isinstance(item, MajorsspiderItem):
            obj = MajorsModel(**item)
            session.add(obj)
            session.commit()
            session.close()
            return item
        return item  # always return the item so later pipelines still receive it
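The db module imported above (Base, Model, DetailCourseModel, MajorsModel) is not shown in this post. A minimal sketch of what it might contain, with column names matching the item fields; the table names, column types and lengths are assumptions:

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Model(Base):
    __tablename__ = "course"  # table name is an assumption
    id = Column(Integer, primary_key=True, autoincrement=True)
    course_title = Column(String(255))
    choose_number = Column(String(64))
    section = Column(String(64))
    week = Column(String(64))
    classing_time = Column(String(64))
    academic_building = Column(String(255))
    classroom = Column(String(255))
    professor_name = Column(String(255))

class DetailCourseModel(Base):
    __tablename__ = "detail_course"
    id = Column(Integer, primary_key=True, autoincrement=True)
    tag = Column(String(64))
    course_name = Column(String(255))
    course_introduction = Column(Text)

class MajorsModel(Base):
    __tablename__ = "majors"
    id = Column(Integer, primary_key=True, autoincrement=True)
    school_name = Column(String(255))
    professional_categories = Column(String(255))
    professional_name = Column(String(255))

Also remember to register the pipeline in settings.py, otherwise process_item is never called (the priority value 300 is arbitrary):

ITEM_PIPELINES = {
    "coursespider.pipelines.MysqlPipeline": 300,
}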
Run script (save it as e.g. main.py next to scrapy.cfg and run it directly):
from scrapy.cmdline import execute
import sys
import os

# Append the project directory so execute() works from any working directory
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# execute(["scrapy", "crawl", "washington"])
# execute(["scrapy", "crawl", "detailcourse"])
execute(["scrapy", "crawl", "majors"])