Python crawler: scraping 传智播客 (itcast) teacher information with Scrapy

Recommended: XPath Helper, a Chrome extension for working with XPath.
It makes it easy to get the XPath of any HTML element.
Open/close its console: Ctrl-Shift-X

Reference: 介绍一款chrome爬虫网页解析工具-XPath Helper (an article introducing the XPath Helper Chrome extension)
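For example, on the teacher page scraped below, XPath Helper can be used to test expressions like these (assuming each teacher entry is a div with class li_txt, which is what the spider code later in this post relies on):

//div[@class="li_txt"]/h3/text()   # teacher name
//div[@class="li_txt"]/h4/text()   # job title
//div[@class="li_txt"]/p/text()    # description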

Scrapy basics

# Create a project: scrapy startproject myspider
# Create a spider:  scrapy genspider itcast itcast.cn
# List spiders:     scrapy list
# Run a spider:     scrapy crawl itcast

# 4 export formats: json, jsonl, csv, xml (output defaults to Unicode-escaped text)
# Export as JSON: scrapy crawl itcast -o data.json

# Start an interactive shell: scrapy shell <url>
# response.headers
# response.body
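A minimal shell session against the teacher page, for illustration:

scrapy shell "http://www.itcast.cn/channel/teacher.shtml"
>>> response.status        # HTTP status code, e.g. 200
>>> response.headers       # the response headers
>>> response.body[:200]    # first 200 bytes of the raw HTML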

# Selector methods always return list-like results
# response.xpath() returns a list of selectors
# response.css()   same, using CSS selectors
# extract() converts the selector objects to Unicode strings
# re() applies a regular expression and returns the matches
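Putting the selector methods together in the shell (a sketch; the li_txt class name is taken from the spider code below):

>>> teachers = response.css(".li_txt")               # SelectorList, one entry per teacher
>>> teachers.xpath("./h3/text()")                    # still a list of selector objects
>>> names = teachers.xpath("./h3/text()").extract()  # a list of Unicode strings
>>> response.xpath("//title/text()").re(r"\w+")      # re() returns the matched strings directly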

Complete code

To illustrate the basic usage of the spider, item, and pipeline classes, the code below is deliberately verbose.
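One note before the code: the import paths below (myspider.items.itcast_item, myspider.pipelines.itcast_pipeline) assume that items and pipelines are packages rather than the single items.py and pipelines.py files that scrapy startproject generates. A sketch of the assumed layout (the __init__.py files are required):

myspider/
    scrapy.cfg
    myspider/
        __init__.py
        settings.py
        items/
            __init__.py
            itcast_item.py
        pipelines/
            __init__.py
            itcast_pipeline.py
        spiders/
            __init__.py
            itcast_spider.py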

# itcast_spider.py

# -*- coding: utf-8 -*-

import scrapy
from myspider.items.itcast_item import ItcastItem

# Work around Python 2's default ASCII encoding
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class ItcastSpider(scrapy.Spider):
    # Spider name: required and must be unique
    name = "itcast"
    # Restrict the crawl to these domains (optional)
    allowed_domains = ["itcast.cn"]

    # Register the pipeline that processes the items
    custom_settings = {
        "ITEM_PIPELINES":{
            "myspider.pipelines.itcast_pipeline.ItcastPipeline":1,
        }
    }

    # The initial batch of URLs to crawl
    start_urls = [
        "http://www.itcast.cn/channel/teacher.shtml"
    ]

    def parse(self, response):
        # Parse each teacher entry in the list
        for li_txt in response.css(".li_txt"):
            name = li_txt.xpath("./h3/text()").extract()[0]
            title = li_txt.xpath("./h4/text()").extract()[0]
            info = li_txt.xpath("./p/text()").extract()[0]

            # Put the data into an item and hand it to the pipeline
            item = ItcastItem()
            item["name"] = name
            item["title"] = title
            item["info"] = info

            yield item

# itcast_item.py

# -*- coding: utf-8 -*-

import scrapy

class ItcastItem(scrapy.Item):
    name = scrapy.Field()   # name
    title = scrapy.Field()  # job title
    info = scrapy.Field()   # description

# itcast_pipeline.py

# -*- coding: utf-8 -*-

import json

class ItcastPipeline(object):
    # The pipeline class is instantiated only once
    def __init__(self):
        print "@@@@@@ spider initialized"
        self.f = open("itcast.json", "w")
        self.count = 0  # item counter

    def process_item(self, item, spider):
        # The one method every pipeline must implement
        dct = json.dumps(dict(item), ensure_ascii=False)
        self.f.write(dct.encode("utf-8") + "\n")
        self.count += 1
        return item  # must return the item so later pipelines can process it

    def open_spider(self, spider):
        print "@@@@@@ spider opened"

    def close_spider(self, spider):
        self.f.close()
        print "@@@@@@ spider closed"
        print "items scraped: %s" % self.count
