爬虫练手:使用scrapy爬取某在线学院课程信息,并保存到文件

python版本:3.5
爬取目标网址:
https://www.hellobi.com/

源代码

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TianshanItem(scrapy.Item):
    """One course record scraped from edu.hellobi.com (see lessons.py)."""
    # Course title — text of the course page's <h1>.
    name = scrapy.Field()
    # Course link — href of the active nav tab on the course page.
    link = scrapy.Field()
    # Enrolled-student count, kept as the raw text scraped from the page.
    stu = scrapy.Field()

lessons.py

# -*- coding: utf-8 -*-
import scrapy
from tianshan.items import TianshanItem

class LessionSpider(scrapy.Spider):
    """Scrape course name, link and student count from edu.hellobi.com.

    Course detail pages live at /course/<id>; ids 1..141 are enumerated
    up front in ``start_urls``.
    """
    name = "lesson"
    allowed_domains = ["hellobi.com"]
    # Enumerate every course page here instead of re-yielding the whole
    # 1..141 range from each parse() call: the original scheduled 141
    # duplicate requests per parsed page and relied on Scrapy's dupefilter
    # to throw them away.
    start_urls = ["https://edu.hellobi.com/course/" + str(i) for i in range(1, 142)]

    def parse(self, response):
        """Extract one TianshanItem from a course detail page.

        Uses extract_first() so a page that lacks an element (e.g. a
        deleted course id) yields None instead of raising IndexError and
        aborting the callback, as ``extract()[0]`` did.
        """
        item = TianshanItem()
        item["name"] = response.xpath("//div[@class='course-info']/h1/text()").extract_first()
        item["link"] = response.xpath("//ul[@class='nav nav-tabs' and @role='tablist']/li[@class='active']/a/@href").extract_first()
        item["stu"] = response.xpath("//span[@class='course-view']/text()").extract_first()
        # Skip pages where no course title was found (invalid/removed ids).
        if item["name"] is not None:
            yield item

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TianshanPipeline(object):
    """Write scraped course items to a plain-text file, one record per block."""

    def __init__(self):
        # Hard-coded output path from the original article; adjust to taste.
        path = "E:\\test\\tianshan\\lessons.txt"
        # Explicit utf-8 so Chinese course names are written correctly
        # regardless of the platform's default encoding (the original
        # relied on the locale codec and broke on Windows/GBK).
        self.fh = open(path, "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Echo the item to stdout and append it to the output file.

        Returns the item unchanged so later pipeline stages still see it.
        """
        print(item["name"])
        print(item["link"])
        print(item["stu"])
        print("")
        # Trailing newline after the separator keeps records on separate
        # lines; the original ran "***...***" into the next record's name.
        self.fh.write(item["name"] + "\n" + item["link"] + "\n" + item["stu"] + "\n" + "******************" + "\n")
        return item

    def close_spider(self, spider):
        # Scrapy invokes this hook as close_spider(spider); the original
        # signature without the argument raised TypeError at shutdown and
        # left the file handle unclosed.
        self.fh.close()

你可能感兴趣的:(爬虫练手:使用scrapy爬取某在线学院课程信息,并保存到文件)