A few Scrapy crawler examples

I have recently been learning the Scrapy crawler framework. I put it into practice with two examples of my own and now have a basic grasp of it, so I am writing this post to record my learning process.

1. My environment

Mac + Python 2.7.6 + Scrapy 1.4.0. I won't cover the installation of Scrapy on a Mac here.

2. Scraping job postings from the Tsinghua University career information site

The Tsinghua University career information site is at:

http://career.tsinghua.edu.cn/. The section to scrape is the recruitment information board, at http://career.cic.tsinghua.edu.cn/xsglxt/f/jyxt/anony/xxfb

The first step is to analyze the page: every job posting on it has to be pulled out. In the HTML, the postings can be located through the path /div[@class='content teacher']/div[2]/ul[@class='list'], and the individual entries can then be extracted under it. A page holds a dozen or so entries, so a for loop over the matched nodes outputs them all. The listing spans several pages, however, so the spider also has to walk through the pages and extract each one in turn; which page is fetched is controlled by the pgno field of the POSTed form data.
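Before writing the spider, both the XPath and the pgno pagination parameter can be checked interactively in the Scrapy shell. A quick sketch (the exact nodes returned depend on the live page):

$ scrapy shell
>>> from scrapy import FormRequest
>>> url = "http://career.cic.tsinghua.edu.cn/xsglxt/f/jyxt/anony/xxfb"
>>> fetch(FormRequest(url, formdata={"pgno": "2"}))   # POST fetches page 2
>>> response.xpath("//li[@class='clearfix']/a/text()").extract()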

The Scrapy spider code is as follows:

# -*- coding: utf-8 -*-

import os
import json
import codecs
import datetime
import smtplib
import urlparse

import scrapy.spiders
from scrapy.selector import Selector
from scrapy.utils.response import get_base_url
from scrapy.mail import MailSender

from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
from email.utils import parseaddr, formataddr
from email.header import Header

from tsinghua.items import TsinghuaItem

class TsinghuaSpider(scrapy.spiders.Spider):
    name = "tsinghua"
    allowed_domains = ["tsinghua.edu.cn"]
    items   = []
    max = 0
    today = datetime.date.today()
    #yesterday = today - datetime.timedelta(days=1)
    '''
    start_urls = [
        "http://career.cic.tsinghua.edu.cn/xsglxt/f/jyxt/anony/xxfb"
    ]
    '''
    URL = "http://career.cic.tsinghua.edu.cn/xsglxt/f/jyxt/anony/xxfb"
    offset = 1
    def start_requests(self):
        # The page number goes in the pgno field of the POST body
        yield scrapy.FormRequest(
            url=self.URL,
            formdata={"pgno": str(self.offset)},
            callback=self.parse
        )

    def parse(self,response):

        sel = Selector(response)
        base_url = get_base_url(response)
        for sss in sel.xpath("//li[@class='clearfix']"):
            item = TsinghuaItem()
            dd = sss.xpath('span/text()').extract()[0]
            d = str(dd)
            t = str(self.today)
            # Filter by publication date: only keep postings published today
            if t != d:
                print "today %s   datetime %s" % (self.today, dd)
                continue

            item['Time'] = dd
            item['name'] = sss.xpath("a/text()").extract()[0]
            re_url = sss.xpath("a/@href").extract()[0]
            item['detailLink'] = urlparse.urljoin(base_url, re_url)

            #self.items.append(item)
            yield item
            #max = sel.xpath("//b[@id='totalPg']/text()").extract()[0]

        # Only the first two pages are fetched here; the total page count
        # could instead be read from //b[@id='totalPg'] (see the commented line above)
        if self.offset < 2:
            self.offset += 1
            print self.offset
            yield scrapy.FormRequest(url=self.URL, formdata={"pgno": str(self.offset)}, callback=self.parse)

        #return items





    def closed(self, reason):
        # Delete the day's JSON output file if it is empty (no postings today)
        json_path = r'/Users/sunwangdong/desktop/tsinghua/tsinghua_%s.json' % self.today
        print os.path.getsize(json_path)
        if os.path.getsize(json_path) == 0:
            os.remove(json_path)
        '''
        data = []
        with open('/Users/sunwangdong/desktop/tsinghua/tsinghua.json') as f:
            for line in f:
                data.append(json.loads(line))

        file_object = codecs.open('tsinghua.txt','w',"utf-8")
        str = "\r\n"
        splitstr = "#___#"
        for item in data:
            str = "%s#___#%s#___#%s\r\n" % (item['name'],item['detailLink'],item['Time'].strip())
            file_object.write(str)
        '''

        sender = ''
        receiver = ['']     # recipient email address(es)
        #subject = u'gift for u'
        smtpserver = 'smtp.163.com'
        username = ''
        password = ''    # SMTP authorization code for the 163 mailbox

        msgRoot = MIMEMultipart()
        msgRoot['Subject'] = Header('就业信息 %s' %self.today,'utf-8').encode()
        name,addr = parseaddr('朋友 <%s>' % sender)
        msgRoot['From'] = formataddr((Header(name,'utf-8').encode(),addr)).encode()
        msgRoot['To'] = ','.join(receiver)




        filename = r'/Users/sunwangdong/desktop/tsinghua/tsinghua_%s.txt' % self.today
        if os.path.exists(filename):
            msgRoot.attach(MIMEText('This is my gift!', 'plain', 'utf-8'))
            att = MIMEText(open(filename, 'rb').read(), 'base64', 'utf-8')
            att["Content-Type"] = 'application/octet-stream'
            att["Content-Disposition"] = 'attachment; filename="tsinghua_%s.txt"' % self.today
            msgRoot.attach(att)

            '''
            with open('/Users/sunwangdong/desktop/tsinghua/tsinghua.txt','r') as f:
                mime = MIMEBase('text','txt',filename='tsinghua.txt')
                mime.set_payload(f.read())
                encoders.encode_base64(mime)
                msgRoot.attach(mime)
            
            
            att = MIMEText(open('/Users/sunwangdong/desktop/tsinghua/tsinghua.json','r').read(),'base64','utf-8')
            att["Content-Type"] = 'application/octet-stream'
            att["Content-Disposition"] = 'attachment; filename="tsinghua.json"'
            '''

            smtp = smtplib.SMTP()
            smtp.connect(smtpserver, 25)
            smtp.login(username, password)
            smtp.sendmail(sender, receiver, msgRoot.as_string())
            smtp.quit()




            '''
            mailer = MailSender(
                smtphost="smtp.163.com",
                mailfrom="",
                smtpuser="",
                smtppass="",
                smtpport = 25
            )
            body=u'This is the gift for you!'
            subject = u'就业信息'
            file_name = open('tsinghua.json','r')
            mailer.send(to=["[email protected]"],subject=subject.encode('utf-8'),body=body.encode('utf-8'),attachs=(('tsinghua.json','text/plain',)))
            '''
        else:
            msgRoot.attach(MIMEText('Sorry! Today has no information about jobs!', 'plain', 'utf-8'))
            smtp = smtplib.SMTP()
            smtp.connect(smtpserver, 25)
            smtp.login(username, password)
            smtp.sendmail(sender, receiver, msgRoot.as_string())
            smtp.quit()





In the code above, I added a final step that sends the scraped results to a specified mailbox, attaching them to the email as a file. Next comes pipelines.py, where the items handed over by the spider are processed and written out to the target files:

import json
import codecs

class JsonTsinghuaPipeline(object):
    def __init__(self):
        # Path of the output JSON file goes here
        self.file = codecs.open('', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()

class TsinghuaPipeline(object):
    def process_item(self, item, spider):
        # filename: path of the plain-text output file
        with open(filename, 'a') as f:
            f.write(item['name'] + '\n')
            f.write(item['detailLink'] + '\n')
            f.write(item['Time'] + '\n')
        return item
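Note that pipelines only take effect once they are registered in settings.py. A minimal sketch, assuming the project package is named tsinghua (the numbers set the execution order, lower runs first):

ITEM_PIPELINES = {
    'tsinghua.pipelines.JsonTsinghuaPipeline': 300,
    'tsinghua.pipelines.TsinghuaPipeline': 400,
}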
There is also items.py, which defines the structure of the scraped data:

import scrapy
from scrapy import Field

class TsinghuaItem(scrapy.Item):
    name = Field()
    detailLink = Field()
    Time = Field()
The scraped data is stored in the fields declared by this item.

With all of the above in place, the recruitment postings from the Tsinghua career site can be scraped into a file and mailed to the specified mailbox.
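For reference, the JSON file that closed() checks can be produced with Scrapy's built-in feed export when launching the spider. A sketch, assuming the date-stamped path used in the code (str(datetime.date.today()) yields the same YYYY-MM-DD format as date +%F):

$ scrapy crawl tsinghua -o /Users/sunwangdong/desktop/tsinghua/tsinghua_$(date +%F).json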

