用scrapy爬取学校教师的个人信息

基于scrapy和beautifulsoup框架,可以方便地从网页爬取我们所需要的信息,无论中文,数字或者是英文信息,均可以按照我们希望的方式爬取下来。

这次我想爬取我们学院主页的所有老师的个人信息,包括职称,电话,办公室地址和电子邮箱。

pipelines如下:

from teacher1 import settings
import os
import urllib
from bs4 import BeautifulSoup

class TeacherPipeline(object):
    def process_item(self, item, spider):
        dir_path = '%s/%s' % (settings.PAGES_STORE, spider.name)  # 存储路径
        print 'dir_path', dir_path
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        file_path = '%s/teacherList.doc' % dir_path  # 1015修改产生联系方式
        with open(file_path, 'wb') as file_writer:
            for page_url in item['page_urls']:
                print 'http://me.sjtu.edu.cn/sz_minglu/'+page_url
                html = urllib.urlopen('http://me.sjtu.edu.cn/sz_minglu/'+page_url).read()  # 打开文章链接
                soup1 = BeautifulSoup(html.decode('GBK','ignore').encode('utf-8','ignore'))
                if (soup1.find("span", attrs={"id":"lblName"})):        
                  headitems = soup1.find("span", attrs={"id":"lblName"}).getText().encode('utf-8','ignore')
                  print headitems.decode('utf-8','ignore')
                  title = soup1.find("span", attrs={"id": "lblzhicheng"}).getText().encode('utf-8','ignore')
		  phonenum = soup1.find("span", attrs={"id": "lbldianhua"}).getText().encode('utf-8','ignore')
                  addr = soup1.find("span", attrs={"id": "lbladdress"}).getText().encode('utf-8','ignore')
                  email = soup1.find("span", attrs={"id": "lblemail"}).getText().encode('utf-8','ignore')
                  file_writer.write(headitems+'\t   '+title+'\n'+'电话:      '+phonenum+'\n'+'通信地址:  '+addr+'\n'+'电子邮箱: '+email+'\n\n')
                else:
                  continue
                file_writer.close()


 
  
 
  

 spider如下:
  

import scrapy
from teacher1.items import TeacherItem


#from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup

class teacherSpider(scrapy.Spider):
    name = 'teacher'
    allowed_domains = []
    #start_urls = ["http://jandan.net/ooxx"]
    start_urls = ["http://me.sjtu.edu.cn/sz_minglu/Default.aspx?cid=4"]
    def parse(self, response):
        item = TeacherItem()
        item['page_urls'] =response.xpath('//a[@class="amingluCss"]//@href').extract()  # 提取图片链接
        print 'teacher_urls', item['page_urls']
        yield item
运行spider即可,获得一个名为teacherList.doc的word文档,里面列出了所有老师的个人信息。




你可能感兴趣的:(python)