With the scrapy and BeautifulSoup frameworks it is easy to scrape the information we need from web pages; Chinese text, numbers, and English text can all be pulled down in whatever form we want.
This time I want to scrape the personal information of every teacher listed on our school's homepage: job title, phone number, office address, and email.
The pipeline is as follows:
from teacher1 import settings
import os
import urllib
from bs4 import BeautifulSoup

class TeacherPipeline(object):
    def process_item(self, item, spider):
        dir_path = '%s/%s' % (settings.PAGES_STORE, spider.name)  # output directory
        print 'dir_path', dir_path
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        file_path = '%s/teacherList.doc' % dir_path
        with open(file_path, 'wb') as file_writer:
            for page_url in item['page_urls']:
                print 'http://me.sjtu.edu.cn/sz_minglu/' + page_url
                # fetch each teacher's detail page
                html = urllib.urlopen('http://me.sjtu.edu.cn/sz_minglu/' + page_url).read()
                # the site is GBK-encoded; re-encode as UTF-8 before parsing
                soup1 = BeautifulSoup(html.decode('GBK', 'ignore').encode('utf-8', 'ignore'), 'html.parser')
                # skip pages that do not contain a teacher profile
                if soup1.find("span", attrs={"id": "lblName"}):
                    headitems = soup1.find("span", attrs={"id": "lblName"}).getText().encode('utf-8', 'ignore')
                    print headitems.decode('utf-8', 'ignore')
                    title = soup1.find("span", attrs={"id": "lblzhicheng"}).getText().encode('utf-8', 'ignore')
                    phonenum = soup1.find("span", attrs={"id": "lbldianhua"}).getText().encode('utf-8', 'ignore')
                    addr = soup1.find("span", attrs={"id": "lbladdress"}).getText().encode('utf-8', 'ignore')
                    email = soup1.find("span", attrs={"id": "lblemail"}).getText().encode('utf-8', 'ignore')
                    file_writer.write(headitems + '\t ' + title + '\n'
                                      + 'Tel: ' + phonenum + '\n'
                                      + 'Address: ' + addr + '\n'
                                      + 'Email: ' + email + '\n\n')
        return item
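For this pipeline to run, the project settings must register it and define the PAGES_STORE path it reads. A minimal sketch of the relevant settings.py entries; only the two names come from the code above, and the path value is an assumption:

# settings.py (excerpt)
# PAGES_STORE is the custom setting the pipeline reads; this value is
# an assumption -- point it wherever the output should be written.
PAGES_STORE = '/tmp/teacher_pages'

# Register TeacherPipeline so Scrapy passes each yielded item through it.
ITEM_PIPELINES = {
    'teacher1.pipelines.TeacherPipeline': 300,
}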
The spider is as follows:
import scrapy
from teacher1.items import TeacherItem

class TeacherSpider(scrapy.Spider):
    name = 'teacher'
    allowed_domains = []
    start_urls = ["http://me.sjtu.edu.cn/sz_minglu/Default.aspx?cid=4"]

    def parse(self, response):
        item = TeacherItem()
        # collect the link to each teacher's detail page
        item['page_urls'] = response.xpath('//a[@class="amingluCss"]//@href').extract()
        print 'teacher_urls', item['page_urls']
        yield item
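The TeacherItem imported above is not shown in the original post; judging from the spider and the pipeline, a single field is enough, so items.py would look roughly like this:

# items.py -- inferred definition: one field holding the list of
# per-teacher page URLs that the pipeline iterates over.
import scrapy

class TeacherItem(scrapy.Item):
    page_urls = scrapy.Field()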
Run the spider (for example with scrapy crawl teacher from the project root) and you get a file named teacherList.doc listing every teacher's personal information; it is plain text despite the .doc extension, so Word can open it.
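Alternatively, the crawl can be driven from a plain Python script via Scrapy's CrawlerProcess API instead of the command line. A minimal sketch, assuming the spider module lives at teacher1/spiders/teacher.py (that path is hypothetical):

# run_teacher.py -- launch the spider without the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from teacher1.spiders.teacher import TeacherSpider  # hypothetical module path

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl(TeacherSpider)
process.start()  # blocks until the crawl finishes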