This article uses the Scrapy framework to crawl a series of technical articles, taking Jobbole (伯乐在线, http://blog.jobbole.com/tag/machinelearning/) as an example. By inspecting the site's source we obtain each article's link and title, then save the articles to local files in bulk, using the title as the file name and the article body as the file content.
settings.py defines the configuration for the crawl, as follows:
# -*- coding: utf-8 -*-
# Scrapy settings for jobbole project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'jobbole'
SPIDER_MODULES = ['jobbole.spiders']
NEWSPIDER_MODULE = 'jobbole.spiders'
ITEM_PIPELINES = {
    'jobbole.pipelines.JobbolePipeline': 1,
}
PAGES_STORE = 'F:\\SpiderTest\\PageTest'  # custom setting: local directory where pages are saved
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
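Note that PAGES_STORE is not a built-in Scrapy setting but a custom one added for this project. Besides importing settings.py directly (as the pipeline below does), any component can also read it through Scrapy's settings API. A minimal sketch, assuming Scrapy 1.0+ where a bound spider exposes a .settings attribute and a .logger; DemoSpider is a hypothetical name used only for illustration:
import scrapy

class DemoSpider(scrapy.Spider):
    # Hypothetical spider, shown only to illustrate reading the custom setting.
    name = 'demo'

    def parse(self, response):
        store = self.settings.get('PAGES_STORE')  # -> 'F:\\SpiderTest\\PageTest'
        self.logger.info('pages will be saved under %s', store)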
The spider defines the link-extraction and crawl rules as follows:
# -*- coding: utf-8 -*-
import scrapy
from jobbole.items import JobboleItem

class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = []
    start_urls = ["http://blog.jobbole.com/tag/machinelearning/"]

    def parse(self, response):
        item = JobboleItem()
        # Extract the links of the articles listed on this page.
        item['page_urls'] = response.xpath(
            '//div[@class="post floated-thumb"]/div[1]/a//@href').extract()
        print 'page_urls', item['page_urls']
        yield item
        # Pagination: follow the "next page" link if there is one.
        new_url = response.xpath(
            '//*[@class="next page-numbers"]//@href').extract_first()
        print 'new_url', new_url
        if new_url:
            yield scrapy.Request(new_url, callback=self.parse)
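As a side note, the spider above only collects the listing-page links and leaves the actual downloading to urllib inside the pipeline. Below is a hedged alternative sketch (not the article's code) that lets Scrapy's own downloader fetch each article instead; the jobbole_articles name and the title/content keys are made up for illustration, and the XPath expressions are reused from above:
# -*- coding: utf-8 -*-
import scrapy

class JobboleArticleSpider(scrapy.Spider):
    name = 'jobbole_articles'  # hypothetical spider name
    start_urls = ["http://blog.jobbole.com/tag/machinelearning/"]

    def parse(self, response):
        # Follow every article link found on the listing page.
        for url in response.xpath(
                '//div[@class="post floated-thumb"]/div[1]/a//@href').extract():
            yield scrapy.Request(url, callback=self.parse_article)
        # Follow the "next page" link, if any.
        next_url = response.xpath(
            '//*[@class="next page-numbers"]//@href').extract_first()
        if next_url:
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_article(self, response):
        # Extract the title and body on the article page itself.
        yield {
            'title': ''.join(response.xpath(
                '//div[@class="entry-header"]//text()').extract()).strip(),
            'content': ''.join(response.xpath(
                '//div[@class="entry"]//text()').extract()),
        }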
items.py defines the data model for the crawl, as follows:
import scrapy

class JobboleItem(scrapy.Item):
    # define the fields for your item here like:
    page_urls = scrapy.Field()  # links of the crawled articles
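scrapy.Item subclasses behave like dictionaries that only accept their declared fields, which is why the spider can assign to item['page_urls']. A quick usage sketch with a made-up URL:
from jobbole.items import JobboleItem

item = JobboleItem()
item['page_urls'] = ['http://blog.jobbole.com/12345/']  # made-up URL for illustration
print item['page_urls']
# item['title'] = '...'  # would raise KeyError: 'title' is not a declared Field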
The pipeline defines how the crawled links are used to fetch and save the desired content, as follows:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from jobbole import settings
import os
import urllib
from bs4 import BeautifulSoup

class JobbolePipeline(object):
    def process_item(self, item, spider):
        dir_path = '%s/%s' % (settings.PAGES_STORE, spider.name)  # storage directory
        print 'dir_path', dir_path
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for page_url in item['page_urls']:
            html = urllib.urlopen(page_url).read()  # download the article page
            soup1 = BeautifulSoup(html, "html.parser")
            # Extract the article title and re-encode it so it can be written out correctly.
            headitems = soup1.find("div", attrs={"class": "entry-header"}).getText().encode("GB18030", 'ignore')
            print headitems
            list_name = page_url.split('/')
            print 'listname', list_name
            # Strip the surrounding newlines from the title so it can serve as a file name.
            file_name = headitems.strip('\n') + '.txt'
            print 'filename', file_name
            file_path = '%s/%s' % (dir_path, file_name)
            print 'filepath', file_path
            if os.path.exists(file_path):  # skip articles that were already saved
                continue
            with open(file_path, 'wb') as file_writer:
                # Important: re-encode the article body before writing it to disk.
                content = soup1.find("div", attrs={"class": "entry"}).getText().encode("GB18030", 'ignore')
                file_writer.write(content)
        return item
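Two details of this pipeline invite hedged alternatives. First, it reaches into settings.py via a module import, while Scrapy can also hand a pipeline the running crawler's settings through the from_crawler hook. Second, article titles may contain characters that Windows forbids in file names (such as ? or :), which would make open() fail. A minimal sketch combining both ideas; the class and helper names are made up:
# -*- coding: utf-8 -*-
import re

class JobbolePipelineSketch(object):
    # Hypothetical variant, not the article's pipeline.

    def __init__(self, pages_store):
        self.pages_store = pages_store

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook with the running crawler, whose .settings
        # holds everything defined in settings.py, including PAGES_STORE.
        return cls(crawler.settings.get('PAGES_STORE'))

    @staticmethod
    def safe_filename(title):
        # Replace characters that Windows does not allow in file names.
        return re.sub(r'[\\/:*?"<>|\r\n]+', '_', title).strip() + '.txt'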
Run scrapy crawl jobbole (scrapy crawl followed by the spider's name), and the articles appear, saved in bulk, in the specified folder, each file named after its article title.