Scrapy Framework Study Notes (1)
Usage steps:
1. First, create your Item file, which defines the data types of the content to be scraped.
2. Next, create your spider file, which does the actual crawling and extracts Items from the responses.
3. Finally, create your pipeline file, which receives the Items handed over by the spider and processes them; there they can be exported to a file or stored in a database. (A minimal sketch of how the three pieces fit together follows this list.)
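To make the three steps concrete, here is a minimal sketch; the names TestItem, TestSpider and TestPipeline, the example URL, and the XPath are placeholders of mine, and the full working example is at the end of these notes:

# items.py: the data type of the scraped content
from scrapy.item import Item, Field

class TestItem(Item):
    title = Field()
    url = Field()

# the spider: crawls pages and produces TestItem objects
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class TestSpider(BaseSpider):
    name = "test"
    start_urls = ["http://example.com/"]   # placeholder URL

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        yield TestItem(title=hxs.select('//title/text()').extract()[0],
                       url=response.url)

# pipelines.py: receives every Item the spider produces
class TestPipeline(object):
    def process_item(self, item, spider):
        # export to a file or store in a database here
        return item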
Problems encountered:
1. The Chinese text I scraped came out in the output file entirely as unicode escape sequences. At first I assumed it was a character-encoding problem, but after trying various encodings it turned out not to be. It was actually an error of writing out the whole list returned by extract(); writing list[0] (the string itself) solves it.
2. Don't forget to register the pipeline in settings.py (see the snippet after this list).
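A small sketch of both points, assuming the project layout of the Hujiang example at the end of these notes (the list form of ITEM_PIPELINES is the old-style setting of the Scrapy versions these notes are based on; the output file name is made up):

# settings.py: register the pipeline, otherwise Scrapy never calls it
ITEM_PIPELINES = ['mypro.pipelines.MyproPipeline']

# pipelines.py: extract() returns a list of unicode strings; writing the list
# itself produces \uXXXX escapes, so take the first element (or join the list)
# and encode it explicitly before writing.
class MyproPipeline(object):
    def process_item(self, item, spider):
        title = item['title'][0]                # a unicode string
        with open('out.txt', 'a') as f:         # 'out.txt' is a made-up name
            f.write(title.encode('utf-8') + '\n')
        return item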
Below are a few examples of recursive crawling:
BaseSpider
Method 1: put both Item and Request objects into the items list, return items, and let the framework figure out for itself whether each element is an Item or a Request.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
# SlyyItem is defined in this project's items.py

class SlyySpider(BaseSpider):
    name = "a"
    allowed_domains = [".com"]
    start_urls = ["****"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        h3 = hxs.select('''*****''').extract()
        h3_unicode = "".join(h3)
        t1 = hxs.select('''****''').extract()
        items.append(SlyyItem(head=h3_unicode, url=response.url))
        # follow-up Requests go into the same list as the Item
        for url in hxs.select('''***''').extract():
            items.append(Request(url, callback=self.parse))
        return items
Method 2: use yield to hand back Items and Requests separately.
class SlyySpider(BaseSpider):
    name = "slyy2"
    allowed_domains = ["***"]
    start_urls = ["***"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        h3 = hxs.select('''***''').extract()
        h3_unicode = "".join(h3)
        yield SlyyItem(head=h3_unicode, url=response.url)
        for url in hxs.select('''***''').extract():
            yield Request(url, callback=self.parse)
Method 3: build follow-up requests with make_requests_from_url() and switch their callback with replace().
Example 1
class SlyySpider(BaseSpider):
    name = "slyy3"
    allowed_domains = ["***"]
    start_urls = ["***"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        firstpost = hxs.select('''***''').extract()[0]
        # build a Request for firstpost and switch its callback to parse_post
        items.append(self.make_requests_from_url(firstpost).replace(callback=self.parse_post))
        url2 = hxs.select('''***''').extract()[0]
        items.append(self.make_requests_from_url(url2))
        return items

    def parse_post(self, response):
        hxs = HtmlXPathSelector(response)
        h3 = hxs.select('''***''').extract()[0]
        print h3
        item = SlyyItem()
        item['url'] = response.url
        item['head'] = h3
        return item
Example 2
from scrapy.selector import HtmlXPathSelector

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    items = []

    newurls = hxs.select('//a/@href').extract()
    validurls = []
    for url in newurls:
        # check whether the URL is valid (placeholder condition)
        if True:
            validurls.append(url)
    items.extend([self.make_requests_from_url(url).replace(callback=self.parse) for url in validurls])

    sites = hxs.select('//ul/li')
    # note: do not re-initialize items here, or the Requests above would be lost
    for site in sites:
        item = DmozItem()
        item['title'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('text()').extract()
        items.append(item)

    return items
CrawlSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from sitemap.items import SitemapItem

import urllib
import simplejson
import exceptions
import pickle

class SitemapSpider(CrawlSpider):
    name = 'sitemap_spider'
    allowed_domains = ['qunar.com']
    start_urls = ['http://www.qunar.com/routes/']

    rules = (
        #Rule(SgmlLinkExtractor(allow=(r'http://www.qunar.com/routes/.*')), callback='parse'),
        #Rule(SgmlLinkExtractor(allow=('http:.*/routes/.*')), callback='parse'),
    )

    def parse(self, response):
        item = SitemapItem()
        x = HtmlXPathSelector(response)
        raw_urls = x.select("//a/@href").extract()
        urls = []
        for url in raw_urls:
            if 'routes' in url:
                if 'http' not in url:
                    url = 'http://www.qunar.com' + url
                urls.append(url)

        for url in urls:
            yield Request(url)

        item['url'] = response.url.encode('UTF-8')
        arr_keywords = x.select("//meta[@name='keywords']/@content").extract()
        item['keywords'] = arr_keywords[0].encode('UTF-8')
        arr_description = x.select("//meta[@name='description']/@content").extract()
        item['description'] = arr_description[0].encode('UTF-8')
        yield item
About rules: they define a set of link patterns to act on.
The allow attribute gives the links that are allowed;
the deny attribute gives the links that are excluded;
the callback attribute names the callback function for the matched pages.
(A sketch that also uses deny follows the example below.)
rules = (
    # URLs matching this rule are not parsed for content; only the links on those pages are extracted
    # (the URL here is fictional; replace it in real use)
    Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?page_index=\d+'))),
    # URLs matching this rule have their content extracted by the callback
    # (the URL here is fictional; replace it in real use)
    Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?product_id=\d+')), callback="parse_item"),
)
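The example above only uses allow and callback; below is a sketch of mine that also uses deny and an explicit follow flag (the URL patterns are made up, as before):

rules = (
    # follow paginated listing pages, but never touch login or print pages;
    # with no callback these pages are only used for link extraction
    Rule(SgmlLinkExtractor(allow=(r'/test\?page_index=\d+',),
                           deny=(r'/login', r'/print')),
         follow=True),
    # pages matching this rule are parsed for content by parse_item
    Rule(SgmlLinkExtractor(allow=(r'/test\?product_id=\d+',)), callback='parse_item'),
)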
Below is a simple spider I wrote for crawling the Hujiang site. items.py:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field

class MyproItem(Item):
    # define the fields for your item here like:
    # name = Field()
    id = Field()
    th = Field()
    zh = Field()
    url = Field()
    title = Field()
pipelines.py:
class MyproPipeline(object):
    def __init__(self):
        self.file = open('th.txt', 'w')
        self.file2 = open('zh.txt', 'w')

    def process_item(self, item, spider):
        # only write out pages where both lists are non-empty and the same length
        if len(item['th']) > 0 and len(item['zh']) > 0:
            if len(item['th']) == len(item['zh']):
                self.file.write(str(item['title'][0].encode("utf-8")) + '\n')
                for i in range(len(item['th'])):
                    self.file.write(str(item['th'][i].encode("utf-8")) + '\n')
                self.file2.write(str(item['title'][0].encode("utf-8")) + '\n')
                for i in range(len(item['zh'])):
                    self.file2.write(str(item['zh'][i].encode("utf-8")) + '\n')
        return item
hjspider.py:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from mypro.items import MyproItem
from scrapy.http import Request

class HjSpider(BaseSpider):
    name = "hujiang"
    allowed_domains = ["hujiang.com"]
    start_urls = [
        "http://th.hujiang.com/new/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        urls = []
        raw_title = hxs.select('//title').extract()
        raw_th = hxs.select("//div[@class='langs_en']/text()").extract()
        raw_zh = hxs.select("//div[@class='langs_cn']/text()").extract()
        items.append(MyproItem(title=raw_title, zh=raw_zh, th=raw_th))
        raw_urls = hxs.select('//a/@href').extract()
        for url in raw_urls:
            if 'http' not in url and 'new' in url:
                # build the absolute URL first, then deduplicate on the full URL
                url = "http://th.hujiang.com" + url
                if url not in urls:
                    #item = MyproItem()
                    #item['url'] = url
                    urls.append(url)
                    items.append(Request(url, callback=self.parse))
        return items