from scrapy.spider import BaseSpider
from scrapy.selector import Selector
class HnSpider(BaseSpider):
name = 'hn'
allowed_domains = []
start_urls = ['http://news.ycombinator.com']
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//td[@class="title"]')
for site in sites:
title = site.xpath('a/text()').extract()
link = site.xpath('a/@href').extract()
print title, link
from soup import BeautifulSoup as bs
from scrapy.http import Request
from scrapy.spider import BaseSpider
from hn.items import HnItem
class HnSpider(BaseSpider):
name = 'hn'
allowed_domains = []
start_urls = ['http://news.ycombinator.com']
def parse(self, response):
if 'news.ycombinator.com' in response.url:
soup = bs(response.body)
items = [(x[0].text, x[0].get('href')) for x in
filter(None, [
x.findChildren() for x in
soup.findAll('td', {'class': 'title'})
])]
for item in items:
print item
hn_item = HnItem()
hn_item['title'] = item[0]
hn_item['link'] = item[1]
try:
yield Request(item[1], callback=self.parse)
except ValueError:
yield Request('http://news.ycombinator.com/' + item[1], callback=self.parse)
yield hn_item
我们正在迭代这个items,并且给标题和链接赋上抓取来的数据。
现在就试试对Hacker News域名进行抓取,你会看到连接和标题被打印在你的控制台上。
scrapy crawl hn
2013-12-12 16:57:06+0530 [scrapy] INFO: Scrapy 0.20.2 started (bot: hn)
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Optional features available: ssl, http11, django
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'hn.spiders', 'SPIDER_MODULES': ['hn.spiders'], 'BOT_NAME': 'hn'}
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware
, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Enabled item pipelines:
2013-12-12 16:57:06+0530 [hn] INFO: Spider opened
2013-12-12 16:57:06+0530 [hn] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080
2013-12-12 16:57:07+0530 [hn] DEBUG: Redirecting (301) to from
2013-12-12 16:57:08+0530 [hn] DEBUG: Crawled (200) (referer: None)
(u'Caltech Announces Open Access Policy | Caltech', u'http://www.caltech.edu/content/caltech-announces-open-access-policy')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200 https://news.ycombinator.com/>
{'link': u'http://www.caltech.edu/content/caltech-announces-open-access-policy',
'title': u'Caltech Announces Open Access Policy | Caltech'}
(u'Coinbase Raises $25 Million From Andreessen Horowitz', u'http://blog.coinbase.com/post/69775463031/coinbase-raises-25-million-from-andreessen-horowitz')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200 https://news.ycombinator.com/>
{'link': u'http://blog.coinbase.com/post/69775463031/coinbase-raises-25-million-from-andreessen-horowitz',
'title': u'Coinbase Raises $25 Million From Andreessen Horowitz'}
(u'Backpacker stripped of tech gear at Auckland Airport', u'http://www.nzherald.co.nz/nz/news/article.cfm?c_id=1&objectid=11171475')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200 https://news.ycombinator.com/>
{'link': u'http://www.nzherald.co.nz/nz/news/article.cfm?c_id=1&objectid=11171475',
'title': u'Backpacker stripped of tech gear at Auckland Airport'}
(u'How I introduced a 27-year-old computer to the web', u'http://www.keacher.com/1216/how-i-introduced-a-27-year-old-computer-to-the-web/')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200 https://news.ycombinator.com/>
{'link': u'http://www.keacher.com/1216/how-i-introduced-a-27-year-old-computer-to-the-web/',
'title': u'How I introduced a 27-year-old computer to the web'}
(u'Show HN: Bitcoin Pulse - Tracking Bitcoin Adoption', u'http://www.bitcoinpulse.com')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200 https://news.ycombinator.com/>
{'link': u'http://www.bitcoinpulse.com',
'title': u'Show HN: Bitcoin Pulse - Tracking Bitcoin Adoption'}
(u'Why was this secret?', u'http://sivers.org/ws')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200 https://news.ycombinator.com/>
{'link': u'http://sivers.org/ws', 'title': u'Why was this secret?'}
(u'PostgreSQL Exercises', u'http://pgexercises.com/')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200 https://news.ycombinator.com/>
{'link': u'http://pgexercises.com/', 'title': u'PostgreSQL Exercises'}
(u'What it feels like being an ipad on a stick on wheels', u'http://labs.spotify.com/2013/12/12/what-it-feels-like-being-an-ipad-on-a-stick-on-wheels/')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200 https://news.ycombinator.com/>
{'link': u'http://labs.spotify.com/2013/12/12/what-it-feels-like-being-an-ipad-on-a-stick-on-wheels/',
'title': u'What it feels like being an ipad on a stick on wheels'}
(u'Prototype ergonomic mechanical keyboards', u'http://blog.fsck.com/2013/12/better-and-better-keyboards.html')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200 https://news.ycombinator.com/>
{'link': u'http://blog.fsck.com/2013/12/better-and-better-keyboards.html',
'title': u'Prototype ergonomic mechanical keyboards'}
(u'H5N1', u'http://blog.samaltman.com/h5n1')
.............
.............
.............
2013-12-12 16:58:41+0530 [hn] INFO: Closing spider (finished)
2013-12-12 16:58:41+0530 [hn] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 2,
'downloader/exception_type_count/twisted.internet.error.DNSLookupError': 2,
'downloader/request_bytes': 22401,
'downloader/request_count': 71,
'downloader/request_method_count/GET': 71,
'downloader/response_bytes': 1482842,
'downloader/response_count': 69,
'downloader/response_status_count/200': 61,
'downloader/response_status_count/301': 4,
'downloader/response_status_count/302': 3,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2013, 12, 12, 11, 28, 41, 289000),
'item_scraped_count': 63,
'log_count/DEBUG': 141,
'log_count/INFO': 4,
'request_depth_max': 2,
'response_received_count': 62,
'scheduler/dequeued': 71,
'scheduler/dequeued/memory': 71,
'scheduler/enqueued': 71,
'scheduler/enqueued/memory': 71,
'start_time': datetime.datetime(2013, 12, 12, 11, 27, 6, 843000)}
2013-12-12 16:58:41+0530 [hn] INFO: Spider closed (finished)
// 多态, 在JAVA中是这样用的, 其实在PHP当中可以自然消除, 因为参数是动态的, 你传什么过来都可以, 不限制类型, 直接调用类的方法
abstract class Tiger {
public abstract function climb();
}
class XTiger extends Tiger {
public function climb()
jQuery.extend({
handleError: function( s, xhr, status, e ) {
// If a local callback was specified, fire it
if ( s.error ) {
s.error.call( s.context || s, xhr, status, e );
}
always 总是
rice 水稻,米饭
before 在...之前
live 生活,居住
usual 通常的
early 早的
begin 开始
month 月份
year 年
last 最后的
east 东方的
high 高的
far 远的
window 窗户
world 世界
than 比...更
最近使用mybatis.3.1.0时无意中碰到一个问题:
The errors below were detected when validating the file "mybatis-3-mapper.dtd" via the file "account-mapper.xml". In most cases these errors can be d