from scrapy.spider import BaseSpider
from scrapy.selector import Selector
class HnSpider(BaseSpider):
    """Spider named 'hn' that prints the title and link of each title cell.

    Every ``<td class="title">`` element of the response is inspected and the
    text and ``href`` of its direct ``<a>`` children are printed.
    """

    name = 'hn'
    allowed_domains = []
    # NOTE(review): the only start URL is an empty string in this copy of the
    # code; presumably the real URL was lost -- set it before running.
    start_urls = ['']

    def parse(self, response):
        """Print the anchor text and href found in every title cell.

        Args:
            response: The downloaded page handed to the spider by Scrapy.
        """
        sel = Selector(response)
        for site in sel.xpath('//td[@class="title"]'):
            title = site.xpath('a/text()').extract()
            link = site.xpath('a/@href').extract()
            # Formatted explicitly so the statement is valid on both Python 2
            # and Python 3 while keeping the space-separated output of the
            # original ``print title, link`` statement.
            print('{0} {1}'.format(title, link))
from soup import BeautifulSoup as bs
from scrapy.http import Request
from scrapy.spider import BaseSpider
from hn.items import HnItem
class HnSpider(BaseSpider):
    """Spider named 'hn' that turns title cells into items and requests.

    For each ``<td class="title">`` cell that has child tags, the first
    child's text and ``href`` are stored in an ``HnItem`` and the link is
    scheduled for crawling with this spider's own ``parse`` as callback.
    """

    name = 'hn'
    allowed_domains = []
    # NOTE(review): the only start URL is an empty string in this copy of the
    # code; presumably the real URL was lost -- set it before running.
    start_urls = ['']

    def parse(self, response):
        """Yield a follow-up request and an item for every title link.

        Args:
            response: The downloaded page handed to the spider by Scrapy.

        Yields:
            For each (title, link) pair, first a ``Request`` for the link
            and then an ``HnItem`` whose 'title' and 'link' fields hold the
            pair.
        """
        # Imported locally so the method is self-contained on both
        # Python 3 and Python 2.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        # NOTE(review): '' is a substring of every string, so this guard is
        # always true; presumably a domain literal was lost here -- confirm
        # and restore it.
        if '' in response.url:
            soup = bs(response.body)
            # Cells without child tags produce an empty list, which
            # filter(None, ...) drops.
            cells = filter(None, [
                cell.findChildren()
                for cell in soup.findAll('td', {'class': 'title'})
            ])
            items = [(kids[0].text, kids[0].get('href')) for kids in cells]

            for item in items:
                print(item)
                hn_item = HnItem()
                hn_item['title'] = item[0]
                hn_item['link'] = item[1]
                try:
                    request = Request(item[1], callback=self.parse)
                except ValueError:
                    # The link was rejected as a URL (presumably a relative
                    # href without a scheme -- confirm). Prefixing an empty
                    # string would rebuild the same rejected URL, so resolve
                    # it against the URL of the current page instead.
                    request = Request(urljoin(response.url, item[1]),
                                      callback=self.parse)
                yield request
                yield hn_item
现在就试试对 Hacker News 域名进行抓取,你会看到链接和标题被打印在控制台上。
scrapy crawl hn
2013-12-12 16:57:06+0530 [scrapy] INFO: Scrapy 0.20.2 started (bot: hn)
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Optional features available: ssl, http11, django
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'hn.spiders', 'SPIDER_MODULES': ['hn.spiders'], 'BOT_NAME': 'hn'}
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware
, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Enabled item pipelines:
2013-12-12 16:57:06+0530 [hn] INFO: Spider opened
2013-12-12 16:57:06+0530 [hn] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Telnet console listening on
2013-12-12 16:57:06+0530 [scrapy] DEBUG: Web service listening on
2013-12-12 16:57:07+0530 [hn] DEBUG: Redirecting (301) to from
2013-12-12 16:57:08+0530 [hn] DEBUG: Crawled (200) (referer: None)
(u'Caltech Announces Open Access Policy | Caltech', u'')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200>
{'link': u'',
'title': u'Caltech Announces Open Access Policy | Caltech'}
(u'Coinbase Raises $25 Million From Andreessen Horowitz', u'')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200>
{'link': u'',
'title': u'Coinbase Raises $25 Million From Andreessen Horowitz'}
(u'Backpacker stripped of tech gear at Auckland Airport', u'')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200>
{'link': u'',
'title': u'Backpacker stripped of tech gear at Auckland Airport'}
(u'How I introduced a 27-year-old computer to the web', u'')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200>
{'link': u'',
'title': u'How I introduced a 27-year-old computer to the web'}
(u'Show HN: Bitcoin Pulse - Tracking Bitcoin Adoption', u'')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200>
{'link': u'',
'title': u'Show HN: Bitcoin Pulse - Tracking Bitcoin Adoption'}
(u'Why was this secret?', u'')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200>
{'link': u'', 'title': u'Why was this secret?'}
(u'PostgreSQL Exercises', u'')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200>
{'link': u'', 'title': u'PostgreSQL Exercises'}
(u'What it feels like being an ipad on a stick on wheels', u'')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200>
{'link': u'',
'title': u'What it feels like being an ipad on a stick on wheels'}
(u'Prototype ergonomic mechanical keyboards', u'')
2013-12-12 16:57:08+0530 [hn] DEBUG: Scraped from <200>
{'link': u'',
'title': u'Prototype ergonomic mechanical keyboards'}
(u'H5N1', u'')
2013-12-12 16:58:41+0530 [hn] INFO: Closing spider (finished)
2013-12-12 16:58:41+0530 [hn] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 2,
'downloader/exception_type_count/twisted.internet.error.DNSLookupError': 2,
'downloader/request_bytes': 22401,
'downloader/request_count': 71,
'downloader/request_method_count/GET': 71,
'downloader/response_bytes': 1482842,
'downloader/response_count': 69,
'downloader/response_status_count/200': 61,
'downloader/response_status_count/301': 4,
'downloader/response_status_count/302': 3,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2013, 12, 12, 11, 28, 41, 289000),
'item_scraped_count': 63,
'log_count/DEBUG': 141,
'log_count/INFO': 4,
'request_depth_max': 2,
'response_received_count': 62,
'scheduler/dequeued': 71,
'scheduler/dequeued/memory': 71,
'scheduler/enqueued': 71,
'scheduler/enqueued/memory': 71,
'start_time': datetime.datetime(2013, 12, 12, 11, 27, 6, 843000)}
2013-12-12 16:58:41+0530 [hn] INFO: Spider closed (finished)
今天 Eclipse 突然报了 com/genuitec/eclipse/j2eedt/core/J2EEProjectUtil 错误,并且工程文件也打不开了。在网上查了一下资料,按照其中的方法操作了一遍之后,问题解决了,解决方法如下:
An error has occurred. See error log for more details.
public static void main(String[] args) {
int a = 2;
Implicits work like this: if you call a method on a Scala object, and the Scala compiler does not see a definition for that method in the class definition for that object, the compiler will try to con
A. 背景声音
B. 避免注意力分散
Self Co