import os
import re
import scrapy


class QuoteSpider(scrapy.Spider):
    name = 'quote'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            yield {
                # The page wraps each quote in curly quotes; the regex keeps only the text between them
                'text': re.findall('“(.+)”', quote.xpath('./span[@class="text"]/text()').extract_first())[0],
                'author': quote.xpath('./span/small[@class="author"]/text()').extract_first()
            }
        # Follow the "Next" link until the last page is reached
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            yield response.follow(next_page, self.parse)


if __name__ == '__main__':
    print(os.system('scrapy runspider quotes_spider.py -o quote.csv -t csv'))
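As an alternative to shelling out with os.system, the spider can also be launched from Python itself. A minimal sketch using Scrapy's CrawlerProcess (the FEEDS setting assumes Scrapy 2.1 or newer; older versions use FEED_URI/FEED_FORMAT instead):

from scrapy.crawler import CrawlerProcess

# Run the spider in-process and export to CSV; equivalent in spirit to the
# "scrapy runspider ... -o quote.csv" command above.
process = CrawlerProcess(settings={
    'FEEDS': {'quote.csv': {'format': 'csv'}},
})
process.crawl(QuoteSpider)
process.start()  # blocks until the crawl finishes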
import os
import scrapy


class QianMuSpider(scrapy.Spider):
    name = 'usnews'
    # Domains the spider is allowed to crawl
    allowed_domains = ['www.qianmu.org']
    # Entry-point URL of the crawl
    start_urls = ['http://www.qianmu.org/ranking/1528.htm']

    # Called by the framework once a request for a URL in start_urls succeeds
    def parse(self, response):
        # Extract the links to the detail pages; extract() returns a list
        links = response.xpath('//div[@class="rankItem"]//tr[position()>1]/td/a/@href').extract()
        for link in links:
            if not link.startswith('http://www.qianmu.org'):
                continue
            # Tell the framework to follow this link and call the given callback on success
            yield response.follow(link, self.parse_university)

    def parse_university(self, response):
        data = dict()
        # Use extract_first() when only a single value is expected
        data['name'] = response.xpath('//div[@id="wikiContent"]/h1/text()').extract_first()
        table = response.xpath('//div[@id="wikiContent"]/div[@class="infobox"]/table')
        if table:
            table = table[0]
            col1 = table.xpath('.//td[1]')
            col2 = table.xpath('.//td[2]')
            keys, values = [[self.space_filter_and_join(col.xpath('.//text()').extract())
                             for col in cols] for cols in (col1, col2)]
            if len(keys) == len(values):
                data.update(zip(keys, values))
        # Yielded data is handed to the framework for further processing;
        # with no pipeline configured it is simply logged
        yield data

    def not_empty(self, s):
        return s and s.strip()

    def space_filter_and_join(self, li):
        return ''.join(filter(self.not_empty, li))


if __name__ == '__main__':
    print(os.system('scrapy runspider use_scrapy.py'))
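For reference, a standalone illustration of what the two text-cleaning helpers above do (the input list here is made up for the example):

def not_empty(s):
    return s and s.strip()

def space_filter_and_join(li):
    # Drop whitespace-only strings, then concatenate the rest
    return ''.join(filter(not_empty, li))

print(space_filter_and_join([' ', 'Harvard ', '\n', 'University']))  # -> 'Harvard University'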
items.py
import scrapy


class UniversityItem(scrapy.Item):
    name = scrapy.Field()
    rank = scrapy.Field()
    country = scrapy.Field()
    state = scrapy.Field()
    city = scrapy.Field()
    undergraduate_num = scrapy.Field()
    postgraduate_num = scrapy.Field()
    website = scrapy.Field()
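Declared fields are the only keys a scrapy.Item accepts, which catches typos that a plain dict would silently swallow. A small illustrative sketch (DemoItem and the sample value are made up here):

import scrapy

class DemoItem(scrapy.Item):              # same pattern as UniversityItem above
    name = scrapy.Field()

item = DemoItem()
item['name'] = 'Harvard University'       # declared field: accepted
# item['motto'] = 'Veritas'               # undeclared field: raises KeyError
print(dict(item))                         # {'name': 'Harvard University'}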
pipelines.py
import pymysql
import redis
from scrapy.exceptions import DropItem


class MysqlPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='localhost', user='root', password='123456',
                                    port=3306, charset='utf8', database='python')
        self.cur = self.conn.cursor()

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()

    def process_item(self, item, spider):
        # keys = item.keys()
        # values = [item[k] for k in keys]
        keys, values = zip(*item.items())
        # Build a parameterized INSERT from whatever fields the item carries
        sql = "insert into university ({}) values ({})".format(
            ','.join(keys), ','.join(['%s'] * len(keys))
        )
        self.cur.execute(sql, values)
        self.conn.commit()
        return item


class RedisPipeline:
    def open_spider(self, spider):
        self.r = redis.Redis()

    def close_spider(self, spider):
        self.r.close()

    def process_item(self, item, spider):
        # sadd() returns 1 only if the name was not in the set yet,
        # so repeated items are dropped instead of being passed on
        if self.r.sadd(spider.name, item['name']):
            return item
        raise DropItem('duplicate item: {}'.format(item['name']))
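To see what the dynamic INSERT in MysqlPipeline.process_item produces, here is the same logic run on a made-up two-field item (the values are illustrative only):

item = {'name': 'Harvard University', 'rank': '1'}
keys, values = zip(*item.items())
sql = "insert into university ({}) values ({})".format(
    ','.join(keys), ','.join(['%s'] * len(keys))
)
print(sql)     # insert into university (name,rank) values (%s,%s)
print(values)  # ('Harvard University', '1')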
settings.py
ITEM_PIPELINES = {
    'scrapy_t.pipelines.RedisPipeline': 300,
    'scrapy_t.pipelines.MysqlPipeline': 301,
}
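Pipelines with a lower number run first, so the Redis de-duplication (300) drops repeats before the MySQL insert (301) sees them. If editing settings.py is not convenient, the same mapping can also be set per spider via custom_settings; a minimal sketch (class body shortened):

import scrapy

class QianMuSpider(scrapy.Spider):
    name = 'usnews'
    # Per-spider override; same priority semantics as ITEM_PIPELINES in settings.py
    custom_settings = {
        'ITEM_PIPELINES': {
            'scrapy_t.pipelines.RedisPipeline': 300,
            'scrapy_t.pipelines.MysqlPipeline': 301,
        }
    }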