1. Create the project
scrapy startproject ppd
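This generates the standard project skeleton (the exact file list varies slightly across Scrapy versions):

ppd/
    scrapy.cfg
    ppd/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py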
2. Scrape a single page, mainly with XPath
The spider source:
from scrapy.spiders import Spider
from ppd.items import BlackItem


class PpdSpider(Spider):
    name = "ppd"
    allowed_domains = ["dailianmeng.com"]
    start_urls = [
        "http://www.dailianmeng.com/p2pblacklist/index.html"
    ]

    def parse(self, response):
        # Each blacklist record is one row of the table with id "yw0".
        sites = response.xpath('//*[@id="yw0"]/table/tbody/tr')
        items = []
        for site in sites:
            item = BlackItem()
            item['name'] = site.xpath('td[1]/text()').extract()
            item['idcard'] = site.xpath('td[2]/text()').extract()
            item['mobile'] = site.xpath('td[3]/text()').extract()
            item['email'] = site.xpath('td[4]/text()').extract()
            item['total'] = site.xpath('td[5]/text()').extract()
            item['bepaid'] = site.xpath('td[6]/text()').extract()
            item['notPaid'] = site.xpath('td[7]/text()').extract()
            item['time'] = site.xpath('td[8]/text()').extract()
            item['loanAmount'] = site.xpath('td[9]/text()').extract()
            items.append(item)
        return items
And the item definition in ppd/items.py:

from scrapy.item import Item, Field


class BlackItem(Item):
    name = Field()
    idcard = Field()
    mobile = Field()
    email = Field()
    total = Field()
    bepaid = Field()
    notPaid = Field()
    time = Field()
    loanAmount = Field()
This successfully produced the single-page result, but the fields came out in alphabetical order of their names; the next step is to switch to the order I defined.
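For reference, the crawl and the CSV export were driven by Scrapy's built-in feed export; the output filename here is my own choice, and on newer Scrapy the .csv extension alone is enough to select the format:

scrapy crawl ppd -o black.csv -t csv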
3. Output fields in a specified order
Scrapy exports field names and values in alphabetical order by default; I want them exported in the order I specify.
First, create a file named csv_item_exporter.py inside the spiders directory:
# Note: on Scrapy >= 1.0 these live at scrapy.exporters and
# scrapy.utils.project.get_project_settings(); the paths below are the
# pre-1.0 ones.
from scrapy.conf import settings
from scrapy.contrib.exporter import CsvItemExporter


class MyProjectCsvItemExporter(CsvItemExporter):
    def __init__(self, *args, **kwargs):
        # Pull the delimiter and the field order from the project
        # settings and hand them to the stock CSV exporter.
        kwargs['delimiter'] = settings.get('CSV_DELIMITER', ',')
        fields_to_export = settings.get('FIELDS_TO_EXPORT', [])
        if fields_to_export:
            kwargs['fields_to_export'] = fields_to_export
        super(MyProjectCsvItemExporter, self).__init__(*args, **kwargs)
Then register the exporter and the field order in settings.py:

FEED_EXPORTERS = {
    # Override the built-in 'csv' format with our exporter;
    # 'ppd' is the project name.
    'csv': 'ppd.spiders.csv_item_exporter.MyProjectCsvItemExporter',
}
FIELDS_TO_EXPORT = [
    'name',
    'idcard',
    'mobile',
    'email',
    'total',
    'bepaid',
    'notPaid',
    'time',
    'loanAmount',
]
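To see how fields_to_export controls the column order in isolation, here is a minimal standalone sketch (it assumes Scrapy >= 1.0, where the stock exporter lives at scrapy.exporters; the sample values are invented):

from io import BytesIO

from scrapy.exporters import CsvItemExporter

from ppd.items import BlackItem

buf = BytesIO()
# fields_to_export fixes both which columns appear and their order.
exporter = CsvItemExporter(buf, fields_to_export=['name', 'idcard', 'mobile'])
exporter.start_exporting()
exporter.export_item(BlackItem(name=u'某某', idcard=u'110101...', mobile=u'138...'))
exporter.finish_exporting()
print(buf.getvalue())  # first line is the header: name,idcard,mobile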
Running the spider again now yields the columns in the defined order:
name,idcard,mobile,email,total,bepaid,notPaid,time,loanAmount
余良锋,61250119890307****,13055099***,[email protected],3000.00,1063.01,999.89,2013-10-11,3个月
张栩,44152219890923****,15767638***,[email protected],3000.00,2319.84,819.56,2013-09-09,3个月
孙福东,37150219890919****,15194000***,[email protected],3000.00,2075.14,1018.55,2013-09-25,3个月
李其印,45012119870211****,13481120***,[email protected],3050.00,2127.64,167.99,2013-04-08,1年
吴必拥,45232819810201****,13977369***,[email protected],3000.00,2670.40,524.01,2013-06-07,6个月
单长江,32072319820512****,18094220***,[email protected],8900.00,6302.04,1521.78,2013-07-22,6个月
郑其睦,35042619890215****,15959783***,[email protected],5000.00,3278.60,425.51,2013-04-08,1年
吴文豪,44190019890929****,13267561***,[email protected],6000.00,579.79,463.40,2013-10-09,1年
钟华,45060319870526****,18277072***,[email protected],5700.00,3141.24,957.50,2013-08-07,6个月
汤双杰,34082119620804****,13329062***,[email protected],100000.00,105293.45,9111.54,2012-11-19,1年
黄河,43240219791103****,13786520***,[email protected],6700.00,4795.24,2307.54,2013-06-21,6个月
孙景昌,13092119850717****,15127714***,[email protected],3000.00, ,455.71,2013-10-18,1年
高义,42050319740831****,15337410***,[email protected],3000.00, ,965.51,2013-10-17,6个月
曹成均,41088119720221****,18639192***,[email protected],3300.00,1781.64,838.18,2013-06-17,8个月
张银球,33032519761109****,13806800***,[email protected],60000.00, ,19407.50,2013-10-16,6个月
4. Crawl multiple pages
The main features: (1) the total page count is read from the page automatically; (2) a loop builds the URL list so multiple pages are crawled; (3) it is faster than the Selenium-based approach.
import math
import re

import requests
from scrapy.spiders import Spider

from ppd.items import BlackItem


class PpdSpider(Spider):
    name = "ppd"
    allowed_domains = ["dailianmeng.com"]
    start_urls = []

    # Fetch the index page once (this runs at import time) and read the
    # record count from the pager summary, which looks like:
    # 第 2446-2448 条, 共 2448 条.
    page_re = requests.get('http://www.dailianmeng.com/p2pblacklist/index.html')
    page_re.encoding = 'utf-8'  # assume the site serves UTF-8
    total = int(re.search(u'共 (\d+) 条', page_re.text).group(1))
    # 15 records per page; round up to cover the last partial page.
    size_page = int(math.ceil(total / 15.0))

    start_page = 1
    for pge in range(start_page, start_page + size_page):
        start_urls.append('http://www.dailianmeng.com/p2pblacklist/index.html'
                          '?P2pBlacklist_page=' + str(pge))

    def parse(self, response):
        sites = response.xpath('//*[@id="yw0"]/table/tbody/tr')
        items = []
        for site in sites:
            item = BlackItem()
            item['name'] = site.xpath('td[1]/text()').extract()
            item['idcard'] = site.xpath('td[2]/text()').extract()
            item['mobile'] = site.xpath('td[3]/text()').extract()
            item['email'] = site.xpath('td[4]/text()').extract()
            item['total'] = site.xpath('td[5]/text()').extract()
            item['bepaid'] = site.xpath('td[6]/text()').extract()
            item['notPaid'] = site.xpath('td[7]/text()').extract()
            item['time'] = site.xpath('td[8]/text()').extract()
            item['loanAmount'] = site.xpath('td[9]/text()').extract()
            items.append(item)
        return items
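A more idiomatic Scrapy variant would skip the up-front requests call and simply follow the pager's "next" link from within parse(). Below is a sketch of that idea; the spider name and the 'li.next a' selector are my assumptions (the Yii pager the site uses typically renders such a link), not taken from the original run:

from scrapy import Request
from scrapy.spiders import Spider

from ppd.items import BlackItem


class PpdFollowSpider(Spider):
    name = "ppd_follow"  # hypothetical name, to avoid clashing with "ppd"
    allowed_domains = ["dailianmeng.com"]
    start_urls = ["http://www.dailianmeng.com/p2pblacklist/index.html"]

    def parse(self, response):
        for site in response.xpath('//*[@id="yw0"]/table/tbody/tr'):
            item = BlackItem()
            item['name'] = site.xpath('td[1]/text()').extract()
            item['idcard'] = site.xpath('td[2]/text()').extract()
            # ... the remaining seven fields, extracted exactly as above ...
            yield item
        # Follow the pager until there is no "next" link left.
        next_href = response.css('li.next a::attr(href)').extract_first()
        if next_href:
            yield Request(response.urljoin(next_href), callback=self.parse)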