Crawling National Natural Science Foundation data from ScienceNet (fund.sciencenet.cn) with the Python Scrapy framework
fundspider.py
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
from fundsort.items import FundItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
import re
class FundSpider(CrawlSpider):
name = "fund"
id = 0
allowed_domains = ["fund.sciencenet.cn"]
start_urls =["http://fund.sciencenet.cn/index.php/search/project?name=&person=&no=&company=%E5%8C%97%E4%BA%AC%E5%A4%96%E5%9B%BD%E8%AF%AD%E5%A4%A7%E5%AD%A6&subject=&money1=&money2=&startTime=2005&endTime=2015&subcategory=&redract_url=&submit.x=0&submit.y=0&page=1"
]#每换个大学换一次star_urls. url结尾一定是以“page=”结束,
def parse_item(self, response):
item = response.meta['item']
sel = Selector(response)
num=self.getid()
num=str(num)
item['id']=num
item['school'] = sel.xpath('//tbody/tr[2]/td[@colspan="2"]/text()').extract()
item['subcategory']=sel.xpath('//table[@class="tb no_print"]//tbody//tr[1]/td[@colspan="4"]/text()').extract()
subcode=sel.xpath('//table[@class="tb no_print"]//tbody//tr[1]/td[@colspan="4"]/text()').extract()[0]
#subcode=str(subcode)
item['subcode']=re.findall('([A-M]\d\d)',subcode)
item['itemname'] = sel.xpath('//div[@class="v_con"]//h1/text()').extract()
item['fundmoney'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[3]/td[1]/text()').extract()
item['time'] = sel.xpath('//table[@class="tb no_print"]//tbody/tr[3]/td[3]/text()').extract()
item['principal'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[2]/td[1]/text()').extract()
item['url']=response.url
return item
def getid(self):
self.id += 1
return self.id
def parse_link(self,response):
sel = Selector(response)
item = response.meta['item']
items_url=sel.xpath('//div[@id="resultLst"]//div[position()>0]//p//a//@href').extract()
for item_url in items_url:
item_url=str(item_url)
yield Request(url=item_url,meta={'item':item}, callback=self.parse_item)
def parse(self, response):
urlline=[]
for i in range(1,17): #此处换page的。range(1,17)表示1,2···,16 第17页不爬取
i=str(i)
a='http://fund.sciencenet.cn/index.php/search/project?name=&person=&no=&company=%E5%8C%97%E4%BA%AC%E5%A4%96%E5%9B%BD%E8%AF%AD%E5%A4%A7%E5%AD%A6&subject=&money1=&money2=&startTime=2005&endTime=2015&subcategory=&redract_url=&submit.x=0&submit.y=0&page='
#同start_urls
a=a+i
urlline.append(a)
for middle_url in urlline:
item = FundItem()
middle_url=str(middle_url)
s=re.findall('page=.{1,3}',middle_url)
s=str(s)
item['page']=re.findall('\d{1,3}',s)
yield Request(middle_url,meta={'item':item},callback=self.parse_link)
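The long search URL only changes between universities in its company parameter, which is the UTF-8, percent-encoded school name (here 北京外国语大学). A minimal sketch of how the paged URLs could be generated for another school; BASE and build_page_urls are illustrative names, not part of the project:

# Sketch only: generate the paged search URLs for a given university.
try:
    from urllib import quote            # Python 2
except ImportError:
    from urllib.parse import quote      # Python 3

BASE = ('http://fund.sciencenet.cn/index.php/search/project?name=&person=&no='
        '&company={company}&subject=&money1=&money2=&startTime=2005&endTime=2015'
        '&subcategory=&redract_url=&submit.x=0&submit.y=0&page=')

def build_page_urls(university_utf8, pages):
    # university_utf8: the school name as UTF-8 bytes,
    # e.g. u'北京外国语大学'.encode('utf-8')
    company = quote(university_utf8)
    return [BASE.format(company=company) + str(p) for p in range(1, pages + 1)]

The number of result pages (16 in parse() above) also differs per school, so pages is taken as a parameter here.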
items.py
from scrapy.item import Item, Field


class FundsortItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass


class FundItem(Item):
    id = Field()
    itemname = Field()
    school = Field()
    subcode = Field()
    fundmoney = Field()
    subcategory = Field()
    time = Field()
    principal = Field()
    url = Field()
    page = Field()
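The pipeline below writes these fields in a fixed column order, so every column name it uses has to exist as a Field on FundItem. A quick consistency check (illustrative, not part of the project) using Scrapy's Item.fields mapping:

from fundsort.items import FundItem

columns = ['id', 'school', 'subcode', 'page', 'subcategory',
           'itemname', 'fundmoney', 'time', 'principal', 'url']
missing = [c for c in columns if c not in FundItem.fields]
print(missing)   # expected: []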
pipelines.py
# -*- coding: utf-8 -*-
import csv


class JsonWithEncodingFundPipeline(object):
    def __init__(self):
        # Python 2: file() and binary mode. This is the easiest place to get the
        # file mode / encoding wrong.
        self.file = file('94.csv', 'wb')
        self.out = csv.writer(self.file)
        entries = ['爬取序号', '大学', '二级学科代码', '页数', '学科代码',
                   '项目名称', '资助金额', '起止时间', '负责人', '详细链接']
        self.out.writerow(entries)

    def process_item(self, item, spider):
        # Write the fields in a fixed column order; each value is either a string
        # or a list of strings from .extract(), so ''.join() flattens both.
        line = []
        keys = ['id', 'school', 'subcode', 'page', 'subcategory',
                'itemname', 'fundmoney', 'time', 'principal', 'url']
        for key in keys:
            value = item.get(key)
            value = ''.join(value)
            value = value.encode('utf-8')
            line.append(value)
        self.out.writerow(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider when the crawl finishes; csv.writer has no
        # close(), so close the underlying file object.
        self.file.close()
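file() and csv in binary mode with manual .encode('utf-8') are Python 2 idioms. Under Python 3 (and current Scrapy) the same pipeline would look roughly like the following; this is a sketch assuming Python 3, not the original code:

import csv

class CsvFundPipeline(object):
    def open_spider(self, spider):
        # utf-8-sig so spreadsheet software recognises the Chinese headers
        self.file = open('94.csv', 'w', newline='', encoding='utf-8-sig')
        self.writer = csv.writer(self.file)
        self.writer.writerow(['爬取序号', '大学', '二级学科代码', '页数', '学科代码',
                              '项目名称', '资助金额', '起止时间', '负责人', '详细链接'])

    def process_item(self, item, spider):
        keys = ['id', 'school', 'subcode', 'page', 'subcategory',
                'itemname', 'fundmoney', 'time', 'principal', 'url']
        self.writer.writerow([''.join(item.get(key, '')) for key in keys])
        return item

    def close_spider(self, spider):
        self.file.close()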
settings.py
import random

BOT_NAME = 'fundsort'
SPIDER_MODULES = ['fundsort.spiders']
NEWSPIDER_MODULE = 'fundsort.spiders'
# Old-style (list) pipeline setting, matching the scrapy.contrib-era API used above.
ITEM_PIPELINES = ['fundsort.pipelines.JsonWithEncodingFundPipeline']
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# One User-Agent is chosen at random when the settings load; picking from the
# list directly keeps the index in range (the list has 18 entries).
USER_AGENT = random.choice(user_agent_list)
COOKIES_ENABLED = False
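random.choice in settings.py runs once when the settings module is imported, so the whole crawl uses a single User-Agent. Rotating per request needs a downloader middleware; a sketch under that assumption (module and class names here are illustrative, not part of the project):

# e.g. fundsort/middlewares.py
import random

class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # read the list from a USER_AGENT_LIST setting
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        if self.user_agents:
            request.headers['User-Agent'] = random.choice(self.user_agents)

# and in settings.py:
# USER_AGENT_LIST = user_agent_list
# DOWNLOADER_MIDDLEWARES = {'fundsort.middlewares.RandomUserAgentMiddleware': 400}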