Crawling National Natural Science Foundation of China (NSFC) grant data from sciencenet.cn with the Python Scrapy framework


fundspider.py

# -*- coding: utf-8 -*-

from scrapy.selector import Selector
from fundsort.items import FundItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
import re

class FundSpider(CrawlSpider):
    name = "fund"
    id = 0
    allowed_domains = ["fund.sciencenet.cn"]
    start_urls =["http://fund.sciencenet.cn/index.php/search/project?name=&person=&no=&company=%E5%8C%97%E4%BA%AC%E5%A4%96%E5%9B%BD%E8%AF%AD%E5%A4%A7%E5%AD%A6&subject=&money1=&money2=&startTime=2005&endTime=2015&subcategory=&redract_url=&submit.x=0&submit.y=0&page=1"
    ]  # Change start_urls for each university; the URL must end with "page=".

    def parse_item(self, response):
        # Extract the fields of a single grant from its detail page.
        item = response.meta['item']
        sel = Selector(response)
        num = str(self.getid())
        item['id'] = num
        item['school'] = sel.xpath('//tbody/tr[2]/td[@colspan="2"]/text()').extract()
        item['subcategory'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[1]/td[@colspan="4"]/text()').extract()
        subcode = sel.xpath('//table[@class="tb no_print"]//tbody//tr[1]/td[@colspan="4"]/text()').extract()[0]
        item['subcode'] = re.findall('([A-M]\d\d)', subcode)  # subject code: one letter A-M plus two digits
        item['itemname'] = sel.xpath('//div[@class="v_con"]//h1/text()').extract()
        item['fundmoney'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[3]/td[1]/text()').extract()
        item['time'] = sel.xpath('//table[@class="tb no_print"]//tbody/tr[3]/td[3]/text()').extract()
        item['principal'] = sel.xpath('//table[@class="tb no_print"]//tbody//tr[2]/td[1]/text()').extract()
        item['url'] = response.url
        return item

    def getid(self):
        # Running counter used as the row number in the output file.
        self.id += 1
        return self.id

    def parse_link(self, response):
        # On a search-result page, follow the detail link of every listed project.
        sel = Selector(response)
        item = response.meta['item']
        items_url = sel.xpath('//div[@id="resultLst"]//div[position()>0]//p//a//@href').extract()
        for item_url in items_url:
            yield Request(url=str(item_url), meta={'item': item}, callback=self.parse_item)

    def parse(self, response):
        # Build the URLs of result pages 1..16 (range(1, 17) yields 1, 2, ..., 16;
        # page 17 is not crawled) and hand each one to parse_link.
        urlline = []
        for i in range(1, 17):
            # Same query string as start_urls, ending with "page=".
            a = 'http://fund.sciencenet.cn/index.php/search/project?name=&person=&no=&company=%E5%8C%97%E4%BA%AC%E5%A4%96%E5%9B%BD%E8%AF%AD%E5%A4%A7%E5%AD%A6&subject=&money1=&money2=&startTime=2005&endTime=2015&subcategory=&redract_url=&submit.x=0&submit.y=0&page='
            urlline.append(a + str(i))

        for middle_url in urlline:
            item = FundItem()
            s = str(re.findall('page=.{1,3}', middle_url))
            item['page'] = re.findall('\d{1,3}', s)  # record the page number with the item
            yield Request(middle_url, meta={'item': item}, callback=self.parse_link)
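
The query string in start_urls hard-codes the percent-encoded name of one university (北京外国语大学). To crawl a different school, its name has to be encoded the same way. Below is a minimal sketch of such a helper; it is not part of the original project and assumes Python 2, which this spider targets.

# Sketch (hypothetical helper): build the search URL for another university.
import urllib

def build_search_url(school_name, page=1):
    # Percent-encode the UTF-8 school name; school_name is expected to be unicode.
    company = urllib.quote(school_name.encode('utf-8'))
    return ('http://fund.sciencenet.cn/index.php/search/project?name=&person=&no='
            '&company=' + company +
            '&subject=&money1=&money2=&startTime=2005&endTime=2015&subcategory='
            '&redract_url=&submit.x=0&submit.y=0&page=' + str(page))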

items.py

from scrapy.item import Item, Field

class FundsortItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass

class FundItem(Item):
    id=Field()
    itemname = Field()
    school = Field()
    subcode=Field()
    fundmoney=Field()
    subcategory=Field()
    time=Field()
    principal=Field()
    url=Field()
    page=Field()
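
For readers new to Scrapy items: a FundItem behaves like a dict restricted to the fields declared above. A small illustration (not part of the project code, Python 2 style to match the spider):

# Illustration only: FundItem allows dict-style access for declared fields.
from fundsort.items import FundItem

item = FundItem()
item['page'] = ['1']              # the spider stores lists of strings
print item.get('page')            # .get() is what the pipeline uses
# item['unknown'] = 1             # would raise KeyError: field not declared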
pipelines.py

import csv
import string

class JsonWithEncodingFundPipeline(object):
    def __init__(self):
        # Easiest place to slip up: keep a handle to the file itself so it can be
        # closed later; csv.writer() only wraps it (Python 2, binary mode).
        self.file = open('94.csv', 'wb')
        self.out = csv.writer(self.file)
        entries = ['爬取序号', '大学', '二级学科代码', '页数', '学科代码', '项目名称', '资助金额', '起止时间', '负责人', '详细链接']
        self.out.writerow(entries)

    def process_item(self, item, spider):
        line = []
        # Fixed column order matching the header row above.
        keys = ['id', 'school', 'subcode', 'page', 'subcategory', 'itemname', 'fundmoney', 'time', 'principal', 'url']
        for key in keys:
            value = item.get(key)          # each value is a string or a list of strings
            value = ''.join(value)
            value = value.encode('utf-8')  # write UTF-8 bytes to the CSV
            line.append(value)
        self.out.writerow(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider on item pipelines when the spider finishes.
        self.file.close()
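
The pipeline above targets Python 2 (file objects opened in binary mode, byte strings). Under Python 3 and a current Scrapy release the same idea would look roughly like the sketch below; the class name is an assumption, not the original author's code.

# Sketch of the same CSV pipeline for Python 3 / current Scrapy. Assumes the item
# fields are strings or lists of strings, as produced by the spider above.
import csv

class CsvFundPipeline(object):
    def open_spider(self, spider):
        # newline='' keeps the csv module from inserting blank lines on Windows.
        self.file = open('94.csv', 'w', newline='', encoding='utf-8-sig')
        self.out = csv.writer(self.file)
        self.out.writerow(['id', 'school', 'subcode', 'page', 'subcategory',
                           'itemname', 'fundmoney', 'time', 'principal', 'url'])

    def process_item(self, item, spider):
        keys = ['id', 'school', 'subcode', 'page', 'subcategory',
                'itemname', 'fundmoney', 'time', 'principal', 'url']
        self.out.writerow([''.join(item.get(key, [])) for key in keys])
        return item

    def close_spider(self, spider):
        self.file.close()

Note that in current Scrapy versions ITEM_PIPELINES is a dict mapping the pipeline path to a priority, e.g. {'fundsort.pipelines.CsvFundPipeline': 300}, rather than the list form used in settings.py below.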

settings.py

import random
BOT_NAME = 'fundsort'

SPIDER_MODULES = ['fundsort.spiders']
NEWSPIDER_MODULE = 'fundsort.spiders'
ITEM_PIPELINES = ['fundsort.pipelines.JsonWithEncodingFundPipeline']

user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
       ]
# Pick one user agent at random when the settings module is loaded.
# (random.choice over the list itself avoids indexing past its end.)
USER_AGENT = random.choice(user_agent_list)
COOKIES_ENABLED = False
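
Choosing the user agent in settings.py fixes it for the whole run. If a different user agent per request is wanted, a downloader middleware can do the rotation. A minimal sketch follows; the middleware name and module path are assumptions, not part of the original project.

# middlewares.py (sketch): rotate the user agent on every request.
import random

from fundsort.settings import user_agent_list

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(user_agent_list)

It would then be enabled in settings.py via DOWNLOADER_MIDDLEWARES, e.g. DOWNLOADER_MIDDLEWARES = {'fundsort.middlewares.RandomUserAgentMiddleware': 400}.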

Screenshot of the crawl results:

[Figure 1: screenshot of the exported CSV of crawled fund data]


