scrapy crawler (1): crawling Kuwo Music with the scrapy framework

The purpose of this code is to use the scrapy framework to crawl the song URLs listed in the feedbot.base.json file and, from the responses, determine each song's copyright status on Kuwo.
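For reference, judging from how start_requests() below parses the file, each input line is assumed to be a key and a JSON record separated by a tab, with a space-separated list of song URLs stored under song.url. The key and values in this sample line are invented purely for illustration (the separator between the key and the JSON is a tab character):

some_song_key	{"song": {"url": "http://www.kuwo.cn/yinyue/123456 http://example.com/other"}}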

 

# coding=utf-8  
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import scrapy
import json
import re
import os
import logging
import random
import time
import datetime
class kuwoSpider(scrapy.Spider):
    # Unique spider identifier.
    name = "kuwosong"
    # Restrict crawling to the Kuwo domain.
    allowed_domains = ['kuwo.cn']
    # download_delay = 0.05
    start_urls = []
    def start_requests(self):
        # Each line of feedbot.base.json is expected to be "<key>\t<json>",
        # where the JSON record holds space-separated song URLs under song.url.
        for line in open('/home/zhangkunning/crawl/feedbot.base.json', 'r'):
            its = line.strip().split('\t')
            try:
                dt = json.loads(its[1])
                url_l = dt['song']['url'].split()
            except (IndexError, ValueError, KeyError):
                # Skip lines that do not match the expected record format.
                print line
                continue
            for url in url_l:
                if 'kuwo' in url:
                    try:
                        # Pass the requested URL along in meta so parse() can
                        # detect redirects; dont_filter keeps duplicate song
                        # URLs from being dropped by the dedup filter.
                        yield scrapy.Request(url, meta={"url": url},
                                             callback=self.parse, dont_filter=True)
                    except ValueError:
                        # Malformed URL (e.g. missing scheme); log it and move on.
                        logging.warning(url)
                        continue

    def parse(self, response):
        dic = {}
        if str(response.url) == str(response.meta['url']):
            # No redirect happened: the song page still exists, so record
            # the song as having copyright on Kuwo.
            title_list = response.xpath('/html/head/title').extract()
            if title_list and title_list[0] != '':
                dic['url'] = response.url
                dic['copyright'] = 1
        else:
            # The request was redirected away from the original song URL,
            # so treat the song as no longer licensed on Kuwo.
            dic['url'] = response.meta['url']
            dic['copyright'] = 0
        # Optional: the comment count can also be read from the page source, e.g.
        # commentinfo = re.findall(ur'"commenttotal" : (\d+),', response.body)

        # Append one JSON record per crawled song to the result file.
        if dic:
            with open('kuwosong.txt', 'a+') as f:
                f.write(json.dumps(dic, ensure_ascii=False) + '\n')

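To run the spider, one option is scrapy runspider, which works without a full scrapy project; the filename kuwo_spider.py below is arbitrary:

scrapy runspider kuwo_spider.py

Each line appended to kuwosong.txt is then a small JSON record, for example (the URL here is only illustrative):

{"url": "http://www.kuwo.cn/yinyue/123456", "copyright": 1}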
 
