抓取5sing上的歌曲

#coding=utf-8

__author__ = 'Administrator'

import os
import sys
from bs4 import BeautifulSoup as BSoup
from wget import download
import requests
import mechanize
import cookielib
import json
from pprint import pprint

# Sample request against HOST_DOWNLOAD_URL:
# http://service.5sing.kugou.com/song/getPermission?jsoncallback=jQuery1114929725&songId=14109513&songType=2

# Root folder that downloaded songs and lyrics are stored under.
DOWNLOAD_FOLDER = r'G:\音乐'
# Endpoint queried for the download information of a single song.
HOST_DOWNLOAD_URL = 'http://service.5sing.kugou.com/song/getPermission'
# Prefix joined in front of the "next page" links found on list pages.
HOST_URL = 'http://5sing.kugou.com'
# Song-list pages to crawl.
HOST_URL_LIST = ['http://5sing.kugou.com/sevengod/fc/1.html',]
# Attribute names read from the song links on a list page.
NODE_ATTR_HREF = 'href'
NODE_ATTR_TITLE = 'title'
# Attribute names read from the download button on a song page.
SONG_ATTR_ID = 'data-songid'
SONG_ATTR_TYPE = 'data-songtype'
# When true, ISingSpider.log() prints its messages.
DEBUG = True
# Cookies sent with the getPermission request. requests expects a
# name -> value mapping (or a CookieJar); the previous placeholder was a set
# literal, which requests cannot index. Fill in your own cookies here, e.g.
# {'cookie_name': 'cookie_value'} -- the request is reported to fail
# without them.
COOKIES = {}

def comm_utf82gbk(s):
    """Convert *s* from the spider's site encoding to GB2312 bytes.

    The input is decoded with ISingSpider.SITE_ENCODING; characters that
    GB2312 cannot represent are dropped (errors='ignore').
    """
    decoded = s.decode(ISingSpider.SITE_ENCODING)
    return decoded.encode('GB2312', errors='ignore')

def comm_log2file(msg, fname):
    """Write *msg* to the file named *fname*, replacing existing content.

    The file name is passed through comm_utf82gbk() before the file is
    opened.
    """
    target = comm_utf82gbk(fname)
    with open(target, 'w') as out:
        out.write(msg)

def utf82gbk(s):
    """Re-encode the UTF-8 byte string *s* as GB2312 bytes.

    Both steps use the default strict error handling, so invalid UTF-8 input
    or characters outside GB2312 raise a UnicodeError.
    """
    text = s.decode('utf-8')
    return text.encode('gb2312')

class ISingSpider(object):
    """Crawler for one 5sing song-list page and the songs linked from it.

    Usage: construct with the URL of a list page, call init() and, when it
    returns True, call download_fc().
    """

    # Encoding used for the byte strings the spider produces.
    SITE_ENCODING = 'utf-8'

    def __init__(self, host_song_url):
        """Remember the list page URL; no network access happens here."""
        self._host_song_url = host_song_url

    def url2bs(self, url):
        """Fetch *url* and return the page parsed by BeautifulSoup.

        Returns None, after logging, when the request raises a requests
        exception or the response status is not 200.
        """
        try:
            res = requests.get(url)
        except requests.RequestException:
            self.log('获取链接失败:%s' % (url,))
            return None

        if res.status_code != 200:
            self.log('获取链接失败:%s' % (url,))
            return None

        # NOTE(review): the parser is fed res.content (raw bytes), so this
        # assignment presumably has no effect on parsing; kept as in the
        # original.
        res.encoding = ISingSpider.SITE_ENCODING
        return BSoup(res.content)

    def init(self):
        """Load the list page, record its title and reset the song list.

        Returns True on success and False when the page cannot be fetched.
        """
        bs = self.url2bs(self._host_song_url)
        if bs is None:
            return False

        self._title = bs.head.title.text.encode(ISingSpider.SITE_ENCODING)
        self.log('歌手信息:%s' % (self._title,))
        self._song_lists = []
        return True

    def download_fc(self):
        """Collect every song reachable from the list page and download it.

        init() must have been called first; it creates the attributes used
        here.
        """
        self.get_song_lists(self._host_song_url)
        self.log('%s 总共%d 首歌曲。' % (self._title, len(self._song_lists)))
        for s in self._song_lists:
            self.download_song(s)

    def getattr(self, node, attrname):
        """Return attribute *attrname* of *node*.

        Unicode values are encoded with SITE_ENCODING; any other value is
        returned unchanged.
        """
        v = node[attrname]
        if isinstance(v, unicode):
            return v.encode(ISingSpider.SITE_ENCODING)
        return v

    def get_song_lists(self, url):
        """Append the songs of the page at *url*, and of every following
        page, to self._song_lists.

        Each entry is a dict with the link's href and title. Pagination is
        followed through the 'page_next' link; the walk stops when there is
        no such link, when a page cannot be fetched, or when a page would be
        visited a second time.
        """
        visited = set()
        while url and url not in visited:
            visited.add(url)

            # Scrape the song links from this page.
            bs = self.url2bs(url)
            if bs is None:
                return

            page_songs = []
            for s in bs.find_all(name='div', attrs={'class': 'song_name'}):
                lnk = s.find(name='a')
                if lnk is None:
                    continue
                page_songs.append({
                    NODE_ATTR_HREF: self.getattr(lnk, NODE_ATTR_HREF),
                    NODE_ATTR_TITLE: self.getattr(lnk, NODE_ATTR_TITLE),
                })

            for s in page_songs:
                self.log('歌曲名称:%s 链接:%s' % (s[NODE_ATTR_TITLE], s[NODE_ATTR_HREF]))

            self._song_lists += page_songs

            next_link = bs.find(name='a', attrs={'class': 'page_next'})
            if next_link is None:
                return
            # The next-page href is joined onto HOST_URL.
            url = HOST_URL + self.getattr(next_link, NODE_ATTR_HREF)

    def dump_song(self, song):
        """Log the message, code and main data fields of a decoded reply."""
        self.log('消息:%s 返回码:%d' % (song[u'message'], song[u'code']))
        data = song[u'data']
        self.log('songGd=%d 作者:%s 名称:%s 地址:%s' % (data[u'songGd'], data[u'authorName'],
                                                        data[u'songName'], data[u'fileName']))

    def get_song_info(self, song):
        """Return (file URL, author name, song name) as UTF-8 byte strings.

        The song name is cut at its first space. A name without any space is
        kept whole; the previous find()/slice logic dropped its last
        character in that case.
        """
        data = song[u'data']
        song_name = data[u'songName'].partition(' ')[0]
        return (data[u'fileName'].encode('utf-8'),
                data[u'authorName'].encode('utf-8'),
                song_name.encode('utf-8'))

    def response2dict(self, res):
        """Decode the JSONP reply *res* and return get_song_info() of it.

        The JSON payload is taken from between the first '(' and the last
        ')'. A body without such a wrapper is parsed as it is.
        """
        start = res.find('(')
        end = res.rfind(')')
        if start != -1 and end > start:
            res = res[start + 1:end]
        # json.loads decodes byte strings as UTF-8 by default, so the payload
        # needs no encode/decode round trip through the default encoding.
        r = json.loads(res)
        self.dump_song(r)
        return self.get_song_info(r)

    def download_song(self, asong):
        """Download the audio file and the lyrics of one song.

        *asong* is a dict as built by get_song_lists(). Files are stored in
        DOWNLOAD_FOLDER/<author>/ as <song>.mp3 and <song>.lrc.

        Returns True when the audio file was downloaded, False when the song
        entry is invalid, the page cannot be fetched or it has no download
        button. A page without a lyrics block still counts as success.
        """
        if not isinstance(asong, dict):
            self.log('无效的歌曲信息')
            return False

        # Load the song page to extract what the download needs.
        bs = self.url2bs(asong[NODE_ATTR_HREF])
        if bs is None:
            self.log('下载失败。')
            return False

        jcb = 'jQuery110'
        song_name = asong[NODE_ATTR_TITLE]
        self.log('正在下载:%s' % (song_name,))
        btn_down = bs.find(name='a', attrs={'id': 'func_Down'})
        if btn_down is None:
            self.log('下载失败。')
            return False

        song_id = self.getattr(btn_down, SONG_ATTR_ID)
        song_type = self.getattr(btn_down, SONG_ATTR_TYPE)
        self.log('song-id:%s song-type:%s' % (song_id, song_type))
        # This request needs valid cookies, otherwise loading fails.
        res = requests.get(HOST_DOWNLOAD_URL,
                           params={'jsoncallback': jcb, 'songId': song_id, 'songType': song_type},
                           cookies=COOKIES)
        songi = self.response2dict(res.content)

        download_dir = os.path.join(DOWNLOAD_FOLDER, songi[1])
        # The directory is checked and created under its GB2312 name.
        gbk_dir = utf82gbk(download_dir)
        if not os.path.exists(gbk_dir):
            self.log(download_dir)
            os.makedirs(gbk_dir)
        full_name = os.path.join(download_dir, songi[2] + '.mp3')
        download(songi[0], full_name.decode('utf-8'))

        # Extract the lyrics; not every page is guaranteed to have them.
        lrc = bs.find(name='div', attrs={'class': 'lrc_info_clip lrc-tab-content'})
        if lrc is None:
            self.log('下载文件到:%s' % (full_name,))
            return True

        lrc_fullname = os.path.join(download_dir, songi[2] + '.lrc')
        lrc_content = lrc.text.strip()
        lrc_content = u'\r\n'.join(lrc_content.split(' '))
        # Encode explicitly so the write does not depend on the interpreter's
        # default encoding.
        self.log2file(lrc_content.encode(ISingSpider.SITE_ENCODING), lrc_fullname)
        self.log('下载文件到:%s 下载歌词到:%s' % (full_name, lrc_fullname))
        return True

    def log(self, msg):
        """Print *msg* when the module-level DEBUG flag is true."""
        if DEBUG:
            print(msg)

    def log2file(self, msg, fname):
        """Write *msg* to the file *fname* via comm_log2file()."""
        comm_log2file(msg, fname)

def download_song():
    """Fetch one fixed song page with a mechanize browser and save the body.

    The browser gets a fresh LWPCookieJar, has its equiv, gzip, redirect,
    referer, robots and refresh handling switched on together with all three
    debug outputs, and sends a desktop Chrome User-Agent. The response body
    is written through comm_log2file().
    """
    song_url = 'http://5sing.kugou.com/fc/14109513.html'
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')

    browser = mechanize.Browser()

    # Give the browser its own cookie jar.
    browser.set_cookiejar(cookielib.LWPCookieJar())

    # Switch on the handlers first, then the debug outputs, in that order.
    switches = (
        browser.set_handle_equiv,
        browser.set_handle_gzip,
        browser.set_handle_redirect,
        browser.set_handle_referer,
        browser.set_handle_robots,
        browser.set_handle_refresh,
        browser.set_debug_http,
        browser.set_debug_redirects,
        browser.set_debug_responses,
    )
    for switch_on in switches:
        switch_on(True)

    # Extra request headers; these replace the browser's own list.
    browser.addheaders = [('User-Agent', user_agent)]

    browser.open(song_url)
    page = browser.response().read()
    comm_log2file(page, '内容')

def session_download_song():
    """Request one fixed song page and pretty-print the cookies it returns.

    The session is used as a context manager so that it is closed once the
    response has been inspected; previously it was never closed.
    """
    with requests.Session() as sess:
        res = sess.get('http://5sing.kugou.com/fc/14217069.html')
        pprint(res.cookies.items())

def init_sys():
    """Set the interpreter-wide default string encoding to UTF-8.

    The sys module is reloaded first and setdefaultencoding() is called on
    it afterwards (a Python 2 idiom; presumably the reload is what makes
    that function available again).
    """
    default_encoding = 'utf-8'
    reload(sys)
    sys.setdefaultencoding(default_encoding)

if __name__ == '__main__':
    # Script entry point: set the default encoding before anything else runs.
    init_sys()

    # Only the cookie-inspection demo is active at the moment.
    session_download_song()
    #download_song()
    # The bare string literal below holds the disabled full crawl: one
    # ISingSpider per entry of HOST_URL_LIST, downloading everything once
    # init() succeeds. As a string expression it has no effect at runtime.
    '''
    for i in HOST_URL_LIST:
        spider = ISingSpider(i)
        if spider.init():
            spider.download_fc()
    '''

你可能感兴趣的:(python)