#coding=utf-8
"""Spider that downloads songs and lyrics from 5sing.kugou.com (Python 2)."""
__author__ = 'Administrator'

import os
import sys
import json
from pprint import pprint

import cookielib
import mechanize
import requests
from bs4 import BeautifulSoup as BSoup
from wget import download

# Example of the song-permission endpoint this spider queries:
# http://service.5sing.kugou.com/song/getPermission?jsoncallback=jQuery1114929725&songId=14109513&songType=2
DOWNLOAD_FOLDER = r'G:\音乐'
HOST_DOWNLOAD_URL = 'http://service.5sing.kugou.com/song/getPermission'
HOST_URL = 'http://5sing.kugou.com'
HOST_URL_LIST = ['http://5sing.kugou.com/sevengod/fc/1.html', ]
NODE_ATTR_HREF = 'href'
NODE_ATTR_TITLE = 'title'
SONG_ATTR_ID = 'data-songid'
SONG_ATTR_TYPE = 'data-songtype'
DEBUG = True
# BUGFIX: this is passed as requests' ``cookies=`` argument, which requires a
# dict (or CookieJar) — the original set literal would raise at request time.
# Fill in your own logged-in 5sing session cookie name/value pairs here.
COOKIES = {'cookie': '自己的cookie'}


def comm_utf82gbk(s):
    """Convert a UTF-8 byte string to GB2312, dropping unmappable characters."""
    return s.decode(ISingSpider.SITE_ENCODING).encode('GB2312', errors='ignore')


def comm_log2file(msg, fname):
    """Write *msg* to a file; the file name is re-encoded to GB2312 first
    (for the Windows filesystem this script targets)."""
    fname = comm_utf82gbk(fname)
    with open(fname, 'w') as f:
        f.write(msg)


def utf82gbk(s):
    """Strict UTF-8 -> GB2312 conversion (raises on unmappable characters)."""
    return s.decode('utf-8').encode('gb2312')


class ISingSpider(object):
    """Crawl a singer's FC song-list pages and download every song + lyrics."""

    SITE_ENCODING = 'utf-8'

    def __init__(self, host_song_url):
        # Entry page of the singer's song list, e.g. .../sevengod/fc/1.html
        self._host_song_url = host_song_url

    def url2bs(self, url):
        """Fetch *url* and return a BeautifulSoup tree, or None on failure."""
        res = requests.get(url)
        if res.status_code != 200:
            self.log('获取链接失败:%s' % (url,))
            # BUGFIX: the original fell through and parsed the error body.
            return None
        res.encoding = ISingSpider.SITE_ENCODING
        bs = BSoup(res.content)
        return bs

    def init(self):
        """Load the entry page and remember the singer title.

        Returns True on success, False when the page could not be fetched.
        """
        bs = self.url2bs(self._host_song_url)
        if not bs:
            return False
        self._title = bs.head.title.text.encode(ISingSpider.SITE_ENCODING)
        self.log('歌手信息:%s' % (self._title,))
        self._song_lists = []
        return True

    def download_fc(self):
        """Crawl every list page, then download all collected songs."""
        self.get_song_lists(self._host_song_url)
        self.log('%s 总共%d 首歌曲。' % (self._title, len(self._song_lists)))
        for s in self._song_lists:
            self.download_song(s)

    def getattr(self, node, attrname):
        """Read a tag attribute, encoding unicode values to UTF-8 bytes.

        NOTE(review): the name shadows the ``getattr`` builtin; kept
        unchanged so existing callers of the class keep working.
        """
        v = node[attrname]
        if type(v) == unicode:
            return v.encode(ISingSpider.SITE_ENCODING)
        return v

    def get_song_lists(self, url):
        """Scrape one list page into self._song_lists, then follow the
        'next page' link recursively until there is none."""
        if not url:
            return
        songs_lists = []
        bs = self.url2bs(url)
        if not bs:
            # Fetch failed — stop paging instead of crashing on None.
            return
        songs = bs.find_all(name='div', attrs={'class': 'song_name'})
        for s in songs:
            lnk = s.find(name='a')
            if not lnk:
                continue
            href = self.getattr(lnk, NODE_ATTR_HREF)
            title = self.getattr(lnk, NODE_ATTR_TITLE)
            asong = {NODE_ATTR_HREF: href, NODE_ATTR_TITLE: title, }
            songs_lists.append(asong)
        for s in songs_lists:
            self.log('歌曲名称:%s 链接:%s' % (s[NODE_ATTR_TITLE], s[NODE_ATTR_HREF]))
        self._song_lists += songs_lists
        next_url = bs.find(name='a', attrs={'class': 'page_next'})
        if next_url:
            next_url = self.getattr(next_url, NODE_ATTR_HREF)
            next_url = HOST_URL + next_url
            self.get_song_lists(next_url)

    def dump_song(self, song):
        """Log the permission-service JSON response for debugging."""
        self.log('消息:%s 返回码:%d' % (song[u'message'], song[u'code']))
        data = song[u'data']
        self.log('songGd=%d 作者:%s 名称:%s 地址:%s' %
                 (data[u'songGd'], data[u'authorName'],
                  data[u'songName'], data[u'fileName']))

    def get_song_info(self, song):
        """Return (file_url, author, song_name) as UTF-8 byte strings.

        The song name is truncated at its first space (the site appends
        extra text after it).
        """
        song_name = song[u'data'][u'songName']
        index = song_name.find(' ')
        # BUGFIX: when there is no space, find() returns -1 and the old
        # slice [0:-1] silently dropped the last character of the name.
        if index != -1:
            song_name = song_name[0:index]
        song_name = song_name.encode('utf-8')
        return (song[u'data'][u'fileName'].encode('utf-8'),
                song[u'data'][u'authorName'].encode('utf-8'),
                song_name)

    def response2dict(self, res):
        """Strip the JSONP wrapper from *res*, parse the JSON payload and
        return the (file_url, author, song_name) tuple."""
        res = res[res.find('(') + 1: res.rfind(')')].encode('utf-8')
        r = json.loads(res, encoding='utf-8')
        self.dump_song(r)
        return self.get_song_info(r)

    def download_song(self, asong):
        """Download one song (mp3 + .lrc lyrics) described by a dict with
        'href' and 'title' keys. Returns False on invalid input or fetch
        failure."""
        if type(asong) != dict:
            self.log('无效的歌曲信息')
            return False
        # Fetch the song's detail page to extract download parameters.
        bs = self.url2bs(asong[NODE_ATTR_HREF])
        if not bs:
            self.log('下载失败。')
            return False
        jcb = 'jQuery110'
        song_name = asong[NODE_ATTR_TITLE]
        self.log('正在下载:%s' % (song_name,))
        btn_down = bs.find(name='a', attrs={'id': 'func_Down'})
        if btn_down:
            song_id = self.getattr(btn_down, SONG_ATTR_ID)
            song_type = self.getattr(btn_down, SONG_ATTR_TYPE)
            self.log('song-id:%s song-type:%s' % (song_id, song_type))
            # The permission endpoint refuses requests without a session
            # cookie, hence the cookies= argument.
            res = requests.get(HOST_DOWNLOAD_URL,
                               params={'jsoncallback': jcb,
                                       'songId': song_id,
                                       'songType': song_type},
                               cookies=COOKIES)
            songi = self.response2dict(res.content)
            download_dir = os.path.join(DOWNLOAD_FOLDER, songi[1])
            # Directory name must be GB2312 for the Windows filesystem.
            gbk_dir = utf82gbk(download_dir)
            if not os.path.exists(gbk_dir):
                self.log(download_dir)
                os.makedirs(gbk_dir)
            full_name = os.path.join(download_dir, songi[2] + '.mp3')
            download(songi[0], full_name.decode('utf-8'))
            # Extract the lyrics block and save it next to the mp3.
            lrc = bs.find(name='div',
                          attrs={'class': 'lrc_info_clip lrc-tab-content'})
            lrc_fullname = os.path.join(download_dir, songi[2] + '.lrc')
            lrc_content = lrc.text.strip()
            lrc_content = u'\r\n'.join(lrc_content.split(' '))
            self.log2file(lrc_content, lrc_fullname)
            self.log('下载文件到:%s 下载歌词到:%s' % (full_name, lrc_fullname))

    def log(self, msg):
        # Console logging, gated by the module-level DEBUG flag.
        if DEBUG:
            print(msg)

    def log2file(self, msg, fname):
        comm_log2file(msg, fname)


def download_song():
    """Fetch one song page through mechanize with full debug output
    (standalone experiment, not used by ISingSpider)."""
    song_url = 'http://5sing.kugou.com/fc/14109513.html'
    br = mechanize.Browser()
    # Attach a cookie jar so session cookies survive across redirects.
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    # Enable essentially all browser behaviours.
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(True)
    br.set_handle_refresh(True)
    #br.set_handled_schemes(True)
    # Turn on every mechanize debug channel.
    br.set_debug_http(True)
    br.set_debug_redirects(True)
    br.set_debug_responses(True)
    # Extra request headers; these override mechanize's defaults.
    br.addheaders = [('User-Agent',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'), ]
    br.open(song_url)
    comm_log2file(br.response().read(), '内容')


def session_download_song():
    """Fetch a song page with a requests Session and print its cookies."""
    sess = requests.Session()
    res = sess.get('http://5sing.kugou.com/fc/14217069.html')
    pprint(res.cookies.items())
    #pprint(sess.cookies.items())
    #print(str(sess.cookies))


def init_sys():
    # Python 2 hack: make implicit str<->unicode conversions use UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf-8')


if __name__ == '__main__':
    init_sys()
    session_download_song()
    #download_song()
    '''
    for i in HOST_URL_LIST:
        spider = ISingSpider(i)
        if spider.init():
            spider.download_fc()
    '''