使用python抓取google音乐信息

Google 目前没有提供音乐搜索服务的 API,不过正则表达式是强大的。下面是我闲来写的几行代码,可以抓取 Google 的新歌排行榜。当前只限于排行榜的页面,其他页面要抓取也不是很难;如果 Google 的页面结构保证稳定的话,也还凑合。


# coding: utf-8
import urllib2
import re
import base64

def getMusicList(url):
    '''Scrape a chart page and return the songs listed on it.

    Downloads ``url`` and extracts three parallel lists with regular
    expressions: song ids, song titles and artist names. The lists are
    combined by position.

    Args:
        url: address of the chart page to download.

    Returns:
        A list of dicts with the keys ``'id'``, ``'name'`` and ``'songer'``.
        The list is empty when no id was found. If the three lists do not
        line up (or a title/artist cannot be decoded), a message is printed
        and the entries collected so far are returned.
    '''
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # The id is embedded in a URL-encoded download link that is followed by a
    # literal "\x26resnum" (backslash included) in the page source.
    ids = re.findall(r'http%3A%2F%2Fg\.top100\.cn%2F7872775%2Fhtml%2Fdownload\.html%3Fid%3D(.+?)\\x26resnum',html)
    names = re.findall(r'<td class="Title BottomBorder"><a href=".+?;">(.+?)</a>',html)
    songers = re.findall(r'<a href="/music/url\?q=%2Fmusic%2Fartist%3Fid%3.+?style="white-space:nowrap;">(.+?)</a>', html)

    musicList = []
    try:
        for idx, musicId in enumerate(ids):
            musicList.append({
                'id': musicId,
                'name': _decodeHtmlEntity(names[idx]),
                'songer': _decodeHtmlEntity(songers[idx]),
            })
    except (IndexError, UnicodeError):
        # IndexError: fewer titles/artists than ids were matched.
        # UnicodeError: a title/artist could not be decoded.
        # Best effort: report it and keep what was parsed so far.
        print('url 解析错误')
    return musicList

def getMusicInfo(musicId):
    '''Fetch the download page of one song and extract its metadata.

    Args:
        musicId: song id, interpolated into the download-page URL.

    Returns:
        A dict that may contain the keys ``'name'``, ``'songer'``, ``'size'``,
        ``'format'`` and ``'url'``. A key is present only when the matching
        pattern was found in the page, so the dict can be empty.
    '''
    targetUrl = 'http://www.google.cn/music/top100/musicdownload?id=%s'%musicId
    response = urllib2.urlopen(targetUrl)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    # Drop newlines so that ".+?" can span what were separate lines.
    htxt = html.replace('\n','')
    songNameM = re.findall(r'<tr class="meta-data-tr"><td class="td-song-name">(.+?)</td>',htxt)
    songerM = re.findall(r'<tr class="meta-data-tr">.+?<td class="td-singer">(.+?)</td>',htxt)
    songSizeM = re.findall(r'<tr class="meta-data-tr">.+?<td class="td-size">(.+?)</td>',htxt)
    songFormatM = re.findall(r'<tr class="meta-data-tr">.+?<td class="td-format">(.+?)</td>',htxt)
    songUrlM = re.findall(r'<a href="/music/top100/url\?q=(.+?)&amp;ct=rdl.+?',htxt)

    # Only the first match of each pattern is used.
    musicItem = {}
    if songNameM:
        musicItem['name'] = _decodeHtmlEntity(songNameM[0])
    if songerM:
        musicItem['songer'] = _decodeHtmlEntity(songerM[0])
    if songSizeM:
        # The size cell contains "&nbsp;" padding; strip it.
        musicItem['size'] = songSizeM[0].replace('&nbsp;','')
    if songFormatM:
        musicItem['format'] = songFormatM[0]
    if songUrlM:
        # The link target is percent-encoded inside the "q" query parameter.
        musicItem['url'] = urllib2.unquote(unicode(songUrlM[0],'utf-8'))
    return musicItem


def _decodeHtmlEntity(s):
    '''十进制unicode转换'''
    import re
    result = s
    entityRe = '(&#(\\d{5});)'
    entities = re.findall(entityRe, s)
    for entity in entities :
            result = result.replace(entity[0], unichr(int(entity[1])))
    return result

# Demo: list the "new Chinese songs" chart, one field per line, with a
# separator line after each song.
ms = getMusicList('http://www.google.cn/music/chartlisting?q=chinese_new_songs_cn&cat=song&grouping=new-release_music')
for m in ms:
    for key in ('id', 'name', 'songer'):
        print(m[key])
    print('---------------------')

# Demo: look up the details of a single song by its id.
item = getMusicInfo('Sdeae665a21782a21')
print(item)


打印结果:

..............................
---------------------
S7065aecee3381785
爱盛开
刘可
---------------------
S82d535f38713c6fd
Beautiful Woman
张悬
---------------------

...............................

{'songer': u'\u90ed\u71d5', 'url': u'http://file5.top100.cn/200905291742/6BC0EC05183B9E19E7732D609C45A137/Special_141264/%E6%88%92%E7%88%B1.mp3', 'format': 'MP3', 'name': u'\u6212\u7231', 'size': '6.6MB'}


你可能感兴趣的:(html,python,正则表达式,Google,音乐)