python爬取网易云音乐排行榜歌单热评(完整版)

本文给出完整的爬虫代码,用来抓取网易云音乐的排行榜单、推荐歌单以及歌曲热评。
直接上代码,代码写得很清楚。

为了防止被封,我们先写一个随机生成 User-Agent 的函数(需要先 import random):

"""随机获取请求头"""
def get_ua():
    """Return a randomly assembled desktop-Chrome User-Agent string.

    The Chrome major version is drawn from 55..62, the build number from
    0..3200 and the patch number from 0..140 (all inclusive); the platform
    token is picked from a fixed set of Windows, Linux and macOS strings.

    Returns:
        str: e.g. ``Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36
        (KHTML, like Gecko) Chrome/60.0.1234.56 Safari/537.36``.
    """
    # Version numbers are drawn before the platform so the sequence of calls
    # into `random` stays: randint, randint, randint, choice.
    major = random.randint(55, 62)
    build = random.randint(0, 3200)
    patch = random.randint(0, 140)
    platforms = (
        '(Windows NT 6.1; WOW64)',
        '(Windows NT 10.0; WOW64)',
        '(X11; Linux x86_64)',
        '(Macintosh; Intel Mac OS X 10_12_6)',
    )
    platform = random.choice(platforms)
    template = ('Mozilla/5.0 {} AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/{}.0.{}.{} Safari/537.36')
    return template.format(platform, major, build, patch)

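我这里用了代理池(代理地址逐行保存在 ./wyy/ip.txt 中)。
下面直接上完整的代码(依赖 requests、lxml 的 etree、bs4 的 BeautifulSoup,以及标准库 csv、random、logging):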

class WangYiyun:

    def __init__(self, ua):
        """Store the request settings shared by every scraping method.

        Args:
            ua: User-Agent string sent with every request.
        """
        # Values that get_header() turns into HTTP request headers.
        self.User_Agent = ua
        self.Referer = 'http://music.163.com/'
        self.Host = 'music.163.com'
        self.Accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'

        # One session object reused for the page requests.
        self.s = requests.session()

        # Cookies sent with the comment requests.
        # NOTE(review): these look like values captured from a logged-in
        # browser session (including MUSIC_U / __csrf); they are hard-coded
        # and have presumably expired -- confirm and replace with your own.
        self.cookies = {
            '_ntes_nuid': '132b56eb04cd7ae3dc141a67f1e00b92',
            '__gads': 'ID=edd475e49c259564:T=1522070635:S=ALNI_MaL6zNCOchTwNrS8aso4KJa96dHsw',
            'vjuids': '-202d8c038.162627acbc3.0.0c44fd974cd69',
            '_iuqxldmzr_': '32',
            '__utmz': '94650624.1537866836.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
            'WM_TID': 'S4BoTLNXpXBu1XuKXWJGvK2VNSl8A1R2',
            'mail_psc_fingerprint': '4a2ba8794d7a7cb253130b93cf150f04',
            'usertrack': 'ezq0o1vGzRQe23jDAzXKAg==',
            '_ga': 'GA1.2.1206578097.1539755287',
            'UM_distinctid': '1669aca0685ab-0d8c2646d156b1-5e4b2519-100200-1669aca0687451',
            'vjlast': '1522070637.1544581017.11',
            '__utma': '94650624.1971910903.1537866836.1543304403.1547020521.5',
            '_ntes_nnid': '132b56eb04cd7ae3dc141a67f1e00b92,1551923908660',
            'vinfo_n_f_l_n3': 'ed239199997e7529.1.17.1522070637531.1551150620657.1552439807970',
            '__f_': '1552527380802',
            'JSESSIONID-WYYY': 'VzxRm974Y8WbPeeFTyi%2BR9C769wQ1DF7hvjo2HOqEaYkajfA94%5CNme%2BBHQKfsmGq%2BTZYMus4Xgb%2B76sMsgYwudT%5CI1Hh6%2BAK2vx%2Fi4gsdSadfFT9%2F7ant5ST507rZVCaUpAjobt0UhvhngPgZ%2FJdxt%5CvCOXYGrdf9ixCsf%2BHgSeeTbxT%3A1553739054373',
            'WM_NI': 'JiBtIXn2eLKlhVwpn8xGT5hRgUqgTMx2%2FEfUXF0TUNwuzRjBZ8lysYoCPgJOgFFBrUJKAncSFlRqyx7Br1S6LwQT2gA0MeujdELLRIjJe42aHjtsBTQo3MV3DTHR260oR3M%3D',
            'WM_NIKE': '9ca17ae2e6ffcda170e2e6eea6c8499aaead95d673a1928ea7c54b829e8aafb86db28f8c92ec6983abadb5ca2af0fea7c3b92a8ab78e95cc54aeeeabadca68ac8ab991ed34bc999cd6eb5aa58bbf88d747bceee1a6e55f9c9a00b5d846b5abb6b3e94a9ae8a49af253ad9da088c945a59a8aa3f15f8a958f8fe25986b49e92fc74f59ebed2d23e90a99fb5b16d83f08386f862a89b84add143aa96abd5b16b8cb6bc94c16093b1fdaed539888c88b3b13ef4b19ed4d037e2a3',
            'MUSIC_U': 'd6bf02383c0a0234b8da0b7d717df069eb5f65aef8526abe35411b24a595d461fd2f81d778ffe31c3fa6925a46b62d12586a15998e43a379b6d53907a0dbb4064eb75c1ad5e4f24fde39c620ce8469a8',
            '__remember_me': 'true',
            '__csrf': '99b47f93e046944078bfca72a1619ef7',
        }

        # Query-string parameters for the comment requests (a 1-tuple of pairs).
        self.params = (('csrf_token', '99b47f93e046944078bfca72a1619ef7'),)

        # Form body posted to the comment endpoint.
        # NOTE(review): presumably a pre-encrypted payload copied from the
        # browser's own request -- confirm before relying on it.
        self.data = {
            'params': '/2b9rp81m0RdIyYib3JsCnFPg6yT77pPGhY03ke5khMgbbbTrCEJho/4YG9xXvuh7kM4/+HG/rVH9PlIZ3SwCdFReymdZ3tB8o0T+sxRHREnY6tv3WJDb7HuRMEiYC+a15k61alvkIIhyGuVDBLQNVuhfBrJ2Ee0r6eqCGmbhBtn6R12QW5gLlLC+gQjWcoC4jZdQ6ERN5w4QXpgj0eNKX7cu/ooU/mQyagF23Np6Z4=',
            'encSecKey': '5ff6ba868d77aaabdd2f4e27af30c00bc085757e830c96f5624cd4dc1212af0e48d36e437d4930c08fd9f80aece6772a1b09d4c8c1ac7a903c02d03a08c078f399b90a89dbed6a45980c970956d76df60b3234c9e90d03fd2ec17037afdcfe131e48c8f444a129e203635a51dfdc91047a584ecffb373d32ee633cb74efa42d3',
        }

        # Base URL that the page-scraping methods append their paths to.
        self.con_url = 'https://music.163.com/'
    """构建请求头"""
    def get_header(self):
        """Build the HTTP request headers from the values stored on the instance.

        Returns:
            dict: a new dict with the keys ``Referer``, ``User-Agent``,
            ``Host`` and ``Accept`` (in that order).
        """
        header_fields = (
            ('Referer', self.Referer),
            ('User-Agent', self.User_Agent),
            ('Host', self.Host),
            ('Accept', self.Accept),
        )
        return dict(header_fields)

    """获取代理ip"""
    def get_proc(self):
        f = open('./wyy/ip.txt', 'r')
        content = f.readlines()
        proxie_list = []
        for tmp in content:
            proxies = {
                "https": "http://{}".format(tmp.strip()),
            }
            proxie_list.append(proxies)

        proxy = random.choice(proxie_list)
        try:
            data = requests.get(url='https://www.baidu.com', proxies=proxy, timeout=5)
            return proxy
        except:
            pass

    """验证代理ip"""
    def call_proc(self):
        """Keep asking get_proc() for a proxy until it returns a truthy one.

        Returns:
            The first truthy value returned by ``self.get_proc()``.

        Note:
            There is no retry limit; the loop only ends when a proxy is found
            (or get_proc() raises).
        """
        while True:
            candidate = self.get_proc()
            if candidate:
                return candidate

    """排行榜单"""
    def wybd(self):
        """Scrape the chart (top-list) index page.

        Fetches ``<con_url>discover/toplist`` through a validated proxy,
        reads every ``<p class="name"><a>`` link and appends one
        ``[name, id]`` row per chart to ``./wyy/歌单1.csv``.

        Returns:
            list[dict]: one ``{'r_name': <link text>, 'r_id': <id>}`` per
            chart, where the id is the part of the link's href after the
            first ``=``.
        """
        proxy = self.call_proc()
        headers = self.get_header()
        # The address shown in the browser is https://music.163.com/#/discover/toplist;
        # the '#' has to be dropped to request the real page.
        play_url = self.con_url + 'discover/toplist'
        response = self.s.get(play_url, headers=headers, proxies=proxy).text
        html = etree.HTML(response)
        anchors = html.xpath('//p[@class="name"]/a')

        ranking_list = []
        # `with` closes (and flushes) the CSV file even if a row fails.
        with open('./wyy/歌单1' + '.csv', 'a', newline='', encoding='utf_8_sig') as csv_file:
            csv_write = csv.writer(csv_file, dialect='excel')
            for anchor in anchors:
                name = anchor.text
                # Named chart_id so the builtin id() is not shadowed.
                chart_id = anchor.get('href').split("=")[1]
                ranking_list.append({'r_name': name, 'r_id': chart_id})
                csv_write.writerow([name, chart_id])

        return ranking_list

    """歌单列表"""
    def wygd(self):
        """Scrape the playlist index page.

        Fetches ``<con_url>discover/playlist`` through a validated proxy and
        pairs each ``.dec a`` link with the corresponding
        ``#m-pl-container li`` item. One ``[title, id]`` row per playlist is
        appended to ``./wyy/歌单1.csv``.

        Returns:
            list[dict]: one ``{'r_name': <title>, 'r_id': <id>}`` per
            playlist, where the id is the part of the link's href after the
            first ``=``.
        """
        proxy = self.call_proc()
        headers = self.get_header()

        # The address shown in the browser is https://music.163.com/#/discover/playlist;
        # the '#' has to be dropped to request the real page.
        play_url = self.con_url + 'discover/playlist'
        response = self.s.get(play_url, headers=headers, proxies=proxy).text
        soup = BeautifulSoup(response, 'html.parser')
        links = soup.select('.dec a')
        items = soup.select('#m-pl-container li')

        gd_list = []
        # `with` closes (and flushes) the CSV file even if a row fails.
        with open('./wyy/歌单1' + '.csv', 'a', newline='', encoding='utf_8_sig') as csv_file:
            csv_write = csv.writer(csv_file, dialect='excel')
            # zip() stops at the shorter sequence, so a page with fewer links
            # than <li> items no longer raises IndexError. The play count
            # ('.nb') and creator name inside each <li> are not collected.
            for link, _item in zip(links, items):
                playlist_id = link['href'].split("=")[1]
                # csv.writer quotes embedded commas, so the title is written as-is.
                title = link['title']
                gd_list.append({'r_name': title, 'r_id': playlist_id})
                csv_write.writerow([title, playlist_id])

        return gd_list


    """爬取单个歌单中的音乐"""
    def wygid(self):
        """Collect the songs of every chart and playlist.

        Calls wybd() and wygd(), then requests
        ``<con_url>playlist?id=<r_id>`` for each entry and reads the links
        inside ``<ul class="f-hide">``. One ``[name, id]`` row per song is
        appended to ``./wyy/单曲1.csv``.

        Returns:
            list[list[dict]]: one inner list per chart/playlist (possibly
            empty or partial if that page failed), each holding
            ``{'name': <link text>, 'g_id': <id>}`` dicts.
        """
        proxy = self.call_proc()
        headers = self.get_header()
        # Charts first, then playlists (same call order as before).
        hb_list = self.wybd() + self.wygd()
        h_list = []

        # `with` closes (and flushes) the CSV file even if a page fails.
        with open('./wyy/单曲1' + '.csv', 'a', newline='', encoding='utf_8_sig') as csv_file:
            csv_write = csv.writer(csv_file, dialect='excel')

            for r in hb_list:
                play_url = self.con_url + 'playlist?id={}'.format(r['r_id'])
                gd_list = []

                try:
                    response = self.s.get(play_url, headers=headers, proxies=proxy).content
                    soup = BeautifulSoup(response, 'lxml')
                    track_list = soup.find('ul', {'class': 'f-hide'})
                    if track_list is None:
                        # The page loaded but has no song list; report that
                        # instead of failing with an AttributeError.
                        logging.warning('请求失败: %s (no ul.f-hide in page)', play_url)
                    else:
                        for music in track_list.find_all('a'):
                            gd_dict = {
                                'name': music.text,
                                'g_id': music['href'].split("=")[1],
                            }
                            gd_list.append(gd_dict)
                            csv_write.writerow([gd_dict['name'], gd_dict['g_id']])
                except Exception:
                    # Best effort per playlist: keep whatever was collected and
                    # move on. `Exception` (not a bare except) lets Ctrl-C
                    # through, and the traceback is logged at a level that is
                    # visible with the default logging configuration.
                    logging.exception('请求失败: %s', play_url)

                h_list.append(gd_list)

        return h_list

    """热评"""
    def wypl(self):
        """Download the hot comments of every song found by wygid().

        For each song a header row (song name) and a column-title row are
        appended to ``./wyy/云音乐热评1.csv``, followed by one
        ``[content, nickname, likedCount]`` row per hot comment. If fetching
        the comments fails, a fresh proxy is obtained and the request is
        retried once; if the retry also fails, ``请求失败`` is printed and
        the song is skipped.

        Returns:
            list[dict]: every hot-comment object received, in download order.
        """
        proxy = self.call_proc()
        headers = self.get_header()
        result = self.wygid()

        data_pl = []
        # `with` closes (and flushes) the CSV file even if a song fails.
        with open('./wyy/云音乐热评1' + '.csv', 'a', newline='', encoding='utf_8_sig') as csv_file:
            csv_write = csv.writer(csv_file, dialect='excel')
            for songs in result:
                for m in songs:
                    print('歌曲:', m)
                    csv_write.writerow(['歌曲名', m['name']])
                    csv_write.writerow(['评论', '用户名', '点赞数'])
                    try:
                        comments = self._hot_comments(m['g_id'], headers, proxy)
                    except Exception:
                        # First attempt failed: switch proxy and retry once.
                        proxy = self.call_proc()
                        try:
                            comments = self._hot_comments(m['g_id'], headers, proxy)
                        except Exception:
                            # A failed retry skips this song instead of
                            # aborting the whole crawl.
                            print('请求失败')
                            continue
                    for comment in comments:
                        data_pl.append(comment)
                        csv_write.writerow([
                            comment['content'].strip(),
                            comment['user']['nickname'],
                            str(comment['likedCount']),
                        ])
        return data_pl

    def _hot_comments(self, song_id, headers, proxy):
        """POST to the comment endpoint for one song and return its ``hotComments`` list.

        Raises whatever the request, the status check, JSON decoding or the
        ``hotComments`` lookup raises; the caller decides whether to retry.
        """
        response = requests.post(
            'https://music.163.com/weapi/v1/resource/comments/R_SO_4_{}'.format(song_id),
            headers=headers,
            params=self.params,
            cookies=self.cookies,
            data=self.data,
            proxies=proxy,
        )
        # Treat HTTP error statuses as failures on every attempt.
        response.raise_for_status()
        return response.json()['hotComments']

    def main(self):


        header = self.get_header()
        print('开始获取headers')
        print("headers: <%s>" %header)

        proc = self.call_proc()
        print('开始获取proc')
        print("proc: <%s>" %proc)

        # wybd = self.wybd()
        # print('开始获取榜单')
        # print("wybd: <%s>" % wybd)
        #
        # wygd = self.wygd()
        # print('开始获取歌单列表')
        # print("wygd: <%s>" % wygd)

        wypl = self.wypl()
        print('开始获取评论')
        print("wypl: <%s>" % wypl)

        # wygid = self.wygid()
        # print('开始歌曲id')
        # print("wybd: <%s>" % wygid)








if __name__ == '__main__':
    # Script entry point: generate a random User-Agent, build the scraper
    # with it and start the crawl.
    ua = get_ua()
    wy = WangYiyun(ua)
    wy.main()

你可能感兴趣的:(python爬取网易云音乐排行榜歌单热评(完整版))