Python 利用 Selenium 爬取网易云音乐

这里爬取的是网易云音乐的歌单,大约 5 万个。每个歌单还可以继续爬取其对应的歌曲(暂未实现)。


from selenium import webdriver
import time

class WY:
    """Crawler for the NetEase Cloud Music playlist listing, driven by Selenium.

    The crawler opens the playlist discovery page in Chrome, collects the
    category links, walks the result pages of each category and appends one
    line per matched link to a comma-separated text file.
    """

    # Default file that save_1 appends to when no explicit path is given.
    OUTPUT_PATH = "E://a.csv"

    # Every element lookup happens after switching into this frame.
    _CONTENT_FRAME = "contentFrame"

    # Locator strategy handed to find_elements(); this is the string value of
    # selenium's By.XPATH, spelled out so that no extra import is required.
    _BY_XPATH = "xpath"

    _WRAP_XPATH = "./html/body/div[@class='g-bd']/div[@class='g-wrap p-pl f-pr']"
    _CATEGORY_XPATH = (
        _WRAP_XPATH + "/div[@class='n-sltlyr d-flag ']/div[@class='bd']/dl/dd/a"
    )
    _ENTRY_XPATH = "./html/body/div/div/ul/li/p/a"
    _NEXT_XPATH = _WRAP_XPATH + "/div/div/a[@class='zbtn znxt']"

    def __init__(self):
        """Remember the listing URL and start a Chrome WebDriver session."""
        self.url = "https://music.163.com/#/discover/playlist/"
        self.driver = webdriver.Chrome()

    def _open(self, url):
        """Load *url* in the browser and switch into the content frame."""
        self.driver.get(url)
        # `switch_to.frame` is the supported spelling; the older
        # `switch_to_frame` alias no longer exists in Selenium 4.
        self.driver.switch_to.frame(self._CONTENT_FRAME)

    def _find_all(self, xpath):
        """Return every element matching *xpath* (an empty list if none)."""
        return self.driver.find_elements(self._BY_XPATH, xpath)

    def all_url_1(self):
        """Return the category links found on the playlist discovery page.

        Returns:
            A list of dicts with the keys ``name`` (the link's ``data-cat``
            attribute) and ``url`` (the link's ``href`` attribute).
        """
        self._open(self.url)
        return [
            {
                "name": link.get_attribute("data-cat"),
                "url": link.get_attribute("href"),
            }
            for link in self._find_all(self._CATEGORY_XPATH)
        ]

    def _collect_entries(self):
        """Build one record per listing link on the currently loaded page."""
        return [
            {
                "name": link.get_attribute("title"),
                "url": link.get_attribute("href"),
                "anchor": link.text,
            }
            for link in self._find_all(self._ENTRY_XPATH)
        ]

    def _next_page_url(self):
        """Return the href of the enabled "next page" button, or None."""
        buttons = self._find_all(self._NEXT_XPATH)
        if not buttons:
            # No enabled button matched: this was the last (or only) page.
            return None
        href = buttons[0].get_attribute("href")
        # Defensive: a href that is missing or not navigable also ends paging.
        if not href or href.startswith("javascript:"):
            return None
        return href

    def page_all(self, url):
        """Save every listing link of one category, following pagination.

        The first page and all following pages are scraped the same way, so
        every record carries the keys ``name``, ``url`` and ``anchor``.
        A category with a single page ends normally instead of raising.

        Args:
            url: Address of the first result page of the category.

        Returns:
            The list of records scraped from the first page only.
        """
        self._open(url)
        first_page = self._collect_entries()
        print("lis_5的列表中有{}个大范围".format(len(first_page)))
        for entry in first_page:
            self.save_1(entry)
        time.sleep(1)
        next_url = self._next_page_url()

        page_no = 2
        while next_url:
            try:
                self._open(next_url)
                for entry in self._collect_entries():
                    self.save_1(entry)
                print("下载第{}页".format(page_no))
                page_no += 1
                next_url = self._next_page_url()
            except Exception as exc:
                # Best effort: a failing page ends this category, but the
                # reason is reported and Ctrl+C is no longer swallowed.
                print("page {} of {} failed: {!r}".format(page_no, url, exc))
                break
        return first_page

    def save_1(self, con, path=None):
        """Append one record to the output file.

        The line layout is ``name``, five commas, ``url``, six commas,
        ``anchor`` and a newline. A missing or ``None`` ``name``/``anchor``
        is written as an empty field instead of dropping the whole row.

        Args:
            con: Record dict with ``name``, ``url`` and optionally ``anchor``.
            path: File to append to; defaults to ``OUTPUT_PATH``.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        target = self.OUTPUT_PATH if path is None else path
        name = con.get('name') or ''
        anchor = con.get('anchor') or ''
        line = name + "," * 5 + str(con.get('url')) + "," * 6 + anchor + '\n'
        # UTF-8 keeps names that the platform's default codec cannot encode;
        # the context manager closes the file even if the write fails.
        with open(target, 'a', encoding='utf-8') as file1:
            file1.write(line)

    def run(self):
        """Crawl every category in turn, pausing briefly between categories."""
        for category in self.all_url_1():
            self.page_all(category["url"])
            time.sleep(0.5)

if __name__ == '__main__':
    # Script entry point: start the crawler and run it over every category.
    a = WY()
    try:
        a.run()
    finally:
        # End the WebDriver session even when the crawl fails or is
        # interrupted, so the browser is not left running.
        a.driver.quit()

python利用selenium爬取网易云音乐_第1张图片

你可能感兴趣的:(上大,python,selenium,html,爬虫,搜索引擎,java爬虫程序,爬虫搜索,关键字搜索)