Python crawler: scraping NetEase Cloud Music artist information

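# Crawl flow: the artist discovery page lists area (language/region) links,
# each area page lists initial-letter links, and each initial-letter page
# lists the singers: main() -> parse_area() -> parse_singer().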
import requests
from lxml import etree


def get_text(list_):
    # Return the first element of an XPath result list, or '' if it is empty.
    if list_:
        return list_[0]
    return ''


def get_xpath(url):
    # Request the page with a browser User-Agent and return an lxml element tree.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    return etree.HTML(response.text)


def parse_singer(url):
    """Parse one listing page: collect every singer's name and detail-page URL."""
    # Request the listing page and get its element tree.
    html = get_xpath(url)
    singer_li_list = html.xpath('//ul[@id="m-artist-box"]/li')
    for li in singer_li_list:
        name = get_text(li.xpath('.//p/a/text()|.//a[1]/text()'))
        detail_url = get_text(li.xpath('.//p/a/@href|.//a[1]/@href'))
        item = {}
        item['name'] = name
        item['detail_url'] = detail_url
        # The artist bio lives on the detail page; see the sketch after the script.
        print(item)


def parse_area(url):
    html = get_xpath(url)
    # Get the initial-letter filter links (the first entry in the selector is skipped).
    singer_words = html.xpath('//ul[@id="initial-selector"]/li[position()>1]/a/@href')
    for word in singer_words:
        word_url = 'https://music.163.com' + word
        parse_singer(word_url)


def main():
    # Get the area (language/region) links from the artist discovery page.
    base_url = 'https://music.163.com/discover/artist'
    html = get_xpath(base_url)
    area_list = html.xpath('//div[@class="blk"]/ul/li/a/@href')
    for area in area_list:
        area_url = 'https://music.163.com' + area
        parse_area(area_url)

if __name__ == '__main__':
    main()
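
The script prints each singer's name and detail URL, but the artist-bio step is left open and nothing is saved. Below is a minimal sketch of one way to finish it, reusing get_xpath and get_text from the script above; parse_singer_detail, the '/artist/desc' URL pattern, the 'n-artdesc' XPath, and the singers.json filename are illustrative assumptions, not verified against the live page.

import json

def parse_singer_detail(detail_url):
    # Hypothetical helper: fetch the artist description page and return a short bio.
    # The URL pattern and the XPath are assumptions about the page layout.
    desc_url = 'https://music.163.com' + detail_url.replace('/artist', '/artist/desc')
    html = get_xpath(desc_url)
    return get_text(html.xpath('//div[@class="n-artdesc"]/p/text()'))

def save_items(items, path='singers.json'):
    # Persist the collected dicts as UTF-8 JSON so Chinese names stay readable.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(items, f, ensure_ascii=False, indent=2)

With these helpers, parse_singer could append each item (plus, say, item['brief'] = parse_singer_detail(detail_url)) to a shared list instead of printing it, and main() could call save_items on that list at the end; a short time.sleep between requests also keeps the crawl polite.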
