爬虫实战-酷狗音乐数据抓取--XPath,Pyquery,Beautifulsoup数据提取对比实战

网站:

http://www.kugou.com/yy/html/rank.html

爬取目标:

酷狗飙升榜的歌手、歌曲名字、歌曲链接等内容,存到 MongoDB 数据库中

网页解析:

爬虫实战-酷狗音乐数据抓取--XPath,Pyquery,Beautifulsoup数据提取对比实战_第1张图片

此次爬取采用三种解析方式:

代码如下:

import requests
from lxml import etree
import pymongo
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup

def get_info():
    """Fetch the Kugou rank page and return its HTML text.

    Returns:
        str | None: the page HTML on HTTP 200, otherwise None
        (including on any network error).
    """
    url = 'http://www.kugou.com/yy/html/rank.html'

    try:
        # timeout prevents the request from hanging forever on a stalled
        # connection; the original call had no timeout at all
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    # RequestException covers ConnectionError, Timeout and HTTPError alike;
    # the original caught only ConnectionError, letting timeouts propagate
    except requests.RequestException:
        return None

def get_detail_info(response):
    """Parse the rank page with lxml XPath.

    Args:
        response: raw HTML text of the rank page.

    Returns:
        list[dict]: one dict per song with keys msg_star (artist - title,
        from the <li> title attribute), msg_address (rank-tip title) and
        msg_lianjie (song link href).
    """
    tree = etree.HTML(response)
    # one <li> per song row; the class value on the page really does
    # contain two spaces, and @class= in XPath is an exact string match
    rows = tree.xpath('//div[@id="rankWrap"]/div[@class="pc_temp_songlist  pc_rank_songlist_short"]/ul/li')
    return [
        {
            "msg_star": row.xpath('./@title')[0],
            'msg_address': row.xpath('./span[@class="pc_temp_tips_l"]/i/@title')[0],
            'msg_lianjie': row.xpath('./a/@href')[0],
        }
        for row in rows
    ]

def get_detail_info_css(response):
    """Parse the rank page with pyquery CSS selectors.

    Args:
        response: raw HTML text of the rank page.

    Returns:
        list[dict]: one dict per song with keys msg_star, msg_address
        and msg_lianjie (same shape as get_detail_info).
    """
    doc = pq(response)
    # the find() result is already a PyQuery object, so the original
    # re-wrap `doc(lis)` and debug print were redundant and are dropped
    lis = doc('#rankWrap').find('ul').find('li')
    list1 = []
    for msg in lis.items():
        # <li title="artist - song">
        msg_star = msg.attr.title
        # rank tip text lives on the <i> inside the tips span
        msg_address = msg.children('.pc_temp_tips_l').find('i').attr.title
        msg_lianjie = msg.find('a').attr.href

        list1.append({
            "msg_star": msg_star,
            'msg_address': msg_address,
            'msg_lianjie': msg_lianjie,
        })
    return list1


def get_detail_info_xml(response):
    """Parse the rank page with BeautifulSoup (lxml backend).

    Args:
        response: raw HTML text of the rank page.

    Returns:
        list[dict]: one dict per song with keys msg_star, msg_address
        and msg_lianjie (same shape as get_detail_info).
    """
    list1 = []
    soup = BeautifulSoup(response, 'lxml')
    # class_ matches class tokens individually, so single spacing works
    # here even though the raw attribute has a double space
    info = soup.find(class_='pc_temp_songlist pc_rank_songlist_short').ul
    for msg in info.select('li'):
        msg_star = msg.attrs['title']
        msg_address = msg.find(class_='pc_temp_tips_l').i.attrs['title']
        msg_lianjie = msg.a.attrs['href']
        # (debug print of each link removed)
        list1.append({
            "msg_star": msg_star,
            'msg_address': msg_address,
            'msg_lianjie': msg_lianjie,
        })

    return list1



def db(list1):
    """Insert song dicts into MongoDB (database `test`, collection `music`).

    Args:
        list1: list of song dicts as produced by the get_detail_info_* parsers.
    """
    client = pymongo.MongoClient(host='localhost', port=27017)
    try:
        collection = client.test.music
        for music_info in list1:
            print(music_info)
            # insert_one replaces Collection.insert, which was deprecated
            # in PyMongo 3.x and removed entirely in PyMongo 4
            result = collection.insert_one(music_info)
            print(result.inserted_id)
    finally:
        # release the connection pool even if an insert fails
        client.close()


def main():
    """Fetch the rank page, extract song data, and store it in MongoDB."""
    # fetch the page HTML (None on network failure)
    response = get_info()
    if response is None:
        # nothing to parse — the original would have crashed inside the parser
        return

    # bs4 extraction; get_detail_info (XPath) and get_detail_info_css
    # (pyquery) are interchangeable alternatives.
    # The original parsed the page twice here — once is enough.
    list1 = get_detail_info_xml(response)
    db(list1)


if __name__ == '__main__':
    main()

 

 

 

 

你可能感兴趣的:(爬虫)