大众点评店铺评论信息获取

import sys
import os
import re
import requests
from pyquery import PyQuery as pq


# User-Agent strings shared by the header sets below.
_MOBILE_UA = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Mobile Safari/537.36'
_DESKTOP_UA = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'

# Minimal header set for m.dianping.com, sent with a mobile User-Agent.
headers = {
    "Host": 'm.dianping.com',
    'Accept-Encoding': 'gzip',
    'User-Agent': _MOBILE_UA,
}

# Fuller, browser-like header set for m.dianping.com.
LIST_HEADER = {
    "Host": 'm.dianping.com',
    "Connection": 'keep-alive',
    "Cache-Control": 'max-age=0',
    "Upgrade-Insecure-Requests": '1',
    "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    "Accept-Language": 'zh-CN,zh;q=0.8',
    'User-Agent': _MOBILE_UA,
}

# Header set for www.dianping.com, sent with a desktop User-Agent.
header_pinlun = {
    'Host': 'www.dianping.com',
    'Accept-Encoding': 'gzip',
    'User-Agent': _DESKTOP_UA,
}

# Header set for s3plus.meituan.net, sent with a desktop User-Agent.
header_css = {
    'Host': 's3plus.meituan.net',
    'Accept-Encoding': 'gzip',
    'User-Agent': _DESKTOP_UA,
}


# 0-详情页
def get_msg(shop_id="96658933", timeout=10):
    """Fetch one shop's review page and print every review found on it.

    Requests ``http://www.dianping.com/shop/<shop_id>/review_all`` with
    ``header_pinlun``, asks ``css_get`` for the two lookup tables needed to
    undo the obfuscated review font, then prints the user name, profile
    link, star rating, score summary, recommended dishes, timestamp and
    decoded text of each review.

    :param shop_id: shop identifier inserted into the URL; the default is
        the shop that used to be hard-coded here.
    :param timeout: seconds to wait for the HTTP response before
        ``requests`` raises a timeout error.
    :return: None; results are written to stdout.
    """
    # Original author's notes: "/shop/<id>" carries the per-capita price and
    # review count, and "http://m.dianping.com/shop/<id>/map" carries the
    # coordinates. Neither is fetched here.
    url = "http://www.dianping.com/shop/{}/review_all".format(shop_id)
    html = requests.get(url, headers=header_pinlun, timeout=timeout)
    print("1 ===> STATUS", html.status_code)
    doc = pq(html.text)
    # One <li> per review.
    pinglunLi = doc("div.reviews-items > ul > li").items()
    # css_get returns (svg glyph table, css class -> offsets table); both are
    # handed to css_decode for every review below.
    dict_svg_text, dict_css_x_y = css_get(doc)

    for data in pinglunLi:
        author = data("div.main-review > div.dper-info > a")
        userName = author.text()
        # A reviewer without a profile link has no href; report an empty
        # link instead of failing on str + None.
        href = author.attr("href")
        userID = "http://www.dianping.com" + href if href is not None else ""
        # The rating is encoded in the second class token of the span
        # (original note: values range 10-50); tolerate a missing token.
        rank_classes = str(data("div.review-rank > span").attr("class")).split(" ")
        if len(rank_classes) > 1:
            startShop = rank_classes[1].replace("sml-str", "")
        else:
            startShop = ""
        # Score summary text shown next to the stars.
        describeShop = data("div.review-rank > span.score").text()
        # Prefer the collapsed ("Hide") variant of the review body, which the
        # original author notes holds the complete text; fall back to the
        # visible variant when the collapsed one is absent.
        pinglun = data("div.review-words.Hide").html()
        if pinglun is None:
            pinglun = data("div.review-words").html()
        # Dishes the reviewer recommends.
        loveFood = data("div.main-review > div.review-recommend").text()
        # Time the review was posted.
        pinglunTime = data("div.main-review > div.misc-info.clearfix > span.time").text()
        print("userName:", userName)
        print("userID:", userID)
        print("startShop:", startShop)
        print("describeShop:", describeShop)
        print("loveFood:", loveFood)
        print("pinglunTime:", pinglunTime)
        print("pinglun:", css_decode(dict_css_x_y, dict_svg_text, pinglun))
        print("*"*100)


# 1-评论隐含部分字体css样式, 获取svg链接,获取加密汉字background
def css_get(doc):
    """Download the page's stylesheet and build both font lookup tables.

    :param doc: parsed review page; the stylesheet link is read from
        ``head > link:nth-child(11)``.
    :return: tuple ``(svg_table, css_table)`` where ``svg_table`` is the
        result of ``svg_text`` for the chosen SVG and ``css_table`` is the
        result of ``css_dict`` for the stylesheet text.
    """
    stylesheet_url = "http:" + doc("head > link:nth-child(11)").attr("href")
    stylesheet = requests.get(stylesheet_url, headers=header_css)
    # The parentheses in the pattern are a capture group, so every hit is the
    # raw text between "url" and ";". Original author's note: several URLs
    # match and the one with the most text is wanted; find_font_svg_link
    # makes that choice.
    svg_candidates = re.findall(
        r'background-image: url(.*?);', stylesheet.text, re.I
    )
    chosen_svg = find_font_svg_link(svg_candidates)
    # Glyph table parsed out of the chosen SVG file.
    svg_table = svg_text(chosen_svg)
    # Class-name -> background offsets table parsed from the stylesheet.
    css_table = css_dict(stylesheet.text)
    return svg_table, css_table


def find_font_svg_link(svg_links):
    """Return the first candidate SVG whose declared height exceeds 2000px.

    Each candidate is turned into a URL by dropping ")" and replacing "("
    with "http:", then downloaded. If no candidate qualifies, the last URL
    tried is returned; an empty input yields ``None``.

    :param svg_links: iterable of raw ``(//host/path.svg)``-style strings.
    :return: the selected URL, or ``None`` when ``svg_links`` is empty.
    """
    candidate = None
    for raw_link in svg_links:
        candidate = raw_link.replace(")", "").replace("(", "http:")
        body = requests.get(candidate).text
        heights = re.findall('height="(.*?)px"', body)
        # Only the first declared height is inspected.
        if heights and float(heights[0]) > 2000:
            break
    return candidate


# 2-字体库链接
def svg_text(url):
    """Download the SVG at ``url`` and return its glyph table.

    :param url: address of the SVG file.
    :return: whatever ``svg_dict`` produces for the downloaded text.
    """
    return svg_dict(requests.get(url).text)


# 3-生成svg字库字典
def svg_dict(csv_html):
    """Build the glyph table of the font SVG, keyed by row y-coordinate.

    Two layouts are handled:

    * rows written as ``<text x=".." y="Y">glyphs</text>``;
    * rows written as ``<textPath xlink:href="#ID" ..>glyphs</textPath>``
      whose y-coordinate lives in a separate ``<path id="ID" d="M0 Y H..">``.

    NOTE(review): the patterns in the published source had degraded to
    ``r'(.*?)'`` / ``r''``, which made every call fail with ``IndexError``.
    They were reconstructed from the group indices the code relied on;
    confirm them against a live SVG file.

    :param csv_html: text of the SVG file.
    :return: dict mapping ``int`` y-coordinate to the list of characters in
        that row; empty when neither layout is found.
    """
    glyph_rows = {}

    # Layout 1: every <text> element carries its own y attribute.
    plain_rows = re.findall(
        r'<text x="[^"]*" y="(\d+)"[^>]*>(.*?)</text>', csv_html
    )
    if plain_rows:
        for y, chars in plain_rows:
            glyph_rows[int(y)] = list(chars)
        return glyph_rows

    # Layout 2: glyph rows reference a <path> by id, and the path's "d"
    # attribute holds the y-coordinate as its second token.
    chars_by_path = {}
    text_paths = re.findall(
        r'<textPath xlink:href="#(\d+)" textLength="[^"]*">(.*?)</textPath>',
        csv_html,
    )
    for path_id, chars in text_paths:
        chars_by_path[int(path_id)] = list(chars)

    paths = re.findall(r'<path id="(\d+)" d="\S+ (\d+) [^"]*"\s*/?>', csv_html)
    for path_id, y in paths:
        # Paths without a matching textPath row are ignored.
        if int(path_id) in chars_by_path:
            glyph_rows[int(y)] = chars_by_path[int(path_id)]
    return glyph_rows

# 4-生成css字库字典
def css_dict(html, font_width=14):
    """Map each glyph CSS class to its position in the font SVG.

    Rules of the form ``.gqi4j{background:-98.0px -745.0px;}`` are
    collected. The horizontal offset is divided by the glyph width to get
    the character's index within its SVG row; the vertical offset is only
    sign-flipped and is left for the caller to match against the SVG rows.

    The selector is anchored on a literal dot followed by a plain class
    name. The previous pattern used an unescaped ``.`` and a lazy wildcard,
    so the first class after any unrelated rule absorbed that rule's text
    into its key.

    :param html: text of the stylesheet.
    :param font_width: width in px of one glyph cell; 14 matches the value
        previously hard-coded.
    :return: dict ``{class_name: (column_index, y_offset)}`` with both
        values as non-negated ``int``.
    """
    rule_re = re.compile(
        r'\.([\w-]+)\s*\{\s*background:\s*'
        r'(-?\d+(?:\.\d+)?)px\s+(-?\d+(?:\.\d+)?)px;?\s*\}'
    )
    dict_css = {}
    for class_name, x_px, y_px in rule_re.findall(html):
        # Offsets are negative in the stylesheet; dividing by the negated
        # width yields a non-negative column index.
        column = int(float(x_px) / -font_width)
        dict_css[class_name] = (column, int(float(y_px) * -1))
    return dict_css


# 5-最终评论汇总
def css_decode(decode_css, decode_svg, pinglun_html):
    """
    :param css_html: css 的HTML源码
    :param svg_dict: svg加密字库的字典
    :param svg_list: svg加密字库对应的坐标数组[x, y]
    :param pinglun_html: 评论的HTML源码,对应0-详情页的评论,在此处理
    :return: 最终合成的评论
    """
    css_dict_text = decode_css
    svg_dict_text = decode_svg
    pinglun_text = re.sub('|">|
||

你可能感兴趣的:(python,爬虫)