大众点评爬虫

## 大众点评评论爬虫脚本使用指南

### 爬取前的准备

- 安装MySQL数据库并启动服务
- 修改mysqls.py中数据库的用户名、密码等连接信息,并创建对应的database和table(建表可以直接调用mysqls.creat_table()函数)
- 登录大众点评官网,通过谷歌开发者工具等获取到当前的cookie,修改main.py中的cookie变量
- 查看爬取的店铺的店铺ID以及评论的页数,修改main.py 中对应的位置
- 如果有xuchuan.txt(保存当前点评爬取进度),请在爬取前删除(每换一个店铺要删除一次)

### 爬取过程中

- 大约每爬取100页就需要进行一次人工验证:当发现获取到的评论为0条或者出现异常时,请用浏览器打开点评页面,滑动滑块解锁,然后重启程序(程序支持断点续传,会从上次成功的页码继续,不虚~)
- 当更换店铺时,需要把店铺ID进行替换,还有评论的页数也要替换,还要删除xuchuan.txt

### 爬取结束后

数据存储于MYSQL数据库中,可以使用各种方法读取,啦啦啦啦~

 

1.安装MySQL和Navicat,创建数据库dianping。

2.封装操作数据库工具类

# -*- coding: utf-8 -*-
"""
Created on Tue Jul 24 15:45:05 2018

@author: bin
"""

import pymysql

# Connect to the MySQL database.
# Keyword arguments are used because PyMySQL 1.0+ no longer accepts
# positional connection parameters.
db = pymysql.connect(host="localhost", user="root", password="root", database="dianping")
cursor = db.cursor()

# Create the table that stores the crawled reviews.
def creat_table():
    """Drop any existing DZDP table and create it again from scratch.

    Both statements are executed on the module-level ``cursor``.
    """
    create_stmt = '''CREATE TABLE DZDP(
            cus_id varchar(100),
            comment_time varchar(55),
            comment_star varchar(55),
            cus_comment text(5000),
            kouwei varchar(55),
            huanjing varchar(55),
            fuwu varchar(55),
            shopID varchar(55)
            );'''
    # Order matters: the old table has to be gone before it is re-created.
    for statement in ("DROP TABLE IF EXISTS DZDP", create_stmt):
        cursor.execute(statement)

# Store one crawled review.
def save_data(data_dict):
    """Insert one review into the DZDP table and commit it.

    Args:
        data_dict: mapping that provides the keys ``cus_id``,
            ``comment_time``, ``comment_star``, ``cus_comment``, ``kouwei``,
            ``huanjing``, ``fuwu`` and ``shopID``.

    Raises:
        KeyError: if ``data_dict`` lacks one of the keys listed above.

    Database failures are reported on stdout and the pending transaction is
    rolled back; they are not re-raised, so one bad row does not stop a crawl.
    """
    columns = ('cus_id', 'comment_time', 'comment_star', 'cus_comment',
               'kouwei', 'huanjing', 'fuwu', 'shopID')
    sql = '''INSERT INTO DZDP(cus_id,comment_time,comment_star,cus_comment,kouwei,huanjing,fuwu,shopID) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)'''
    value_tup = tuple(data_dict[name] for name in columns)
    try:
        cursor.execute(sql, value_tup)
        db.commit()
    except Exception as exc:
        # ``Exception`` (not a bare except) so Ctrl-C still interrupts the crawl.
        print('数据库写入失败', exc)
        try:
            db.rollback()
        except Exception:
            # The connection itself is unusable; there is nothing left to undo.
            pass
    return

# Close the database.
def close_sql():
    """Close the module-level cursor and then the connection.

    The connection is closed even if closing the cursor raises.
    """
    try:
        cursor.close()
    finally:
        db.close()

3.创建表

# Drops any existing DZDP table and recreates it, so existing rows are lost;
# run this once before the first crawl.
creat_table()

4.在当前python文件所在的目录中创建proxies.txt并写入代理池(每行一个代理);也可以自己抓取代理来生成该文件,具体做法可参考CRAW_IP.py

 

CRAW_IP.py:

import requests

from bs4 import BeautifulSoup

import lxml

from multiprocessing import Process, Queue

import random

import json

import time

import requests


class Proxies(object):
    """Scrape free proxies from xicidaili.com and keep the ones that respond.

    Constructing an instance immediately scrapes both the 'nt' and the 'nn'
    listings. Call ``verify_proxies()`` afterwards to replace ``proxies``
    with the subset that passed a live check.

    Attributes:
        proxies: proxy URLs of the form ``'<protocol>://<ip>:<port>'``.
        verify_pro: not used by this class; kept for backward compatibility.
        page: number of consecutive listing pages scraped per listing.
        headers: HTTP headers sent with the listing-page requests.
    """

    def __init__(self, page=3):
        self.proxies = []
        self.verify_pro = []
        self.page = page
        self.headers = {
            'Accept': '*/*',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        self.get_proxies()
        self.get_proxies_nn()

    def get_proxies(self):
        """Append proxies scraped from the 'nt' listing to ``self.proxies``."""
        self._scrape_listing('nt')

    def get_proxies_nn(self):
        """Append proxies scraped from the 'nn' listing to ``self.proxies``."""
        self._scrape_listing('nn')

    def _scrape_listing(self, kind):
        """Scrape ``self.page`` consecutive pages of one listing.

        The first page number is chosen at random between 1 and 10. A page
        that cannot be fetched is reported and skipped.
        """
        first_page = random.randint(1, 10)
        for page in range(first_page, first_page + self.page):
            url = 'http://www.xicidaili.com/%s/%d' % (kind, page)
            try:
                # A timeout keeps one dead connection from hanging the scrape.
                response = requests.get(url, headers=self.headers, timeout=10)
            except requests.RequestException as exc:
                print('fetch %s failed: %s' % (url, exc))
                continue
            soup = BeautifulSoup(response.content, 'lxml')
            self.proxies.extend(self._extract_proxies(soup.find(id='ip_list')))

    @staticmethod
    def _extract_proxies(ip_list):
        """Return the proxy URLs found in the ``ip_list`` table element.

        ``ip_list`` may be ``None`` (the page had no such table, e.g. a block
        page); an empty list is returned in that case.
        """
        if ip_list is None:
            return []
        found = []
        for row in ip_list.find_all(class_='odd'):
            cells = row.find_all('td')
            if len(cells) < 6:
                # Not a data row in the expected layout.
                continue
            # Cell 5 holds the protocol, cells 1 and 2 the ip and the port.
            protocol = cells[5].get_text().lower() + '://'
            found.append(protocol + ':'.join(cell.get_text() for cell in cells[1:3]))
        return found

    def verify_proxies(self):
        """Check every proxy in parallel and keep only the working ones.

        Up to 15 worker processes run ``verify_one_proxy``. ``self.proxies``
        is replaced by the proxies that passed.
        """
        # multiprocessing.Queue.get() raises queue.Empty on timeout.
        from queue import Empty

        print('verify proxy........')
        if not self.proxies:
            # Nothing to check: do not spawn any worker.
            print('verify_proxies done!')
            return

        # Proxies waiting to be checked.
        old_queue = Queue()
        # Proxies that passed the check.
        new_queue = Queue()

        worker_count = min(15, len(self.proxies))
        works = [Process(target=self.verify_one_proxy, args=(old_queue, new_queue))
                 for _ in range(worker_count)]
        for work in works:
            work.start()
        for proxy in self.proxies:
            old_queue.put(proxy)
        # One stop marker per worker.
        for _ in works:
            old_queue.put(0)

        # Drain the results BEFORE joining: a process that has put items on a
        # queue does not terminate until they are flushed, so joining first
        # can deadlock.
        verified = []
        while True:
            try:
                verified.append(new_queue.get(timeout=1))
            except Empty:
                if not any(work.is_alive() for work in works):
                    break
        for work in works:
            work.join()
        # Pick up anything that arrived between the last timeout and the
        # liveness check above.
        while True:
            try:
                verified.append(new_queue.get(timeout=0.1))
            except Empty:
                break

        self.proxies = verified
        print('verify_proxies done!')

    def verify_one_proxy(self, old_queue, new_queue):
        """Worker loop: test proxies from ``old_queue`` until a ``0`` arrives.

        A proxy is put on ``new_queue`` when a request to Baidu sent through
        it returns HTTP 200 within 2 seconds.
        """
        while True:
            proxy = old_queue.get()
            if proxy == 0:
                break
            protocol = 'https' if proxy.startswith('https') else 'http'
            proxies = {protocol: proxy}
            # requests picks the proxy by the scheme of the target URL, so the
            # test URL must use the same scheme as the proxies key; otherwise
            # the request bypasses the proxy and always "succeeds".
            test_url = '%s://www.baidu.com' % protocol
            try:
                response = requests.get(test_url, proxies=proxies, timeout=2)
            except Exception:
                print('fail %s' % proxy)
                continue
            if response.status_code == 200:
                print('success %s' % proxy)
                new_queue.put(proxy)


if __name__ == '__main__':
    # Scrape the listings, keep the proxies that respond, and append them to
    # proxies.txt, one per line.
    pool = Proxies()
    pool.verify_proxies()
    print(pool.proxies)
    with open('proxies.txt', 'a') as out:
        out.writelines(entry + '\n' for entry in pool.proxies)




proxies.txt:

https://59.37.18.243:3128
https://183.129.207.74:14823
https://49.73.6.90:3128
https://115.239.255.190:3128
https://203.86.26.9:3128
https://120.92.74.189:3128
http://183.62.196.10:3128
https://183.129.244.17:10010
https://171.221.239.11:808
https://14.29.32.106:53281
https://218.60.8.83:3129
https://183.129.207.80:21776
https://203.130.46.108:9090
https://183.21.81.58:40539
https://182.18.13.149:53281
https://114.113.126.83:80
https://118.212.95.34:53281
https://114.113.126.82:80
https://183.129.207.78:18118
https://211.101.136.86:8080
https://114.249.112.16:9000
https://163.125.68.149:8888
https://111.202.37.195:8080
https://61.145.203.234:38695
https://119.254.94.92:48494
https://27.46.20.55:888
https://175.6.2.174:8088
https://59.72.126.3:8123
https://59.37.26.226:8080
https://120.27.14.125:80
https://61.140.108.57:54689
https://58.240.220.86:53281
https://183.30.201.8:9797
https://111.170.156.182:53281
https://218.15.25.157:8088
https://180.173.152.33:9000
https://117.35.51.77:53281
https://119.90.126.106:7777
https://121.228.125.27:3128
https://218.89.222.110:9999
https://61.155.112.228:61591
https://171.37.30.82:9797
https://125.123.122.59:9000
https://125.123.143.171:9000
https://60.191.57.79:3128
https://163.125.19.43:9999
https://112.65.19.122:8080
https://163.125.17.241:8888
https://163.125.17.238:8888
https://180.213.181.96:8118
https://114.86.227.164:33657
https://118.187.50.154:8080
https://118.190.217.182:80
https://118.190.217.61:80
http://183.129.244.13:10800
https://125.123.127.24:9000
https://124.237.83.14:53281
https://163.125.74.243:9797
https://61.175.172.216:8123
https://175.152.223.235:8123
https://123.165.115.55:9797
https://223.245.127.165:44765
https://59.78.1.5:1080
https://118.25.177.187:1080
https://59.39.196.122:55637
https://119.4.172.217:8118
https://116.30.123.148:9000
https://112.74.207.50:3128
https://14.149.68.120:1080
https://58.251.233.122:9797
https://182.88.187.149:9797
https://182.150.63.89:46073
https://163.125.70.70:9999
https://58.251.234.137:9797
https://101.132.122.230:3128
https://119.129.98.65:45522
https://112.81.143.172:8118
https://220.184.129.224:3128
https://112.250.109.173:53281
https://116.196.92.155:1080
https://14.20.235.117:808
https://182.88.187.83:9797
https://110.52.8.171:53281
https://159.226.170.42:3128
https://121.9.199.70:32431
https://113.118.201.133:9797
https://58.250.23.210:1080
https://119.250.26.39:9000
https://171.36.179.27:9797
https://175.25.185.57:3128
https://118.190.155.23:80
https://114.119.116.93:61066
https://171.36.210.248:9797
https://112.193.130.123:8118
https://123.183.11.166:53386
https://118.186.2.210:8080
https://112.64.38.161:51099
https://222.186.45.146:63756
https://183.14.76.165:9797
https://163.125.19.88:9999
https://218.6.16.233:8118
https://180.168.210.132:80
https://61.164.39.69:53281
https://61.130.9.249:3128
https://122.143.117.8:8080
https://180.162.34.149:9797
https://115.231.50.10:53281
https://112.95.205.63:8888
https://112.95.205.71:8888
https://115.151.4.6:53128
https://110.73.40.17:8123
https://121.207.0.115:808
https://118.180.85.201:8123
https://61.157.206.182:60460
https://124.200.104.234:47076
https://61.157.206.170:42379
https://221.234.192.10:8010
https://59.32.37.7:3128
https://1.183.163.137:53077
https://59.49.22.231:30151
https://27.22.104.28:39560
https://61.160.233.214:39522
https://59.32.37.246:8010
https://115.46.79.110:8123
https://110.73.10.53:8123
https://110.73.43.173:8123
https://183.63.17.253:54174
https://121.9.199.51:59134
https://123.163.20.37:35249
https://61.158.187.118:56524
https://61.157.206.187:37667
https://203.93.125.238:51108
https://223.203.0.14:8080
https://221.224.62.243:51941
https://114.225.169.161:53128
https://124.77.92.239:31307
https://27.153.128.207:8010
https://110.188.0.64:35137
https://115.238.105.108:808
https://61.133.245.70:35652
https://60.211.192.54:40700
https://171.37.155.232:8123
https://221.232.193.223:8010
https://27.190.26.57:8118
https://221.224.212.11:23500
https://180.118.240.51:61234
https://113.106.97.148:38257
https://119.97.23.87:8123
https://1.183.163.101:52524
https://61.157.206.172:59656
https://121.205.254.201:8010
https://61.157.206.178:34692
https://115.46.74.160:8123
https://120.5.162.224:32290
https://61.154.49.38:59675
https://61.160.233.215:48478
https://119.123.77.41:31425
https://114.225.170.217:53128
https://113.17.36.96:47399
https://114.112.70.150:57871
https://123.207.30.131:80
https://119.254.94.97:41697
https://115.46.73.129:8123
https://115.221.112.122:25903
https://115.211.231.66:8010
https://221.232.192.206:8010
https://182.88.166.78:8123
https://115.46.67.43:8123
https://121.205.254.192:808
https://175.148.73.231:1133
https://183.129.153.122:36839
https://139.196.111.17:42589
https://60.12.214.184:33507
https://117.85.86.73:53128
https://115.46.77.225:8123
https://121.31.177.217:8123
https://110.73.42.191:8123
https://222.85.22.167:8010
https://119.48.97.137:80
https://218.79.113.92:30366
https://101.236.55.145:8866
https://116.235.75.177:61922
https://220.248.125.82:8118
https://121.60.76.28:8010
https://116.17.236.52:8010
https://115.223.114.224:8010
https://122.246.51.176:8010
https://59.45.27.245:50858
https://171.37.153.33:8123
https://121.225.26.218:3128
https://180.118.243.93:61234
https://115.46.78.208:8123
https://175.148.76.72:1133
https://223.244.252.58:45744
https://115.223.117.127:8010
https://59.46.112.34:43858
https://117.114.144.195:35070
https://180.118.243.52:61234
https://180.110.7.46:3128
https://106.42.208.201:8010
https://42.236.151.226:37848
https://221.2.207.205:51030
https://114.80.216.171:54408
https://119.254.94.95:43150
https://121.31.153.170:8123
https://113.121.242.173:808
https://122.138.16.158:80
https://182.88.129.168:8123
https://113.200.27.10:53281

5.创建main.py开始爬虫程序

注意:

5.1.登录大众点评官网,通过谷歌开发者工具等获取到当前的cookie,修改main.py中的cookie变量,

5.2.查看爬取的店铺的店铺ID以及评论的页数,修改main.py 中对应的位置,

5.3.如果有xuchuan.txt(保存当前点评爬取进度),请在爬取前删除(每换一个店铺要删除一次)

5.4.大约每爬取100页就需要进行一次人工验证:当发现获取到的评论为0条或者出现异常时,请用浏览器打开点评页面,滑动滑块解锁,然后重启程序(程序支持断点续传,会从上次成功的页码继续,不虚~)
5.5.当更换店铺时,需要把店铺ID进行替换,还有评论的页数也要替换,还要删除xuchuan.txt

# -*- coding: utf-8 -*-
"""
Created on Mon Jul  9 16:42:52 2018

@author: bin
"""

#目标爬取店铺的评论

import requests
from bs4 import BeautifulSoup
import time, random
import mysqls
import re
from fake_useragent import UserAgent
import os

# Source of random User-Agent strings.
ua = UserAgent()

# Cookie of a logged-in dianping.com browser session. Replace the value below
# with the cookie copied from your own browser before crawling.
# NOTE(review): this is session data; avoid publishing a real cookie.
# (A stale commented-out example cookie that used to sit here was removed.)
cookie="_lxsdk_cuid=16a4347ce37c8-0a8bfc61a70c74-5a442916-15f900-16a4347ce3750; _lxsdk=16a4347ce37c8-0a8bfc61a70c74-5a442916-15f900-16a4347ce3750; Hm_lvt_e6f449471d3527d58c46e24efb4c343e=1555906941; _hc.v=0f9cc4f8-5fa4-ea4c-262c-306a344b9a8e.1555906941; cy=2; cye=beijing; _dp.ac.v=72a808e1-1de4-45eb-8962-eefb0a179eb7; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_0295126037; ctu=255d09ddf69958c91b07ce9c01164c9c8c6144674a4190c49410595ebe1a95d7; uamo=17010209086; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s=16a6820f2d8-3ed-bfa-b9e%7C%7C192"
# Request headers sent with every page fetch.
# NOTE: ua.random is evaluated once, when this dict is built, so the same
# User-Agent is reused for every request that passes this dict.
headers = {
        'User-Agent':ua.random,
        'Cookie':cookie,
        'Connection':'keep-alive',
        'Host':'www.dianping.com',
        # Fixed Referer pointing at a review page of shop 521698.
        'Referer': 'http://www.dianping.com/shop/521698/review_all/p6'
}

# Load the proxy pool from proxies.txt, one proxy URL per line.
# Blank lines (including the empty entry a trailing newline would produce)
# are dropped so that random.choice() can never return an empty proxy.
with open('proxies.txt', 'r') as proxy_file:
    ips = [line.strip() for line in proxy_file if line.strip()]
#
def get_random_ip():
    """Pick a random entry of ``ips`` and wrap it as a proxies mapping.

    The key is the text before the first ':' of the entry (its URL scheme),
    the value is the entry itself.
    """
    chosen = random.choice(ips)
    scheme, _, _ = chosen.partition(':')
    return {scheme: chosen}

# Fetch one HTML page.
def getHTMLText(url,code="utf-8"):
    """Download ``url`` through a random proxy and return the page text.

    Args:
        url: address of the page to fetch.
        code: encoding used to decode the response body.

    Returns:
        The decoded page text, or the sentinel string "产生异常" when the
        request, the status check or the decoding fails. Callers compare
        against that sentinel, so it must stay unchanged.
    """
    # Random pause of 2-8 seconds before each request. It sits outside the try
    # block so that Ctrl-C during the wait actually stops the program.
    time.sleep(random.random()*6 + 2)
    try:
        r = requests.get(url, timeout=5, headers=headers,
                         proxies=get_random_ip())
        r.raise_for_status()
        r.encoding = code
        return r.text
    except Exception as exc:
        # Best effort: report the failure and let the caller decide what to do.
        print("产生异常", exc)
        return "产生异常"

# Reviews contain emoji, which need 4 bytes in UTF-8. Per the original author
# the MySQL setup used here cannot store such characters, so they are removed.
# Compiled once at import time instead of on every call.
_NON_BMP_CHARS = re.compile('[\U00010000-\U0010ffff]')

def remove_emoji(text):
    """Return ``text`` with every character above U+FFFF removed."""
    return _NON_BMP_CHARS.sub('', text)

# Extract the required fields from the HTML.
def parsePage(html,shpoID):
    """Parse one review page into a list of review dicts.

    Args:
        html: HTML text of a review page.
        shpoID: shop id stored with every review (the parameter name keeps
            the original spelling for backward compatibility).

    Returns:
        A list with one dict per ``div.main-review`` element, with the keys
        ``cus_id``, ``comment_time``, ``comment_star``, ``cus_comment``,
        ``kouwei``, ``huanjing``, ``fuwu`` and ``shopID``. A review whose
        name, time or comment element is missing gets an empty string for
        that field, so one incomplete review no longer discards the page.
    """
    def text_of(node):
        # Stripped text of a tag, or '' when the tag was not found.
        return node.text.strip() if node is not None else ''

    soup = BeautifulSoup(html, "html.parser")
    infoList = []  # one dict per review

    for item in soup.find_all('div', 'main-review'):
        cus_id = text_of(item.find('a', 'name'))
        comment_time = text_of(item.find('span', 'time'))
        cus_comment = text_of(item.find('div', "review-words"))

        # The star rating is taken from the second CSS class of the stars span.
        star_node = item.find('span', re.compile('sml-rank-stars'))
        try:
            comment_star = star_node.get('class')[1]
        except (AttributeError, IndexError, TypeError):
            comment_star = 'NAN'

        # The three sub-scores are read from the markup of the score span;
        # if any of them is missing, all three fall back to '无'.
        scores = str(item.find('span', 'score'))
        try:
            kouwei = re.findall(r'口味:([\u4e00-\u9fa5]*)', scores)[0]
            huanjing = re.findall(r'环境:([\u4e00-\u9fa5]*)', scores)[0]
            fuwu = re.findall(r'服务:([\u4e00-\u9fa5]*)', scores)[0]
        except IndexError:
            kouwei = huanjing = fuwu = '无'

        infoList.append({'cus_id': cus_id,
                         'comment_time': comment_time,
                         'comment_star': comment_star,
                         'cus_comment': remove_emoji(cus_comment),
                         'kouwei': kouwei,
                         'huanjing': huanjing,
                         'fuwu': fuwu,
                         'shopID': shpoID})
    return infoList

# Build the URL of every page, crawl it and store the extracted reviews.
def getCommentinfo(shop_url, shpoID, page_begin, page_end, max_retries=2):
    """Crawl pages ``page_begin`` .. ``page_end - 1`` and store their reviews.

    After a page has been stored, its number is appended to xuchuan.txt as
    the resume checkpoint.

    A page that fails to download or yields no reviews is retried after a
    60 s pause, up to ``max_retries`` times. If it still fails, crawling
    stops. Previously the loop moved on to the next page, so a later success
    pushed the checkpoint past the failed page and it was never fetched.

    Args:
        shop_url: review URL prefix of the shop; 'p<page>' is appended.
        shpoID: shop id stored with every review.
        page_begin: first page number to crawl (inclusive).
        page_end: page number to stop at (exclusive).
        max_retries: extra attempts per page before giving up.
    """
    for i in range(page_begin, page_end):
        url = shop_url + 'p' + str(i)
        for attempt in range(max_retries + 1):
            try:
                html = getHTMLText(url)
                infoList = parsePage(html, shpoID)
            except Exception as exc:
                # ``Exception`` rather than a bare except: Ctrl-C must still work.
                print('跳过本次', exc)
                infoList = []
            else:
                print('成功爬取第{}页数据,有评论{}条'.format(i, len(infoList)))
            if infoList:
                break
            if attempt < max_retries:
                print('休息60s...')
                time.sleep(60)
        else:
            # Every attempt failed: stop here so the checkpoint stays just
            # before this page.
            print('第{}页连续失败,停止爬取,请完成验证后重启程序'.format(i))
            return

        for info in infoList:
            mysqls.save_data(info)
        # Checkpoint for resuming: record the page only after it was stored.
        with open('xuchuan.txt', 'a') as file:
            file.write(str(i) + '\n')
    return

def xuchuan():
    """Return the last page number recorded in xuchuan.txt, or 0.

    0 is returned when the checkpoint file does not exist or holds no page
    number (empty file or blank lines only).

    Raises:
        ValueError: if the last non-blank line is not an integer.
    """
    if not os.path.exists('xuchuan.txt'):
        return 0
    with open('xuchuan.txt', 'r') as file:
        pages = [line.strip() for line in file if line.strip()]
    return int(pages[-1]) if pages else 0

# Crawl the reviews of one shop, given its id and its number of review pages.
def craw_comment(shopID='521698',page = 53):
    """Crawl review pages of a shop, resuming after the last stored page.

    Args:
        shopID: dianping shop id (a string; an int is accepted as well).
        page: number of the last review page to crawl.

    The database connection is closed when crawling ends, even if it ends
    with an exception.
    """
    shop_id = str(shopID)
    shop_url = "http://www.dianping.com/shop/" + shop_id + "/review_all/"
    # Resume after the last page recorded in the checkpoint file.
    nowpage = xuchuan()
    try:
        getCommentinfo(shop_url, shop_id, page_begin=nowpage+1, page_end=page+1)
    finally:
        mysqls.close_sql()
    return

# Script entry point: crawl with the default shop id and page count.
if __name__ == "__main__":
    craw_comment()
        

6.数据存储于MYSQL数据库中,可以使用各种方法读取,啦啦啦啦~

7.如果后续想对数据做文本分析挖掘以及情感分析,可参考我的另外两篇博客:探索性分析 https://blog.csdn.net/weixin_40903057/article/details/89705923 和大众点评评价情感分析 https://blog.csdn.net/weixin_40903057/article/details/89706111

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

你可能感兴趣的:(爬虫,python,大众点评)