Scraping Meituan with Python

1. Packet-capture analysis

For this exercise I picked, more or less at random, a well-reviewed restaurant in the Huaxi University Town (花溪大学城) area and scraped its review data.
Shop link: https://www.meituan.com/meishi/193383554/

1.1 Analyzing the page

With any scraping target, the first thing to do is figure out how the page loads its data, and only then decide how to request it from the server: if the reviews are rendered straight into the HTML we can parse the page itself, but if they arrive via AJAX we should call that API directly. A quick check is sketched below.
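One way to confirm which case we are in is to fetch the raw HTML (no JavaScript executed) and search it for a phrase visible in a rendered review. A minimal sketch; the search phrase here is a placeholder, so substitute any text you can actually see in a review on the page:

import requests

# Fetch the shop page's raw HTML, without executing any JavaScript
html = requests.get(
    "https://www.meituan.com/meishi/193383554/",
    headers={"User-Agent": "Mozilla/5.0"},
).text

# If this prints False, the reviews are injected by JavaScript after load,
# so they must come from a separate API request we can capture in DevTools
print("不错" in html)  # placeholder phrase: use any text from a visible review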

1.2 Capturing the traffic

Steps to capture the traffic:
1. Right-click the page and choose Inspect to open DevTools.
2. Select the Network tab.
3. Select the All filter.
4. Refresh the page.
5. Inspect the requests that carry the loaded data; a replay of the captured comment request is sketched below.
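Once the getMerchantComment request shows up under Network, it is worth replaying it outside the browser to confirm it returns plain JSON. A minimal probe; the uuid token is copied from my own capture and is session-specific, so yours will differ:

import requests
from fake_useragent import UserAgent

api = ("https://www.meituan.com/meishi/api/poi/getMerchantComment"
       "?uuid=2ff7056c-9d76-424c-b564-b7084f7e16e4&platform=1&partner=126"
       "&originUrl=https%3A%2F%2Fwww.meituan.com%2Fmeishi%2F193383554%2F"
       "&riskLevel=1&optimusCode=10&id=193383554&userId="
       "&offset=0&pageSize=10&sortType=1")
resp = requests.get(api, headers={"User-Agent": UserAgent().chrome})
print(resp.status_code)           # expect 200
print(list(resp.json()["data"]))  # the reviews live under data -> comments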

2. Scraping Meituan

2.1 Scraping the review data

import requests
from fake_useragent import UserAgent

for page in range(0, 371, 10):  # offsets 0, 10, ..., 370 -- 10 reviews per page
    url = "https://www.meituan.com/meishi/api/poi/getMerchantComment?uuid=2ff7056c-9d76-424c-b564-b7084f7e16e4&platform=1&partner=126&originUrl=https%3A%2F%2Fwww.meituan.com%2Fmeishi%2F193383554%2F&riskLevel=1&optimusCode=10&id=193383554&userId=&offset={}&pageSize=10&sortType=1".format(page)
    headers = {
        "User-Agent": UserAgent().chrome  # random Chrome User-Agent
    }
    response = requests.get(url=url, headers=headers)  # request the comment API
    for item in response.json()['data']['comments']:   # iterate over this page's reviews
        userId = item['userId']
        userName = item['userName']
        avgPrice = item['avgPrice']                    # average spend reported by the user
        comment = item['comment']                      # review text
        merchantComment = item['merchantComment']      # the merchant's reply
        data = (userId, userName, avgPrice, comment, merchantComment)
        print(data)
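The stop value 371 is hard-coded to match this shop's review count at the time of writing. If the API's data object also reports a total review count (a field name I am assuming here, so verify it against your own capture), the paging range can be derived instead of hard-coded. A sketch:

import requests
from fake_useragent import UserAgent

base = ("https://www.meituan.com/meishi/api/poi/getMerchantComment"
        "?uuid=2ff7056c-9d76-424c-b564-b7084f7e16e4&platform=1&partner=126"
        "&originUrl=https%3A%2F%2Fwww.meituan.com%2Fmeishi%2F193383554%2F"
        "&riskLevel=1&optimusCode=10&id=193383554&userId="
        "&offset={}&pageSize=10&sortType=1")
headers = {"User-Agent": UserAgent().chrome}

# Fetch the first page only and read the total ("total" is an assumed
# field name), falling back to the hard-coded 371 if it is missing
data = requests.get(base.format(0), headers=headers).json()["data"]
total = data.get("total", 371)
for page in range(0, total, 10):
    print("would request offset", page)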

3. IP proxies

3.1 Complete script: scrape and save to CSV

# https://www.meituan.com/meishi/193383554/  shop link

import requests, re, random, time, csv
from fake_useragent import UserAgent

starttime = time.time()  # record the start time

ips = []  # holds the proxies that pass validation
for i in range(1, 6):
    headers = {
        "User-Agent": UserAgent().chrome  # random Chrome User-Agent
    }
    ip_url = 'http://www.89ip.cn/index_{}.html'.format(i)
    html = requests.get(url=ip_url, headers=headers).text
    res_re = html.replace(" ", "").replace("\n", "").replace("\t", "")
    # Match the IP address and port out of the table cells with a regex
    r = re.compile('<td>(.*?)</td><td>(.*?)</td>')
    result = re.findall(r, res_re)
    for addr, port in result:
        ip = "http://" + addr + ":" + port
        proxies = {"http": ip}  # dictionary format expected by requests
        # Validate the proxy by requesting Baidu through it; keep it on a 200.
        # Dead proxies raise a connection error, so guard with try/except.
        try:
            baidu = requests.get("http://www.baidu.com/", proxies=proxies, timeout=5)
            if baidu.status_code == 200:
                ips.append(proxies)
        except requests.RequestException:
            pass  # dead or slow proxy, skip it
    print("Preparing IP proxies, please wait...")

# Create the CSV file and write the header row; encoding "utf-8-sig" keeps
# the Chinese text from turning into mojibake (e.g. when opened in Excel)
fp = open('./美团_大学城.csv', 'a', newline='', encoding='utf-8-sig')  # "./" is the current folder, "a" appends
writer = csv.writer(fp)  # writer object for appending rows
writer.writerow(('用户ID', '用户名', '平均价', '评论', '回复'))  # header: user ID, username, average spend, review, merchant reply

for page in range(0, 371, 10):  # offsets 0, 10, ..., 370 -- 10 reviews per page
    url = 'https://www.meituan.com/meishi/api/poi/getMerchantComment?uuid=9f45527e-2983-40c9-bc92-f58a8290c947&platform=1&partner=126&originUrl=https%3A%2F%2Fwww.meituan.com%2Fmeishi%2F193383554%2F&riskLevel=1&optimusCode=10&id=193383554&userId=&offset={}&pageSize=10&sortType=1'.format(page)
    try:
        headers = {
            "User-Agent": UserAgent().chrome  # random Chrome User-Agent
        }
        rep = requests.get(url=url, headers=headers, proxies=random.choice(ips))
        print("Scraping from offset:", page)
        for info in rep.json()['data']['comments']:
            userId = info['userId']
            userName = info['userName']
            avgPrice = info['avgPrice']
            comment = info['comment']
            merchantComment = info['merchantComment']
            data = (userId, userName, avgPrice, comment, merchantComment)
            writer.writerow(data)
    except Exception:
        print("An exception occurred here:", url)
fp.close()  # close the file
endtime = time.time()  # record the end time
sumTime = endtime - starttime  # total elapsed time
print("Total elapsed time: %s seconds" % sumTime)


3.2 Complete script: scrape and save to TXT

# https://www.meituan.com/meishi/193383554/  shop link
import requests, re, random
from fake_useragent import UserAgent

ips = []  # holds the proxies that pass validation
for i in range(1, 6):
    print("Preparing IP proxies, please wait...")
    headers = {
        "User-Agent": UserAgent().chrome  # random Chrome User-Agent
    }
    ip_url = 'http://www.89ip.cn/index_%s.html' % i
    # Request the proxy-list site and get its source
    res = requests.get(url=ip_url, headers=headers).text
    res_re = res.replace('\n', '').replace('\t', '').replace(' ', '')
    # Match the IP address and port out of the table cells with a regex
    re_c = re.compile('<td>(.*?)</td><td>(.*?)</td>')
    result = re.findall(re_c, res_re)
    for addr, port in result:
        # Assemble the full proxy URL
        ip = 'http://' + addr + ':' + port
        proxies = {"http": ip}  # dictionary format expected by requests
        # Validate the proxy by requesting Baidu through it
        try:
            html = requests.get('http://www.baidu.com/', proxies=proxies, timeout=5)
            if html.status_code == 200:  # 200 means the request succeeded
                ips.append(proxies)
        except requests.RequestException:
            pass  # dead or slow proxy, skip it
            
headers = {
    "User-Agent": UserAgent().chrome  # random Chrome User-Agent
}
for page in range(0, 371, 10):  # offsets 0, 10, ..., 370 -- 10 reviews per page
    url = 'https://www.meituan.com/meishi/api/poi/getMerchantComment?uuid=9f45527e-2983-40c9-bc92-f58a8290c947&platform=1&partner=126&originUrl=https%3A%2F%2Fwww.meituan.com%2Fmeishi%2F193383554%2F&riskLevel=1&optimusCode=10&id=193383554&userId=&offset={}&pageSize=10&sortType=1'.format(page)
    try:
        rep = requests.get(url=url, headers=headers, proxies=random.choice(ips))
        print("Scraping from offset:", page)
        with open("./美团文本.txt", "a", encoding='utf-8') as f:  # the with-block closes the file for us
            for info in rep.json()['data']['comments']:
                comment = str(info['comment'])                  # review text
                merchantComment = str(info['merchantComment'])  # merchant reply
                f.write(comment + "\n")                         # newline separators keep entries readable
                f.write(merchantComment + "\n")
    except Exception:
        pass
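A quick sanity check that the text file actually accumulated content:

# Read the file back and report how much text was saved
with open("./美团文本.txt", encoding="utf-8") as f:
    text = f.read()
print("characters saved:", len(text))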

