Python Scraper: Dianping User Info

1. The first part is mostly unchanged. But where do the user links come from? The merchant reviews scraped in the previous post contain links to each user's profile page; those were saved to a CSV file, which serves as the link library for this crawl.
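For reference, the main script below only assumes that users.csv contains a user_id column. Here is a minimal, hypothetical sketch of how such a file could be built from the profile links collected last time (the example URLs are made up):

import pandas as pd

# Hypothetical profile links from the previous post's review scrape
profile_links = [
    'https://www.dianping.com/member/12345678',
    'https://www.dianping.com/member/87654321',
]
# The numeric id is the last path segment of each link
user_ids = [link.rstrip('/').split('/')[-1] for link in profile_links]
pd.DataFrame({'user_id': user_ids}).to_csv('users.csv', index=False)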

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import json
import random

# Path to chromedriver, including the executable name
CHROME_DRIVER = 'C:\\Users\\Administrator\\Desktop\\chromedriver_win32\\chromedriver.exe'


# Create a Chrome options object
#opt = webdriver.ChromeOptions()
#opt.add_argument('blink-settings=imagesEnabled=false') # skip image loading to speed things up

driver = webdriver.Chrome(executable_path=CHROME_DRIVER
                         # ,options=opt
                         )
# Read the user list
users = pd.read_csv('users.csv')
href = ['https://www.dianping.com/member/'+str(i) for i in users['user_id']]
# Convert profile links to review-page links
comment_href = [i+'/reviews' for i in href]
driver.get('http://www.dianping.com/')

# After the site opens, log in to Dianping manually, then run the statements below

2. Get the cookies

cookies = driver.get_cookies()
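The json import above is presumably meant for persisting these cookies so that a later run can skip the manual login. A minimal sketch under that assumption (the file name cookies.json is made up):

import json

# Save the logged-in session's cookies to disk
with open('cookies.json', 'w') as f:
    json.dump(cookies, f)

# In a later run: open the site first (add_cookie only accepts cookies for
# the current domain), then restore each saved cookie and reload
driver.get('http://www.dianping.com/')
with open('cookies.json') as f:
    for c in json.load(f):
        driver.add_cookie(c)
driver.refresh()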

3. The scraping loop. This time the main program iterates user by user. The crawl volume isn't large, and the data this approach produces is fairly clean. It also runs much faster than the previous version, so time.sleep calls are needed to throttle it and keep the IP from being banned. If you have several machines, run them in parallel to multiply throughput.
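If you do split the work across machines, the simplest scheme is to give each one a row range of users.csv; a sketch of a hypothetical two-machine split:

import pandas as pd

users = pd.read_csv('users.csv')
# This machine takes the first half; the second machine would set
# start, end = len(users)//2, len(users)
start, end = 0, len(users) // 2
for i in range(start, end):
    pass  # same per-user scraping body as in the main loop below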

A quick tip: many people inspect an element and copy the XPath directly. That breaks easily, because a copied XPath ends in positional indices such as div[2], and on another page with a different number of elements it may become div[3], so the loop is likely to raise errors. Install an XPath helper plugin instead and build attribute-based paths like div[@class='txt']; these rarely break, since web designers generally don't assign the same attribute to different content.
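To make the contrast concrete (the paths below are illustrative, not Dianping's real markup):

# Fragile: positional indices copied from devtools; an extra element on some
# page shifts div[2] to div[3] and the lookup fails or grabs the wrong node
el = driver.find_element_by_xpath('/html/body/div[2]/div[1]/span[2]')

# Robust: attribute-based path; it keeps working as long as the attributes
# themselves don't change (class names here are hypothetical)
el = driver.find_element_by_xpath("//div[@class='txt']//span[@class='nickname']")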

# Restore the saved login cookies (loop over all of them rather than only
# the first, so the whole session carries over)
for c in cookies:
    driver.add_cookie(c)
data_list = []
comment_list = []
#list(range(len(href)))  # use this instead of 998 to cover every user
for i in list(range(0,998)):
    # Wrap the get in try/except: if the IP is banned mid-run, save what has
    # been collected so far before stopping
    try:
        driver.get(href[i])
    except:
        print('IP blocked -- wait two hours and run again')
        user_info = pd.concat(data_list,axis = 0)
        user_info.to_csv('user_info_00000.csv',index = False,encoding = 'GBK')
        comment = pd.concat(comment_list,axis = 0)
        comment.to_csv('comment_00000.csv',index = False,encoding = 'GBK')
        break
    # Pause every 100 users (3500 s is just under an hour), otherwise
    # Dianping bans the IP for 24 hours
    if (i+1)%100 == 0 :
        print('waiting...............')
        time.sleep(3500)
    else :
        pass
    # Following, followers, interactions, registration time, contribution, region, gender, VIP
    guanzhu = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/ul/li[1]/a/strong').text
    fensi = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/ul/li[2]/a/strong').text
    hudong = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/ul/li[3]/strong').text
    register_time = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/div[1]/div/div[2]/p[3]').text
    contribution = driver.find_element_by_xpath('//*[@id="J_col_exp"]').text
    # Not every user exposes these fields; if one is missing, record a placeholder instead
    try:
        region = driver.find_element_by_xpath('/html/body/div[2]/div[1]/div/div/div/div[2]/div[2]/span[2]').text
    except:
        region = 'unknown'
    try:
        gender = driver.find_element_by_xpath('/html/body/div[2]/div[1]/div/div/div/div[2]/div[2]/span[2]/i').get_attribute('class')
    except:
        gender = 'unknown'
    try :
        driver.find_element_by_xpath("//div[@class='txt']/div[@class='tit']/div[@class='vip']/a/i[@class='icon-vip']")
        vip = 1
    except :
        vip = 0
    # Assemble a one-row DataFrame for this user
    x = pd.DataFrame({'user_id' : users['user_id'][i],
                      'guanzhu' : guanzhu,
                      'fensi' : fensi,
                      'hudong' : hudong,
                      'register_time' : register_time,
                      'contribution' : contribution,
                      'region' : region,
                      'gender' : gender,
                      'vip' : vip},
                     index = [0])
    data_list.append(x)
    print(str(i) + ' info')
    time.sleep(random.randrange(0,2))  # sleep 0 or 1 second
    # Scrape the user's reviews
    try:
        driver.get(comment_href[i])
    except:
        print('IP blocked -- wait two hours and run again')
        user_info = pd.concat(data_list,axis = 0)
        user_info.to_csv('user_info_00000.csv',index = False,encoding = 'GBK')
        comment = pd.concat(comment_list,axis = 0)
        comment.to_csv('comment_00000.csv',index = False,encoding = 'GBK')
        break
    # Pause for a random 1-2 seconds (randrange(1,3) returns 1 or 2)
    time.sleep(random.randrange(1,3))
    # Loop over review pages (at most 10 pages per user)
    for j in list(range(10)):
        # p tracks the last review index parsed on this page; initialise it so
        # the page-turn check below works even when no review is parsed
        p = 0
        # Build the XPaths for each review's text and timestamp (15 per page)
        for k in list(range(1,16)):
            comment_xpath = "//div[@id='J_review']/div[@class='pic-txt']/ul/li["+ str(k) +"]/div[@class='txt J_rptlist']/div[@class='txt-c']/div[@class='mode-tc comm-entry']"
            time_xpath = "//div[@id='J_review']/div[@class='pic-txt']/ul/li[" + str(k) + "]/div[@class='txt J_rptlist']/div[@class='txt-c']/div[@class='mode-tc info']/span[@class='col-exp']"
            try:   
                # Timestamps carry a '发表于' (posted on) or '更新于' (updated on)
                # prefix; str.strip() removes characters, not substrings, so use
                # replace() to drop the prefix before parsing the date
                u_time = driver.find_element_by_xpath(time_xpath).text
                u_time = u_time.replace('发表于','').replace('更新于','').strip()
                u_time = pd.to_datetime(u_time,format = '%y-%m-%d')
                if u_time >= pd.to_datetime('18-9-27',format = '%y-%m-%d'):
                    comment = driver.find_element_by_xpath(comment_xpath).text
                    x = pd.DataFrame({'href'  : href[i],
                                     'u_time' : u_time,
                                     'comment': comment},index = [0])
                    comment_list.append(x)
                    p = k  # remember the furthest review reached on this page
                else:
                    break
            except:
                break
        if p == 15:
            # All 15 reviews on this page were recent enough: turn the page
            try:
                driver.find_element_by_link_text('下一页').click()  # '下一页' = next page
            except:
                break
        else:
            break
# Concatenate the collected DataFrames and write them to file
#!!!!!!! NOTE: remember to change the file names between runs
user_info = pd.concat(data_list,axis = 0)
user_info.to_csv('user_info.csv',index = False,encoding = 'GBK')
comment = pd.concat(comment_list,axis = 0)
comment.to_csv('comment.csv',index = False,encoding = 'GBK')
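The save-and-break block appears twice in the loop above; pulling it into a helper would keep the loop tidier. A sketch of that refactor, using the same file names and encoding as the original:

import pandas as pd

def save_progress(data_list, comment_list):
    # Flush everything collected so far, so an IP ban doesn't lose the run
    if data_list:
        pd.concat(data_list, axis=0).to_csv('user_info_00000.csv',
                                            index=False, encoding='GBK')
    if comment_list:
        pd.concat(comment_list, axis=0).to_csv('comment_00000.csv',
                                               index=False, encoding='GBK')

# Each except branch in the loop then shrinks to:
#     print('IP blocked -- wait two hours and run again')
#     save_progress(data_list, comment_list)
#     break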
