1.前一部分依旧没有变化。不过用户的链接从哪来呢?是从上一期的商户评论里拿到了用户个人主页的链接,建了一个csv文件,用作这次爬虫的链接库。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time
import pandas as pd
import json
import random
# Full path to the chromedriver executable.
CHROME_DRIVER = 'C:\\Users\\Administrator\\Desktop\\chromedriver_win32\\chromedriver.exe'
# Optional Chrome tuning (disabled): skipping image loading speeds pages up.
# opt = webdriver.ChromeOptions()
# opt.add_argument('blink-settings=imagesEnabled=false')
# To enable, pass options=opt to webdriver.Chrome below.
driver = webdriver.Chrome(executable_path=CHROME_DRIVER)
# Load the user-id list collected in the previous run and build, for every
# user, the profile-page URL and the matching review-page URL.
users = pd.read_csv('users.csv')
href = ['https://www.dianping.com/member/' + str(uid) for uid in users['user_id']]
comment_href = [profile_url + '/reviews' for profile_url in href]
# Open the site first; log in manually in the browser before running the rest.
driver.get('http://www.dianping.com/')
#打开网站后手动登陆一下大众点评,再执行下面的语句
2.获取cookie
# Capture the authenticated session cookies after the manual login above,
# so they can be re-attached to the driver before scraping starts.
cookies = driver.get_cookies()
3.爬虫程序,这次的主程序是单个评论循环的,因为爬虫的体量不是特别大,这种方法下来的数据比较干净,相对于之前的运行速度,这一次的运行速度很快,但是要加入time.sleep把速度降下来,防止ip被封,如果电脑比较多,可以多台电脑并行,速度翻倍!
加个小提示吧:很多人喜欢检查页面元素之后直接copy xpath,这样很容易出问题,因为这里的xpath后面的路径是数字索引,例如div[2]这种,在不同页面,由于元素数量不同,可能就变成div[3]了,在循环的时候很可能会报错。所以还是下一个xpath插件,在插件里找属性索引的路径,这种路径一般不会报错,例如div[@class='txt'],一般的网页设计者也不会给不同内容设相同的属性。
# Re-attach the saved login cookies to the driver so the scraping session is
# authenticated. Fix: the original added only cookies[0]; a login session
# usually spans several cookies, so all of them are restored here.
for cookie in cookies:
    driver.add_cookie(cookie)

data_list = []     # one single-row DataFrame per scraped user profile
comment_list = []  # one single-row DataFrame per scraped review

# Fix: iterate over the actual number of users instead of a hard-coded
# range(0, 998), which raised IndexError (silently swallowed by the old bare
# except) whenever users.csv had fewer rows.
for i in range(len(href)):
    # Guard the page load: if the IP gets banned mid-run, save everything
    # collected so far and stop, so the partial results are not lost.
    try:
        driver.get(href[i])
    except Exception:
        print('ipipipipipipipipipipipipip两个小时以后再运行')
        user_info = pd.concat(data_list, axis=0)
        user_info.to_csv('user_info_00000.csv', index=False, encoding='GBK')
        comment = pd.concat(comment_list, axis=0)
        comment.to_csv('comment_00000.csv', index=False, encoding='GBK')
        break
    # Pause ~1 hour every 100 users, otherwise Dianping bans the IP for 24h.
    if (i + 1) % 100 == 0:
        print('wating...............')
        time.sleep(3500)
    # Profile fields: follows, fans, interactions, registration time,
    # contribution score, region, gender, VIP flag.
    guanzhu = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/ul/li[1]/a/strong').text
    fensi = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/ul/li[2]/a/strong').text
    hudong = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/div[1]/div/div[1]/ul/li[3]/strong').text
    register_time = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[1]/div[1]/div/div[2]/p[3]').text
    contribution = driver.find_element_by_xpath('//*[@id="J_col_exp"]').text
    # Not every user exposes region/gender/VIP; fall back to a placeholder.
    try:
        region = driver.find_element_by_xpath('/html/body/div[2]/div[1]/div/div/div/div[2]/div[2]/span[2]').text
    except Exception:
        region = 'unknown'
    try:
        # Gender is encoded in the CSS class of the icon next to the region.
        gender = driver.find_element_by_xpath('/html/body/div[2]/div[1]/div/div/div/div[2]/div[2]/span[2]/i').get_attribute('class')
    except Exception:
        gender = 'unknown'
    try:
        # Presence of the VIP icon element means the user is a VIP.
        driver.find_element_by_xpath("//div[@class='txt']/div[@class='tit']/div[@class='vip']/a/i[@class='icon-vip']")
        vip = 1
    except Exception:
        vip = 0
    # Assemble the profile row and queue it for the final concat.
    x = pd.DataFrame({'user_id': users['user_id'][i],
                      'guanzhu': guanzhu,
                      'fensi': fensi,
                      'hudong': hudong,
                      'register_time': register_time,
                      'contribution': contribution,
                      'region': region,
                      'gender': gender,
                      'vip': vip},
                     index=[0])
    data_list.append(x)
    print(str(i) + 'info')
    time.sleep(random.randrange(0, 2))
    # Fetch the user's review page, with the same save-and-stop guard.
    try:
        driver.get(comment_href[i])
    except Exception:
        print('ipipipipipipipipipipipipip两个小时以后再运行')
        user_info = pd.concat(data_list, axis=0)
        user_info.to_csv('user_info_00000.csv', index=False, encoding='GBK')
        comment = pd.concat(comment_list, axis=0)
        comment.to_csv('comment_00000.csv', index=False, encoding='GBK')
        break
    # Random 1-2 second pause between page loads (randrange(1, 3) yields 1 or 2).
    time.sleep(random.randrange(1, 3))
    # Walk at most 10 review pages, up to 15 reviews per page.
    for j in range(10):
        # Fix: reset p for every page. The original never reset it, so a
        # stale p == 15 from a previous page (or user) could wrongly trigger
        # pagination, and on the very first failure p was undefined.
        p = 0
        for k in range(1, 16):
            # Attribute-based xpaths (not copied numeric indexes) so the
            # paths survive layout differences between pages.
            comment_xpath = "//div[@id='J_review']/div[@class='pic-txt']/ul/li[" + str(k) + "]/div[@class='txt J_rptlist']/div[@class='txt-c']/div[@class='mode-tc comm-entry']"
            time_xpath = "//div[@id='J_review']/div[@class='pic-txt']/ul/li[" + str(k) + "]/div[@class='txt J_rptlist']/div[@class='txt-c']/div[@class='mode-tc info']/span[@class='col-exp']"
            try:
                # The timestamp text is prefixed with 发表于/更新于; strip those
                # characters off, then parse as yy-mm-dd.
                u_time = driver.find_element_by_xpath(time_xpath).text.strip('发表于').strip('更新于')
                u_time = pd.to_datetime(u_time, format='%y-%m-%d')
                # Only keep reviews on or after 2018-09-27; reviews are newest
                # first, so an older one means the rest can be skipped.
                if u_time >= pd.to_datetime('18-9-27', format='%y-%m-%d'):
                    comment = driver.find_element_by_xpath(comment_xpath).text
                    x = pd.DataFrame({'href': href[i],
                                      'u_time': u_time,
                                      'comment': comment}, index=[0])
                    comment_list.append(x)
                    p = k
                else:
                    break
            except Exception:
                # Fewer than 15 reviews on this page — stop scanning it.
                break
        # Only page forward when a full page (all 15 slots) was collected.
        if p == 15:
            try:
                driver.find_element_by_link_text('下一页').click()
            except Exception:
                break  # no "next page" link — last page reached
        else:
            break

# Final dump of everything collected.
# !!!!!!! Remember to change the file names between runs.
user_info = pd.concat(data_list, axis=0)
user_info.to_csv('user_info.csv', index=False, encoding='GBK')
comment = pd.concat(comment_list, axis=0)
comment.to_csv('comment.csv', index=False, encoding='GBK')