Scraping JD.com Good and Bad Product Reviews with Selenium and Writing Them to Excel

The requests library can fetch the HTML a server sends, but on a dynamic page that is not the rendered content: JD.com splits product reviews into tabs (all reviews, good, medium, bad) that are filled in by JavaScript after the page loads. Here we use Selenium to simulate the clicks that switch to the relevant tab and then read the resulting page.
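
A quick way to verify that the reviews really are loaded dynamically (this check is not part of the original script, and the User-Agent header is an assumption, since JD tends to reject bare clients):

import requests

# Hypothetical sanity check: the raw server response should not contain the
# review markup if the reviews are injected by JavaScript after page load.
resp = requests.get('https://item.jd.com/100000667370.html',
                    headers={'User-Agent': 'Mozilla/5.0'})
print('comment-item' in resp.text)  # likely False, hence Selenium

With that established, the scrape breaks down into the following steps: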

1. Get the product page URL; here we scrape a Kindle: https://item.jd.com/100000667370.html
2. Wait for all elements to load, click the "商品评价" (product reviews) button, then click the "好评" (good reviews) button.
3. Wait for the page to finish loading and grab the HTML. On the good-review page every review is a "div" with class_="comment-item" under the "div" with id="comment-4".
4. Write the content to Excel: check whether the file already exists and either create it or append to it (see the sketch after this list), recording each review's type (good or bad) and its character count.
5. Bad reviews work the same way; their container is id="comment-6".
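
Step 4 is the only fiddly part, so here is a minimal standalone sketch of the create-or-append pattern used below (the path E://demo.xls and sheet name 'demo' match the full script; open_for_append and the sample row are illustrative, not part of the original code):

import os
import xlrd
import xlwt
from xlutils.copy import copy

def open_for_append(path):
    """Return (workbook, sheet, next_free_row), creating the .xls if needed."""
    if not os.path.exists(path):
        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet('demo')
        sheet.write(0, 0, "评论")       # review text
        sheet.write(0, 1, "字数")       # character count
        sheet.write(0, 2, "评论类型")   # review type (good/bad)
        return workbook, sheet, 1
    existing = xlrd.open_workbook(path)           # read-only view of the data
    next_row = existing.sheet_by_index(0).nrows   # first row without data
    workbook = copy(existing)                     # writable copy (xlutils)
    return workbook, workbook.get_sheet(0), next_row

wb, sheet, r = open_for_append('E://demo.xls')
sheet.write(r, 0, "不错")   # sample review text
sheet.write(r, 1, 2)        # its character count
sheet.write(r, 2, "好评")   # its type
wb.save('E://demo.xls')     # xlwt cannot append, so this overwrites the file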

Complete code

import os
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from xlutils.copy import copy
def bad(driver):
    """Scrape the bad reviews ("差评") and write them to E://demo.xls."""
    time.sleep(2)
    wait = WebDriverWait(driver, 10)
    # Open the "商品评价" (product reviews) tab, then the "差评" (bad reviews) filter.
    wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="detail"]/div[1]/ul/li[5]'))).click()
    wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="comment"]/div[2]/div[2]/div[1]/ul/li[7]/a'))).click()
    time.sleep(4)
    col = 0
    while col < 150:
        # Bad reviews sit in div#comment-6; each review is a div.comment-item.
        soup = BeautifulSoup(driver.page_source, features="lxml")
        div = soup.find("div", id="comment-6")
        items = div.find_all("div", class_="comment-item")
        row = 0
        if not os.path.exists('E://demo.xls'):
            # First run: create the workbook and write the header row.
            workbook = xlwt.Workbook()
            data_sheet = workbook.add_sheet('demo')
            data_sheet.write(col, row, "评论")        # review text
            data_sheet.write(col, row + 1, "字数")    # character count
            data_sheet.write(col, row + 2, "评论类型")  # review type
        else:
            # xlwt cannot append, so copy the existing file and overwrite it on save.
            excel = xlrd.open_workbook('E://demo.xls')
            workbook = copy(excel)
            data_sheet = workbook.get_sheet(0)
        for w in items:
            hh = w.find("p", class_="comment-con")
            try:
                col = col + 1
                data_sheet.write(col, row, hh.string)
                data_sheet.write(col, row + 1, len(hh.string))
                data_sheet.write(col, row + 2, "差评")
                if col == 150:  # stop after 150 bad reviews
                    break
            except Exception:
                print('write error')  # e.g. hh.string is None for image-only reviews
        print("*" + str(col) + "*")
        workbook.save('E://demo.xls')
        # Two pagers exist; dd[0] belongs to the hidden "全部评价" tab, dd[1] to this filter.
        dd = driver.find_elements(By.CSS_SELECTOR, 'a.ui-pager-next')
        # Scroll so the "下一页" (next page) link is not covered, or the click fails.
        driver.execute_script('window.scrollTo(2200,2500);')
        print("pager count: " + str(len(dd)))
        time.sleep(5)
        dd[1].click()
        time.sleep(2)

def good(driver):
    """Scrape the good reviews ("好评") and append them after the bad ones."""
    time.sleep(2)
    wait = WebDriverWait(driver, 10)
    # Open the "商品评价" tab, then the "好评" (good reviews) filter.
    wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="detail"]/div[1]/ul/li[5]'))).click()
    time.sleep(3)
    wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="comment"]/div[2]/div[2]/div[1]/ul/li[5]/a'))).click()
    time.sleep(4)
    col = 150  # bad() filled rows 1-150; good reviews go in rows 151-300
    while col < 300:
        # Good reviews sit in div#comment-4.
        soup = BeautifulSoup(driver.page_source, features="lxml")
        div = soup.find("div", id="comment-4")
        items = div.find_all("div", class_="comment-item")
        row = 0
        # The file already exists (created by bad()), so always copy-and-overwrite.
        excel = xlrd.open_workbook('E://demo.xls')
        workbook = copy(excel)
        data_sheet = workbook.get_sheet(0)
        for w in items:
            hh = w.find("p", class_="comment-con")
            try:
                col = col + 1
                data_sheet.write(col, row, hh.string)
                data_sheet.write(col, row + 1, len(hh.string))
                data_sheet.write(col, row + 2, "好评")
                if col == 300:  # stop after 150 good reviews
                    break
            except Exception:
                print('write error')
        print("*" + str(col) + "*")
        workbook.save('E://demo.xls')
        dd = driver.find_elements(By.CSS_SELECTOR, 'a.ui-pager-next')
        driver.execute_script('window.scrollTo(2200,2500);')
        print("pager count: " + str(len(dd)))
        time.sleep(5)
        dd[1].click()
        time.sleep(2)

if __name__ == "__main__":
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get('https://item.jd.com/5438177.html')
    try:
        bad(driver)
        good(driver)
    finally:
        driver.quit()  # always close the browser, even if scraping fails

A few things to note

  • xlwt cannot append to an existing Excel file; the only way is to copy the workbook (xlutils.copy) and then save over the original (see the sketch below).
  • With the browser maximized, if the "下一页" (next page) link is covered by another element the click fails with an "element not clickable" error, so scroll first (see the sketch below).
  • Clicking the "商品评价" button loads "全部评价" (all reviews) by default, so driver.find_elements(By.CSS_SELECTOR, 'a.ui-pager-next') returns a list in which dd[0] is the "下一页" link of the "全部评价" tab and dd[1] is the one for the active filter.
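
The last two points suggest a more defensive pager click than the fixed dd[1] index used above: scroll the link into view and pick whichever "下一页" link is actually visible. A sketch of that alternative (click_next_page is a hypothetical helper, not from the original script):

from selenium.webdriver.common.by import By

def click_next_page(driver):
    """Scroll to a visible "下一页" (next page) link and click it."""
    for link in driver.find_elements(By.CSS_SELECTOR, 'a.ui-pager-next'):
        if link.is_displayed():  # skip the pager of the hidden "全部评价" tab
            # scrollIntoView avoids the "element not clickable" error
            driver.execute_script('arguments[0].scrollIntoView(false);', link)
            link.click()
            return True
    return False  # no visible pager: probably the last page

Selecting by is_displayed() keeps working even if the number or order of pagers on the page changes.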
