分别爬取京东和淘宝手机频道,找出累计销量(所有商家销售同一型号手机的销量之和)最高的20款手机。
说明 :
python3.7
selenium
Google Chrome开发版(启动selenium后自动弹出)
利用网页的xpath进行爬取,边爬取边处理数据,共爬取37页,每页60部手机,写入“jingdong_rowdata.csv”。
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import ElementNotInteractableException
import pandas as pd
import time
# Pause, in seconds, inserted between browser actions.
delay_time = 0.5
chrome_options = Options()
# Launch the automated Chrome session. The ``options`` keyword is used because
# ``chrome_options`` is deprecated and rejected by recent Selenium 4 releases.
driver = webdriver.Chrome(options=chrome_options)
phone_type = []  # phone model names
phone_price = []  # phone prices
phone_comment = []  # phone comment counts
url_phone = []  # links of all phones on one search page
webdriver用于启动模拟浏览器
options用于控制模拟浏览器
各种exceptions用于应对各种反爬机制
pandas存储数据
time用于计时
1.登录京东手机频道,按评论数排序,保存网页地址
2.点击进入具体手机型号界面,选择最低配置
3.选择商品评价—>只看当前商品评价
4.获取价格,评论数,店铺名,手机型号等各项信息,存入“jingdong_rowdata.csv”
京东商城网站无需登录即可访问。
比较坑的地方是:京东一个页面的60部手机不会一次性显示出来,需要先将模拟页面依次滚动到3/4处和5/6处并等待加载,才能显示完整页面。
# Walk the JD phone search results one page at a time.
# range(1, 37) runs 36 iterations; iteration i requests page=2*i-1, i.e. only
# odd "page" values are requested.
for i in range(1, 37):
    url1 = (
        "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8"
        "&qrst=1&rt=1&stop=1&vt=2&psort=4"
        f"&page={i * 2 - 1}"
    )
    driver.get(url1)
    time.sleep(delay_time)
    # Scroll down in two steps (3/4, then 5/6 of the page height), pausing
    # after each, so that the hidden part of the listing gets displayed.
    for scroll_script in (
        "window.scrollTo(0, 3 * document.body.scrollHeight / 4);",
        "window.scrollTo(0, 5 * document.body.scrollHeight / 6);",
    ):
        driver.execute_script(scroll_script)
        time.sleep(3 * delay_time)
Chrome开发版下检查手机名称栏,获取xpath
复制三个手机的xpath,观察其规律
可以发现,进入手机具体界面的xpath为
/html/body/div[6]/div[2]/div[2]/div[1]/div/div[2]/ul/li[*]/div/div[4]/a
一个页面,60部手机的页面地址一次获取
# NOTE(review): presumably this fragment runs once per search-results page,
# while the driver is still showing that page -- confirm against the full script.
#
# Rebuild the shared link list in place for the current page. url_phone is
# module-level state; if it were only appended to, the visit loop below would
# keep reading the first 60 entries (the first page) on every later page.
url_phone[:] = [
    driver.find_element_by_xpath(
        f"/html/body/div[6]/div[2]/div[2]/div[1]/div/div[2]/ul/li[{j}]/div/div[4]/a"
    ).get_attribute('href')
    for j in range(1, 61)  # sixty phones per page, collected in one pass
]
for url in url_phone:  # visit each product page in turn
    time.sleep(delay_time)
    driver.get(url)
    time.sleep(delay_time)
    # click_info() is the main scraping routine; it returns the model name,
    # serial number, RAM, storage and comment count of the opened product.
    type_phone, no_phone, memory_phone, storage_phone, comment_phone = click_info()
    time.sleep(delay_time)
def click_info():  # click the product-information tab
    """Scrape the spec fields of the product page currently open in ``driver``.

    First obtains the comment count via ``click_look_current_comment()``,
    then clicks the first tab item and reads four entries of the spec list,
    stripping their Chinese field labels. The model name is also appended to
    the module-level ``phone_type`` list.

    Returns:
        A 5-tuple of strings: (model name, serial number, RAM, storage,
        comment count). The serial number is padded with one space on
        each side.
    """
    time.sleep(delay_time)
    comments = click_look_current_comment()
    time.sleep(delay_time)
    driver.find_element_by_xpath("/html/body/*/div[2]/div[1]/div[1]/ul/li[1]").click()

    def spec_text(position):
        # Text of the ``position``-th entry in the spec list.
        return driver.find_element_by_xpath(
            "/html/body/*/div[2]/div[1]/div[2]/div[1]/div[1]/ul[3]/li[" + str(position) + "]"
        ).text

    # The list positions (1, 6, 2, 7) are hard-coded for the page layout.
    model = spec_text(1).replace("商品名称:", "")
    ram = spec_text(6).replace("运行内存:", "")
    serial = (" " + spec_text(2) + " ").replace("手机编号:", "")
    storage = spec_text(7).replace("机身存储:", "")

    phone_type.append(model)
    return model, serial, ram, storage, comments
def click_comment():  # click the product-comments tab
    """Select the lowest configuration, then click the fifth tab item."""
    click_lowest_settings()
    comment_tab = driver.find_element_by_xpath("/html/body/*/div[2]/div[1]/div[1]/ul/li[5]")
    comment_tab.click()
def click_look_current_comment():  # "current product only" filter + comment count
    """Click the "current product only" label and return the comment count text.

    Returns:
        The text of the first filter entry's counter with every "(", "+"
        and ")" character removed. The value is returned as a string.
    """
    filter_list = "/html/body/*/div[2]/div[4]/div[2]/div[2]/div[1]/ul"
    driver.find_element_by_xpath(filter_list + "/li[9]/label").click()
    raw_count = driver.find_element_by_xpath(filter_list + "/li[1]/a/em").text
    for symbol in "(+)":
        raw_count = raw_count.replace(symbol, "")
    return raw_count
def click_lowest_settings():  # choose the lowest configuration
    """Click the first option in each of the first two option rows.

    The second option is looked up only after a pause of ``2 * delay_time``
    seconds following the first click.
    """
    option_xpath = "/html/body/div[6]/div/div[2]/*/div[7]/div[{row}]/div[2]/div[1]/a"
    driver.find_element_by_xpath(option_xpath.format(row=1)).click()
    time.sleep(2 * delay_time)
    driver.find_element_by_xpath(option_xpath.format(row=2)).click()
# Read the price shown on the product page and convert it to a number.
price = float(
    driver.find_element_by_xpath("/html/body/div[6]/div/div[2]/*/div/div[1]/div[2]/span[1]/span[2]").text
)
# Only products with both a RAM and a storage value are recorded.
if memory_phone and storage_phone:
    item = [(type_phone, no_phone, memory_phone, storage_phone, price, comment_phone)]
    # Echo the record on one line, fields separated by single spaces.
    print(*item[0])
    keys = ['手机型号', '手机编号', '手机内存', '机身存储', '手机价格', '手机评论']
    df = pd.DataFrame.from_records(item, columns=keys)
    # Append the single row to the CSV file; no header row is written.
    df.to_csv('jingdong.csv', mode='a', index=False, encoding='GBK', header=False)
在完整代码中,加入了应对反爬虫的机制:如果遇到页面不可见,则采取有限次数的刷新,直到页面可见为止;此外还加入了数据处理部分。
GitHub源代码