There are many Python libraries for web scraping; here we use the Selenium browser-automation library and import its webdriver package.
Libraries used: selenium, time, re.
You also need the browser-automation driver ChromeDriver installed in advance.
Pick the release that most closely matches your Chrome version.
Download: ChromeDriver
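The walkthrough below instantiates the driver in the Selenium 3 style, webdriver.Chrome('chromedriver.exe'). If you are on Selenium 4, that positional path argument was removed; a minimal sketch of the equivalent setup:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4: the driver path goes through a Service object
drive = webdriver.Chrome(service=Service('chromedriver.exe'))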
1. Enter a keyword to search

if __name__ == '__main__':
    kw = input('Enter the product name to search for: ')
    drive = webdriver.Chrome('chromedriver.exe')
    drive.get('https://www.taobao.com/')
    input_div = drive.find_element_by_id('q')   # Taobao's search box has id="q"
    input_div.send_keys(kw)
    infolist = []   # scraped records accumulate here
2. Find the number of result pages to scrape

def drive_():
    # click the search button, then leave generous time for the results page
    # (and any manual login / slider verification) to finish loading
    drive.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
    time.sleep(15)
    reg = re.compile(r'(\d+)')   # raw string avoids the invalid-escape warning
    total_page = drive.find_element_by_xpath('//div[@class="inner clearfix"]/div[@class="total"]').text
    pagenum = int(reg.search(total_page).group(1))
    return pagenum
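As a quick check of the regex, assuming the total element's text renders like '共 100 页,' (this sample string is an assumption, not captured from the live site), the pattern pulls out the first run of digits:

import re

reg = re.compile(r'(\d+)')
print(int(reg.search('共 100 页,').group(1)))   # -> 100, the page count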
3. Taobao's result page is rendered dynamically: it must be scrolled down before the lower items load, otherwise that data never appears in the DOM.
We implement the scrolling with JavaScript:

def drop_down():
    # scroll in stages (0.1, 0.3, 0.5, 0.7, 0.9 of the page height),
    # pausing between steps so lazily-loaded items have time to render
    for i in range(1, 11, 2):
        time.sleep(0.5)
        j = i / 10
        js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
        drive.execute_script(js)
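The fixed 0.1 to 0.9 fractions assume the page height is settled. If you would rather not guess, a common alternative (a sketch of my own, not from the original code) keeps scrolling until scrollHeight stops growing:

def drop_down_until_stable(pause=0.5):
    # repeat scroll-to-bottom until the page height stops changing
    last_height = drive.execute_script('return document.documentElement.scrollHeight')
    while True:
        drive.execute_script('window.scrollTo(0, document.documentElement.scrollHeight)')
        time.sleep(pause)
        new_height = drive.execute_script('return document.documentElement.scrollHeight')
        if new_height == last_height:
            break
        last_height = new_height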
4. Scrape the data

def get_product():
    divs = drive.find_elements_by_xpath('//div[@class="items"]/div[@class="item J_MouserOnverReq "]')
    for div in divs:
        # the leading .// is essential: it scopes each query to this div's descendants
        title = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').text
        img = div.find_element_by_xpath('.//div[@class="pic"]/a/img').get_attribute('src')
        price = div.find_element_by_xpath('.//div[@class="price g_price g_price-highlight"]/strong').text
        sale = div.find_element_by_xpath('.//div[@class="deal-cnt"]').text
        shoplocation = div.find_element_by_xpath('.//div[@class="row row-3 g-clearfix"]/div[@class="location"]').text
        shopname = div.find_element_by_xpath('.//div[@class="shop"]/a/span[2]').text
        info = {'title': title, 'img': img, 'price': price, 'sale': sale,
                'shoplocation': shoplocation, 'shopname': shopname}
        infolist.append(info)
The extraction step relies on XPath, so some scraping background helps. The one detail that trips people up is the leading .// in each per-item query, as illustrated in the fragment below.
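To see what goes wrong without the dot, compare the two queries (an illustrative fragment, meant to run inside the loop above): '//...' restarts from the document root and returns the same first match for every item, while './/...' stays inside the current div.

# WRONG: searches the whole document, so every div yields the first title on the page
title = div.find_element_by_xpath('//div[@class="row row-2 title"]/a').text
# RIGHT: searches only this div's descendants
title = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').text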
5. After the first page is scraped, turn the page and continue with the next one

def next_page(kw):
    page = drive_()
    drop_down()
    get_product()
    # Taobao paginates via the s offset parameter: 44 items per page,
    # so page i starts at s = (i - 1) * 44
    for i in range(2, page + 1):
        url = f'https://s.taobao.com/search?q={kw}&s={(i-1)*44}'
        drive.get(url)
        drive.implicitly_wait(10)
        drop_down()
        get_product()
Note:
1) For pagination, prefer the URL pattern; clicking the next-page button with drive.find_element_by_xpath(path).click() is more likely to trigger bot detection.
2) After each page is fetched, wait until it has finished rendering before scrolling down; on a slow connection, scrolling before every item has loaded silently drops data (see the explicit-wait sketch below).
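On the second point, implicitly_wait only applies to element lookups, not to rendering as a whole. An explicit wait states the condition directly; here is a sketch that blocks until at least one item container is present (the locator is adapted from get_product above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the first item container before scrolling
WebDriverWait(drive, 10).until(
    EC.presence_of_element_located((By.XPATH, '//div[@class="items"]/div'))
)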
6. Finally, call the pagination function from the program entry point to scrape the listed fields of every product in the search results
if __name__ == '__main__':
    kw = input('Enter the product name to search for: ')
    drive = webdriver.Chrome('chromedriver.exe')
    drive.get('https://www.taobao.com/')
    input_div = drive.find_element_by_id('q')
    input_div.send_keys(kw)
    infolist = []
    next_page(kw)
The scraped product records are stored as dictionaries in a list; you can also write the data to a local file, as sketched below.
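For instance, a minimal sketch that writes infolist to a CSV file with the standard library (the filename products.csv is arbitrary):

import csv

fieldnames = ['title', 'img', 'price', 'sale', 'shoplocation', 'shopname']
# utf-8-sig so Excel opens the Chinese text correctly
with open('products.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(infolist)   # one row per scraped product dict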
Complete code
from selenium import webdriver
import time
import re
def drive_():
    # click the search button, then leave generous time for the results page
    # (and any manual login / slider verification) to finish loading
    drive.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
    time.sleep(15)
    reg = re.compile(r'(\d+)')   # raw string avoids the invalid-escape warning
    total_page = drive.find_element_by_xpath('//div[@class="inner clearfix"]/div[@class="total"]').text
    pagenum = int(reg.search(total_page).group(1))
    return pagenum
def get_product():
    divs = drive.find_elements_by_xpath('//div[@class="items"]/div[@class="item J_MouserOnverReq "]')
    for div in divs:
        # the leading .// is essential: it scopes each query to this div's descendants
        title = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').text
        img = div.find_element_by_xpath('.//div[@class="pic"]/a/img').get_attribute('src')
        price = div.find_element_by_xpath('.//div[@class="price g_price g_price-highlight"]/strong').text
        sale = div.find_element_by_xpath('.//div[@class="deal-cnt"]').text
        shoplocation = div.find_element_by_xpath('.//div[@class="row row-3 g-clearfix"]/div[@class="location"]').text
        shopname = div.find_element_by_xpath('.//div[@class="shop"]/a/span[2]').text
        info = {'title': title, 'img': img, 'price': price, 'sale': sale,
                'shoplocation': shoplocation, 'shopname': shopname}
        infolist.append(info)
def drop_down():
    # scroll in stages (0.1, 0.3, 0.5, 0.7, 0.9 of the page height),
    # pausing between steps so lazily-loaded items have time to render
    for i in range(1, 11, 2):
        time.sleep(0.5)
        j = i / 10
        js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
        drive.execute_script(js)
def next_page(kw):
    page = drive_()
    drop_down()
    get_product()
    # Taobao paginates via the s offset parameter: 44 items per page,
    # so page i starts at s = (i - 1) * 44
    for i in range(2, page + 1):
        url = f'https://s.taobao.com/search?q={kw}&s={(i-1)*44}'
        drive.get(url)
        drive.implicitly_wait(10)
        drop_down()
        get_product()
if __name__ == '__main__':
    kw = input('Enter the product name to search for: ')
    drive = webdriver.Chrome('chromedriver.exe')
    drive.get('https://www.taobao.com/')
    input_div = drive.find_element_by_id('q')
    input_div.send_keys(kw)
    infolist = []
    next_page(kw)
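One version caveat: the find_element_by_* helpers used throughout were removed in Selenium 4. If the script raises AttributeError on them, the calls translate as in this sketch:

from selenium.webdriver.common.by import By

input_div = drive.find_element(By.ID, 'q')
divs = drive.find_elements(By.XPATH, '//div[@class="items"]/div[@class="item J_MouserOnverReq "]')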