Our goal is to collect the links under each sub-category of the site. These sub-menus are only loaded dynamically when the mouse hovers over the horizontal navigation bar at the top of the page, so we use Selenium to simulate hovering over each navigation item and then read the sub-menu that gets rendered. This mainly involves element operations through the Selenium driver, as sketched below.
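Before the full script, here is a minimal sketch of the hover pattern, assuming generic placeholder selectors (the real Zalando selectors appear in the full code below):

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

# Minimal sketch: hover over each top-level menu item so its sub-menu is
# rendered, then read the links that appear in the DOM. 'nav li' and
# 'nav li a' are placeholder selectors, not the real Zalando ones.
driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get('https://www.zalando.de/damen-home/')
actions = ActionChains(driver)
for item in driver.find_elements_by_css_selector('nav li'):
    actions.move_to_element(item).perform()   # simulate the mouse hover
    for a in driver.find_elements_by_css_selector('nav li a'):
        print(a.get_attribute('href'))
driver.quit()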
And here is the full source for this step:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import os, time

CHROME_DRIVER_PATH = 'chromedriver.exe'  # placeholder; point this at your chromedriver binary

# Load the dynamic page and return the sub-category links
def get_dynamic_htmlNavLink(site_url):
    print('start loading dynamic page', site_url)
    chrome_options = webdriver.ChromeOptions()
    # disable the sandbox
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # uncomment to run headless
    #chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, chrome_options=chrome_options)
    #print('dynamic load web is', site_url)
    driver.set_page_load_timeout(100)
    #driver.set_script_timeout(100)
    try:
        driver.get(site_url)
    except Exception as e:
        driver.execute_script('window.stop()')  # stop loading once the timeout is exceeded
        print(e, 'dynamic web load timeout')
    action = ActionChains(driver)
    women_nav_tag = driver.find_element_by_css_selector('.z-navicat-header_categoryList')
    nav_tag_list = women_nav_tag.find_elements_by_css_selector('li')
    cate_list = []
    for tag in nav_tag_list:
        print(tag.text)
        # hover over the top-level item so its sub-menu gets rendered
        action.move_to_element(tag).perform()
        time.sleep(5)
        a_tag_list = driver.find_elements_by_css_selector('a.z-navicat-header_subCategoryLink')
        for a_tag in a_tag_list:
            href = a_tag.get_attribute('href')
            if href:
                print(href)
                cate_list.append(href)
    try:
        driver.quit()
    except:
        pass
    return cate_list

site_url = 'https://www.zalando.de/damen-home/'
get_dynamic_htmlNavLink(site_url)
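One caveat about versions: the code above uses the Selenium 3 style API (find_element_by_css_selector, executable_path=, chrome_options=). Those calls were deprecated and later removed in Selenium 4, so on a newer Selenium the equivalents look roughly like this (the chromedriver path is a placeholder):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# Selenium 4 equivalents of the calls used above
driver = webdriver.Chrome(service=Service('chromedriver.exe'), options=webdriver.ChromeOptions())
nav_tag = driver.find_element(By.CSS_SELECTOR, '.z-navicat-header_categoryList')
sub_links = driver.find_elements(By.CSS_SELECTOR, 'a.z-navicat-header_subCategoryLink')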
Finally, here is the full script that scrapes every product under all of the site's sub-categories; it includes the dynamic sub-category link collection shown above.
import requests, random, os, xlwt, math, time, re
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

CHROME_DRIVER_PATH = 'D:\\Code\\imgageRecognition\\site_scrapy\\chromedriver.exe'
save_path = 'C:\\Users\\SHEIN\\Desktop\\zalando\\'
# Fetch a static page and return a BeautifulSoup object
def get_static_html(site_url):
    print('start loading page', site_url)
    headers_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ]
    headers = {
        'user-agent': random.choice(headers_list),  # rotate user agents
        'Connection': 'keep-alive'
    }
    try:
        resp = requests.get(site_url, headers=headers)
    except Exception as inst:
        print(inst)
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(site_url, headers=headers, verify=False)  # retry without SSL verification
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup
# Save the html content to a local file
def download_html(content, html_path):
    dir_path = os.path.dirname(html_path)
    if dir_path and not os.path.exists(dir_path):  # create the containing folder if needed
        os.makedirs(dir_path)
    print('download html file path is:', '{}.html'.format(html_path))
    try:
        with open('{}.html'.format(html_path), 'w+', encoding="utf-8") as f:
            f.write(content)
    except Exception as e:
        print(e)
# Load the dynamic page and return the sub-category links
def get_dynamic_htmlNavLink(site_url):
    print('start loading dynamic page', site_url)
    chrome_options = webdriver.ChromeOptions()
    # disable the sandbox
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # uncomment to run headless
    #chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, chrome_options=chrome_options)
    #print('dynamic load web is', site_url)
    driver.set_page_load_timeout(100)
    #driver.set_script_timeout(100)
    try:
        driver.get(site_url)
    except Exception as e:
        driver.execute_script('window.stop()')  # stop loading once the timeout is exceeded
        print(e, 'dynamic web load timeout')
    action = ActionChains(driver)
    women_nav_tag = driver.find_element_by_css_selector('.z-navicat-header_categoryList')
    nav_tag_list = women_nav_tag.find_elements_by_css_selector('li')
    cate_list = []
    for tag in nav_tag_list:
        print(tag.text)
        # hover over the top-level item so its sub-menu gets rendered
        action.move_to_element(tag).perform()
        time.sleep(5)
        a_tag_list = driver.find_elements_by_css_selector('a.z-navicat-header_subCategoryLink')
        for a_tag in a_tag_list:
            href = a_tag.get_attribute('href')
            if href:
                print(href)
                cate_list.append(href)
    try:
        driver.quit()
    except:
        pass
    return cate_list
# Write the scraped rows to an .xls file
def exportTask(heads, task_done, path, filename):
    if not os.path.exists(path):
        os.makedirs(path)
    task_xls = xlwt.Workbook(encoding='utf-8')
    task_sheet1 = task_xls.add_sheet('sheet1')
    # header row, centered
    header_align = xlwt.Alignment()
    header_align.horz = xlwt.Alignment.HORZ_CENTER
    header_style = xlwt.XFStyle()
    header_style.alignment = header_align
    for i in range(len(heads)):
        task_sheet1.col(i).width = 12000
        task_sheet1.write(0, i, heads[i], header_style)
    # data rows
    for i in range(len(task_done)):
        for j in range(len(heads)):
            task_sheet1.write(i + 1, j, task_done[i][heads[j]])
    filename = "{0}.xls".format(filename.replace(':', '-'))
    print(os.path.join(path, filename))
    task_xls.save(os.path.join(path, filename))
    return filename
# Get the total page count for a category
def getTotalPageNums(url):
    soup = get_static_html(url)
    # 1. does the category exist?
    if len(soup.select('.cat_main-1dxBH')) == 0:
        return 0, ''
    cate = soup.select('.cat_main-1dxBH')[0].text
    # 2. is the listing paginated?
    exist_pagebean = soup.select('.cat_label-2W3Y8')
    if len(exist_pagebean) == 0:  # no pagination, single page
        return 1, cate
    page_tag = exist_pagebean[0]
    page_msg = page_tag.text
    page_num = int(page_msg.split(' ')[3])
    if '?' in url:
        url_page = '{0}&p={1}'.format(url, page_num)
    else:
        url_page = '{0}?p={1}'.format(url, page_num)
    soup2 = get_static_html(url_page)
    page_tag2 = soup2.select('.cat_label-2W3Y8')[0]
    page_msg2 = page_tag2.text
    page_num2 = int(page_msg2.split(' ')[1])
    print(page_num2, cate)
    return page_num2, cate
# Get all product info on one page of a category
def getInfoFromSoup(cate_url, url):
    soup = get_static_html(url)
    #print(soup.prettify())
    if len(soup.select('.cat_articleContain-1Z60A')) == 0:  # no products on this page
        print('no products found')
        return []
    else:
        cate = soup.select('.cat_main-1dxBH')[0].text
        info_list = []
        for tag in soup.select('.cat_articleContain-1Z60A'):
            info = {'cate_url': cate_url, 'cate': cate}
            link_tag = tag.select('.cat_infoDetail--ePcG')[0]
            info['product link'] = 'https://www.zalando.de' + link_tag.attrs['href']
            desc_brand_tag = tag.select('.cat_brandName-2XZRz')[0]
            desc_article_tag = tag.select('.cat_articleName--arFp')[0]
            info['desc'] = desc_brand_tag.text + '- ' + desc_article_tag.text
            price_tag = tag.select('.cat_originalPrice-2Oy4G')[0]
            info['price'] = price_tag.text
            pattern = re.compile(r'([0-9 ,]+)')
            price_num_str = pattern.findall(info['price'])
            if len(price_num_str) == 0:
                info['price_num'] = 0
            else:
                info['price_num'] = float(price_num_str[0].replace(',', '.'))
                info['price'] = price_num_str[0] + ' €'
            #pattern = re.compile(r'[produkt]\/([0-9 a-z \-]+)\/#image')
            #pattern = re.compile(r'[produkt]\/[a-z 0-9 \-]+\-([0-9]+)\/')
            #info_id = pattern.findall(info['product link'])
            #print(info['product link'])
            info['product_id'] = link_tag.attrs['href']
            info_list.append(info)
            #print(info)
        return info_list
# Collect the category links from the home page and save them to a text file
def getCateUrl(path):
    url = 'https://www.zalando.de/damen-home/'
    cate_list = get_dynamic_htmlNavLink(url)
    if not os.path.exists(path):
        os.mkdir(path)
    try:
        with open(path + 'all_cate_link.txt', 'a+', encoding="utf-8") as f:
            for cate_url in cate_list:
                f.write(cate_url + '\n')
    except Exception as e:
        print(e)
# Scrape one category (all pages) and export the products to an Excel file
def dowloadExcelByCate(cate_url, path, num):
    pagenum, cate = getTotalPageNums(cate_url)
    info_list = []
    if pagenum > 0:
        for i in range(1, pagenum + 1):
            if '?' in cate_url:
                url_page = '{0}&p={1}'.format(cate_url, i)
            else:
                url_page = '{0}?p={1}'.format(cate_url, i)
            info_list += getInfoFromSoup(cate_url, url_page)
            time.sleep(5)
    heads = ['cate_url', 'cate', 'desc', 'price', 'price_num', 'product_id', 'product link']
    filename = '{0}-{1}'.format(num, cate)
    exportTask(heads, info_list, path, filename)
    # record this category as done so it can be skipped on the next run
    try:
        with open(path + 'record.txt', 'a+', encoding="utf-8") as f:
            f.write(cate_url + '\n')
    except Exception as e:
        print(e)
# Read the list of already scraped category links
def getDoneUrl(path, file_name):
    done_url = []
    with open(os.path.join(path, file_name), 'r', encoding="utf-8") as f:
        url_list = f.readlines()
        for url in url_list:
            done_url.append(url.rstrip('\n'))
    print(done_url)
    return done_url
# Merge all per-category .xls files into a single Excel file
def connectToOne(dir, to_dir):
    excel_list = []
    for file in os.listdir(dir):
        if file.endswith('.xls'):
            print("file:", file)
            excel_list.append(pd.read_excel(os.path.join(dir, file)))
    print('start merging')
    total_excel = pd.concat(excel_list)
    print('writing merged file')
    total_excel.to_excel(os.path.join(to_dir, 'asos.xlsx'), index=False)
if __name__ == '__main__':
    # Run getCateUrl(save_path) once first so that all_cate_link.txt exists,
    # and make sure record.txt exists (it may start empty) before the first run.
    cate_url_list = getDoneUrl(save_path, 'all_cate_link.txt')
    done_url = getDoneUrl(save_path, 'record.txt')
    for i in range(len(cate_url_list)):
        if cate_url_list[i] not in done_url:
            dowloadExcelByCate(cate_url_list[i], save_path, i + 1)
    #getCateUrl(save_path)
    #cate_url = 'https://www.zalando.de/beauty-damen/bobbi-brown/?product_group=beauty&order=activation_date'
    #getTotalPageNums(cate_url)
    #getInfoFromSoup(cate_url,cate_url)
    #dowloadExcelByCate(cate_url, save_path, 1)
    #connectToOne(save_path,'C:\\Users\\SHEIN\\Desktop')
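As a quick sanity check of the price parsing in getInfoFromSoup: the regex only grabs the leading run of digits, spaces and commas, and the comma is swapped for a dot before converting to float. Assuming a typical price string such as '39,95 €':

import re

pattern = re.compile(r'([0-9 ,]+)')
matches = pattern.findall('39,95 €')             # -> ['39,95 ']
price_num = float(matches[0].replace(',', '.'))  # -> 39.95
print(price_num)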