爬虫-获取鼠标点击或者移动到指定位置才能获得的动态加载数据

测试网站 https://www.zalando.de/damen-home/  一家电子商城网站

爬虫-获取鼠标点击或者移动到指定位置才能获得的动态加载数据_第1张图片

我们的目的是爬取各个子分类下面的链接。

这些数据只有当鼠标移动到页面上方的横向导航栏菜单时,才会动态加载出对应的子菜单。

主体思路

使用 selenium 模拟将鼠标移动到横向导航栏上,然后立刻获取下方动态加载出来的数据。

其中会涉及基于 driver 的网页元素操作(ActionChains)。

下面附上源码:

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import os,time



# Load the dynamically rendered page and return the sub-category links.
def get_dynamic_htmlNavLink(site_url):
    """Hover over each navigation entry and collect the sub-category links.

    Opens ``site_url`` in Chrome, moves the mouse over every ``li`` inside the
    ``.z-navicat-header_categoryList`` element, waits for the sub-menu to be
    rendered and gathers the ``href`` of every
    ``a.z-navicat-header_subCategoryLink`` element present on the page.

    Args:
        site_url: URL of the page whose navigation bar is inspected.

    Returns:
        A list of link strings in the order they were found. The list may
        contain duplicates because all matching links on the page are
        collected again after each hover.

    Raises:
        Whatever selenium raises when the navigation container cannot be
        located; the browser is closed before the exception propagates.
    """
    print('开始加载', site_url, '动态页面')
    chrome_options = webdriver.ChromeOptions()
    # Disable the sandbox and the /dev/shm usage of Chrome.
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # Uncomment to run without a visible browser window:
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, chrome_options=chrome_options)
    cate_list = []
    try:
        driver.set_page_load_timeout(100)
        try:
            driver.get(site_url)
        except Exception as e:
            # Most likely the page-load timeout: stop loading and continue
            # with whatever has been rendered so far.
            driver.execute_script('window.stop()')
            print(e, 'dynamic web load timeout')
        nav_root = driver.find_element_by_css_selector('.z-navicat-header_categoryList')
        for nav_item in nav_root.find_elements_by_css_selector('li'):
            print(nav_item.text)
            # Build a fresh chain per hover: a reused chain may replay the
            # moves queued by earlier iterations on some selenium versions.
            ActionChains(driver).move_to_element(nav_item).perform()
            # Give the sub-menu time to be loaded into the DOM.
            time.sleep(5)
            for link in driver.find_elements_by_css_selector('a.z-navicat-header_subCategoryLink'):
                href = link.get_attribute('href')
                # get_attribute returns None when the attribute is missing,
                # so test for truthiness instead of comparing with ''.
                if href:
                    print(href)
                    cate_list.append(href)
    finally:
        # Always close the browser, even when an element lookup failed.
        try:
            driver.quit()
        except Exception as e:
            print(e, 'failed to close the browser')
    return cate_list


# get_dynamic_htmlNavLink reads this module-level constant; point it at the
# local chromedriver executable before running the example.
CHROME_DRIVER_PATH = 'D:\\Code\\imgageRecognition\\site_scrapy\\chromedriver.exe'
site_url = 'https://www.zalando.de/damen-home/'
get_dynamic_htmlNavLink(site_url)

最后附上爬取该网站所有子分类下全部商品数据的完整源码,其中包含了上面获取动态子分类链接的代码:

import  requests,random,os,xlwt,math,time,re,pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains


# Location of the chromedriver executable handed to selenium.
# Every backslash is escaped; the previous '\i' was an invalid escape sequence.
CHROME_DRIVER_PATH = 'D:\\Code\\imgageRecognition\\site_scrapy\\chromedriver.exe'
# Directory that holds the link lists, the progress record and the exported
# spreadsheets. The trailing separator is kept on purpose.
save_path = 'C:\\Users\\SHEIN\\Desktop\\zalando\\'



# Fetch a statically rendered page.
def get_static_html(site_url, timeout=60):
    """Download ``site_url`` and return it parsed as a BeautifulSoup tree.

    A user agent is picked at random from a small built-in list. When the
    first request fails, the error is printed and the request is retried once
    with certificate verification disabled.

    Args:
        site_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up on a
            request. Without a timeout a stalled connection blocks forever.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: if the retry fails as well.
    """
    print('开始加载', site_url, '页面')
    headers_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    ]
    headers = {
        # random.choice picks uniformly; the former randint(...) - 1 indexing
        # selected the last entry twice as often as the others.
        'user-agent': random.choice(headers_list),
        # 'keep-alive' is the valid token; 'keep - alive' is not.
        'Connection': 'keep-alive'
    }
    try:
        resp = requests.get(site_url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as inst:
        print(inst)
        # NOTE(security): the retry skips TLS certificate verification, so
        # the response must be treated as untrusted content.
        requests.packages.urllib3.disable_warnings()
        resp = requests.get(site_url, headers=headers, verify=False, timeout=timeout)
    soup = BeautifulSoup(resp.text, 'html.parser')
    return soup



# Save HTML content to '<html_path>.html'.
def download_html(content, html_path):
    """Write ``content`` to the file ``html_path`` + ``'.html'`` as UTF-8.

    The parent directory of the target file is created when it is missing.
    Failures while writing are printed and not re-raised.

    Args:
        content: Text to store.
        html_path: Target path without the ``.html`` suffix.
    """
    file_path = '{}.html'.format(html_path)
    # Ensure the directory that will contain the file exists. Creating a
    # directory named html_path itself (as before) left a stray empty folder
    # next to every saved page.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    print('download htmlfile path is:', file_path)
    try:
        # The context manager closes the file; no explicit close() is needed.
        with open(file_path, 'w', encoding="utf-8") as f:
            f.write(content)
    except Exception as e:
        # Best effort: report the problem and let the caller continue.
        print(e)


# Load the dynamically rendered page and return the sub-category links.
def get_dynamic_htmlNavLink(site_url):
    """Hover over each navigation entry and collect the sub-category links.

    Opens ``site_url`` in Chrome, moves the mouse over every ``li`` inside the
    ``.z-navicat-header_categoryList`` element, waits for the sub-menu to be
    rendered and gathers the ``href`` of every
    ``a.z-navicat-header_subCategoryLink`` element present on the page.

    Args:
        site_url: URL of the page whose navigation bar is inspected.

    Returns:
        A list of link strings in the order they were found. The list may
        contain duplicates because all matching links on the page are
        collected again after each hover.

    Raises:
        Whatever selenium raises when the navigation container cannot be
        located; the browser is closed before the exception propagates.
    """
    print('开始加载', site_url, '动态页面')
    chrome_options = webdriver.ChromeOptions()
    # Disable the sandbox and the /dev/shm usage of Chrome.
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # Uncomment to run without a visible browser window:
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, chrome_options=chrome_options)
    cate_list = []
    try:
        driver.set_page_load_timeout(100)
        try:
            driver.get(site_url)
        except Exception as e:
            # Most likely the page-load timeout: stop loading and continue
            # with whatever has been rendered so far.
            driver.execute_script('window.stop()')
            print(e, 'dynamic web load timeout')
        nav_root = driver.find_element_by_css_selector('.z-navicat-header_categoryList')
        for nav_item in nav_root.find_elements_by_css_selector('li'):
            print(nav_item.text)
            # Build a fresh chain per hover: a reused chain may replay the
            # moves queued by earlier iterations on some selenium versions.
            ActionChains(driver).move_to_element(nav_item).perform()
            # Give the sub-menu time to be loaded into the DOM.
            time.sleep(5)
            for link in driver.find_elements_by_css_selector('a.z-navicat-header_subCategoryLink'):
                href = link.get_attribute('href')
                # get_attribute returns None when the attribute is missing,
                # so test for truthiness instead of comparing with ''.
                if href:
                    print(href)
                    cate_list.append(href)
    finally:
        # Always close the browser, even when an element lookup failed.
        try:
            driver.quit()
        except Exception as e:
            print(e, 'failed to close the browser')
    return cate_list



# Export the collected records to an .xls spreadsheet.
def exportTask(heads,task_done,path,filename):
    """Write ``task_done`` into ``<path>/<filename>.xls`` and return the file name.

    Args:
        heads: Column titles; each title is also the key looked up in every
            record.
        task_done: Sequence of mappings, one per spreadsheet row.
        path: Output directory, created when it does not exist.
        filename: File name without extension; ``':'`` is replaced by ``'-'``.

    Returns:
        The final file name including the ``.xls`` extension.
    """
    if not os.path.exists(path):
        os.makedirs(path)

    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('sheet1')

    # Header row: horizontally centred titles in fixed-width columns.
    centered = xlwt.Alignment()
    centered.horz = xlwt.Alignment.HORZ_CENTER
    head_style = xlwt.XFStyle()
    head_style.alignment = centered
    for col, title in enumerate(heads):
        sheet.col(col).width = 12000
        sheet.write(0, col, title, head_style)

    # Data rows start directly below the header.
    for row, record in enumerate(task_done, start=1):
        for col, title in enumerate(heads):
            sheet.write(row, col, record[title])

    filename = "{0}.xls".format(filename.replace(':','-'))
    target = os.path.join(path, filename)
    print(target)
    workbook.save(target)
    return filename



# Determine how many result pages a category has.
def getTotalPageNums(url):
    """Return ``(page_count, category_name)`` for the category at ``url``.

    * ``(0, '')`` when the page has no ``.cat_main-1dxBH`` element.
    * ``(1, name)`` when it has no ``.cat_label-2W3Y8`` pager label.
    * Otherwise the fourth space-separated token of the pager label is taken
      as the page number to request, that page is downloaded, and the second
      token of its pager label is returned as the page count.
      NOTE(review): presumably the label reads like "Seite 1 von N" and the
      second request confirms the real last page; verify against the site.
    """
    first_page = get_static_html(url)

    # 1. Is there a category title at all?
    title_tags = first_page.select('.cat_main-1dxBH')
    if not title_tags:
        return 0, ''
    cate = title_tags[0].text

    # 2. Is there a pager?
    pager_tags = first_page.select('.cat_label-2W3Y8')
    if not pager_tags:
        return 1, cate

    announced_page = int(pager_tags[0].text.split(' ')[3])
    page_template = '{0}&p={1}' if '?' in url else '{0}?p={1}'
    last_page = get_static_html(page_template.format(url, announced_page))

    last_label = last_page.select('.cat_label-2W3Y8')[0].text
    total_pages = int(last_label.split(' ')[1])
    print(total_pages, cate)
    return total_pages, cate



# A German-formatted amount: either groups of three digits separated by '.',
# a space or a no-break space, or a plain run of digits; both forms accept an
# optional decimal part after ','. At least one digit is always required.
_PRICE_PATTERN = re.compile(r'\d{1,3}(?:[ .\u00a0]\d{3})+(?:,\d+)?|\d+(?:,\d+)?')


def _parse_price(price_text):
    """Return ``(display_price, numeric_price)`` extracted from ``price_text``."""
    match = _PRICE_PATTERN.search(price_text)
    if match is None:
        # No digits at all: keep the raw text and report 0 as the amount.
        return price_text, 0
    amount = match.group(0)
    # Drop thousands separators, then turn the decimal comma into a point.
    normalized = re.sub(r'[ .\u00a0]', '', amount).replace(',', '.')
    return amount + ' €', float(normalized)


# Collect the product records shown on one listing page of a category.
def getInfoFromSoup(cate_url,url):
    """Download ``url`` and return one dict per product found on the page.

    Args:
        cate_url: URL of the category; stored unchanged in every record.
        url: URL of the concrete listing page to download.

    Returns:
        A list of dicts with the keys ``cate_url``, ``cate``,
        ``product link``, ``desc``, ``price``, ``price_num`` and
        ``product_id``. The list is empty when the page has no
        ``.cat_articleContain-1Z60A`` elements.

    Raises:
        IndexError: if the category title or one of the expected
            sub-elements of a product is missing.
    """
    soup = get_static_html(url)
    articles = soup.select('.cat_articleContain-1Z60A')
    if not articles:
        # The page does not list a single product.
        print('没有商品')
        return []

    cate = soup.select('.cat_main-1dxBH')[0].text
    info_list = []
    for article in articles:
        href = article.select('.cat_infoDetail--ePcG')[0].attrs['href']
        brand = article.select('.cat_brandName-2XZRz')[0].text
        article_name = article.select('.cat_articleName--arFp')[0].text
        raw_price = article.select('.cat_originalPrice-2Oy4G')[0].text
        price, price_num = _parse_price(raw_price)
        info_list.append({
            'cate_url': cate_url,
            'cate': cate,
            'product link': 'https://www.zalando.de' + href,
            'desc': brand + '- ' + article_name,
            'price': price,
            'price_num': price_num,
            # The relative product URL doubles as the product identifier.
            'product_id': href,
        })
    return info_list



# Collect the category links of the home page and store them in a text file.
def getCateUrl(path, url='https://www.zalando.de/damen-home/'):
    """Append every sub-category link of ``url`` to ``<path>/all_cate_link.txt``.

    Args:
        path: Directory for the link file; created (including missing
            parents) when it does not exist.
        url: Page whose navigation bar is scanned. Defaults to the Zalando
            women's home page.

    The file is opened in append mode, so running the function again adds the
    links a second time. Write errors are printed and not re-raised.
    """
    cate_list = get_dynamic_htmlNavLink(url)
    os.makedirs(path, exist_ok=True)
    try:
        # os.path.join keeps the file inside ``path`` even when ``path`` has
        # no trailing separator, matching how the file is read back.
        with open(os.path.join(path, 'all_cate_link.txt'), 'a', encoding="utf-8") as f:
            # Skip empty or missing links so that one bad entry cannot abort
            # the whole write.
            f.writelines(cate_url + '\n' for cate_url in cate_list if cate_url)
    except OSError as e:
        print(e)



# Export all products of one category to a spreadsheet.
def dowloadExcelByCate(cate_url,path,num):
    """Scrape every page of ``cate_url`` and save the products as an .xls file.

    The spreadsheet is named ``<num>-<category name>``. After the export the
    category URL is appended to ``<path>/record.txt`` so that it can be
    skipped on a later run. Nothing happens when the category reports zero
    pages.

    Args:
        cate_url: URL of the category to scrape.
        path: Output directory for the spreadsheet and ``record.txt``.
        num: Number used as prefix of the spreadsheet file name.
    """
    pagenum, cate = getTotalPageNums(cate_url)
    if pagenum <= 0:
        return

    # The page parameter is appended with '&' when the URL already carries a
    # query string, otherwise with '?'.
    page_template = '{0}&p={1}' if '?' in cate_url else '{0}?p={1}'
    info_list = []
    for page in range(1, pagenum + 1):
        info_list += getInfoFromSoup(cate_url, page_template.format(cate_url, page))
        # Pause between requests.
        time.sleep(5)

    heads = ['cate_url','cate','desc','price','price_num','product_id','product link']
    exportTask(heads, info_list, path, '{0}-{1}'.format(num, cate))
    try:
        # os.path.join keeps record.txt inside ``path`` even without a
        # trailing separator, matching where the spreadsheet is written.
        with open(os.path.join(path, 'record.txt'), 'a', encoding="utf-8") as f:
            f.write(cate_url + '\n')
    except OSError as e:
        print(e)



# Read a list of URLs (one per line) from a text file.
def getDoneUrl(path,file_name):
    """Return the lines of ``<path>/<file_name>`` without their newline.

    Args:
        path: Directory that contains the file.
        file_name: Name of the text file holding one URL per line.

    Returns:
        A list with one string per line. The list is empty when the file
        does not exist, e.g. on the first run before any category has been
        recorded.
    """
    done_url = []
    try:
        with open(os.path.join(path, file_name), 'r', encoding="utf-8") as f:
            done_url = [line.rstrip('\n') for line in f]
    except FileNotFoundError:
        # Nothing has been written yet; treat it as an empty list.
        pass
    print(done_url)
    return done_url



# Merge all exported spreadsheets into a single workbook.
def connectToOne(dir,to_dir):
    """Concatenate every ``.xls`` file in ``dir`` into ``<to_dir>/asos.xlsx``.

    Args:
        dir: Directory scanned for ``.xls`` files. The name shadows the
            builtin but is kept because it is part of the public signature.
        to_dir: Directory that receives the merged workbook.

    Nothing is written when ``dir`` contains no ``.xls`` file.
    """
    excel_list = []
    for name in os.listdir(dir):
        if name.endswith('.xls'):
            print("file:", name)
            excel_list.append(pd.read_excel(os.path.join(dir, name)))
    if not excel_list:
        # pd.concat raises ValueError on an empty list; report and stop.
        print('no .xls files found in', dir)
        return
    print('开始合并')
    total_excel = pd.concat(excel_list)
    print('生成文件')
    total_excel.to_excel(os.path.join(to_dir,'asos.xlsx'),index=False)



if __name__ == '__main__':
    # Workflow:
    #   1. getCateUrl(save_path) writes the category links to all_cate_link.txt.
    #   2. The loop below exports every category not yet listed in record.txt.
    #   3. connectToOne(save_path, <target dir>) merges the exported files.
    # Only step 2 is executed here; run the other steps manually when needed.
    all_links = getDoneUrl(save_path,'all_cate_link.txt')
    finished = set(getDoneUrl(save_path,'record.txt'))
    for position, link in enumerate(all_links, start=1):
        if link in finished:
            continue
        dowloadExcelByCate(link, save_path, position)

 

你可能感兴趣的:(scrapy)