Selenium:测试抓取京东数据练习[汽车用品-机油]

絮叨两句:
博主是一名软件工程系的在校生,利用博客记录自己所学的知识,也希望能帮助到正在学习的同学们
人的一生中会遇到各种各样的困难和折磨,逃避是解决不了问题的,唯有以乐观的精神去迎接生活的挑战
少年易老学难成,一寸光阴不可轻。
最喜欢的一句话:今日事,今日毕

系列文章目录

Selenium系列:


Selenium:测试抓取京东数据练习[汽车用品-机油]

文章目录

  • 系列文章目录
  • 前言
  • 分析
    • 步骤:
  • 代码:
  • 总结


前言

对Selenium进行练习操作


提示:以下是本篇文章正文内容,下面案例可供参考

分析

机油地址:https://list.jd.com/list.html?cat=6728,6742,11849
Selenium:测试抓取京东数据练习[汽车用品-机油]_第1张图片

步骤:

  1. 按F12打开开发者工具,点击其左上角的鼠标形状图标(元素选择器),再点击页面上你想要获取的位置,如下图:
    Selenium:测试抓取京东数据练习[汽车用品-机油]_第2张图片
  2. 由上图可以看到,有多个品牌,每个品牌中都有链接和名称
    我们可以通过链接得到每一个品牌下的商品
  3. Selenium:测试抓取京东数据练习[汽车用品-机油]_第3张图片
  4. 每个品牌有很多商品,一页展示不完就需要翻页,这也是我们要考虑的地方

代码:


from  bs4 import BeautifulSoup
import requests
import time as ti
import uuid
from selenium import webdriver
def getHTML(url, timeout=10):
    """Download ``url`` with a desktop-browser User-Agent and return its body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The response body as text (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.44'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def getJXHTML(html):
    """Visit every brand linked from the category page and scrape its products.

    The brand list is parsed out of ``html``; each brand page is then opened
    in a Chrome WebDriver, scrolled via ``windows`` and handed to
    ``getProduct``. When the page shows a ``p-skip`` pager, the "next" button
    is clicked until every page has been processed.

    Args:
        html: HTML text of the category listing page.

    Raises:
        AttributeError: If the brand container (``div.sl-v-logos`` /
            ``ul.J_valueList``) is not present in ``html``.
    """
    # Local import: only this function needs the locator constants.
    from selenium.webdriver.common.by import By

    # Parse the HTML that was passed in (not a module-level global).
    category_soup = BeautifulSoup(html, 'html.parser')
    brand_items = (
        category_soup.find('div', attrs={'class': 'sl-v-logos'})
        .find('ul', attrs={'class': 'J_valueList v-fixed'})
        .findAll('li')
    )

    bowser = webdriver.Chrome()
    try:
        for li in brand_items:
            anchor = li.find('a')
            brand_href = f"https://list.jd.com{anchor['href']}"
            brand_name = f"{anchor['title']}"
            print("品牌分类:----->", brand_name, brand_href)

            bowser.get(brand_href)
            windows(bowser)
            first_page = BeautifulSoup(bowser.page_source, 'html.parser')
            page_skip = first_page.find('span', attrs={'class': 'p-skip'})

            if page_skip is None:
                # No pager on the page: everything is on this single page.
                getProduct(first_page, brand_name=brand_name)
                continue

            # NOTE(review): the <b> inside span.p-skip is treated as the total
            # page count, as the original code did -- confirm against the site.
            total_pages = int(page_skip.find('b').text)
            print("--------------------------------------第一页---------------------------------")
            getProduct(first_page, brand_name=brand_name)

            # Page 1 is already handled, so "next" must be clicked
            # total_pages - 1 times to reach the last page.
            for _ in range(total_pages - 1):
                print("------------------------------下一页-------------------------------")
                bowser.find_element(By.CLASS_NAME, 'pn-next').click()
                windows(bowser)
                next_page = BeautifulSoup(bowser.page_source, 'html.parser')
                getProduct(next_page, brand_name=brand_name)
    finally:
        # Always shut the browser down, even if scraping fails midway.
        bowser.quit()



def getProduct(barn_soup,brand_name):
    """Scrape every product listed on one brand listing page.

    For each ``<li>`` under ``div#J_goodsList`` the product link and list
    price are extracted, then ``product_This`` is called to open the detail
    page and record the row. Items whose title contains a ``span.p-tag``
    badge are skipped, as in the original implementation.

    Args:
        barn_soup: Parsed listing page (BeautifulSoup object).
        brand_name: Brand label stored in the ``brand`` column of every row.
    """
    goods_list = barn_soup.find('div', attrs={'id': 'J_goodsList'})
    if goods_list is None:
        # Nothing to scrape; avoid starting a browser for an empty page.
        print("未找到商品列表,跳过该页")
        return

    bowser_there = webdriver.Chrome()
    try:
        for li in goods_list.findAll('li'):
            db = "INSERT INTO `xxuan_car_jd_mobil_product` VALUES (NULL,"
            # Key order defines the column order of the generated INSERT.
            sql = {
                'name': '',
                'brand': brand_name,
                'type': '',
                'originplace': '',
                'netweight': '',
                'price': '',
                'commodity_Name': '',
                'image': '',
                'viscosity': '',
                'volume': '',
            }

            # findAll('li') is recursive, so it may also return <li> elements
            # that are not product cards; skip those instead of crashing.
            name_div = li.find('div', attrs={'class': 'p-name p-name-type-3'})
            name_link = name_div.find('a') if name_div is not None else None
            title_em = name_link.find('em') if name_link is not None else None
            if title_em is None:
                continue
            if title_em.find('span', attrs={'class': 'p-tag'}) is not None:
                continue

            first_link = li.find('a')
            li_href = first_link.get('href') if first_link is not None else None
            if not li_href:
                continue

            # A missing price is passed as '' so it is stored as a missing
            # value rather than the literal text "None".
            price_div = li.find('div', attrs={'class': 'p-price'})
            price_tag = price_div.find('i') if price_div is not None else None
            li_price = price_tag.text if price_tag is not None else ''

            # Hrefs are usually protocol-relative ("//item.jd.com/..."); only
            # prepend the scheme when the href does not already start with one.
            if li_href.startswith(('http://', 'https://')):
                https_li_href = li_href
            else:
                https_li_href = f"https:{li_href}"
            print("商品链接:",https_li_href)
            product_This(db=db,sql=sql,https_li_href=https_li_href,li_price=li_price,bowser=bowser_there)
    finally:
        # One browser is opened per call; close it so processes do not pile up.
        bowser_there.quit()



def _sql_quote(value):
    """Return ``value`` as a single-quoted SQL string literal with escaping."""
    text = str(value).replace("\\", "\\\\").replace("'", "''")
    return f"'{text}'"


def product_This(db,sql,https_li_href,li_price,bowser,
                 output_path='D:\\数据\\京东\\sql\\xxuan_car_jd_mobil_product.txt'):
    """Open a product detail page, fill in ``sql`` and append an INSERT row.

    Args:
        db: Prefix of the INSERT statement, up to and including the first
            column value and its trailing comma.
        sql: Dict of column values; it is updated in place and its key order
            determines the column order of the statement.
        https_li_href: URL of the product detail page.
        li_price: Price text taken from the listing page.
        bowser: Selenium WebDriver used to load the page.
        output_path: File the statement is appended to (UTF-8). Defaults to
            the path the script has always used.

    Side effects:
        Navigates ``bowser``, mutates ``sql`` and appends one line to
        ``output_path``. Missing values are written as the quoted text
        ``'NULL'``, matching the original output format.
    """
    bowser.get(https_li_href)
    windows(bowser)
    produc_soup = BeautifulSoup(bowser.page_source, 'html.parser')

    # Product title.
    info_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    sku_name = None
    if info_wrap is not None:
        sku_name = info_wrap.find('div', attrs={'class': 'sku-name'})
    if sku_name is not None:
        sql['commodity_Name'] = str(sku_name.text).strip()

    sql['price'] = li_price

    # Product image: the URL is read from the protocol-relative
    # "data-origin" attribute of img#spec-img.
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    image_src = spec_img.get('data-origin') if spec_img is not None else None
    sql['image'] = f"https:{image_src}" if image_src else 'NULL'

    # Specification list: (label on the page, column it fills). The first
    # matching label wins; a column of None means "recognised but not stored".
    spec_fields = (
        ('商品名称:', 'name'),
        ('商品编号:', None),
        ('商品毛重:', 'netweight'),
        ('商品产地:', 'originplace'),
        ('粘度:', 'viscosity'),
        ('机油种类:', 'type'),
        ('容量:', 'volume'),
    )
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, column in spec_fields:
                if label in text:
                    if column is not None:
                        sql[column] = text.replace(label, '')
                    break

    # Build the VALUES list. Joining the quoted values avoids relying on a
    # particular key being last, and escaping keeps quotes in scraped text
    # from breaking the statement.
    quoted_values = []
    for column in sql:
        value = sql[column]
        if value is None or len(str(value)) == 0:
            value = sql[column] = 'NULL'
        quoted_values.append(_sql_quote(value))
    statement = db + ",".join(quoted_values) + ");"

    # Terminate each statement with a newline; the previous bare carriage
    # return left the file without real line breaks.
    with open(output_path, 'a', encoding='utf-8') as w:
        w.write(statement + '\n')






def windows(browser):
    """Scroll the page down, back up, and down again in 10-pixel steps.

    Offsets run from 0 up to (but not including) 10000 on the way down and
    from 10000 down to 10 on the way up.
    """
    downward = range(0, 10000, 10)
    upward = range(10000, 0, -10)
    passes = (
        (downward, windowBout),
        (upward, windowTop),
        (downward, windowBout),
    )
    for offsets, scroll in passes:
        for offset in offsets:
            scroll(browser, offset)
def windowBout(browser,i):
    """Run ``window.scrollTo(0, i)`` in ``browser`` via ``execute_script``."""
    browser.execute_script("window.scrollTo(0,{})".format(i))

def windowTop(browser, i):
    """Run ``window.scrollTo(0, i)`` in ``browser`` via ``execute_script``."""
    script = "window.scrollTo(0,{})".format(i)
    browser.execute_script(script)



if __name__ == '__main__':
    # Script entry point: download the category listing page, then pass its
    # HTML to the brand/product crawler.
    JY_HTML = getHTML("https://list.jd.com/list.html?cat=6728,6742,11849")
    getJXHTML(JY_HTML)

总结

本文使用Selenium获取了京东上所有品牌的机油数据。记得点赞、收藏,谢谢大家支持!

你可能感兴趣的:(Selenium)