Scraping Taobao search results with Python: log in to Taobao through a Weibo account, then scrape the data with Selenium (driven by chromedriver)

How to use

  1. Download the Chrome browser
  2. Check your Chrome version and download the chromedriver that matches it (or let a helper package do this for you; see the sketch after this list)
  3. Install the required packages with pip (the script below also needs pyquery): pip install selenium pyquery
  4. Log in to Weibo and link your Taobao account to it
  5. Fill the absolute path of the chromedriver into chromedriver_path
  6. Fill in your own Weibo username and password beforehand
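
Matching the driver version by hand can also be skipped by letting the third-party webdriver-manager package fetch a chromedriver that matches the installed Chrome. A minimal sketch of that alternative setup (webdriver-manager and Selenium 4's Service API are not part of the original tutorial; install with pip install webdriver-manager):

# Optional alternative: webdriver-manager downloads a matching chromedriver
# automatically, so no manual path configuration is needed.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# install() downloads the driver (if needed) and returns its local path
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
browser.get('https://www.taobao.com')
print(browser.title)
browser.quit()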

How to download the chromedriver driver: https://blog.csdn.net/muriyue6/article/details/101440353

Reference for linking a Taobao account through Weibo: https://blog.csdn.net/qq_45327272/article/details/99053395

Video tutorial (the scraping part): https://www.bilibili.com/video/BV1qJ411y71J?p=9

import csv
import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq

chromedriver_path = "C:/Users/mqm/Desktop/chromedriver.exe"  # change to your chromedriver path



class taobao_infos:
    # Object initialization
    def __init__(self):
        url = 'https://login.taobao.com/member/login.jhtml'
        self.url = url

        options = webdriver.ChromeOptions()

        options.add_experimental_option('excludeSwitches',
                                        ['enable-automation'])  # Important: drops the "controlled by automated test software" flag so sites are less likely to detect Selenium
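        # (Assumption, beyond the original script) Newer Chrome builds may also
        # need the following to fully hide the automation hint:
        # options.add_experimental_option('useAutomationExtension', False)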

        self.browser = webdriver.Chrome(executable_path=chromedriver_path, options=options)
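        # Note: Selenium 4 deprecates the executable_path argument (newer 4.x
        # releases remove it entirely). On Selenium 4, build the driver via a
        # Service object instead (an adaptation beyond the original script):
        #   from selenium.webdriver.chrome.service import Service
        #   self.browser = webdriver.Chrome(service=Service(chromedriver_path), options=options)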
        self.wait = WebDriverWait(self.browser, 10)  # 10-second timeout for explicit waits

        # Open the output file; the scraped Taobao data goes into a CSV
        # (note: `iphone` holds the search keyword and is set as a global in __main__)
        self.f = open('%s_data.csv' % iphone, 'w', newline='', encoding='utf8')  # newline='' avoids blank lines between rows on Windows
        # Create the CSV writer
        self.csv_write = csv.writer(self.f)
        # Write the header row first
        self.csv_write.writerow(['title', 'price', 'deal count', 'image url', 'shop', 'ships from'])

    # Log in to Taobao via Weibo
    def login(self):
        # Open the login page
        self.browser.get(self.url)

        # Wait for the password-login tab to appear, then switch to it
        password_login = self.wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#login > div.login-content.nc-outer-box > div > div.login-blocks.login-switch-tab > a.password-login-tab-item')))
        password_login.click()

        # Wait for the Weibo-login option to appear
        weibo_login = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.weibo-login')))
        weibo_login.click()

        # Wait for the Weibo username field to appear
        weibo_user = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.username > .W_input')))
        weibo_user.send_keys(weibo_username)

        # Wait for the Weibo password field to appear
        weibo_pwd = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.password > .W_input')))
        weibo_pwd.send_keys(weibo_password)

        # Wait for the login button to appear
        submit = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn_tip > a > span')))
        submit.click()

        # Login has only succeeded once the Taobao member nickname is visible
        taobao_name = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,
                                                                      'a.site-nav-login-info-nick ')))
        # Print the Taobao nickname
        print(taobao_name.text)

    # Search for the keyword and return the "total pages" text
    def search(self):
        try:
            self.browser.get('https://www.taobao.com')
            # Wait for the search box and the search button
            search_input = self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = self.wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))
            )
            search_input.send_keys(iphone)
            submit.click()
            # Wait for the total-pages label of the pager and return its text
            total = self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))
            )
            return total.text
        except TimeoutException:
            # On timeout, simply retry (note: recursion is unbounded if Taobao keeps timing out)
            return self.search()

    # Jump to a given results page
    def next_page(self, page_number):
        try:
            page_input = self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input"))
            )
            submit2 = self.wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
            )
            page_input.clear()
            page_input.send_keys(page_number)
            submit2.click()
            # Use the highlighted page-number tag to confirm the page actually changed
            self.wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))

            # Scrape the item details on this page
            self.get_products(page_number)
        except TimeoutException:
            self.next_page(page_number)

    # Scrape the item details on the current results page
    def get_products(self, page_number):
        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
        html = self.browser.page_source
        doc = pq(html)
        items = doc('#mainsrp-itemlist .items .item').items()

        for i in items:
            product = {
                'image': i.find('.pic .img').attr('src'),
                'price': i.find('.price').text(),
                'deal': i.find('.deal-cnt').text()[:-3],  # strip the trailing "人付款" suffix
                'title': i.find('.title').text(),
                'shop': i.find('.shop').text(),
                'location': i.find('.location').text()
            }

            # Write one row per item
            self.csv_write.writerow([product['title'], product['price'], product['deal'], product['image'], product['shop'], product['location']])
        print('Page %s data saved' % page_number)

    def close_csv(self):
        self.f.close()



if __name__ == '__main__':
    weibo_username = "*****"  # change to your Weibo username
    weibo_password = "*****"  # change to your Weibo password
    iphone = input('Enter the product to search for: ')  # search keyword (used as a global above)
    a = taobao_infos()
    a.login()  # log in
    total = a.search()
    total = int(re.compile(r'(\d+)').search(total).group(1))
    print(total)  # total number of result pages
    a.get_products(1)  # the search leaves us on page 1, so scrape it before paging onwards
    for i in range(2, total + 1):
        a.next_page(i)

    # Close the CSV file
    a.close_csv()
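
The crawl can abort halfway through (a timeout, a slider CAPTCHA, Ctrl-C), which would leave the CSV file open and the browser running. A minimal variant of the main block that always cleans up; the try/finally and browser.quit() are additions beyond the original script:

if __name__ == '__main__':
    weibo_username = "*****"
    weibo_password = "*****"
    iphone = input('Enter the product to search for: ')
    a = taobao_infos()
    try:
        a.login()
        total = int(re.compile(r'(\d+)').search(a.search()).group(1))
        a.get_products(1)
        for i in range(2, total + 1):
            a.next_page(i)
    finally:
        a.close_csv()      # close the CSV even if the crawl aborts early
        a.browser.quit()   # shut down the Chrome process as well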



To run the code directly you need to download the chromedriver driver in advance. 1. Configure the driver path in the code (mine sits on the desktop). 2. In the __main__ block, fill in your own Weibo username and password. (You must already have logged in to Taobao with that Weibo account at least once, so the two accounts are linked.)

The scraped data is saved in a CSV file.
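
To sanity-check the output, you can read the CSV straight back; a minimal sketch, assuming the <keyword>_data.csv naming used above ('手机_data.csv' is just an example filename):

import csv

# Print the header plus the first five data rows of the saved file
with open('手机_data.csv', encoding='utf8', newline='') as f:
    for row_number, row in enumerate(csv.reader(f)):
        print(row)
        if row_number >= 5:
            break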
