Python 使用 Selenium 爬取淘宝商品信息

       由于淘宝会识别自动化工具,直接在登录页面拖动滑块验证码时一直报错,所以采取了“曲线救国”的方式:通过微博账号来登录淘宝。本人刚自学《Python3网络爬虫开发实战》,下面的代码和书中的代码有一点点区别。废话不多说,直接上代码。

#coding=utf-8
"""
__author__ = zenghaisheng

"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib import quote
from bs4 import BeautifulSoup

# Placeholder settings: replace each value with your own before running.
KEYTWORD = "white something you want to search"
WEIBO_NAME = "white your weibo name"
WEIBO_PASSWOORD = 'white your weibo password'

# Shared Chrome session, plus an explicit wait that gives up after 10 seconds.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, timeout=10)

def _login_with_weibo(page_wait):
    """Sign in to Taobao through the Weibo login option on the login page."""
    # Switch from the default login view to the password login form.
    switch_link = page_wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'login-switch')))
    switch_link.click()
    # Jump to the Weibo login page.
    weibo_link = page_wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'weibo-login')))
    weibo_link.click()
    # The click navigates away, so wait for the form instead of looking it
    # up immediately.
    name_input = page_wait.until(
        EC.presence_of_element_located((By.NAME, 'username')))
    name_input.send_keys(WEIBO_NAME)
    password_input = browser.find_element(By.NAME, 'password')
    password_input.send_keys(WEIBO_PASSWOORD)
    submit = browser.find_element(By.CLASS_NAME, 'W_btn_g')
    submit.send_keys(Keys.ENTER)


def index_page(page):
    """Open the Taobao search results for KEYTWORD and scrape one page.

    If the login form is shown after loading the search URL, the function
    first signs in with WEIBO_NAME / WEIBO_PASSWOORD via the Weibo login
    option. For ``page`` greater than 1 it types the number into the pager's
    jump box and submits it.

    Args:
        page: Number of the result page to scrape.

    Returns:
        The iterable produced by get_goods_msg() (one dict per listing), or
        an empty list when any browser step fails, so the result can always
        be iterated.
    """
    print('正在爬取第{}页'.format(page))
    try:
        page_wait = WebDriverWait(browser, 10)
        browser.get("https://s.taobao.com/search?q=" + quote(KEYTWORD))

        # Only run the login flow when the login form is actually present;
        # find_elements returns an empty list instead of raising, so a
        # session that is already signed in goes straight to the results.
        if browser.find_elements(By.CLASS_NAME, 'login-switch'):
            _login_with_weibo(page_wait)

        if page > 1:
            input_page = page_wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-page .form > input')))
            submit_go_page = page_wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '.m-page .form .J_Submit')))
            input_page.clear()
            input_page.send_keys(str(page))
            submit_go_page.send_keys(Keys.ENTER)

        # Results are ready once at least one item is in the DOM.
        page_wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '.m-itemlist .items .item')))
        return get_goods_msg()
    except Exception as e:
        # Best effort: report the problem and return an empty result rather
        # than None, which would make the caller's loop raise TypeError.
        print(e)
        return []

def get_goods_msg():
    """Yield one dict per product listing found in the current page source.

    Each dict has the keys ``data_imgurl``, ``data_href``, ``data_title``,
    ``data_price`` and ``data_pay_peoples``. A listing that lacks any of the
    expected elements or attributes is skipped, so a single irregular entry
    does not abort the whole iteration.
    """
    soup = BeautifulSoup(browser.page_source, 'lxml')
    for item in soup.find_all(class_='J_MouserOnverReq'):
        # Search the Tag directly; re-parsing each item's HTML is unnecessary.
        try:
            # Product thumbnail URL (the page stores it scheme-relative).
            data_imgurl = 'https:' + item.find(class_='J_ItemPic img')["data-src"]
            # Product detail link.
            data_href = 'https:' + item.find(class_='pic-link')["data-href"]
            # Product title.
            data_title = item.find(class_='title').get_text().strip()
            # Product price.
            data_price = item.select('.ctx-box .price strong')[0].get_text()
            # Number of buyers, with the trailing label removed.
            data_pay_peoples = item.find(class_='deal-cnt').get_text().replace("人付款", '')
        except (AttributeError, TypeError, KeyError, IndexError):
            # find() returned None, an attribute was absent, or select()
            # matched nothing: this listing does not have the expected shape.
            continue
        yield dict(
            data_imgurl=data_imgurl,
            data_href=data_href,
            data_title=data_title,
            data_price=data_price,
            data_pay_peoples=data_pay_peoples,
        )

if __name__ == "__main__":
    # Number of the search-result page to scrape.
    page_num = 1
    try:
        # Fall back to an empty list so a failed scrape prints nothing
        # instead of raising TypeError when the result is not iterable.
        for good_msg in index_page(page_num) or []:
            print(good_msg)
    finally:
        # Always close the browser so the driver process is not left running.
        browser.quit()

 

你可能感兴趣的:(python)