由于淘宝能够识别自动化工具,直接进入登录页面后滑动验证码一直会报错,所以采取了曲线救国的方式:通过微博账号来登录淘宝。我刚自学完《Python3网络爬虫开发实战》,代码和书里的略有区别。废话不多说,直接上代码。
#coding=utf-8
"""
__author__ = zenghaisheng
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib import quote
from bs4 import BeautifulSoup
browser = webdriver.Chrome()
wait = WebDriverWait(browser,10)
KEYTWORD = "white something you want to search"
WEIBO_NAME = "white your weibo name"
WEIBO_PASSWOORD = 'white your weibo password'
def index_page(page):
print('正在爬取第{}页'.format(page))
try:
url = "https://s.taobao.com/search?q=" +quote(KEYTWORD)
browser.get(url)
#点击切换密码登陆
a_element = browser.find_element_by_class_name('login-switch')
a_element.click()
#跳转到微博登陆页面
weibo_login = browser.find_element_by_class_name('weibo-login')
weibo_login.click()
name_input = browser.find_element(By.NAME,'username')
name_input.send_keys(WEIBO_NAME)
password_input = browser.find_element(By.NAME,'password')
password_input.send_keys(WEIBO_PASSWOORD)
submit = browser.find_element_by_class_name('W_btn_g')
submit.send_keys(Keys.ENTER)
#登陆成功,跳转回淘宝
wait = WebDriverWait(browser,10)
if page > 1:
input_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-page .form > input')))
sumbit_go_page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'.m-page .form .J_Submit')))
input_page.clear()
input_page.send_keys(page)
sumbit_go_page.send_keys(Keys.ENTER)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-itemlist .items .item')))
goods_msgs = get_goods_msg()
return goods_msgs
except Exception as e:
print(e)
def get_goods_msg():
html = browser.page_source
soup = BeautifulSoup(html,'lxml')
goods_list = soup.find_all(class_='J_MouserOnverReq')
for i in goods_list:
i_soup = BeautifulSoup(str(i),'lxml')
#商品显示图链接
data_imgurl = 'https:'+i_soup.find(class_='J_ItemPic img')["data-src"]
#商品链接
data_href = 'https:'+i_soup.find(class_='pic-link')["data-href"]
#商品标题
data_title = i_soup.find(class_='title').get_text().strip()
#商品价格
data_price = i_soup.select('.ctx-box .price strong')[0].get_text()
#多少人付款
data_pay_peoples = i_soup.find(class_='deal-cnt').get_text().replace("人付款",'')
yield dict(
data_imgurl = data_imgurl,
data_href = data_href,
data_title = data_title,
data_price = data_price,
data_pay_peoples = data_pay_peoples,
)
if __name__ == "__main__":
#填写你想搜索的第几页数
page_num = 1
goods_msgs = index_page(page_num)
for good_msg in goods_msgs:
print(good_msg)