Scraping Taobao product listings with Python

Taobao is a well-known e-commerce platform, and since I had just been learning Selenium, I decided to try scraping Taobao product listings with it.
Because the scraping is done with a browser-automation tool, there is no separate page-analysis step.
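
One note before the code: it is written against the Selenium 3 API (find_element_by_xpath, passing the chromedriver path straight to webdriver.Chrome). Those calls were removed in Selenium 4; purely as a sketch of the equivalent Selenium 4 syntax (not part of the original script), the setup and one lookup would look like this:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service(r'C:\Users\IT\Desktop\chromedriver.exe'))
driver.find_element(By.XPATH, '//*[@id="q"]').send_keys('keyword')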

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
import conn
import time
import csv
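# 'conn' is a small local module that just holds the Taobao login credentials used
# below. A hypothetical minimal version (names taken from conn.TaoUser / conn.TaoPwd):
#     TaoUser = 'your_taobao_account'
#     TaoPwd = 'your_taobao_password'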

def login(name):
    '''Handle login and the slider verification.'''
    driver.get('https://www.taobao.com/')
    driver.maximize_window()
    driver.implicitly_wait(10)
    driver.find_element_by_xpath('//*[@id="q"]').send_keys(name)
    driver.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
    driver.find_element_by_xpath('//*[@id="fm-login-id"]').send_keys(conn.TaoUser)
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="fm-login-password"]').send_keys(conn.TaoPwd)
    time.sleep(1)
    action = ActionChains(driver)
    yzm = driver.find_element_by_xpath('//*[@id="nc_1_n1z"]')
    time.sleep(1)
    # Click and hold the slider, drag it 258px to the right, pause briefly, then release.
    # release() has to be chained before perform(), otherwise the slider is never let go.
    action.click_and_hold(yzm).move_by_offset(xoffset=258, yoffset=0).pause(0.8).release().perform()
    driver.find_element_by_xpath('//*[@id="login-form"]/div[4]/button').click()
    driver.implicitly_wait(10)
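
# A possible refinement (not in the original script): replace the fixed time.sleep()
# calls above with explicit waits, so the script only blocks until the element is ready.
# Sketch, reusing the same slider XPath:
#     from selenium.webdriver.common.by import By
#     from selenium.webdriver.support.ui import WebDriverWait
#     from selenium.webdriver.support import expected_conditions as EC
#     yzm = WebDriverWait(driver, 10).until(
#         EC.element_to_be_clickable((By.XPATH, '//*[@id="nc_1_n1z"]')))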


def get_info():
    '''Scrape one results page and append the product info to a CSV file.'''
    # Note: use find_elements (plural) here so the result is an iterable list of matches.
    divs = driver.find_elements_by_xpath('//div[@class="items"]/div[@class="item J_MouserOnverReq  "]')
    for div in divs:
        shop_name = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').text
        store = div.find_element_by_xpath('.//div[@class="shop"]/a').text
        ship_area = div.find_element_by_xpath('.//div[@class="location"]').text
        price = div.find_element_by_xpath('.//div[@class="price g_price g_price-highlight"]/strong').text
        pay_num = div.find_element_by_xpath('.//div[@class="deal-cnt"]').text
        shop_url = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').get_attribute('href')
        #print(shop_name, store, ship_area, price, pay_num, sep="|")
        with open(r'seleniumDemo\shopinfo.csv', mode='a', newline="", encoding='utf-8') as f:
            csvwrite = csv.writer(f, delimiter=',')
            csvwrite.writerow([shop_name, store, ship_area, price, pay_num, shop_url])
    print('File saved.')
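
# Note: get_info() above reopens the CSV once per product. A leaner variant (just a
# sketch) would open the file once per page, do the same field extraction, and write
# every row inside a single with-block:
#     with open(r'seleniumDemo\shopinfo.csv', mode='a', newline="", encoding='utf-8') as f:
#         csvwrite = csv.writer(f)
#         for div in divs:
#             ...  # same field extraction as above
#             csvwrite.writerow([shop_name, store, ship_area, price, pay_num, shop_url])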

def turn_page():
    '''Crawl the results page by page.'''
    # The pager text looks like "共 100 页," (100 pages in total); splitting on spaces yields the number.
    all_page = driver.find_element_by_xpath('//div[@class="total"]').text.split(' ')[1]
    page = 1
    try:
        while page <= int(all_page):
            print(f'========= Crawling page {page} =========')
            driver.implicitly_wait(10)
            get_info()
            page += 1
            # Click the "next page" button
            driver.find_element_by_xpath('//a[@class="J_Ajax num icon-tag"]').click()
            time.sleep(5)
            
    except Exception as e:
        print(e)
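
# If clicking the "next page" link proves flaky, one common workaround (not in the
# original code) is to scroll it into view before clicking:
#     next_btn = driver.find_element_by_xpath('//a[@class="J_Ajax num icon-tag"]')
#     driver.execute_script('arguments[0].scrollIntoView();', next_btn)
#     next_btn.click()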

'''
def get_info():
    'Variant that writes the CSV with a header row; to use it, move the file opening
    and header writing into the main entry point.'
    # Note: use find_elements (plural) here so the result is an iterable list of matches.
    divs = driver.find_elements_by_xpath('//div[@class="items"]/div[@class="item J_MouserOnverReq  "]')
    with open(r'seleniumDemo\shopinfo.csv', mode='w') as f:
        f.write(f"{'商品名称'},{'价格'},{'付款人数'},{'店铺名称'},{'发货地址'},{'链接'}\n")
        for div in divs:
            shop_name = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').text
            store = div.find_element_by_xpath('.//div[@class="shop"]/a').text
            ship_area = div.find_element_by_xpath('.//div[@class="location"]').text
            price = div.find_element_by_xpath('.//div[@class="price g_price g_price-highlight"]/strong').text
            pay_num = div.find_element_by_xpath('.//div[@class="deal-cnt"]').text
            shop_url = div.find_element_by_xpath('.//div[@class="row row-2 title"]/a').get_attribute('href')
            #print(shop_name, store, ship_area, price, pay_num, sep="|")
            f.write(f"{shop_name},{price},{pay_num},{store},{ship_area},{shop_url}\n")
    print('File saved.')
'''

if __name__ == '__main__':
    '''Main entry point. Scraping with Selenium does feel quite slow, though.'''
    choice = input('Enter the product you want to scrape: ')
    driver = webdriver.Chrome(r'C:\Users\IT\Desktop\chromedriver.exe')
    login(choice)
    turn_page()
    print('All products scraped.')
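
A practical note on running this: the chromedriver binary passed to webdriver.Chrome has to match the major version of the locally installed Chrome, otherwise the driver will refuse to start. Taobao also actively detects automated browsers, so the slider step may still fail even when the drag offset is correct; that is a limitation of this simple drag-and-release approach.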
