python +Selenium 爬取淘宝商品评论

第一步

现在淘宝防爬取做的比较好,如果直接爬的话总是出现登陆界面。从而获取不到信息。
解决办法,新建 淘宝.py

import json
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
dirver = webdriver.Firefox(firefox_options=options)
dirver.get(
    'https://login.taobao.com/member/login.jhtml?redirectURL=http%3A%2F%2Fbuyertrade.taobao.com%2Ftrade%2Fitemlist%2Flist_bought_items.htm%3Fspm%3D875.7931836%252FB.a2226mz.4.66144265Vdg7d5%26t%3D20110530')
# 这里是为了等待手机扫码登录, 登录后回车即可
input("请回车登录")
dictCookies = dirver.get_cookies()
jsonCookies = json.dumps(dictCookies)
# 登录完成后,将cookies保存到本地文件
with open("cookies_tao.json", "w") as fp:
    fp.write(jsonCookies)

用来保存你登陆后的cookie。

第二步

用你保存下来的cookie实现登录,爬取商品的评论信息
新建爬取.py

import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import wait
from selenium.webdriver.support.wait import WebDriverWait
from twisted.conch.telnet import EC

options = Options()
#options.add_argument("--headless")
dirver = webdriver.Firefox(firefox_options=options)
# 初次建立连接, 随后方可修改cookie
dirver.get('http://www.taobao.com')
# 删除第一次登录是储存到本地的cookie
dirver.delete_all_cookies()
# 读取登录时储存到本地的cookie
with open("cookies_tao.json", "r", encoding="utf8") as fp:
    ListCookies = json.loads(fp.read())

for cookie in ListCookies:
    dirver.add_cookie({
        'domain': '.taobao.com',  # 此处xxx.com前,需要带点
        'name': cookie['name'],
        'value': cookie['value'],
        'path': '/',
        'expires': None
    })

# 再次访问页面,便可实现免登陆访问
dirver.get("http://www.taobao.com")
time.sleep(3)
# 将页面保存为图片用于查看是否登录成功
search = dirver.find_element_by_id('q')
search.send_keys("男装")
search.send_keys(Keys.ENTER)
time.sleep(3)
url1 = dirver.current_url
print(dirver.current_url)
dirver.get(url1)
time.sleep(2)
sort = dirver.find_element_by_link_text('销量')
sort.click()
shangpin = dirver.find_element_by_id('J_Itemlist_Pic_565221245456')
shangpin.click()
time.sleep(2)
sreach_windows = dirver.current_window_handle
all_handles = dirver.window_handles
for handle in all_handles:
    if handle != sreach_windows:
        dirver.switch_to.window(handle)
        time.sleep(2)
        ding = dirver.find_element_by_id('J_TabBarBox')
        dirver.execute_script("arguments[0].scrollIntoView()", ding)
        time.sleep(3)
        num = 0
        while num < 10:
            ck = dirver.find_element_by_xpath('//*[@id="J_TabBar"]/li[3]/a')
            ck.click()
            time.sleep(3)
            texts = dirver.find_elements_by_class_name('tm-col-master')
            print(texts)
            for each in texts:
                text = each.find_element_by_class_name('tm-rate-fulltxt')
                print(text.text)
            bbb = dirver.find_element_by_link_text('下一页>>')
            dirver.execute_script("arguments[0].scrollIntoView()", bbb)
            time.sleep(1)
            bbb.send_keys(Keys.ENTER)
            num = num + 1
dirver.quit()




结果如下:
python +Selenium 爬取淘宝商品评论_第1张图片

你可能感兴趣的:(python)