Spider07 Selenium Advanced: Login Anti-Crawling

1. Switching Tabs

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup

1.1 Create a browser object

b = Chrome()

1.2 Open the page

b.get('https://www.cnki.net/')

1.3 Search for papers

search = b.find_element_by_id('txt_SearchText')
search.send_keys('数据分析')
search.send_keys(Keys.ENTER)

1.4 Get the element for each search result

time.sleep(1)
a_list = b.find_elements_by_css_selector('table.result-table-list td.name>a')
a_list[0].click()       # click the first search result
time.sleep(1)

1.5 Switch to the second tab and get the detail page data

browser.window_handles - returns a list of handles for all currently open tabs (windows).

# switch to the newly opened tab (the second window handle)
b.switch_to.window(b.window_handles[1])
# print(b.page_source)

# get the abstract
soup = BeautifulSoup(b.page_source, 'lxml')
print(soup.select_one('#ChDivSummary').text)
b.close()

1.6 Go back to the first tab

b.switch_to.window(b.window_handles[0])
a_list[1].click()
time.sleep(1)

# get the abstract from the newly opened detail tab
b.switch_to.window(b.window_handles[-1])
soup = BeautifulSoup(b.page_source, 'lxml')
print(soup.select_one('#ChDivSummary').text)
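
If more results are going to be clicked, the detail tab should be closed before going back, otherwise the window handles pile up; the exercise in 1.7 below does exactly this. A minimal sketch:

# close the current detail tab and return focus to the result list
b.close()
b.switch_to.window(b.window_handles[0])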

1.7 CNKI crawler exercise

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import csv
import os
import re


# scrape every search result on the current page
def get_one_page():
    # get the element for each search result
    a_list = b.find_elements_by_css_selector('table.result-table-list td.name>a')
    # print(a_list, len(a_list))

    all_data = []
    for x in a_list:
        x.click()
        time.sleep(1)

        # 5. switch to the newly opened detail tab
        # browser.window_handles - list of handles for all open tabs
        b.switch_to.window(b.window_handles[-1])
        # print(b.page_source)
        soup = BeautifulSoup(b.page_source, 'lxml')

        # paper title
        title = soup.select_one('div.container div.wx-tit>h1').text

        # abstract (missing on some detail pages)
        if soup.select_one('#ChDivSummary'):
            abstract = soup.select_one('#ChDivSummary').text
        else:
            abstract = None

        # keywords
        if soup.select_one('div.container div.row>p.keywords>a'):
            keyword = soup.select_one('div.container div.row>p.keywords>a').text
            keyword = re.sub(r'\s', '', keyword)
        else:
            keyword = None

        all_data.append([title, abstract, keyword])
        b.close()

        # 6. go back to the first tab (the result list)
        b.switch_to.window(b.window_handles[0])

    writer.writerows(all_data)


if __name__ == '__main__':
    # check whether the CSV already exists before opening it in append mode
    # (open(..., 'a') creates the file, so the check has to come first or the header is never written)
    is_exists = os.path.exists('files/知网数据分析.csv')
    writer = csv.writer(open('files/知网数据分析.csv', 'a', encoding='utf-8', newline=''))
    if not is_exists:
        writer.writerow(['论文题目', '摘要', '关键词'])

    # 1. create the browser object
    b = Chrome()

    # 2. open the page
    b.get('https://www.cnki.net/')

    # 3. search for papers
    search = b.find_element_by_id('txt_SearchText')
    search.send_keys('数据分析')
    search.send_keys(Keys.ENTER)
    time.sleep(1)

    for _ in range(5):
        # scrape all results on the current page
        get_one_page()

        # click the next-page button
        next_btn = b.find_element_by_id('PageNext')
        next_btn.click()
        time.sleep(3)

2. Login Anti-Crawling with requests

Log in manually in a browser first, copy the cookie value of a logged-in request from DevTools, and send it with the request headers so the site treats the session as logged in.

import requests

# send the logged-in cookie so the request is treated as an authenticated session
headers = {
    'cookie': r'_zap=5a0255c0-2631-4533-b00b-11fa5c5bce29; d_c0="AKBdbvd55xSPTialrdtjTCxWDdfhwIFmbfk=|1651911504"; _9755xjdesxxd_=32; YD00517437729195%3AWM_TID=5LN74%2F0gFI9BRFRQUFPFBJLixsJlWxHn; _xsrf=sif67kGuztvrCG4lqumCAqog1ej8lC5C; captcha_session_v2=2|1:0|10:1652757335|18:captcha_session_v2|88:amRCVGh4eTN2SFFiVjBNRlNSRGFsTm85bmcrUGJSZGhSNWFma3UxaW1QNlhsNnBpa25MbG95R0xDcWlmeFNHTw==|6e620fa69489b60d1f5b83ba64de2a88020b0e6c36f14d63fc3de65b8ba7c043; __snaker__id=OZgoo8Q643crcGaQ; gdxidpyhxdE=gpAsrdc76JoW%5CAH7%5C6syS%2BfZKXeCBC%2BH7Nqg%5CyaPrKcSwI2S0eOIk5lt16d289pu7tCKzCmc8At4CS7weyBn4x2M7CwQK5iJWgLGd3YiaT2Q4uYixWJNPxu%5CyUSs8sQf%2BEndH5XB11GNqnBk5oc%2FMi6vE%2B3DL5%2BjD%2BacGGz3LdvvTm3%2B%3A1652758239269; YD00517437729195%3AWM_NI=JBeJoQFDBsMmmlDvVefYIBrGMcoPXstpFzybXyJDonY0qrqEobTBHr8Wkw0h9%2Fmt2hpMzvGhDHcxe%2FUTVTLxLZKpghvXo87QB6kp%2FQNVot6vXe1Pf3oReu0DieEX7%2FBMTEU%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eeace23ea38a9d8ac560ae9e8ba6c54b929a8eb0d84b90938eb3c17f88f5bab3cd2af0fea7c3b92a8a9eababee5b869c8e94d067afb48c97d56fa68787a3f14d8fbda1b1db79b8b08a9bfc6a9ca786a5ee669bf0bf93f44fadbe9ca3cb7afc89a598b364fbf1a395b164b493b697c6648898ad99e825978c9ab5cf7490868591e664b688a1d9b65b82eaa591ae5b8e8aa195b57e9cbbffb1e539ad8eab96b350abbb8bb9b62582939ca6d837e2a3; z_c0="2|1:0|10:1652757841|4:z_c0|92:Mi4xOU04TUJBQUFBQUFBb0YxdTkzbm5GQ2NBQUFDRUFsVk5VYUtxWWdEcVNGX3BnTUlRVFFfcE5yMTNfeFZvYTFzSVZR|ce07c55884feb88c17e80bf9fb87f3bb3e087fba8fbb29a31d8e776712a5ddef"; q_c1=f2c0c46262fb43bca1ec55e7f5dc8a0d|1652757842000|1652757842000; NOT_UNREGISTER_WAITING=1; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1651911507,1652757336,1652767106,1652767236; SESSIONID=1CZ5prmJnH5MToLzVcvlNnGJxyvK9NCOKGjeLtVpNFf; JOID=Wl8VB0itC59FIacKIKBPBzrZ0uk8k0_Bd1raYG7ITK0McsNJXcMyHyslrwkjITdag_dBW4OaY8bgAwp76dr3Uac=; osd=UFAWBE6nBJxGJ60FI6NJDTXa0e82nEzCcVDVY23ORqIPccVDUsAxGSEqrAolKzhZgPFLVICZZczvAAl949X0UqE=; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1652767343; KLBRSID=ed2ad9934af8a1f80db52dcb08d13344|1652767469|1652767234',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
        }

res = requests.get('https://www.zhihu.com/', headers=headers)
print(res.text)
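
To confirm the cookie actually produced a logged-in page, the response can be dumped to a file and opened in a browser; a small sketch (the output path below is arbitrary):

# write the raw HTML out so the result of the cookie login can be inspected by eye
with open('files/zhihu.html', 'w', encoding='utf-8') as f:
    f.write(res.text)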

3. Login Anti-Crawling with Selenium: Saving the Cookies

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys

3.1 Create a browser and open Taobao

b = Chrome()
b.get('https://www.taobao.com')

3.2 Give yourself enough time to log in manually

input('完成登录:')

3.3 After logging in, get the cookies and save them locally

cookies = b.get_cookies()
with open('files/淘宝.txt', 'w', encoding='utf-8', newline='') as f:
    f.write(str(cookies))
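
str() here (read back with eval() in section 4.2) works, but JSON is a safer round-trip for a list of dicts; a minimal sketch of the same save step:

import json

# save the cookie list as JSON instead of a Python repr string
cookies = b.get_cookies()
with open('files/淘宝.txt', 'w', encoding='utf-8') as f:
    json.dump(cookies, f, ensure_ascii=False)
# it can then be read back in section 4.2 with json.load() instead of eval()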

3.4 Enter a product name and search

search = b.find_element_by_id('q')
search.send_keys('运动鞋')
search.send_keys(Keys.ENTER)
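
The saved cookies can also feed the plain requests approach from section 2: join them into a single cookie string and put it in the headers. A minimal sketch (assuming the file was written with str() as above):

import requests

cookie_list = eval(open('files/淘宝.txt', encoding='utf-8').read())
# requests expects one "name=value; name=value; ..." string in the cookie header
cookie_str = '; '.join(f"{c['name']}={c['value']}" for c in cookie_list)
headers = {
    'cookie': cookie_str,
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'
}
res = requests.get('https://www.taobao.com', headers=headers)
print(res.status_code)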

4. Login Anti-Crawling with Selenium: Reusing the Cookies

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import csv
import os
import re

4.1 Create a browser and open the site

b = Chrome()
b.get('https://www.taobao.com')

4.2 Add the cookies to log in automatically

cookie_list = eval(open('files/淘宝.txt', encoding='utf-8').read())
for cookie in cookie_list:
    if cookie.get('secure'):
        b.add_cookie(cookie)

4.3 Reload the page (the added cookies only take effect on the next page load)

time.sleep(1)
b.get('https://www.taobao.com')

4.4 Scrape the data and write it to a file

# remaining steps after login
time.sleep(1)
search = b.find_element_by_id('q')
search.send_keys('运动鞋')
search.send_keys(Keys.ENTER)

time.sleep(1)
soup = BeautifulSoup(b.page_source, 'lxml')
goods_list = soup.select('div.grid.g-clearfix div.items>div')

# print(goods_list, len(goods_list))
writer = csv.writer(open('files/淘宝运动鞋.csv', 'w', encoding='utf-8', newline=''))
# if not os.path.exists('files/淘宝运动鞋.csv'):
writer.writerow(['商品名称', '价格', '店铺名称', '商品链接', '店铺链接', '付款人数'])

all_data = []
for goods in goods_list:
    # product name
    name_goods = goods.select_one('div.row.row-2.title>a').text
    name_goods = re.sub(r'\s', '', name_goods)
    # print(name_goods)
    # price
    price = goods.select_one('div.price.g_price.g_price-highlight>span').text + goods.select_one('div.price.g_price.g_price-highlight>strong').text
    # shop name
    name_shop = goods.select_one('div.shop > a > span:nth-child(2)').text
    # product link
    href_goods = goods.select_one('div.ctx-box.J_MouseEneterLeave.J_IconMoreNew div.row.row-2.title>a').attrs['href']
    if 'https:' not in href_goods:
        href_goods = 'https:' + href_goods

    # shop link
    href_shop = goods.select_one('div.shop>a').attrs['href']
    if 'https:' not in href_shop:
        href_shop = 'https:' + href_shop

    # number of payments (buyers)
    payment = goods.select_one('div.ctx-box.J_MouseEneterLeave.J_IconMoreNew div.deal-cnt').text

    print(name_goods, price, name_shop, href_goods, href_shop, payment)
    all_data.append([name_goods, price, name_shop, href_goods, href_shop, payment])

writer.writerows(all_data)

5. Waits

5.1 Implicit waits

1) Set the implicit wait time: browser.implicitly_wait(timeout).

2) Once the wait is set, whenever the browser object looks up an element that is not in the page yet, the program does not fail immediately; it keeps retrying within the timeout until the element is found or the time runs out.

3) Note: the implicit wait only needs to be set once per browser object; it then applies to every element lookup (see the sketch below).
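
A minimal standalone sketch of just the implicit wait (the combined example at the end of this section sets it the same way):

from selenium.webdriver import Chrome

b = Chrome()
# one call per browser object; it applies to every later element lookup
b.implicitly_wait(5)
b.get('https://www.jd.com')
# if the element is not in the DOM yet, Selenium retries for up to 5 seconds before raising
search = b.find_element_by_id('key')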

5.2 Explicit waits

Block until a given condition becomes true (or false) before the program continues.

5.2.1 Usage steps

1) Create a wait object: WebDriverWait(browser, timeout)

2) Add a wait condition: wait.until(condition) or wait.until_not(condition)

5.2.2 Common conditions

1) text_to_be_present_in_element_value(locator, text): the value attribute of the specified element contains the given text

2) presence_of_element_located(locator): the specified element is present in the DOM

3) element_to_be_clickable(locator): the specified element is clickable

4) text_to_be_present_in_element(locator, text): the text content of the specified element contains the given value

Note: the locator is a (By.XXX, value) tuple, e.g. (By.ID, 'key') or (By.CLASS_NAME, 'user_login').

# create a browser and open the page
b = Chrome()
b.get('https://www.jd.com')

# 1) set the implicit wait time: browser.implicitly_wait(timeout)
b.implicitly_wait(5)

search = b.find_element_by_id('key')

# 2) explicit wait
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


# a. create the wait object
wait = WebDriverWait(b, 20)
print('==========1==========')

# b. add the wait condition
# wait.until(EC.text_to_be_present_in_element_value((By.ID, 'key'), '鞋子'))
wait.until_not(EC.presence_of_element_located((By.CLASS_NAME, 'user_login')))

print('==========2==========')
b.refresh()    # reload the page
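
The condition list above also mentions element_to_be_clickable, which the example does not use; a minimal sketch (the CSS selector for the search button is an assumption, not verified):

# wait until the (assumed) search button is clickable, then click it
# NOTE: 'button.button' is a placeholder selector, check the real page
btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.button')))
btn.click()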

6. Finding the Data API

Many pages load their data through a separate request: open the browser's developer tools, watch the Network panel, and look for the request that actually returns the data. For the LOL hero list it is a static .js endpoint that returns JSON.

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
        }

res = requests.get('https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js', headers=headers)
result = res.json()
# print(result)
for x in result['hero']:
    print(x['name'], x['title'])
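
The same hero_list.js response most likely also carries each hero's numeric id, which is what the per-hero endpoint in section 7 needs; a minimal sketch (the field name heroId is an assumption, check the actual response):

for x in result['hero']:
    # 'heroId' is assumed to be the id field in hero_list.js; .get() keeps this safe if it is absent
    print(x.get('heroId'), x['name'], x['title'])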

7. Homework: League of Legends Skin Crawler

import requests
import os
import json
from tqdm import tqdm
from bs4 import BeautifulSoup
import time
from selenium.webdriver import Chrome


# Get every hero's skin images and save them into a folder named after the hero,
# naming each image after the skin.
# First find each hero's detail data API; the URL pattern is:
# https://game.gtimg.cn/images/lol/act/img/js/hero/1.js
# https://game.gtimg.cn/images/lol/act/img/js/hero/2.js


def one_hero(id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    res = requests.get(f'https://game.gtimg.cn/images/lol/act/img/js/hero/{id}.js', headers=headers)
    result = res.json()
    # hero name
    hero_name = result['hero']['name']
    print(hero_name)
    # create a folder named after the hero (exist_ok avoids an error if it already exists)
    os.makedirs(f'files/英雄联盟皮肤/{hero_name}', exist_ok=True)

    for x in result['skins']:
        # skin name
        skins_name = x['name']
        skins_name = skins_name.replace('/', '')
        # skin image URL
        img_url = x.get('loadingImg')
        print(skins_name, img_url)
        # download the image and save it to a file
        if img_url:
            res_img = requests.get(img_url)
            # print(res_img.content)
            # res_img.encoding = 'utf-8'
            with open(f'files/英雄联盟皮肤/{hero_name}/{skins_name}.jpg', 'wb') as f:
                f.write(res_img.content)


if __name__ == '__main__':
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    # }
    # # get the number of heroes and iterate over each one
    # res = requests.get('https://lol.qq.com/data/info-heros.shtml', headers=headers)
    # time.sleep(1)
    # res.encoding = 'gbk'
    # soup = BeautifulSoup(res.text, 'lxml')
    # print(soup)

    b = Chrome()
    b.get('https://lol.qq.com/data/info-heros.shtml')
    time.sleep(1)
    soup = BeautifulSoup(b.page_source, 'lxml')

    # extract each hero's id from its link
    hero_li = soup.select('#jSearchHeroDiv>li')
    # print(hero_li)
    for hero in hero_li:
        if hero.select_one('li>a'):
            id = hero.select_one('li>a').attrs['href'].split('info-defail.shtml?id=')[-1]
            # print(id)
            one_hero(id)
