How to Use Selenium

1. XPath practice: saving images

import csv
import os

import requests
from lxml import etree


def get_net_data():
    url = 'https://www.umei.net/'
    response = requests.get(url)
    response.encoding = 'utf-8'
    # print(response.text)  # uncomment to inspect the raw HTML
    return response.text


def download_image(url: str):
    response = requests.get(url)
    if response.status_code == 200:
        image_name = url.split('/')[-1]
        os.makedirs('./images', exist_ok=True)  # make sure the target folder exists
        with open(f'./images/{image_name}', 'wb') as f:
            f.write(response.content)
        print(image_name, 'downloaded!')
        return f'images/{image_name}'


def analysis(data: str):
    html = etree.HTML(data)
    big_box = html.xpath('./body/div[@class="wrap"]')[0]
    image_li = big_box.xpath('.//ul/li')

    all_data = []
    for li in image_li:
        # title
        title = li.xpath('./a/@title')[0]
        # detail-page link
        url = 'https://www.umei.net/' + li.xpath('./a/@href')[0]
        # image address
        image_url = li.xpath('./a/img/@src')[0]

        # download the image
        local_url = download_image(image_url)

        all_data.append([title, url, image_url, local_url])

    return all_data


def save_data(data: list):
    os.makedirs('./files', exist_ok=True)  # make sure the output folder exists
    with open('files/美女图片.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '链接', '图片网络地址', '图片本地地址'])
        writer.writerows(data)


if __name__ == '__main__':
    data = analysis(get_net_data())
    save_data(data)
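
The selectors above lean on a few XPath idioms that are easy to misread. A minimal, self-contained sketch against inline HTML (lxml's HTML parser wraps fragments in html/body automatically), using the same patterns as the parser above:

from lxml import etree

html = etree.HTML('<div class="wrap"><ul><li><a href="/p/1" title="demo"><img src="a.jpg"></a></li></ul></div>')

box = html.xpath('./body/div[@class="wrap"]')[0]  # ./    - relative to the current node
for li in box.xpath('.//ul/li'):                  # .//   - any depth below the current node
    print(li.xpath('./a/@title')[0])              # @attr - read an attribute value
    print(li.xpath('./a/img/@src')[0])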

2. Selenium basics

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# 1. Create the browser object
# Note: if the browser object is a local variable, the browser closes automatically when the
# function returns; if it is a global variable, it must be closed manually.
b = webdriver.Chrome()

# 2. Open a page
b.get('https://www.jd.com')

# 3. Get the page content
# Note: page_source only contains what the browser has already loaded and rendered
print(b.page_source)

# 4. Find and operate on elements
# Note: Selenium 4 removed find_element_by_*; use find_element(By.CSS_SELECTOR, ...) instead
# 1) Input box: find it -> type text -> press Enter
# find the search box by its id
search_box = b.find_element(By.CSS_SELECTOR, '#key')

# type the search term into the box
search_box.send_keys('电脑')

# press Enter in the box
search_box.send_keys(Keys.ENTER)

time.sleep(1)
search_box2 = b.find_element(By.CSS_SELECTOR, '#key')
# clear the box first
search_box2.clear()
search_box2.send_keys('鼠标')

# 2) Button: find the search button
search_btn = b.find_element(By.CSS_SELECTOR, '.button.cw-icon')
# click it
search_btn.click()

# 5. History navigation: back and forward
time.sleep(1)
b.back()
time.sleep(1)
b.back()
time.sleep(1)
b.forward()
time.sleep(1)
b.forward()

# close the browser (required here because b is a global, per the note above)
# b.close()
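
The fixed time.sleep() calls above are brittle: too short and the script crashes, too long and it wastes time. A sketch of an explicit wait as a more robust alternative (same JD search box as above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

b = webdriver.Chrome()
b.get('https://www.jd.com')

# block for up to 10 seconds until the search box exists, then return it
search_box = WebDriverWait(b, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#key'))
)
search_box.send_keys('电脑')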

3. Selenium tabs

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

b = webdriver.Chrome()
b.get('https://www.jd.com')

# find the "秒杀" (flash sale) link and click it; JD opens it in a new tab
miaosha = b.find_element(By.CSS_SELECTOR, '#navitems-group1>li>a')
miaosha.click()

# list all window handles (one per tab)
print(b.window_handles)

time.sleep(2)

# switch back to the first tab; driver commands only target the current tab
b.switch_to.window(b.window_handles[0])
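
Clicking a link that opens a new tab does not move the driver to it; every command still targets the old tab until you switch. A minimal sketch of the usual new-tab routine, assuming a click has just opened a second tab:

# window_handles is ordered by creation, so [-1] is the newest tab
b.switch_to.window(b.window_handles[-1])
print(b.title)  # now reads from the new tab

# close the current tab, then return to the original one
b.close()
b.switch_to.window(b.window_handles[0])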

4. Getting page cookies with Selenium

import json
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

url = 'https://www.taobao.com'
b = webdriver.Chrome()
b.get(url)


search_box = b.find_element(By.CSS_SELECTOR, '#q')
search_box.send_keys('电脑')
search_box.send_keys(Keys.ENTER)

# Taobao redirects to its login page; give a human time to log in manually
time.sleep(30)

print('Manual login done!')
# after the manual login, grab the cookies and save them to a local file
cookies = b.get_cookies()
os.makedirs('./files', exist_ok=True)
with open('./files/taobao_cookies.txt', 'w', encoding='utf-8') as f:
    json.dump(cookies, f, ensure_ascii=False)  # JSON round-trips safely, unlike str()/eval()
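
get_cookies() returns a list of plain dicts. Per the WebDriver spec each one has at least name and value, usually plus domain, path, secure and httpOnly, and sometimes expiry; the exact set varies by site. A quick way to inspect what was saved:

for cookie in cookies:
    print(cookie['name'], '->', cookie.get('domain'), 'secure:', cookie.get('secure'))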

5. Using cookies with Selenium

import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

b = webdriver.Chrome()
# add_cookie only accepts cookies for the domain currently open, so visit the site first
b.get('https://www.taobao.com')

# load the saved cookies and add them all to the browser
with open('files/taobao_cookies.txt', encoding='utf-8') as f:
    cookies = json.load(f)
for cookie in cookies:
    b.add_cookie(cookie)
time.sleep(1)

# re-open the page so it picks up the cookies
b.get('https://www.taobao.com')
# b.refresh()
search_box = b.find_element(By.CSS_SELECTOR, '#q')
search_box.send_keys('电脑')
search_box.send_keys(Keys.ENTER)
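
Sections 4 and 5 fold naturally into a reusable pair of helpers; a sketch (the path argument is arbitrary, and JSON keeps the round-trip safe):

import json

def save_cookies(browser, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(browser.get_cookies(), f, ensure_ascii=False)

def load_cookies(browser, path):
    # the browser must already be on the cookies' domain
    with open(path, encoding='utf-8') as f:
        for cookie in json.load(f):
            browser.add_cookie(cookie)
    browser.refresh()  # reload so the session takes effect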

6. Page scrolling

import csv
import os
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


def scrolling_web():
    # scroll down step by step so lazy-loaded results actually render
    max_height = 10000
    # current scroll position
    y = 0
    while True:
        y += 500
        b.execute_script(f'window.scrollTo(0, {y})')
        if y > max_height:
            break
        time.sleep(1)


def get_net_data(url):
    global b
    b = webdriver.Chrome()
    b.get(url)
    search_box = b.find_element(By.CSS_SELECTOR, '#key')
    search_box.send_keys('电脑')
    search_box.send_keys(Keys.ENTER)
    time.sleep(1)
    scrolling_web()
    return b.page_source


def analysis_data(data: str):
    html = etree.HTML(data)
    # absolute positional XPath copied from the browser; it breaks if JD changes its layout
    all_lis = html.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/div/div[2]/ul/li')
    all_data = []
    for x in all_lis:
        # JD highlights the search keyword in a nested tag, so text() returns the
        # title in pieces; rejoin them with the keyword itself
        name = '电脑'.join(x.xpath('./div/div[3]/a/em/text()')).replace('\n', '').replace('\t', '')
        url = 'https:' + x.xpath('./div/div[3]/a/@href')[0]
        image = 'https:' + x.xpath('./div/div[1]/a/img/@src')[0]
        # integer and decimal parts of the price live in separate tags
        price = x.xpath('./div/div[2]/strong/em/text()')[0] + x.xpath('./div/div[2]/strong/i/text()')[0]
        comments_num = x.xpath('./div/div[4]/strong/a/text()')[0]
        shop_name = x.xpath('./div/div[5]/span/a/text()')[0]
        shop_url = 'https:' + x.xpath('./div/div[5]/span/a/@href')[0]
        tag_1 = x.xpath('./div/div[6]//i/text()')
        tag = '/'.join(tag_1) if tag_1 else '无标签'

        all_data.append([name, url, image, price, comments_num, shop_name, shop_url, tag])
    time.sleep(1)
    return all_data


def page_turning():
    # click the "next page" button, give the new page a moment, then scroll it
    next_btn = b.find_element(By.CSS_SELECTOR, '.pn-next')
    next_btn.click()
    time.sleep(1)
    scrolling_web()
    return b.page_source


def save_data(data: list):
    os.makedirs('./files', exist_ok=True)
    # utf-8-sig writes a BOM so Excel displays the Chinese headers correctly
    with open('./files/京东电脑.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '链接', '图片', '价格', '评论数', '店铺名字', '店铺链接', '标签'])
        writer.writerows(data)


if __name__ == '__main__':
    url = 'https://www.jd.com'
    data = get_net_data(url)
    all_data = analysis_data(data)
    # fetch and parse 9 more result pages
    for x in range(9):
        all_data += analysis_data(page_turning())
    save_data(all_data)
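
The hard-coded max_height = 10000 in scrolling_web is a guess: if the result list is longer, the bottom never loads; if shorter, the loop wastes time. A common alternative is to scroll until the document stops growing, sketched here:

def scroll_to_bottom(browser, pause=1.0):
    # keep jumping to the bottom until the page height stops increasing
    last_height = browser.execute_script('return document.body.scrollHeight')
    while True:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)  # give lazy-loaded content time to render
        new_height = browser.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break
        last_height = new_height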
