一、XPath practice: saving images
import csv

import requests
from lxml import etree


def get_net_data():
    # Fetch the homepage HTML as text
    url = 'https://www.umei.net/'
    response = requests.get(url)
    response.encoding = 'utf-8'
    # print(response.text)  # debug: inspect the raw HTML if the XPaths stop matching
    return response.text


def download_image(url: str):
    # Download one image into ./images (the folder must already exist, see the note below)
    response = requests.get(url)
    if response.status_code == 200:
        image_name = url.split('/')[-1]
        with open(f'./images/{image_name}', 'wb') as f:
            f.write(response.content)
        print(image_name, 'downloaded successfully!')
        return f'images/{image_name}'
    return ''  # download failed; record an empty local path


def analysis(data: str):
    # Parse the page and collect title, detail URL, image URL and local path per item
    html = etree.HTML(data)
    big_box = html.xpath('./body/div[@class="wrap"]')[0]
    image_li = big_box.xpath('.//ul/li')
    all_data = []
    for li in image_li:
        title = li.xpath('./a/@title')[0]
        url = 'https://www.umei.net/' + li.xpath('./a/@href')[0]
        image_url = li.xpath('./a/img/@src')[0]
        local_url = download_image(image_url)
        all_data.append([title, url, image_url, local_url])
    return all_data


def save_data(data: list):
    with open('files/美女图片.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '链接', '图片网络地址', '图片本地地址'])
        writer.writerows(data)


if __name__ == '__main__':
    data = analysis(get_net_data())
    save_data(data)
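Note that download_image writes into an ./images folder (and save_data into ./files) that the script never creates; if either is missing, open() raises FileNotFoundError. A minimal guard, assuming you want the script to create its own output folders, uses only the standard library:

import os

os.makedirs('./images', exist_ok=True)  # no-op when the folder already exists
os.makedirs('./files', exist_ok=True)   # same for the CSV output folder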
二、Basic Selenium methods
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

b = webdriver.Chrome()
b.get('https://www.jd.com')
print(b.page_source)

# find_element_by_* helpers were removed in Selenium 4; use find_element(By..., ...)
# Type a keyword into the search box and submit with the Enter key
search_input = b.find_element(By.CSS_SELECTOR, '#key')
search_input.send_keys('电脑')
search_input.send_keys(Keys.ENTER)
time.sleep(1)

# After navigation the old element reference is stale, so find the box again,
# clear it, and this time search by clicking the button instead of pressing Enter
search_input2 = b.find_element(By.CSS_SELECTOR, '#key')
search_input2.clear()
search_input2.send_keys('鼠标')
search_btn = b.find_element(By.CSS_SELECTOR, '.button.cw-icon')
search_btn.click()
time.sleep(1)

# Walk backwards and forwards through the browser history
b.back()
time.sleep(1)
b.back()
time.sleep(1)
b.forward()
time.sleep(1)
b.forward()
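The time.sleep(1) calls are blind pauses: too short and the element is not there yet, too long and the script just wastes time. Selenium's explicit waits (WebDriverWait plus expected_conditions) block only until a condition actually holds; a sketch, where the 10-second timeout is an arbitrary choice:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(b, 10)  # poll for up to 10 seconds
box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#key')))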
三、Selenium tabs
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

b = webdriver.Chrome()
b.get('https://www.jd.com')

# Clicking this nav entry opens the target page in a new tab
miaosha = b.find_element(By.CSS_SELECTOR, '#navitems-group1>li>a')
miaosha.click()

# window_handles lists one handle per open tab, in the order they were opened
print(b.window_handles)
time.sleep(2)

# The driver still controls the original tab; switching must be done explicitly
b.switch_to.window(b.window_handles[0])
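window_handles[0] is the original tab; the tab that the click opened sits at the end of the list. A short sketch of working in the new tab and coming back (note that close() only closes the current tab and does not switch for you):

b.switch_to.window(b.window_handles[-1])  # jump to the newly opened tab
print(b.title)                            # commands now target the new tab
b.close()                                 # closes only the current tab
b.switch_to.window(b.window_handles[0])   # must switch back before continuing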
四、Getting page cookies with Selenium
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

url = 'https://www.taobao.com'
b = webdriver.Chrome()
b.get(url)

search_input = b.find_element(By.CSS_SELECTOR, '#q')
search_input.send_keys('电脑')
search_input.send_keys(Keys.ENTER)

# Taobao redirects to a login page at this point; the pause leaves time to log in by hand
time.sleep(30)
print('Manual operation finished!')

# Dump every cookie of the now logged-in session to a file
cookies = b.get_cookies()
with open('./files/taobao_cookies.txt', 'w', encoding='utf-8') as f:
    f.write(str(cookies))
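Serializing with str() forces the reader to use eval(), which executes whatever happens to be in the file. JSON round-trips the same list of dicts safely with only the standard library; a sketch, where taobao_cookies.json is just a stand-in name for the .txt file above:

import json

with open('./files/taobao_cookies.json', 'w', encoding='utf-8') as f:
    json.dump(cookies, f, ensure_ascii=False)

# later, instead of eval(f.read()):
with open('./files/taobao_cookies.json', encoding='utf-8') as f:
    cookies = json.load(f)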
五、Using cookies with Selenium
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

b = webdriver.Chrome()

# add_cookie only accepts cookies for the domain currently loaded, so open the site first
b.get('https://www.taobao.com')

# The file was written with str(), so eval() rebuilds the list of cookie dicts;
# only do this with a file you wrote yourself
with open('files/taobao_cookies.txt', encoding='utf-8') as f:
    cookies = eval(f.read())
for cookie in cookies:
    b.add_cookie(cookie)
time.sleep(1)

# Reload so the page picks up the injected login cookies
b.get('https://www.taobao.com')

search_input = b.find_element(By.CSS_SELECTOR, '#q')
search_input.send_keys('电脑')
search_input.send_keys(Keys.ENTER)
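Depending on the browser and driver version, add_cookie can reject individual cookies (invalid domain, expiry, or sameSite values are common complaints). A defensive variant of the loop above, assuming you would rather skip a bad cookie than abort the whole run:

from selenium.common.exceptions import WebDriverException

for cookie in cookies:
    try:
        b.add_cookie(cookie)
    except WebDriverException as e:
        print('skipped cookie:', cookie.get('name'), e)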
六、Page scrolling
import csv
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


def scrolling_web():
    # Scroll down in 500px steps so the lazy-loaded results actually render
    max_height = 10000
    y = 0
    while True:
        y += 500
        b.execute_script(f'window.scrollTo(0, {y})')
        if y > max_height:
            break
        time.sleep(1)


def get_net_data(url):
    # Open JD, search for the keyword and return the fully scrolled page source
    global b  # share the driver with the other helpers
    b = webdriver.Chrome()
    b.get(url)
    input1 = b.find_element(By.CSS_SELECTOR, '#key')
    input1.send_keys('电脑')
    input1.send_keys(Keys.ENTER)
    time.sleep(1)
    scrolling_web()
    return b.page_source


def analysis_data(data: str):
    html = etree.HTML(data)
    # Absolute path copied from the browser devtools; brittle if JD changes its layout
    all_lis = html.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/div/div[2]/ul/li')
    all_data = []
    for x in all_lis:
        # JD wraps the search keyword in its own highlight tag, which splits the title
        # into several text nodes; joining them with the keyword restores the full title
        name = '电脑'.join(x.xpath('./div/div[3]/a/em/text()')).replace('\n', '').replace('\t', '')
        url = 'https:' + x.xpath('./div/div[3]/a/@href')[0]
        image = 'https:' + x.xpath('./div/div[1]/a/img/@src')[0]
        # Integer and decimal parts of the price sit in separate tags
        price = x.xpath('./div/div[2]/strong/em/text()')[0] + x.xpath('./div/div[2]/strong/i/text()')[0]
        comments_num = x.xpath('./div/div[4]/strong/a/text()')[0]
        shop_name = x.xpath('./div/div[5]/span/a/text()')[0]
        shop_url = 'https:' + x.xpath('./div/div[5]/span/a/@href')[0]
        tag_1 = x.xpath('./div/div[6]//i/text()')
        tag = '/'.join(tag_1) if tag_1 else '无标签'
        all_data.append([name, url, image, price, comments_num, shop_name, shop_url, tag])
    time.sleep(1)
    return all_data


def page_turning():
    # Click "next page", then scroll again so the new results load
    input2 = b.find_element(By.CSS_SELECTOR, '.pn-next')
    input2.click()
    scrolling_web()
    return b.page_source


def save_data(data: list):
    # utf-8-sig writes a BOM so Excel opens the CSV with the right encoding
    with open('./files/京东电脑.csv', 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '链接', '图片', '价格', '评论数', '店铺名字', '店铺链接', '标签'])
        writer.writerows(data)


if __name__ == '__main__':
    url = 'https://www.jd.com'
    data = get_net_data(url)
    # Parse the first page, then click through and parse nine more
    all_data = analysis_data(data)
    for x in range(9):
        all_data += analysis_data(page_turning())
    save_data(all_data)
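The hard-coded max_height = 10000 assumes the results page is never taller than that. A common alternative, a sketch not tied to JD specifically, keeps scrolling until document.body.scrollHeight stops growing:

def scroll_to_bottom(driver, pause=1.0):
    # Keep scrolling to the bottom until the page height stops increasing
    last_height = driver.execute_script('return document.body.scrollHeight')
    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break
        last_height = new_height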