Python web scraping, shigen
Notes compiled on May 27, 2023, shared here now.
pip install requests
Reference article: python 爬虫之 BeautifulSoup - 掘金
Example: scrape the movie titles from the Douban Top 250 list.
A small tool for checking your browser's User-Agent: 获取浏览器UA(userAgent)信息
import requests
from bs4 import BeautifulSoup
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
url = 'https://movie.douban.com/top250'
response = requests.get(url, headers=header)
content = response.text
soup = BeautifulSoup(content, 'html.parser')
all_titles = soup.find_all('span', attrs={'class': 'title'})
for title in all_titles:
    print(title.get_text())
# HTML/XML parsing
pip install lxml
from lxml import etree
tree = etree.HTML(content)
titles = tree.xpath('//*[@id="content"]/div/div[1]/ol//li/div/div[2]/div[1]/a/span[1]/text()')
print(titles)
session = requests.Session()
# a Session carries cookies across requests within the same context
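A minimal sketch (not from the original notes) of what carrying cookies means, using httpbin.org as a stand-in endpoint:
import requests

session = requests.Session()
# the first request sets a cookie and the Session stores it
session.get('https://httpbin.org/cookies/set?token=abc123')
# the second request sends the stored cookie back automatically
print(session.get('https://httpbin.org/cookies').json())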
Referer tracing: the Referer header identifies the page one level above the current request; sites check it to block hotlinking.
The example below downloads a video from Pearvideo, getting past its anti-hotlink (Referer) check.
# -*- encoding: utf-8 -*-
__date__ = '2023/05/28 10:33:03'
# this python script downloads a video from pearvideo
import requests
url = 'https://www.pearvideo.com/video_1413858'
contant_id = url.split('/')[-1].split('_')[-1]
session = requests.Session()
session.get(url)
# build the status-JSON url from the content id instead of hard-coding it
request_json_url = f'https://www.pearvideo.com/videoStatus.jsp?contId={contant_id}&mrd=0.051782001313967596'
# the Referer header is what gets the request past the anti-hotlink check
headers = {'Referer': url}
res_json = session.get(url=request_json_url, headers=headers).json()
print(res_json)
systemTime = res_json['systemTime']
srcUrl = res_json['videoInfo']['videos']['srcUrl']
videoImg = res_json['videoInfo']['video_image']
videoUrl = srcUrl.replace(systemTime, 'cont-'+contant_id)
print(videoUrl, videoImg)
basePath = './files/' + contant_id
import os
if not os.path.exists(basePath):
    os.makedirs(basePath)
# download video image
img_save_path = os.path.join(basePath, videoImg.split('/')[-1])
with open(img_save_path, 'wb') as file:
    file.write(session.get(videoImg).content)
# download video
video_save_path = os.path.join(basePath, videoUrl.split('/')[-1])
with open(video_save_path, 'wb') as file:
    file.write(session.get(videoUrl).content)
Access the target website through a proxy IP.
Free HTTP proxies in China: 国内最新免费HTTP代理IP - 站大爷 (list retrieved 2023-05-28 07:00).
url = 'http://www.baidu.com'
import requests
proxy_ip = 'http://114.251.193.153:3128'  # the scheme and '//' are required
proxies = {
    "http": proxy_ip,
    "https": proxy_ip
}
response = requests.get(url=url, proxies=proxies)
# response.text can come back garbled, so decode the raw bytes explicitly
text = response.content.decode('utf-8')
print(text)
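As a quick sanity check (not in the original notes), an IP-echo endpoint such as httpbin.org/ip shows which IP the target server actually sees; with a working proxy it should be the proxy's address, not yours:
import requests

proxies = {
    "http": "http://114.251.193.153:3128",
    "https": "http://114.251.193.153:3128",
}
# httpbin echoes back the caller's IP as seen by the server
print(requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10).json())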
Thread: the smallest unit of CPU scheduling; threads in the same process share memory.
Process: the basic unit of resource allocation; each process has its own independent memory space.
from threading import Thread

def func(name):
    for i in range(100):
        print('func() called', i)

class MyThread(Thread):
    def run(self):
        for i in range(100):
            print('mythread func() called', i)

if __name__ == '__main__':
    # t = Thread(target=func, args=('shigen',))
    # t.start()
    t = MyThread()
    t.start()
    for i in range(100):
        print(i)
The multiprocessing API mirrors the threading API.
from multiprocessing import Process
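As a minimal sketch of the mirrored API (the worker function and its argument are illustrative), a Process is created and started the same way a Thread is:
from multiprocessing import Process

def work(name):
    for i in range(10):
        print(name, i)

if __name__ == '__main__':
    p = Process(target=work, args=('shigen',))
    p.start()
    p.join()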
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
def fn(name):
    for i in range(100):
        print(name, i)

if __name__ == "__main__":
    with ThreadPoolExecutor(50) as t:
        for i in range(100):
            t.submit(fn, name=f'thread-{i}')
    print('all done')
Getting the results back from the worker threads:
import time
from concurrent.futures import ThreadPoolExecutor

# example task; the original notes do not define action, so this is a stand-in
def action(x):
    time.sleep(1)
    return x * 2

# create a thread pool with 5 worker threads
executor = ThreadPoolExecutor(max_workers=5)
lists = [1, 2, 3, 4, 5, 7, 8, 9, 10]
start_time = time.time()
result = [data for data in executor.map(action, lists)]
print(result)
executor.shutdown()
print(time.time() - start_time)
Coroutines (asyncio): when the program hits an I/O operation, it can switch to another task instead of blocking.
import asyncio
async def download(url):
    print(f'start downloading {url}')
    await asyncio.sleep(2)
    print('downloading finished')

async def main():
    urls = [
        'http://www.google.com',
        'http://www.baidu.com',
        'http://www.xiaomi.com'
    ]
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(download(url)))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
import asyncio
import aiohttp
basepath = './files/'
headers = {'Referer': 'https://xxxx.cn/'}
async def download(url):
    print(f'start downloading {url}')
    filename = url.split('/')[-2]
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as resp:
            with open(basepath + filename, mode='wb') as f:
                f.write(await resp.content.read())
    print(f'downloading {url} finished')

async def main():
    urls = []
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(download(url)))
    await asyncio.wait(tasks)

if __name__ == '__main__':
    asyncio.run(main())
Selenium automates the browser for testing: it opens a real browser and operates it the way a person would.
pip install selenium
Unzip the driver into the directory of the Python interpreter and rename it:
tar -zxvf chromedriver_mac_arm64.zip
ll
which python3
mv chromedriver ~/opt/anaconda3/bin
from selenium.webdriver import Chrome
# create the browser object
web = Chrome()
web.get('http://www.baidu.com')
print(web.get_window_size())
print(web.title)
Possible issue: macOS may refuse to open chromedriver because the developer cannot be verified; see the CSDN article 解决mac无法打开chromedriver报错,由于无法验证开发人员,因此无法打开"chromedriver".
Example: scrape the details of Python books on JD.com and save them to a CSV file.
TODO: some product pages use a different layout, so element lookups can fail. One mitigation is sketched below.
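A minimal sketch (not part of the original notes) of using selenium's explicit waits instead of fixed sleeps, which makes lookups more tolerant of slow-loading pages; the element id 'key' is the JD search box used in the code below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.jd.com/')
# wait up to 10 seconds for the search box to appear instead of sleeping blindly
search = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'key'))
)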
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
driver = webdriver.Chrome()
driver.get('https://www.jd.com/')
# locate the search box
search = driver.find_element(By.XPATH, '//*[@id="key"]')
# locate the search button
button = driver.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
# type Python into the search box
search.send_keys('Python')
# click the search button
button.click()
time.sleep(2)
# collect the product list
goods_list = driver.find_elements(By.XPATH, '//*[@id="J_goodsList"]/ul/li')
datas = []
for good in goods_list[:-1]:
    # CSS selectors are more reliable here
    img = good.find_element(By.CSS_SELECTOR, 'a > img').get_attribute('src')
    name = good.find_element(By.CSS_SELECTOR, 'div > div.p-name.p-name-type-2 > a > em').text
    prize = good.find_element(By.TAG_NAME, 'i').text
    url_element = good.find_element(By.CSS_SELECTOR, 'div > a')
    url = url_element.get_attribute('href')
    url_element.click()
    # switch to the newly opened detail window
    driver.switch_to.window(driver.window_handles[-1])
    # the detail page provides the book description, the author bio and the table of contents
    book_detail = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-3"]/div[2]/div').text
    author_detail = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-4"]/div[2]/div').text
    book_menu = driver.find_element(By.XPATH, '//*[@id="detail-tag-id-6"]/div[2]/div').text
    print(img, name, prize, url)
    datas.append({
        'img': img,
        'name': name,
        'prize': prize,
        'url': url,
        'book_detail': book_detail,
        'author_detail': author_detail,
        'book_menu': book_menu,
    })
    # close the detail window and switch back to the original one
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
df = pd.DataFrame(datas)
df.to_csv('book_detail.csv', index=True, header=True)
Headless mode: drive the browser without showing any UI.
from selenium import webdriver

# create the options object
options = webdriver.ChromeOptions()
# enable headless mode
options.add_argument('--headless')
# disable the GPU
options.add_argument('--disable-gpu')
# optionally route traffic through a proxy
options.add_argument('--proxy-server=http://150.138.253.70:808')
# optionally change the User-Agent
options.add_argument('--user-agent=Opera/9.23 (X11; Linux x86_64; U; en)')
# pass the options when creating the driver (chrome_options= is deprecated in newer selenium)
driver = webdriver.Chrome(options=options)
Use OCR or the Chaojiying (超级鹰) platform to recognize CAPTCHAs.
Chaojiying: 超级鹰验证码识别, a commercial cloud CAPTCHA-recognition service.
The tutorial walks through using selenium to log in to 12306 (中国铁路12306网站, China Railway's ticketing site) automatically.
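The notes stop short of OCR code. As a rough sketch of the OCR route only (12306 actually uses a click-the-picture captcha, which plain OCR cannot solve), one could screenshot a text captcha element with selenium and feed it to pytesseract; the page URL and element ids here are hypothetical:
from selenium import webdriver
from selenium.webdriver.common.by import By
from PIL import Image
import pytesseract
import io

driver = webdriver.Chrome()
driver.get('https://example.com/login')                      # hypothetical login page
captcha_el = driver.find_element(By.ID, 'captcha-img')       # hypothetical element id
# screenshot just the captcha element and run it through OCR
img = Image.open(io.BytesIO(captcha_el.screenshot_as_png))
code = pytesseract.image_to_string(img).strip()
driver.find_element(By.ID, 'captcha-input').send_keys(code)  # hypothetical input id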