import ast
import os
import re
import tarfile
import time

import requests
from bs4 import BeautifulSoup
# HTTP request headers shared by the request() helper below: a desktop
# Chrome User-Agent plus an empty Cookie slot the caller can fill in
# before issuing authenticated requests.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
"Cookie": ""
}  # request headers
def request(url, timeout=10):
    """Fetch *url* and return the page as a BeautifulSoup document.

    :param url: full URL including scheme (e.g. "https://...").
    :param timeout: seconds before the HTTP request is aborted; new
                    parameter (default 10) so a dead host cannot hang
                    the caller forever.
    :return: BeautifulSoup object parsed with the built-in html.parser.
    """
    req = requests.get(url, headers=headers, timeout=timeout)
    try:
        # Re-decode with the encoding requests sniffs from the body, so
        # pages that mis-declare their charset still decode correctly.
        req.encoding = req.apparent_encoding
    except Exception:
        # Best effort only: keep requests' default encoding on failure.
        pass
    return BeautifulSoup(req.text, 'html.parser')
url = "www.baidu.com"
meta = request(url)
# 标签获取数据
text = meta.select('div[id="content"]')[0].text
# 经常使用的两个正则
re.sub("\xa0\xa0\xa0\xa0", "", text)
re.findall('(.*)', text)[0]
with open(paths, encoding="utf-8") as f: # 读取文件数据
text = f.read()
f.close()
from xlutils.copy import copy
import xlrd # pip install xlrd==1.2.0
def write_excel_xlsx_append(value, path):
    """Append one row to the first sheet of an existing .xls workbook.

    :param value: flat list of cell values, e.g. [a, a, a, a, a]
    :param path: path of the workbook, updated in place
    :return: None
    """
    book = xlrd.open_workbook(path)                     # open the workbook
    first_sheet = book.sheet_by_name(book.sheet_names()[0])
    start_row = first_sheet.nrows                       # rows already present
    writable = copy(book)                               # xlrd object -> writable xlwt object
    target = writable.get_sheet(0)                      # first sheet of the copy
    # Write the new values into the first empty row, one cell per column.
    for col, cell in enumerate(value):
        target.write(start_row, col, cell)
    writable.save(path)
    print("写入数据成功!")
def write_excel_xlsx_append_MAX(value, path):
    """Append multiple rows to the first sheet of an existing .xls workbook.

    :param value: list of rows, e.g. [[a, a, a, a, a], [b, b, b, b]]
    :param path: path of the workbook, updated in place
    :return: None
    """
    book = xlrd.open_workbook(path)                     # open the workbook
    first_sheet = book.sheet_by_name(book.sheet_names()[0])
    start_row = first_sheet.nrows                       # rows already present
    writable = copy(book)                               # xlrd object -> writable xlwt object
    target = writable.get_sheet(0)                      # first sheet of the copy
    # Each incoming row lands just below the rows already in the sheet.
    for row_offset, row in enumerate(value):
        for col, cell in enumerate(row):
            target.write(start_row + row_offset, col, cell)
    writable.save(path)
    print("写入数据成功!")
def open_workbook(paths, hang):
    """Return every value in one column of a workbook's first sheet.

    :param paths: path of the .xls file to read
    :param hang: zero-based column index to extract
    :return: list of cell values for that column
    """
    book = xlrd.open_workbook(paths)  # load the file
    return book.sheet_by_index(0).col_values(hang)
import pandas as pd
def dq_xls(path):
    """Read a whole spreadsheet into a list of row lists.

    NOTE(review): this definition is overwritten by the two-argument
    ``dq_xls`` defined later in this file; only the later one survives
    at import time.

    :param path: spreadsheet path (any format pandas.read_excel accepts)
    :return: rows as nested lists, e.g. [[a, a, a], [a, a, a]]; NaN -> ""
    """
    frame = pd.read_excel(path).fillna("")
    return frame.values.tolist()
def dq_xls(path, sheet_name=0):
    """Read one sheet of a spreadsheet into a list of row lists.

    This redefines the single-argument ``dq_xls`` above; giving
    ``sheet_name`` a default of 0 (the first sheet) keeps one-argument
    callers working instead of crashing with TypeError after the
    redefinition, and matches the old one-argument behavior.

    :param path: spreadsheet path (any format pandas.read_excel accepts)
    :param sheet_name: sheet name or index to read (default: first sheet)
    :return: rows as nested lists, e.g. [[a, a, a], [a, a, a]]; NaN -> ""
    """
    dr_data = pd.read_excel(path, sheet_name=sheet_name)
    dr_data = dr_data.fillna("")
    return dr_data.values.tolist()
def new_xlsx(path):
    """Create a new empty .xls workbook containing one sheet named 'sheet'.

    :param path: destination file path
    :return: None
    """
    # Local import: xlwt is never imported at file level, so the original
    # raised NameError the first time this function ran.
    import xlwt
    work_book = xlwt.Workbook(encoding='utf-8')
    # xlwt refuses to save a workbook with no sheets, so add one.
    work_book.add_sheet('sheet')
    work_book.save(path)
# --- small path/time utility snippets --------------------------------------
media_dir = os.path.abspath('media')    # absolute path of the 'media' folder
script_dir = os.path.dirname(__file__)  # directory containing this file
'''
Create the directory if it does not exist yet.
NOTE(review): the original bound os.path.exists' result to the name `pd`,
clobbering the pandas import above, and `dir` shadowed the builtin; both
are renamed here.  `path` itself must be defined by the caller — confirm.
'''
if not os.path.exists(path):
    os.makedirs(path)
data_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))  # current timestamp
data_time = time.strftime("%Y-%m-%d", time.localtime(time.time() - 86400))   # yesterday's date
import time
from datetime import datetime
# Convert a "YYYY-mm-dd HH:MM:SS" string into a Unix timestamp,
# interpreting the string as local time.
st = '2022-09-16 11:03:35'
parsed = datetime.strptime(st, '%Y-%m-%d %H:%M:%S')
time_str = time.mktime(parsed.timetuple())
def untar_win(fname, dirs):
    """Extract a .tar.gz archive.

    :param fname: path of the archive to extract
    :param dirs: directory the contents are extracted into
    :return: True on success, False on any extraction error
    """
    try:
        # with-statement guarantees the archive handle is closed even
        # when extractall() raises; the original leaked it on failure.
        # NOTE(review): extractall on an untrusted archive allows path
        # traversal ("../") — only use on archives you control.
        with tarfile.open(fname) as archive:
            archive.extractall(path=dirs)
        return True
    except (tarfile.TarError, OSError) as e:
        print(e)
        return False
# 输入
import win32api
import win32con
import ctypes
def input_str(data_str):
    """Type *data_str* by synthesizing keyboard events.

    Each character is sent as a key-down followed by a key-up through the
    Win32 keybd_event API.  Windows-only (win32api / user32.dll).
    """
    # Resolve MapVirtualKeyA once instead of on every character.
    map_virtual_key = ctypes.windll.user32.MapVirtualKeyA
    for ch in str(data_str):
        code = ord(ch)
        scan = map_virtual_key(code, 0)
        win32api.keybd_event(code, scan, 0, 0)
        win32api.keybd_event(code, scan, win32con.KEYEVENTF_KEYUP, 0)
# 点击
from pynput.mouse import *
def dianji(x, y):
    """Move the mouse pointer to (x, y) and perform one left click.

    Sleeps one second before and after the click so the target UI has
    time to react.  Requires pynput's mouse Controller/Button.
    """
    mouse = Controller()
    mouse.position = (x, y)
    print('设置鼠标的坐标:{0}'.format(mouse.position))
    time.sleep(1)
    mouse.press(Button.left)
    mouse.release(Button.left)
    time.sleep(1)
# Start a worker process.
# NOTE(review): `you_def` and `file_data_one` are placeholders for the
# caller's target function and its argument — this is template code and
# raises NameError unless both are defined before it runs.
from multiprocessing import Process
p = Process(target=you_def, args=(file_data_one,))
p.start()
# Captcha recognition with the ddddocr package (template snippet;
# expects an image file named '1.jpg' in the working directory).
import ddddocr
ocr = ddddocr.DdddOcr()
with open('1.jpg', 'rb') as f:
    img_bytes = f.read()
# res holds the recognized captcha text.
res = ocr.classification(img_bytes)
import ddddocr
def get_ddddocr(path):
    """Run ddddocr captcha recognition on an image file.

    :param path: path of the captcha image
    :return: recognized text
    """
    engine = ddddocr.DdddOcr()
    with open(path, 'rb') as img_file:
        raw = img_file.read()
    return engine.classification(raw)
# OCR an image with pytesseract (template snippet; expects '2.jpg' in the
# working directory, plus the Tesseract binary and the chi_sim
# (simplified Chinese) language pack installed on the system).
import pytesseract
from PIL import Image
image = Image.open('2.jpg')
# t holds the recognized text.
t = pytesseract.image_to_string(image, lang='chi_sim')
# Image cropping helper.
from PIL import Image
def img_jq(data):
    """Crop a w*h rectangle whose top-left corner is (x, y) out of an
    image and save it to a new file.

    date = {
        'x': '474',
        'y': '407',
        'w': '964',
        'h': '38',
        'path': '1.jpg',
        'out_name_path': 'test2.jpg',
    }
    :param data: dict of values as above (numbers may be strings)
    :return: None
    """
    left = float(data['x'])
    top = float(data['y'])
    right = left + float(data['w'])
    bottom = top + float(data['h'])
    source = Image.open(data['path'])
    cropped = source.crop((left, top, right, bottom))
    # Normalize to RGB (JPEG cannot store an alpha channel) before saving.
    cropped.convert('RGB').save(data['out_name_path'])
# Parse a Python-literal string into an object (safer than eval).
# NOTE(review): `str` here is the caller's input string (it shadows the
# builtin `str`), and `ast` must be imported first.  literal_eval accepts
# Python literals only — JSON's true/false/null will raise ValueError.
json3=ast.literal_eval(str)
# chromedriver usage (template snippet).
# driver downloads: http://chromedriver.storage.googleapis.com/index.html
from selenium import webdriver
# Chrome needs a full URL; a bare host like 'www.baidu.com' is rejected.
url = 'https://www.baidu.com'
chrome_options = webdriver.ChromeOptions()
chromedriver_path = r'chromedriver.exe'
# chrome_options.add_argument("--auto-open-devtools-for-tabs")
# chrome_options.add_argument('--start-maximized')
# chrome_options.add_argument('--headless')
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument(
    '--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"')
brow = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options)
brow.set_page_load_timeout(8)
brow.get(url)
brow.execute_script('window.stop()')  # abort any still-pending page loads
# NOTE(review): `user` is a placeholder the caller must define; the element
# id "txtUserName2" is site-specific.
brow.find_element_by_id("txtUserName2").click()
brow.find_element_by_id("txtUserName2").send_keys(user)
# Switch into an embedded iframe.
iframe = brow.find_element_by_css_selector('iframe')
print(iframe)
# switch_to_frame() was removed in Selenium 4; switch_to.frame() works in
# both Selenium 3 and 4.
brow.switch_to.frame(iframe)