老的荣耀手机属于华为云系统,家里人换了新荣耀手机属于荣耀云系统无法通过云空间将备忘录转移到新手机,不想让他们一个一个搞,于是整了一晚上想办法爬取下来。从网页抓取下来,然后存到docx文档中(包括文字和图片,别的形式的内容请举一反三)
本方法Cons:不能复制到荣耀云里,因为捣了半天这个根本就没有除了手机之外可以访问的方法
手机内部自动化保存为文档后处理
华为手机备忘录批量导出txt╳ 全自动版 ╳By 免ROOT自动化助手
cons:这个up是导出一条删除一条,我不想删除,它有for循环但是要付费,付费倒不是啥别太贵就行毕竟人家做的工具,但是一看还要注册登录,,,
fiddler抓包方式抓所有笔记页面
华为手机备忘录批量导出文字和图片
cons:看了一下需要手动一个一个点笔记,不想动了
import json
import os
import shutil
import time
import requests as requests
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
def login_and_save_to_cookies(driver, url):
# driver = webdriver.Chrome()
driver.get(url)
val = input("Manually login now. Enter anything if finished")
# time.sleep(20) # Let the user actually see something!
cookies = driver.get_cookies()
# 转换成字符串保存
json_cookie = json.dumps(cookies)
# 保存到txt文件
with open('cookies.json', 'w') as f:
f.write(json_cookie)
print('cookies保存成功!')
def login_with_cookies(driver):
with open('cookies.json','r',encoding='utf8') as f:
cookies = json.loads(f.read())
# 给浏览器添加cookies
for cookie in cookies:
driver.add_cookie(cookie)
# 刷新网页,cookies才会成功,后面我们直接跳转就不用刷新了
# driver.refresh()
if __name__ == '__main__':
driver = webdriver.Chrome()
# 笔记的第一页url,后面需要用到
url = 'https://cloud.huawei.com/home#/notepad/note/allNote/note/xxxxxxxxxxxxxxxxxxxxxxx'
# 加载已有cookies之前至少得先访问一遍
driver.get(url)
time.sleep(5) # 需要等待加载完
# 切换成中文
language_button = driver.find_element(By.XPATH,'//*[@id="Cloudlogin"]/div[1]/div[1]/div[2]/a[1]/span[1]')
ActionChains(driver).click(language_button).perform() # single click
chinese_button = driver.find_element(By.XPATH,'//*[@id="scroller"]/ul/li[65]/div')
ActionChains(driver).click(chinese_button).perform() # single click
time.sleep(5)
val = input("If using existing cookies, type 'n', manually login type anything else")
if val != 'n':
login_and_save_to_cookies(driver, url)
login_with_cookies(driver)
time.sleep(5)
driver.get(url)
time.sleep(5)
# driver.find_element(By.)
note_center_items = driver.find_elements(By.CLASS_NAME, "note_item")
note_titles = []
note_times = []
note_contents = []
chrome_download_directory = "C:\\Users\\xxxxxxxxxxxxxx\\Downloads\\" # 浏览器默认下载路径
img_store_directory = "./img/" # 都需要以/结尾以便后面拼接
for note_center_item in note_center_items:
# print(note_center_item.get_attribute("autokey"))
ac = note_center_item
# ActionChains(driver).scroll_to_element(ac).perform()
ActionChains(driver).click(ac).perform() # single click
driver.execute_script("arguments[0].scrollIntoView(true);", ac) # 必须滑动,不然点击视窗外的元素无法加载右侧笔记内容
time.sleep(5)
title_element = driver.find_element(By.XPATH,
'//*[@id="note_title_editor"]/div/div[6]/div[1]/div/div/div/div[5]/pre/span')
time_element = driver.find_element(By.XPATH,
'//*[@id="app"]/div[6]/div[3]/div/div[1]/div[4]/div[2]/div/span[1]')
content_multiline_container_element = driver.find_element(By.XPATH,
'//*[@id="note_detail_editor"]/div/div[6]/div[1]/div/div/div/div[5]')
content_singleline_containers = content_multiline_container_element.find_elements(By.XPATH, './div')
note_content = []
for i in range(len(content_singleline_containers)):
content_singleline_container = content_singleline_containers[i]
try:
# is an image
img_element = content_singleline_container.find_element(By.XPATH, './div/div/img')
img_src = img_element.get_attribute("src")
# 用浏览器下载,这里request好像不行
driver.get(img_src)
# cur_img = requests.get(img_src)
# note_content.append(cur_img)
# 等待下载完
time.sleep(3)
# 转移,加jpg后缀因为这里都是jpg格式
downloaded_file_name = img_src.split("/")[-1]
shutil.move(chrome_download_directory + downloaded_file_name, img_store_directory)
os.rename(img_store_directory + downloaded_file_name, img_store_directory + downloaded_file_name + '.jpg')
note_content.append(downloaded_file_name + '.jpg') # 标记位置
except:
# is text
note_content.append(content_singleline_container.text) # 可以直接获取最里面的值, 也可以通过./pre/span/span获取最里面的元素再获取text
# 如果全是文字可以直接合并成段落
# if i < len(content_singleline_containers) - 1:
# note_content += "\n"
note_title = title_element.text
note_time = time_element.text
note_titles.append(note_title)
note_times.append(note_time)
note_contents.append(note_content)
print(note_title)
print(note_time)
print(note_content)
print(len(note_titles), len(note_times), len(note_contents))
result = []
for i in range(len(note_titles)):
result.append([note_titles[i], note_times[i], note_contents[i]])
with open('result.json', 'w') as f:
json.dump(result, f)
# f = open("result.txt", "w")
# for i in range(len(note_times)):
# f.write(note_times[i])
# f.write(note_contents[i])
# f.write("\n")
# f.close()
# time.sleep(20)
import json
from docx import Document
from docx.shared import Inches
document = Document()
with open('result.json', 'r') as f:
data_json = json.load(f)
img_store_directory = "./img/"
for note_title, note_time, note_content in data_json:
if len(note_title) > 0:
# 这里很多都是空标题所以如果空就不写入
document.add_paragraph(note_title)
document.add_paragraph(note_time)
for note_content_line in note_content:
if note_content_line.split('.')[-1] == 'jpg':
# is an image
document.add_picture(img_store_directory + note_content_line, width=Inches(5))
else:
document.add_paragraph(note_content_line)
document.add_paragraph("\n")
document.save('备忘录.docx')