使用python3+selenium+chromeDriver爬取sobooks的电子书,记录书的信息(书名,作者,描述,出版时间等),并转存到自己的百度云盘
环境:安装XAMPP;下载与chrome版本对应的chromedriver.exe并放到python安装目录下;安装python连接mysql的插件:pymysql
以下为配置文件:config.json
1.百度云的账号及密码
"baidu": {
"username": "",
"password": ""
}
这里填百度云的账号和密码。账号和密码需先用Base85编码后再填入(注意:这只是编码,并不是真正的加密,仍需妥善保管配置文件),使用下面的例子可以算出编码后的字符串
编码示例(Python):
```python
>>> import base64
>>> base64.b85encode(b'username').decode()
'b#rBMZeeX@'
```
2.sobooks的提取码
"sobook_code": "2018919"
提取码需要到微信公众号获取,会不定时更新
3.mysql数据
"db": {
"host":"127.0.0.1",
"port":3306,
"user": "root",
"passwd": "",
"db_name":"test"
}
可以使用XAMPP安装并启动Apache及MySQL,然后填入数据库连接相关信息即可。
4.保存到百度云的文件夹
目前只支持一级目录名;如需多级目录等定制,请自行修改代码
"save_baidu_dir": "book"
{
"debug": false,
"baidu": {
"username": "",
"password": ""
},
"sobook_code": "2018919",
"db": {
"host":"127.0.0.1",
"port":3306,
"user": "root",
"passwd": "",
"db_name":"test"
},
"save_baidu_dir": ""
}
解析json配置文件:config.py
import argparse
import json
import logging
import sys
from base64 import b85decode
from pathlib import Path
# Layout of every log line emitted through the root logger.
log_format = '%(asctime)s %(name)s[%(module)s] %(levelname)s: %(message)s'
# Configure the root logger at import time; records at INFO and above are shown.
logging.basicConfig(level=logging.INFO, format=log_format)
class Config:
    """Runtime settings parsed from the JSON configuration file.

    Every attribute is created in ``__init__`` so that it exists even when
    part of the configuration is missing or malformed:

    * ``debug``        -- value of the ``debug`` key.
    * ``baidu``        -- dict with the decoded ``username`` / ``password``.
    * ``sobookCode``   -- value of the ``sobook_code`` key.
    * ``db``           -- dict with ``host``, ``port``, ``user``, ``passwd``
      and ``dbName``.
    * ``saveBaiduDir`` -- value of the ``save_baidu_dir`` key.
    """

    def __init__(self):
        self.debug = False
        self.baidu = {
            'username': '',
            'password': ''
        }
        self.sobookCode = ''
        # Same values as the sample configuration shipped with the project.
        self.db = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'passwd': '',
            'dbName': 'test',
        }
        self.saveBaiduDir = ''

    @classmethod
    def load(cls, d):
        """Build a configuration object from the parsed JSON dict ``d``.

        The Baidu credentials are stored Base85-encoded in the file and are
        decoded here. Each section is read independently: a broken ``baidu``
        section no longer prevents the ``db`` section from being loaded.
        Errors are logged and the defaults from ``__init__`` are kept.

        Returns:
            The populated configuration instance.
        """
        the_config = cls()
        the_config.debug = d.get('debug', False)
        the_config.sobookCode = d.get('sobook_code', "")
        the_config.saveBaiduDir = d.get('save_baidu_dir', "")
        try:
            the_config.baidu = {
                'username': b85decode(d['baidu']['username']).decode(),
                'password': b85decode(d['baidu']['password']).decode()
            }
        except Exception as e:
            logging.error('获取配置文件出错: ' + repr(e))
        try:
            the_config.db = {
                'host': d['db']['host'],
                'port': d['db']['port'],
                'user': d['db']['user'],
                'passwd': d['db']['passwd'],
                'dbName': d['db']['db_name'],
            }
        except Exception as e:
            logging.error('获取配置文件出错: ' + repr(e))
        if not (the_config.baidu['username'] and the_config.baidu['password']):
            # Some page actions are still useful without credentials (moving
            # focus to the input box, scrolling to the login form, ...), so the
            # browser's auto-login step is not disabled; the form is meant to
            # be submitted automatically only when both values are present.
            logging.info('用户名/密码未找到, 自动登录功能将不可用.')
        return the_config
def load_config():
    """Locate, read and parse the JSON configuration file.

    The file name comes from the ``-c/--config`` command line option and
    defaults to ``config.json``. When that file does not exist,
    ``config.default.json`` in the same directory is used instead.

    Returns:
        The object produced by ``Config.load`` for the parsed JSON.

    Raises:
        SystemExit: when the file cannot be read or is not valid JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='config file name')
    args = parser.parse_args()
    config_name = args.config or 'config.json'
    logging.info('使用配置文件 "{}".'.format(config_name))
    config_file = Path(config_name)
    if not config_file.exists():
        config_name = 'config.default.json'
        logging.warning('配置文件不存在, 使用默认配置文件 "{}".'.format(config_name))
        config_file = config_file.parent.joinpath(config_name)
    try:
        # Path.resolve() behaves differently on 3.5 and 3.6: on 3.5 it raises
        # when the file does not exist, on 3.6 only resolve(strict=True) does
        # (strict defaults to False). Either way a missing file ends up in the
        # except branch below, at the latest when it is read.
        config_file = config_file.resolve()
        # Decode explicitly instead of relying on the locale encoding, which
        # is not UTF-8 on many Windows setups; "utf-8-sig" also accepts the
        # BOM some editors prepend, which json.loads would otherwise reject.
        config_dict = json.loads(config_file.read_text(encoding='utf-8-sig'))
    except Exception as e:
        sys.exit('# 错误: 配置文件载入失败: {}'.format(e))
    return Config.load(config_dict)
# Module-level configuration object: the file is located and parsed once, when
# this module is first imported, and other modules import this name.
config = load_config()
数据库文件:mysql.py
记录获取的书的相关信息:书名、作者、书的描述,书icon的url等信息
import pymysql
# id name author booktype preview tag time isbn dbgrade bookself authorself
from config import config
class Mysql(object):
    """Small wrapper around one pymysql connection used to store book records.

    Two tables are used (see ``create_table``): ``authorbg`` keeps author
    descriptions and ``sobook`` keeps one row per book, linked to its author
    description through ``author_des_id``.
    """

    # Columns of ``sobook`` that are copied straight from the book-info dict.
    _BOOK_COLUMNS = (
        "name", "author", "book_format", "preview_count", "tag",
        "publish_time", "dbpf", "isbn", "file_name", "file_size",
        "icon_url", "book_describe", "book_type",
    )

    def __init__(self):
        """Connect with the settings in ``config.db``.

        A failed connection is reported on stdout and leaves ``conn`` and
        ``cur`` set to ``None``; no exception is raised from here.
        """
        # Defined up front so both attributes exist even if connecting fails.
        self.conn = None
        self.cur = None
        try:
            self.conn = pymysql.connect(
                host=config.db["host"],
                port=config.db["port"],
                user=config.db["user"],
                passwd=config.db["passwd"],
                db=config.db["dbName"],
                charset='utf8'
            )
        except Exception as e:
            print(e)
            print('连接失败')
        else:
            print('连接成功')
            self.cur = self.conn.cursor()

    def create_table(self):
        """Create the ``authorbg`` and ``sobook`` tables when they are missing.

        ``IF NOT EXISTS`` makes the call safe to repeat on every run; errors
        are printed, not raised.
        """
        statements = (
            'create table if not exists authorbg('
            'author_des_id int not null auto_increment PRIMARY KEY,'
            'author varchar(255), author_describe text) DEFAULT CHARSET=utf8 ',
            'create table if not exists sobook('
            'book_id int not null auto_increment PRIMARY KEY, name varchar(255),'
            'author varchar(255),book_format varchar(255),preview_count varchar(255),'
            'tag varchar(255),publish_time varchar(255),dbpf varchar(255),'
            'isbn varchar(255),file_name varchar(255), file_size varchar(255),'
            'icon_url varchar(255), book_describe text, book_type varchar(255),'
            'author_des_id int,'
            'FOREIGN KEY (author_des_id) REFERENCES authorbg(author_des_id)) '
            'DEFAULT CHARSET=utf8 ',
        )
        try:
            for sql in statements:
                res = self.cur.execute(sql)
                print(res)
        except Exception as e:
            print("Exception:", e)

    def close(self):
        """Close the cursor and the connection, if they were ever opened."""
        if self.cur is not None:
            self.cur.close()
        if self.conn is not None:
            self.conn.close()

    def addObject(self, bookInfo):  # create
        """Insert one book described by the dict ``bookInfo``.

        The author description is stored (or reused) first so the book row can
        reference it. Keys missing from ``bookInfo`` are stored as NULL
        instead of raising ``KeyError``.
        """
        describeAuthorId = self.addAuthorBg(
            bookInfo.get("author", ""), bookInfo.get("author_describe", ""))
        columns = self._BOOK_COLUMNS + ("author_des_id",)
        sql = "INSERT INTO sobook ({}) VALUES ({})".format(
            ",".join(columns), ", ".join(["%s"] * len(columns)))
        val = tuple(bookInfo.get(col) for col in self._BOOK_COLUMNS)
        # Pass the id as-is: an int for a real row, None (NULL) when the
        # author description could not be stored.
        val += (describeAuthorId,)
        res = self.cur.execute(sql, val)
        if res:
            self.conn.commit()
            print("sucess:")
        else:
            print("error:")
            self.conn.rollback()
        print(res)

    def addAuthorBg(self, author, author_describe):
        """Return the id of the matching author description, inserting it if new.

        Two descriptions match when they are equal after ``dealWithString``
        normalisation.
        """
        preInsertStr = self.dealWithString(author_describe)
        sql = "select * FROM authorbg WHERE author = %s"
        self.cur.execute(sql, (author,))
        for item in self.cur.fetchall():
            print(item)
            # Row layout follows the table: (author_des_id, author, author_describe).
            if self.dealWithString(item[2]) == preInsertStr:
                return item[0]
        return self.insertAuthorBg(author, author_describe)

    def dealWithString(self, author_describe):
        """Return ``author_describe`` without spaces, ``\\n`` and ``\\r``.

        ``None`` or an empty value yields ``""``.
        """
        if not author_describe:
            return ""
        return author_describe.translate(str.maketrans("", "", " \n\r"))

    def insertAuthorBg(self, author, author_describe):
        """Insert a new author description.

        Returns:
            The primary key of the new row, or ``None`` when no row was
            inserted (the transaction is rolled back in that case).
        """
        sql = "INSERT INTO authorbg (author, author_describe) VALUES (%s, %s)"
        val = (author, author_describe)
        describeId = None
        res = self.cur.execute(sql, val)
        if res:
            describeId = int(self.conn.insert_id())  # primary key of the last inserted row
            self.conn.commit()
        else:
            self.conn.rollback()
        print(res)
        return describeId

    def rem(self, name):  # delete
        """Delete every book whose ``name`` column equals ``name``."""
        sql = "DELETE FROM sobook WHERE name = %s"
        res = self.cur.execute(sql, (name,))
        if res:
            self.conn.commit()
        else:
            self.conn.rollback()
        print(res)

    def mod(self):  # update
        """Demo update: rename the book whose ``book_id`` is 2."""
        # The primary key column of ``sobook`` is ``book_id``.
        sql = 'update sobook set name="Tom Ding" where book_id=2'
        res = self.cur.execute(sql)
        if res:
            self.conn.commit()
        else:
            self.conn.rollback()
        print(res)

    def show(self):  # read
        """Print every row of ``sobook``."""
        sql = 'select * from sobook'
        self.cur.execute(sql)
        for row in self.cur.fetchall():
            print(row)
# Manual check: running this module directly only constructs a Mysql object,
# which attempts the connection; the other calls are kept disabled below.
if __name__ == "__main__":
    mysql = Mysql()
    #mysql.create_table()
    # mysql.add("book6","author4","http://www.baidu.com","6","author6 xx xx xxx ","haha")
    # mysql.show()
    # mysql.close()
爬sobook网站,并记录到数据库:sobooks.py
# 导入selenium的浏览器驱动接口
from selenium import webdriver
import time
import json
import time
import datetime
from mysql import Mysql
from selenium.webdriver.common.action_chains import ActionChains #引入ActionChains鼠标操作类
from selenium.webdriver.common.keys import Keys #引入keys类操作
#import sys
# 要想调用键盘按键操作需要引入keys包
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
# 导入chrome选项
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from config import config
# Shared database handle; constructing it attempts the connection at import time.
mysql = Mysql()
# Slug of the first category; not referenced by the code visible in this file.
curPage = "lishizhuanji"
# Fields of the book currently being scraped. Filled by main()/bookdetailPage()
# and replaced with a fresh dict by jumToBaiDuYun() after each book is stored.
bookInfoStruct = {}
def main():
    """Crawl every sobooks category, then release the browser and the database.

    Each category page is handed to ``requestCurrentPage``. Any exception
    stops the crawl and is printed; the browser and the database connection
    are released in all cases.
    """
    mysql.create_table()
    chrome_options = Options()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('log-level=3')
    driver = webdriver.Chrome(options=chrome_options)
    # (URL slug, category name stored in the book_type column), in crawl order.
    # "renwensheke" and "jingjiguanli" were crawled but had no name, which
    # raised KeyError and ended the crawl after the second category.
    categories = [
        ("lishizhuanji", "历史传记"),
        ("xiaoshuowenxue", "小说人文"),
        ("renwensheke", "人文社科"),
        ("lizhichenggong", "励志成功"),
        ("jingjiguanli", "经济管理"),
        ("xuexijiaoyu", "学习教育"),
        ("shenghuoshishang", "生活时尚"),
        ("yingwenyuanban", "英文原版"),
    ]
    try:
        for pageUrl, categoryName in categories:
            bookInfoStruct["book_type"] = categoryName
            requestCurrentPage(driver, "https://sobooks.cc/" + pageUrl)
    except Exception as e:
        print('Got an error ', e)
    else:
        print("sucess...ok..end.....")
    finally:
        # Also runs on KeyboardInterrupt, so no headless Chrome is left behind.
        driver.quit()
        mysql.close()
##请求一个大类的list
def requestCurrentPage(driver, url):
    """Walk the paginated listing of one category and process every book on it.

    Pages 1..99 of ``url`` are visited. The walk stops early when the site
    redirects to its home page (treated as "past the last page") or when the
    page has no ``cardslist`` element. Every book link found is passed to
    ``bookdetailPage``.
    """
    for x in range(1, 100):
        pageUrl = url if x == 1 else url + "/page/" + str(x)
        print(" --------------------------------------------------------")
        print("current url:" + pageUrl)
        print(" --------------------------------------------------------")
        driver.get(pageUrl)
        time.sleep(10)
        if driver.current_url == "https://sobooks.cc/":
            print(" " + pageUrl + " end page:" + str(x))
            print(" ---------------------page--end-----------------------------------")
            return
        # find_elements returns an empty list when nothing matches, whereas
        # find_element raises, so this check can actually take the "no" branch.
        if not driver.find_elements(By.ID, "cardslist"):
            print("no cardslist")
            return
        print("have cardslist")
        normal_window = driver.current_window_handle
        cards = driver.find_elements(By.CSS_SELECTOR, "[class='card col span_1_of_4']")
        # Read all links before opening any tab, so no element reference has
        # to stay valid while other windows are opened and closed.
        hrefs = [
            card.find_element(By.XPATH, ".//h3/a[@href]").get_attribute('href')
            for card in cards
        ]
        for index, href in enumerate(hrefs, 1):
            print("------------ index:" + str(index) + "------------")
            bookdetailPage(driver, href)
            driver.switch_to.window(normal_window)
            time.sleep(2)
##请求某一本书的具体信息
# Label shown on the detail page -> key used in bookInfoStruct.
_BOOK_INFO_KEYS = {
    "书名": "name",
    "作者": "author",
    "格式": "book_format",
    "浏览": "preview_count",
    "标签": "tag",
    "时间": "publish_time",
    "ISBN": "isbn",
}

# Every field this page is expected to provide for the database row.
_BOOK_FIELDS = (
    "icon_url", "dbpf", "name", "author", "book_format", "preview_count",
    "tag", "publish_time", "isbn", "book_describe", "author_describe",
    "file_name", "file_size",
)


def _scrapeBookInfo(driver):
    """Store the cover URL, rating and labelled info-list fields in bookInfoStruct."""
    bookpic = driver.find_element(By.XPATH, "//div[@class='bookpic']")
    bookInfoStruct["icon_url"] = bookpic.find_element(By.TAG_NAME, "img").get_attribute("src")
    bookinfo = driver.find_element(By.XPATH, "//div[@class='bookinfo']")
    for item in bookinfo.find_elements(By.TAG_NAME, "li"):
        print("------------")
        text = item.text
        if text == "评分:":
            # The rating is carried by the class name of the <b> element.
            grade = item.find_element(By.TAG_NAME, "b").get_attribute("class")
            print("" + grade)
            bookInfoStruct["dbpf"] = grade.replace("dbpf", "").strip()
            continue
        label, sep, value = text.partition(':')
        # Only store a value when the separator is really present.
        if sep and label in _BOOK_INFO_KEYS:
            bookInfoStruct[_BOOK_INFO_KEYS[label]] = value.strip()
        print("" + text)


def _scrapeDescriptions(driver):
    """Store the book and author descriptions found in the article body."""
    print("------content------")
    findContentTag = False
    findAuthorTag = False
    content = ""
    for el in driver.find_elements(By.XPATH, "//article[@class='article-content']/*"):
        text = el.text
        if text == "内容简介":
            findContentTag = True
        elif text == "作者简介":
            # The book description ends here; start collecting the author's.
            bookInfoStruct["book_describe"] = content
            print("\n内容简介:\n" + content)
            content = ""
            findContentTag = False
            findAuthorTag = True
        elif findAuthorTag and text == "":
            # An element without text ends the author section.
            print("\n作者简介:\n" + content)
            bookInfoStruct["author_describe"] = content
            content = ""
            findAuthorTag = False
            print("end...")
            break
        elif findAuthorTag or findContentTag:
            content = content + "\n" + text
    # A section that runs to the end of the article has no terminator, so
    # whatever was collected for it is stored here.
    if findAuthorTag:
        bookInfoStruct["author_describe"] = content
    elif findContentTag:
        bookInfoStruct["book_describe"] = content


def _scrapeFileInfo(driver):
    """Store the file name and size read from the download table."""
    cells = driver.find_elements(By.XPATH, "//table[@class='dltable']/*/*/*")
    if len(cells) > 2:
        for key, cell in (("file_name", cells[1]), ("file_size", cells[2])):
            # Text after the first ':' when there is one, the whole text
            # otherwise (always a string, never the element object).
            bookInfoStruct[key] = cell.text.split(':', 1)[-1]
        print("" + cells[1].text)
        print("" + cells[2].text)


def bookdetailPage(driver, url):
    """Scrape one book's detail page in a new tab and hand it on for saving.

    The page at ``url`` is opened in a new tab and its data is written into
    the module-level ``bookInfoStruct``. The extraction key is then obtained
    with ``getKey`` and ``jumToBaiDuYun`` takes over. Finally the detail tab
    is closed; the caller switches back to the listing window.
    """
    known_handles = set(driver.window_handles)
    # Pass the URL as a script argument instead of formatting it into the
    # JavaScript source, so quotes in the URL cannot break the script.
    driver.execute_script('window.open(arguments[0])', url)
    time.sleep(3)
    # Select the tab that did not exist before rather than a fixed index.
    new_handles = [h for h in driver.window_handles if h not in known_handles]
    driver.switch_to.window(new_handles[-1] if new_handles else driver.window_handles[-1])
    cur_window = driver.current_window_handle
    time.sleep(2)
    print(driver.current_url)
    if driver.find_elements(By.XPATH, "//div[@class='book-left']"):
        print("have detailElement")
    else:
        print("no detailElement")
    # Give every expected field a value so one incomplete page does not abort
    # the whole crawl with a KeyError when the row is written.
    for field in _BOOK_FIELDS:
        bookInfoStruct.setdefault(field, "")
    _scrapeBookInfo(driver)
    _scrapeDescriptions(driver)
    _scrapeFileInfo(driver)
    key = getKey(driver)
    jumToBaiDuYun(driver, key)
    driver.switch_to.window(cur_window)
    time.sleep(2)
    driver.close()
def getKey(driver):
    """Submit the sobooks code on the detail page and return the revealed key.

    ``config.sobookCode`` is typed into the unlock form, and the text of the
    ``e-secret`` element, minus its "提取密码:" label, is returned.

    Raises:
        NameError: when the revealed text is not alphanumeric, which the
            error message attributes to an outdated ``sobook_code``.
    """
    driver.find_element(By.XPATH, "//input[@class='euc-y-i']").send_keys(config.sobookCode)
    driver.find_element(By.XPATH, "//input[@class='euc-y-s']").click()
    time.sleep(3)
    secret = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='e-secret']")))
    # Strip whitespace so a valid key followed by a blank is not rejected by
    # the isalnum() check below.
    keyword = secret.text.replace("提取密码:", "").strip()
    print("提取码:" + keyword)
    if not keyword.isalnum():
        raise NameError('sobooks 的提取密码已改,到sobooks微信公众号获取并更新配置文件config.json的sobook_code字段')
    return keyword
def jumToBaiDuYun(driver, key):
    """Open the book's Baidu share in a new tab, save it and record the book.

    Steps: follow the "百度网盘" link, enter ``key`` when an extraction-code
    form is shown, enter the shared folder when there is one, save the file
    with ``save2Account`` and store ``bookInfoStruct`` in the database. The
    struct is then reset (keeping ``book_type``) and the tab is closed.
    """
    global bookInfoStruct
    href = driver.find_element(By.XPATH, "//a[contains(text(), '百度网盘')]").get_attribute("href")
    # The share address is whatever follows the first '=' of the link
    # (presumably a redirect link -- confirm against the site). Splitting only
    # once keeps any '=' that belongs to the share address itself.
    url = href.split('=', 1)[-1]
    known_handles = set(driver.window_handles)
    # Pass the URL as a script argument instead of formatting it into the
    # JavaScript source, so quotes in the URL cannot break the script.
    driver.execute_script('window.open(arguments[0])', url)
    time.sleep(2)
    # Select the tab that did not exist before rather than a fixed index.
    new_handles = [h for h in driver.window_handles if h not in known_handles]
    driver.switch_to.window(new_handles[-1] if new_handles else driver.window_handles[-1])
    try:
        code_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@id='jgddmad']")))
        code_input.send_keys(key)
        driver.find_element(By.XPATH, "//a[@title='提取文件']").click()
    except Exception:
        # Best effort: no code form within the timeout is taken to mean that
        # the share needs no extraction code.
        print("不需要提取码")
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='module-share-header']")))
    try:
        # This element is only looked up to decide whether the share is a
        # folder that has to be opened first.
        driver.find_element(By.XPATH, "//div[@class='KPDwCE']")
        onClickDir(driver)
    except Exception:
        # Best effort: treated as a share that contains a single file.
        print("only have one book !!!!!")
    save2Account(driver)
    mysql.addObject(bookInfoStruct)
    # Start a fresh record for the next book but stay in the same category.
    bookInfoStruct = {"book_type": bookInfoStruct["book_type"]}
    driver.close()
def onClickDir(driver):
    """Enter the shared folder and click its first listed entry.

    Clicks the folder name, waits for the ``FuIxtL`` element to be present,
    then clicks the first ``dd`` row matching the file-list class.
    """
    driver.find_element(By.XPATH, "//a[@class='filename']").click()
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//ul[@class='FuIxtL']")))
    time.sleep(3)
    driver.find_element(By.XPATH, "//dd[@class='g-clearfix AuPKyz']").click()
    time.sleep(2)
def save2Account(driver):
    """Save the currently shown share into the configured net-disk folder.

    Clicks "保存到网盘". When the folder dialog does not show up within ten
    seconds, ``loginBaidu`` is called on the assumption that a login form
    appeared instead. The top-level folder named ``config.saveBaiduDir`` is
    selected if present, and the dialog is confirmed.
    """
    driver.find_element(By.XPATH, "//a[@title='保存到网盘']").click()
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@id='fileTreeDialog']")))
    except Exception:
        loginBaidu(driver)
    time.sleep(5)
    dialog = driver.find_element(By.XPATH, "//div[@id='fileTreeDialog']")
    folders = dialog.find_elements(
        By.XPATH, ".//ul[@class='treeview treeview-root-content treeview-content ']/*")
    for el in folders:
        fileName = el.find_element(By.XPATH, ".//span[@class='treeview-txt']").text
        if fileName == config.saveBaiduDir:
            print("find save dir name:" + fileName)
            el.click()
            break
    time.sleep(3)
    # The confirm button was always looked up document-wide (an XPath starting
    # with "//" ignores the element it is called on); asking the driver makes
    # that explicit without changing which element is clicked.
    driver.find_element(By.XPATH, "//a[@title='确定']").click()
    time.sleep(3)
def loginBaidu(driver):
    """Log in to Baidu through the account/password form of the login dialog.

    Switches the dialog to account login, types the credentials from
    ``config.baidu`` and submits the form.
    """
    driver.find_element(By.XPATH, "//p[@id='TANGRAM__PSP_10__footerULoginBtn']").click()
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//input[@id='TANGRAM__PSP_10__userName']")))
    # config.baidu is a dict, so the credentials are read by key; attribute
    # access (config.baidu.username) raises AttributeError.
    driver.find_element(By.XPATH, "//input[@id='TANGRAM__PSP_10__userName']").send_keys(config.baidu['username'])
    time.sleep(1)
    driver.find_element(By.XPATH, "//input[@id='TANGRAM__PSP_10__password']").send_keys(config.baidu['password'])
    time.sleep(1)
    driver.find_element(By.XPATH, "//input[@id='TANGRAM__PSP_10__submit']").click()
    time.sleep(5)
def contains():
    """Placeholder with no behaviour; always returns ``None``."""
    return None
def containVarInString(containVar, stringVar):
    """Return True when ``containVar`` occurs inside the string ``stringVar``.

    Arguments that are not strings yield ``False`` instead of raising.
    """
    # Explicit type checks replace the bare ``except`` that used to hide the
    # TypeError raised for a non-string needle (and any other error).
    if not (isinstance(stringVar, str) and isinstance(containVar, str)):
        return False
    return containVar in stringVar
# Script entry point: start the crawl only when this file is run directly.
if __name__ == '__main__':
    main()
启动XAMPP后,运行 python sobooks.py 即可