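# Hobbit novel scraper: uses the selenium module to drive Google Chrome in headless mode, scrapes the novel site, and stores the text in MySQL in pieces of 2000 characters each.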
# https://www.shukuai9.com/b/324694/
# 导入需要的库
from selenium import webdriver
# 导入Keys模块,用于模拟键盘按键操作
from selenium.webdriver.common.keys import Keys
#数据库
import pymysql
# Take the text and split it into chunks of 2000 characters each
def text_sql(title, dimension, text, chunk_size=2000):
    """Strip tab/newline characters from ``text`` and store it in chunks.

    Tabs, line feeds and carriage returns are removed, then the remaining
    text is cut into consecutive pieces of at most ``chunk_size`` characters.
    Each piece is handed to ``mysql_in`` together with its 0-based index.
    A failure while storing one piece is printed and does not stop the
    remaining pieces from being processed.

    Args:
        title: Passed through unchanged as the first argument of ``mysql_in``.
        dimension: Passed through unchanged as the second argument of
            ``mysql_in``.
        text: The string to clean and split.
        chunk_size: Maximum number of characters per piece (default 2000).

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # One pass over the string instead of three chained replace() calls.
    text = text.translate({9: None, 10: None, 13: None})

    # Stepping by chunk_size yields exactly ceil(len / chunk_size) offsets, so
    # a length that is an exact multiple of chunk_size (or an empty string)
    # no longer produces an empty trailing piece.
    offsets = range(0, len(text), chunk_size)
    print(title, dimension, len(text), len(offsets))

    for i, begin in enumerate(offsets):
        try:
            start = mysql_in(title, dimension, i, text[begin:begin + chunk_size])
        except Exception as e:
            # Best effort: report the failure and continue with the next piece.
            print(e)
            start = '异常'
        print(title, dimension, i, start)
#数据库登录
def mysql_execute(in_sql, leixing, params=None):
    """Run one SQL statement on a fresh MySQL connection and return its result.

    A new connection is opened for every call and is always closed again,
    even when the statement raises.

    Args:
        in_sql: The SQL text. When ``params`` is given it may contain ``%s``
            placeholders that the driver fills in with proper escaping (a
            literal ``%`` must then be written as ``%%``).
        leixing: Result mode, one of:
            '数量' - return the value of ``cursor.execute``;
            '单条' - return the first column of the first row, or ``None``
                     when the statement produced no row;
            '多条' - return ``cursor.fetchall()``;
            '编辑' - commit after executing and return the value of
                     ``cursor.execute``.
        params: Optional sequence/mapping of values bound to the placeholders
            in ``in_sql``. ``None`` (the default) sends ``in_sql`` as-is.

    Returns:
        The value described for the selected ``leixing`` mode.

    Raises:
        ValueError: If ``leixing`` is not one of the four supported modes.
    """
    # Validate first so an unsupported mode fails clearly, before any
    # connection is opened, instead of falling through without a result.
    if leixing not in ('数量', '单条', '多条', '编辑'):
        raise ValueError('unsupported leixing: %r' % (leixing,))

    # NOTE(review): the connection settings, including the password, are
    # hard-coded here; consider moving them to configuration.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='szc_sql', password='szcNSP850219', database='szc_sql', charset='utf8')
    try:
        cursor = conn.cursor()
        try:
            affected = cursor.execute(in_sql, params)
            if leixing == '单条':
                row = cursor.fetchone()
                # fetchone() yields None when there is no row to read.
                return row[0] if row is not None else None
            if leixing == '多条':
                return cursor.fetchall()
            if leixing == '编辑':
                conn.commit()
            return affected
        finally:
            cursor.close()
    finally:
        conn.close()


def mysql_in(title, dimension, num, text):
    """Replace the ``novel_text`` row for (title, dimension, num) with ``text``.

    Any existing row with the same title, dimension and num is deleted first,
    then a new row is inserted. Values are bound as query parameters rather
    than interpolated into the SQL text, so quotes or backslashes in the
    scraped text cannot break or alter the statement.

    Args:
        title: Value for the ``title`` column.
        dimension: Value for the ``dimension`` column.
        num: Chunk index; sent to the database as ``str(num)``.
        text: Value for the ``text`` column.

    Returns:
        The string '完成' once both statements have run.
    """
    print(num, type(num))
    key = (title, dimension, str(num))

    delete_sql = "delete from novel_text where title=%s and dimension=%s and num=%s"
    mysql_execute(delete_sql, '编辑', key)

    insert_sql = "insert into novel_text (title,dimension,num,text) values(%s,%s,%s,%s)"
    row = key + (text,)
    print('sql', insert_sql, row)
    mysql_execute(insert_sql, '编辑', row)
    return '完成'
# Build a headless Chrome instance.
chrome_opt = webdriver.ChromeOptions()
# Run without a visible window.
chrome_opt.add_argument('--headless')
# Disable GPU usage.
chrome_opt.add_argument('--disable-gpu')
# NOTE(review): executable_path and the find_element(s)_by_* helpers used
# below look like Selenium 3 era APIs - confirm the pinned Selenium version
# before upgrading the dependency.
driver = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_opt)
try:
    # The implicit wait is an element-lookup timeout, so it is set once here
    # rather than after every page load.
    # NOTE(review): presumably this setting persists for the whole driver
    # session - confirm against the Selenium documentation.
    driver.implicitly_wait(10)

    # Open the book's index page.
    driver.get("https://www.shukuai9.com/b/324694/")

    # Chapter entries are the <dd> elements; the book title is the <h1>.
    search_results = driver.find_elements_by_css_selector("dd")
    search_title = driver.find_elements_by_css_selector("h1")
    print('search_title', search_title)
    search_title_text = None
    for text in search_title:
        print('text', text.text)
        search_title_text = text.text
    if search_title_text is None:
        # Without a title every chunk would be stored under an undefined
        # name, so stop here with a clear message.
        raise RuntimeError('no <h1> title found on the index page')

    # Map chapter number (as a string) -> [link text, link URL]. A later
    # entry with the same number replaces an earlier one.
    hbt_key = {}
    for result in search_results:
        anchor = result.find_element_by_css_selector("a")
        title = anchor.text
        link = anchor.get_attribute("href")
        # The chapter number is taken from the characters between the first
        # character of the link text and the first '节'.
        marker = title.find('节')
        title_num = title[1:marker] if marker != -1 else ''
        if not title_num.isdecimal():
            # Entries that do not follow that pattern are skipped instead of
            # aborting the whole run.
            print('skip', title)
            continue
        hbt_key[title_num] = [title, link]

    # Visit the chapters in numeric order of the numbers actually collected.
    for title_num in sorted(hbt_key, key=int):
        print(hbt_key[title_num])
        title, link = hbt_key[title_num]
        # Open the chapter page and read the text of the #content element.
        driver.get(link)
        search_text = driver.find_element_by_xpath('//*[@id="content"]').text
        print(len(search_text), type(search_text))
        # Persist the chapter text.
        text_sql(search_title_text, title, search_text)
finally:
    # Always shut the browser down so no headless Chrome process is left
    # running after the script ends or fails.
    driver.quit()