The full code is as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import pymysql
from bs4 import BeautifulSoup
import time
import threading
# Fetch a book's introduction page, then hand the book off to the chapter scraper
def getIntroduce(novel_href, id):
    header = {
        'Host': 'www.quanshuwang.com',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    time.sleep(0.2)  # throttle requests slightly
    novellist = requests.get(novel_href, headers=header, timeout=20)
    novellist.encoding = 'gbk'  # the site serves GBK-encoded pages
    soup = BeautifulSoup(novellist.text, 'lxml')
    res = soup.select("#waa")
    if len(res) > 0:
        # book introduction
        introduce = res[0].get_text()
        chapterHref = soup.select(".reader")[0].get("href")
        print(introduce)
        # parameterized statement: pymysql escapes the values itself, so quotes
        # inside the introduction cannot break the SQL
        sql = "UPDATE novel_info SET novel_introduce=%s WHERE novel_href=%s"
        te = threading.Thread(target=getChapterList,
                              args=(chapterHref, id, sql, (introduce, novel_href)))
        te.start()
        # getChapterList(chapterHref, id, sql, (introduce, novel_href))
# Fetch the chapter list of a book, then every chapter's content
def getChapterList(h, id, sql, args):
    db = pymysql.connect(host="localhost", user="root",
                         password="123456", database="wx_app")
    db.ping(reconnect=True)
    time.sleep(0.2)
    novellist = requests.get(h, timeout=20)
    novellist.encoding = 'gbk'
    soup = BeautifulSoup(novellist.text, 'lxml')
    chapters = soup.select(".dirconone > li")
    i = 1
    print("Start writing -> book ID: %d" % id)
    insertNovelInfo(sql, args, db)
    for chapter in chapters:
        contHref = chapter.select("a")[0].get("href")
        # chapter title
        contTitle = chapter.select("a")[0].get_text()
        content = getContents(contHref)
        print("Chapter: %s" % contTitle)
        sql1 = "INSERT INTO `novel_chapter`(novel_id,chapter_id,chapter_name) VALUES(%s,%s,%s)"
        sql2 = "INSERT INTO `novel_chapter_info`(chapter_id,chapter_name,chapter_content,novel_id) VALUES(%s,%s,%s,%s)"
        insertNovelInfo(sql1, (id, i, contTitle), db)
        insertNovelInfo(sql2, (i, contTitle, content, id), db)
        i = i + 1  # increment once, after both inserts, so both tables get the same chapter_id
    print("Book %s finished" % id)
    db.commit()
    db.close()
# Fetch a single chapter page and return its text
def getContents(h):
    res = requests.get(h, timeout=20)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    tx = soup.select(".mainContenr")
    if len(tx) > 0:
        # drop the site's inline "style5();" / "style6();" script residue;
        # str.lstrip/rstrip strip character sets rather than substrings,
        # so replace() is the correct tool here
        content = tx[0].get_text().replace('style5();', '').replace('style6();', '')
    else:
        content = h  # fall back to the URL so the row stays traceable
    return content
# Execute a single parameterized statement; roll back and report on failure
def insertNovelInfo(sql, args, db):
    cursor = db.cursor()
    try:
        cursor.execute(sql, args)
    except pymysql.MySQLError as e:
        # roll back the transaction
        db.rollback()
        print("MySQL error:", e, sql)
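# A possible refinement, not part of the original script: the two inserts per
# chapter could instead be batched with cursor.executemany(), which ships many
# rows in one round trip. A minimal sketch, assuming the novel_chapter table
# above; the helper name insertChaptersBatch is made up for illustration.
def insertChaptersBatch(rows, db):
    # rows: list of (novel_id, chapter_id, chapter_name) tuples
    cursor = db.cursor()
    cursor.executemany(
        "INSERT INTO `novel_chapter`(novel_id,chapter_id,chapter_name) VALUES(%s,%s,%s)",
        rows)
    db.commit()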
# example call: getIntroduce('http://www.quanshuwang.com/book_135083.html', 1)
# Pull one book at a time from novel_info and scrape it
def init(count, num):
    while count < num:
        query = ("select a.novel_href,a.novel_id from novel_info a "
                 "inner join (select novel_id from novel_info GROUP BY novel_id limit %d,1) b "
                 "on a.novel_id=b.novel_id" % count)
        # open the database connection
        db = pymysql.connect(host="localhost", user="root",
                             password="123456", database="wx_app")
        db.ping(reconnect=True)
        # get a cursor for this connection
        cursor = db.cursor()
        try:
            # run the SQL statement
            cursor.execute(query)
            # fetch all matching rows
            results = cursor.fetchall()
        except pymysql.MySQLError:
            print("Error: unable to fetch data")
            results = ()
        finally:
            # close the database connection
            db.close()
        for row in results:
            getIntroduce(row[0], row[1])
            print(row[0], row[1])
        count = count + 1
# Debug helper: print the argument ten times
def test(res):
    i = 0
    while i < 10:
        print(res)
        i = i + 1
try:
    threads = []
    # one thread per book index; thread i scrapes book i only, i.e. init(i, i + 1)
    for i in range(0, 100):
        t = threading.Thread(target=init, args=(i, i + 1))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    print("end")
except RuntimeError:
    print("Error: unable to start thread")