#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
import urllib2
import lxml
import MySQLdb
from bs4 import BeautifulSoup
import httplib
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
user_agent = '''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'''
hdr = { 'User-Agent' : user_agent }
# Shared MySQL connection to the `xiaoshuo` schema, opened as soon as the
# module is loaded.
db = MySQLdb.connect(
    host="localhost",
    port=3306,
    user="root",
    passwd="123456",
    db="xiaoshuo",
    charset="utf8",
)

# Module-wide cursor used for the chapter inserts.
cursor = db.cursor()

# Prefix of an INSERT into `chapter1` that ends right after VALUES; no live
# code in this file executes it.
str_sql2 = (
    "INSERT INTO `xiaoshuo`.`chapter1` (`bookId`, `chapterNum`,\n"
    "`chapterName`, `chapterUrl`) VALUES "
)

# Parameterized INSERT into `chapter`, one (bookId, chapterNum, chapterName,
# chapterUrl) tuple per row; intended for cursor.executemany().
str_sql3 = (
    "INSERT INTO `xiaoshuo`.`chapter` "
    "(`bookId`, `chapterNum`, `chapterName`, `chapterUrl`)\n"
    "VALUES (%s,%s,%s,%s)"
)
def getUrlFromDbAndGetChapterInfo():
global str_sql2
str_sql1 = 'select bookId, bookName, url from book1'
cursor1 = db.cursor()
cursor1.execute(str_sql1)
url_list = cursor1.fetchall()
cursor1.close()
print "get book url list:", url_list
for item in url_list:
param=[]
bookId = item[0]
bookName = item[1].encode('utf-8')
url = item[2].encode('utf-8')
print "bookId:", bookId, "bookName:", bookName, "url:", url
getChapterInfoAndSaveInDb(bookId, url, param)
try:
cursor.executemany(str_sql3,param)
db.commit()
except MySQLdb.Error, e:
sqlError = "Error:%s" % str(e)
print "sqlError:", sqlError
def _splitChapterTitle(title):
    """Split a chapter link text into (chapterNum, chapterName).

    The title is first split on the first run of whitespace; when it contains
    no whitespace it is split after the first u'章' character instead.  Both
    parts are returned as UTF-8 encoded byte strings.  Returns None when
    neither separator is present.
    """
    title = title.strip()

    # maxsplit=1 keeps the whole remainder as the name, so a title such as
    # u"第一章 foo bar" is not truncated to its first word.
    parts = title.split(None, 1)
    if len(parts) > 1:
        return parts[0].encode("utf-8"), parts[1].encode("utf-8")

    parts = title.split(u'章', 1)
    if len(parts) == 1:
        return None
    # split() drops the separator, so put the marker back on the number.
    return parts[0].encode("utf-8") + "章", parts[1].encode("utf-8")


def getChapterInfoAndSaveInDb(bookId, url, param):
    """Download a book's index page and collect its chapters into `param`.

    The page at `url` is fetched with the module-level `hdr` headers, decoded
    as GBK and parsed with BeautifulSoup/lxml.  For every <dd> inside a
    <div id="list"> that carries a link, one tuple
    (bookId, chapterNum, chapterName, chapterUrl) is appended to `param`.
    Nothing is written to the database here; the caller inserts the rows.

    Entries without an <a> tag, without an href, or whose title cannot be
    split into number and name are skipped.

    Args:
        bookId: identifier stored as the first element of every tuple.
        url: address of the book's chapter index page.
        param: list that receives the chapter tuples (mutated in place).
    """
    request = urllib2.Request(url, headers=hdr)
    response = urllib2.urlopen(request)
    try:
        html_data = response.read().decode('gbk')
    finally:
        # Always release the HTTP connection, even if reading/decoding fails.
        response.close()

    soup = BeautifulSoup(html_data, 'lxml')
    for listDiv in soup.find_all('div', id='list'):
        for entry in listDiv.find_all('dd'):
            link = entry.find('a')
            if link is None:
                continue
            href = link.get('href')
            if not href:
                continue

            parsed = _splitChapterTitle(link.get_text())
            if parsed is None:
                continue
            chapterNum, chapterName = parsed

            chapterUrl = "http://www.biquzi.com" + href
            param.append((bookId, chapterNum, chapterName, chapterUrl))
# Script entry point: scrape every book, then release the shared DB handles.
if __name__ == "__main__":
    print ("<<<-----Start Get Book Chapter And Save In Db------>>")
    try:
        getUrlFromDbAndGetChapterInfo()
    finally:
        # Close the module-level cursor and connection even when the scrape
        # aborts with an exception.
        cursor.close()
        db.close()