Screenshot of the crawler results stored in the database:
The code is as follows:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2
import MySQLdb
from bs4 import BeautifulSoup
import httplib
# Force HTTP/1.0: a common workaround for httplib.IncompleteRead errors
# triggered by this site's handling of HTTP/1.1 keep-alive/chunked responses.
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
# Keep the User-Agent on one logical line; a triple-quoted string here would
# embed a newline in the header value and produce an invalid request header.
user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36')
hdr = {'User-Agent': user_agent}
db = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="123456", db="xiaoshuo", charset="utf8")
cursor = db.cursor()
# Parameterized INSERT; the driver escapes each value, and executemany()
# inserts one row per tuple collected in param.
str_sql = ('INSERT INTO `xiaoshuo`.`book1` '
           '(`bookName`, `author`, `url`, `classifyName`, `brief`, `updateTime`, `status`) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s)')
def getBookInfoBaseOnUrl(url, param):
    """Fetch one book's detail page and append its metadata as a tuple to param."""
    request = urllib2.Request(url, headers=hdr)
    response = urllib2.urlopen(request)
    html_data = response.read().decode('gbk')  # the site serves GBK-encoded pages
    soup = BeautifulSoup(html_data, 'lxml')
    mylist = soup.select('head')
    for item in mylist:
        # The detail page exposes its metadata as Open Graph style <meta> tags in
        # <head>; re-encode everything to UTF-8 to match the utf8 DB connection.
        bookName = item.find(property="og:novel:book_name").get("content").encode('utf-8')
        #print "bookName:", bookName
        author = item.find(property="og:novel:author").get("content").encode('utf-8')
        #print "author:", author
        url = item.find(property="og:novel:read_url").get("content").encode('utf-8')
        #print "url:", url
        classifyName = item.find(property="og:novel:category").get("content").encode('utf-8')
        #print "classifyName:", classifyName
        description = item.find(property="og:description").get("content").encode('utf-8')
        #print "brief:", description
        updateTime = item.find(property="og:novel:update_time").get("content").encode('utf-8')
        #print "updateTime:", updateTime
        status = item.find(property="og:novel:status").get("content").encode('utf-8')
        #print "status:", status
        # Collect one row per book; the batch is inserted later with executemany().
        tup1 = (bookName, author, url, classifyName, description, updateTime, status)
        param.append(tup1)
def getBookList(url):
    """Fetch one category page; crawl and stage every book not already in the DB."""
    request = urllib2.Request(url, headers=hdr)
    response = urllib2.urlopen(request)
    html_data = response.read().decode('gbk')
    soup = BeautifulSoup(html_data, 'lxml')
    mylist = soup.find_all('div', class_='r')
    for div in mylist:
        param = []
        xiaoshuo_list = div.find_all('li')
        for item in xiaoshuo_list:
            # Each <li> reads "bookName|author"; the <a> holds the detail-page URL.
            tmp = item.get_text('|').split('|', 1)
            bookName = tmp[0].encode('utf-8')
            author = tmp[1].encode('utf-8')
            url = item.find('a').get('href').encode('utf-8')
            #print "bookName:", bookName, "author:", author, "url:", url
            # Parameterized query instead of string concatenation, so a quote in
            # a book name cannot break the SQL (and injection is impossible).
            cursor.execute("select * from xiaoshuo.book1 where bookName = %s", (bookName,))
            bookInfo = cursor.fetchall()
            if len(bookInfo) >= 1:
                # Already stored on an earlier run: skip it.
                print "*" * 89
                continue
            else:
                getBookInfoBaseOnUrl(url, param)
                print "*" * 89
        print "param:", str(param).decode('string_escape')
        try:
            cursor.executemany(str_sql, param)
            db.commit()
        except MySQLdb.Error, e:
            db.rollback()  # discard the failed batch
            sqlError = "Error:%s" % str(e)
            print "sqlError:", sqlError
def startGetChapter():
    """Parse the saved homepage, walk the category nav, and crawl each category."""
    soup = BeautifulSoup(open('biquge.html'), 'lxml')
    mylist = soup.find_all('div', class_='nav')
    for nav in mylist:
        xiaoshuo_list = nav.find_all('li')
        for item in xiaoshuo_list:
            print item
            url = item.find('a').get('href')
            classifyName = item.find('a').get_text().encode('utf-8')
            print "url:", url, type(url)
            if url != '/':  # skip the link back to the front page
                getBookList("http://www.biquzi.com/" + url)
            print "#" * 89
if __name__ == "__main__":
    print "<<<-----Start Get Book INFO And Save Db----->>>"
    startGetChapter()
    cursor.close()
    db.close()
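
The parsing in getBookInfoBaseOnUrl depends on the Open Graph style <meta> tags that this site embeds in <head>. To verify that part in isolation, here is a small sketch that runs the same find(property=...) lookups against a canned snippet; the sample values are made up:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Standalone check of the og:novel:* extraction; the HTML is a made-up sample.
from bs4 import BeautifulSoup

sample = '''<html><head>
<meta property="og:novel:book_name" content="SomeBook"/>
<meta property="og:novel:author" content="SomeAuthor"/>
<meta property="og:novel:read_url" content="http://www.biquzi.com/0_1/"/>
</head><body></body></html>'''

head = BeautifulSoup(sample, 'lxml').select('head')[0]
print head.find(property="og:novel:book_name").get("content")  # SomeBook
print head.find(property="og:novel:author").get("content")     # SomeAuthor
print head.find(property="og:novel:read_url").get("content")   # http://www.biquzi.com/0_1/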
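
The script also assumes the xiaoshuo.book1 table already exists. For reference, here is a minimal setup sketch matching the column list in the INSERT above; the column types and sizes are my own guesses, not taken from the original schema:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# One-off setup: create the book1 table the crawler writes into.
# All column types below are assumptions inferred from the INSERT statement.
import MySQLdb

db = MySQLdb.connect(host="localhost", port=3306, user="root",
                     passwd="123456", db="xiaoshuo", charset="utf8")
cursor = db.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS `xiaoshuo`.`book1` (
    `id`           INT AUTO_INCREMENT PRIMARY KEY,
    `bookName`     VARCHAR(128) NOT NULL,
    `author`       VARCHAR(64),
    `url`          VARCHAR(256),
    `classifyName` VARCHAR(32),
    `brief`        TEXT,
    `updateTime`   VARCHAR(32),
    `status`       VARCHAR(16),
    KEY `idx_bookName` (`bookName`)
) DEFAULT CHARSET=utf8''')
db.commit()
cursor.close()
db.close()

The index on bookName also speeds up the duplicate check that getBookList runs before each insert.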