# -*- coding:utf-8 -*-
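# Douban book spider: crawls book listings for a set of tags and writes
# title / rating / rater count / author / publication info to an .xlsx
# workbook, one sheet per tag. (Python 2)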
import urllib
import urllib2
from pyquery import PyQuery as pq
import numpy as np
import time
from openpyxl import Workbook
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # allow implicit utf-8 str/unicode conversion (Python 2)

# Rotate a few User-Agent headers so requests look less uniform
has = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}]
def book_spider(book_tag):
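    """Crawl every Douban list page for one tag and return rows of
    [title, rating, people_num, author_info, pub_info]."""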
    page_num = 0
    book_list = []
    try_times = 0
    while True:
        url = 'http://www.douban.com/tag/' + urllib.quote(book_tag) + '/book?start=' + str(page_num * 15)
        time.sleep(np.random.rand() * 5)  # random pause so requests don't hammer the server
        try:
            req = urllib2.Request(url, headers=has[page_num % len(has)])
            con = urllib2.urlopen(req).read().decode('utf-8')
        except (urllib2.HTTPError, urllib2.URLError), e:
            print e
            try_times += 1
            if try_times >= 10:
                break  # give up on this tag after repeated request failures
            continue
        content = pq(con)
        a_list = content(".desc")
        rating_list = content(".rating_nums")
        href_list = content("dd").find('a')
        if href_list is None or len(href_list) <= 1:
            break  # no more result pages for this tag
        for i in range(len(href_list)):
            item = href_list[i]
            title = content(item).text().strip()
            book_url = content(item).attr('href')
            # The .desc text reads "author[/translator]/publisher/year/price";
            # the last three segments are publication info, the rest is the author
            desc = content(a_list[i]).text()
            author_list = desc.split('/')
            if len(author_list) > 3:
                author_info = u'作者/译者: ' + '/'.join(author_list[0:-3])
            else:
                author_info = u'作者/译者: 暂无'
            pub_info = u'出版信息: ' + '/'.join(author_list[-3:])
            try:
                rating = content(rating_list[i]).text().strip()
            except IndexError:
                rating = '0.0'  # unrated books have no .rating_nums node
            try:
                people_num = get_people_num(book_url)
            except Exception:
                people_num = '0'
            book_list.append([title, rating, people_num, author_info, pub_info])
            try_times = 0  # reset: this page yielded valid information
        page_num += 1
        print 'Downloading Information From Page %d' % page_num
    return book_list
def get_people_num(url):
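    """Fetch a book's subject page and return its rater count as a string."""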
    try:
        req = urllib2.Request(url, headers=has[np.random.randint(0, len(has))])
        plain_text = urllib2.urlopen(req).read()
    except (urllib2.HTTPError, urllib2.URLError), e:
        print e
        return '0'  # without this, plain_text below would be undefined
    dd = pq(plain_text)
    num_Obj = dd(".rating_sum").find("a").find("span")
    people_num = dd(num_Obj).text()
    if people_num == '':
        people_num = '0'
    return people_num
def do_spider(book_tag_lists):
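    """Run book_spider for every tag; return one rating-sorted list per tag."""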
    book_lists = []
    for book_tag in book_tag_lists:
        book_list = book_spider(book_tag)
        # Sort numerically: a plain string sort would rank '9.5' above '10.0'
        book_list = sorted(book_list, key=lambda x: float(x[1]), reverse=True)
        book_lists.append(book_list)
    return book_lists
def print_book_lists_excel(book_lists, book_tag_lists):
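    """Write one worksheet per tag and save as book_list-<tags>.xlsx."""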
    wb = Workbook()
    ws = []
    for i in range(len(book_tag_lists)):
        ws.append(wb.create_sheet(title=book_tag_lists[i].decode()))  # utf8 -> unicode sheet title
    for i in range(len(book_tag_lists)):
        ws[i].append([u'序号', u'书名', u'评分', u'评价人数', u'作者/译者', u'出版信息'])
        count = 1
        for bl in book_lists[i]:
            try:
                ws[i].append([count, bl[0], float(bl[1]), int(bl[2]), bl[3], bl[4]])
            except ValueError:
                print count, bl  # report rows whose rating/rater count failed to parse
            count += 1
    save_path = 'book_list'
    for i in range(len(book_tag_lists)):
        save_path += ('-' + book_tag_lists[i].decode())
    save_path += '.xlsx'
    wb.save(save_path)
if __name__ == '__main__':
    # book_tag_lists = ['心理','判断与决策','算法','数据结构','经济','历史']
    # book_tag_lists = ['传记','哲学','编程','创业','理财','社会学','佛教']
    # book_tag_lists = ['思想','科技','科学','web','股票','爱情','两性']
    # book_tag_lists = ['计算机','机器学习','linux','android','数据库','互联网']
    # book_tag_lists = ['摄影','设计','音乐','旅行','教育','成长','情感','育儿','健康','养生']
    # book_tag_lists = ['商业','理财','管理']
    # book_tag_lists = ['名著']
    # book_tag_lists = ['科普','经典','生活','心灵','文学']
    # book_tag_lists = ['科幻','思维','金融']
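    # Results are written to book_list-<tag1>-<tag2>-....xlsx, one sheet per tag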
print 1,'d',"dfasdfs"
    book_tag_lists = ['数学']
    book_lists = do_spider(book_tag_lists)
    print_book_lists_excel(book_lists, book_tag_lists)