import requests
import time
from bs4 import BeautifulSoup
import re
# Crawl results shared by the whole module: parse_page appends one
# [title, info] list per book and reads the length for the final count.
book = []
def search(url, retries=3, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of attempts before giving up; values
            below 1 are treated as 1.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: If every attempt fails; the error
            raised by the last attempt is propagated.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            # Without a timeout a stalled connection would block forever.
            return requests.get(url, timeout=timeout).text
        except requests.RequestException:
            # Only network/HTTP errors are retried, so Ctrl-C and
            # programming errors still surface immediately.
            if attempt == attempts:
                raise
            # Short pause so a failing server is not hit again at once.
            time.sleep(1)
# Compiled once at import time; both patterns are applied to every page.
_TITLE_RE = re.compile('title="(.*?)"', re.S)
# NOTE(review): as written this pattern can only match the empty string, so
# every book's info comes out blank. Its surrounding delimiters appear to be
# missing; restore them once the intended page markup is confirmed.
_INFO_RE = re.compile('(.*?)', re.S)


def parse_page(url, k):
    """Fetch one listing page, record its books and print them.

    Every ``title="..."`` value found in the page is paired, in order, with
    the corresponding info match and appended to the module-level ``book``
    list as ``[title, info]``. Only the books added by this call are
    printed, numbered by their position in ``book``.

    Args:
        url: Address of the listing page to download via ``search``.
        k: Zero-based page index. Kept for interface compatibility; the
            print offset is now derived from ``len(book)`` rather than
            from ``k * 20``, so it no longer affects the output.

    Raises:
        SystemExit: When the page contains no titles, after printing the
            end-of-crawl summary. This is what stops the caller's loop.
    """
    # search may yield None after a failed request; treat that as a page
    # without books so the crawl ends cleanly instead of with a TypeError.
    html = search(url) or ''

    names = _TITLE_RE.findall(html)
    if not names:
        print('爬取结束')
        print('共'+str(len(book))+'本')
        # Same effect as exit(), but does not depend on the site module.
        raise SystemExit

    infos = _INFO_RE.findall(html)

    # Remember where this page starts so exactly the new entries are shown,
    # however many books the page actually held.
    first_new = len(book)
    # zip stops at the shorter list, so a count mismatch between titles and
    # infos drops the unmatched tail instead of raising IndexError.
    book.extend([name, info.strip()] for name, info in zip(names, infos))

    for number, (name, info) in enumerate(book[first_new:], start=first_new + 1):
        print('第'+str(number)+'本')
        print('书名:《'+name+'》'+' 图书信息:'+info)
        print('-----------------------------------------------------')
def main(k, sname, start):
    """Crawl one result page of the book.douban.com tag *sname*.

    Args:
        k: Zero-based page index, forwarded to ``parse_page``.
        sname: Tag name exactly as typed by the user (not yet
            percent-encoded).
        start: Value of the ``start`` query parameter, i.e. the offset of
            the first book on the requested page.
    """
    from urllib.parse import quote

    # Percent-encode the tag so characters such as '/', '?', '#' or spaces
    # cannot change the structure of the URL.
    tag = quote(sname, safe='')
    url = 'https://book.douban.com/tag/' + tag + '?start=' + str(start) + '&type=T'
    parse_page(url, k)
if __name__ == '__main__':
    # Ask which tag to crawl, announce the start, then request up to ten
    # result pages whose offsets advance in steps of 20.
    sname = input("请输入图书信息:")
    print('-------------------------开始爬取-------------------------')
    time.sleep(1)
    # parse_page ends the program on the first page without any titles,
    # so the loop may stop before all ten pages are requested.
    for page, offset in enumerate(range(0, 200, 20)):
        main(page, sname, start=offset)