from urllib import request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import csv
import io
import sys
import time
# Re-wrap stdout so that everything printed below is encoded as GB18030
# (presumably to suit a Chinese-locale console -- confirm before changing).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
# Earlier single-page experiment, kept commented out for reference:
# resp = request.urlopen('https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?type=S')
# html_data = resp.read().decode('utf-8')
# # print(html_data)
# soup = BeautifulSoup(html_data,'html.parser')
def get_url_book(url):
    """Download one listing page and return the books parsed from it.

    Args:
        url: Address of the page to fetch.

    Returns:
        The list of book dictionaries that ``read`` extracts from the page.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    print(url)
    # Close the HTTP response as soon as the body has been read instead of
    # leaving the connection open until it is garbage collected.
    with urllib.request.urlopen(url) as resp:
        web_data = resp.read().decode('utf-8')
    soup = BeautifulSoup(web_data, 'html.parser')
    # Pause after each download so consecutive requests are spaced out.
    time.sleep(2)
    return read(soup)
def _first_tag(parent, name, css_class=None):
    """Return the first ``name`` descendant of *parent*, or ``None`` if absent.

    When *css_class* is given, only tags carrying that CSS class match.
    """
    if css_class is None:
        matches = parent.find_all(name)
    else:
        matches = parent.find_all(name, class_=css_class)
    return matches[0] if matches else None


def _single_line(tag):
    """Return the tag's ``.string`` with newlines removed, or ``''``.

    A missing tag and a tag whose ``.string`` is ``None`` both yield an
    empty string rather than raising.
    """
    text = tag.string if tag is not None else None
    return '' if text is None else text.replace('\n', '')


def _parse_book_info(info):
    """Build the record dictionary for one ``div.info`` element."""
    summary = _first_tag(info, 'p')
    rating = _first_tag(info, 'span', 'rating_nums')
    return {
        # The title is required: a missing link or ``title`` attribute
        # still raises, exactly as before.
        'name': info.find_all('a')[0]['title'],
        'dec': summary.string if summary is not None else '',
        'pub': _single_line(_first_tag(info, 'div', 'pub')),
        # Placeholder used when the page shows no rating for the book.
        'rating_nums': rating.string if rating is not None else '(少于10人评价)',
        'people': _single_line(_first_tag(info, 'span', 'pl')),
    }


def read(soup):
    """Extract the book records from a parsed listing page.

    Args:
        soup: Parsed page; searched for ``ul.subject-list`` and, inside the
            first one, for ``li.subject-item`` / ``div.info`` elements.

    Returns:
        A list with one dictionary per ``div.info`` found, holding the keys
        ``name``, ``dec``, ``pub``, ``rating_nums`` and ``people``.  The list
        is empty when the page contains no ``ul.subject-list``.

    Raises:
        IndexError, KeyError: If an info block has no link carrying a
            ``title`` attribute.
    """
    subject_lists = soup.find_all('ul', class_='subject-list')
    if not subject_lists:
        # Nothing to parse on this page; report "no books" instead of
        # failing with an IndexError.
        return []

    book_list = []
    for item in subject_lists[0].find_all('li', class_='subject-item'):
        for info in item.find_all('div', class_='info'):
            # A fresh dictionary per info block, so records never alias
            # each other.
            book_list.append(_parse_book_info(info))
    return book_list
# print(book_list)
# Listing URL template; the ``start`` query value is filled in per page.
TAG_URL = 'https://book.douban.com/tag/%E4%BA%92%E8%81%94%E7%BD%91?start={}&type=T'
PAGE_COUNT = 3   # number of listing pages to download
PAGE_SIZE = 20   # amount by which ``start`` advances from one page to the next

# Collect the records of every page.  The accumulator is deliberately not
# called ``list`` so the builtin stays usable in the rest of the module.
books = []
for page in range(PAGE_COUNT):
    url = TAG_URL.format(page * PAGE_SIZE)
    books.extend(get_url_book(url))
    print(url)
print(books)

# Write a header row followed by one row per book.  The file is encoded as
# GB18030, and newline='' leaves line-ending handling to the csv module.
with open("book.csv", "w", encoding='gb18030', newline='') as datacsv:
    csvwriter = csv.writer(datacsv, dialect="excel")
    csvwriter.writerow(["名称","描述","作者/出版社/价格","评分","评分人数"])
    csvwriter.writerows(
        [book['name'], book['dec'], book['pub'], book['rating_nums'], book['people']]
        for book in books
    )
print("ok")