import requests
import lxml
from bs4 import BeautifulSoup
# Fetch the page.
url = 'https://book.douban.com/latest'
# Present a desktop-browser User-Agent instead of the requests default
# (HTTP header names are case-insensitive).
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
# Without a timeout requests may wait on an unresponsive server indefinitely.
data = requests.get(url, headers=headers, timeout=10)
# Fail here with a clear HTTPError on a 4xx/5xx response rather than
# parsing an error page and crashing later on a missing element.
data.raise_for_status()
# Parse the HTML using the lxml parser.
soup = BeautifulSoup(data.text, 'lxml')
# The page lays the books out in a left and a right column, each in its own
# <ul>; collect the <li> entries from each column separately.
# find() returns None when a list is absent, so a missing column is treated
# as empty instead of raising AttributeError on None.find_all.
books_left = soup.find('ul', {'class': 'cover-col-4 clearfix'})
books_left = books_left.find_all('li') if books_left is not None else []
books_right = soup.find('ul', {'class': 'cover-col-4 pl20 clearfix'})
books_right = books_right.find_all('li') if books_right is not None else []
# Left-column entries first, then right-column entries.
books = list(books_left) + list(books_right)
# Extract the same fields from every book entry. The results are kept as
# parallel lists: index i in each list describes the same book, so exactly
# one value is appended to every list per book, even when a field is missing.
img_urls = []
titles = []
ratings = []
authors = []
details = []


def _text(tag):
    """Return the text content of *tag*, or '' when *tag* is None."""
    return tag.get_text() if tag is not None else ''


for book in books:
    links = book.find_all('a')
    paragraphs = book.find_all('p')

    # Cover image URL: the src of the <img> inside the first link
    # (None when the link, the image or the attribute is missing).
    cover = links[0].find('img') if links else None
    img_urls.append(cover.get('src') if cover is not None else None)

    # Title: text of the second link.
    titles.append(_text(links[1]) if len(links) > 1 else '')

    # Rating: text of <p class="rating"> with newlines and spaces removed.
    rating = _text(book.find('p', {'class': 'rating'}))
    ratings.append(rating.replace('\n', "").replace(' ', ""))

    # Author / publishing info: text of <p class="color-gray"> with
    # newlines and spaces removed.
    author = _text(book.find('p', {'class': 'color-gray'}))
    authors.append(author.replace('\n', "").replace(' ', ""))

    # Book summary: text of the third <p>, with newlines removed.
    detail = _text(paragraphs[2]) if len(paragraphs) > 2 else ''
    details.append(detail.replace('\n', ''))
# Print every collected book. The five lists are parallel (one entry per
# book), so zip() walks them in lockstep without manual index bookkeeping.
for img_url, title, rating, author, detail in zip(
        img_urls, titles, ratings, authors, details):
    print("img_urls: ", img_url)
    print("titles: <<", title, ">>")
    print("rating: ", rating)
    print("authors: ", author)
    print("details: ", detail)