爬取豆瓣书评

import requests
import lxml
from bs4 import BeautifulSoup

# Request the page.
url = 'https://book.douban.com/latest'
# Header names are case-insensitive.
headers = { 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
# Disguise the request with a browser User-Agent. The timeout keeps the
# script from blocking forever if the server never answers.
data = requests.get(url, headers=headers, timeout=10)
# Stop here with a clear HTTP error rather than parsing an error page below.
data.raise_for_status()

# Parse the response body.
soup = BeautifulSoup(data.text, 'lxml')

# The books on the page are laid out in a left and a right column, each in
# its own <ul>; collect the <li> entries of each column by the list's class.
# soup.find() returns None when no matching <ul> exists, so a missing column
# is treated as empty instead of raising AttributeError on .find_all().
books_left = soup.find('ul', {'class': 'cover-col-4 clearfix'})
books_left = books_left.find_all('li') if books_left is not None else []

books_right = soup.find('ul', {'class': 'cover-col-4 pl20 clearfix'})
books_right = books_right.find_all('li') if books_right is not None else []

# Left-column entries first, then right-column entries.
books = list(books_left) + list(books_right)

# Run the same extraction on every book entry. The five lists are appended
# to exactly once per book, so index i of each list refers to the same book.
def _clean_text(tag, double_space=""):
    """Return the text of ``tag`` with newlines removed.

    Every run of two spaces is replaced by ``double_space``. A missing tag
    (``None``) yields an empty string so the caller can still append a
    placeholder and keep its result lists aligned.
    """
    if tag is None:
        return ""
    return tag.get_text().replace('\n', "").replace('  ', double_space)


img_urls = []
titles = []
ratings = []
authors = []
details = []
for book in books:
    # Look the links and paragraphs up once per entry.
    links = book.find_all('a')
    paragraphs = book.find_all('p')

    # Cover image URL: the <img> inside the entry's first link.
    # None is stored when the link, the image, or its src is absent.
    cover = links[0].find('img') if links else None
    img_urls.append(cover.get('src') if cover is not None else None)

    # Title: the text of the entry's second link, left unmodified.
    titles.append(links[1].get_text() if len(links) > 1 else "")

    # Star rating.
    ratings.append(_clean_text(book.find('p', {'class': 'rating'})))

    # Author and publication info.
    authors.append(_clean_text(book.find('p', {'class': 'color-gray'})))

    # Book description: the entry's third <p>; double spaces collapse to a
    # single space here rather than being dropped.
    description = paragraphs[2] if len(paragraphs) > 2 else None
    details.append(_clean_text(description, ' '))
# Print one record per book. The five lists are index-aligned, so zip()
# walks them in lockstep; this replaces a manual counter loop whose
# condition referenced an undefined name and raised NameError.
for img_url, title, rating, author, detail in zip(
        img_urls, titles, ratings, authors, details):
    print("img_urls: ", img_url)
    print( "titles: <<", title,">>")
    print("rating: ", rating)
    print("authors: ", author)
    print("details: ", detail)

你可能感兴趣的:(python)