Python3 爬取豆瓣图书Top250并存入Excel中

#coding=utf-8
import re
import xlwt
import requests
from bs4 import BeautifulSoup

 

def getHtml(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on a stalled connection.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never parsed as if it were a book listing.
        requests.RequestException: On connection problems or timeout.
    """
    # A desktop Firefox User-Agent is sent instead of the library default
    # (presumably because the site rejects non-browser clients - confirm).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
    page = requests.get(url, headers=headers, timeout=timeout)
    page.raise_for_status()
    return page.text

def parse_pub_info(text):
    """Split one '/'-separated publication line into five spreadsheet fields.

    The last three parts are taken as publisher, publication date and price.
    The first part is the author when more than three parts exist, and the
    second part is the translator when more than four exist. Any parts
    between the translator and the publisher are ignored.

    Args:
        text: Raw text of a book's publication paragraph.

    Returns:
        A 5-tuple ``(author, translator, publisher, pub_date, price)`` of
        whitespace-stripped strings; fields that are absent are ``''``.
    """
    parts = [part.strip() for part in text.split('/')]
    # Pad short lines so unpacking below can never raise IndexError.
    parts += [''] * (3 - len(parts))
    publisher, pub_date, price = parts[-3:]
    author = parts[0] if len(parts) > 3 else ''
    translator = parts[1] if len(parts) > 4 else ''
    return author, translator, publisher, pub_date, price


if __name__=='__main__':
    Workbook = xlwt.Workbook()
    sheet = Workbook.add_sheet('豆瓣图书Top250')

    # Header row: written on row index 2, starting at column index 2.
    titles = ('书名', '作者', '译者', '出版单位', '出版时间', '定价',
              '豆瓣评分', '评价人数', '一句话')
    for col, title in enumerate(titles, start=2):
        sheet.write(2, col, title)

    vote_pattern = re.compile(r'\d+')
    # One shared row counter: every field of a book lands on the same row,
    # even when a book has no quote or no rating.
    row = 3
    for page in range(0,250,25):
        url = 'https://book.douban.com/top250?start={0}'.format(page)
        Soup = BeautifulSoup(getHtml(url),'html.parser')

        for name in Soup.find_all('div',class_ = 'pl2'):
            # Scope all further lookups to this book's own container.
            # NOTE(review): assumes the title div and the book's other fields
            # share one enclosing <td> - verify against the page markup.
            cell = name.find_parent('td') or name.parent

            link = name.find('a')
            if link is not None:
                sheet.write(row, 2, link.text.strip().replace(' ', ''))

            info = cell.find('p', class_='pl')
            if info is not None:
                # Columns 3-7: author, translator, publisher, date, price.
                for col, value in enumerate(parse_pub_info(info.text), start=3):
                    if value:
                        sheet.write(row, col, value)

            rating = cell.find('div', class_='star clearfix')
            spans = rating.find_all('span') if rating is not None else []
            if len(spans) > 2:
                # Second span holds the score, third the vote-count text.
                sheet.write(row, 8, spans[1].text)
                votes = vote_pattern.findall(spans[2].text)
                if votes:
                    # Write the number itself, not the findall() list.
                    sheet.write(row, 9, votes[0])

            quote = cell.find('p', class_='quote')
            if quote is not None:
                sheet.write(row, 10, quote.text.strip())

            row += 1

    Workbook.save('豆瓣图书Top250.xls')
   
            
            
            
            
            
        
        
        

            

效果截图如下:

图1:Python3 爬取豆瓣图书Top250并存入Excel的效果截图

 

 

#coding=utf-8
import re
import time
import xlwt
import requests
from lxml import etree
from bs4 import BeautifulSoup
from collections import namedtuple

def getHtml(url, timeout=10):
    """Fetch *url* over HTTP and return the decoded page text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server; prevents the call from
            blocking indefinitely on an unresponsive connection.

    Returns:
        The response body as a string (``Response.text``).

    Raises:
        requests.HTTPError: When the response status is 4xx/5xx, so callers
            never parse an error page as a listing.
        requests.RequestException: On network failure or timeout.
    """
    # Identify as a desktop Firefox browser rather than the library default
    # (presumably needed for the site to serve the page - confirm).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
    page = requests.get(url, headers=headers, timeout=timeout)
    page.raise_for_status()
    return page.text

def get_book_info_xpath():
    """Scrape the ten Top 250 listing pages with lxml and collect book records.

    Each page is fetched with ``getHtml`` and parsed by ``etree.HTML``. Every
    ``<td valign="top">`` that follows a sibling ``<td valign="top">`` is
    treated as one book.

    Returns:
        A list of ``Book`` namedtuple instances with the fields 书名, 作者,
        译者, 出版单位, 出版时间, 定价, 豆瓣评分, 评价人数 and 一句话. A field
        that cannot be found on the page is stored as ``''``.
    """
    # Build the record type once; each book becomes an instance of it.
    Book = namedtuple('Book','书名,作者,译者,出版单位,出版时间,定价,豆瓣评分,评价人数,一句话')
    digits = re.compile(r'\d+')
    separator = re.compile(r'\s/+\s')

    Book_list = []
    for start in range(0, 250, 25):
        html = etree.HTML(getHtml(f'https://book.douban.com/top250?start={start}'))
        for cell in html.cssselect('td[valign="top"]~td[valign="top"]'):
            link = cell.find('div//a')
            title = (link.get('title') if link is not None else None) or ''

            # The info line is '/'-separated; its last three parts are
            # publisher, date and price. Pad so short lines cannot raise.
            fields = [part.strip() for part in separator.split(cell.findtext('p', default=''))]
            fields += [''] * (3 - len(fields))
            publisher, pub_date, price = fields[-3:]
            author = fields[0] if len(fields) > 3 else ''
            translator = fields[1] if len(fields) > 4 else ''

            rating = cell.findtext('div//span[@class="rating_nums"]', default='')
            votes = digits.search(cell.findtext('div//span[@class="pl"]', default=''))
            quote = cell.findtext('p//span[@class="inq"]', default='')

            Book_list.append(Book(
                书名=title,
                作者=author,
                译者=translator,
                出版单位=publisher,
                出版时间=pub_date,
                定价=price,
                豆瓣评分=rating,
                评价人数=votes.group() if votes else '',
                一句话=quote,
            ))
    return Book_list


def get_book_info_bs4():
    """Scrape the ten Top 250 listing pages with BeautifulSoup.

    Each page is fetched with ``getHtml`` and parsed with the ``lxml``
    parser. Every ``<td valign="top">`` that follows a sibling
    ``<td valign="top">`` is treated as one book.

    Returns:
        A list of ``Book`` namedtuple instances with the fields 书名, 作者,
        译者, 出版单位, 出版时间, 定价, 豆瓣评分, 评价人数 and 一句话. A field
        that cannot be found on the page is stored as ``''``.
    """
    # Build the record type once; each book becomes an instance of it.
    Book = namedtuple('Book','书名,作者,译者,出版单位,出版时间,定价,豆瓣评分,评价人数,一句话')
    digits = re.compile(r'\d+')
    separator = re.compile(r'\s/+\s')

    def text_of(node, selector):
        """Return the text of the first *selector* match under *node*, or ''."""
        found = node.select_one(selector)
        return found.text if found is not None else ''

    Book_list = []
    for start in range(0, 250, 25):
        soup = BeautifulSoup(getHtml(f'https://book.douban.com/top250?start={start}'), 'lxml')
        for cell in soup.select('td[valign="top"] ~ td[valign="top"]'):
            link = cell.select_one('div.pl2 > a')
            title = (link.get('title') if link is not None else None) or ''

            # The info line is '/'-separated; its last three parts are
            # publisher, date and price. Pad so short lines cannot raise.
            fields = [part.strip() for part in separator.split(text_of(cell, 'p.pl'))]
            fields += [''] * (3 - len(fields))
            publisher, pub_date, price = fields[-3:]
            author = fields[0] if len(fields) > 3 else ''
            translator = fields[1] if len(fields) > 4 else ''

            votes = digits.search(text_of(cell, 'span.pl'))

            Book_list.append(Book(
                书名=title,
                作者=author,
                译者=translator,
                出版单位=publisher,
                出版时间=pub_date,
                定价=price,
                豆瓣评分=text_of(cell, 'span.rating_nums'),
                评价人数=votes.group() if votes else '',
                一句话=text_of(cell, 'span.inq'),
            ))
    return Book_list

def write_info_xls(Book_list, filename='豆瓣图书Top250.xls'):
    """Write the scraped book records to an .xls workbook.

    The header row is written on row index 2 and the records from row index
    3 onward; columns start at index 2 and follow the order of the field
    names below.

    Args:
        Book_list: Sequence of objects exposing the attributes 书名, 作者,
            译者, 出版单位, 出版时间, 定价, 豆瓣评分, 评价人数 and 一句话.
        filename: Path of the workbook to save. Defaults to the name the
            script has always used.
    """
    Workbook = xlwt.Workbook()
    sheet = Workbook.add_sheet('豆瓣图书Top250')
    infos = '书名,作者,译者,出版单位,出版时间,定价,豆瓣评分,评价人数,一句话'.split(',')

    for col, title in enumerate(infos, start=2):
        sheet.write(2, col, title)

    # Column titles double as attribute names, so one loop covers every field.
    for row, book in enumerate(Book_list, start=3):
        for col, field in enumerate(infos, start=2):
            sheet.write(row, col, getattr(book, field))

    Workbook.save(filename)
        
if __name__ == '__main__':
    # Scrape with the BeautifulSoup variant and export the result to Excel.
    write_info_xls(get_book_info_bs4())

欢迎关注本人微信公众号,会分享更多的干货:

你可能感兴趣的:(Python,爬虫)