"""Python requests 爬虫:爬取小说数据。

抓取起点网站的所有小说(未分类抓取),并将提取的数据保存到本地 csv 文件中。
采用 requests 抓取网页,用 BeautifulSoup 解析网页,用 select 方法提取元素(也可以使用 find() 方法)。
"""
import requests
import csv
from bs4 import BeautifulSoup

import time#设置抓取间隔时间
page_urls=[]#collected listing-page URLs (filled by get_url)
book_urls=[]#book detail-page URLs found on each listing page (filled by get_bookurl)
url='http://a.qidian.com/'#entry point: Qidian "all books" listing

def get_url(url):  # collect every listing-page URL
    """Read the max page count from *url* and append one listing URL per page
    to the module-level ``page_urls`` list.

    Silently returns without appending anything if the pagination container
    cannot be found (layout change or blocked request).
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0"}
    # timeout so a stalled connection cannot hang the crawl forever
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    containers = soup.select('#page-container')
    if not containers:  # guard: original code raised IndexError here on a bad page
        return
    num = containers[0].get('data-pagemax')  # maximum page number
    for page in range(1, int(num) + 1):
        # note: do not reuse the parameter name `url` for the loop value
        page_urls.append('http://a.qidian.com/?size=-1&page={}'.format(page))

def get_bookurl(url):  # collect the book links on one listing page
    """Fetch the listing page at *url* and append each book's absolute
    detail-page URL to the module-level ``book_urls`` list.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0"}
    # timeout so a stalled connection cannot hang the crawl forever
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    for anchor in soup.select('.book-img-box a'):
        href = anchor.get('href')
        if not href:  # guard: original code crashed on 'http:' + None
            continue
        book_urls.append('http:' + href)  # hrefs are protocol-relative (//...)

def get_book(url):
    """Scrape one book detail page at *url* and append a row to the CSV file.

    Row layout: name, category 1, category 2, word count, description,
    author, clicks, URL. The category list is split into two columns.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0"}
    # timeout so a stalled connection cannot hang the crawl forever
    html = requests.get(url, headers=headers, timeout=10)
    time.sleep(1)  # throttle: pause between requests to be polite to the server
    html = BeautifulSoup(html.text, 'lxml')

    bookname = html.select('h1 em')[0].text  # book title
    bookfenlei = [i.text for i in html.select('p.tag a')]  # category tags
    # pad so indexing [0]/[1] below cannot raise IndexError when <2 tags exist
    bookfenlei += [''] * (2 - len(bookfenlei))
    # word count: number ("em") plus its unit ("cite")
    bookwords = html.select('div.book-info p em')[0].text + html.select('div.book-info p cite')[0].text
    bookdes = html.select('.book-intro > p')[0].text.strip()  # description
    bookscore = None  # score lives in JS-rendered content; not scraped yet (was misspelled `kookscore`)
    bookwriter = html.select('h1 a.writer')[0].text  # author
    bookclicks = html.select('div.book-info p em')[1].text  # click count
    bookurl = url  # detail-page URL
    # explicit utf-8 so Chinese text round-trips regardless of the OS locale
    with open('C:\\Documents and Settings\\liuchi\\桌面\\新建文件夹\\qdxiaoshuo01.csv', 'a+', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # categories split across two columns: bookfenlei[0], bookfenlei[1]
        writer.writerow([bookname, bookfenlei[0], bookfenlei[1], bookwords, bookdes, bookwriter, bookclicks, bookurl])

#测试抓取一页的数据
# Smoke test: scrape a single listing page, then fetch every book it links to.
get_bookurl('http://a.qidian.com/?page=1')
for book_url in book_urls:
    get_book(book_url)




# 你可能感兴趣的:(requests爬虫学习笔记)