Python web scraping notes: scraping book information (with the selenium library + chromedriver.exe)

Prerequisites:

1. The chromedriver.exe driver executable (its version must match your installed Chrome version)

2. Google Chrome installed
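
3. The Python packages the script imports (one way to install them: pip install selenium beautifulsoup4 lxml numpy pandas; lxml is the parser BeautifulSoup is asked to use below)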

The code

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select,WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import random
import numpy as np
import pandas as pd

#=============Step 1: fetch the page data=====================
#a pool of user-agent strings to rotate through, to dodge anti-scraping checks
USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
        "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
        "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
        "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
        "Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko ) Version/5.1 Mobile/9B176 Safari/7534.48.3",
    ]


#chromedriver drives the browser automatically (navigation, clicks, etc.)
driver_path = r'D:\output2020\chromedriver.exe'
#ChromeOptions holds the browser's launch settings
options = webdriver.ChromeOptions()  #starts out empty

#rotate the user agent at runtime; note that add_argument() has no effect
#once the browser is already running, so this overrides the UA through the
#Chrome DevTools Protocol instead (exposed by chromedriver as execute_cdp_cmd)
def change_useragent(driver,USER_AGENTS):
    user_agent = random.choice(USER_AGENTS)  #pick one at random
    driver.execute_cdp_cmd('Network.setUserAgentOverride',{'userAgent':user_agent})

#set the initial user agent before the browser is launched
user_agent = random.choice(USER_AGENTS)
options.add_argument('--user-agent=%s'%user_agent)
#launch Chrome through chromedriver (selenium 3 style keyword arguments)
driver = webdriver.Chrome(chrome_options=options,executable_path=driver_path)
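#note: selenium 4.x deprecates chrome_options/executable_path; the
#equivalent setup there would be roughly this (sketch, not used below):
#from selenium.webdriver.chrome.service import Service
#driver = webdriver.Chrome(service=Service(driver_path),options=options)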

#target url
url = "https://read.douban.com/category?sort=new&page=1&kind=1"
driver.get(url)  #open the page; driver now holds the fully rendered html

#wait for a known element to appear before reading the page, so a slow
#network or an outage does not make us scrape an empty page
WebDriverWait(driver=driver,timeout=10).until(
    #blocks until an element with class 'works-item' is present, up to 10s
    EC.presence_of_element_located((By.CLASS_NAME,'works-item')) #class name found via the browser's element inspector
)
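#until() raises TimeoutException when the wait expires; a minimal guard
#would look like this (sketch, kept commented out here):
#from selenium.common.exceptions import TimeoutException
#try:
#    WebDriverWait(driver,10).until(
#        EC.presence_of_element_located((By.CLASS_NAME,'works-item')))
#except TimeoutException:
#    driver.quit()
#    raise SystemExit('page did not load within 10s')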

#=====================Step 2: parse the html====================
#=====================Step 3: save the data====================
df = pd.DataFrame(columns=['img_url','title','author','detail','bookClass','bookPrice','codeCount','pubtime'])

#parse one page's html, pick out the fields we need, and save them
def parse_page(resource):
    #parse the html
    soup = BeautifulSoup(resource,'lxml')
    #each <li class="works-item"> holds one book
    books = soup.find_all('li',class_ = 'works-item' )

    #loop over the li elements (one book each)
    for book in books:
        #price: use the discounted price if there is one, otherwise the list price
        discount = book.find_all('span',class_ ='discount-price')
        if discount:
            price = discount[0].get_text()
        else:
            price = book.find_all('span',class_ ='price-tag')[0].get_text()

        #publication date
        pubtime = book.find_all('span',class_ ='flexible-info-item')[0].get_text()
        #cover image url
        img_url = book.find_all('img')[0].get('src')
        #title
        title = book.find_all('a',class_ ='title-container')[0].get('title')
        #author
        author = book.find_all('a', class_='author-link')[0].get_text()
        #short description
        detail = book.find_all('div',class_ ='intro')[0].get_text()
        #word count
        flag = book.find_all('div',class_ ='sticky-info')[0]
        codecount = flag.find_all('span')[0].get_text()
        #category
        bookcl = book.find_all('a',class_ ='kind-link')[0].get_text()

        # save the row, using img_url as the index: it is unique per book, so a
        # book scraped twice overwrites its own row instead of duplicating
        # (values listed in the same order as the DataFrame columns)
        df.loc[img_url] = [img_url,title,author,detail,bookcl,price,codecount,pubtime]

    # flush to csv once per page; note that chained assignment such as
    # df.loc[img_url]['title'] = ... does not write back reliably in pandas
    df.to_csv('./douban_book.csv')   #written to douban_book.csv in the current folder
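#optional sanity check: parse_page only needs an html string, so it can be
#tested offline against a saved copy of the page (file name is only an example):
#with open('saved_page.html',encoding='utf-8') as f:
#    parse_page(f.read())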



#main loop
while True:
    #locate the "next page" button (xpath found by inspecting the page's html)
    next_btn = driver.find_element_by_xpath("//div[@class='paginator-full']/ul/li[last()]/a")
    #grab the current page's rendered html
    resource = driver.page_source
    #pull the fields we need out of this page
    parse_page(resource)

    print('Finished scraping this page, starting the next')
    #scroll the next-page button into view
    driver.execute_script("arguments[0].scrollIntoView(true);",next_btn)
    #stop once the button is disabled (last page reached)
    if "disabled" in next_btn.get_attribute('class'):
        break

    #rotate the user agent, pause politely, then go to the next page
    change_useragent(driver,USER_AGENTS)
    time.sleep(5)
    next_btn.click()  #actually advance to the next page
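
#once the loop exits, close the browser and release the chromedriver process
driver.quit()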

 
