如何使用 Python 爬虫自动地从网站上爬取想要的数据

以下代码实现了从电商网站上自动获取书籍信息的功能

from selenium import webdriver
import time
from bs4 import BeautifulSoup
base_url = "https://www.jd.com/?cu=true&utm_source=baidu-pinzhuan&utm_medium=cpc&utm_campaign=t_288551095_baidupinzhuan&utm_term=0f3d30c8dba7459bb52f2eb5eba8ac7d_0_e98375277f714ddabd4e98712adec3a9"

# Script overview: open base_url in Chrome, search for the keyword, then for
# every result page scroll down, parse the result list with BeautifulSoup and
# append one tab-separated row per item (name, shop, image URL, price, comment
# count) to data1.txt. Paging continues until the "next" button is disabled
# or absent.


def _first_text(node, selector, default=''):
    """Return the text of the first element under *node* matching *selector*.

    Runs of whitespace (including tabs and newlines) are collapsed to single
    spaces so that a field can never break the tab-separated row it is
    written into. *default* is returned when nothing matches.
    """
    matches = node.select(selector)
    if not matches:
        return default
    return ' '.join(matches[0].get_text().split())


def _image_url(node):
    """Return the image URL of a result item, or '' when none is available."""
    images = node.select('.p-img > a > img')
    if not images:
        return ''
    img = images[0]
    # Fall back to data-lazy-img when src is missing (presumably the image
    # has not been lazy-loaded yet).
    return img.get('src') or img.get('data-lazy-img') or ''


with open('data1.txt', 'w', encoding='utf-8') as f:
    n = 1  # running 1-based index of the item being written
    # NOTE(review): executable_path and find_element_by_* are the Selenium 3
    # API; newer Selenium releases drop them - confirm the installed version.
    driver = webdriver.Chrome(executable_path='d:\\chromedriver.exe')
    try:
        # implicitly_wait only configures the element-lookup timeout, so it is
        # set once here instead of being (mis)used as a pause while scrolling.
        driver.implicitly_wait(10)
        driver.get(base_url)

        driver.find_element_by_id('key').send_keys('大数据')
        time.sleep(3)
        driver.find_element_by_class_name('button').click()

        while True:
            # Scroll down in 1000px steps, pausing briefly after each step so
            # content loaded on scroll has time to appear in page_source.
            for step in range(1, 16):
                scroll = step * 1000
                driver.execute_script('var q = document.documentElement.scrollTop={}'.format(scroll))
                time.sleep(0.5)

            doc = BeautifulSoup(driver.page_source, 'html.parser')
            for book in doc.select('.gl-warp > li'):
                print(n)
                book_name = _first_text(book, '.p-name > a >em')
                book_press = _first_text(book, '.p-shopnum >a', default='不详')
                book_imge = _image_url(book)
                book_price = _first_text(book, '.p-price > strong')
                book_com_num = _first_text(book, '.p-commit > strong')
                f.write('\t'.join(
                    (book_name, book_press, book_imge, book_price, book_com_num)
                ) + '\n')
                n += 1

            # Stop when the "next page" control is missing or carries the
            # 'disabled' class; otherwise advance to the next result page.
            if not doc.select('.pn-next') or doc.select('.pn-next.disabled'):
                break
            driver.find_element_by_class_name('pn-next').click()
            time.sleep(2)
    finally:
        # Always shut the browser down, even if scraping fails midway.
        driver.quit()

 

你可能感兴趣的:(python)