python 爬取 js渲染_Python爬虫学习笔记7:动态渲染页面爬取

淘宝的整个页面数据确实也是通过 Ajax 获取的,但是这些 Ajax 接口参数比较复杂,可能会包含加密密钥等,所以如果想自己构造 Ajax 参数,还是比较困难的。对于这种页面,最方便快捷的抓取方法就是通过 Selenium。

1513541-20190706114942154-1025073577.png

商品列表信息

1513541-20190706115332776-935896034.png

# 爬取淘宝页面商品信息,包括商品名称、商品价格、购买人数、店铺名称、店铺所在地

from selenium import webdriver

from selenium.common.exceptions import TimeoutException

from selenium.webdriver.common.by import By

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.support.wait import WebDriverWait

from urllib.parse import quote

from pyquery import PyQuery as pq

import json

browser = webdriver.Chrome()

wait = WebDriverWait(browser,10)

KEYWORD = 'iPad'

def index_page(page):

print('正在爬取第',page,'页')

try:

url = 'https://www.taobao.com/search?q=' + quote(KEYWORD)

browser.get(url)

if page > 1: # 如果大于1,就进行跳转操作,否则等待页面加载完成

input = wait.until(

EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-pager div.from>input'))

)

submit = wait.until(

EC.element_to_be_clickable((By.CSS_SELECTOR,'#mainsrp-pager div.from>span.btn.J_Submit'))

)

input.clear()

input.send_keys(page)

submit.click()

wait.until(

EC.text_to_be_present_in_element((By.CSS_SELECTOR,'#mainsrp-pager li.item.active>span'),str(page))

)

wait.until(

EC.presence_of_element_located((By.CSS_SELECTOR,'.m-itemlist .items .item'))

)

get_products()

except TimeoutException:

index_page(page)

def get_products():

taobao_data = []

html = browser.page_source

doc = pq(html)

items = doc('#mainsrp-itemlist .items .item').items()

for item in items:

product = {

'image': item.find('.pic .img').attr('data-src'),

'price': item.find('.price').text(),

'deal': item.find('.title').text(),

'shop': item.find('.shop').text(),

'location': item.find('.location').text()

}

print(product)

taobao_data.append(product)

with open('taobao.json','a',encoding='utf-8') as f:

f.write(json.dumps(taobao_data, indent=2, ensure_ascii=False))

# MONGO_URL = 'localhost'

# MONGO_DB = 'taobao'

# MONGO_COLLECTION = 'products'

# client = pymongo.MongoClient(MONGO_URL)

# db = client[MONGO_DB]

#

# def save_to_mongo(result):

# try:

# if db[MONGO_COLLECTION].insert(result):

# print('存储到MongoDB 成功')

# except Exception:

# print('存储到MongoDB 失败')

MAX_PAGR = 100

def main():

for i in range(1,MAX_PAGR+1):

index_page(i)

main()

测试代码跑起来了,不过提示页面找不到了,没爬取成功,后面再补充学习

1513541-20190706122532295-212986670.png

你可能感兴趣的:(python,爬取,js渲染)