Python爬取淘宝网商品信息

直接上代码

#!/usr/bin/env python
#coding=UTF-8
import random  # used for the randomized anti-scraping delays
import time

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# MongoDB client for the server on localhost:27017.
client = pymongo.MongoClient('localhost', 27017)
# Despite its name, `db` is the "table_name1" collection of the "taobao" database.
db = client['taobao']['table_name1']
# Chrome session shared by every function in this script.
browser = webdriver.Chrome()
# Shared explicit wait with a 10-second timeout.
wait = WebDriverWait(browser, timeout=10)


def search(keyword):
	"""Open the Taobao login page and submit ``keyword`` through the search box.

	Loads the login URL, prints a prompt asking the user to log in by
	scanning the QR code, then waits for the search input (``#q``) and the
	search button, types ``keyword`` and clicks. After a random 1-3 second
	pause it waits for the pager's ``div.total`` element as the signal that
	the result list has loaded.

	If any wait times out, the error is printed and the whole sequence is
	restarted from the login page. Retries are unbounded, so this function
	only returns once a search has succeeded.

	Args:
		keyword: Text typed into the search box.
	"""
	# Retry in a loop rather than by recursion so that repeated timeouts
	# cannot exhaust the interpreter's recursion limit.
	while True:
		try:
			browser.get('https://login.taobao.com/member/login.jhtml')
			print('请扫描二维码登录淘宝!')
			search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
			submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
			search_box.send_keys(keyword)
			submit.click()
			# Random pause before touching the result page (anti-scraping delay).
			delay = random.randint(1, 3)
			for k in range(delay):
				print('反爬等待', delay - k, 's')
				time.sleep(1)
			# Only the presence of the pager total matters; its value is not used.
			wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
			print('login_success!')
			return
		except TimeoutException as error:
			print(error)
def next_page(pagenumber):
	"""Jump to result page ``pagenumber`` and scrape it.

	Clears the pager's page-number input, pauses a random 1-3 seconds,
	types ``pagenumber`` and clicks the pager's submit button. It then waits
	until the active pager item shows ``pagenumber`` and calls ``product()``
	to scrape the newly loaded page.

	Any ordinary exception raised along the way (including one raised by
	``product()``) is reported on stdout and swallowed, so the caller can
	move on to the next page.

	Args:
		pagenumber: Page number to type into the pager input.
	"""
	try:
		page_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')))
		submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
		page_box.clear()
		# Random pause before submitting the page jump (anti-scraping delay).
		delay = random.randint(1, 3)
		for k in range(delay):
			print('反爬等待', delay - k, 's')
			time.sleep(1)
		page_box.send_keys(pagenumber)
		submit.click()
		# The jump has completed once the highlighted pager item shows the target page.
		wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"), str(pagenumber)))
		print('翻页成功')
		product()
	except Exception as error:
		# `Exception` instead of a bare `except:` so that Ctrl-C and
		# SystemExit still stop the crawl; the cause is printed too.
		print('触发了反爬机制,错误啦', error)

def save_to_mongo(result):
	"""Insert one scraped record into the module-level MongoDB collection.

	``db`` is already bound to a collection, so the document is inserted
	into it directly. Storage is best-effort: any failure is reported on
	stdout and swallowed so that one bad record does not stop the crawl.

	Args:
		result: Dict describing one product.
	"""
	try:
		db.insert_one(result)
	except Exception as error:
		print('存储到Mongo失败', result, error)
	else:
		print('存储到Mongo成功')
def product():
	"""Scrape every item on the current result page and store each one.

	Waits for at least one item to be present in ``#mainsrp-itemlist``,
	snapshots the page source, parses it with pyquery, pauses a random 1-3
	seconds, then builds one dict per item and passes it to
	``save_to_mongo``.

	Each record has the keys ``image``, ``deal``, ``location``, ``price``,
	``shop`` and ``title``. The wait is not guarded here, so a timeout
	propagates to the caller.
	"""
	wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
	html = browser.page_source
	doc = pq(html)
	# Random pause after grabbing the page (anti-scraping delay).
	delay = random.randint(1, 3)
	for k in range(delay):
		print('反爬等待', delay - k, 's')
		time.sleep(1)
	for item in doc('#mainsrp-itemlist .items .item').items():
		# Keys are plain field names; the stray trailing colons that some
		# keys carried (but 'location' did not) have been removed.
		record = {
			'image': item.find('.pic img').attr('data-src'),
			# Drops the last three characters of the deal text
			# (presumably a fixed "人付款" suffix -- confirm against the live page).
			'deal': item.find('.deal-cnt').text()[:-3],
			'location': item.find('.location').text(),
			'price': item.find('strong').text(),
			'shop': item.find('.shop').text(),
			'title': item.find('.title').text(),
		}
		print('获取数据成功,准备写入MongoDB')
		save_to_mongo(record)
def main():
	"""Search for the hard-coded keyword and crawl result pages 1 to 100.

	Page 1 is scraped directly with ``product()``. Pages 2-100 are scraped
	by ``next_page``, which calls ``product()`` itself after each jump, so
	this function must not call ``product()`` again inside the loop or
	every page would be stored twice.

	The browser session is always shut down on exit, including when an
	exception escapes the crawl.
	"""
	keyword = '零食'
	try:
		search(keyword)
		# First result page; later pages are handled inside next_page().
		product()
		for page in range(2, 101):
			# Random pause between pages (anti-scraping delay).
			delay = random.randint(1, 3)
			for k in range(delay):
				print('反爬等待', delay - k, 's')
				time.sleep(1)
			print('准备翻页到第', page, '页')
			next_page(page)
		print('ALL DONE!')
	finally:
		# quit() ends the whole WebDriver session, not just the current window.
		browser.quit()
# Start the crawl only when this file is executed as a script, not when imported.
if __name__=="__main__":
	main()

你可能感兴趣的:(python)