Python 抓取淘宝商品信息并保存到数据库或者EXCEL

参考此文：https://www.jianshu.com/p/80c602afc623
# -*- coding: utf-8 -*-
# 抓取商品信息并保存到数据库或者EXCEL
from multiprocessing.pool import Pool
import pymongo
from selenium import webdriver
import xlwt

# Selenium: start a Chrome session through a locally installed chromedriver.
driver_path = r"C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)

# Excel: one workbook with a single sheet; row 0 holds the column headers.
# sheet.write(row, col, value): the first index is the row, the second the column.
f = xlwt.Workbook(encoding="utf8")
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
for _col, _header in enumerate(('标题', '标价', '购买人数')):
    sheet01.write(0, _col, _header)

# MongoDB: connection target, database name and collection name.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'nvzhuang'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Search URL to request; the page number is appended to the trailing "page=".
url = (
    "https://uland.taobao.com/sem/tbsearch"
    "?refpid=mm_26632258_3504122_32538762"
    "&keyword=%E5%A5%B3%E8%A3%85"
    "&clk1=44c369a534bf95506aa0a87518971645"
    "&upsid=44c369a534bf95506aa0a87518971645"
    "&page="
)
# Default number of pages.
number = 1

# 获取最大页数
def get_maxpage(url, max_retries=3):
    """Return the total number of result pages shown on the search page.

    Loads ``url`` in the module-level ``driver``, reads the text of the
    ``.totalPage`` element, strips the surrounding '共' / '页' characters and
    converts the remainder to ``int``.

    The original retried by calling itself without ``return``, so a success
    after any failure still produced ``None``, and a persistent failure
    recursed without bound. This version retries in a bounded loop.

    Args:
        url: Address of the search page to load.
        max_retries: Maximum number of attempts (at least one is made).

    Returns:
        The page count as an ``int``.

    Raises:
        Exception: The error from the final attempt, when every attempt fails.
    """
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            driver.get(url)
            totalPage = driver.find_element_by_css_selector('.totalPage').text
            page_count = int(str(totalPage).strip('共').strip('页'))
        except Exception:
            if attempt == attempts - 1:
                # Out of retries: surface the real error instead of returning None.
                raise
            continue
        # As in the original, the driver's current window is closed once the
        # count has been read successfully.
        driver.close()
        return page_count

# 解析页面元素
def parse_page(pagenum):
    """Scrape one search-result page and store its items in MongoDB.

    Loads ``url + str(pagenum)`` in the module-level ``driver``, builds one
    dict per ``div.item`` found under ``#searchResult/#ItemWrapper`` and
    passes the resulting list to ``save_to_mongo``.

    Each dict has the keys ``title``, ``shopName``, ``price``, ``byNumber``,
    ``score``, ``href`` and ``image``. If an item lacks one of the looked-up
    elements, that field is stored as ``None``; previously the resulting
    ``NoSuchElementException`` discarded the whole page and propagated to
    the caller.

    Args:
        pagenum: Page number appended to the search URL.
    """
    # Function-scope import: selenium is already a dependency of this module.
    from selenium.common.exceptions import NoSuchElementException

    def read(node, xpath, attribute=None):
        """Return the text (or the named attribute) of the first match, or None."""
        try:
            element = node.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return None
        if attribute is None:
            return element.text
        return element.get_attribute(attribute)

    # Progress message: fixed transposed characters ("在正" -> "正在").
    print("正在抓取第",pagenum,'页...')
    driver.get(url + str(pagenum))
    divs = driver.find_elements_by_xpath("//div[@id='searchResult']/div[@id='ItemWrapper']/div[@class='item']")
    contents = []
    for div in divs:
        contents.append({
            'title': read(div, ".//span[@class='title']"),
            'shopName': read(div, ".//span[@class='shopNick']"),
            'price': read(div, ".//span[@class='pricedetail']/strong"),
            'byNumber': read(div, ".//span[@class='payNum']"),
            'score': read(div, ".//span[@class='dsr-info-num']"),
            'href': read(div, ".//a", "href"),
            'image': read(div, ".//img", "src"),
        })
    # save_infos(contents) can be called here instead to write the Excel sheet.
    save_to_mongo(contents)

# 保存到MONGO
def save_to_mongo(result):
    """Insert scraped items into the ``MONGO_TABLE`` collection of ``db``.

    Uses ``insert_many`` because ``Collection.insert`` was deprecated in
    PyMongo 3.0 and removed in 4.0. An empty batch is skipped, since an empty
    bulk insert raises and would be reported as a storage failure.

    Args:
        result: A single document (dict) or an iterable of documents.
    """
    documents = [result] if isinstance(result, dict) else list(result)
    if not documents:
        # Nothing scraped for this page; there is nothing to store.
        return
    try:
        db[MONGO_TABLE].insert_many(documents)
        print('保存到MONGODB成功')
    except Exception as exc:
        # Best-effort storage: report the failure with its cause and carry on.
        print('存储到MONGODB失败', exc)
# 保存到EXCEL
def save_infos(contents, start_row=1):
    """Write scraped items to the module-level sheet and save the workbook.

    Columns 0-2 receive the item's title, price and buyer count, matching the
    header row written at module level. The buyer count is read from the
    ``'byNumber'`` key, which is the key ``parse_page`` stores it under; the
    original read ``'number'`` and raised ``KeyError`` on every call.

    Args:
        contents: Iterable of item dicts with ``'title'``, ``'price'`` and
            ``'byNumber'`` keys.
        start_row: Sheet row for the first item. Defaults to 1, the row just
            below the header; pass a larger value to append after rows that
            an earlier call has already written.
    """
    # sheet.write(row, col, value): the first index is the row, the second the column.
    for row, content in enumerate(contents, start_row):
        sheet01.write(row, 0, content['title'])
        sheet01.write(row, 1, content['price'])
        sheet01.write(row, 2, content['byNumber'])
    f.save(r"C:\Users\Administrator\Desktop\taobao_nvzhuang.xls")

def main():
    """Look up the page count, then scrape every page with a process pool.

    If ``get_maxpage`` yields no usable count, the module-level default
    ``number`` is used instead, so ``range`` never receives ``None``.
    """
    page_count = get_maxpage(url) or number
    # Pool() starts worker *processes* (not threads), one per CPU by default.
    pool = Pool()
    try:
        pool.map(parse_page, range(1, page_count + 1))
    finally:
        # Always shut the workers down, even if a page raised.
        pool.close()
        pool.join()

# Run the crawl only when this file is executed as a script. The guard also
# keeps multiprocessing workers that re-import this module (the "spawn" start
# method, the default on Windows) from calling main() again.
if __name__ == '__main__':
    main()
效果如下：
QQ截图20181226115537.png
Python 抓取淘宝商品信息并保存到数据库或者EXCEL

你可能感兴趣的:(Python 抓取淘宝商品信息并保存到数据库或者EXCEL)