Ali1688 Crawler in Practice (1)

My plan was to crawl company information from 1688. The first approach was to search for products directly and scrape the results with Selenium, but the resulting crawler turned out to be far too slow, so I abandoned it. I am still posting the code here for reference; the stack is pyquery + BeautifulSoup + Selenium + MongoDB.

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import time
from mongo_config import *
import pymongo

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 15)


def crawle(key):
    url = 'https://www.1688.com/'
    browser.get(url=url)
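    # Dismiss the identity-verification pop-up if it appears on the home page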
    try:
        button = browser.find_element_by_class_name('identity-cancel')
        button.click()
    except:
        pass

    input = browser.find_element_by_id('alisearch-keywords')
    input.send_keys(key)

    sea_button = browser.find_element_by_id('alisearch-submit')
    sea_button.click()
    try:
        button_1 = browser.find_element_by_class_name('s-overlay-close-l')
        button_1.click()
    except:
        pass
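    # Scroll to the bottom so the lazy-loaded offers render, then wait for the #offer60 element as a sign the result list has fully loaded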
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    try:

        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#offer60')))
    except:
        print('*' * 30, 'load timed out', '*' * 30, '\n\n\n')
    getDetailUrl()


# Pagination: jump to the given results page
def goToNextPage(page):
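    # Enter the target page number into the pager input and submit it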
    page_input = browser.find_element_by_class_name('fui-paging-input')
    page_input.clear()
    page_input.send_keys(page)
    button = browser.find_element_by_class_name('fui-paging-btn')
    button.click()
    time.sleep(3)
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#offer60')))
    except:
        print('*' * 30, 'load timed out', '*' * 30, '\n\n\n')

    getDetailUrl()


# Collect shop detail-page URLs from the search results page
def getDetailUrl():
    html = browser.page_source
    doc = pq(html)
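    # Grab the result cards plus the pager's total and current page numbers, which drive pagination later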
    items = doc('.sm-offer .fd-clr .sm-offer-item').items()
    totalPage = doc('.fui-paging-num').text()
    totalPage = int(totalPage)
    currentPage = doc('.fui-current').text()
    currentPage = int(currentPage)
    global totalShopsOnePage
    index = 0
    for item in items:
        index += 1
        # Product URL
        text = item.find('.sm-offer-photo')
        soup = BeautifulSoup(str(text), 'lxml')
        try:
            a = soup.select('.sm-offer-photo a')[0]
            detailurl = a['href']
            goToDetailPage(detailurl, totalPage, currentPage)
            # print('detailUrl>>>', detailurl)

        except:
            pass
    print('* ' * 40)
    print('%d records in total' % index)
    totalShopsOnePage = index


# Open the shop detail page; a login dialog sometimes pops up and has to be closed
def goToDetailPage(detailurl, totalPage, currentPage):
    browser.get(url=detailurl)
    try:
        button = browser.find_element_by_class_name('sufei-dialog-close')
        button.click()
    except:
        pass
    getShopInfoUrl(totalPage, currentPage)


# From the shop detail page, get the URL of the wangpu / company-info page
def getShopInfoUrl(totalPage, currentPage):
    html = browser.page_source
    doc = pq(html)
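    # The link to the credit-detail (company profile) page sits in the shop's top navigation bar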
    items = doc('.top-nav-bar-box').items()
    for item in items:
        # Credit-detail page link
        text = item.find('.creditdetail-page')
        soup = BeautifulSoup(str(text), 'lxml')
        try:
            a = soup.select('.creditdetail-page a')[0]
            shopInfoUrl = a['href']
            print('shopInfoUrl>>>', shopInfoUrl)
            # getShopInfo(shopInfoUrl, totalPage, currentPage) TODO
        except:
            pass


# Scrape the shop / company certification data
def getShopInfo(url, totalpage, currentPage):
    browser.get(url=url)
    html = browser.page_source
    doc = pq(html)
    # Company name
    companyName = doc('.company-name span').text()
    # Credit rating
    honestLevel = doc('.company-name a.icon.icon-credit').text()
    # Contact person
    contactPerson = doc('.text.company-contact span.contact-info').text()
    # Simulate clicks to reveal the contact numbers
    telephoneButton = browser.find_element_by_id('J_COMMON_CompanyInfoTelBtn')
    telephoneButton.click()
    mobileButton = browser.find_element_by_id('J_COMMON_CompanyInfoPhoneBtn')
    mobileButton.click()
    # Re-parse the page so the numbers revealed by the clicks are actually in the document
    doc = pq(browser.page_source)
    # Contact number
    telephone = doc('.phone-tip span.tip-info.phone-num').text()
    # Company registration info
    companyInfoItems = doc('.info-box.info-right table tbody tr')
    infohtml = str(companyInfoItems)
    soup = BeautifulSoup(infohtml, "html.parser")
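    # Consecutive <td> cells hold label/value pairs: founding date, registered capital, business scope and address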
    tagTd1 = soup.select('td')[0].get_text().strip()
    tagTd2 = soup.select('td')[1].get_text().strip()
    tagTd3 = soup.select('td')[2].get_text().strip()
    tagTd4 = soup.select('td')[3].get_text().strip()
    tagTd5 = soup.select('td')[4].get_text().strip()
    tagTd6 = soup.select('td')[5].get_text().strip()
    tagTd7 = soup.select('td')[6].get_text().strip()
    tagTd8 = soup.select('td')[7].get_text().strip().replace("\n", "").replace(" ", "").replace("查看地图", "")
    companyInfo = {
        'fundDate': tagTd1 + tagTd2,
        'registerMoney': tagTd3 + tagTd4,
        'operateArea': tagTd5 + tagTd6,
        'addr': tagTd7 + tagTd8
    }

    # Shop transaction stats
    translateTag = doc('.section-main ul')
    translateNumPQ = pq(translateTag)
    translateList = pq(translateNumPQ('li')).text().split()
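    # The flattened <li> text alternates label and value tokens for transaction count, buyer count and repeat-purchase rate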
    translateInfo = {
        'transactionsNum': translateList[0] + ":" + translateList[1] + ";" + translateList[2] + ":" + translateList[3],
        'buyersNum': translateList[4] + ":" + translateList[5] + ";" + translateList[6] + ":" + translateList[7],
        'repeatRate': translateList[8] + ":" + translateList[9] + ";" + translateList[10] + ":" + translateList[11],
    }
    result = {
        'companyName': companyName,
        'loyaltyLevel': honestLevel,
        'contactPerson': contactPerson,
        'telephone': telephone,
        'companyInfoItems': companyInfo,
        'translateInfo': translateInfo
    }
    saveToMongo(result, totalpage, currentPage)


# Persist results to MongoDB
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def saveToMongo(result, totalPage, currentPage):
    count = 0
    try:
        if db[MONGO_TABLE].insert_one(result):
            count += 1
            print('saved:', result)
    except:
        count += 1
        print('save failed:', result)
    # If the number of records saved equals the number of shops on this page, move to the next page  TODO
    print('total records saved >>>', count)
    if (count == totalShopsOnePage):
        if totalPage > currentPage:
            goToNextPage(currentPage + 1)


def main():
    key_words = input("Enter the categories to search (e.g. women's clothing, hardware), separated by spaces: ").split(' ')
    for key in key_words:
        time.sleep(3)
        crawle(key)


if __name__ == '__main__':
    main()
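
The script imports MONGO_URL, MONGO_DB and MONGO_TABLE from a local mongo_config module that is not shown in this post. A minimal sketch of what that file could look like (the connection address, database name and collection name below are placeholders, not the values actually used):

# mongo_config.py -- connection settings pulled in by the crawler above
MONGO_URL = 'localhost'         # or a full URI such as 'mongodb://user:pass@host:27017'
MONGO_DB = 'ali1688'            # database name (placeholder)
MONGO_TABLE = 'company_info'    # collection that saveToMongo() writes into (placeholder)

With a file like that next to the script, running it prompts for one or more categories separated by spaces and crawls each of them in turn.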