My idea was to scrape company information from 1688. The first approach was to search for products directly and crawl the results with Selenium, but the resulting crawler was far too slow, so I abandoned it. I'm still posting the code here for reference; the stack is pyquery + BeautifulSoup + Selenium + MongoDB.
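The script reads its MongoDB settings from a local mongo_config module via a wildcard import; that file isn't part of the original post. Judging by the three names the script actually uses (MONGO_URL, MONGO_DB, MONGO_TABLE), a minimal sketch of it could look like this (all values are placeholders, not from the source):

# mongo_config.py -- assumed contents; only these three names are used below
MONGO_URL = 'localhost'       # placeholder MongoDB host / connection string
MONGO_DB = 'alibaba1688'      # placeholder database name
MONGO_TABLE = 'company_info'  # placeholder collection name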
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import time
from mongo_config import *
import pymongo

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 15)
# Shops found on the current results page; set at the end of getDetailUrl()
totalShopsOnePage = 0
def crawle(key):
    url = 'https://www.1688.com/'
    browser.get(url=url)
    # Dismiss the identity popup if it appears
    try:
        button = browser.find_element(By.CLASS_NAME, 'identity-cancel')
        button.click()
    except NoSuchElementException:
        pass
    # Type the keyword into the search box and submit
    search_input = browser.find_element(By.ID, 'alisearch-keywords')
    search_input.send_keys(key)
    sea_button = browser.find_element(By.ID, 'alisearch-submit')
    sea_button.click()
    # Close the overlay on the results page if it appears
    try:
        button_1 = browser.find_element(By.CLASS_NAME, 's-overlay-close-l')
        button_1.click()
    except NoSuchElementException:
        pass
    # Scroll to the bottom so all offers lazy-load
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#offer60')))
    except TimeoutException:
        print('*' * 30, 'page load timed out', '*' * 30, '\n\n\n')
    getDetailUrl()
# Pagination: type the target page number into the pager and jump to it
def goToNextPage(page):
    page_input = browser.find_element(By.CLASS_NAME, 'fui-paging-input')
    page_input.clear()
    page_input.send_keys(page)
    button = browser.find_element(By.CLASS_NAME, 'fui-paging-btn')
    button.click()
    time.sleep(3)
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#offer60')))
    except TimeoutException:
        print('*' * 30, 'page load timed out', '*' * 30, '\n\n\n')
    getDetailUrl()
# Extract the shop detail-page URLs from the search results page
def getDetailUrl():
    global totalShopsOnePage
    html = browser.page_source
    doc = pq(html)
    items = doc('.sm-offer .fd-clr .sm-offer-item').items()
    totalPage = int(doc('.fui-paging-num').text())
    currentPage = int(doc('.fui-current').text())
    index = 0
    for item in items:
        index += 1
        # Product URL sits inside the offer's photo block
        text = item.find('.sm-offer-photo')
        soup = BeautifulSoup(str(text), 'lxml')
        try:
            a = soup.select('.sm-offer-photo a')[0]
            detailurl = a['href']
            goToDetailPage(detailurl, totalPage, currentPage)
            # print('detailUrl>>>', detailurl)
        except (IndexError, KeyError):
            pass
    print('* ' * 40)
    print('%d items in total' % index)
    totalShopsOnePage = index
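# Note on the page flow: goToDetailPage() navigates the browser away from the
# results page, but the loop above walks the snapshot already parsed into
# `doc`, so the iteration itself is unaffected. It does mean that when the
# pagination path in saveToMongo() eventually fires, the browser is no longer
# on a page that has the fui-paging controls, which is one reason this
# page-by-page approach stayed slow and unfinished.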
# Open the shop detail page; a login dialog sometimes pops up and has to be closed
def goToDetailPage(detailurl, totalPage, currentPage):
    browser.get(url=detailurl)
    try:
        button = browser.find_element(By.CLASS_NAME, 'sufei-dialog-close')
        button.click()
    except NoSuchElementException:
        pass
    getShopInfoUrl(totalPage, currentPage)
# From the shop detail page, grab the URL of the shop ("wangpu") or company-info page
def getShopInfoUrl(totalPage, currentPage):
    html = browser.page_source
    doc = pq(html)
    items = doc('.top-nav-bar-box').items()
    for item in items:
        # Link to the credit/company-profile page
        text = item.find('.creditdetail-page')
        soup = BeautifulSoup(str(text), 'lxml')
        try:
            a = soup.select('.creditdetail-page a')[0]
            shopInfoUrl = a['href']
            print('shopInfoUrl>>>', shopInfoUrl)
            # getShopInfo(shopInfoUrl, totalPage, currentPage)  TODO
        except (IndexError, KeyError):
            pass
# Scrape the shop's or company's verification data
def getShopInfo(url, totalpage, currentPage):
    browser.get(url=url)
    html = browser.page_source
    doc = pq(html)
    # Company name
    companyName = doc('.company-name span').text()
    # Credit rating
    honestLevel = doc('.company-name a.icon.icon-credit').text()
    # Contact person
    contactPerson = doc('.text.company-contact span.contact-info').text()
    # Click the buttons that reveal the initially masked contact numbers
    telephoneButton = browser.find_element(By.ID, 'J_COMMON_CompanyInfoTelBtn')
    telephoneButton.click()
    mobileButton = browser.find_element(By.ID, 'J_COMMON_CompanyInfoPhoneBtn')
    mobileButton.click()
    # Re-parse the page: `doc` was built before the clicks, so it still holds
    # the masked numbers
    doc = pq(browser.page_source)
    # Phone number
    telephone = doc('.phone-tip span.tip-info.phone-num').text()
    # Company registration info: the table cells come in label/value pairs
    companyInfoItems = doc('.info-box.info-right table tbody tr')
    infohtml = str(companyInfoItems)
    soup = BeautifulSoup(infohtml, 'html.parser')
    tds = [td.get_text().strip() for td in soup.select('td')[:8]]
    # The address cell carries a "查看地图" (view map) link; strip it out
    tds[7] = tds[7].replace('\n', '').replace(' ', '').replace('查看地图', '')
    companyInfo = {
        'fundDate': tds[0] + tds[1],
        'registerMoney': tds[2] + tds[3],
        'operateArea': tds[4] + tds[5],
        'addr': tds[6] + tds[7]
    }
    # Shop transaction stats: label/value tokens, joined pairwise
    translateList = doc('.section-main ul li').text().split()
    translateInfo = {
        'transactionsNum': translateList[0] + ':' + translateList[1] + ';' + translateList[2] + ':' + translateList[3],
        'buyersNum': translateList[4] + ':' + translateList[5] + ';' + translateList[6] + ':' + translateList[7],
        'repeatRate': translateList[8] + ':' + translateList[9] + ';' + translateList[10] + ':' + translateList[11],
    }
    result = {
        'companyName': companyName,
        'loyaltyLevel': honestLevel,
        'contactPerson': contactPerson,
        'telephone': telephone,
        'companyInfoItems': companyInfo,
        'translateInfo': translateInfo
    }
    saveToMongo(result, totalpage, currentPage)
# Save results to MongoDB
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
savedCount = 0  # shops processed (saved or failed) so far on the current page

def saveToMongo(result, totalPage, currentPage):
    global savedCount
    try:
        db[MONGO_TABLE].insert_one(result)
        print('saved OK', result)
    except Exception:
        print('save failed', result)
    savedCount += 1
    print('shops processed on this page >>>', savedCount)
    # TODO: flip to the next results page once every shop on the page has been
    # processed (totalShopsOnePage is only set after the whole results page
    # has been iterated, so this check is still rough)
    if savedCount == totalShopsOnePage:
        savedCount = 0
        if totalPage > currentPage:
            goToNextPage(currentPage + 1)
def main():
    key_words = input("Enter the categories to search, separated by spaces (e.g. womenswear, hardware ...): ").split(' ')
    for key in key_words:
        time.sleep(3)
        crawle(key)

if __name__ == '__main__':
    main()
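After a run, the stored documents can be sanity-checked from a quick pymongo session. A minimal sketch, reusing the same mongo_config constants; the field names are the keys of the result dict built in getShopInfo above:

# check_results.py -- quick look at what was stored
import pymongo
from mongo_config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
collection = client[MONGO_DB][MONGO_TABLE]

# How many company records landed in the collection
print('stored documents:', collection.count_documents({}))
# Peek at a few of them
for doc in collection.find().limit(3):
    print(doc['companyName'], doc.get('telephone'))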