最近在对阿里商家端做一些数据爬取,这次爬取的是直通车人群溢价数据。过程中发现网站对selenium的检测相当厉害,然而我的回答是:你强任你强,清风拂山岗。咱人工登录怕过谁,什么cookies、user-agent、selenium检测、token,统统与我无瓜,我们的宗旨就是:能用就行。
核心思路是:先打开真实的浏览器,再用selenium接管它;人工登录之后关闭提示框,爬虫随即开始运行。
os.system('cd "C:\\Program Files (x86)\\Google\\Chrome\\Application"&start chrome.exe --remote-debugging-port=9999 --user-data-dir="C:\selenum\AutomationProfile" https://subway.simba.taobao.com')
顺便用tkinter简单做了个账号密码提示框,省得每次登录都要去找账号密码。
# -*- coding: utf-8 -*-
import os
import re
import time
from datetime import date, timedelta
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from user_pass import time_count
def main():
    """Crawl crowd-premium data from Taobao Subway through a real Chrome.

    Starts the locally installed Chrome with remote debugging enabled,
    attaches Selenium to it through the ``debuggerAddress`` option, shows the
    credential prompt (``time_count``) for the manual login, then opens the
    campaign page and runs ``get_CrowdDetail``. The attached tab is closed
    afterwards, including when the crawl raises.
    """
    chrome_debug_port = 9999

    # Launch the local Chrome on the login page with remote debugging on.
    # "cd /d" also switches the drive, so the command still reaches the Chrome
    # folder when the script is started from another drive. Backslashes are
    # escaped explicitly: "\s" and "\A" are not valid string escapes.
    os.system(
        'cd /d "C:\\Program Files (x86)\\Google\\Chrome\\Application"'
        f'&start chrome.exe --remote-debugging-port={chrome_debug_port}'
        ' --user-data-dir="C:\\selenum\\AutomationProfile"'
        ' https://subway.simba.taobao.com'
    )

    # Attach Selenium to the browser that was just started instead of letting
    # the driver spawn its own instance.
    chrome_options = Options()
    chrome_options.add_experimental_option(
        "debuggerAddress", f"127.0.0.1:{chrome_debug_port}"
    )
    browser = webdriver.Chrome(options=chrome_options)
    try:
        wait = WebDriverWait(browser, 15)
        print(browser.title)
        print("准备好,爬虫开始:")

        # Credential prompt imported from user_pass.
        # NOTE(review): presumably this blocks until the prompt window is
        # closed, which is what gives the operator time to log in - confirm.
        time_count()

        # Open the campaign management page before visiting the detail pages.
        browser.get('https://subway.simba.taobao.com/#!/manage/campaign/index')
        time.sleep(3)

        get_CrowdDetail(browser, wait)
    finally:
        # Close the attached tab even if the crawl failed half-way.
        browser.close()
# 获取人群溢价数据
# XPath of the crowd table container and of its two row sets: the frozen
# (left) columns and the horizontally scrollable metric columns.
_CROWD_TABLE_XPATH = (
    '//div[@class="manage-common-view-list"]'
    '//div[@class="freeze-table bp-table"]'
)
_FROZEN_ROWS_XPATH = (
    _CROWD_TABLE_XPATH
    + '//div[@class="freeze-col"]//table[@class="bp-table"]/tbody/tr'
)
_METRIC_ROWS_XPATH = (
    _CROWD_TABLE_XPATH
    + '//div[@class="table-content osx custom-scrollbar"]'
    + '//table[@class="bp-table"]/tbody/tr'
)

# Cells read from each metric row: (CSV column, XPath relative to the row).
_DETAIL_CELLS = (
    ('人群分类', './td[1]/div'),
    ('溢价', './td[2]/div/span/span'),
    ('展现', './td[3]'),
    ('点击', './td[4]'),
    ('CTR', './td[5]'),
    ('花费', './td[6]'),
    ('CPC', './td[7]'),
    ('加购数', './td[8]'),
    ('转化率', './td[9]'),
    ('成交金额', './td[10]'),
    ('ROI', './td[11]'),
    ('成交订单', './td[12]'),
)

# Fixed column order, so chunks appended without a header always line up.
_CSV_COLUMNS = ['标题', '产品', '序号', '状态', '推广人群', '对应序号'] + [
    column for column, _ in _DETAIL_CELLS
]


def _crowd_detail_urls(start_date, end_date):
    """Return the crowd-tab detail URLs to crawl, in a fixed order."""
    # (campaignId, adGroupId) pairs of the ad groups to crawl.
    ad_groups = (
        ('66281901', '1745413653'),
        ('66281901', '1745413655'),
        ('66281901', '1745413656'),
        ('66281901', '1745413657'),
        ('66281901', '1850331434'),
        ('66281901', '1877620378'),
        ('66173371', '1733518315'),
        ('66173370', '1734092022'),
        ('66173370', '1734092023'),
        ('66157385', '1732352485'),
        ('66157385', '1732352486'),
        ('66157385', '1732352487'),
        ('66157385', '1732352489'),
        ('66157385', '1855117081'),
        ('66157385', '1881028817'),
        ('66281882', '1732330386'),
        ('66145329', '1739717496'),
        ('66145329', '1739717497'),
    )
    return [
        'https://subway.simba.taobao.com/#!/manage/adgroup/detail'
        f'?productId=101001005&tab=crowd&campaignId={campaign_id}'
        f'&adGroupId={ad_group_id}&start={start_date}&end={end_date}'
        '&rptType=realTime&adpage=1'
        for campaign_id, ad_group_id in ad_groups
    ]


def _first_text(node, path):
    """Return the text of the first element matching ``path``, or None."""
    matches = node.xpath(path)
    return matches[0].text if matches else None


def _parse_crowd_rows(page):
    """Build one record per crowd row that shows a status cell."""
    title = _first_text(
        page,
        '//*[@id="magix_vf_main"]/article/div[@class="grid-p clearfix"]'
        '//ul/li[5]/div/div/span[1]',
    )
    product = _first_text(
        page,
        '//*[@id="magix_vf_main"]/article//div[@class="adgroup-detail"]/p//a',
    )

    # Index the metric rows by their "index" attribute so that every frozen
    # row finds its counterpart with a single lookup. When several metric rows
    # share an index the last one wins.
    metrics_by_index = {}
    for metric_row in page.xpath(_METRIC_ROWS_XPATH):
        index_attr = metric_row.xpath('./@index')
        if index_attr and metric_row.xpath('./td[1]/div'):
            metrics_by_index[index_attr[0]] = metric_row

    records = []
    for frozen_row in page.xpath(_FROZEN_ROWS_XPATH):
        # Rows without a status cell are not crowd entries; skip them.
        if not frozen_row.xpath('./td[2]/strong'):
            continue
        index_attr = frozen_row.xpath('./@index')
        row_index = index_attr[0] if index_attr else None
        record = {
            '标题': title,
            '产品': product,
            '序号': row_index,
            '状态': _first_text(frozen_row, './td[2]/strong'),
            '推广人群': _first_text(frozen_row, './td[3]/div[1]/span'),
        }
        metric_row = metrics_by_index.get(row_index)
        if metric_row is not None:
            record['对应序号'] = row_index
            for column, cell_path in _DETAIL_CELLS:
                record[column] = _first_text(metric_row, cell_path)
        records.append(record)
        # f-string instead of "+": a cell without text must not abort the run.
        print(f"已获取:{record['标题']}-{record['产品']}-{record['推广人群']}")
    return records


def get_CrowdDetail(browser, wait):
    """Scrape the crowd-premium table of every ad group into ``./dd.csv``.

    For each detail URL the page is loaded in ``browser``, ``wait`` blocks
    until the metric table has at least one row, and the rendered page source
    is parsed with lxml. Each page's records are appended to ``./dd.csv``
    right away, so pages already crawled are kept if a later page fails.

    The report window runs from seven days ago up to yesterday.

    Args:
        browser: Selenium WebDriver attached to the logged-in browser.
        wait: ``WebDriverWait`` bound to ``browser``.

    NOTE(review): the original indentation of this function was lost; this
    version emits one record per frozen row that has a status cell, merged
    with the metric row carrying the same ``index`` attribute - confirm.
    """
    # Read the clock once so both bounds come from the same day.
    today = date.today()
    start_date = (today - timedelta(days=7)).strftime("%Y-%m-%d")
    end_date = (today - timedelta(days=1)).strftime("%Y-%m-%d")
    print('爬取日期:' + start_date + '至' + end_date)

    output_path = './dd.csv'
    for url in _crowd_detail_urls(start_date, end_date):
        browser.get(url)
        wait.until(EC.presence_of_element_located(
            (By.XPATH, _METRIC_ROWS_XPATH + '/td[3]')
        ))
        print('当前访问网址:' + url)
        # Short pause before taking the page-source snapshot.
        time.sleep(1.5)

        records = _parse_crowd_rows(etree.HTML(browser.page_source))
        frame = pd.DataFrame(records, columns=_CSV_COLUMNS)

        # Header and BOM belong at the start of the file only. "utf-8-sig"
        # emits a BOM each time the file is opened, so appends use plain
        # "utf-8" to avoid stray BOM characters in the middle of the data.
        is_new_file = (
            not os.path.exists(output_path)
            or os.path.getsize(output_path) == 0
        )
        frame.to_csv(
            output_path,
            index=False,
            encoding='utf-8-sig' if is_new_file else 'utf-8',
            mode='a',
            header=is_new_file,
        )

        # Pause between pages (same total delay as before: 1.5 s + 1 s).
        time.sleep(2.5)
        print('已保存该数据到本地dd.csv文件夹')
if __name__ == "__main__":
    # Run the crawl only when executed as a script, then report completion.
    main()
    print("结束---------------------------------")
import tkinter
def close():
    """Destroy the window held in the module-level ``root`` variable."""
    root.destroy()
def time_count():
    """Show a small always-on-top window with the account and password texts.

    The window is stored in the module-level ``root`` so that ``close`` can
    destroy it. The call returns once the Tk main loop ends.
    """
    global root

    # Fixed-size 200x110 window that stays above other windows.
    root = tkinter.Tk()
    root.wm_attributes('-topmost', 1)
    root.title('我只是一个提示框')
    root.configure(width=200)
    root.configure(height=110)
    root.resizable(False, False)

    # Two stacked text boxes: account on top, password below it.
    for top_offset, content in ((10, '你的账号'), (40, '你的密码')):
        text_box = tkinter.Text(root, width=50)
        text_box.place(x=10, y=top_offset, width=180, height=30)
        text_box.insert('0.0', content)

    # Button that closes the window through close().
    close_button = tkinter.Button(root, text="关闭", command=close)
    close_button.place(x=70, y=75, width=50, height=30)

    root.mainloop()