(Using Google Chrome on the Taobao login page: press F12 to open DevTools, click the Network tab, check Preserve log, then click Log in. This records every parameter the login request submits.)
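Once Preserve log has captured the login request, you can also export everything via right-click on the request list → "Save all as HAR with content" and pull the form fields out programmatically instead of copying them by hand. A minimal sketch, assuming the export was saved as login.har (the filename is arbitrary):

import json

# Hypothetical HAR export from DevTools > Network > "Save all as HAR with content"
with open('login.har', encoding='utf-8') as f:
    har = json.load(f)

for entry in har['log']['entries']:
    request = entry['request']
    # Find the POST to the Taobao login endpoint and dump its form fields
    if request['method'] == 'POST' and 'login.jhtml' in request['url']:
        for param in request.get('postData', {}).get('params', []):
            print(param['name'], '=', param.get('value', ''))

The printed name/value pairs are exactly what goes into the post_data dict below.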
import scrapy
import re
import json
import time
from tao_bao_data.items import TaoBaoDataItem
class TaobaodataSpider(scrapy.Spider):
    # Spider name
    name = 'TaoBaoData'
    # Domains the spider is allowed to crawl
    allowed_domains = ['taobao.com', 'passport.alibaba.com']
    # URL used for the simulated login
    login_url = 'https://login.taobao.com/member/login.jhtml'
    # URL of the data to scrape (the "bought items" order list)
    crawl_urls = 'https://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'Keep-Alive'
    }
    # Username submitted with the simulated login
    username = '(your own Taobao username or shop name goes here)'
    # Form data submitted with the simulated login
    post_data = {
        'TPL_username': username,
        'TPL_password': '',
        'ncoSig': '',
        'ncoSessionid': '',
        'ncoToken': 'd801d3c69349',
        'slideCodeShow': 'false',
        'useMobile': 'false',
        'lang': 'zh_CN',
        'loginsite': '0',
        'newlogin': '0',
        'TPL_redirect_url': 'http://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm?spm=a1z02.1.a2109.d1000368.5d3a782dr6KjrH&nekot=1470211439694',
        'from': 'tb',
        'fc': 'default',
        'style': 'default',
        'css_style': '',
        'keyLogin': 'false',
        'qrLogin': 'true',
        'newMini': 'false',
        'newMini2': 'false',
        'tid': '',
        'loginType': '3',
        'minititle': '',
        'minipara': '',
        'pstrong': '',
        'sign': '',
        'need_sign': '',
        'isIgnore': '',
        'full_redirect': '',
        'sub_jump': '',
        'popid': '',
        'callback': '',
        'guf': '',
        'not_duplite_str': '',
        'need_user_id': '',
        'poy': '',
        'gvfdcname': '10',
        'gvfdcre': '68747470733A2F2F6C6F67696E2E74616F62616F2E636F6D2F6D656D6265722F6C6F676F75742E6A68746D6C3F73706D3D61317A30392E322E37353438',
        'from_encoding': '',
        'sub': '',
        'TPL_password_2': '9b8f47092a216b0df4f68ee751c65ba430627e81b09029f29be8d6d1e24b62b8338222b95e759f9877f0051e096ae285181621f1',
        'loginASR': '1',
        'loginASRSuc': '1',
        'allp': '',
        'oslanguage': 'zh-CN',
        'sr': '1920*1080',
        'osVer': 'windows|6.1',
        'naviVer': 'chrome|67.0339687',
        'osACN': 'Mozilla',
        'osAV': '5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
        'osPF': 'Win32',
        'miserHardInfo': '',
        'appkey': '00000000',
        'nickLoginLink': '',
        'mobileLoginLink': 'https://login.taobao.com/member/login.jhtml?redirectURL=http://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm?spm=a1z02.1.a2109.d1000368.5d3a782dr6KjrH&nekot=1470211439694&useMobile=true',
        'showAssistantLink': '',
        'um_token': 'HV02PAAZ0bb0767c3af',
        'ua': 'rQcXNgSTHZhdvYEp94Q9LWm3tf/rWXTklo5KCcpiO9WwblFoikTWZTfZQ7wfTsnnTb8z6gm8TsTTJ7ZyUxxBEdKEiqnZosTTb8r26zmTnsZwpPVsSHbFSbBM/qwfzTTBrT5S6K+aTjsnj6UfP/T2Tj+teh9f8plTIb826zmgsjTQTVj1vhovBOLEukvAHyptk38gOP4Tth2VR3CpC+jJ+IPZXx71zeO8I8'
    }
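    # NOTE: TPL_password_2, um_token, ncoToken and ua are session-bound values
    # copied from one captured login request; they expire quickly, so re-capture
    # them in DevTools shortly before each run. The meaning of most of these
    # fields is inferred from the capture, not from any documented API.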
    # Strip all HTML tags from scraped data
    def re_html(self, data):
        try:
            message = str(data)
            re_h = re.compile(r'</?\w+[^>]*>')  # matches opening and closing HTML tags
            ret1 = re_h.sub('', message)
            ret2 = re.sub(r'\n', '', ret1)      # drop newlines
            ret3 = re.sub(r'\u3000', '', ret2)  # drop full-width (ideographic) spaces
            return ret3
        except Exception:
            return None
    # Request the login page first
    def start_requests(self):
        yield scrapy.Request(
            url=self.login_url,
            meta={'cookiejar': 1},  # start a cookie jar so the session persists across requests
            headers=self.headers,
            callback=self.post_login
        )
    # Submit the login form
    def post_login(self, response):
        yield scrapy.FormRequest.from_response(
            response=response,
            method='POST',
            meta={'cookiejar': response.meta['cookiejar']},  # carry the session cookies forward
            formdata=self.post_data,
            callback=self.search_token,
            dont_filter=True
        )
    # After a successful login, extract the token, splice it into the
    # apply_st URL, and request it to skip the extra verification step
    def search_token(self, response):
        data0 = str(response.body)
        # The token is embedded in the page as "&nekot=...}"; peel off the
        # surrounding characters step by step
        nekot0 = re.search(r'&nekot=.*?}', data0)
        data1 = nekot0.group()
        nekot1 = re.sub(r'&', '', data1)
        nekot2 = re.sub(r'nekot=', '', nekot1)
        nekot3 = re.sub(r'\'}', '', nekot2)
        nekot4 = re.sub(r'\\', '', nekot3)
        token_URL = 'https://passport.alibaba.com/mini_apply_st.js?site=0&token=%s&callback=stCallback6' % nekot4
        yield scrapy.Request(
            url=token_URL,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_crawl_url
        )
    # Request the target URL and hand the response to the data parser
    def parse_crawl_url(self, response):
        yield scrapy.Request(
            meta={'cookiejar': response.meta['cookiejar']},
            url=self.crawl_urls,
            headers=self.headers,
            callback=self.parse_data
        )
    # Pull the order JSON out of the page and convert it into a dict
    def parse_data(self, response):
        node_data = response.body.decode('unicode-escape')
        # The order list is embedded in the page as a JSON blob
        re_data0 = re.search(r'{"error":"","extra".*?"type":"t3"}]}', node_data)
        order_data = json.loads(re_data0.group())['mainOrders']
        for data in order_data:
            # Build one item per order (keys must match the TaoBaoDataItem fields)
            item = TaoBaoDataItem()
            item['下单时间'] = data['orderInfo']['createDay']  # order date
            item['订单号'] = data['id']                        # order number
            item['店铺名'] = data['seller']['shopName']        # shop name
            item['店铺链接'] = data['seller']['shopUrl']       # shop URL
            item['付款金额'] = data['payInfo']['actualFee']    # amount paid
            print(item)  # debug output
            yield item   # hand the item to the item pipeline
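The spider imports TaoBaoDataItem from tao_bao_data.items, but that file is not shown in the post. A minimal sketch of what items.py would need to contain for the keys used in parse_data above (the module and class names come from the import; defining the fields with Chinese identifiers is an assumption based on the keys the spider assigns, and works because Python 3 allows Unicode identifiers):

import scrapy

class TaoBaoDataItem(scrapy.Item):
    # Hypothetical field definitions; names must match the keys used in parse_data
    下单时间 = scrapy.Field()  # order date
    订单号 = scrapy.Field()    # order number
    店铺名 = scrapy.Field()    # shop name
    店铺链接 = scrapy.Field()  # shop URL
    付款金额 = scrapy.Field()  # amount paid

With that in place, running `scrapy crawl TaoBaoData` sends each order through the normal item pipeline rather than just printing it.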