Scrapy spider: simulated Taobao login and data scraping

(I used Chrome: on the Taobao login page, press F12 to open DevTools, switch to the Network tab, tick Preserve log, then click Log in. This records every parameter the login form submits.)
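
Before the spider will work, the Scrapy project itself needs cookies enabled (Scrapy's default) and robots.txt checking disabled, since Taobao's robots.txt disallows these pages. A minimal settings.py sketch, assuming a standard Scrapy project named tao_bao_data (the project name is taken from the import in the spider below); the spider file itself follows:

# settings.py (sketch: only the settings relevant to this spider)
BOT_NAME = 'tao_bao_data'
SPIDER_MODULES = ['tao_bao_data.spiders']
NEWSPIDER_MODULE = 'tao_bao_data.spiders'

ROBOTSTXT_OBEY = False   # Taobao's robots.txt would otherwise block these URLs
COOKIES_ENABLED = True   # Scrapy's default; the login session lives in cookies
DOWNLOAD_DELAY = 2       # optional: slow down to look less like a bot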

# -*- coding: utf-8 -*-

import scrapy
import re
import json
import time
from tao_bao_data.items import TaoBaoDataItem

class TaobaodataSpider(scrapy.Spider):
    # spider name
    name = 'TaoBaoData'
    # domains the spider is allowed to crawl
    allowed_domains = ['taobao.com', 'passport.alibaba.com']
    # URL used for the simulated login
    login_url = 'https://login.taobao.com/member/login.jhtml'
    # URL of the data to scrape (the "bought items" order list)
    crawl_urls = 'https://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm'

    # Note: this User-Agent (Firefox 35) does not match the Chrome values in
    # 'naviVer'/'osAV' inside post_data below; you may want to align them
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'Keep-Alive'
    }

    # username submitted with the simulated login
    username = '(your Taobao username or shop name here)'
    # form data for the simulated login; the token-like values below
    # (ncoToken, um_token, ua, TPL_password_2, ...) were captured from one
    # DevTools session and expire, so re-capture them before running
    post_data = {
    'TPL_username': username,
    'TPL_password': '',
    'ncoSig': '',
    'ncoSessionid': '',
    'ncoToken': 'd801d3c69349',
    'slideCodeShow': 'false',
    'useMobile': 'false',
    'lang': 'zh_CN',
    'loginsite': '0',
    'newlogin': '0',
    'TPL_redirect_url': 'http://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm?spm=a1z02.1.a2109.d1000368.5d3a782dr6KjrH&nekot=1470211439694',
    'from': 'tb',
    'fc': 'default',
    'style': 'default',
    'css_style': '',
    'keyLogin': 'false',
    'qrLogin': 'true',
    'newMini': 'false',
    'newMini2': 'false',
    'tid': '',
    'loginType': '3',
    'minititle': '',
    'minipara': '',
    'pstrong': '',
    'sign': '',
    'need_sign': '',
    'isIgnore': '',
    'full_redirect': '',
    'sub_jump': '',
    'popid': '',
    'callback': '',
    'guf': '',
    'not_duplite_str': '',
    'need_user_id': '',
    'poy': '',
    'gvfdcname': '10',
    'gvfdcre': '68747470733A2F2F6C6F67696E2E74616F62616F2E636F6D2F6D656D6265722F6C6F676F75742E6A68746D6C3F73706D3D61317A30392E322E37353438',
    'from_encoding': '',
    'sub': '',
    'TPL_password_2': '9b8f47092a216b0df4f68ee751c65ba430627e81b09029f29be8d6d1e24b62b8338222b95e759f9877f0051e096ae285181621f1',
    'loginASR': '1',
    'loginASRSuc': '1',
    'allp': '',
    'oslanguage': 'zh-CN',
    'sr': '1920*1080',
    'osVer': 'windows|6.1',
    'naviVer': 'chrome|67.0339687',
    'osACN': 'Mozilla',
    'osAV': '5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    'osPF': 'Win32',
    'miserHardInfo': '',
    'appkey': '00000000',
    'nickLoginLink': '',
    'mobileLoginLink': 'https://login.taobao.com/member/login.jhtml?redirectURL=http://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm?spm=a1z02.1.a2109.d1000368.5d3a782dr6KjrH&nekot=1470211439694&useMobile=true',
    'showAssistantLink': '',
    'um_token': 'HV02PAAZ0bb0767c3af',
    'ua': 'rQcXNgSTHZhdvYEp94Q9LWm3tf/rWXTklo5KCcpiO9WwblFoikTWZTfZQ7wfTsnnTb8z6gm8TsTTJ7ZyUxxBEdKEiqnZosTTb8r26zmTnsZwpPVsSHbFSbBM/qwfzTTBrT5S6K+aTjsnj6UfP/T2Tj+teh9f8plTIb826zmgsjTQTVj1vhovBOLEukvAHyptk38gOP4Tth2VR3CpC+jJ+IPZXx71zeO8I8'

    }
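    # Note: 'gvfdcre' above is just a referrer URL hex-encoded; the bytes
    # decode to https://login.taobao.com/member/logout.jhtml?spm=a1z09.2.7548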

    # strip HTML tags and stray whitespace from a scraped value
    def re_html(self, data):
        try:
            message = str(data)
            re_h = re.compile(r'<[^>]*>')  # matches any HTML tag
            ret = re_h.sub('', message)
            ret = re.sub(r'\n', '', ret)      # drop newlines
            ret = re.sub(r'\u3000', '', ret)  # drop full-width spaces
            return ret
        except Exception:
            return data
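
    # Hypothetical usage (re_html is defined but never called below; the
    # original author presumably kept it for cleaning scraped fields):
    #   self.re_html('<td>iPhone 8</td>\n')  ->  'iPhone 8'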

    # step 1: fetch the login page
    def start_requests(self):
        yield scrapy.Request(
            url=self.login_url,
            # the cookiejar meta key keeps one cookie session across the flow
            meta={'cookiejar': 1},
            headers=self.headers,
            callback=self.post_login
        )

    # step 2: submit the login form
    def post_login(self, response):
        # from_response pre-fills the hidden fields found in the login page's
        # form; our post_data then overrides and extends them
        yield scrapy.FormRequest.from_response(
            response=response,
            method='POST',
            meta={'cookiejar': response.meta['cookiejar']},
            formdata=self.post_data,
            callback=self.search_token,
            dont_filter=True
        )

    # step 3: after a successful login, pull the "nekot" token out of the
    # response, then request mini_apply_st.js with it to skip verification
    def search_token(self, response):
        data = str(response.body)
        nekot = re.search(r'&nekot=.*?}', data).group()
        nekot = re.sub(r'&', '', nekot)
        nekot = re.sub(r'nekot=', '', nekot)
        nekot = re.sub(r'\'}', '', nekot)
        nekot = re.sub(r'\\', '', nekot)
        token_url = ('https://passport.alibaba.com/mini_apply_st.js'
                     '?site=0&token=%s&callback=stCallback6' % nekot)

        yield scrapy.Request(
            url=token_url,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_crawl_url
        )

    # step 4: request the order-list page we actually want to scrape
    def parse_crawl_url(self, response):
        yield scrapy.Request(
            url=self.crawl_urls,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_data
        )

    # step 5: the order list is embedded in the page as a JSON blob;
    # extract it, parse it, and fill one item per order
    def parse_data(self, response):
        node_data = str(response.body.decode('unicode-escape'))
        re_data0 = re.search(r'{"error":"","extra".*?"type":"t3"}]}', node_data)
        if re_data0 is None:
            self.logger.error('order JSON not found; login probably failed')
            return
        order_data = json.loads(re_data0.group())['mainOrders']

        for data in order_data:
            item = TaoBaoDataItem()
            item['下单时间'] = data['orderInfo']['createDay']  # order date
            item['订单号'] = data['id']                        # order id
            item['店铺名'] = data['seller']['shopName']        # shop name
            item['店铺链接'] = data['seller']['shopUrl']       # shop URL
            item['付款金额'] = data['payInfo']['actualFee']    # amount paid
            print(item)
            yield item
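
The spider imports TaoBaoDataItem from tao_bao_data.items, but the original post does not show items.py. A minimal sketch that matches the field names assigned above (the Chinese attribute names are valid Python 3 identifiers):

# items.py (sketch: not shown in the original post)
import scrapy

class TaoBaoDataItem(scrapy.Item):
    下单时间 = scrapy.Field()  # order creation date
    订单号 = scrapy.Field()    # order id
    店铺名 = scrapy.Field()    # shop name
    店铺链接 = scrapy.Field()  # shop URL
    付款金额 = scrapy.Field()  # amount actually paid

With that in place, run the spider from the project root with scrapy crawl TaoBaoData (the name defined at the top of the class).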
