裁判文书网APP数据的抓取

       第一次爬取APP数据,记录一下:裁判文书网APP的数据可以抓取到,但得到的是加密之后的数据。下面是源码:


import uuid
import datetime
import requests
import json
import hashlib
import random
from random import choice
from court.ip_lists import ip_lists
import time
import pymysql
def devid():
    """Return a fresh 32-character hex device id (a UUID1 with hyphens removed)."""
    # UUID.hex is exactly str(uuid).replace('-', '').
    return uuid.uuid1().hex
def timespan():
    """Return the current local time as a 14-digit 'YYYYMMDDHHMMSS' string."""
    return datetime.datetime.now().strftime('%Y%m%d%H%M%S')
def nonce():
    """Return a random 4-character nonce drawn from lowercase letters and digits."""
    alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789'
    return ''.join(random.choice(alphabet) for _ in range(4))
def signature(timespan, nonce, devid):
    """Return the request signature: MD5 hex digest of timespan + nonce + devid."""
    payload = ''.join((timespan, nonce, devid))
    return hashlib.md5(payload.encode('utf-8')).hexdigest()
def token():
    """Fetch a request token from the GetToken endpoint.

    Builds the signed headers from ONE set of (timespan, nonce, devid)
    values so the MD5 signature matches the header fields the server
    verifies it against.

    Returns:
        The token string parsed from the JSON response.
    """
    times = timespan()
    dev = devid()
    nonc = nonce()
    url = 'http://wenshuapp.court.gov.cn/MobileServices/GetToken'
    headers = {
        'Content-Type': 'application/json',
        # Bug fix: the original called timespan()/nonce()/devid() again here,
        # so the header values never matched the ones hashed into 'signature'.
        'timespan': times,
        'nonce': nonc,
        'devid': dev,
        'signature': signature(times, nonc, dev),
        'User-Agent': 'Dalvik/1.6.0 (Linux; U; Android 4.4.2; vivo X20A Build/NMF26X)',
        'Host': 'wenshuapp.court.gov.cn',
        'Connection': 'Keep-Alive',
        'Accept-Encoding': 'gzip',
    }
    # Bug fix: the original wrote '%(devid)', interpolating the *function
    # object* ("<function devid at 0x...>") instead of the generated id.
    data = '{"app":"cpws","devid":"%s","apptype":"1"}' % dev
    html = requests.post(url=url, headers=headers, data=data,
                         proxies={'http': choice(ip_lists)})
    token = json.loads(html.text)['token']
    print(token)
    return token

def parse():
    """Hit GetAddCountAndTotalAndPVCount once, retrying until a non-empty body arrives.

    Prints the (encrypted) response body and returns None. Uses a single
    consistent (timespan, nonce, devid) triple for headers and signature.
    """
    while 1:
        try:
            url = 'http://wenshuapp.court.gov.cn/MobileServices/GetAddCountAndTotalAndPVCount'
            times = timespan()
            dev = devid()
            nonc = nonce()
            toke = token()
            headers = {
                'Content-Type': 'application/json',
                'timespan': times,
                'nonce': nonc,
                'devid': dev,
                'signature': signature(times, nonc, dev),
                'User-Agent': 'Dalvik/1.6.0 (Linux; U; Android 4.4.2; vivo X20A Build/NMF26X)',
                'Host': 'wenshuapp.court.gov.cn',
                'Connection': 'Keep-Alive',
                'Accept-Encoding': 'gzip',
                # Bug fix: the hard-coded 'Content-Length': '60' was wrong for
                # the actual payload size; requests computes it from the body.
            }
            data = '{"app":"cpws","reqtoken":"%s"}' % toke
            data = data.encode('utf-8')
            html = requests.post(url=url, headers=headers, data=data, timeout=6)
            if html.text == '':
                print('html.text=null')
                time.sleep(2)
                continue
            print(html.text)
            return
        except Exception as e:
            print(e)
            # Back off briefly instead of hammering the endpoint in a tight loop.
            time.sleep(1)
            continue



def stare_request(start=0, total=118876):
    """Page through GetLawListData for cases uploaded 2018-06-21..2018-06-22.

    Args:
        start: skip offset to resume from (default 0); advances 20 per page.
        total: stop once the offset reaches this many records (default is the
            known result count for this date window).
    """
    i = start
    while i < total:
        try:
            times = timespan()
            dev = devid()
            nonc = nonce()
            headers = {
                'Content-Type': 'application/json',
                'timespan': times,
                'nonce': nonc,
                'devid': dev,
                'signature': signature(times, nonc, dev),
                'User-Agent': 'Dalvik/1.6.0 (Linux; U; Android 4.4.2; vivo X20A Build/NMF26X)',
                'Host': 'wenshuapp.court.gov.cn',
                'Connection': 'Keep-Alive',
                'Accept-Encoding': 'gzip',
                # Content-Length is computed by requests from the body.
            }
            toke = token()
            data = '{"dicval":"asc","reqtoken":"%s",' \
                   '"condition":"/CaseInfo/案/@上传日期=[2018-06-21 TO 2018-06-22]",' \
                   '"skip":"%s","app":"cpws","limit":"20","dickey":"/CaseInfo/案/@法院层级"}' % (toke, str(i))
            data = data.encode('utf-8')
            url = 'http://wenshuapp.court.gov.cn/MobileServices/GetLawListData'
            html = requests.post(url=url, headers=headers, data=data, timeout=6,
                                 proxies={'http': choice(ip_lists)})
            # Bug fix: check the HTTP status before inspecting the body text.
            if html.status_code != 200:
                time.sleep(0.5)
                continue
            if html.text == '':
                print('html.text=null')
                time.sleep(0.5)
                continue
            if 'Request Error' in html.text:
                print('Request Error')
                time.sleep(0.5)
                continue
            print('爬取到第几条数据:', i)
            i += 20
        except Exception as e:
            print(e)
            # Back off on failure; the original retried in a tight loop.
            time.sleep(0.5)
            continue
if __name__ == '__main__':
    # Smoke-test one signed count request, then page through the listing.
    parse()
    stare_request()

       请求的时候一次不一定能成功,多重试几次就可以获得数据了。

数据解密的关键字段是 toke、times、uuid 和返回的数据。解密的方法这里就不展开了。

但是在爬取的时候感觉还是爬取web版的要快一点,可能还是因为对APP这个不是太熟悉的吧。

你可能感兴趣的:(python爬虫,裁判文书,APP抓取)