A crawler for China's national enterprise credit information system (gsxt.gov.cn), using an external captcha-solving service

The JavaScript is executed in Java's JavaScript engine, and Python calls into the Java objects through jpype. I tried quite a few Python JS libraries first: PyV8 is too troublesome to install, and their support for JavaScript's eval function is poor, so in the end I went with the Java engine.

The Java side is packaged as a jar or plain class files; Java 1.8 is used (whose bundled JavaScript engine is Nashorn).
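
A minimal sketch of the Python → Java → JavaScript round trip (the class path matches the project layout used below; the [104,105] payload is only an illustration, it decodes to "hi"):

#coding:UTF-8
import jpype

# boot the JVM with the compiled GovTest class on the class path
jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=/code/java/forpython/target/classes/")
GovTest = jpype.JClass("com.GovTest")
gov = GovTest()
# getImageGif evaluates its argument as a JS array of char codes: [104,105] -> "hi"
print gov.getImageGif("[104,105]")
jpype.shutdownJVM()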

#coding:UTF-8
import json
import re
import threading
import time

import jpype
import redis
import requests
from bs4 import BeautifulSoup
from jpype import *

jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=/code/java/forpython/target/classes/")

class SearchItem(threading.Thread):
    def getGTChallenge(self):
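        # The first GET hands back a jsl anti-bot JavaScript challenge instead
        # of JSON; solving it yields the __jsl_clearance cookie, and only then
        # does the same URL return the geetest challenge parameters.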
        print "getGTChallenge start"
        loginurl="http://www.gsxt.gov.cn/SearchItemCaptcha"
        result=self.session.get(loginurl)
        if "y.replace(" not in result.text:
            raise Exception("被屏蔽了")
        mycookies= result.cookies
        jpype.attachThreadToJVM()
        jpype.isThreadAttachedToJVM()
        A = jpype.JClass("com.GovTest")
        self.Aobj=A()
        fu=self.Aobj.challenge(result.text)
        print "fu="+fu
        jslarr= fu.split("=")
        jsl_clearance=jslarr[1]
        self.session.cookies['__jsl_clearance']=jsl_clearance
        result=self.session.get(loginurl)
        challengeJson=json.loads(result.text)
        return  challengeJson

    def getImageGif(self):
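        # The "gif" endpoint really returns a JS array of character codes;
        # decoding it in the engine exposes "location_info = <digits>;".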
        print "getImageGif start"
        url="http://www.gsxt.gov.cn/corp-query-custom-geetest-image.gif?v="
        # crude cache-buster built from the current minute and second
        localTime = time.localtime(time.time())
        url = url + str(localTime.tm_min + localTime.tm_sec)
        resp = self.session.get(url)
        decoded = self.Aobj.getImageGif(resp.text)
        matchObj = re.search('location_info = (\d+);', decoded)
        if matchObj:
            return matchObj.group(1)
        else:
            raise Exception("location_info not found")

    def getValidateInput(self,location_info):
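        # Exchange location_info for the search token: the response embeds
        # "value: <digits>}", which is XORed with 536870911 below.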
        print "getValidateInput start"
        url="http://www.gsxt.gov.cn/corp-query-geetest-validate-input.html?token="+location_info
        resp=self.session.get(url)
        # same char-code decoding trick as the image endpoint
        decoded = self.Aobj.getImageGif(resp.text)
        matchObj = re.search('value: (\d+)}', decoded)
        if matchObj:
            # 536870911 == 0x1FFFFFFF (2**29 - 1), so this flips the low 29
            # bits: e.g. 1234 ^ 536870911 == 536869677
            token = int(matchObj.group(1)) ^ 536870911
            print "token=", token
            return str(token)
        else:
            raise Exception("validate value not found")

    def searchTest(self,keyword):
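        # Warm-up request that the site's front end fires before the real
        # search; kept in the flow here for the same reason.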
        print "searchTest start"
        url="http://www.gsxt.gov.cn/corp-query-search-test.html?searchword="+keyword
        resp = self.session.get(url)
        print "searchTest ", resp.text

    def jianYan(self,challengeJson):
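        # Hand gt and challenge to the external captcha-solving service; it
        # returns the challenge/validate pair needed by the geetest form.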
        print "jianYan start"
        url="http://jiyanapi.c2567.com/shibie?user=你的账号&pass=你的密码>="+challengeJson["gt"]+"&challenge="+challengeJson["challenge"]+"&referer=http://www.gsxt.gov.cn&return=json&format=utf8"
        sess=requests.session()
        resp=sess.get(url);
        jiyanJson=  json.loads(resp.text)
        print resp.text
        return jiyanJson

    def querySearch(self,jiYanJson,token,keyword):
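        # Submit the search together with the solved captcha; geetest_seccode
        # is validate + "|jordan", mirroring what the page's own JS sends.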
        print "querySearch start"
        url="http://www.gsxt.gov.cn/corp-query-search-1.html"
        postData={
            'tab':'ent_tab',
            'province':'',
            'geetest_challenge':jiYanJson['challenge'],
            'geetest_validate':jiYanJson['validate'],
            'geetest_seccode':jiYanJson['validate']+'|jordan',
            'token':token,
            'searchword':keyword
        }
        resp=self.session.post(url,postData)
        return resp.text ,postData

    def dealPageUrl(self,html):
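        # List the detail-page links on the first result page and count the
        # pager links in #pageForm to know how many pages to fetch.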
        print "dealPageUrl start"
        soup = BeautifulSoup(html,"html.parser")
        urlsItem=soup.find_all("a",class_="search_list_item db")
        pageNums=0
        for urlItem in urlsItem:
            print "urlItem['href']=",urlItem['href']
        if len(urlsItem)>1:
            pageForm=soup.find_all(id="pageForm")
            tabAs=pageForm[0].find_all("a",text=re.compile("\d+"))
            pageNums=len(tabAs)
        return pageNums

    def dealPageUrlNum(self,pageNums,postData):
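        # Walk the remaining result pages, reusing the validated form data.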
        print "dealPageUrlNum start"
        url="http://www.gsxt.gov.cn/corp-query-search-advancetest.html"
        for i in range(pageNums):
            postData['page'] = i + 1
            resp = self.session.get(url, params=postData)
            soup = BeautifulSoup(resp.text, "html.parser")
            urlsItem = soup.find_all("a", class_="search_list_item db")
            for urlItem in urlsItem:
                print "urlItem['href']=", urlItem['href']

    def getCorpUrl(self):
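        # Full pipeline for one keyword: clearance cookie -> location_info ->
        # token -> captcha solve -> search -> paginate.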
        # note: requests ignores a session-level timeout attribute; pass
        # timeout= on individual requests if you need one
        self.session.max_redirects = 1
        if self.proxy:
            self.session.proxies = {"http": "http://" + self.proxy,
                                    "https": "http://" + self.proxy}
        headers={'Host': 'www.gsxt.gov.cn',
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                 'Accept-Encoding': 'gzip, deflate',
                 'Referer': 'http://www.gsxt.gov.cn/SearchItemCaptcha',
                 'Connection': 'keep-alive',
                 'Upgrade-Insecure-Requests': '1',
                 'Cache-Control': 'max-age=0, no-cache'}
        self.session.headers=headers
        challengeJson = self.getGTChallenge()
        location_info = self.getImageGif()
        token = self.getValidateInput(location_info)
        self.searchTest(self.keyword)
        jiyanJson = self.jianYan(challengeJson)
        html, postData = self.querySearch(jiyanJson, token, self.keyword)
        pageNums = self.dealPageUrl(html)
        print 'pageNums=', pageNums
        self.dealPageUrlNum(pageNums, postData)
        return 1

    def run(self):
        try:
            self.getCorpUrl()
        except Exception,e:
            print "run exception ",e.message
        self.session.close()
        self.semaphore.release()
        print "search Item run finish"

    def __init__(self, keyword, proxy, semaphore):
        threading.Thread.__init__(self)
        # one session per thread; a shared class-level session would be
        # closed by whichever thread finishes first
        self.session = requests.session()
        self.keyword = keyword
        self.proxy = proxy
        self.semaphore = semaphore

semaphore = threading.Semaphore(1)
while 1:
    try:
        semaphore.acquire()
        t1 = SearchItem("百度", None, semaphore)
        t1.start()
    except Exception, e:
        print 'main e.message:\t', e.message
    time.sleep(1)

The Java code that drives the JavaScript engine:

package com;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;

public class GovTest {
    private ScriptEngine scriptEngine;

    public GovTest() {
        ScriptEngineManager scriptEngineManager = new ScriptEngineManager();
        this.scriptEngine = scriptEngineManager.getEngineByName("JavaScript");
    }

    public String challenge(String resp){
        // strip the leading "<script>" tag (8 characters)
        resp = resp.substring(8);
        // keep the script body up to (and including) the "x';" marker,
        // dropping the trailing markup
        String tmp[] = resp.split("x';");
        // expose the raw anti-bot script to the JS engine as "aaa"
        scriptEngine.put("aaa", tmp[0] + "x';");
        // Rewrite the script so it runs without a browser DOM, then evaluate
        // the cookie-building function and capture its return value (dc).
        String script =
                "aaa=aaa.replace(\"h=h.firstChild.href;\",\"h='http://www.gsxt.gov.cn/';\");" +
                "aaa=aaa.replace(\"while(window._phantom||window.__phantomas){};\",\"\");" +
                "var bbb=aaa.split(\"setTimeout\");" +
                "aaa=bbb[0]+\"return dc;}}\";" +
                "aaa=aaa.replace(\"var l=\",\"{fa:\");" +
                "var ffa=eval(\"(\"+aaa+\")\");" +
                "var fffa=ffa.fa();";
        System.out.println(script);
        try {
            scriptEngine.eval(script);
        } catch (ScriptException e) {
            return e.getMessage();
        }
        String fffa = (String) scriptEngine.get("fffa");
        System.out.println(fffa);
        return fffa;
    }
    public String getImageGif(String resp){
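        // Wrap the char-code array in a function that maps each code through
        // String.fromCharCode and joins the pieces into one string.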
        String script="function dd(){var json="+resp+";return json.map( function(item){ return String.fromCharCode(item);}).join('');}" +
                "var ggg=dd();";

        try {
            scriptEngine.eval(script);
        } catch (ScriptException e) {
            return e.getMessage();
        }
        String bbb = (String) scriptEngine.get("ggg");
        return bbb;
    }
    public static void main(String[] s){
        // smoke test without hitting the site: [104,105] decodes to "hi"
        System.out.println(new GovTest().getImageGif("[104,105]"));
    }
}

Note: this article is for study and reference only. Do not use it for any illegal purpose!
