Java爬虫

因公司新业务行政执法建设需要,需对多个业务部门提供的目标网站相关行政复议文书进行爬取。

通过对多个目标网站的研究发现:不同目标网站的爬取方式并不一样。文书有pdf、doc等格式;有的网站可以随意下载,有的网站接口字段加密传参、需要通过接口解密处理,有的则需要通过解析网页元素处理。

导包


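<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.7.9</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.78</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.12</version>
</dependency>
<dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version>
</dependency>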

Service

/**
 * Crawler service that walks the listing pages reachable from
 * {@code SftjConstants.PageJsoupUrl}, downloads the PDF attachments linked from each
 * listed article and stores one record per attachment through
 * {@code AdministrativeDocumentsOriginalsMapper.batchUpsert}.
 *
 * Author: lyonardo
 * Date: 2021/10/13 09:49
 * Version: V1.0
 */
@Slf4j
@Service
public class SftjHttpSpiderServiceImpl implements SftjHttpSpiderService {
    // Directory prefix for the downloaded PDF files, injected from the
    // "admin_review_filePath" property. It is concatenated directly with the file name,
    // so it presumably has to end with a path separator - TODO confirm.
    @Value("${admin_review_filePath}")
    private String adminReviewFilePath;
    // Mapper used to persist the collected records in one batch per listing page.
    @Resource
    private AdministrativeDocumentsOriginalsMapper administrativeDocumentsOriginalsMapper;
     /**
      * Runs one crawl.
      *
      * The entry page is loaded, the inline scripts under ".f12" are split on ";" and the
      * statement containing "var countPage" is parsed for a number. For every article
      * link (".news .mf26 .overflow") the article page is loaded and each attachment link
      * (".attachments>ul>li>a") with a non-empty absolute URL is downloaded and recorded.
      *
      * All IOExceptions are caught and logged inside this method; nothing is rethrown.
      */
     @Override
     public void spiderSftj(){
          try {
              Document  pageDoc = JsoupUtil.buildDocument(SftjConstants.PageJsoupUrl);
              Elements pageElements = pageDoc.select(".f12>script");
              pageElements.forEach(x->{
                  // Look at every ";"-separated statement of the inline script.
                  String[] strings = x.data().split(";");
                  for (String s : strings){
                      if(s.contains("var countPage")){
                          // Despite its name, this holds the number extracted from the
                          // "var countPage" statement.
                          int pageSize = RegexUtils.getNumberFromString(s);
                          // NOTE(review): the next line is truncated in this copy of the source
                          // (the text between "i" and "list" is missing). Presumably it looped
                          // i over the pages, opened a try block, loaded each listing page into
                          // `doc` and declared `list` - confirm against the original file.
                          for(int i=1;i list = new ArrayList<>();
                                  Elements elements = doc.select(".news .mf26 .overflow");
                                  elements.forEach(
                                          y-> {
                                              Document doc1;
                                              try {
                                                  // Follow the article link and collect its attachment anchors.
                                                  doc1 = JsoupUtil.buildDocument(y.attr("abs:href"));
                                                  Elements elements1 = doc1.select(".attachments>ul>li>a");
                                                  elements1.forEach(t->{
                                                      log.debug("t.href::"+t.attr("abs:href"));
                                                      if(StringUtils.isNotEmpty(t.attr("abs:href"))){
                                                          FileUtil.createDirs(adminReviewFilePath);
                                                          // The file name is the article link's own text, so several
                                                          // attachments of one article map to the same path.
                                                       String content = PDFBoxUtil.readAndSavePdfUrl(t.attr("abs:href"),adminReviewFilePath+y.ownText()+".pdf");
                                                          list.add(AdministrativeDocumentsOriginals.build("天津市司法局",y.ownText(),content,
                                                                  adminReviewFilePath+y.ownText()+".pdf",y.select(".news-date").text(),null,2));
                                                      }
                                                  });
                                              } catch (IOException e) {
                                                  // The exception itself is not passed to the logger here.
                                                  log.error("JsoupUtil buildDocument");
                                              }
                                          }
                                  );
                                  try {
                                      // Persist everything collected for this listing page at once.
                                      administrativeDocumentsOriginalsMapper.batchUpsert(list);
                                  }catch (Exception e){
                                      e.printStackTrace();
                                      log.error("batchUpsert失败!!!");
                                  }
                              } catch (IOException e) {
                                  log.error("处理index页面失败==》本次爬虫失败!!!");
                              }
                          }
                      }
                   }
                 }
              );
          } catch (IOException e) {
              log.error("处理Document失败==》本次爬虫失败!!!");
          }
      }
}

另外一个网站,采用分批处理:

/***
 * @Description:
 * @Auther: lyonardo
 * @Date: 2021/09/24 11:08
 * @version : V1.0
 */
@Slf4j
@Component
public class CfwsHttpSpiderJob {
    @Autowired
    private DoCfwsHttpSpiderTask spiderTask;
    @Value("${partition_size}")
    private Integer partitionSize;
    @Scheduled(cron = "${punish_jobs_cron}")
    public void spiderCfws() {
        List> subQueryConditionList = Lists.partition(QueryConditionConstant.queryConditionList, partitionSize);
        for (List smallerList : subQueryConditionList) {
            try {
                spiderTask.doCfwsHttpSpider(smallerList);
            } catch (Exception e) {
                e.printStackTrace();
                log.info("爬虫失败");
            }
        }
    }
}
/**
 * Crawl task for one batch of query conditions: it fetches the result JSON for the
 * batch, saves the PDF carried by each result entry and upserts one record per saved
 * file. The task can be called directly through {@code doCfwsHttpSpider} or run as a
 * {@code Runnable}.
 *
 * Author: lyonardo
 * Date: 2021/09/29 16:09
 * Version: V1.0
 */
@Slf4j
@Component
public class DoCfwsHttpSpiderTask implements Runnable{
    // Mapper used to upsert the collected records in one batch.
    @Resource
    private AdministrativeDocumentsOriginalsMapper administrativeDocumentsOriginalsMapper;

    // Directory prefix for the saved PDF files; it is concatenated directly with the
    // file name, so it presumably has to end with a path separator - TODO confirm.
    @Value("${admin_punish_filePath}")
    public String adminPunishFilePath;

    // Injected from the "ciphertext" property; not read anywhere in the visible part of
    // this class.
    @Value("${ciphertext}")
    public String ciphertext ;

    // Pause after each batch, in minutes (multiplied by 60*1000 before Thread.sleep).
    @Value("${sleep_longtime}")
    public Integer sleepLongtime;

    // Batch processed by run().
    // NOTE(review): nothing in the visible part of this class assigns this field, so
    // run() would pass null - confirm whether a setter exists in the elided code.
    private List smallerList;

    /**
     * Runnable entry point: crawls the batch held in {@code smallerList}.
     *
     * Failures are logged rather than rethrown because {@code Runnable.run} cannot
     * throw checked exceptions. On interruption the thread's interrupt flag is restored
     * so the owning executor can observe it.
     */
    @Override
    public void run() {
        try {
            log.info("run thread...");
            doCfwsHttpSpider(smallerList);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.warn("doCfwsHttpSpider interrupted", e);
        } catch (Exception e) {
            log.error("doCfwsHttpSpider failed", e);
        }
    }

    /**
     * Crawls one batch and then pauses for {@code sleepLongtime} minutes.
     *
     * The whole call, including the pause, runs while holding this instance's monitor,
     * so concurrent callers are processed strictly one after another.
     *
     * @param smallerList the batch of query conditions to crawl
     * @throws InterruptedException if the thread is interrupted during the pause; the
     *         interrupt flag is restored before rethrowing
     * @throws Exception any other failure raised while crawling
     */
    public void doCfwsHttpSpider(List smallerList) throws Exception {
        synchronized (this) {
            doSpiderCfws(smallerList);
            DateTimeFormatter timestampFormat = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
            log.info("休眠"+sleepLongtime+"分钟==》"+LocalDateTime.now().format(timestampFormat));
            try {
                // long arithmetic: the int product overflows for pauses above ~35791 minutes
                Thread.sleep(sleepLongtime * 60L * 1000L);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw e;
            }
            log.info(sleepLongtime+"分钟后wake up==》"+LocalDateTime.now().format(timestampFormat));
        }
    }

    /**
     * Processes the JSON responses obtained for one batch of query conditions.
     *
     * Each non-null response string is parsed and the path
     * result -> queryResult -> resultList is walked. For an entry with a non-empty "i0"
     * and a non-empty "i7":
     *   - spaces in "i7" are replaced with "+" (presumably to restore Base64 text whose
     *     "+" characters were turned into spaces in transit - TODO confirm);
     *   - the target file is adminPunishFilePath + i0 + ".pdf", and the entry is only
     *     handled when FileUtil.isCreateNewFile returns true for that path;
     *   - "i3", "i1" and "i4" are read as punishmentAuthority, dateOfPunishment and
     *     typesOfPunishment, each defaulting to "" when absent;
     *   - "i7" is handed to PDFBoxUtil.readAndSavePdfStr together with the target path
     *     and the returned text is stored in the record.
     * An IOException for one entry is logged and that entry is skipped. All collected
     * records are then written with a single batchUpsert call, whose failure is logged
     * and not rethrown.
     *
     * NOTE(review): the final log statement of this method writes "爬虫失败" on every
     * call, including successful ones; it was probably meant for the empty-result
     * branch only - confirm.
     *
     * @param smallerList the batch of query conditions, passed on to resultList(...)
     */
    public void doSpiderCfws(List smallerList) {
        List resultStringList = resultList(smallerList);
        if(CollectionUtils.isNotEmpty(resultStringList)){
            List list = new ArrayList<>();
            resultStringList.forEach(x->{
                log.info("result==>{}",x);
                if(null != x){
                    JSONObject jsonQueryDocResult = JSONObject.parseObject(x);
                    if(null!=jsonQueryDocResult.getJSONObject("result")){
                        JSONObject jsonQueryDocResultObject = jsonQueryDocResult.getJSONObject("result");
                        if(null!=jsonQueryDocResultObject.getJSONObject("queryResult")){
                            JSONObject queryResult = jsonQueryDocResultObject.getJSONObject("queryResult");
                            if(null!=queryResult.getJSONArray("resultList")){
                                Integer resultCount = queryResult.getInteger("resultCount");
                                log.info("resultCount==>{}",resultCount);
                                JSONArray resultList = queryResult.getJSONArray("resultList");
                                // NOTE(review): the next line is truncated in this copy of the source.
                                // Judging by the closing braces below, the missing text held the loop
                                // condition, the declarations of `jsonObject` and `i0` and further
                                // nested conditions - confirm against the original file.
                                for (int i=0 ; i{}",i0);
                                                if(StringUtils.isNotEmpty(i0)){
                                                    String i7 = jsonObject.getString("i7");
                                                    if(null!=i7&&!"".equals(i7)){
                                                        try {
                                                            i7 = i7.replace(" ", "+");
                                                            FileUtil.createDirs(adminPunishFilePath);
                                                            String allFilePath = adminPunishFilePath + i0 + ".pdf";
                                                            if (FileUtil.isCreateNewFile(allFilePath)) {
                                                                String punishmentAuthority = "";
                                                                String dateOfPunishment = "";
                                                                String typesOfPunishment = "";
                                                                if (null != jsonObject.getString("i3")) {
                                                                    punishmentAuthority = jsonObject.getString("i3");
                                                                }
                                                                if (null != jsonObject.getString("i1")) {
                                                                    dateOfPunishment = jsonObject.getString("i1");
                                                                }
                                                                if (null != jsonObject.getString("i4")) {
                                                                    typesOfPunishment = jsonObject.getString("i4");
                                                                }
                                                                list.add(AdministrativeDocumentsOriginals.build(punishmentAuthority, i0, PDFBoxUtil.readAndSavePdfStr(i7, allFilePath), allFilePath,
                                                                        dateOfPunishment, typesOfPunishment, 1));
                                                            }
                                                        }catch (IOException e) {
                                                            log.info(i0+",PDF处理失败,此次爬虫失败!!!");
                                                            e.printStackTrace();
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            });
            try {
                log.info("list==>{}",list);
                log.info("batchUpsert:==>{}",administrativeDocumentsOriginalsMapper.batchUpsert(list));
            }catch (Exception e){
                e.printStackTrace();
                log.info("入库异常==》爬虫失败");
            }
        }
        log.info("爬虫失败");
    }
    ...
}

还有另外几个网站。

JDK等源码也有很多深嵌套,是if嵌套还是if反向判断throw new Exception,由团队代码风格和任务紧急度决定。

爬取数据最棘手的,一是面向监狱编程,二是IP池中IP的收集,三是加密破解,四是限流。

问题:前端防爬机制如何处理?

/**
 * Builds the anti-crawler "ciphertext" request parameter.
 *
 * The current millisecond timestamp is encrypted with DES3 using a fresh 24-character
 * random salt as the key and today's local date (yyyyMMdd) as the IV. The salt, the IV
 * and the encrypted text are concatenated and returned as space-separated binary
 * character codes.
 */
function cipher() {
	var now = new Date();
	var timestamp = now.getTime().toString();
	var salt = random(24);
	var monthNumber = now.getMonth() + 1;
	var dayNumber = now.getDate();
	// Zero-pad month and day so the IV is always eight characters long.
	var iv = now.getFullYear().toString()
		+ (monthNumber < 10 ? "0" + monthNumber : monthNumber.toString())
		+ (dayNumber < 10 ? "0" + dayNumber : dayNumber.toString());
	var enc = DES3.encrypt(timestamp, salt, iv).toString();
	return strTobinary(salt + iv + enc);
}
/**
 * Converts a string into the binary (base-2) representation of each UTF-16 code unit,
 * separated by single spaces. An empty string yields an empty string.
 */
function strTobinary(str) {
	var binaryCodes = str.split("").map(function (ch) {
		return ch.charCodeAt(0).toString(2);
	});
	return binaryCodes.join(" ");
}
function random (size){
	var str = "",
	arr = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
	for(var i=0; i

输入内容

验证码验证码不能为空

", btn:'确定', shade:0.8, btnAlign:'c', area:['400px','200px'], yes:function(){ var ciphertext = cipher(); var yzm = $("#yzm").val(); if(yzm==null || $.trim(yzm)=="" || yzm=="undefined"){ $('.yzmtip').show(); return false; } var url = window.localStorage.getItem("url"); var parameterMap = window.localStorage.getItem("parameterMap"); parameterMap=$.parseJSON(parameterMap); parameterMap.yzm=yzm; parameterMap.ciphertext=ciphertext; $.ajax({ type: "post", async: true, url: url, datatype:"json", data: parameterMap, success: function(data) { try{ data = $.parseJSON(data) }catch(e){ } if(data.code != -11 || !data.code){ //window.localStorage.setItem("result",JSON.stringify(data)); layer.closeAll(); window.location.reload(); return ; } if(data.code == -11 && data.description!=null && data.description!=""){ $(".yzmtip").html(""+data.description+""); $('.yzmtip').show(); } $("#imgcode").attr("src", "/captcha/captchaImage?key=antitheftImageCode&random="+Math.random()); }, error:function(data){ } }) } }) } if(obj.code =="-12"){ layer.msg("访问受限"); } } /** * 点击获取验证码 */ $(function(){ $(document).on('click','#imgcode',function(){ $("#imgcode").attr("src", "/captcha/captchaImage?key=antitheftImageCode&random="+Math.random()); }); })

现有后端破解代码

/**
 * DES / Triple-DES helper methods written while trying to reproduce a target site's
 * front-end "ciphertext" parameter on the server side.
 *
 * Author: lyonardo
 * Date: 2021/10/22 16:07
 * Version: V1.0
 */
public class CryptoUtil {
    // Charset name used when turning keys into bytes.
    private static String CODE_TYPE = "UTF-8";
    // NOTE(review): not referenced by any method in the visible part of this class.
    public static final String transformation = "DES/ECB/NOPadding";

    /**
     * Encrypts {@code datasource} with single DES (ECB mode, PKCS5 padding) and returns
     * the ciphertext as Base64 text.
     *
     * Both the key and the plaintext are converted to bytes with {@code CODE_TYPE}, so
     * the result does not depend on the platform default charset.
     *
     * @param KEY        key material; must encode to at least 8 bytes, only the first 8 are used
     * @param datasource plaintext to encrypt
     * @return Base64 ciphertext, or {@code null} if encryption fails for any reason
     */
    public String encode(String KEY, String datasource){
        try{
            DESKeySpec desKey = new DESKeySpec(KEY.getBytes(CODE_TYPE));
            SecretKey securekey = SecretKeyFactory.getInstance("DES").generateSecret(desKey);
            // Spell out mode and padding instead of relying on the provider default for "DES".
            Cipher cipher = Cipher.getInstance("DES/ECB/PKCS5Padding");
            cipher.init(Cipher.ENCRYPT_MODE, securekey);
            byte[] encrypted = cipher.doFinal(datasource.getBytes(CODE_TYPE));
            return new String(Base64.encodeBase64(encrypted), "UTF-8");
        }catch(Exception e){
            // This class has no logger; keep the existing best-effort contract of returning null.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Encrypts {@code text} with Triple DES in CBC mode (PKCS5 padding, which is
     * identical to PKCS7 for 8-byte blocks) and returns the ciphertext as Base64 text.
     *
     * This is the server-side counterpart of the page script's
     * {@code DES3.encrypt(timestamp, salt, iv)}; note the different argument order here
     * (key first, then text).
     * NOTE(review): CBC is assumed because the page script supplies an IV - confirm
     * against the site's DES3 implementation.
     *
     * @param key    key material; must encode to at least 24 bytes, only the first 24 are used
     * @param text   plaintext to encrypt
     * @param vector 8-byte initialisation vector; when {@code null} or empty, today's
     *               local date formatted as {@code yyyyMMdd} is used, as in the page script
     * @return Base64 ciphertext, or {@code null} if encryption fails (for example when
     *         the key is too short or the IV is not 8 bytes long)
     * @throws Exception declared for backward compatibility; failures are reported on
     *         stderr and signalled by a {@code null} result
     */
    public static String encrypt(String key,String text,String vector) throws  Exception {
        try {
            String iv = (vector == null || vector.isEmpty())
                    ? DateTimeFormatter.ofPattern("yyyyMMdd").format(LocalDateTime.now())
                    : vector;
            DESedeKeySpec spec = new DESedeKeySpec(key.getBytes("utf-8"));
            SecretKey secretKey = SecretKeyFactory.getInstance("DESede").generateSecret(spec);
            Cipher cipher = Cipher.getInstance("DESede/CBC/PKCS5Padding");
            cipher.init(Cipher.ENCRYPT_MODE, secretKey,
                    new javax.crypto.spec.IvParameterSpec(iv.getBytes("utf-8")));
            byte[] res = cipher.doFinal(text.getBytes("utf-8"));
            return java.util.Base64.getEncoder().encodeToString(res);
        } catch (Exception e) {
            // Keep the null-on-failure contract, but say what went wrong.
            System.err.println("CryptoUtil.encrypt failed: " + e);
        }
        return null;
    }

    /**
     * Creates an initialised single-DES cipher (ECB mode, PKCS5 padding) for the helpers
     * in this class.
     *
     * The key is built as a DES key, so the cipher has to be a DES cipher as well;
     * requesting a DESede cipher with a DES key fails at {@code init}.
     *
     * @param opmode {@code Cipher.ENCRYPT_MODE} or {@code Cipher.DECRYPT_MODE}
     * @param key    key material; must encode to at least 8 bytes, only the first 8 are used
     * @return the initialised cipher, or {@code null} if it could not be created
     */
    private static Cipher GetCipher(int opmode, String key) {
        try {
            // Fixed charset so the derived key does not depend on the platform default.
            DESKeySpec dks = new DESKeySpec(key.getBytes("UTF-8"));
            SecretKeyFactory keyFactory = SecretKeyFactory.getInstance("DES");
            Key secretKey = keyFactory.generateSecret(dks);
            Cipher cipher = Cipher.getInstance("DES/ECB/PKCS5Padding");
            cipher.init(opmode, secretKey);
            return cipher;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Encrypts {@code data} with the cipher from {@code GetCipher} and returns the
     * ciphertext as an upper-case hex string.
     *
     * NOTE(review): when encryption throws, the unencrypted input is returned as-is;
     * confirm that callers really want this fallback.
     *
     * @param data plaintext; {@code null} or empty yields {@code null}
     * @param key  key material handed to {@code GetCipher}
     * @return hex ciphertext, {@code null} when there is no input or no cipher could be
     *         created, or {@code data} itself when encryption throws
     */
    public static String encode1(String data , String key) {
        boolean nothingToEncrypt = (data == null) || data.isEmpty();
        if (nothingToEncrypt) {
            return null;
        }
        try {
            Cipher encryptor = GetCipher(Cipher.ENCRYPT_MODE, key);
            if (encryptor != null) {
                byte[] encrypted = encryptor.doFinal(data.getBytes("UTF-8"));
                return byteToHexString(encrypted);
            }
            return null;
        } catch (Exception e) {
            e.printStackTrace();
            return data;
        }
    }

    /**
     * Decodes the hex string {@code data} and decrypts it with the cipher from
     * {@code GetCipher}, returning the plaintext as UTF-8 text.
     *
     * @param data hex ciphertext; {@code null} or empty yields {@code null}
     * @param key  key material handed to {@code GetCipher}
     * @return the plaintext, {@code null} when there is no input or no cipher could be
     *         created, or {@code data} itself when hex decoding or decryption throws
     * @throws Exception declared by the signature; the body catches every exception itself
     */
    public static String decode1(String data , String key) throws Exception {
        if (data == null || data.isEmpty()) {
            return null;
        }
        try {
            byte[] cipherBytes = HexUtil.decodeHex(data.toCharArray());
            Cipher decryptor = GetCipher(Cipher.DECRYPT_MODE ,key);
            if (decryptor == null) {
                return null;
            }
            byte[] plainBytes = decryptor.doFinal(cipherBytes);
            return new String(plainBytes, "UTF-8");
        } catch (Exception e) {
            e.printStackTrace();
            return data;
        }
    }

    /**
     * Formats a byte array as an upper-case hex string, two digits per byte.
     *
     * @param bytes the bytes to format; must not be {@code null}
     * @return the hex representation, empty for an empty array
     */
    public static String byteToHexString(byte[] bytes) {
        final String hexDigits = "0123456789ABCDEF";
        // Two characters per byte; an unsynchronised builder is enough for a local.
        StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            int unsigned = b & 0xFF;
            hex.append(hexDigits.charAt(unsigned >>> 4));
            hex.append(hexDigits.charAt(unsigned & 0x0F));
        }
        return hex.toString();
    }
    // NOTE(review): hard-coded key material. DESKeySpec only uses the first 8 bytes, so
    // the effective key is "password"; consider loading the key from configuration.
    private static String KEY = "password111111";

    /**
     * Encrypts {@code datasource} with single DES (ECB mode, PKCS5 padding) under the
     * built-in {@code KEY} and returns the ciphertext as Base64 text.
     *
     * Both the key and the plaintext are converted to bytes with {@code CODE_TYPE}, so
     * the result does not depend on the platform default charset.
     *
     * @param datasource plaintext to encrypt
     * @return Base64 ciphertext, or {@code null} if encryption fails for any reason
     */
    public static String encode(String datasource){
        try{
            DESKeySpec desKey = new DESKeySpec(KEY.getBytes(CODE_TYPE));
            SecretKey securekey = SecretKeyFactory.getInstance("DES").generateSecret(desKey);
            // Spell out mode and padding instead of relying on the provider default for "DES".
            Cipher cipher = Cipher.getInstance("DES/ECB/PKCS5Padding");
            cipher.init(Cipher.ENCRYPT_MODE, securekey);
            byte[] encrypted = cipher.doFinal(datasource.getBytes(CODE_TYPE));
            return new String(Base64.encodeBase64(encrypted), "UTF-8");
        }catch(Exception e){
            // This class has no logger; keep the existing best-effort contract of returning null.
            e.printStackTrace();
            return null;
        }
    }

当时用 JS 调试和改造,前端和后端使用同样的时间戳和随机串(salt),得到的加密结果却不一样。

todo解密。

你可能感兴趣的:(Java与大数据,开发运维bug之谜,java,爬虫,开发语言)