因公司新业务行政执法建设需要,需对多个业务部门提供的目标网站相关行政复议文书进行爬取。
对多个目标网站研究后发现:不同网站需要的处理方式并不一样。文书格式有 pdf、doc 等;有的网站可以直接下载,有的网站接口传参字段经过加密、需要先解密处理后才能调用接口,有的则需要通过解析网页元素来处理。
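<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.7.9</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.78</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.12</version>
</dependency>
<dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version>
</dependency>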
/***
* @Description: Crawler service for the list page at SftjConstants.PageJsoupUrl. It reads the
*               page count from an inline script, opens every listed entry, downloads each PDF
*               attachment through PDFBoxUtil and upserts one AdministrativeDocumentsOriginals
*               row per attachment.
* @Auther: lyonardo
* @Date: 2021/10/13 09:49
* @version : V1.0
*/
@Slf4j
@Service
public class SftjHttpSpiderServiceImpl implements SftjHttpSpiderService {
// Directory prefix for downloaded PDFs, injected from the "admin_review_filePath" property.
// It is concatenated directly with the file name below.
@Value("${admin_review_filePath}")
private String adminReviewFilePath;
@Resource
private AdministrativeDocumentsOriginalsMapper administrativeDocumentsOriginalsMapper;
/**
 * Runs one crawl. Every failure is logged and swallowed; nothing is returned or rethrown.
 */
@Override
public void spiderSftj(){
try {
Document pageDoc = JsoupUtil.buildDocument(SftjConstants.PageJsoupUrl);
// The page count lives in an inline <script> under .f12, in a "var countPage..." statement.
Elements pageElements = pageDoc.select(".f12>script");
pageElements.forEach(x->{
String[] strings = x.data().split(";");
for (String s : strings){
if(s.contains("var countPage")){
// NOTE(review): the variable is named pageSize but holds the number parsed from "countPage".
int pageSize = RegexUtils.getNumberFromString(s);
// NOTE(review): the next line is truncated in this copy. The loop condition, the fetch of the
// per-page document `doc`, the opening of the try block closed further down and the
// declaration of `list` are missing and must be restored from the original source.
for(int i=1;i list = new ArrayList<>();
// Each match is one list entry; its abs:href is the detail page fetched below.
Elements elements = doc.select(".news .mf26 .overflow");
elements.forEach(
y-> {
Document doc1;
try {
doc1 = JsoupUtil.buildDocument(y.attr("abs:href"));
// Attachment links on the detail page.
Elements elements1 = doc1.select(".attachments>ul>li>a");
elements1.forEach(t->{
log.debug("t.href::"+t.attr("abs:href"));
if(StringUtils.isNotEmpty(t.attr("abs:href"))){
FileUtil.createDirs(adminReviewFilePath);
// NOTE(review): the target path depends only on the entry text (y), not on the attachment (t),
// so several attachments of one entry map to the same file name. The entry text is also used
// in the path without sanitising.
String content = PDFBoxUtil.readAndSavePdfUrl(t.attr("abs:href"),adminReviewFilePath+y.ownText()+".pdf");
list.add(AdministrativeDocumentsOriginals.build("天津市司法局",y.ownText(),content,
adminReviewFilePath+y.ownText()+".pdf",y.select(".news-date").text(),null,2));
}
});
} catch (IOException e) {
// NOTE(review): the exception is not passed to the logger, so the cause and the URL are lost.
log.error("JsoupUtil buildDocument");
}
}
);
// Persist everything collected for this page in one batch.
try {
administrativeDocumentsOriginalsMapper.batchUpsert(list);
}catch (Exception e){
e.printStackTrace();
log.error("batchUpsert失败!!!");
}
} catch (IOException e) {
log.error("处理index页面失败==》本次爬虫失败!!!");
}
}
}
}
}
);
} catch (IOException e) {
log.error("处理Document失败==》本次爬虫失败!!!");
}
}
}
另外一个网站的爬取采用分批处理:
/***
* @Description:
* @Auther: lyonardo
* @Date: 2021/09/24 11:08
* @version : V1.0
*/
@Slf4j
@Component
public class CfwsHttpSpiderJob {
@Autowired
private DoCfwsHttpSpiderTask spiderTask;
@Value("${partition_size}")
private Integer partitionSize;
@Scheduled(cron = "${punish_jobs_cron}")
public void spiderCfws() {
List> subQueryConditionList = Lists.partition(QueryConditionConstant.queryConditionList, partitionSize);
for (List smallerList : subQueryConditionList) {
try {
spiderTask.doCfwsHttpSpider(smallerList);
} catch (Exception e) {
e.printStackTrace();
log.info("爬虫失败");
}
}
}
}
/***
* @Description: Crawls one batch of query conditions: parses the JSON responses returned by
*               resultList(...), stores each returned document as a PDF and upserts one
*               AdministrativeDocumentsOriginals row per stored file.
* @Auther: lyonardo
* @Date: 2021/09/29 16:09
* @version : V1.0
*/
@Slf4j
@Component
public class DoCfwsHttpSpiderTask implements Runnable{
@Resource
private AdministrativeDocumentsOriginalsMapper administrativeDocumentsOriginalsMapper;
// Directory prefix for the stored PDFs ("admin_punish_filePath" property).
@Value("${admin_punish_filePath}")
public String adminPunishFilePath;
// Injected from the "ciphertext" property; not referenced in the visible part of this class.
@Value("${ciphertext}")
public String ciphertext ;
// Pause after each batch, in minutes (multiplied by 60*1000 before Thread.sleep).
@Value("${sleep_longtime}")
public Integer sleepLongtime;
// NOTE(review): never assigned in the visible code, so run() would pass null — confirm
// whether a setter or constructor exists in the elided part.
private List smallerList;
/**
 * Runnable entry point: crawls the batch held in the smallerList field and swallows any
 * exception after printing its stack trace.
 */
@Override
public void run() {
try {
System. out.println("run thread...");
doCfwsHttpSpider(smallerList);
} catch (Exception e) {
e.printStackTrace();
}
}
/**
 * Crawls one batch, then sleeps for sleepLongtime minutes.
 * NOTE(review): the sleep happens inside synchronized (this), so the monitor of this
 * component is held for the whole pause.
 *
 * @param smallerList the batch of query conditions to crawl
 * @throws Exception declared by the signature; Thread.sleep can throw InterruptedException
 */
public void doCfwsHttpSpider(List smallerList) throws Exception {
synchronized (this) {
doSpiderCfws(smallerList);
log.info("休眠"+sleepLongtime+"分钟==》"+LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")));
Thread. sleep(sleepLongtime*60*1000);
log.info(sleepLongtime+"分钟后wake up==》"+LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")));
}
}
/**
 * Parses every response string for the batch and persists the documents found in
 * result.queryResult.resultList.
 *
 * @param smallerList the batch of query conditions passed on to resultList(...)
 */
public void doSpiderCfws(List smallerList) {
// resultList(...) is defined in the elided part of this class.
List resultStringList = resultList(smallerList);
if(CollectionUtils.isNotEmpty(resultStringList)){
List list = new ArrayList<>();
resultStringList.forEach(x->{
log.info("result==>{}",x);
if(null != x){
// Walk result -> queryResult -> resultList, skipping the response if any level is absent.
JSONObject jsonQueryDocResult = JSONObject.parseObject(x);
if(null!=jsonQueryDocResult.getJSONObject("result")){
JSONObject jsonQueryDocResultObject = jsonQueryDocResult.getJSONObject("result");
if(null!=jsonQueryDocResultObject.getJSONObject("queryResult")){
JSONObject queryResult = jsonQueryDocResultObject.getJSONObject("queryResult");
if(null!=queryResult.getJSONArray("resultList")){
Integer resultCount = queryResult.getInteger("resultCount");
log.info("resultCount==>{}",resultCount);
JSONArray resultList = queryResult.getJSONArray("resultList");
// NOTE(review): the next line is truncated in this copy. The loop condition and the
// declarations of `jsonObject` and `i0` are missing and must be restored from the original.
for (int i=0 ; i{}",i0);
if(StringUtils.isNotEmpty(i0)){
// i7 is handed to PDFBoxUtil.readAndSavePdfStr below as the document payload.
String i7 = jsonObject.getString("i7");
if(null!=i7&&!"".equals(i7)){
try {
// presumably a Base64 payload whose '+' characters arrived as spaces — TODO confirm
i7 = i7.replace(" ", "+");
FileUtil.createDirs(adminPunishFilePath);
// i0 becomes the file name and the second argument of build(...).
String allFilePath = adminPunishFilePath + i0 + ".pdf";
if (FileUtil.isCreateNewFile(allFilePath)) {
// Optional fields i3 / i1 / i4 default to the empty string when absent.
String punishmentAuthority = "";
String dateOfPunishment = "";
String typesOfPunishment = "";
if (null != jsonObject.getString("i3")) {
punishmentAuthority = jsonObject.getString("i3");
}
if (null != jsonObject.getString("i1")) {
dateOfPunishment = jsonObject.getString("i1");
}
if (null != jsonObject.getString("i4")) {
typesOfPunishment = jsonObject.getString("i4");
}
list.add(AdministrativeDocumentsOriginals.build(punishmentAuthority, i0, PDFBoxUtil.readAndSavePdfStr(i7, allFilePath), allFilePath,
dateOfPunishment, typesOfPunishment, 1));
}
}catch (IOException e) {
log.info(i0+",PDF处理失败,此次爬虫失败!!!");
e.printStackTrace();
}
}
}
}
}
}
}
}
}
}
}
});
try {
// NOTE(review): this logs every collected row at info level.
log.info("list==>{}",list);
log.info("batchUpsert:==>{}",administrativeDocumentsOriginalsMapper.batchUpsert(list));
}catch (Exception e){
e.printStackTrace();
log.info("入库异常==》爬虫失败");
}
}
// NOTE(review): this failure message is logged on every call, including after a successful
// upsert; it looks like it belongs in an else branch of the isNotEmpty check above.
log.info("爬虫失败");
}
...
}
还有另外几个网站。
JDK等源码也有很多深嵌套,是if嵌套还是if反向判断throw new Exception,由团队代码风格和任务紧急度决定。
爬取数据最棘手的,一是面向监狱编程,二是IP池中IP的收集,三是加密破解,四是限流。
/**
 * Builds the `ciphertext` request parameter.
 *
 * The current epoch-millisecond timestamp is encrypted with DES3 under a fresh 24-character
 * random salt, using today's date (yyyyMMdd, local time) as the IV argument. The result is
 * salt + iv + encrypted text, encoded by strTobinary().
 *
 * @returns {string} the encoded ciphertext
 */
function cipher() {
    var now = new Date();
    var timestamp = now.getTime().toString();
    var salt = random(24);

    // Date parts, zero-padded to two digits where needed.
    var year = now.getFullYear().toString();
    var monthNumber = now.getMonth() + 1;
    var month = (monthNumber < 10 ? "0" + monthNumber : monthNumber).toString();
    var dayNumber = now.getDate();
    var day = (dayNumber < 10 ? "0" + dayNumber : dayNumber).toString();
    var iv = year + month + day;

    var encrypted = DES3.encrypt(timestamp, salt, iv).toString();
    return strTobinary(salt + iv + encrypted);
}
/**
 * Encodes a string as the base-2 representation of each UTF-16 code unit, separated by
 * single spaces (for example "ab" becomes "1100001 1100010").
 *
 * @param {string} str text to encode
 * @returns {string} space-separated binary numbers; "" for an empty string
 */
function strTobinary(str) {
    return str.split("").map(function (unit) {
        return unit.charCodeAt(0).toString(2);
    }).join(" ");
}
// Draws from the 62-character alphanumeric alphabet below; `size` is presumably the number
// of characters to produce (cipher() calls random(24)) — the loop body is missing here.
function random (size){
var str = "",
arr = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'];
// NOTE(review): the next four lines are truncated in this copy. The rest of random(), the
// start of the enclosing function that receives `obj`, and the opening of the captcha dialog
// call (including its HTML content string) are missing; restore them from the page script.
for(var i=0; i输入内容
验证码
验证码不能为空
",
btn:'确定',
shade:0.8,
btnAlign:'c',
area:['400px','200px'],
// Confirm button: re-sends the stored request with the typed captcha and a fresh ciphertext.
yes:function(){
var ciphertext = cipher();
var yzm = $("#yzm").val();
// Empty captcha: show the hint and keep the dialog open.
if(yzm==null || $.trim(yzm)=="" || yzm=="undefined"){
$('.yzmtip').show();
return false;
}
// The URL and parameters of the request to repeat are read from localStorage.
var url = window.localStorage.getItem("url");
var parameterMap = window.localStorage.getItem("parameterMap");
parameterMap=$.parseJSON(parameterMap);
parameterMap.yzm=yzm;
parameterMap.ciphertext=ciphertext;
$.ajax({
type: "post",
async: true,
url: url,
datatype:"json",
data: parameterMap,
success: function(data) {
// The response may already be an object; a failed parse leaves `data` untouched.
try{
data = $.parseJSON(data)
}catch(e){
}
// Any code other than -11 closes the dialog and reloads the page.
if(data.code != -11 || !data.code){
//window.localStorage.setItem("result",JSON.stringify(data));
layer.closeAll();
window.location.reload();
return ;
}
// Code -11 with a description: show the server's message in the hint element.
if(data.code == -11 && data.description!=null && data.description!=""){
$(".yzmtip").html(""+data.description+"");
$('.yzmtip').show();
}
// Load a new captcha image for the next attempt.
$("#imgcode").attr("src", "/captcha/captchaImage?key=antitheftImageCode&random="+Math.random());
},
// Request errors are ignored.
error:function(data){
}
})
}
})
}
// Code "-12" only shows an access-restricted message.
if(obj.code =="-12"){
layer.msg("访问受限");
}
}
/**
 * Click the captcha image to load a new one.
 */
$(function () {
    // Delegated on document so the handler also covers an #imgcode element added later.
    $(document).on('click', '#imgcode', function () {
        var refreshedSrc = "/captcha/captchaImage?key=antitheftImageCode&random=" + Math.random();
        $("#imgcode").attr("src", refreshedSrc);
    });
})
现有后端破解代码
/**
 * DES / Triple-DES helpers written while reproducing the target site's request cipher.
 *
 * <p>All String/byte conversions use UTF-8. Only JDK classes are used; names that this file
 * may not import are written fully qualified.
 *
 * @author lyonardo
 * @version V1.0
 * @since 2021/10/22 16:07
 */
public class CryptoUtil {
    /** Charset for every String to byte[] conversion in this class. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /** Kept for backward compatibility; not referenced inside this class. */
    public static final String transformation = "DES/ECB/NOPadding";

    /** Default key of {@link #encode(String)}. */
    private static final String KEY = "password111111";

    /** PKCS5Padding is the JCE name; for 8-byte blocks it yields the same bytes as PKCS#7. */
    private static final String TRIPLE_DES_ECB = "DESede/ECB/PKCS5Padding";
    private static final String TRIPLE_DES_CBC = "DESede/CBC/PKCS5Padding";

    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    /**
     * Encrypts {@code datasource} with single DES under the given key.
     *
     * @param KEY        key text; DES uses its first 8 bytes
     * @param datasource plain text
     * @return Base64 text of the cipher bytes, or {@code null} if encryption failed
     */
    public String encode(String KEY, String datasource) {
        return desEncryptToBase64(KEY, datasource);
    }

    /**
     * Encrypts {@code text} with Triple-DES.
     *
     * <p>With a non-empty {@code vector} the cipher runs in CBC mode and uses the vector's
     * UTF-8 bytes (exactly 8) as IV; otherwise it runs in ECB mode.
     * NOTE(review): presumably the site's {@code DES3.encrypt(text, key, iv)} is CBC with
     * PKCS#7 padding — confirm against the page script.
     *
     * @param key    key text, at least 24 bytes
     * @param text   plain text
     * @param vector IV text (8 bytes), or {@code null}/empty for ECB
     * @return Base64 text of the cipher bytes, or {@code null} if encryption failed
     * @throws Exception never thrown by this implementation; kept for signature compatibility
     */
    public static String encrypt(String key, String text, String vector) throws Exception {
        try {
            SecretKey secretKey = tripleDesKey(key);
            Cipher cipher;
            if (vector == null || vector.isEmpty()) {
                cipher = Cipher.getInstance(TRIPLE_DES_ECB);
                cipher.init(Cipher.ENCRYPT_MODE, secretKey);
            } else {
                cipher = Cipher.getInstance(TRIPLE_DES_CBC);
                cipher.init(Cipher.ENCRYPT_MODE, secretKey,
                        new javax.crypto.spec.IvParameterSpec(vector.getBytes(UTF_8)));
            }
            byte[] encrypted = cipher.doFinal(text.getBytes(UTF_8));
            return java.util.Base64.getEncoder().encodeToString(encrypted);
        } catch (Exception e) {
            // Best effort like the sibling methods, but report the cause.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Creates a Triple-DES/ECB cipher for {@link #encode1} and {@link #decode1}.
     * NOTE(review): key and cipher are both DESede now so that init can succeed; confirm that
     * Triple-DES (not single DES) is what callers expect.
     *
     * @param opmode {@link Cipher#ENCRYPT_MODE} or {@link Cipher#DECRYPT_MODE}
     * @param key    key text, at least 24 bytes
     * @return the initialised cipher, or {@code null} if it could not be created
     */
    private static Cipher GetCipher(int opmode, String key) {
        try {
            Cipher cipher = Cipher.getInstance(TRIPLE_DES_ECB);
            cipher.init(opmode, tripleDesKey(key));
            return cipher;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Encrypts {@code data} with Triple-DES/ECB and returns upper-case hex.
     *
     * @param data plain text
     * @param key  key text, at least 24 bytes
     * @return hex text; {@code null} for null/empty data or an unusable key.
     *         NOTE(review): if the encryption step itself throws, the plain {@code data} is
     *         returned unchanged (existing behaviour) — confirm callers rely on that.
     */
    public static String encode1(String data, String key) {
        if (data == null || data.isEmpty()) {
            return null;
        }
        try {
            Cipher cipher = GetCipher(Cipher.ENCRYPT_MODE, key);
            if (cipher == null) {
                return null;
            }
            return byteToHexString(cipher.doFinal(data.getBytes(UTF_8)));
        } catch (Exception e) {
            e.printStackTrace();
            return data;
        }
    }

    /**
     * Reverses {@link #encode1}.
     *
     * @param data hex text produced by {@link #encode1}
     * @param key  key text, at least 24 bytes
     * @return plain text; {@code null} for null/empty data or an unusable key; the input
     *         {@code data} unchanged when it is not valid hex or cannot be decrypted
     * @throws Exception never thrown by this implementation; kept for signature compatibility
     */
    public static String decode1(String data, String key) throws Exception {
        if (data == null || data.isEmpty()) {
            return null;
        }
        try {
            byte[] cipherBytes = hexToBytes(data);
            Cipher cipher = GetCipher(Cipher.DECRYPT_MODE, key);
            if (cipher == null) {
                return null;
            }
            return new String(cipher.doFinal(cipherBytes), UTF_8);
        } catch (Exception e) {
            e.printStackTrace();
            return data;
        }
    }

    /**
     * Formats bytes as upper-case hex, two characters per byte.
     *
     * @param bytes bytes to format
     * @return hex text, empty for an empty array
     */
    public static String byteToHexString(byte[] bytes) {
        char[] out = new char[bytes.length * 2];
        for (int i = 0; i < bytes.length; i++) {
            int value = bytes[i] & 0xFF;
            out[2 * i] = HEX_DIGITS[value >>> 4];
            out[2 * i + 1] = HEX_DIGITS[value & 0x0F];
        }
        return new String(out);
    }

    /**
     * Encrypts {@code datasource} with single DES under the built-in default key.
     *
     * @param datasource plain text
     * @return Base64 text of the cipher bytes, or {@code null} if encryption failed
     */
    public static String encode(String datasource) {
        return desEncryptToBase64(KEY, datasource);
    }

    /** Shared body of both {@code encode} overloads: DES encrypt, then Base64. */
    private static String desEncryptToBase64(String key, String plainText) {
        try {
            DESKeySpec keySpec = new DESKeySpec(key.getBytes(UTF_8));
            SecretKey secretKey = SecretKeyFactory.getInstance("DES").generateSecret(keySpec);
            Cipher cipher = Cipher.getInstance("DES");
            cipher.init(Cipher.ENCRYPT_MODE, secretKey, new SecureRandom());
            byte[] encrypted = cipher.doFinal(plainText.getBytes(UTF_8));
            return java.util.Base64.getEncoder().encodeToString(encrypted);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Builds a Triple-DES key from the first 24 UTF-8 bytes of {@code key}. */
    private static SecretKey tripleDesKey(String key) throws java.security.GeneralSecurityException {
        DESedeKeySpec keySpec = new DESedeKeySpec(key.getBytes(UTF_8));
        return SecretKeyFactory.getInstance("DESede").generateSecret(keySpec);
    }

    /** Parses hex text (either case) into bytes; rejects odd length and non-hex characters. */
    private static byte[] hexToBytes(String hex) {
        int length = hex.length();
        if ((length & 1) != 0) {
            throw new IllegalArgumentException("Hex text must have an even length, got " + length);
        }
        byte[] out = new byte[length / 2];
        for (int i = 0; i < out.length; i++) {
            int high = Character.digit(hex.charAt(2 * i), 16);
            int low = Character.digit(hex.charAt(2 * i + 1), 16);
            if (high < 0 || low < 0) {
                throw new IllegalArgumentException("Invalid hex character near index " + (2 * i));
            }
            out[i] = (byte) ((high << 4) | low);
        }
        return out;
    }
}
当时用 JS 调试和改造,前端和后端使用同样的时间和字符串,得到的加密结果却不一样。
TODO:解密。