基于HtmlUnit获取页面表格信息

抽取总方法:

 /**
     * Runs every enabled extraction step in a fixed order and collects one
     * {@link CrawlerMessage} per step.
     *
     * <p>Steps executed: order, inventory, invoice, payment, delivery. The
     * return-order and sale steps are currently commented out.
     *
     * @param path      base path handed to each extraction step
     * @param startTime start of the extraction window, passed through to each step
     * @param endTime   end of the extraction window, passed through to each step
     * @return the result messages of all executed steps, in execution order (never {@code null})
     */
    @Override
    public List<CrawlerMessage> crawlerData(String path, String startTime, String endTime) {
        List<CrawlerMessage> list = new ArrayList<>();
        list.add(order(path, startTime, endTime));
        list.add(inventory(path, startTime, endTime));
        list.add(invoice(path, startTime, endTime));
        list.add(payment(path, startTime, endTime));
        list.add(delivery(path, startTime, endTime));
//        list.add(returnOrder(path, startTime, endTime));
//        list.add(sale(path, startTime, endTime));
        // Return the collected messages; previously this returned null and
        // silently discarded every step's result.
        return list;
    }

抽取订单方法:

 /**
     * Extracts order data and reports the outcome as a {@link CrawlerMessage}.
     *
     * <p>The message starts out as a success ({@code E20001}); if
     * {@code showOrder} throws an {@link IOException} the code is changed to
     * {@code E20003} and the message text to the failure text.
     *
     * @param path      base path passed to {@code showOrder}
     * @param startTime not used by this step
     * @param endTime   passed to {@code showOrder} as the date argument
     * @return the outcome message for the order step
     */
    @Override
    protected CrawlerMessage order(String path, String startTime, String endTime) {
        logger.info("开始抽取订单数据!");
        CrawlerMessage message = new CrawlerMessage(E20001, Order.name(), "订单", "抽取订单成功!");
        try {
            showOrder(path, endTime);
        } catch (IOException e) {
            // Keep the cause in the log instead of silently dropping it.
            logger.error("抽取订单异常!", e);
            message.setCode(E20003);
            message.setMessage("抽取订单异常!");
        } finally {
            // showOrder turns JavaScript off on the shared webClient; always
            // turn it back on, even when an unchecked exception escapes.
            webClient.getOptions().setJavaScriptEnabled(true);
        }
        logger.info("订单抽取结束!");
        return message;
    }

具体抽取方法:

 //----------------------------------- extraction methods ----------------------------------------------
    /**
     * Loads the supplier order list page, follows every order link found in
     * the per-store tables, and writes each order's header fields and detail
     * fields to files via {@code writeXml2File}.
     *
     * <p>All order headers go into one file (the first write creates it with
     * a header row, later writes append). Each order's detail fields go into
     * their own file whose name is derived from the link text.
     *
     * @param path base path passed to {@code FileUtil.getSaveFile}
     * @param date date passed to {@code FileUtil.getSaveFile}
     * @throws IOException if a page cannot be fetched or a file cannot be written
     */
    public void showOrder(String path, String date) throws IOException {
        String orderUrl = "https://supplier.rt-mart.com.cn/php/scm_orders_form_1.php?status=1";
        String orderDetailUrl = "https://supplier.rt-mart.com.cn/php/";

        // Order header fields, read from table index 26 of the detail page.
        // Key format is "table-row-column". A trailing fourth segment only
        // keeps keys unique when two fields are derived from the same cell.
        Map<String, String> param = new LinkedHashMap<>();
        param.put("26-0-1-1","storeName");// store code: part of the order number before the dot, e.g. 5.161300066
        param.put("26-0-1-2","orderCode");// order code: part of the order number after the dot, e.g. 5.161300066
        param.put("26-1-1","orderStatus");// order status
        param.put("26-2-1","orderDate");// order date
        param.put("26-3-1","expectArriveDate");// expected delivery date
        param.put("26-4-1","receiveDate");// receipt date
        // row 5 (pickup date) is not extracted
        param.put("26-6-1","saleAuction");// promotion period
        param.put("26-7-1","orderAmount");// order amount
        // row 8 (actual amount) is not extracted
        param.put("26-9-1","discount");// invoice discount
        param.put("26-10-1","discountAmount_NoTax");// discount amount
        param.put("26-11-1","invoiceAmount");// invoiced amount
        param.put("26-12-1","recNumber");// total quantity received
        param.put("26-13-1","note");// remarks

        // Order detail fields, read from table index 24 of the detail page.
        // Built once here because the mapping is the same for every order.
        // NOTE(review): only row 2 is mapped, so a single detail row is
        // captured per order - confirm this is intended.
        Map<String, String> detailParam = new LinkedHashMap<>();
        detailParam.put("24-2-0","barcode");// barcode
        detailParam.put("24-2-1","kaproductcode");// item number
        detailParam.put("24-2-2","deliveryCode");// receipt number
        detailParam.put("24-2-3","price_tax");// unit invoice price
        detailParam.put("24-2-4","uom");// unit of measure
        detailParam.put("24-2-5","orderNum");// ordered quantity
        detailParam.put("24-2-6","receiveqty");// received quantity

        HtmlPage orderMain = webClient.getPage(orderUrl);
        HtmlForm orderForm = orderMain.getFormByName("order");
        HtmlElement orderTab = orderForm.getElementsByTagName("table").get(0);
        List<HtmlElement> storeTables = orderTab.getHtmlElementsByTagName("table");
        String saveFile = FileUtil.getSaveFile(path, date, RTMART, userName, Order.name());

        // True only for the first order written, so the shared order file is
        // created with a header row once and appended to afterwards.
        boolean firstOrder = true;
        for (HtmlElement storeTable : storeTables) {
            // Store name: text of the first <li>, up to the first '-'.
            String storeName = transCoding(storeTable.getElementsByTagName("li").get(0).asText()).split("\\-")[0];
            // Each anchor in the store table links to one order's detail page.
            for (HtmlElement href : storeTable.getElementsByTagName("a")) {
                HtmlAnchor anchor = (HtmlAnchor) href;
                String detailUrl = orderDetailUrl + anchor.getHrefAttribute();
                logger.info("明细地址为:" + detailUrl);
                // The detail page is fetched by URL with JavaScript disabled
                // rather than by clicking the anchor.
                webClient.getOptions().setJavaScriptEnabled(false);
                HtmlPage page = webClient.getPage(detailUrl);
                writeXml2File(page, param, saveFile, firstOrder, storeName);
                firstOrder = false;

                logger.info("开始抽取订单明细");
                // One detail file per order; the name uses the link text
                // without its first character.
                String detailFile = FileUtil.getSaveFile(path, date, RTMART, userName, OrderDetail.name()+"$"+href.asText().substring(1));
                writeXml2File(page, detailParam, detailFile, true, null);
            }
        }
    }

写数据方法:

 //----------------------------------写数据方法---------------------------------------------
    public void writeXml2File(HtmlPage page, Map param, String saveFile, boolean append, String storeName) throws IOException {
        BufferedWriter bufferedWriter = null;
        bufferedWriter = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(saveFile,!append), "UTF-8"));
        if(append){
            //头部写入文件中
            String headerInfo = "";
            for(String s :param.keySet()){
                headerInfo += param.get(s)+"|$|";
            }
            headerInfo = headerInfo.substring(0,headerInfo.length()-3)+"\r\n";
            bufferedWriter.write(headerInfo);//写入头部信息
        }

        DomNodeList list = page.getElementsByTagName("table");
        //取对应表格对应数据方法:
        //((HtmlTable) list.get(26)).getRow(3).getCell(1).asText();
        String[] arr;
        String strLine = "";
        String temp = "";
        for(String str:param.keySet()){
            arr = str.split("-");
            temp = transCoding(((HtmlTable) list.get(Integer.parseInt(arr[0]))).getRow(Integer.parseInt(arr[1])).getCell(Integer.parseInt(arr[2])).asText())+"|$|";**
            if(str.contains("26-0-1-")){
                if(param.get(str).equals("storeName")){
                    temp = "["+temp.split("\\.")[0]+"]"+storeName +"|$|";
                }else {
                    temp = temp.split("\\.")[1];
                }
            }
            strLine += temp;
        }
        strLine = strLine.substring(0,strLine.length()-3)+"\r\n";
        bufferedWriter.write(strLine);

        bufferedWriter.flush();
        bufferedWriter.close();
    }

你可能感兴趣的:(网络爬虫学习笔记)