Jsoup和htmlunit结合使用。

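maven依赖:

		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.11.3</version>
		</dependency>

		<dependency>
			<groupId>net.sourceforge.htmlunit</groupId>
			<artifactId>htmlunit</artifactId>
			<version>2.40.0</version>
		</dependency>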

代码:

package com.ybjdw.tool.utils;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.util.logging.Level;

/**
 * author: zhanggw
 * 创建时间:  2020/7/23
 */
public class JsoupUtil {
    private static Logger logger = LoggerFactory.getLogger(JsoupUtil.class);

    public static void main(String[] args) {
        try{
            String shopName = "KaKa studios";
            String mainUrl = "https://www.vvic.com";
            String localDir = "D:/tmp";
            String shopLink;

            logger.debug("开始搜索店铺名称:{}", shopName);

            // 搜索档口
            Connection connect = Jsoup.connect("https://www.vvic.com/gz/shops/search.html?q="+shopName);
            connect.timeout(10000);
            Document document = connect.get();
            Element element = document.selectFirst("div[id=stallContent] dl dd span[class=cell ctrl-cell] a");
            shopLink = mainUrl + element.attr("href");
            logger.debug("店铺链接: {}", shopLink);

            // 获取档口详情
            connect = Jsoup.connect(shopLink);
            connect.data("sort","up_time-desc");
            connect.data("currentPage", "1");
            document = connect.get();
            element = document.selectFirst("div[class=goods-list shop-list clearfix] ul");
            Elements itemEleList = element.getElementsByTag("li");

            // 构造一个webClient 模拟Chrome 浏览器
            WebClient webClient = new WebClient(BrowserVersion.CHROME);
            for(int i=1; i 40){
                    break;
                }

                Element itemEle = itemEleList.get(i);
                String href = itemEle.selectFirst("div[class=item] div[class=pic j-vct] a").attr("href");
                String itemLink =  mainUrl+href;
                logger.debug("商品链接: {}", itemLink);

                // 获取商品详情
                //屏蔽日志信息
                LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log",
                        "org.apache.commons.logging.impl.NoOpLog");
                java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
                //支持JavaScript
                webClient.getOptions().setJavaScriptEnabled(true);
                webClient.getOptions().setCssEnabled(true);
                webClient.getOptions().setActiveXNative(false);
                webClient.getOptions().setThrowExceptionOnScriptError(false);
                webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
                webClient.getOptions().setUseInsecureSSL(true);
                webClient.getOptions().setTimeout(10000);
                HtmlPage rootPage = webClient.getPage(itemLink);
                //设置一个运行JavaScript的时间
                webClient.waitForBackgroundJavaScript(5000);
                String html = rootPage.asXml();
                document = Jsoup.parse(html);

                // 货号
                Element productCodeEle = document.selectFirst("div[class=product-detail] dl[class=summary clearfix] div[class=value ff-arial]");
                String productCode = productCodeEle.text().trim();

                element = document.selectFirst("div[id=info] div[class=d-content]");
                Elements imgEleList = element.getElementsByTag("img");
                logger.debug("商品详情图如下:");
                imgEleList.forEach(img->{
                    String url = img.attr("data-original");
                    String suffix = url.substring(url.lastIndexOf("."));
                    String localPath = localDir + "/" + shopName + "/" + productCode;
                    logger.debug("url:{}, suffix:{}, localPatch:{}", url, suffix, localPath);

                    File file = new File(localPath);
                    if(!file.exists()){
                        file.mkdirs();
                    }

                    logger.debug("开始下载:{},本地地址:{}", url, localPath+"/"+System.currentTimeMillis()+suffix);
                    FileUtil.downloadFileConcurrent(url, localPath+"/"+System.currentTimeMillis()+suffix);
                });
            }
        }catch (Exception e){
            logger.debug("爬取异常",e);
        }
        logger.debug("搜款网图片下载完毕!");
    }

}

 

你可能感兴趣的:(Jsoup和htmlunit结合使用。)