根据url读取html文件

根据url读取html有两种方式

1.HttpURLConnection
2.Jsoup
两种方式的maven依赖:

	
		
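<dependencies>
	<dependency>
		<groupId>org.apache.httpcomponents</groupId>
		<artifactId>httpcore</artifactId>
		<version>4.4.5</version>
	</dependency>
	<dependency>
		<groupId>org.apache.httpcomponents</groupId>
		<artifactId>httpclient</artifactId>
		<version>4.5.6</version>
	</dependency>
	<dependency>
		<groupId>org.apache.httpcomponents</groupId>
		<artifactId>httpmime</artifactId>
		<version>4.5.2</version>
	</dependency>
	<dependency>
		<groupId>com.google.guava</groupId>
		<artifactId>guava</artifactId>
		<version>27.0.1-jre</version>
	</dependency>
	<dependency>
		<groupId>org.jsoup</groupId>
		<artifactId>jsoup</artifactId>
		<version>1.11.3</version>
	</dependency>
</dependencies>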

两种方式的代码实现

两种读取方式及部分其他方法:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;


/**
 * Static helpers for downloading an HTML page from a URL and post-processing it.
 *
 * <p>Two download strategies are offered: the JDK's {@link HttpURLConnection}
 * ({@link #readFile(String, String)}) and jsoup ({@link #JsoupBodyHtml(String)},
 * {@link #JsoupHtml(String)}). The remaining methods rewrite relative image
 * sources to absolute ones and strip font-related inline styles.
 */
public class ReadHTMLByUrl {

    /** Extra attempts made by {@link #JsoupBodyHtml(String)} after the first fetch fails. */
    private static final int BODY_RETRY_COUNT = 3;

    /** Pause before each retry in {@link #JsoupBodyHtml(String)}, in milliseconds. */
    private static final long BODY_RETRY_DELAY_MS = 5 * 1000L;

    /** Pause before the single retry in {@link #JsoupHtml(String)}, in milliseconds. */
    private static final long HTML_RETRY_DELAY_MS = 20 * 1000L;

    /**
     * Matches an {@code <img ...>} tag with a double-quoted {@code src}; group 2 is the
     * src value. Only double quotes are handled, which is what jsoup emits when it
     * serializes attributes.
     */
    private static final String IMG_SRC_REGEX = "<img\\s+([^>]*)\\s*src=\"(.*?)\"\\s*([^>]*)>";

    /**
     * Downloads the resource at {@code u} over HTTP and returns it as text.
     *
     * <p>The content is read line by line and the lines are concatenated
     * <em>without</em> their line terminators, so the result is a single line.
     *
     * @param u        the URL to read, in string form; must use a protocol whose
     *                 connection is an {@link HttpURLConnection}
     * @param encoding the charset name used to decode the response body
     * @return the response body with line terminators removed
     * @throws Exception if the URL is malformed, the charset is unsupported,
     *                   or the connection/read fails
     */
    public static String readFile(String u, String encoding) throws Exception {
        URL url = new URL(u);
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        // try-with-resources closes the reader (and the underlying stream) even when
        // reading fails; the connection is released in the finally block.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(urlConnection.getInputStream(), encoding))) {
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
            return html.toString();
        } finally {
            urlConnection.disconnect();
        }
    }

    /**
     * Fetches the page at {@code url} with jsoup and returns the inner HTML of its body.
     *
     * <p>If the first fetch fails with an {@link IOException}, up to
     * {@value #BODY_RETRY_COUNT} further attempts are made, sleeping 5 seconds before
     * each one. If the thread is interrupted while waiting, the interrupt flag is
     * restored and retrying stops.
     *
     * @param url the page URL
     * @return the body's inner HTML, or an empty string if the page could not be
     *         fetched or has no body
     */
    public static String JsoupBodyHtml(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException firstFailure) {
            for (int attempt = 0; attempt < BODY_RETRY_COUNT; attempt++) {
                try {
                    Thread.sleep(BODY_RETRY_DELAY_MS);
                    doc = Jsoup.connect(url).get();
                    // Connected successfully: stop retrying.
                    break;
                } catch (InterruptedException interrupted) {
                    // Preserve the interrupt for the caller and give up; further
                    // sleeps would only throw again immediately.
                    Thread.currentThread().interrupt();
                    break;
                } catch (IOException ignored) {
                    // Best effort: fall through to the next attempt.
                }
            }
        }
        if (doc == null || doc.body() == null) {
            return "";
        }
        return doc.body().html();
    }

    /**
     * Fetches the page at {@code url} with jsoup and returns the whole document as HTML.
     *
     * <p>If the first fetch fails with an {@link IOException}, one retry is made after
     * a 20 second pause. Failures are printed to standard error.
     *
     * @param url the page URL
     * @return the serialized document, or an empty string if both attempts failed
     */
    public static String JsoupHtml(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException firstFailure) {
            try {
                Thread.sleep(HTML_RETRY_DELAY_MS);
                doc = Jsoup.connect(url).get();
            } catch (IOException retryFailure) {
                retryFailure.printStackTrace();
            } catch (InterruptedException interrupted) {
                // Restore the flag so callers can observe the interruption.
                Thread.currentThread().interrupt();
                interrupted.printStackTrace();
            }
            firstFailure.printStackTrace();
        }
        return doc == null ? "" : doc.toString();
    }

    /**
     * Fetches the page at {@code url}, makes image sources absolute and removes every
     * inline {@code style} attribute that mentions {@code font}.
     *
     * <p>Relative image sources are resolved against the document's base URI via jsoup,
     * so root-relative ({@code /a.png}) and protocol-relative paths resolve correctly.
     * Sources that are empty, already start with {@code http}, or cannot be resolved
     * (for example {@code data:} URIs) are left untouched.
     *
     * @param url the page URL
     * @return the rewritten body HTML, or an empty string if anything fails
     */
    public static String reWriteHtml(String url) {
        try {
            Document doc = Jsoup.connect(url).get();
            Element body = doc.body();

            // Turn relative image paths into absolute ones.
            List<Element> imgs = body.select("img");
            for (Element img : imgs) {
                String src = img.attr("src");
                if (src.isEmpty() || src.startsWith("http")) {
                    continue;
                }
                String absolute = img.absUrl("src");
                if (!absolute.isEmpty()) {
                    img.attr("src", absolute);
                }
            }

            // Drop inline styles that carry font settings.
            List<Element> elements = body.getAllElements();
            for (Element element : elements) {
                if (element.attr("style").contains("font")) {
                    element.removeAttr("style");
                }
            }
            return body.html();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Fetches the body of the page at {@code url} and prefixes every relative
     * {@code <img src="...">} with the page's directory URL (everything up to and
     * including the last {@code /} of {@code url}).
     *
     * @param url the page URL
     * @return the body HTML with image sources prefixed, or an empty string if the
     *         page could not be fetched
     */
    public static String repairContent(String url) {
        String content = JsoupBodyHtml(url);
        if (content.isEmpty()) {
            return "";
        }
        // Prefix that is put in front of each relative img src.
        String replaceHttp = url.substring(0, url.lastIndexOf("/") + 1);
        return replSrc(content, replaceHttp, IMG_SRC_REGEX);
    }

    /**
     * Prefixes relative image sources found by {@code patternStr} with {@code replaceHttp}.
     *
     * @param content     the HTML to rewrite
     * @param replaceHttp the prefix to insert before each relative src
     * @param patternStr  a regex (compiled case-insensitively) whose group 2 captures
     *                    the src value
     * @return the rewritten HTML
     */
    private static String replSrc(String content, String replaceHttp, String patternStr) {
        Pattern pattern = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
        return matchSrc(content, replaceHttp, pattern, pattern.matcher(content));
    }

    /**
     * Inserts {@code replaceHttp} in front of every group-2 match that does not already
     * start with {@code http://} or {@code https://}.
     *
     * <p>The prefix is inserted at the exact position of each match rather than by a
     * global text replace, so identical text elsewhere in the document (other
     * attributes, visible text, substrings of other URLs) is never touched.
     *
     * @param content     the HTML to rewrite
     * @param replaceHttp the prefix to insert before each relative src
     * @param pattern     unused; kept so the signature stays unchanged
     * @param matcher     a matcher whose group 2 captures the src value; it is reset
     *                    onto {@code content} before use
     * @return the rewritten HTML
     */
    private static String matchSrc(String content, String replaceHttp, Pattern pattern, Matcher matcher) {
        // Guarantee the match offsets below refer to `content`.
        matcher.reset(content);
        StringBuilder rewritten = new StringBuilder(content.length() + 64);
        int copiedUpTo = 0;
        while (matcher.find()) {
            String src = matcher.group(2);
            // Only sources that are not already http:// or https:// get the prefix.
            if (src.startsWith("http://") || src.startsWith("https://")) {
                continue;
            }
            int srcStart = matcher.start(2);
            rewritten.append(content, copiedUpTo, srcStart).append(replaceHttp);
            copiedUpTo = srcStart;
        }
        rewritten.append(content, copiedUpTo, content.length());
        return rewritten.toString();
    }

    /**
     * Collects group 2 of every match of {@code regex} in {@code source}.
     *
     * @param regex  a regex with at least two capturing groups; with fewer,
     *               {@link Matcher#group(int)} throws {@link IndexOutOfBoundsException}
     *               on the first match
     * @param source the text to search
     * @return the captured values in match order; empty if nothing matches
     */
    private static List<String> getMatchers(String regex, String source) {
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(source);
        List<String> list = new ArrayList<String>();
        while (matcher.find()) {
            list.add(matcher.group(2));
        }
        return list;
    }
}

你可能感兴趣的:(根据url读取html,java)