基于密度的网页列表抽取

   思路:先抽取页面中的所有链接,再根据网站 host 和若干过滤规则剔除不需要的网址;然后统计每个 xpath 对应的链接数,取链接数最多的 xpath 作为列表所在的位置。代码依赖 jsoup 和 httpclient。

   一、抽取网页所有链接并进行一些过滤

 /**
  * Collects the anchor links found in html and returns them as strings of the
  * form [xpath] - href: [absolute url], one entry per kept link.
  *
  * NOTE(review): this listing was pasted from a web page. The bare numbers
  * (1, 2, 3 ...) are the listing's own line numbers, not Java. The generic type
  * arguments and several string literals in the first branch appear to have
  * been stripped during HTML extraction (the empty contains() argument, the
  * unbalanced parenthesis in the regex, the StringBuffer contents) - restore
  * them from the original post before compiling.
  *
  * Else-branch, as visible here:
  *   - re-parses the body HTML with Jsoup and sets url as the base URI;
  *   - keeps an anchor only when abs:href is non-empty, does not end with a
  *     slash, and contains either the host or the text :80;
  *   - walks up the parent chain until the parent is body, prepending one path
  *     step per ancestor; div ancestors with a non-empty class get a
  *     [@class='...'] predicate;
  *   - an ancestor whose class contains hide or head, or whose style contains
  *     display:none, clears the flag, so the link is dropped; that ancestor
  *     adds no path step, but the walk still continues up to body.
  *
  * NOTE(review): strings are compared with == and != (reference comparison);
  * equals() / isEmpty() is presumably what is intended - confirm and fix.
  *
  * @param url  page address; used for the host filter and as the base URI
  * @param html page markup to scan for anchors
  * @return list of [xpath] - href: [absolute url] entries
  */
 1 public static ArrayList getList(String url, String html) {
 2 
 3         ArrayList list = new ArrayList();
 // host = text between the scheme separator and the next slash.
 // NOTE(review): indexOf returns -1 when url has no slash after the host,
 // which makes substring throw StringIndexOutOfBoundsException.
 4         String host = url.substring(url.indexOf("://") + 3, url.indexOf("/", url.indexOf("://") + 3));
 5 
 6         if (html.toLowerCase().contains("")) {
 7             StringBuffer bf = new StringBuffer("
    "); 8 Pattern pattern = Pattern.compile(")[\\s\\S]*?", Pattern.CASE_INSENSITIVE); 9 Matcher match = pattern.matcher(html); 10 while(match.find()) { 11 bf.append("
  • " + match.group(1) + "
  • \n"); 12 } 13 bf.append("
"); 14 Document document = Jsoup.parse(bf.toString()); 15 document.setBaseUri(url); 16 Elements a = document.getElementsByTag("a"); 17 for(Node e : a) { 18 list.add(")[\\\\s\\\\S]*? - href: " + e.attr("abs:href")); 19 } 20 } else { 21 Document document = Jsoup.parse(Jsoup.parse(html).body().html()); 22 document.setBaseUri(url); 23 Elements a = document.getElementsByTag("a"); 24 for (Node e : a) { 25 boolean flag = true; 26 if (e.attr("abs:href") != "" && !e.attr("abs:href").endsWith("/") 27 && (e.attr("abs:href").contains(host) || e.attr("abs:href").contains(":80"))) { 28 String xpath = "/a[@href] - href: " + e.attr("abs:href"); 29 while (true) { 30 if (e.parentNode().nodeName() == "body") { 31 xpath = "//body" + xpath; 32 break; 33 } else { 34 e = e.parentNode(); 35 if (e.attr("class").contains("hide") || e.attr("style").contains("display:none") 36 || e.attr("class").contains("head")) { 37 flag = false; 38 } else { 39 if (e.nodeName().toLowerCase() == "div") { 40 xpath = "/" + e.nodeName() 41 + (e.attr("class") == "" ? "" : "[@class='" + e.attr("class") + "']") 42 + xpath; 43 } else { 44 xpath = "/" + e.nodeName() + xpath; 45 } 46 47 } 48 } 49 } 50 if (flag) { 51 list.add(xpath); 52 } 53 } 54 } 55 } 56 return list; 57 }
View Code

    二、对抽取出的列表进一步优化

 1 for (int i = 0; i < list.size(); i++) {
 2             String[] arr = list.get(i).split(" - href: ");
 3             if (i == 0) {
 4                 urls = new ArrayList();
 5                 urls.add(arr[1]);
 6                 map.put(arr[0], urls);
 7             } else {
 8                 if (arr[0].equals(list.get(i - 1).substring(0, list.get(i - 1).indexOf(" - href: ")))) {
 9                     urls.add(arr[1]);
10                     // map.put(arr[0], map.get(arr[0]) + 1);
11                 } else {
12                     urls = new ArrayList();
13                     urls.add(arr[1]);
14                     map.put(arr[0], urls);
15                 }
16             }
17         }
18 
19         // 优化map
20         for (String key : map.keySet()) {
21             ArrayList sortLenth = new ArrayList();
22             for (String link : map.get(key)) {
23                 sortLenth.add(link.length());
24             }
25             Collections.sort(sortLenth);
26             int flag = sortLenth.get(0);
27             for (String link : map.get(key)) {
28                 if (link.length() > flag + 10) {
29                     removeList.add(key);
30                     break;
31                 }
32             }
33         }
34         // 移除不需要的key
35         for (String key : removeList) {
36             map.remove(key);
37         }

 

    三、分析xpath对应的链接数

 1 // 按值排序map
 2         for (String key : map.keySet()) {
 3             if (sortMap.containsKey(map.get(key).size())) {
 4                 sortMap.get(map.get(key).size()).add(key);
 5             } else {
 6                 ArrayList valueList = new ArrayList();
 7                 valueList.add(key);
 8                 sortMap.put(map.get(key).size(), valueList);
 9             }
10         }
11 
12         for (Integer i : sortMap.keySet()) {
13             key_list.add(i);
14         }
15         Collections.sort(key_list);
16 
17         // 取最大值的xpath
18         if (sortMap.get(key_list.get(key_list.size() - 1)).size() > 0) {
19             for (String str : sortMap.get(key_list.get(key_list.size() - 1))) {
20                 xpath += str + "|";
21             }
22             xpath = xpath.substring(0, xpath.length() - 1);
23         } else {
24             xpath = "";
25         }

      在 150 条样本数据上测试,列表抽取的成功率在 85% 以上。

你可能感兴趣的:(基于密度的网页列表抽取)