HTMLParser抓取网页

看到很多人问如何过滤网页信息,其实用 HTMLParser 就非常方便。我最近想做一个通过代理访问网页的软件,第一步是从各个免费代理网站上提取代理列表,这里就用到了 HTMLParser:
1 package com.pmjava.search;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileReader;
6 import java.io.FileWriter;
7 import org.htmlparser.Parser;
8 import org.htmlparser.filters.NodeClassFilter;
9 import org.htmlparser.tags.TableTag;
10 import org.htmlparser.util.NodeList;
11
12 public class Search ... {
13
14    /** *//**
15     * @param args
16     * @author Qing
17     * @throws Exception
18     */

19    public static void main(String[] args) throws Exception ...{
20        String[] url= new String[4] ;
21        url[0] = "[url]http://www.cnproxy.com/proxy1.html[/url]";
22        String currentUrl = url[0] ;       
23        String[] encoding = new String[4] ;   
24        encoding[1] = "gb2312";
25        String currentEncoding = encoding[1] ;
26        Parser parser = new Parser() ;       
27            parser.setURL(currentUrl) ;
28            parser.setEncoding(currentEncoding) ;
29            NodeClassFilter f=new NodeClassFilter(TableTag.class);
30            NodeList nodelist =  parser.extractAllNodesThatMatch(f);
31            String list=null;
32            String []Temp  ;
33            String []Temp1;
34            if (nodelist.size()>0)...{
35                for (int i = 0; i < nodelist.size(); i++)    ...{
36                    TableTag linkTag = (TableTag)nodelist.elementAt(i);
37                    list=linkTag.getChildrenHTML().replace("<tr>", "").replace("<td>", "").replace("<td>", "").replace("</td>", "").replace("</tr>", "").replace("<SCRIPT type=text/javascript>document.write(", "").replace(")</SCRIPT>", "").replace("imageURl=","").replace("<td width=\"140\">IP:Port<td width=\"40\">Type<td width=\"90\">Speed<td width=\"160\"> Country/Area","").replace("", "");
38                }

39                File file=new File("f://2.txt");
40                FileWriter writer=new FileWriter(file,true);
41                writer.write(list);
42                writer.close();
43                String readFile,writerFile = null,t,t1;
44                FileReader br=new FileReader(file);
45                BufferedReader bufread  =   new  BufferedReader(br);
46                String port;               
47                String []port2=...{z,m,k,l,d,x,i,w,q,b};               
48                while ((readFile = bufread.readLine()) != null) ...{       
49                    if(readFile.length()>1)
50                    ...{
51                     Temp=readFile.split("HTTP");
52                     int a=Temp[0].trim().indexOf(":");
53                    port=Temp[0].trim().substring(a,Temp[0].trim().length()).replace("\"", "").replace("+", "").replace(":","");
54                    char []port1=port.toCharArray();
55                    String temp1 = null,temp2 = "";
56                    for(int j=0;j<port1.length;j++)
57                    ...{                       
58                        System.out.println(port1[j]);
59                        for(int e=0;e<port2.length;e++)
60                        ...{                           
61                            if(String.valueOf(port1[j]).equals(port2[e]))
62                            ...{
63                                temp2=temp2+temp1;
64                            }

65                        }

66                    }

67                    }

68                 }

69                bufread.close();
70                br.close();   
71            }

72                   
73       
74    }

75
76
77
78}

79
source: http://www.pmjava.com/blogview.asp?id=351

你可能感兴趣的:(职场,HtmlParser,休闲)