因为正则表达式比较复杂，而且很容易出错，所以我没有用正则，而是自己写了一个程序，大概能抓取 80% 的网页（一些特别复杂的网页除外）。
步骤
1、下载网页内容
2、去标签
/**
 * Downloads the content behind the {@code url} field and saves it to a file.
 *
 * <p>The raw bytes are copied unchanged, so the saved file keeps whatever
 * encoding the server sent. As side effects, the {@code path} field is set
 * to {@code strpath} and the {@code tempstr} field is reset to the empty
 * string before the transfer starts.
 *
 * <p>Failures are reported with {@code printStackTrace()} and otherwise
 * ignored (best effort); nothing is thrown to the caller.
 *
 * @param strpath name of the file the downloaded bytes are written to
 */
public void DownLoad(String strpath)
{
    byte[] b = new byte[1024];
    int offset;
    path = strpath;
    tempstr = "";
    FileOutputStream fout = null;
    try
    {
        // Open a connection to the URL and obtain a stream to read from it.
        is = url.openStream();
        fout = new FileOutputStream(strpath);
        // read() fills at most b.length bytes and returns -1 at end of stream;
        // only the first 'offset' bytes of the buffer are valid on each pass.
        while ((offset = is.read(b)) != -1)
        {
            fout.write(b, 0, offset);
        }
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    finally
    {
        // Release both streams on the success path and the failure path alike.
        if (fout != null)
        {
            try
            {
                fout.close();
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
        }
        if (is != null)
        {
            try
            {
                is.close();
            }
            catch (Exception e)
            {
                e.printStackTrace();
            }
        }
    }
}
2、去标签
}
/**
 * Deletes every span that begins with {@code start} and runs through the
 * next occurrence of {@code end} (both markers included).
 *
 * <p>The buffer is modified in place and the same instance is returned.
 * After each deletion the search restarts from the beginning of the buffer.
 * If a {@code start} marker has no matching {@code end} after it, processing
 * stops and the remaining text, including that unmatched marker, is left
 * untouched.
 *
 * @param start marker that opens a span to delete; an empty marker deletes nothing
 * @param end   marker that closes the span; searched for from one character
 *              past the position where {@code start} was found
 * @param str   buffer to edit in place
 * @return {@code str}, after the deletions
 */
public StringBuffer DeleteIndexof(String start,String end,StringBuffer str)
{
    // An empty start marker matches at every position and would never run out.
    if (start.length() == 0)
    {
        return str;
    }
    while (true)
    {
        int s = str.indexOf(start, 0);
        if (s == -1)
        {
            break;
        }
        int endPos = str.indexOf(end, s + 1);
        if (endPos == -1)
        {
            // Unterminated span: there is no valid range to delete.
            break;
        }
        str.delete(s, endPos + end.length());
    }
    return str;
}
/**
 * Deletes every occurrence of the given string from {@code str}.
 *
 * <p>{@code start} is matched literally, not as a regular expression, so
 * markers containing characters such as {@code .}, {@code (} or {@code *}
 * are removed as written.
 *
 * @param start the exact text to remove
 * @param str   the text to remove it from
 * @return a copy of {@code str} with all occurrences of {@code start} removed
 */
public String DeleteIndexof(String start,String str)
{
    return str.replace(start, "");
}
/**
 * Reads the file named by the {@code path} field, strips scripts, styles,
 * quoted strings and tags from it, and writes the remaining text to
 * {@code strpathout}.
 *
 * <p>The input is decoded with the {@code geshi} field (declared elsewhere;
 * presumably the page's encoding). Line terminators are normalised to
 * {@code "\n"}, and a trailing {@code "\n"} is appended after the last line.
 * The cleaned text is also stored in the {@code tempstr} field. The output
 * is encoded with the platform default charset.
 *
 * @param strpathout name of the file the cleaned text is written to
 * @throws IOException if the input cannot be read or the output cannot be written
 */
public void DeleteTag(String strpathout) throws IOException
{
    StringBuffer sbstr = new StringBuffer();
    FileInputStream fin = new FileInputStream(new File(path));
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(fin, geshi));
        String t;
        while ((t = br.readLine()) != null)
        {
            sbstr.append(t);
            sbstr.append("\n");
        }
    }
    finally
    {
        // Closing the underlying stream releases the file even when reading fails.
        fin.close();
    }

    // Strip the unwanted markup. The order matters: script and style elements
    // go first so their bodies are removed together with their tags; quoted
    // strings go next (presumably so a '>' inside an attribute value cannot
    // end a tag early); the generic <...> pass runs last.
    sbstr = DeleteIndexof("<script","</script>",sbstr);
    sbstr = DeleteIndexof("<style","</style>",sbstr);
    sbstr = DeleteIndexof("\"","\"",sbstr);
    sbstr = DeleteIndexof("<",">",sbstr);
    tempstr = sbstr.toString();
    System.out.println("转换完成");

    FileOutputStream fout = new FileOutputStream(new File(strpathout));
    try
    {
        fout.write(tempstr.getBytes());
    }
    finally
    {
        fout.close();
    }
}