Java 获取网页源码并提取标题信息

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
/**
 * Downloads a web page and extracts the text of its {@code <title>} element, and from that the
 * account name that precedes the "'s Weibo" marker in a microblog profile title.
 *
 * <p>Every lookup is best-effort: when the page cannot be downloaded, or the expected markers are
 * not present, the public methods return {@code null} instead of throwing.
 */
public class URLSource {
  /** Opening tag that precedes the title text (matched case-sensitively). */
  private static final String TITLE_OPEN = "<title>";

  /** Closing tag that follows the title text (matched case-sensitively). */
  private static final String TITLE_CLOSE = "</title>";

  /**
   * The three-character marker meaning "'s Weibo" that follows the account name in the title.
   * Written with Unicode escapes so matching does not depend on the source-file encoding.
   */
  private static final String WEIBO_SUFFIX = "\u7684\u5fae\u535a";

  /** Key that introduces the encoding name inside the page markup. */
  private static final String CHARSET_KEY = "charset=";

  /** Encoding assumed when the page does not declare a usable one. */
  private static final String DEFAULT_CHARSET = "GBK";

  /** Size of the buffer used while downloading. */
  private static final int BUFFER_SIZE = 4096;

  /**
   * Returns the text between the first {@code <title>} tag and the following {@code </title>}
   * tag of the page at {@code url}.
   *
   * @param url address of the page; any scheme supported by {@link URL} works
   * @return the raw title text, or {@code null} if the page could not be downloaded or contains
   *     no complete title element
   */
  public static String getTitleName(String url) {
    String source = getSource(url);
    if (source == null) {
      return null;
    }
    int open = source.indexOf(TITLE_OPEN);
    if (open < 0) {
      return null;
    }
    int start = open + TITLE_OPEN.length();
    // Search for the closing tag after the opening one so an earlier stray "</title>" cannot
    // produce an inverted range.
    int end = source.indexOf(TITLE_CLOSE, start);
    if (end < 0) {
      return null;
    }
    return source.substring(start, end);
  }

  /**
   * Returns the account name of a microblog page: the part of the page title that precedes the
   * "'s Weibo" marker.
   *
   * @param url address of the profile page
   * @return the account name, or {@code null} if the title is unavailable or does not contain
   *     the marker
   */
  public static String getWeiboName(String url) {
    String title = getTitleName(url);
    if (title == null) {
      return null;
    }
    int marker = title.indexOf(WEIBO_SUFFIX);
    if (marker < 0) {
      return null;
    }
    return title.substring(0, marker);
  }

  /**
   * Downloads the page at {@code link} and decodes it with the encoding the page declares,
   * falling back to the default encoding when none is declared or the declared one is unknown.
   *
   * @param link address of the page
   * @return the decoded page text, or {@code null} if nothing could be downloaded
   */
  private static String getSource(String link) {
    if (link == null) {
      return null;
    }
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try {
      URLConnection connection = new URL(link).openConnection();
      try (InputStream in = connection.getInputStream()) {
        byte[] buffer = new byte[BUFFER_SIZE];
        int count;
        while ((count = in.read(buffer)) >= 0) {
          bytes.write(buffer, 0, count);
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
      // Keep whatever was received before the failure; give up only if nothing arrived.
      if (bytes.size() == 0) {
        return null;
      }
    }
    byte[] raw = bytes.toByteArray();
    return new String(raw, detectCharset(raw));
  }

  /**
   * Finds the encoding named after the first {@code charset=} in the raw page bytes. Both the
   * {@code content="text/html; charset=utf-8"} and the {@code charset="utf-8"} forms are handled.
   */
  private static Charset detectCharset(byte[] raw) {
    // ISO-8859-1 maps each byte to exactly one char, so the ASCII marker can be located without
    // knowing the real encoding and without depending on the platform default charset.
    String ascii = new String(raw, StandardCharsets.ISO_8859_1);
    int key = ascii.indexOf(CHARSET_KEY);
    if (key >= 0) {
      int start = key + CHARSET_KEY.length();
      if (start < ascii.length() && (ascii.charAt(start) == '"' || ascii.charAt(start) == '\'')) {
        start++; // skip the opening quote of the attribute value
      }
      int end = start;
      while (end < ascii.length() && isCharsetNameChar(ascii.charAt(end))) {
        end++;
      }
      if (end > start) {
        try {
          return Charset.forName(ascii.substring(start, end));
        } catch (IllegalArgumentException ignored) {
          // Malformed or unsupported name declared by the page: use the default below.
        }
      }
    }
    return defaultCharset();
  }

  /** Tells whether {@code c} may appear in a charset name (letters, digits, and {@code -+.:_}). */
  private static boolean isCharsetNameChar(char c) {
    return (c >= 'a' && c <= 'z')
        || (c >= 'A' && c <= 'Z')
        || (c >= '0' && c <= '9')
        || c == '-'
        || c == '+'
        || c == '.'
        || c == ':'
        || c == '_';
  }

  /** Returns the default page encoding, or UTF-8 on a runtime that does not provide it. */
  private static Charset defaultCharset() {
    try {
      return Charset.forName(DEFAULT_CHARSET);
    } catch (IllegalArgumentException e) {
      return StandardCharsets.UTF_8;
    }
  }

  /** Prints the account name extracted from a sample profile page. */
  public static void main(String[] arg) {
    System.out.println(getWeiboName("http://t.qq.com/hewenna"));
  }
}

你可能感兴趣的:(java技术)