import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;

/**
 * Minimal HTTP demo: fetches one fixed page and prints its response headers
 * followed by its body to standard output.
 */
public class HttpConnTest {

    /**
     * Connects to the hard-coded page, prints every response header field as
     * {@code name:values}, then prints the body line by line (decoded as UTF-8).
     *
     * @param args ignored
     * @throws Exception if the URL is malformed or the connection/read fails
     */
    public static void main(String[] args) throws Exception {
        // Alternative target: http://javaeye.com
        URL url = new URL("http://blog.sina.com.cn/buptaa");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.connect();

            // Print the response header fields.
            for (Map.Entry<String, List<String>> field : conn.getHeaderFields().entrySet()) {
                System.out.println(field.getKey() + ":" + field.getValue());
            }

            // Print the response body.
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "UTF-8"));
            try {
                String str;
                while ((str = br.readLine()) != null) {
                    System.out.println(str);
                }
            } finally {
                // Close the reader even when reading fails part-way through.
                br.close();
            }
        } finally {
            // Always release the connection, including on error paths.
            conn.disconnect();
        }
    }
}
基于广度优先算法，在上面代码的基础上实现的简易爬虫如下：
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.regex.*;

/**
 * Simple breadth-first crawler: starting from a fixed seed page it fetches
 * pages in the order they were discovered, prints each page's response headers
 * and body, and queues every absolute http/https link found in the body.
 */
public class HttpConnTest {

    /** Page the crawl starts from. */
    private static final String SEED_URL = "http://blog.sina.com.cn/buptaa";

    /** Upper bound on the number of pages fetched in one run. */
    private static final int MAX_PAGES = 4;

    /**
     * Matches an absolute http/https URL up to the next whitespace, quote,
     * angle bracket or parenthesis. Only http(s) is accepted because every
     * queued URL is later opened as an {@link HttpURLConnection}.
     * Compiled once instead of on every page fetch.
     */
    private static final Pattern URL_PATTERN =
            Pattern.compile("https?://[^\\s\"'<>()]+", Pattern.CASE_INSENSITIVE);

    /** FIFO frontier of discovered pages that have not been fetched yet. */
    private final Queue<URL> urlList = new ArrayDeque<URL>();

    /**
     * String form of every URL ever queued. Strings are used rather than
     * {@link URL} objects because {@code URL.equals}/{@code hashCode} may
     * resolve host names.
     */
    private final Set<String> seen = new HashSet<String>();

    /** Number of pages fetched so far. */
    private int count = 0;

    /**
     * Extracts the absolute http/https URLs contained in one line of text.
     * Package-private and free of I/O so it can be checked in isolation.
     *
     * @param line text to scan; must not be null
     * @return the URLs found, in order of appearance (possibly empty);
     *         matches that cannot be parsed as a {@link URL} are skipped
     */
    static List<URL> extractUrls(String line) {
        List<URL> found = new ArrayList<URL>();
        Matcher m = URL_PATTERN.matcher(line);
        while (m.find()) {
            try {
                found.add(new URL(m.group()));
            } catch (MalformedURLException ignored) {
                // A bad link in the page (e.g. a non-numeric port) must not
                // abort the crawl; just skip it.
            }
        }
        return found;
    }

    /** Adds {@code url} to the frontier unless it has been queued before. */
    private void enqueue(URL url) {
        if (seen.add(url.toString())) {
            urlList.add(url);
        }
    }

    /**
     * Fetches the next page from the frontier, prints its headers and body,
     * queues the links found in the body, then prints the frontier.
     * Does nothing when the frontier is empty.
     *
     * @throws IOException if the page cannot be fetched or read
     */
    private void doHttpConn() throws IOException {
        // Remove the head so that each call advances to a new page.
        URL url = urlList.poll();
        if (url == null) {
            return;
        }
        count++;

        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.connect();

            // Print the response header fields.
            for (Map.Entry<String, List<String>> field : conn.getHeaderFields().entrySet()) {
                System.out.println(field.getKey() + ":" + field.getValue());
            }

            // Print the response body and collect the links it contains.
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "UTF-8"));
            try {
                String str;
                while ((str = br.readLine()) != null) {
                    System.out.println(str);
                    for (URL link : extractUrls(str)) {
                        enqueue(link);
                    }
                }
            } finally {
                br.close();
            }
        } finally {
            // Always release the connection, including on error paths.
            conn.disconnect();
        }

        System.out.println("-----------------------");
        System.out.println(urlList.size());
        for (URL aurl : urlList) {
            System.out.println(aurl.toString());
        }
    }

    /**
     * Crawls breadth-first from the seed page until {@code MAX_PAGES} pages
     * have been fetched or no unvisited links remain.
     *
     * @param args ignored
     * @throws Exception if the seed URL is malformed or a page fetch fails
     */
    public static void main(String[] args) throws Exception {
        HttpConnTest hct = new HttpConnTest();
        hct.enqueue(new URL(SEED_URL));
        while (hct.count < MAX_PAGES && !hct.urlList.isEmpty()) {
            hct.doHttpConn();
        }
        System.out.println("---DONE---" + hct.count);
    }
}