The rough idea: start from a given URL, use the open-source HtmlParser toolkit to extract the links on that page, push them onto a queue, then take the URL at the head of the queue and repeat.
It needs the HttpClient and HtmlParser packages.
The full source of all the .java files is given below.
There are still some problems in it; I don't want to look at them today, so they can all wait until tomorrow.
Update: I made some changes today. It still can't save pages to file, but there shouldn't be any other problems left. The main changes: the URL bookkeeping now goes through a Queue, and since extracted URLs could be invalid, I added an http check on them.
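On that last point: the check I added is linkUrl.contains("http"), which is loose — it also accepts strings that merely mention "http" somewhere, such as javascript: pseudo-links. A stricter variant would look something like the sketch below (UrlCheck and isValidHttpUrl are hypothetical names of mine, not used by the crawler code that follows):

// Sketch only: a stricter filter than contains("http") for extracted links.
// UrlCheck / isValidHttpUrl are hypothetical names, not part of the crawler below.
public class UrlCheck {

    public static boolean isValidHttpUrl(String link) {
        // Accept only links that actually begin with an http/https scheme,
        // not links that merely contain "http" somewhere in the string.
        return link != null
                && (link.startsWith("http://") || link.startsWith("https://"));
    }

    public static void main(String[] args) {
        System.out.println(isValidHttpUrl("http://www.baidu.com"));      // true
        System.out.println(isValidHttpUrl("javascript:go('http://a')")); // false
    }
}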
-------------------------------------------------------------------------------------MyCrawler.java---------------------------------------------------------------------------------
package MyCrawler;

import java.util.Set;

/**
 * @author xkey
 */
public class MyCrawler {

    /**
     * Initialize the URL queue with the seed URLs.
     * @param seeds the seed URLs
     */
    private static void initCrawlerWithSeeds(String[] seeds) {
        for (int i = 0; i < seeds.length; i++) {
            LinkQueue.addUnVisitedUrl(seeds[i]);
        }
    }

    /**
     * The crawling process.
     * @param seeds the seed URLs
     */
    public static void crawling(String[] seeds) {
        initCrawlerWithSeeds(seeds);
        while (!LinkQueue.unVisitedUrlIsEmpty() && LinkQueue.getVisitedUrlNum() < 1000) {
            // URL at the head of the queue
            String visitUrl = (String) LinkQueue.unVisitedUrlDequeue();
            if (visitUrl == null) continue;
            System.out.println(visitUrl);
            DownLoaderFile downLoader = new DownLoaderFile();
            downLoader.downLoaderFile(visitUrl);
            LinkQueue.addVisitedUrl(visitUrl);
            // Extract the URLs from the downloaded page
            Set<String> links = HtmlParserTool.extracLinks(visitUrl);
            // Enqueue the new, not-yet-visited URLs
            for (String link : links) {
                System.out.println("xkey: " + link);
                LinkQueue.addUnVisitedUrl(link);
            }
        }
    }

    public static void main(String[] args) {
        MyCrawler.crawling(new String[]{"http://www.baidu.com"});
    }
}
-------------------------------------------------------------------------------------LinkQueue.java---------------------------------------------------------------------------------
package MyCrawler;

import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;

public class LinkQueue {
    // Set of URLs that have already been visited
    private static Set<String> visitedUrl = new HashSet<String>();
    // Queue of URLs waiting to be visited
    private static Queue<String> unVisitedUrl = new ConcurrentLinkedQueue<String>();

    // Get the queue of unvisited URLs
    public static Queue<String> getUnVisitedUrl() {
        return unVisitedUrl;
    }

    // Add a URL to the visited set
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    // Remove a URL from the visited set
    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    // Dequeue an unvisited URL
    public static Object unVisitedUrlDequeue() {
        return unVisitedUrl.poll();
    }

    // Guarantee that each URL is visited only once
    public static void addUnVisitedUrl(String url) {
        if (url != null && !url.trim().equals("")
                && !visitedUrl.contains(url) && !unVisitedUrl.contains(url))
            unVisitedUrl.add(url);
    }

    // Number of URLs already visited
    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }

    // Whether the unvisited-URL queue is empty
    public static boolean unVisitedUrlIsEmpty() {
        return unVisitedUrl.isEmpty();
    }
}
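A quick sanity check of the visit-once guarantee (a throwaway demo class of mine, not part of the crawler; it assumes it sits in the same MyCrawler package):

package MyCrawler;

// Throwaway demo: shows LinkQueue's dedup behavior.
public class LinkQueueDemo {
    public static void main(String[] args) {
        LinkQueue.addUnVisitedUrl("http://www.baidu.com");
        LinkQueue.addUnVisitedUrl("http://www.baidu.com"); // already queued -> ignored
        LinkQueue.addUnVisitedUrl("");                     // blank -> ignored

        String url = (String) LinkQueue.unVisitedUrlDequeue();
        LinkQueue.addVisitedUrl(url);
        LinkQueue.addUnVisitedUrl(url); // already visited -> ignored

        System.out.println(LinkQueue.unVisitedUrlIsEmpty()); // true
        System.out.println(LinkQueue.getVisitedUrlNum());    // 1
    }
}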
-------------------------------------------------------------------------------------DownLoaderFile.java---------------------------------------------------------------------------------
package MyCrawler;

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

public class DownLoaderFile {

    // Derive a local file name from the URL and the Content-Type header
    public String getFileNameByUrl(String url, String contentType) {
        // Strip the leading "http://"
        url = url.substring(7);
        // text/html type
        if (contentType.indexOf("html") != -1) {
            return url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
        }
        // Other types, e.g. application/pdf
        else {
            return url.replaceAll("[\\?/:*|<>\"]", "_") + "."
                    + contentType.substring(contentType.lastIndexOf("/") + 1);
        }
    }

    /**
     * Save the page's byte array to a local file; filePath is the relative
     * path of the file to save.
     */
    private void saveToLocal(byte[] data, String filePath) {
        try {
            File file = new File(filePath);
            // Make sure the target directory exists, otherwise the write fails
            if (file.getParentFile() != null) file.getParentFile().mkdirs();
            DataOutputStream out = new DataOutputStream(new FileOutputStream(file));
            for (int i = 0; i < data.length; i++) {
                out.write(data[i]);
            }
            out.flush();
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Download the page the URL points to
    public String downLoaderFile(String url) {
        String filePath = null;
        // 1. Create the HttpClient instance and set its parameters
        HttpClient httpClient = new HttpClient();
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
        // 2. Create the GetMethod instance and set its parameters
        GetMethod getMethod = new GetMethod(url);
        // GET request timeout: 5 s
        getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        // 3. Execute the HTTP GET request
        try {
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: " + getMethod.getStatusLine());
                filePath = null;
            }
            // 4. Handle the HTTP response body
            byte[] responseBody = getMethod.getResponseBody();
            // Generate the file name to save under from the page URL
            // filePath = "D:\\xkey\\" + getFileNameByUrl(url,
            //         getMethod.getResponseHeader("Content-Type").getValue());
            // saveToLocal(responseBody, filePath);
        } catch (HttpException e) {
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            getMethod.releaseConnection();
        }
        return filePath;
    }
}
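For reference, here is what getFileNameByUrl produces (a throwaway demo of mine, same package assumed). Note also that FileOutputStream does not create missing directories, which is my guess at why saving under D:\xkey keeps failing; the mkdirs() guard in saveToLocal above is meant to cover that case.

package MyCrawler;

// Throwaway demo: what getFileNameByUrl maps URLs to.
public class FileNameDemo {
    public static void main(String[] args) {
        DownLoaderFile d = new DownLoaderFile();
        // html page: "_"-escaped host/path plus ".html"
        System.out.println(d.getFileNameByUrl("http://www.baidu.com/index.html", "text/html"));
        // prints: www.baidu.com_index.html.html
        // other types: extension comes from the Content-Type subtype
        System.out.println(d.getFileNameByUrl("http://example.com/a.pdf", "application/pdf"));
        // prints: example.com_a.pdf.pdf
    }
}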
-------------------------------------------------------------------------------------HtmlParserTool.java---------------------------------------------------------------------------------
package MyCrawler;

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class HtmlParserTool {

    // Extract the URLs found on a page
    public static Set<String> extracLinks(String url) {
        Set<String> links = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            parser.setEncoding("utf-8");
            // Matches <frame> tags such as <frame src="test.html">
            NodeFilter frameFilter = new NodeFilter() {
                public boolean accept(Node node) {
                    return node.getText().startsWith("frame src=");
                }
            };
            // Match either <a> tags or <frame> tags
            OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < list.size(); i++) {
                Node tag = list.elementAt(i);
                if (tag instanceof LinkTag) { // <a> tag
                    LinkTag link = (LinkTag) tag;
                    String linkUrl = link.getLink(); // the URL
                    if (linkUrl.contains("http"))
                        links.add(linkUrl);
                } else { // <frame> tag
                    // Extract the link in the src attribute, e.g. <frame src="test.html">
                    String frame = tag.getText();
                    int start = frame.indexOf("src=");
                    frame = frame.substring(start);
                    int end = frame.indexOf(" ");
                    if (end == -1)
                        end = frame.indexOf(">");
                    if (end == -1) // neither found: take everything up to the closing quote
                        end = frame.length();
                    String frameUrl = frame.substring(5, end - 1);
                    if (frameUrl.contains("http"))
                        links.add(frameUrl);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return links;
    }
}
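To test the extractor on its own (a throwaway driver of mine, same package assumed; needs a working network connection):

package MyCrawler;

import java.util.Set;

// Throwaway driver: run the link extractor on one page and print the results.
public class ParserDemo {
    public static void main(String[] args) {
        Set<String> links = HtmlParserTool.extracLinks("http://www.baidu.com");
        System.out.println("found " + links.size() + " links:");
        for (String link : links) {
            System.out.println(link);
        }
    }
}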