An embedded database is used to store the URLs that are still to be visited, and a Bloom filter records the URLs that have already been visited. Previously, the to-visit URLs were kept in an in-memory collection and the visited URLs were tracked with a HashSet.
Bloom filter
package hashfilter;

import java.util.BitSet;

import bdb.CrawlUrl;

public class SimpleBloomFilter {
    private static final int DEFAULT_SIZE = 2 << 24;
    private static final int seeds[] = {7, 11, 13, 31, 37, 61};
    private BitSet bits = new BitSet(DEFAULT_SIZE);
    private SimpleHash func[] = new SimpleHash[seeds.length];

    public SimpleBloomFilter() {
        int n = func.length;
        for (int i = 0; i < n; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, seeds[i]);
        }
    }

    public void add(CrawlUrl crawlUrl) {
        add(crawlUrl.getOriUrl());
    }

    private void add(String value) {
        if (value != null) {
            for (SimpleHash f : func) {
                bits.set(f.hash(value), true);
            }
        }
    }

    public boolean contains(CrawlUrl crawlUrl) {
        return contains(crawlUrl.getOriUrl());
    }

    private boolean contains(String value) {
        if (value == null) {
            return false;
        }
        boolean ret = true;
        for (SimpleHash f : func) {
            ret = ret && bits.get(f.hash(value));
        }
        return ret;
    }
}
package hashfilter;

public class SimpleHash {
    private int cap;
    private int seed;

    public SimpleHash(int cap, int seed) {
        this.cap = cap;
        this.seed = seed;
    }

    public int hash(String value) {
        int result = 0;
        int n = value.length();
        for (int i = 0; i < n; i++) {
            result = result * seed + value.charAt(i);
        }
        // cap is a power of two, so (cap - 1) acts as a bit mask that keeps the result in range
        return (cap - 1) & result;
    }
}
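A Bloom filter never reports a visited URL as unvisited, but it can occasionally report an unvisited URL as visited, so a small fraction of pages may be silently skipped. Below is a rough, hypothetical sketch (not part of the original code) that estimates this false-positive rate from the parameters above, using the standard approximation p ≈ (1 - e^(-k*n/m))^k with m = 2 << 24 bits and k = 6 hash functions; the one million inserted URLs is an assumed workload.

public class BloomFilterEstimate {
    public static void main(String[] args) {
        double m = 2 << 24;     // bits in SimpleBloomFilter (DEFAULT_SIZE)
        double k = 6;           // number of hash functions (length of seeds[])
        double n = 1_000_000;   // assumed number of URLs inserted
        // standard Bloom filter false-positive approximation
        double p = Math.pow(1 - Math.exp(-k * n / m), k);
        System.out.printf("estimated false-positive rate: %.6f%n", p);
    }
}

With these parameters the estimate comes out on the order of 2e-5, i.e. roughly one skipped page in fifty thousand, which is usually acceptable for a crawler of this size.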
package bdb;

import java.io.Serializable;
import java.util.Date;

import com.sleepycat.je.utilint.Timestamp;

public class CrawlUrl implements Serializable {
    private static final long serialVersionUID = 7931672194843948629L;

    public CrawlUrl() {
    }

    private String oriUrl;            // original URL; its host part is a domain name
    private String url;               // URL whose host part is an IP, to prevent duplicate hosts
    private int urlNo;                // URL number
    private int statusCode;           // status code returned when the URL was fetched
    private int hitNum;               // how many times this URL is referenced by other articles
    private String charSet;           // character encoding of the article behind this URL
    private String abstractText;      // article abstract
    private String author;            // author
    private int weight;               // article weight (carries the guide-word information)
    private String description;       // article description
    private int fileSize;             // article size
    private Timestamp lastUpdateTime; // last modification time
    private Date timeToLive;          // expiration time
    private String title;             // article title
    private String type;              // article type
    private String[] urlRefrences;    // links referenced by the article
    private int layer;                // crawl depth: the seed is layer 0, its links layer 1, and so on

    public int getLayer() { return layer; }
    public void setLayer(int layer) { this.layer = layer; }
    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public int getUrlNo() { return urlNo; }
    public void setUrlNo(int urlNo) { this.urlNo = urlNo; }
    public int getStatusCode() { return statusCode; }
    public void setStatusCode(int statusCode) { this.statusCode = statusCode; }
    public int getHitNum() { return hitNum; }
    public void setHitNum(int hitNum) { this.hitNum = hitNum; }
    public String getCharSet() { return charSet; }
    public void setCharSet(String charSet) { this.charSet = charSet; }
    public String getAbstractText() { return abstractText; }
    public void setAbstractText(String abstractText) { this.abstractText = abstractText; }
    public String getAuthor() { return author; }
    public void setAuthor(String author) { this.author = author; }
    public int getWeight() { return weight; }
    public void setWeight(int weight) { this.weight = weight; }
    public String getDescription() { return description; }
    public void setDescription(String description) { this.description = description; }
    public int getFileSize() { return fileSize; }
    public void setFileSize(int fileSize) { this.fileSize = fileSize; }
    public Timestamp getLastUpdateTime() { return lastUpdateTime; }
    public void setLastUpdateTime(Timestamp lastUpdateTime) { this.lastUpdateTime = lastUpdateTime; }
    public Date getTimeToLive() { return timeToLive; }
    public void setTimeToLive(Date timeToLive) { this.timeToLive = timeToLive; }
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getType() { return type; }
    public void setType(String type) { this.type = type; }
    public String[] getUrlRefrences() { return urlRefrences; }
    public void setUrlRefrences(String[] urlRefrences) { this.urlRefrences = urlRefrences; }
    public final String getOriUrl() { return oriUrl; }
    public void setOriUrl(String oriUrl) { this.oriUrl = oriUrl; }
}
package bdb;

public interface Frontier {
    public CrawlUrl getNext() throws Exception;
    public boolean putUrl(CrawlUrl url) throws Exception;
}
package bdb;

import java.io.File;

import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

public abstract class AbstractFrontier {
    private Environment env;
    private static final String CLASS_CATALOG = "java_class_catalog";
    protected StoredClassCatalog javaCatalog;
    protected Database catalogdatabase;
    protected Database database;

    public AbstractFrontier(String homeDirectory) {
        System.out.println("Opening environment in: " + homeDirectory);
        EnvironmentConfig envConfig = new EnvironmentConfig();
        envConfig.setTransactional(true);
        envConfig.setAllowCreate(true);
        env = new Environment(new File(homeDirectory), envConfig);

        DatabaseConfig dbConfig = new DatabaseConfig();
        dbConfig.setAllowCreate(true);
        dbConfig.setTransactional(true);
        // A single StoredClassCatalog object is normally used along with a set of
        // databases that store serialized objects.
        catalogdatabase = env.openDatabase(null, CLASS_CATALOG, dbConfig);
        javaCatalog = new StoredClassCatalog(catalogdatabase);

        DatabaseConfig dbConfig0 = new DatabaseConfig();
        dbConfig0.setAllowCreate(true);
        dbConfig0.setTransactional(true);
        // the database that holds the pending URLs, keyed by URL
        database = env.openDatabase(null, "URL", dbConfig0);
    }

    public void close() {
        database.close();
        javaCatalog.close();
        env.close();
    }

    protected abstract void put(Object key, Object value);

    protected abstract Object get(Object key);

    protected abstract Object delete(Object key);
}
package bdb;

import java.util.Map.Entry;

import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.collections.StoredMap;

public class BDBFrontier extends AbstractFrontier implements Frontier {
    private StoredMap pendingUrisDB = null;

    public BDBFrontier(String homeDirectory) {
        super(homeDirectory);
        // A DatabaseEntry can be obtained in two ways: from its constructor, which takes
        // the object's bytes, or from EntryBinding.objectToEntry(); here SerialBinding
        // handles the serialization.
        EntryBinding keyBinding = new SerialBinding(javaCatalog, String.class);
        EntryBinding valueBinding = new SerialBinding(javaCatalog, CrawlUrl.class);
        // creates a map view of the underlying Database
        pendingUrisDB = new StoredMap(database, keyBinding, valueBinding, true);
    }

    @Override
    public CrawlUrl getNext() throws Exception {
        CrawlUrl result = null;
        if (!pendingUrisDB.isEmpty()) {
            Entry<String, CrawlUrl> entry =
                    (Entry<String, CrawlUrl>) pendingUrisDB.entrySet().iterator().next();
            result = entry.getValue();
            delete(entry.getKey());
        }
        return result;
    }

    @Override
    public boolean putUrl(CrawlUrl url) throws Exception {
        put(url.getOriUrl(), url);
        return true;
    }

    @Override
    protected void put(Object key, Object value) {
        pendingUrisDB.put(key, value);
    }

    @Override
    protected Object get(Object key) {
        return pendingUrisDB.get(key);
    }

    @Override
    protected Object delete(Object key) {
        return pendingUrisDB.remove(key);
    }

    // The key can be derived from the URL; any compression algorithm, MD5 included,
    // could be used here. For now the URL itself is used unchanged.
    private String calulateUrl(String url) {
        return url;
    }

    public boolean contains(CrawlUrl url) {
        return pendingUrisDB.containsKey(url.getOriUrl());
    }

    public boolean isEmpty() {
        return pendingUrisDB.isEmpty();
    }

    // Test driver:
    // public static void main(String[] args) {
    //     BDBFrontier bDBFrontier = new BDBFrontier("D:\\bdb");
    //     CrawlUrl url = new CrawlUrl();
    //     url.setOriUrl("http://www.baidu.com");
    //     try {
    //         bDBFrontier.putUrl(url);
    //         System.out.println(bDBFrontier.getNext().getOriUrl());
    //         bDBFrontier.close();
    //     } catch (Exception e) {
    //         e.printStackTrace();
    //     }
    // }
}
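The calulateUrl method above currently returns the URL unchanged; as its comment notes, the key could instead be compressed with a digest such as MD5. A hypothetical sketch of that variant using the JDK's java.security.MessageDigest is shown below; UrlKeyUtil and md5Key are illustrative names that do not appear in the original code.

import java.security.MessageDigest;

public class UrlKeyUtil {
    // Returns a 32-character hex MD5 digest of the URL, suitable as a fixed-length key.
    public static String md5Key(String url) {
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            byte[] digest = md.digest(url.getBytes("UTF-8"));
            StringBuilder sb = new StringBuilder();
            for (byte b : digest) {
                sb.append(String.format("%02x", b)); // hex-encode each byte
            }
            return sb.toString();
        } catch (Exception e) {
            // fall back to the raw URL if MD5 or UTF-8 is unavailable
            return url;
        }
    }
}

A fixed-length digest keeps the Berkeley DB keys short and uniform regardless of URL length, at the cost of losing the ability to read the key as a URL when inspecting the database.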
import bdb.BDBFrontier;
import bdb.CrawlUrl;
import hashfilter.SimpleBloomFilter;

public class NewLinkQueue {
    // visited URLs are recorded in the Bloom filter
    private static SimpleBloomFilter visitedUrl = new SimpleBloomFilter();
    // URLs still to be visited are kept in the Berkeley DB backed frontier
    private static BDBFrontier unvistedUrl = new BDBFrontier("D:\\bdb");

    public static Object unvisitedUrlDeQueue() throws Exception {
        return unvistedUrl.getNext().getOriUrl();
    }

    public static void addUnvisitedUrl(String url) {
        CrawlUrl crawlUrl = new CrawlUrl();
        crawlUrl.setOriUrl(url);
        // enqueue only non-empty URLs that are neither pending nor already visited
        if (url != null && !url.trim().equals("")
                && !unvistedUrl.contains(crawlUrl) && !visitedUrl.contains(crawlUrl)) {
            try {
                unvistedUrl.putUrl(crawlUrl);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static boolean unvisitedUrlIsEmpty() {
        return unvistedUrl.isEmpty();
    }

    public static void addVisitedUrl(String url) {
        CrawlUrl crawlUrl = new CrawlUrl();
        crawlUrl.setOriUrl(url);
        visitedUrl.add(crawlUrl);
    }
}
Downloading web pages
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;

public class DownLoadFile {
    private String filePath;

    DownLoadFile() {
        filePath = null;
    }

    // Build the file name for a downloaded page from its URL and content type,
    // stripping characters that are illegal in file names.
    public String getFileNameByUrl(String url, String contentType) {
        url = url.substring(7); // drop the leading "http://"
        if (contentType.indexOf("html") != -1) {
            // text/html pages are saved with an .html extension
            url = url.replaceAll("[\\?/:|<>\"]", "_") + ".html";
            return url;
        } else {
            // otherwise use the subtype of the Content-Type as the extension
            return url.replaceAll("[\\?/:|<>\"]", "_") + "."
                    + contentType.substring(contentType.lastIndexOf("/") + 1);
        }
    }

    // Download the page the URL points to, save it locally, and return the local file path.
    public String downloadFile(String url) {
        System.out.println("link: " + url);
        HttpParams params = new BasicHttpParams();
        HttpConnectionParams.setConnectionTimeout(params, 10000);
        HttpConnectionParams.setSoTimeout(params, 10000);
        HttpClient httpClient = new DefaultHttpClient(params);
        try {
            HttpGet httpGet = new HttpGet(url);
            HttpResponse response = httpClient.execute(httpGet);
            System.out.println("Received HTTP response");
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                // (Earlier, commented-out versions sniffed the page encoding from the response
                // headers or from the first 1000 bytes of the body with a gb2312/GBK/UTF-8
                // regular expression before reading the content.)
                String contentType = response.getFirstHeader("Content-Type").getValue();
                System.out.println("Content-Type: " + contentType);
                InputStream responseBody = response.getEntity().getContent();
                filePath = "E:\\temp\\" + getFileNameByUrl(url, contentType);
                System.out.println("File path: " + filePath);
                FileOutputStream outputStream = new FileOutputStream(new File(filePath));
                int length = 0;
                byte b[] = new byte[1024];
                while ((length = responseBody.read(b)) != -1) {
                    outputStream.write(b, 0, length);
                }
                responseBody.close();
                outputStream.close();
            } else {
                System.err.print("Method Failed: " + response.getStatusLine().getStatusCode());
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filePath;
    }

    // Unused alternative: copy a response body to a local file.
    private void saveToLocal(InputStream responseBody, String filePath) throws IOException {
        FileOutputStream outputStream = new FileOutputStream(new File(filePath));
        int length;
        byte b[] = new byte[1024];
        while ((length = responseBody.read(b)) != -1) {
            outputStream.write(b, 0, length); // write only the bytes actually read
        }
        responseBody.close();
        outputStream.close();
    }
}
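downloadFile configures its timeouts through BasicHttpParams and DefaultHttpClient, both of which are deprecated from HttpClient 4.3 onward. A minimal sketch of the same two 10-second timeouts expressed through RequestConfig follows, assuming HttpClient 4.3+ is on the classpath; the class name TimeoutClientSketch is illustrative and not part of the original code.

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class TimeoutClientSketch {
    public static void main(String[] args) throws Exception {
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10_000)            // connection establishment timeout
                .setSocketTimeout(10_000)             // socket read timeout
                .setConnectionRequestTimeout(10_000)  // wait time for a pooled connection
                .build();
        try (CloseableHttpClient client = HttpClients.custom()
                        .setDefaultRequestConfig(config)
                        .build();
             CloseableHttpResponse response = client.execute(new HttpGet("http://www.baidu.com"))) {
            System.out.println(response.getStatusLine());
        }
    }
}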
Extracting links

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class HtmlParserTool {
    // Parse the page saved at filePath and return every link found in it.
    public static Set<String> extractLinks(String filePath) {
        Set<String> links = new HashSet<String>();
        NodeList nodeList;
        String line = "";
        StringBuffer sb = new StringBuffer();
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[]{linkFilter});
        try {
            // read the page back from the file it was saved to
            BufferedReader br = new BufferedReader(new FileReader(filePath));
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
            Parser parser = Parser.createParser(sb.toString(), "utf-8");
            nodeList = parser.parse(lastFilter);
            Node nodes[] = nodeList.toNodeArray();
            String link = null;
            for (int i = 0; i < nodes.length; i++) {
                if (nodes[i] instanceof LinkTag) { // <a> tag
                    LinkTag linkNode = (LinkTag) (nodes[i]);
                    link = linkNode.getLink();
                    links.add(link);
                } else { // <frame> tag
                    // extract the link in the src attribute, e.g. <frame src="test.html"/>
                    String frame = nodes[i].getText();
                    int start = frame.indexOf("src");
                    int end = frame.indexOf(" ");
                    if (end == -1) {
                        end = frame.indexOf(">");
                    }
                    String frameUrl = frame.substring(start + 5, end - 1);
                    links.add(frameUrl);
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return links;
        // (An earlier, commented-out version of this method parsed the page directly from
        // its URL with new Parser(url) and a gb2312 encoding instead of reading the saved file.)
    }
}
import java.util.Set;

public class MyClawler {
    // seed the queue of unvisited URLs
    private void initCrawlerWithSeeds(String[] seeds) {
        for (int i = 0; i < seeds.length; i++) {
            NewLinkQueue.addUnvisitedUrl(seeds[i]);
        }
    }

    public void crawling(String[] seeds) {
        // An optional link filter can restrict the crawl to a single site:
        // LinkFilter filter = new LinkFilter() {
        //     public boolean accept(String url) {
        //         return url.startsWith("http://www.baidu.com");
        //     }
        // };
        initCrawlerWithSeeds(seeds);
        DownLoadFile downLoader = new DownLoadFile();
        Set<String> links = null;
        String filePath = null;
        while (!NewLinkQueue.unvisitedUrlIsEmpty()) {
            String visitUrl;
            try {
                // dequeue the next unvisited URL
                visitUrl = (String) NewLinkQueue.unvisitedUrlDeQueue();
                System.out.println("Next unvisited URL: " + visitUrl);
                if (visitUrl == null) {
                    continue;
                }
                // download the page
                filePath = downLoader.downloadFile(visitUrl);
                // mark the URL as visited
                NewLinkQueue.addVisitedUrl(visitUrl);
                // extract the links from the downloaded page
                links = HtmlParserTool.extractLinks(filePath);
                System.out.println("Links found in the page: " + links.size());
                for (String link : links) {
                    // enqueue every extracted link as unvisited
                    NewLinkQueue.addUnvisitedUrl(link);
                    System.out.println(link);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static void main(String[] args) {
        MyClawler clawler = new MyClawler();
        clawler.crawling(new String[]{"http://www.baidu.com"});
        System.out.println("done");
    }
}
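The filter sketched in the commented-out block of crawling() refers to a LinkFilter type that does not appear in this section. A minimal sketch of what such an interface could look like, assuming it only needs a single accept method, is shown below.

// Hypothetical callback for deciding whether an extracted link should be enqueued.
public interface LinkFilter {
    boolean accept(String url);
}

With an interface like this in place, crawling() could test filter.accept(link) before calling NewLinkQueue.addUnvisitedUrl(link), so that only links under the seed host are enqueued instead of every URL the parser finds.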