An embedded Berkeley DB database is used to hold the URLs waiting to be visited, and a Bloom filter is used to record the URLs that have already been visited. Previously the to-visit URLs were simply kept in memory and the visited URLs were tracked with a HashSet.
Bloom filter
package hashfilter;
import java.util.BitSet;
import bdb.CrawlUrl;
public class SimpleBloomFilter {
private static final int DEFAULT_SIZE=2<<24;
private static final int seeds[]={7,11,13,31,37,61};
private BitSet bits=new BitSet(DEFAULT_SIZE);
private SimpleHash func[]=new SimpleHash[seeds.length];
public SimpleBloomFilter()
{
int n=func.length;
for(int i=0;i<n;i++)
func[i]=new SimpleHash(DEFAULT_SIZE,seeds[i]);
}
// Add a URL to the filter by setting the bit chosen by each hash function
public void add(CrawlUrl value)
{
for(SimpleHash f:func)
bits.set(f.hash(value.getOriUrl()),true);
}
// Test whether a URL has been added; false positives are possible, false negatives are not
public boolean contains(CrawlUrl value)
{
if(value==null)
return false;
boolean ret=true;
for(SimpleHash f:func)
ret=ret&&bits.get(f.hash(value.getOriUrl()));
return ret;
}
}
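To see the filter's behaviour in isolation, a small throwaway test can be run against it. This is a hypothetical demo class (it assumes the SimpleHash class shown next and the CrawlUrl class from the bdb package are on the classpath):

import bdb.CrawlUrl;
import hashfilter.SimpleBloomFilter;

// Hypothetical demo: a URL is reported as contained only after it has been added.
public class SimpleBloomFilterDemo {
    public static void main(String[] args) {
        SimpleBloomFilter filter = new SimpleBloomFilter();
        CrawlUrl url = new CrawlUrl();
        url.setOriUrl("http://www.baidu.com");
        System.out.println(filter.contains(url)); // false: nothing has been added yet
        filter.add(url);
        System.out.println(filter.contains(url)); // true: false positives possible, false negatives not
    }
}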
package hashfilter;
public class SimpleHash {
private int cap;
private int seed;
public SimpleHash(int cap,int seed)
{
this.cap=cap;
this.seed=seed;
}
public int hash(String value)
{
int result=0;
int n=value.length();
for(int i=0;i<n;i++)
result=seed*result+value.charAt(i);
// cap is a power of two, so masking with cap-1 keeps the index inside the bit set
return (cap-1)&result;
}
}
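Each SimpleHash is a seeded, Horner-style string hash, and because the filter's DEFAULT_SIZE (2 << 24, i.e. 2^25) is a power of two, the final (cap-1)&result mask keeps every index inside the bit set. A rough illustration (a hypothetical demo, not part of the crawler):

import hashfilter.SimpleHash;

// Hypothetical demo: the same URL hashed with different seeds lands on different bit positions.
public class SimpleHashDemo {
    public static void main(String[] args) {
        SimpleHash h7 = new SimpleHash(2 << 24, 7);
        SimpleHash h31 = new SimpleHash(2 << 24, 31);
        String url = "http://www.baidu.com";
        System.out.println(h7.hash(url));  // an index in [0, 2 << 24)
        System.out.println(h31.hash(url)); // almost certainly a different index
    }
}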
package bdb;
import java.io.Serializable;
import java.util.Date;
import com.sleepycat.je.utilint.Timestamp;
public class CrawlUrl implements Serializable{
private static final long serialVersionUID=7931672194843948629L;
public CrawlUrl(){
}
private String oriUrl; // the original URL, whose host part is a domain name
private String url; // the URL whose host part has been resolved to an IP, to avoid duplicate hosts
private int urlNo; // URL number
private int statusCode; // status code returned when the URL was fetched
private int hitNum; // number of times this URL is referenced by other pages
private String charSet; // character encoding of the page behind this URL
private String abstractText; // page abstract
private String author; // author
private int weight; // page weight (carries the guide-word information)
private String description; // page description
private int fileSize; // page size
private Timestamp lastUpdateTime; // last modification time
private Date timeToLive; // expiry time
private String title; // page title
private String type; // page type
private String[] urlRefrences; // links referenced by the page
private int layer; // crawl depth: the seed is layer 0, its links layer 1, and so on
public int getLayer()
{
return layer;
}
public void setLayer(int layer)
{
this.layer=layer;
}
public String getUrl()
{
return url;
}
public void setUrl(String url)
{
this.url=url;
}
public int getUrlNo()
{
return urlNo;
}
public void setUrlNo(int urlNo)
{
this.urlNo = urlNo;
}
public int getStatusCode()
{
return statusCode;
}
public void setStatusCode(int statusCode)
{
this.statusCode = statusCode;
}
public int getHitNum()
{
return hitNum;
}
public void setHitNum(int hitNum)
{
this.hitNum = hitNum;
}
public String getCharSet()
{
return charSet;
}
public void setCharSet(String charSet)
{
this.charSet = charSet;
}
public String getAbstractText()
{
return abstractText;
}
public void setAbstractText(String abstractText)
{
this.abstractText = abstractText;
}
public String getAuthor()
{
return author;
}
public void setAuthor(String author)
{
this.author = author;
}
public int getWeight()
{
return weight;
}
public void setWeight(int weight)
{
this.weight = weight;
}
public String getDescription()
{
return description;
}
public void setDescription(String description)
{
this.description = description;
}
public int getFileSize()
{
return fileSize;
}
public void setFileSize(int fileSize)
{
this.fileSize = fileSize;
}
public Timestamp getLastUpdateTime()
{
return lastUpdateTime;
}
public void setLastUpdateTime(Timestamp lastUpdateTime)
{
this.lastUpdateTime = lastUpdateTime;
}
public Date getTimeToLive()
{
return timeToLive;
}
public void setTimeToLive(Date timeToLive)
{
this.timeToLive = timeToLive;
}
public String getTitle()
{
return title;
}
public void setTitle(String title)
{
this.title = title;
}
public String getType()
{
return type;
}
public void setType(String type)
{
this.type = type;
}
public String[] getUrlRefrences()
{
return urlRefrences;
}
public void setUrlRefrences(String[] urlRefrences)
{
this.urlRefrences = urlRefrences;
}
public final String getOriUrl()
{
return oriUrl;
}
public void setOriUrl(String oriUrl)
{
this.oriUrl = oriUrl;
}
}
package bdb;
public interface Frontier {
public CrawlUrl getNext() throws Exception;
public boolean putUrl(CrawlUrl url) throws Exception;
}
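Frontier only fixes the contract of the to-visit queue: hand out the next URL and accept new ones. For contrast, a purely in-memory implementation could look like the sketch below (hypothetical and not used by the crawler; the persistent BDBFrontier that follows is the one actually used):

package bdb;

import java.util.LinkedList;
import java.util.Queue;

// Hypothetical in-memory Frontier: simple, but the queue is lost whenever the JVM exits,
// which is exactly what the Berkeley DB-backed implementation below avoids.
public class MemoryFrontier implements Frontier {
    private final Queue<CrawlUrl> pending = new LinkedList<CrawlUrl>();

    public CrawlUrl getNext() throws Exception {
        return pending.poll(); // null when the queue is empty
    }

    public boolean putUrl(CrawlUrl url) throws Exception {
        return pending.offer(url);
    }
}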
package bdb;
import java.io.File;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;
public abstract class AbstractFrontier {
private Environment env;
private static final String CLASS_CATALOG="java_class_catalog";
protected StoredClassCatalog javaCatalog;
protected Database catalogdatabase;
protected Database database;
public AbstractFrontier(String homeDirectory)
{
System.out.println("Opening environment in: "+homeDirectory);
EnvironmentConfig envConfig=new EnvironmentConfig();
envConfig.setTransactional(true);
envConfig.setAllowCreate(true);
env=new Environment(new File(homeDirectory),envConfig);
DatabaseConfig dbConfig=new DatabaseConfig();
dbConfig.setAllowCreate(true);
dbConfig.setTransactional(true);
catalogdatabase=env.openDatabase(null, CLASS_CATALOG, dbConfig);
// A single StoredClassCatalog object is normally used along with a set of databases that store serialized objects.
// The catalog database holds the class descriptions needed to serialize and deserialize objects
javaCatalog=new StoredClassCatalog(catalogdatabase);
DatabaseConfig dbConfig0=new DatabaseConfig();
dbConfig0.setAllowCreate(true);
dbConfig0.setTransactional(true);
// the database that holds the pending URLs (the keys and their CrawlUrl values)
database=env.openDatabase(null,"URL", dbConfig0);
}
public void close()
{
database.close();
javaCatalog.close();
env.close();
}
protected abstract void put(Object key,Object value);
protected abstract Object get(Object key);
protected abstract Object delete(Object key);
}
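Both the environment and the databases are opened with setTransactional(true) and setAllowCreate(true), so the store is created on first use and writes are recoverable; close() then releases the URL database, the class catalog and the environment in that order. A minimal sketch of the intended lifecycle (assuming the BDBFrontier subclass defined next and an existing D:\bdb directory):

import bdb.BDBFrontier;
import bdb.CrawlUrl;

// Hypothetical sketch: always close the frontier so the databases and the environment are released.
public class FrontierLifecycleDemo {
    public static void main(String[] args) throws Exception {
        BDBFrontier frontier = new BDBFrontier("D:\\bdb");
        try {
            CrawlUrl url = new CrawlUrl();
            url.setOriUrl("http://www.baidu.com");
            frontier.putUrl(url);
        } finally {
            frontier.close();
        }
    }
}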
package bdb;
import java.util.Map.Entry;
import java.util.Set;
import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.collections.StoredMap;
public class BDBFrontier extends AbstractFrontier implements Frontier{
private StoredMap pendingUrisDB=null;
public BDBFrontier(String homeDirectory) {
super(homeDirectory);
// A DatabaseEntry can be obtained in two ways: through its constructor, which takes the object's bytes,
// or through EntryBinding.objectToEntry()
EntryBinding keyBinding=new SerialBinding(javaCatalog, String.class);
EntryBinding valueBinding=new SerialBinding(javaCatalog,CrawlUrl.class);
// Creates a map entity view of a Database
pendingUrisDB=new StoredMap(database,keyBinding,valueBinding,true);
}
@Override
public CrawlUrl getNext() throws Exception {
CrawlUrl result=null;
if(!pendingUrisDB.isEmpty())
{
// take the first entry of the StoredMap view as the next URL to crawl
Entry entry=(Entry)pendingUrisDB.entrySet().iterator().next();
result=(CrawlUrl)entry.getValue();
delete(entry.getKey());
}
return result;
}
@Override
public boolean putUrl(CrawlUrl url) throws Exception {
put(url.getOriUrl(),url);
return true;
}
@Override
protected void put(Object key, Object value) {
pendingUrisDB.put(key, value);
}
@Override
protected Object get(Object key) {
return pendingUrisDB.get(key);
}
@Override
protected Object delete(Object key) {
return pendingUrisDB.remove(key);
}
// Compute the key for a URL; any digest algorithm, MD5 included, could be used here
private String calulateUrl(String url)
{
return url;
}
public boolean contains(CrawlUrl url)
{
return pendingUrisDB.containsKey(url.getOriUrl());
}
public boolean isEmpty()
{
return pendingUrisDB.isEmpty();
}
// Test program
// public static void main(String[] args)
// {
// BDBFrontier bDBFrontier=new BDBFrontier("D:\\bdb");
// CrawlUrl url=new CrawlUrl();
// url.setOriUrl("http://www.baidu.com");
// try {
// bDBFrontier.putUrl(url);
// System.out.println(bDBFrontier.getNext().getOriUrl());
// bDBFrontier.close();
// } catch (Exception e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
// }
}
import bdb.BDBFrontier;
import bdb.CrawlUrl;
import hashfilter.SimpleBloomFilter;
public class NewLinkQueue {
private static SimpleBloomFilter visitedUrl=new SimpleBloomFilter();
private static BDBFrontier unvistedUrl=new BDBFrontier("D:\\bdb");
public static Object unvisitedUrlDeQueue() throws Exception
{
CrawlUrl next=unvistedUrl.getNext();
return next==null?null:next.getOriUrl();// avoid a NullPointerException when the frontier is empty
}
public static void addUnvisitedUrl(String url)
{
CrawlUrl crawlUrl=new CrawlUrl();
crawlUrl.setOriUrl(url);
if(url!=null&&!url.trim().equals("")
&&!unvistedUrl.contains(crawlUrl)&&!visitedUrl.contains(crawlUrl))
{
try {
unvistedUrl.putUrl(crawlUrl);
} catch (Exception e) {
e.printStackTrace();
}
}
}
public static boolean unvisitedUrlIsEmpty()
{
return unvistedUrl.isEmpty();
}
public static void addVisitedUrl(String url)
{
CrawlUrl crawlUrl=new CrawlUrl();
crawlUrl.setOriUrl(url);
visitedUrl.add(crawlUrl);
}
}
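NewLinkQueue ties the two structures together: a URL is enqueued only if it is neither already pending in the Berkeley DB frontier nor marked as visited in the Bloom filter. A hypothetical demo of that flow (it assumes an empty D:\bdb store; since the frontier is persistent, a second run may start with leftover URLs):

// Hypothetical demo of enqueue, dequeue and de-duplication.
public class NewLinkQueueDemo {
    public static void main(String[] args) throws Exception {
        NewLinkQueue.addUnvisitedUrl("http://www.baidu.com");
        NewLinkQueue.addUnvisitedUrl("http://www.baidu.com"); // ignored: already pending
        String url = (String) NewLinkQueue.unvisitedUrlDeQueue();
        NewLinkQueue.addVisitedUrl(url);
        NewLinkQueue.addUnvisitedUrl(url); // ignored again: the Bloom filter reports it as visited
        System.out.println(NewLinkQueue.unvisitedUrlIsEmpty()); // true
    }
}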
// Download pages
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.BasicHttpClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
public class DownLoadFile {
private String filePath;
private CloseableHttpClient httpclient;
DownLoadFile()
{
filePath=null;
// httpclient=HttpClients.createDefault();
// HttpParams httpParams=new BasicHttpParams();
// HttpConnectionParams.setConnectionTimeout(httpParams, 50000);
// HttpConnectionParams.setSoTimeout(httpParams, 5000);
}
// Build the file name used to save a page from its URL and content type, replacing characters that are illegal in file names
public String getFileNameByUrl(String url,String contentType)
{
url=url.substring(7);// strip the leading "http://"
// text/html content
if(contentType.indexOf("html")!=-1)
{
url=url.replaceAll("[\\?/:|<>\"]","_")+".html";
return url;
}
else
{
return url.replaceAll("[\\?/:|<>\"]","_")+"."
+contentType.substring(contentType.lastIndexOf("/")+1);
}
}
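// For example (substring(7) assumes an "http://" prefix):
//   getFileNameByUrl("http://www.example.com/news/index.jsp?id=1", "text/html")
//     -> "www.example.com_news_index.jsp_id=1.html"
//   getFileNameByUrl("http://www.example.com/logo.png", "image/png")
//     -> "www.example.com_logo.png.png" (the type's suffix is appended after the original name)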
// Save the page's byte array to a local file; filePath is the relative path of the file to save
// Download the page the URL points to
public String downloadFile(String url)
{
System.out.println("link:"+url);
// HttpClientConnectionManager connManager=new BasicHttpClientConnectionManager();
// connManager.closeIdleConnections(5, TimeUnit.SECONDS);
// httpclient=HttpClients.createMinimal(connManager);
// RequestConfig.Builder requestBuilder=RequestConfig.custom();
// requestBuilder = requestBuilder.setConnectionRequestTimeout(5*1000);
// requestBuilder = requestBuilder.setConnectTimeout(5*1000);
// HttpClientBuilder builder=HttpClientBuilder.create();
// builder.setDefaultRequestConfig(requestBuilder.build());
// CloseableHttpClient httpclient=builder.build();
HttpParams params = new BasicHttpParams();
HttpConnectionParams.setConnectionTimeout(params, 10000);
HttpConnectionParams.setSoTimeout(params, 10000);
HttpClient httpClient = new DefaultHttpClient(params);
try {
HttpGet httpGet=new HttpGet(url);
HttpResponse response=httpClient.execute(httpGet);
System.out.println("得到http响应");
if(response.getStatusLine().getStatusCode()==HttpStatus.SC_OK)
{
/**************************************************************************************/
// Detect the page's character encoding
/* Header[] headers=response.getAllHeaders();
String charset=null;
int temp=-1;
for(int i=0;i<headers.length;i++)
{
temp=headers[i].getValue().indexOf("charset=");
if(temp!=-1)
{
// charset=headers[i].getValue().substring(temp+8,end-1);
charset=headers[i].getValue().substring(temp+8);
break;
}
}
*/
/* InputStream in=response.getEntity().getContent();
String charset=null;
byte b[]=null;
int contentLength=in.available();
if(contentLength>1000)
{
contentLength=1000;
}
b=new byte[1000];
in.read(b,0,contentLength);
String strTmp=new String(b);
Pattern p;
Matcher m;
String regex="gb2312|GB2312|GBK|gbk|utf-8|UTF-8|utf8|UTF8";
p=Pattern.compile(regex);
m=p.matcher(strTmp);
if(m.find())
{
charset=m.group();
}
else
{
charset="utf-8";
}
System.out.println("得到网页字符集"+charset);
// BufferedReader br=new BufferedReader(new InputStreamReader(in));
// if(charset==null)
// {
// String line="";
// StringBuffer buffer=new StringBuffer();
// while((line=br.readLine())!=null)
// {
// buffer.append(line);
// }
// line=buffer.toString();
// int a=line.indexOf("charset=");
// String str=line.substring(a);
// charset=str.substring(8,str.indexOf("\""));
// }
// if(charset==null)
// {
// charset="utf-8";
// }
*/
/*************************************************************************************/
/* // read the page content
BufferedReader responseBody=new BufferedReader(new InputStreamReader(in,charset));
*/
/*************************************************************************************/
String a=response.getFirstHeader("Content-Type").getValue();
System.out.println("Content-Type内容: "+a);
InputStream responseBody=response.getEntity().getContent();
filePath="E:\\temp\\"
+getFileNameByUrl(url,response.getFirstHeader("Content-Type").getValue());
System.out.println("文件路径: "+filePath);
// saveToLocal(responseBody,filePath);
FileOutputStream outputStream=new FileOutputStream(new File(filePath));
int length=0;
byte b[]=new byte[1024];
while((length=responseBody.read(b))!=-1)
{
outputStream.write(b,0,length);
}
responseBody.close();
outputStream.close();
}
else
{
System.err.print("Method Failed:"+response.getStatusLine().getStatusCode());
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}catch(Exception e){
e.printStackTrace();
}
// try {
// httpclient.close();
// } catch (IOException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
return filePath;
}
// Save the page bytes from the response stream to a local file
private void saveToLocal(InputStream responseBody,String filePath) throws IOException
{
FileOutputStream outputStream=new FileOutputStream(new File(filePath));
int length=0;
byte b[]=new byte[1024];
while((length=responseBody.read(b))!=-1)
{
outputStream.write(b,0,length);
}
responseBody.close();
outputStream.close();
}
}
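The downloader can be tried on its own before wiring it into the crawler. A hypothetical demo (it assumes the hard-coded E:\temp\ directory used inside downloadFile already exists):

// Hypothetical demo: download one page and print where it was saved.
public class DownLoadFileDemo {
    public static void main(String[] args) {
        DownLoadFile downLoader = new DownLoadFile();
        String filePath = downLoader.downloadFile("http://www.baidu.com");
        System.out.println("saved to: " + filePath);
    }
}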
Extract links
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.HashSet;
import java.util.Set;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
public class HtmlParserTool {
public static Set<String> extractLinks(String filePath)
{
Set<String> links=new HashSet<String>();
NodeList nodeList;
String line="";
StringBuffer sb=new StringBuffer();
NodeFilter linkFilter=new NodeClassFilter(LinkTag.class);
OrFilter lastFilter=new OrFilter();
lastFilter.setPredicates(new NodeFilter[]{linkFilter});
try {
BufferedReader br=new BufferedReader(new FileReader(filePath));
while((line=br.readLine())!=null)
{
sb.append(line);
}
Parser parser=Parser.createParser(sb.toString(), "utf-8");
nodeList=parser.parse(lastFilter);
Node nodes[]=nodeList.toNodeArray();
String link=null;
for(int i=0;i<nodes.length;i++)
{
if(nodes[i] instanceof LinkTag)// <a> tag
{
LinkTag linkNode=(LinkTag)(nodes[i]);
link=linkNode.getLink();
links.add(link);
}
else// not an <a> tag, e.g. a <frame> tag
{
// extract the link from the frame's src attribute, e.g. <frame src="test.html"/>
String frame=nodes[i].getText();
int start=frame.indexOf("src");
int end=frame.indexOf(" ");
if(end==-1)
{
end=frame.indexOf(">");
}
String frameUrl=frame.substring(start+5, end-1);
links.add(frameUrl);
}
}
} catch (FileNotFoundException e) {
e.printStackTrace();
}catch (Exception e) {
e.printStackTrace();
}
return links;
}
}
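extractLinks works on a file that DownLoadFile has already saved, so it can be exercised against any local HTML file. A hypothetical demo (the file path is only an example):

import java.util.Set;

// Hypothetical demo: list the links found in a previously downloaded page.
public class HtmlParserToolDemo {
    public static void main(String[] args) {
        Set<String> links = HtmlParserTool.extractLinks("E:\\temp\\www.baidu.com.html");
        for (String link : links) {
            System.out.println(link);
        }
    }
}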
import java.util.Set;
public class MyClawler {
private void initCrawlerWithSeeds(String[] seeds)
{
for(int i=0;i<seeds.length;i++)
NewLinkQueue.addUnvisitedUrl(seeds[i]);// put the seed URLs into the unvisited queue
}
// Crawl starting from the seed URLs: dequeue, download, mark visited, extract new links
public void crawling(String[] seeds)
{
initCrawlerWithSeeds(seeds);
DownLoadFile downLoader=new DownLoadFile();
Set<String> links=null;
String filePath=null;
while(!NewLinkQueue.unvisitedUrlIsEmpty())
{
String visitUrl;
try {
visitUrl = (String)NewLinkQueue.unvisitedUrlDeQueue();// dequeue the URL at the head of the unvisited queue
System.out.println("Took unvisited URL: "+visitUrl);
if(visitUrl==null)
continue;
filePath=downLoader.downloadFile(visitUrl);// download the page
NewLinkQueue.addVisitedUrl(visitUrl);// mark this URL as visited
links=HtmlParserTool.extractLinks(filePath);// extract the links from the page
System.out.println("Number of links in the page: "+links.size());
for(String link:links)
{
NewLinkQueue.addUnvisitedUrl(link);// put each extracted link into the unvisited queue
System.out.println(link);
}
System.out.println("网页中的链接数:"+links.size());
} catch (Exception e) {
e.printStackTrace();
}
}
}
public static void main(String[] args)
{
MyClawler clawler=new MyClawler();
clawler.crawling(new String[]{"http://www.baidu.com"});
System.out.println("done");
}
}