PageRank介绍
我的环境是MyEclipse7.0,jdk为1.7
PageRank算法包括两个类HtmlEntity和HtmlPageRank,其中HtmlPageRank需要用到htmlParser.jar和htmllexer.jar这两个包。本文是根据http://duyunfei.iteye.com/blog/1532798的说明调试的.
首先,我们准备了7个测试网页,这几个网页的链接情况如下:
| i\j | test1 | test2 | test3 | test4 | test5 | test6 | test7 |
|-------|-------|-------|-------|-------|-------|-------|-------|
| test1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
| test2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| test3 | 0 | 0 | 0 | 1 | 1 | 1 | 0 |
| test4 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| test5 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| test6 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| test7 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
表格的意思是:第 i 行第 j 列为 1 表示 test i 链接到 test j,例如 test1 链接到 test2、test3,依次类推。根据 PageRank 的两个基本原则(一个网页的入链越多、入链来源的网页越重要,它就越重要),我们可以大致地猜一下:哪个将会是排名第一的网页?哪个最不重要?
貌似是test4和test6?
Html代码我都是放在E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc
Test1.html链接代码
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a>
Test2.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
Test3.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html">test6</a>
Test4.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>
Test5.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
Test6.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a>
Test7.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a> <a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>
运行结果:
HtmlPageRank类 import java.io.*; import java.util.*; import org.htmlparser.*; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeList; import org.htmlparser.visitors.HtmlPage; /** *pagerank算法实现 * *@authorafei * */ publicclassHtmlPageRank{ /*阀值 */ publicstaticdoubleMAX=0.00000000001; /*阻尼系数 */ publicstaticdoublealpha=0.85; publicstaticStringhtmldoc="E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc"; publicstaticMap<String,HtmlEntity>map=newHashMap<String,HtmlEntity>(); publicstaticList<HtmlEntity>list=newArrayList<HtmlEntity>(); publicstaticdouble[]init; publicstaticdouble[]pr; publicstaticvoidmain(String[]args)throws Exception { loadHtml(); pr=doPageRank(); while(!(checkMax())){ System.arraycopy(pr,0,init,0,init.length); pr=doPageRank(); } for(inti=0;i<pr.length;i++){ HtmlEntityhe=list.get(i); he.setPr(pr[i]); } List<HtmlEntity>finalList=newArrayList<HtmlEntity>(); Collections.sort(list,newComparator(){ publicintcompare(Objecto1,Objecto2){ HtmlEntityh1=(HtmlEntity)o1; HtmlEntityh2=(HtmlEntity)o2; intem=0; if(h1.getPr()>h2.getPr()){ em=-1; }else{ em=1; } returnem; } }); for(HtmlEntityhe:list){ System.out.println(he.getPath()+" : "+he.getPr()); } } /** *加载文件夹下的网页文件,并且初始化pr值(即init数组),计算每个网页的外链和内链 */ publicstaticvoidloadHtml()throws Exception { Filefile=newFile(htmldoc); File[]htmlfiles=file.listFiles(newFileFilter(){ publicbooleanaccept(Filepathname){ if(pathname.getPath().endsWith(".html")){ return true; } return false; } }); init=newdouble[htmlfiles.length]; for(inti=0;i<htmlfiles.length;i++){ Filef=htmlfiles[i]; BufferedReaderbr=newBufferedReader(newInputStreamReader( newFileInputStream(f))); Stringline=br.readLine(); StringBufferhtml=newStringBuffer(); while(line!=null){ line=br.readLine(); html.append(line); } HtmlEntityhe=newHtmlEntity(); he.setPath(f.getAbsolutePath()); he.setContent(html.toString()); Parserparser=Parser.createParser(html.toString(),"gb2312"); HtmlPagepage=newHtmlPage(parser); 
parser.visitAllNodesWith(page); NodeListnodelist=page.getBody(); nodelist=nodelist.extractAllNodesThatMatch( newTagNameFilter("A"),true); for(intj=0;j<nodelist.size();j++){ LinkTagoutlink=(LinkTag)nodelist.elementAt(j); he.getOutLinks().add(outlink.getAttribute("href")); } map.put(he.getPath(),he); list.add(he); init[i]=0.0; } for(inti=0;i<list.size();i++){ HtmlEntityhe=list.get(i); List<String>outlink=he.getOutLinks(); for(Stringol:outlink){ HtmlEntityhe0=map.get(ol); try{ he0.getInLinks().add(he.getPath()); }catch(NullPointerExceptione){ //如果网页的链接路径不正确,则报NullPointerException错误,并且你会发现heo=null,也就是说map.get(ol)取到的值为null,但是事实上map不为null,ol的值在map中不存在导致的,这是由于html中路径设置不正确 e.printStackTrace(); } } } } /** *计算pagerank * *@paraminit *@paramalpho *@return */ privatestaticdouble[]doPageRank(){ double[]pr=newdouble[init.length]; for(inti=0;i<init.length;i++){ doubletemp=0; HtmlEntityhe0=list.get(i); for(intj=0;j<init.length;j++){ HtmlEntityhe=list.get(j); //计算对本页面链接相关总值 if(i!=j&&he.getOutLinks().size()!=0&&he.getOutLinks().contains(he0.getPath())){ temp=temp+init[j]/he.getOutLinks().size(); } } //经典的pr公式 pr[i]=alpha+(1-alpha)*temp; } returnpr; } /** *判断前后两次的pr数组之间的差别是否大于我们定义的阀值假如大于,那么返回false,继续迭代计算pr * *@parampr *@paraminit *@parammax *@return */ privatestaticbooleancheckMax(){ booleanflag=true; for(inti=0;i<pr.length;i++){ if(Math.abs(pr[i]-init[i])>MAX){ flag=false; break; } } return flag; } } HtmlEntity类 import java.util.*; /** *网页entity * *@authorafei * */ classHtmlEntity{ privateStringpath; privateStringcontent; /*外链(本页面链接的其他页面) */ privateList<String>outLinks=newArrayList<String>(); /*内链(另外页面链接本页面) */ privateList<String>inLinks=newArrayList<String>(); privatedoublepr; publicStringgetPath(){ returnpath; } publicvoidsetPath(Stringpath){ this.path=path; } publicStringgetContent(){ returncontent; } publicvoidsetContent(Stringcontent){ this.content=content; } publicdoublegetPr(){ returnpr; } publicvoidsetPr(doublepr){ this.pr=pr; } publicList<String>getOutLinks(){ 
returnoutLinks; } publicvoidsetOutLinks(List<String>outLinks){ this.outLinks=outLinks; } publicList<String>getInLinks(){ returninLinks; } publicvoidsetInLinks(List<String>inLinks){ this.inLinks=inLinks; } }运行结果
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html: 1.0988562616424633
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html: 1.024767124729736
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html: 1.0225108328175456
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html: 1.0012654834548864
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html: 0.994362279917484
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html: 0.9049428130819769
E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html: 0.9000632741726616