PageRank介绍

PageRank介绍

我的环境是MyEclipse7.0,jdk为1.7

 

PageRank算法包括两个类HtmlEntity和HtmlPageRank,其中HtmlPageRank需要用到htmlParser.jar和htmllexer.jar这两个包。本文是根据http://duyunfei.iteye.com/blog/1532798的说明调试的.

 

首先,我们准备了7个测试网页,这几个网页的链接情况如下:  

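| i\j   | test1 | test2 | test3 | test4 | test5 | test6 | test7 |
|-------|-------|-------|-------|-------|-------|-------|-------|
| test1 | 0     | 1     | 1     | 0     | 0     | 0     | 0     |
| test2 | 1     | 0     | 0     | 1     | 0     | 0     | 0     |
| test3 | 0     | 0     | 0     | 1     | 1     | 1     | 0     |
| test4 | 0     | 1     | 0     | 0     | 1     | 0     | 1     |
| test5 | 0     | 0     | 1     | 1     | 0     | 0     | 0     |
| test6 | 1     | 0     | 0     | 0     | 1     | 0     | 0     |
| test7 | 0     | 1     | 0     | 1     | 0     | 0     | 1     |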

表格中第 i 行第 j 列为 1,表示网页 i 链接到网页 j。例如 test1 链接到 test2 和 test3,其余依次类推。根据上面两个原则,我们可以大致地猜一下:哪个将会是排名第一的网页?哪个最不重要?

貌似是test4和test6?

 

Html代码我都是放在E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc

Test1.html链接代码

<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a>
 
Test2.html

<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
 
Test3.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html">test6</a>
 
Test4.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>


Test5.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html">test3</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>

Test6.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html">test1</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html">test5</a>


Test7.html
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html">test2</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html">test4</a>
<a href="E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html">test7</a>

运行结果:

HtmlPageRank类

import java.io.*;
import java.util.*;

import org.htmlparser.*;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.HtmlPage;

/**
 *pagerank算法实现
 *
 *@authorafei
 *
 */
publicclassHtmlPageRank{
    /*阀值 */
    publicstaticdoubleMAX=0.00000000001;
    /*阻尼系数 */
    publicstaticdoublealpha=0.85;
    publicstaticStringhtmldoc="E:/MyEclipse/workspace/PageRank/WebRoot/htmlDoc";
    publicstaticMap<String,HtmlEntity>map=newHashMap<String,HtmlEntity>();

    publicstaticList<HtmlEntity>list=newArrayList<HtmlEntity>();

    publicstaticdouble[]init;
    publicstaticdouble[]pr;

    publicstaticvoidmain(String[]args)throws Exception {
       loadHtml();
       pr=doPageRank(); 

       while(!(checkMax())){
           System.arraycopy(pr,0,init,0,init.length);
           pr=doPageRank();
       }

       for(inti=0;i<pr.length;i++){
           HtmlEntityhe=list.get(i);
           he.setPr(pr[i]);
       }

       List<HtmlEntity>finalList=newArrayList<HtmlEntity>();
       Collections.sort(list,newComparator(){
           publicintcompare(Objecto1,Objecto2){
              HtmlEntityh1=(HtmlEntity)o1;
              HtmlEntityh2=(HtmlEntity)o2;
              intem=0;
              if(h1.getPr()>h2.getPr()){
                  em=-1;
              }else{
                  em=1;
              }
              returnem;
           }
       });

       for(HtmlEntityhe:list){
           System.out.println(he.getPath()+" : "+he.getPr());
       }
    }

     /**
     *加载文件夹下的网页文件,并且初始化pr值(即init数组),计算每个网页的外链和内链
     */
    publicstaticvoidloadHtml()throws Exception {
       Filefile=newFile(htmldoc);
       File[]htmlfiles=file.listFiles(newFileFilter(){
           publicbooleanaccept(Filepathname){
              if(pathname.getPath().endsWith(".html")){
                  return true;
              }
              return false;
           }
       });

       init=newdouble[htmlfiles.length];

       for(inti=0;i<htmlfiles.length;i++){
           Filef=htmlfiles[i];
           BufferedReaderbr=newBufferedReader(newInputStreamReader(
                  newFileInputStream(f)));
           Stringline=br.readLine();
           StringBufferhtml=newStringBuffer();

           while(line!=null){
              line=br.readLine();
              html.append(line);
           }

           HtmlEntityhe=newHtmlEntity();
           he.setPath(f.getAbsolutePath());
           he.setContent(html.toString());

           Parserparser=Parser.createParser(html.toString(),"gb2312");
           HtmlPagepage=newHtmlPage(parser);
           parser.visitAllNodesWith(page);

           NodeListnodelist=page.getBody();
           nodelist=nodelist.extractAllNodesThatMatch(
                  newTagNameFilter("A"),true);
           for(intj=0;j<nodelist.size();j++){
              LinkTagoutlink=(LinkTag)nodelist.elementAt(j);
              he.getOutLinks().add(outlink.getAttribute("href"));
           } 

           map.put(he.getPath(),he);
           list.add(he);
           init[i]=0.0;
       }

       for(inti=0;i<list.size();i++){
           HtmlEntityhe=list.get(i);
           List<String>outlink=he.getOutLinks();
 
           for(Stringol:outlink){
              HtmlEntityhe0=map.get(ol);
              try{
                  he0.getInLinks().add(he.getPath());
              }catch(NullPointerExceptione){
//如果网页的链接路径不正确,则报NullPointerException错误,并且你会发现heo=null,也就是说map.get(ol)取到的值为null,但是事实上map不为null,ol的值在map中不存在导致的,这是由于html中路径设置不正确
                  e.printStackTrace();
              }
          }
       }
    }
 
    /**
     *计算pagerank
     *
     *@paraminit
     *@paramalpho
     *@return
     */
    privatestaticdouble[]doPageRank(){
       double[]pr=newdouble[init.length]; 

       for(inti=0;i<init.length;i++){
           doubletemp=0;
           HtmlEntityhe0=list.get(i);
           for(intj=0;j<init.length;j++){
              HtmlEntityhe=list.get(j);
              //计算对本页面链接相关总值
              if(i!=j&&he.getOutLinks().size()!=0&&he.getOutLinks().contains(he0.getPath())){
                  temp=temp+init[j]/he.getOutLinks().size();
              }
           }
           //经典的pr公式
           pr[i]=alpha+(1-alpha)*temp;
       }
       returnpr;
    } 

    /**
     *判断前后两次的pr数组之间的差别是否大于我们定义的阀值假如大于,那么返回false,继续迭代计算pr
     *
     *@parampr
     *@paraminit
     *@parammax
     *@return
     */
    privatestaticbooleancheckMax(){
       booleanflag=true; 
       for(inti=0;i<pr.length;i++){
           if(Math.abs(pr[i]-init[i])>MAX){
              flag=false;
              break;
           }
       }
       return flag;
    }
}


HtmlEntity类
import java.util.*;
/**
 * A web page: its path, raw content, link lists and computed PageRank value.
 *
 * @author afei
 */
class HtmlEntity {

    private String path;
    private String content;
    /* Out-links: other pages that this page links to. */
    private List<String> outLinks = new ArrayList<String>();
    /* In-links: other pages that link to this page. */
    private List<String> inLinks = new ArrayList<String>();
    private double pr;

    /** Returns the path of this page. */
    public String getPath() {
        return path;
    }

    /** Sets the path of this page. */
    public void setPath(String path) {
        this.path = path;
    }

    /** Returns the stored content of this page. */
    public String getContent() {
        return content;
    }

    /** Sets the stored content of this page. */
    public void setContent(String content) {
        this.content = content;
    }

    /** Returns the PageRank value assigned to this page. */
    public double getPr() {
        return pr;
    }

    /** Assigns the PageRank value of this page. */
    public void setPr(double pr) {
        this.pr = pr;
    }

    /** Returns the live, mutable list of out-links (initially empty). */
    public List<String> getOutLinks() {
        return outLinks;
    }

    /** Replaces the list of out-links. */
    public void setOutLinks(List<String> outLinks) {
        this.outLinks = outLinks;
    }

    /** Returns the live, mutable list of in-links (initially empty). */
    public List<String> getInLinks() {
        return inLinks;
    }

    /** Replaces the list of in-links. */
    public void setInLinks(List<String> inLinks) {
        this.inLinks = inLinks;
    }
}
运行结果

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test4.html: 1.0988562616424633

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test2.html: 1.024767124729736

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test5.html: 1.0225108328175456

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test3.html: 1.0012654834548864

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test1.html: 0.994362279917484

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test7.html: 0.9049428130819769

E:\MyEclipse\workspace\PageRank\WebRoot\htmlDoc\test6.html: 0.9000632741726616

 


你可能感兴趣的:(PageRank介绍)