【单机版】一个小爬虫+PageRank代码实现

在这个小程序里,首先使用一个爬虫获取网页的出链网址,然后再对获取到的所有网页执行PageRank算法。

import java.io.IOException;
import java.util.ArrayList;
import java.util.Scanner;

public class GetWebStructure {
		
		public String StartAddress;
		
		public int times;
	
		public ArrayList partOfWebStructure=new ArrayList();
		
		public ArrayList webStructure=new ArrayList();
		
		public ArrayList Mapping=new ArrayList();//这个结果用于存储web站点和列号之间的关系
		
		public  GetWebStructure(String StartAddress,int times){
			
				this.StartAddress=StartAddress;
			
				this.times=times;
			
				}
		
		public void run() {
			
			String  sourcePage=StartAddress;
			
			WebNodeWithLink tmpOfWebNodeWithLink=new WebNodeWithLink(sourcePage,null);
			
			partOfWebStructure.add(tmpOfWebNodeWithLink);

			do{
				
				//System.out.println("正在处理partOfWebStructuresize中的新元素");	
				
				int count=0;
				
				int	thisWebNumber=-1;
				
				//查看当前节点是否已经被存储
				
				for(int n=0;n

import java.io.IOException;
import java.util.ArrayList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * A page identified by its URL together with the link targets found on it.
 *
 * <p>A freshly constructed node has no targets; {@link #setTargetPage()}
 * downloads the page with jsoup and fills {@code targetPage} and
 * {@code outDegree}.
 */
public class WebNodeWithLink {

	/** URL of the page this node represents; this is the address that gets fetched. */
	String sourcePage;

	/** Parent value passed to the constructor; only stored, never read inside this class. */
	String parentPage;

	/** Size of {@code targetPage} after the last fetch that completed without an I/O error. */
	int outDegree;

	/** Link targets collected so far, in the order jsoup returned them; duplicates are kept. */
	ArrayList<String> targetPage=new ArrayList<String>();

	/**
	 * Creates a node without fetching anything.
	 *
	 * @param sourcePage URL of the page to crawl later
	 * @param parentPage value to remember as this page's parent (may be {@code null})
	 */
	public WebNodeWithLink(String sourcePage,String parentPage)	{

		this.sourcePage=sourcePage;

		this.parentPage=parentPage;
	}

	/**
	 * Downloads {@code sourcePage} and appends every usable {@code <a href>} to
	 * {@code targetPage}, then updates {@code outDegree}.
	 *
	 * <p>Anchors whose href starts with {@code javascript:}, {@code _blank} or
	 * {@code #} are excluded by the selector. The remaining hrefs are rewritten
	 * as follows:
	 * <ul>
	 *   <li>{@code "/"} becomes the site root of {@code sourcePage};</li>
	 *   <li>{@code "//host/..."} gets the scheme of {@code sourcePage}
	 *       ({@code http} when it has none);</li>
	 *   <li>{@code "/path"} is appended to the site root, not to the full page
	 *       URL, so a page such as {@code http://a.com/b/c.html} no longer
	 *       yields {@code http://a.com/b/c.html/path};</li>
	 *   <li>anything else is stored unchanged; empty hrefs are dropped.</li>
	 * </ul>
	 *
	 * <p>An {@link IOException} during the download is ignored on purpose: the
	 * node then simply keeps whatever targets it already had.
	 */
	public void setTargetPage(){

		try {

			Document doc = Jsoup.connect(sourcePage).get();

			Elements links=doc.select("a[href~=^(?!(javascript:|_blank|#)).*$]");

			// Both values depend only on sourcePage, so compute them once per fetch.
			String siteRoot=siteRootOf(sourcePage);

			String scheme=schemeOf(sourcePage);

			for(Element link:links){

				String linkHref=link.attr("href");

				if(linkHref.equals("/")) {

					linkHref=siteRoot;

				} else if(linkHref.length()>2&&linkHref.startsWith("//")) {

					// Protocol-relative link: inherit the scheme of the page being crawled.
					linkHref=scheme+":"+linkHref;

				} else if(linkHref.length()>1&&linkHref.startsWith("/")) {

					// Root-relative link: resolve against the site root, not the page URL.
					linkHref=siteRoot+linkHref;
				}

				if(!linkHref.isEmpty()) {

					targetPage.add(linkHref);
				}
			}

			outDegree=targetPage.size();

		} catch (IOException ignored) {

			// Best effort: an unreachable page contributes no links and must not stop the caller.
		}

	}

	/**
	 * Returns {@code scheme://authority} of an absolute URL, without a trailing slash.
	 * A URL that contains no {@code "://"} or no path, query or fragment is returned unchanged.
	 */
	private static String siteRootOf(String pageUrl) {

		int schemeEnd=pageUrl.indexOf("://");

		if(schemeEnd<0) {

			return pageUrl;
		}

		for(int i=schemeEnd+3;i<pageUrl.length();i++) {

			char c=pageUrl.charAt(i);

			if(c=='/'||c=='?'||c=='#') {

				return pageUrl.substring(0, i);
			}
		}

		return pageUrl;
	}

	/** Returns the text before {@code "://"} in the URL, or {@code "http"} when there is none. */
	private static String schemeOf(String pageUrl) {

		int schemeEnd=pageUrl.indexOf("://");

		return schemeEnd>0 ? pageUrl.substring(0, schemeEnd) : "http";
	}

}

import java.util.ArrayList;

/**
 * Graph node keyed by an integer page number, carrying a mutable list of
 * targets (presumably the numbers of the pages it links to; the element
 * type is not fixed in this class).
 */
public class WebNode {

	/** Integer identifier given to the constructor. */
	int sourcePage;

	/** Targets of this node; a new, empty list is created for every instance. */
	ArrayList targetPage;

	/**
	 * Creates a node with the given identifier and an empty target list.
	 *
	 * @param sourcePage integer identifier of the page
	 */
	public WebNode(int sourcePage) {
		this.targetPage = new ArrayList();
		this.sourcePage = sourcePage;
	}

}

/**
 * Mutable holder pairing a website string with an integer number.
 * Both fields are public and start at their Java defaults
 * ({@code null} and {@code 0}).
 */
public class WebsiteToNumber {

	/** Website string of this entry; {@code null} until assigned. */
	public String website;

	/** Number associated with {@link #website}; {@code 0} until assigned. */
	public int number;

}

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;

public class PageRank {
	
	private ArrayList webStructure=new ArrayList();
	
	private ArrayList Mapping=new ArrayList();
	
	double difference;
	
	double []v;

	triple []M;//M是转移矩阵,但是它是一个稀疏矩阵,可以采用三元组的方式进行表示以减少内存使用
	
	public PageRank(ArrayList webStructure,ArrayList Mapping,double difference){
		
		this.webStructure=webStructure;
		
		this.Mapping=Mapping;
		
		this.difference=difference;
		
		//对转移矩阵M进行初始化 
		
		int webStructureSize=this.webStructure.size();
		
		int length=0;
		
		for(int countLength=0;countLength"+nowDifference+"\n");
			
		}while(nowDifference>difference);
		
		System.out.print("迭代结束\n");
	}
	
	/**
	 * Adds two doubles via {@link BigDecimal}, using each value's
	 * {@code Double.toString} representation.
	 *
	 * <p>If {@code value2} cannot be converted (NaN or infinite), the text
	 * {@code "error"} followed by the value is printed to standard output and
	 * {@code value1} is returned unchanged. A non-convertible {@code value1}
	 * is not handled and propagates as {@link NumberFormatException}.
	 *
	 * @param value1 first addend
	 * @param value2 second addend
	 * @return the decimal sum converted back to {@code double}
	 */
	public double add(double value1, double value2) {

		BigDecimal left = BigDecimal.valueOf(value1);

		BigDecimal right;

		try {
			right = BigDecimal.valueOf(value2);
		} catch (NumberFormatException e) {
			System.out.printf("error"+Double.toString(value2));
			return left.doubleValue();
		}

		BigDecimal total = left.add(right);

		return total.doubleValue();

	}
	
	 /**
	  * Subtracts {@code value2} from {@code value1} via {@link BigDecimal},
	  * using each value's {@code Double.toString} representation.
	  *
	  * @param value1 minuend
	  * @param value2 subtrahend
	  * @return the decimal difference converted back to {@code double}
	  * @throws NumberFormatException if either argument is NaN or infinite
	  */
	 public double sub(double  value1,double value2){

		BigDecimal minuend = BigDecimal.valueOf(value1);

		BigDecimal subtrahend = BigDecimal.valueOf(value2);

		BigDecimal gap = minuend.subtract(subtrahend);

		return gap.doubleValue();
	 }
	
	public void print(String information){
		
		PrintWriter outputStream=null;
		
		try{
			
			outputStream=new PrintWriter(new FileOutputStream("D://"+information+"result.txt"));
			
		} catch (FileNotFoundException e) {
		
			e.printStackTrace();
		
		}
		
		for(int i=0;i

/**
 * Pair of integer indices, {@code row} and {@code column}.
 * NOTE(review): appears to be the element type of the sparse transition
 * matrix arrays used elsewhere in this file; confirm against the callers.
 */
public class triple {

	/** Row index; {@code 0} until assigned. */
	int row;

	/** Column index; {@code 0} until assigned. */
	int column;

}

import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;

public class MatrixMultiplication extends Thread {
	
	triple []M;
	
	int targetPageNumber;
	
	int webStructureSize;
	
	private ArrayList webStructure=new ArrayList();
	
	double []v;
	
	double []v_New;
	
	CountDownLatch latch;
	
	/**
	 * Stores the given references for later use by this thread and caches
	 * the size of {@code webStructure}.
	 *
	 * <p>Nothing is copied: the arrays, the list and the latch are shared with
	 * the caller.
	 *
	 * @param M                triple array kept in the {@code M} field
	 * @param webStructure     list whose size is cached in {@code webStructureSize}; must not be {@code null}
	 * @param v                array kept in the {@code v} field
	 * @param v_New            array kept in the {@code v_New} field
	 * @param targetPageNumber value kept in the {@code targetPageNumber} field
	 * @param latch            latch kept in the {@code latch} field
	 */
	public MatrixMultiplication(triple []M, ArrayList webStructure,double []v,double []v_New,int targetPageNumber,CountDownLatch latch){

		this.latch = latch;

		this.targetPageNumber = targetPageNumber;

		this.v_New = v_New;

		this.v = v;

		this.M = M;

		this.webStructure = webStructure;

		// Cached once here; the field is read when the thread runs.
		this.webStructureSize = webStructure.size();

	}
	
	/**
	 * Adds two doubles via {@link BigDecimal}, using each value's
	 * {@code Double.toString} representation.
	 *
	 * <p>If {@code value2} cannot be converted (NaN or infinite), the text
	 * {@code "error"} followed by the value is printed to standard output and
	 * {@code value1} is returned unchanged. A non-convertible {@code value1}
	 * is not handled and propagates as {@link NumberFormatException}.
	 *
	 * @param value1 first addend
	 * @param value2 second addend
	 * @return the decimal sum converted back to {@code double}
	 */
	public double add(double value1, double value2) {

		BigDecimal left = BigDecimal.valueOf(value1);

		BigDecimal right;

		try {
			right = BigDecimal.valueOf(value2);
		} catch (NumberFormatException e) {
			System.out.printf("error"+Double.toString(value2));
			return left.doubleValue();
		}

		BigDecimal total = left.add(right);

		return total.doubleValue();

	}
	
	/**
	 * Multiplies two doubles via {@link BigDecimal}, using each value's
	 * {@code Double.toString} representation.
	 *
	 * @param value1 first factor
	 * @param value2 second factor
	 * @return the decimal product converted back to {@code double}
	 * @throws NumberFormatException if either argument is NaN or infinite
	 */
	public double mul(double value1, double value2) {

		BigDecimal first = BigDecimal.valueOf(value1);

		BigDecimal second = BigDecimal.valueOf(value2);

		BigDecimal product = first.multiply(second);

		return product.doubleValue();

	}
	 
	public void run(){
		
		double InitialValue=1.0/webStructureSize;
		
		double sum=0;
		
		int begin=0;
		
		int end=0;
		
		for(int count=0;count


你可能感兴趣的:(【单机版】一个小爬虫+PageRank代码实现)