jsoup

使用 jsoup 抓取百度百科词条解释的正文信息。所用的方法很简单,没有什么技术含量;jsoup 的详细用法参见:http://www.open-open.com/jsoup/

http://www.open-open.com/jsoup/
 
 
<pre name="code" class="java">package org.baidu.crawl;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.Buffer;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import javax.print.Doc;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small jsoup-based scraper for Baidu Baike entry pages.
 *
 * <p>The workflow driven by {@link #main(String[])} is:
 * <ol>
 *   <li>{@link #CrawlBaidu(String)} writes the entry text (plus its tags) to a file;</li>
 *   <li>{@link #hyperLink(String)} writes the text of every hyperlink found in the entry body
 *       to a second file, one link per line;</li>
 *   <li>{@link #modifyHyperLink(String, String, String)} reads both files back and marks each
 *       link text inside the body as {@code [[link text]]}.</li>
 * </ol>
 *
 * <p>All intermediate and output files are read and written as UTF-8 so that the Chinese text
 * survives the round trip regardless of the platform default charset.
 */
public class Myreader {

	/**
	 * Pattern removed from extracted text: an opening bracket, any single character, optional
	 * digits, and a closing bracket (e.g. "[1]", "[12]").
	 */
	private static final String CITATION_REGEX = "\\[.\\d*\\]";

	/** Id of the page element that holds the entry's tags. */
	private static final String TAG_ELEMENT_ID = "open-tag-item";

	/** Opens {@code path} for writing as UTF-8, truncating any existing file. */
	private static BufferedWriter openUtf8Writer(String path) throws IOException {
		return new BufferedWriter(
				new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8));
	}

	/** Opens {@code path} for reading as UTF-8. */
	private static BufferedReader openUtf8Reader(String path) throws IOException {
		return new BufferedReader(
				new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8));
	}

	/** Returns the text of the tag element, or {@code null} when the page has no such element. */
	private static String tagText(Document document) {
		Element tagElement = document.getElementById(TAG_ELEMENT_ID);
		return tagElement == null ? null : tagElement.text();
	}

	/**
	 * Fetches the entry body and the entry tags from a Baidu Baike page and writes them to
	 * {@code D:\词条\11}.
	 *
	 * <p>The output is the summary text followed by the text of all {@code div.para} elements
	 * (both with citation markers removed) on one line, then, if the page has a tag element,
	 * a second line with the tags. Nothing is written when the page has no
	 * {@code div.main-content div.para} elements; the file is still created/truncated.
	 *
	 * @param url address of the entry page
	 * @throws IOException if the page cannot be fetched or the file cannot be written
	 */
	public static void CrawlBaidu(String url) throws IOException{
		// try-with-resources closes the writer on every path, including exceptions and the
		// "no main content" case that previously leaked it.
		try (BufferedWriter bWriter = openUtf8Writer("D:\\词条\\11")) {
			Document document = Jsoup.connect(url).get();

			String summary = document.select("div.poster-top").select("div.lemma-summary").text()
					.replaceAll(CITATION_REGEX, "");

			// Selecting on an empty result yields an empty result, so one check covers both
			// "no main-content" and "no paragraphs".
			Elements paragraphs = document.select("div.main-content").select("div.para");
			if (paragraphs.isEmpty()) {
				return;
			}

			String body = paragraphs.text().replaceAll(CITATION_REGEX, "");
			bWriter.write(summary + body);

			// Pages without a tag element used to crash here with a NullPointerException;
			// now the tag line is simply omitted.
			String tags = tagText(document);
			if (tags != null) {
				bWriter.write("\n" + "词条便签:" + tags);
			}
			bWriter.newLine();
		}
	}

	/**
	 * Collects the hyperlinks inside the entry body and writes their inner HTML to
	 * {@code D:\词条\11链接}.
	 *
	 * @param url address of the entry page
	 * @throws IOException if the page cannot be fetched or the file cannot be written
	 */
	public static void hyperLink(String url) throws IOException{
		try (BufferedWriter bWriter = openUtf8Writer("D:\\词条\\11链接")) {
			Document document = Jsoup.connect(url).get();
			Elements paragraphs = document.select("div.main-content").select("div.para");
			if (paragraphs.isEmpty()) {
				return;
			}
			bWriter.write(paragraphs.select("a[href]").html());
			bWriter.newLine();
		}
	}

	/**
	 * Placeholder for extracting {@code tbody} elements. It currently only creates/truncates
	 * {@code D:\词条\tbody} and fetches the page; no extraction is implemented yet.
	 *
	 * @param url address of the entry page
	 * @throws IOException if the page cannot be fetched or the file cannot be opened
	 */
	public static void crawlTbody(String url) throws IOException {
		// The writer is opened only to keep the original side effect (the file is created);
		// it is now closed instead of being leaked.
		try (BufferedWriter bufferedWriter = openUtf8Writer("D:\\词条\\tbody")) {
			Jsoup.connect(url).get();
			// TODO: extract the tbody elements and write them through bufferedWriter.
		}
	}

	/**
	 * Marks every hyperlink text inside the entry body as {@code [[link text]]} and writes the
	 * result, followed by the entry tags, to {@code D:\词条\高校\南开大学}.
	 *
	 * <p>Only the first line of {@code bodyText} is used as the body. Each non-empty line of
	 * {@code linkText} is treated as literal text (not as a regular expression) and is applied
	 * once, even if it occurs several times in the link file. A link text that is a substring
	 * of another link text can still produce nested markers.
	 *
	 * <p>The output file is opened only after all input has been read and the page has been
	 * fetched, so a failure in those steps does not truncate an existing output file.
	 *
	 * @param bodyText path of the UTF-8 file holding the entry body
	 * @param linkText path of the UTF-8 file holding one hyperlink text per line
	 * @param url address of the entry page, used to read the entry tags
	 * @throws IOException if a file cannot be read or written, or the page cannot be fetched
	 */
	public static void modifyHyperLink(String bodyText,String linkText,String url) throws IOException{
		// Entry body: first line of the body file.
		String bodyLine;
		try (BufferedReader bodyReader = openUtf8Reader(bodyText)) {
			bodyLine = bodyReader.readLine();
		}
		if (bodyLine == null) {
			// Empty body file: continue with empty text instead of failing on null below.
			bodyLine = "";
		}

		// Read the link texts line by line and wrap each one found in the body.
		Set<String> appliedLinks = new HashSet<String>();
		try (BufferedReader linkReader = openUtf8Reader(linkText)) {
			String linkLine;
			while ((linkLine = linkReader.readLine()) != null) {
				// An empty needle would insert "[[]]" between all characters, and a repeated
				// link would wrap the already wrapped text a second time.
				if (linkLine.isEmpty() || !appliedLinks.add(linkLine)) {
					continue;
				}
				// Literal replacement: link texts may contain regex metacharacters.
				bodyLine = bodyLine.replace(linkLine, "[[" + linkLine + "]]");
			}
		}

		// Entry tags; null when the page has no tag element.
		String fieldTag = tagText(Jsoup.connect(url).get());

		try (BufferedWriter bWriter = openUtf8Writer("D:\\词条\\高校\\南开大学")) {
			bWriter.write(bodyLine);
			if (fieldTag != null) {
				bWriter.write("\n" + "词条标签:" + fieldTag);
			}
			bWriter.newLine();
		}
	}

	/**
	 * Runs the full workflow for one hard-coded entry page.
	 *
	 * @param args unused
	 * @throws IOException if any step fails to fetch the page or to access its files
	 */
	public static void main(String[] args) throws IOException {
		String url = "http://baike.baidu.com/item/%E5%8D%97%E5%BC%80%E5%A4%A7%E5%AD%A6/134521";
		CrawlBaidu(url);
		hyperLink(url);
		// These are the files produced by the two calls above.
		String bodyText = "D:\\词条\\11";
		String linkText = "D:\\词条\\11链接";
		modifyHyperLink(bodyText, linkText,url);
	}
}


 
 
 
 



你可能感兴趣的:(百度)