Extracting the contents of all td cells in a given table with Jsoup

For data-scraping purposes, what you have after downloading a web page is raw HTML source. To extract a particular piece of information you have to analyze that HTML and then select the content by class or id. If you don't know the HTML tags of the page you are scraping, the extracted text comes out very messy and it is hard to find anything useful in it. This is currently the biggest difficulty for me as I start doing page mining.
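
As a minimal sketch of what "select by class or id" looks like in Jsoup (the id "profile" below is a made-up placeholder; the class name is the one used later in this post):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SelectorSketch {
	public static void main(String[] args) {
		String html = "<div id=\"profile\"><table class=\"table_style_e\"><tr><td>A</td></tr></table></div>";
		Document doc = Jsoup.parse(html);

		// Select by class: "." prefix in a CSS selector (or getElementsByClass)
		Elements byClass = doc.select("table.table_style_e");

		// Select by id: "#" prefix in a CSS selector (or getElementById)
		Element byId = doc.getElementById("profile");

		System.out.println(byClass.size() + " table(s); id block text: " + byId.text());
	}
}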

The code below combines the earlier whitespace handling for the HTML page with the targeted extraction and storage code:

Page download code:

package com.dazhihui;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class MyJsoup {
	// Downloads the page at the given URL and writes its HTML to the given file.
	public static boolean downloadPage(String url, File file){
		try {
			Document doc = Jsoup.connect(url)
					.data("jquery", "java")
					.userAgent("Mozilla")
					.cookie("auth", "tiken")
					.timeout(5000)
					.get();
			String pageHtml = doc.toString();
			// Write with the same encoding ("GBK") that the parsing step uses later;
			// otherwise this file and Jsoup.parse(file, "GBK") will disagree.
			OutputStream out = new FileOutputStream(file);
			out.write(pageHtml.getBytes("GBK"));
			out.close();
		} catch (IOException e) {
			e.printStackTrace();
			return false;
		}
		return true;
	}
}
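
Note that doc.toString() re-serializes the page, so the bytes written must use the same charset as the later parse step. A minimal alternative sketch, assuming you just want to save the page exactly as the server sent it, is to write the raw response bytes instead:

package com.dazhihui;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.jsoup.Connection;
import org.jsoup.Jsoup;

public class RawDownloadSketch {
	// Saves the unmodified response body, so the page's original encoding is preserved.
	public static boolean downloadRaw(String url, File file){
		try {
			Connection.Response response = Jsoup.connect(url)
					.userAgent("Mozilla")
					.timeout(5000)
					.execute();
			OutputStream out = new FileOutputStream(file);
			out.write(response.bodyAsBytes());
			out.close();
			return true;
		} catch (IOException e) {
			e.printStackTrace();
			return false;
		}
	}
}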

Whitespace-handling code:

package com.dazhihui;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

public class ReplaceAllFileString {
	// Parameters: oldFile is the original file; newFile is the file written after replacement;
	// oldString is the string to replace (treated as a regex by replaceAll); newString is the replacement.
	public static boolean replaceAllFileString(File oldFile, File newFile, String oldString, String newString){
		try {
			BufferedReader reader = new BufferedReader(new FileReader(oldFile));
			BufferedWriter writer = new BufferedWriter(new FileWriter(newFile));
			String line = null;
			while((line = reader.readLine()) != null){
				writer.write(line.replaceAll(oldString, newString));
				// readLine() strips the line terminator, so write it back explicitly;
				// otherwise the whole output collapses onto a single line.
				writer.newLine();
			}
			reader.close();
			writer.close();
		} catch (IOException e) {
			e.printStackTrace();
			return false;
		}
		return true;
	}
}
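
One thing to keep in mind: String.replaceAll interprets its first argument as a regular expression. Replacing a plain space, as the main method below does, is harmless, but a literal that might contain regex metacharacters should be quoted first. A small hypothetical usage sketch (the "&nbsp;" target is for illustration only):

package com.dazhihui;

import java.io.File;
import java.util.regex.Pattern;

public class ReplaceUsageSketch {
	public static void main(String[] args) {
		// Hypothetical paths, matching the layout used in the main class below.
		File in = new File("C:/myjsoup/dazhihui/dazhihui.txt");
		File out = new File("C:/myjsoup/dazhihui/newdazhihui.txt");

		// Pattern.quote() escapes regex metacharacters, so the target is matched literally.
		boolean ok = ReplaceAllFileString.replaceAllFileString(in, out, Pattern.quote("&nbsp;"), "");
		System.out.println(ok);
	}
}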


Code to extract the Dazhihui company profile:

package com.dazhihui;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class DazhihuiResolveCompanyProfile {
	// Parses the saved HTML file and returns the text of every td in tables
	// whose class is "table_style_e".
	public static ArrayList<String> resolvePageText(File file){
		ArrayList<String> list = null;
		try {
			Document doc = Jsoup.parse(file, "GBK");
			// getElementsByClass matches by class name; the equivalent CSS selector
			// would be doc.select("table.table_style_e").
			Elements elements = doc.getElementsByClass("table_style_e");
			list = new ArrayList<String>();
			for(Element element : elements){
				if(element.text() != null && !"".equals(element.text())){
					Elements rows = element.select("tr");
					for(Element row : rows){
						Elements cells = row.select("td");
						for(int i = 0; i < cells.size(); i++){
							list.add(cells.get(i).text());
						}
					}
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return list;
	}
}
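
The nested loops over tr and td can also be collapsed into a single CSS selector; a minimal sketch that gives essentially the same result, assuming the same "table_style_e" class:

package com.dazhihui;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ResolveWithSelectorSketch {
	// Collects the text of every td inside tables with class "table_style_e",
	// using one selector instead of nested tr/td loops.
	public static ArrayList<String> resolvePageText(File file) throws IOException {
		Document doc = Jsoup.parse(file, "GBK");
		ArrayList<String> list = new ArrayList<String>();
		for (Element td : doc.select("table.table_style_e td")) {
			list.add(td.text());
		}
		return list;
	}
}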


Main code:

package com.dazhihui;

import java.io.File;
import java.util.ArrayList;


public class Dazhihui {

	public static void main(String[] args) {
		String url = "http://cj.gw.com.cn/news/stock/601288.shtml";
		File file = new File("C:/myjsoup/dazhihui/dazhihui.txt");
		File newFile = new File("C:/myjsoup/dazhihui/newdazhihui.txt");

		// Step 1: download the page to a local file.
		boolean mark = MyJsoup.downloadPage(url, file);
		System.out.println(mark);

		// Step 2: strip spaces from the downloaded HTML into a new file.
		boolean mark2 = ReplaceAllFileString.replaceAllFileString(file, newFile, " ", "");
		System.out.println(mark2);

		// Step 3: extract the text of every td in the target table and print it.
		ArrayList<String> list = DazhihuiResolveCompanyProfile.resolvePageText(newFile);
		for(int i = 0; i < list.size(); i++){
			System.out.println(list.get(i));
		}
	}

}


