htmlparser初体验

阅读更多

昨天晚上完成了网页的下载，暂时不用和 Heritrix 打交道了。有空我要好好研究一下它的代码，只是现在没那么多时间。

今天对 htmlparser 有了初步了解，并自己写了一个简单的小程序，可以提取出网页中图片的 URL：

package test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public  class Extractor {
	private String outputPath;

	private String inputPath;

	private Parser parse;

	/**
	 * Returns the output location configured for this extractor.
	 *
	 * <p>{@code extract()} concatenates a file name directly onto this value,
	 * so it is expected to end with a path separator.
	 *
	 * @return the configured output path, or {@code null} if none has been set
	 */
	public String getOutputPath() {
		return outputPath;
	}

	/**
	 * Sets the output location used when writing extraction results.
	 *
	 * @param outputPath the path prefix to store; the value is kept as-is,
	 *                   without validation
	 */
	public void setOutputPath(String outputPath) {
		this.outputPath = outputPath;
	}

	/**
	 * Returns the input location configured for this extractor.
	 *
	 * @return the configured input path, or {@code null} if none has been set
	 */
	public String getInputPath() {
		return inputPath;
	}

	/**
	 * Sets the input location for this extractor.
	 *
	 * <p>This only records the value; it does not create or reconfigure the
	 * parser, which is supplied separately through {@code setParse}.
	 *
	 * @param inputPath the path to store; the value is kept as-is, without
	 *                  validation
	 */
	public void setInputPath(String inputPath) {
		this.inputPath = inputPath;
	}

	/**
	 * Returns the HTML parser that {@code extract()} runs its node filters against.
	 *
	 * @return the configured parser, or {@code null} if none has been set
	 */
	public Parser getParse() {
		return parse;
	}

	/**
	 * Sets the HTML parser used by this extractor.
	 *
	 * @param parse the parser to store; must be set before {@code extract()}
	 *              is called, since that method dereferences it directly
	 */
	public void setParse(Parser parse) {
		this.parse = parse;
	}

	/**
	 * Demo entry point: configures an {@code Extractor} with a fixed local
	 * HTML file and output directory, then runs the extraction.
	 *
	 * <p>Parser failures are reported on standard error via the stack trace;
	 * the program then exits normally.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		Extractor ex = new Extractor();
		ex.setInputPath("F:/Workspaces/MyEclipse 7.1/test/src/test/index.html");
		ex.setOutputPath("F:/Workspaces/MyEclipse 7.1/test/src/test/");
		try {
			// Build the parser from the configured input path rather than
			// repeating the literal, so the two can never drift apart.
			ex.setParse(new Parser(ex.getInputPath()));
			ex.extract();
		} catch (ParserException e) {
			e.printStackTrace();
		}
	}
	
	public void extract(){
		NodeFilter pic_filter = new AndFilter(new TagNameFilter("td"),
				new HasAttributeFilter("class", "series_sy_intro_pic"));

		NodeFilter Attribute_filter = new AndFilter(new TagNameFilter("td"),
				new AndFilter(new HasAttributeFilter("class", "bor1_c1"),
						new HasAttributeFilter("style", "padding:5px;")));
		try {
			this.getParse().setEncoding("gb2312");
			NodeList pic_nodes =this.getParse().parse(pic_filter);
			System.out.println("a");
			TableColumn tc = (TableColumn) pic_nodes.elementAt(0);
			
			ImageTag it = (ImageTag)(tc.childAt(1).getChildren().elementAt(0));
			String imgURL = it.getImageURL();
System.out.println(imgURL);
			BufferedWriter bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath()+"aa.txt")));
			bw.write(imgURL);
			bw.flush();
			
//			for(int i=0;i 
 

 过节，休息一下，明天继续……

 

你可能感兴趣的:(Myeclipse,F#,HTML)