Writing My Own Web Crawler

Recently I needed to crawl some text data for my thesis, so I read a book on the subject and wrote a crawler, plus a program to clean the raw crawled data.

The first version does not use threads at all, so it is fairly slow; the second version uses threads and is indeed much faster.

The crawler relies on two jars:

commons-httpclient-3.0.1.jar

htmlparser.jar

The first package simulates a browser's HTTP requests; the second parses HTML pages.
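
The DownLoadFile helper used by the crawler below wraps the first jar; it is not listed in this post, so the following is only a minimal sketch of what it might look like with the commons-httpclient 3.0.1 API (the timeout value and the local file-naming scheme are my assumptions):

import java.io.DataOutputStream;
import java.io.FileOutputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

public class DownLoadFile {
	/** Fetch the page at url and save it under temp/; returns the local path, or null on failure. */
	public String downloadFile(String url) {
		String filePath = null;
		HttpClient httpClient = new HttpClient();
		// connection timeout: 5 seconds (assumed value)
		httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
		GetMethod getMethod = new GetMethod(url);
		try {
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				return null;
			}
			byte[] responseBody = getMethod.getResponseBody();
			// derive a local file name from the URL, e.g. temp/xxxx1.htm (assumed scheme)
			filePath = "temp/" + url.substring(url.lastIndexOf('/') + 1);
			DataOutputStream out = new DataOutputStream(new FileOutputStream(filePath));
			out.write(responseBody);
			out.close();
		} catch (Exception e) {
			e.printStackTrace();
			filePath = null;
		} finally {
			getMethod.releaseConnection();
		}
		return filePath;
	}
}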

A crawler normally follows a breadth-first strategy, implemented with a queue: enqueue the initial seed URL, dequeue it, parse the <a> tags on that page, filter them, and enqueue every URL that passes the filter; keep dequeuing and processing until the termination condition is met. Be careful to set the termination condition properly, otherwise it is easy to end up in an infinite loop; normally the loop simply runs while the queue is not empty.
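
In the code below this bookkeeping lives in a class called LinkQueue: a FIFO queue of unvisited URLs plus a set of visited URLs for de-duplication. The class itself is not listed in the post, so this is only a rough sketch inferred from the methods the crawler calls on it:

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

public class LinkQueue {
	// URLs that have already been dequeued/downloaded
	private static Set<String> visitedUrl = new HashSet<String>();
	// URLs waiting to be downloaded, in FIFO order (breadth-first)
	private static Queue<String> unVisitedUrl = new LinkedList<String>();

	// enqueue a URL only if it is non-empty and not already queued or visited
	public static void addUnvisitedUrl(String url) {
		if (url != null && url.trim().length() != 0
				&& !visitedUrl.contains(url) && !unVisitedUrl.contains(url)) {
			unVisitedUrl.add(url);
		}
	}

	// dequeue the next URL and mark it as visited (returns Object, so callers cast to String)
	public static Object unVisitedUrlDeQueue() {
		String url = unVisitedUrl.poll();
		if (url != null) {
			visitedUrl.add(url);
		}
		return url;
	}

	public static boolean unVisitedUrlsEmpty() {
		return unVisitedUrl.isEmpty();
	}

	public static void removeAllUnvisited() {
		unVisitedUrl.clear();
	}

	public static void removeAllVisited() {
		visitedUrl.clear();
	}
}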

Here is the main program of the single-threaded version:


import java.util.Set;

public class MyCrawler {
	

	/**
	 * Initialize the URL queue with a seed.
	 * 
	 * @param seeds the seed URL
	 */
	private void initCrawlerWithSeeds(String seeds) {

		LinkQueue.addUnvisitedUrl(seeds);
	}

	/**
	 * The crawling process.
	 * 
	 * @param seeds the seed URL
	 */
	public void crawling(String seeds) {
		// define a filter that keeps only links starting with xxxx
		LinkFilter filter = new LinkFilter() {
			public boolean accept(String url) {
				return url.startsWith("http://xxxx");
			}
		};
		// initialize the URL queue
		initCrawlerWithSeeds(seeds);

		Set<String> links = HtmlParserTool.extracLinks(seeds, filter);
		// enqueue the new, unvisited URLs
		for (String link : links) {
			LinkQueue.addUnvisitedUrl(link);
		}
		while (!LinkQueue.unVisitedUrlsEmpty()) {
			// dequeue the URL at the head of the queue
			String visitUrl = (String) LinkQueue.unVisitedUrlDeQueue();
			if (visitUrl == null)
				continue;
			DownLoadFile downLoader = new DownLoadFile();
			// download the page
			downLoader.downloadFile(visitUrl);
		}
	}

	private void initCrawl() {
		
		LinkQueue.removeAllUnvisited();
		LinkQueue.removeAllVisited();
	}

	// main entry point
	public static void main(String[] args) {
		MyCrawler crawler = new MyCrawler();
		for (int j = 1; j < 201; j++) {
			crawler.initCrawl();
			crawler.crawling("http://xxxx"+j+".htm");
		}
	}
}

The pages I crawled have a fairly simple URL structure, so I just loop over the seed URLs in main. LinkQueue is the URL queue class I defined.
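
Two more helpers appear in the listing above but are not shown: the LinkFilter interface and HtmlParserTool.extracLinks, which uses the htmlparser jar to pull all <a href> links out of a page and keep only those the filter accepts. A rough sketch of how they might be implemented (the GBK encoding is an assumption about the target site; adjust if needed):

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

interface LinkFilter {
	boolean accept(String url);
}

public class HtmlParserTool {
	// extract every <a href> link on the page at url that passes the filter
	public static Set<String> extracLinks(String url, LinkFilter filter) {
		Set<String> links = new HashSet<String>();
		try {
			Parser parser = new Parser(url);
			parser.setEncoding("GBK"); // assumed: the crawled site serves GBK pages
			NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
			NodeList list = parser.extractAllNodesThatMatch(linkFilter);
			for (int i = 0; i < list.size(); i++) {
				Node tag = list.elementAt(i);
				if (tag instanceof LinkTag) {
					String linkUrl = ((LinkTag) tag).getLink();
					if (filter.accept(linkUrl)) {
						links.add(linkUrl);
					}
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return links;
	}
}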

The second version uses multiple threads: for each seed, BThread drains the URL queue and starts a CThread per URL to do the actual download, while AThread simply waits for BThread to finish so the main loop can move on to the next seed.

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

public class MyCrawlerMultiThread {
	public static List<Thread> childThread = new ArrayList<Thread>();
	private final static int FROM=1;
	private final static int TO=201;
	/**
	 * Initialize the URL queue with a seed.
	 * @param seeds the seed URL
	 */
	private static void initCrawlerWithSeeds(String seeds) {

		LinkQueue.addUnvisitedUrl(seeds);
	}

	/**
	 * The crawling process (not called by main below, which drives the thread classes directly).
	 * @param seeds the seed URL
	 */
	public void crawling(String seeds) {

		while (!LinkQueue.unVisitedUrlsEmpty()) {// this loop is what the multi-threaded version hands to BThread
			String visitUrl;
			// dequeue the URL at the head of the queue

			visitUrl = (String) LinkQueue.unVisitedUrlDeQueue();// dequeueing is fast, so it should not be a bottleneck

			if (visitUrl == null)
				continue;
			DownLoadFile downLoader = new DownLoadFile();
			// download the page
			downLoader.downloadFile(visitUrl);
		}
	}

	private void initCrawl() {
		LinkQueue.removeAllUnvisited();
		LinkQueue.removeAllVisited();
	}

	// main entry point
	public static void main(String[] args) {
		MyCrawlerMultiThread crawler = new MyCrawlerMultiThread();
		BThread bt = null;
        AThread at = null;
		for (int j = FROM; j < TO; j++) {
			crawler.initCrawl();
			// define a filter that keeps only links starting with xxx
			LinkFilter filter = new LinkFilter() {
				public boolean accept(String url) {
					return url.startsWith("http://xxx");
				}
			};
			String seeds = "http://xxxx" + j + ".htm";
			// initialize the URL queue
			initCrawlerWithSeeds(seeds);

			Set<String> links = HtmlParserTool.extracLinks(seeds, filter);
			// enqueue the new, unvisited URLs
			for (String link : links) {
				LinkQueue.addUnvisitedUrl(link);
			}
			bt=new BThread();
			at=new AThread(bt);
			try {
	            bt.start();
	            at.start();
	            bt.join();
	        } catch (Exception e) {
	            System.out.println("Exception from main");
	        }
		}
	}

}

class CThread extends Thread {
	private String visitUrl;

	public CThread(String url) {
		super("[CThread] Thread");
		this.visitUrl = url;
	};

	public void run() {
		String threadName = Thread.currentThread().getName();
		try {

			DownLoadFile downLoader = new DownLoadFile();
			// download the page
			downLoader.downloadFile(visitUrl);

		} catch (Exception e) {
			System.out.println("Exception from " + threadName + ".run");
		}
	}
}

class BThread extends Thread {
	public BThread() {
		super("[BThread] Thread");
	};

	public void run() {
		String threadName = Thread.currentThread().getName();
		System.out.println(threadName + " start.");
		try {
			while (!LinkQueue.unVisitedUrlsEmpty()) {// start one download thread per URL
				String visitUrl;
				// dequeue the URL at the head of the queue

				visitUrl = (String) LinkQueue.unVisitedUrlDeQueue();

				if (visitUrl == null)
					continue;
				new CThread(visitUrl).start();
			}

		} catch (Exception e) {
			System.out.println("Exception from " + threadName + ".run");
		}
	}
}

class AThread extends Thread {
	BThread bt;

	public AThread(BThread bt) {
		super("[AThread] Thread");
		this.bt = bt;
	}

	public void run() {
		String threadName = Thread.currentThread().getName();
		System.out.println(threadName + " start.");
		try {
			bt.join();
			System.out.println(threadName + " end.");
		} catch (Exception e) {
			System.out.println("Exception from " + threadName + ".run");
		}
	}
}
This is enough to download the pages we need. The next step is to clean and de-noise the downloaded pages:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class FileUtil {
	private static String filePath = "temp";// directory holding the downloaded HTML files

	public static File[] getAllFiles(String filePath) {
		File root = new File(filePath);
		File[] files = root.listFiles();
		return files;
	}

	public static String openFile(File fileName, String encode) {
		try {
			BufferedReader bis = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), encode));
			String szContent = "";
			String szTemp;
			while ((szTemp = bis.readLine()) != null) {
				szContent += szTemp + "\n";
			}
			bis.close();
			return szContent;
		} catch (Exception e) {
			return "";
		}
	}

	public static String getContent(File file) throws ParserException {
		String eL1 = "[0-9]{4}-[0-9]{2}-[0-9]{2}[0-9]{2}:[0-9]{2}:[0-9]{2}";// regex matching a post date-time (whitespace already stripped)
		String eL2 = "[0-9]{1,2}岁";// regex matching an age such as "35岁"

		NodeFilter titleFilter = new HasAttributeFilter("class", "fl dib fb");
		NodeFilter infoFilter = new HasAttributeFilter("class", "f12 graydeep Userinfo clearfix pl29");
		NodeFilter describeFilter = new HasAttributeFilter("class", "graydeep User_quecol pt10 mt10");// the patient's own description and the help they want
		NodeFilter answerFilter = new HasAttributeFilter("class", "Doc_dochf mb15 bc");// ordinary doctor replies
		NodeFilter adoptFilter = new HasAttributeFilter("class", "Doc_dochf Best_dochf bc");// the reply accepted by the patient

		Parser parser1 = new Parser();
		Parser parser2 = new Parser();
		Parser parser3 = new Parser();
		Parser parser4 = new Parser();
		Parser parser5 = new Parser();
		Parser parser6 = new Parser();
		Pattern p1 = Pattern.compile(eL1);
		Pattern p2 = Pattern.compile(eL2);

		String fileContent = FileUtil.openFile(file, "GBK");
		parser1.setInputHTML(fileContent);
		parser2.setInputHTML(fileContent);
		parser3.setInputHTML(fileContent);
		parser4.setInputHTML(fileContent);
		parser5.setInputHTML(fileContent);
		parser6.setInputHTML(fileContent);
		NodeList nodes = new NodeList();
		nodes.add(parser1.extractAllNodesThatMatch(titleFilter));
		nodes.add(parser2.extractAllNodesThatMatch(infoFilter));
		nodes.add(parser3.extractAllNodesThatMatch(describeFilter));
		nodes.add(parser5.extractAllNodesThatMatch(answerFilter));
		nodes.add(parser6.extractAllNodesThatMatch(adoptFilter));

		StringBuffer textLine = new StringBuffer();
		StringBuffer splitLine = new StringBuffer();
		String date = "";
		HtmlParser.totalFileNum++;
		for (int j = 0; j < nodes.size(); j++) {
			Node textNode = (Node) nodes.elementAt(j);

			if (j == 0) {
				textLine.append(HtmlParser.totalFileNum + "|" + textNode.toPlainTextString() + "|");
			} else if (j == 1) {// patient info block
				NodeList infoList = new NodeList();
				infoList = textNode.getChildren();
				int nodeNeed = 0;

				for (int m = 0; m < infoList.size(); m++) {// the node list contains many whitespace-only nodes
					Node tmp = (Node) infoList.elementAt(m);
					String textTmp = tmp.toPlainTextString();
					if (nodeNeed == 4)
						break;
					String trimTextTmp = textTmp.replace("\n", "").replaceAll("\r", "").replaceAll(" ", "");
					if (trimTextTmp.length() != 0) {
						Matcher matcher = p1.matcher(trimTextTmp);
						Matcher matcher2 = p2.matcher(trimTextTmp);
						if (matcher2.matches()) {// normalize the age (strip the trailing 岁)
							trimTextTmp = trimTextTmp.replaceFirst("岁", "");
						}
						if (matcher.matches()) {// this node is the date; keep it separately
							date = textTmp.replace("\n", "").replaceAll("\r", "");
						} else {
							textLine.append(trimTextTmp + "|");
						}
						nodeNeed++;
					}
				}
			} else if (j == 2) {// illness description and the help the patient wants
				textLine.append("健康咨询描述:" + textNode.toPlainTextString().replaceAll("\n", "") + "|null|" + date + "|");
			} else if (j >= 3) {// doctor replies; there may be several

				NodeList docAns = new NodeList();
				docAns = textNode.getChildren();
				splitLine.append(textLine.toString() + "医生" + j + "|null|"
						+ docAns.elementAt(1).toPlainTextString().trim().replaceAll("\n", "") + "|"
						+ docAns.elementAt(3).toPlainTextString().trim().replaceAll("\n", "") + "|\n");

			}
		}
		// System.out.println(textLine);
		return splitLine.toString();
	}
	public static void writeContent() throws ParserException {
		File[] files = FileUtil.getAllFiles(filePath);
		
		try {
			String path = "data\\data_xywy.txt";
			File dataFile = new File(path);
			if (!dataFile.exists())
				dataFile.createNewFile();

			FileOutputStream out = new FileOutputStream(dataFile, true); // true = append mode
			for(File file:files){
				String content = FileUtil.getContent(file);
				if (content == null)
					break;
				StringBuffer sb = new StringBuffer();
				sb.append(content);
				System.out.println(HtmlParser.totalFileNum);
				out.write(sb.toString().getBytes("utf-8"));// note: convert to the output charset
			}
			
			out.close();
		} catch (IOException ex) {
			ex.printStackTrace();
		}
	}
}
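
The cleaning code above also references a class called HtmlParser, which is not shown either; presumably it is the small driver that holds the file counter and kicks off writeContent. A hypothetical minimal version (only the static counter is actually used above):

import org.htmlparser.util.ParserException;

public class HtmlParser {
	// running count of processed files, referenced from FileUtil.getContent
	public static int totalFileNum = 0;

	public static void main(String[] args) throws ParserException {
		// parse every file under "temp" and append the cleaned records to data\data_xywy.txt
		FileUtil.writeContent();
	}
}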

Source code: http://download.csdn.net/detail/zbuger/9173757
