I recently needed to crawl some text data for a paper, so I worked through a book and wrote a crawler, plus a program to clean the raw crawled data.
The first version does not use threads, so it is fairly slow; the second version uses threads and is noticeably faster.
The crawler uses two libraries:
commons-httpclient-3.0.1.jar
htmlparser.jar
The first simulates a browser's HTTP requests; the second parses HTML pages.
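As an illustration of the first library, here is a minimal page-download helper in the style of the DownLoadFile class used later in the post. Only the class and method names appear in the original; the body and the temp/ file layout are my assumptions, a sketch rather than the author's actual implementation:

import java.io.FileOutputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

public class DownLoadFile {
    // Fetch the page at visitUrl and save it under temp/ (hypothetical layout)
    public void downloadFile(String visitUrl) {
        HttpClient client = new HttpClient();
        GetMethod get = new GetMethod(visitUrl);
        try {
            int status = client.executeMethod(get);
            if (status == HttpStatus.SC_OK) {
                byte[] body = get.getResponseBody();
                FileOutputStream out = new FileOutputStream("temp/" + visitUrl.hashCode() + ".html");
                out.write(body);
                out.close();
            }
        } catch (Exception e) {
            System.out.println("Failed to download " + visitUrl);
        } finally {
            // Return the connection to the pool
            get.releaseConnection();
        }
    }
}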
Crawlers usually follow a breadth-first strategy, implemented with a queue. The initial URLs are enqueued as seeds; then a URL is dequeued, the <a> tags in that page are parsed out and filtered, and the URLs that pass the filter are enqueued in turn. Dequeuing continues until the exit condition is met. Be sure to set the exit condition carefully, or the crawler can easily fall into an infinite loop; the usual condition is to keep going as long as the queue is non-empty.
Here is the main program of the single-threaded version:
import java.util.Set;

public class MyCrawler {

    /**
     * Initialize the URL queue with a seed.
     *
     * @param seeds the seed URL
     */
    private void initCrawlerWithSeeds(String seeds) {
        LinkQueue.addUnvisitedUrl(seeds);
    }

    /**
     * The crawling process.
     *
     * @param seeds the seed URL
     */
    public void crawling(String seeds) {
        // Define a filter that keeps only links starting with the target prefix
        LinkFilter filter = new LinkFilter() {
            public boolean accept(String url) {
                return url.startsWith("http://xxxx");
            }
        };
        // Initialize the URL queue
        initCrawlerWithSeeds(seeds);
        Set<String> links = HtmlParserTool.extracLinks(seeds, filter);
        // Enqueue the new, unvisited URLs
        for (String link : links) {
            LinkQueue.addUnvisitedUrl(link);
        }
        while (!LinkQueue.unVisitedUrlsEmpty()) {
            // Dequeue the URL at the head of the queue
            String visitUrl = (String) LinkQueue.unVisitedUrlDeQueue();
            if (visitUrl == null)
                continue;
            DownLoadFile downLoader = new DownLoadFile();
            // Download the page
            downLoader.downloadFile(visitUrl);
        }
    }

    private void initCrawl() {
        LinkQueue.removeAllUnvisited();
        LinkQueue.removeAllVisited();
    }

    // main entry point
    public static void main(String[] args) {
        MyCrawler crawler = new MyCrawler();
        for (int j = 1; j < 201; j++) {
            crawler.initCrawl();
            crawler.crawling("http://xxxx" + j + ".htm");
        }
    }
}
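The helper classes LinkFilter, LinkQueue, and HtmlParserTool are used above but never shown in the post. Below are minimal sketches of what they might look like, reconstructed from the calls the crawler makes; the method names come from the code above, everything else is my assumption:

// The filter interface implemented anonymously in crawling()
public interface LinkFilter {
    boolean accept(String url);
}

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

public class LinkQueue {
    // URLs that have already been dequeued for download
    private static Set<String> visitedUrls = new HashSet<String>();
    // FIFO queue of URLs waiting to be downloaded
    private static Queue<String> unVisitedUrls = new LinkedList<String>();

    public static void addUnvisitedUrl(String url) {
        // Skip empty URLs, already-visited URLs, and duplicates in the queue
        if (url != null && !url.trim().isEmpty()
                && !visitedUrls.contains(url) && !unVisitedUrls.contains(url)) {
            unVisitedUrls.add(url);
        }
    }

    public static Object unVisitedUrlDeQueue() {
        String url = unVisitedUrls.poll();
        if (url != null) {
            visitedUrls.add(url);
        }
        return url;
    }

    public static boolean unVisitedUrlsEmpty() {
        return unVisitedUrls.isEmpty();
    }

    public static void removeAllUnvisited() {
        unVisitedUrls.clear();
    }

    public static void removeAllVisited() {
        visitedUrls.clear();
    }
}

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class HtmlParserTool {
    // Extract all <a> hrefs from the page at url that pass the filter
    public static Set<String> extracLinks(String url, LinkFilter filter) {
        Set<String> links = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < list.size(); i++) {
                Node node = list.elementAt(i);
                String linkUrl = ((LinkTag) node).getLink();
                if (filter.accept(linkUrl)) {
                    links.add(linkUrl);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return links;
    }
}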
The second version uses multiple threads:
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

public class MyCrawlerMultiThread {

    public static List<Thread> childThread = new ArrayList<Thread>();
    private final static int FROM = 1;
    private final static int TO = 201;

    /**
     * Initialize the URL queue with a seed.
     *
     * @param seeds the seed URL
     */
    private static void initCrawlerWithSeeds(String seeds) {
        LinkQueue.addUnvisitedUrl(seeds);
    }

    /**
     * The crawling process.
     *
     * @param seeds the seed URL
     */
    public void crawling(String seeds) {
        while (!LinkQueue.unVisitedUrlsEmpty()) { // this is where multithreading comes in
            // Dequeue the URL at the head of the queue
            String visitUrl = (String) LinkQueue.unVisitedUrlDeQueue(); // dequeuing is fast, so it should not hurt
            if (visitUrl == null)
                continue;
            DownLoadFile downLoader = new DownLoadFile();
            // Download the page
            downLoader.downloadFile(visitUrl);
        }
    }

    private void initCrawl() {
        LinkQueue.removeAllUnvisited();
        LinkQueue.removeAllVisited();
    }

    // main entry point
    public static void main(String[] args) {
        MyCrawlerMultiThread crawler = new MyCrawlerMultiThread();
        BThread bt = null;
        AThread at = null;
        for (int j = FROM; j < TO; j++) {
            crawler.initCrawl();
            // Define a filter that keeps only links starting with the target prefix
            LinkFilter filter = new LinkFilter() {
                public boolean accept(String url) {
                    return url.startsWith("http://xxx");
                }
            };
            String seeds = "http://xxxx" + j + ".htm";
            // Initialize the URL queue
            initCrawlerWithSeeds(seeds);
            Set<String> links = HtmlParserTool.extracLinks(seeds, filter);
            // Enqueue the new, unvisited URLs
            for (String link : links) {
                LinkQueue.addUnvisitedUrl(link);
            }
            bt = new BThread();
            at = new AThread(bt);
            try {
                bt.start();
                at.start();
                // Wait for this seed's downloads to finish before the next iteration
                bt.join();
            } catch (Exception e) {
                System.out.println("Exception from main");
            }
        }
    }
}

// Downloads a single URL
class CThread extends Thread {
    private String visitUrl;

    public CThread(String url) {
        super("[CThread] Thread");
        this.visitUrl = url;
    }

    public void run() {
        String threadName = Thread.currentThread().getName();
        try {
            DownLoadFile downLoader = new DownLoadFile();
            // Download the page
            downLoader.downloadFile(visitUrl);
        } catch (Exception e) {
            System.out.println("Exception from " + threadName + ".run");
        }
    }
}

// Dequeues URLs and spawns one CThread per URL
class BThread extends Thread {
    public BThread() {
        super("[BThread] Thread");
    }

    public void run() {
        String threadName = Thread.currentThread().getName();
        System.out.println(threadName + " start.");
        try {
            while (!LinkQueue.unVisitedUrlsEmpty()) {
                // Dequeue the URL at the head of the queue
                String visitUrl = (String) LinkQueue.unVisitedUrlDeQueue();
                if (visitUrl == null)
                    continue;
                new CThread(visitUrl).start();
            }
        } catch (Exception e) {
            System.out.println("Exception from " + threadName + ".run");
        }
    }
}

// Waits for BThread to finish, then reports
class AThread extends Thread {
    BThread bt;

    public AThread(BThread bt) {
        super("[AThread] Thread");
        this.bt = bt;
    }

    public void run() {
        String threadName = Thread.currentThread().getName();
        System.out.println(threadName + " start.");
        try {
            bt.join();
            System.out.println(threadName + " end.");
        } catch (Exception e) {
            System.out.println("Exception from " + threadName + ".run");
        }
    }
}

This downloads the pages we need.
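One caveat with this design: BThread spawns a brand-new CThread for every URL, which can create a large number of threads at once. A common alternative (not in the original post) is a fixed-size thread pool via ExecutorService; a minimal sketch, assuming the same DownLoadFile and LinkQueue helpers as above:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class PooledCrawler {
    public static void main(String[] args) throws InterruptedException {
        // A bounded pool instead of one thread per URL
        ExecutorService pool = Executors.newFixedThreadPool(10);
        while (!LinkQueue.unVisitedUrlsEmpty()) {
            final String visitUrl = (String) LinkQueue.unVisitedUrlDeQueue();
            if (visitUrl == null)
                continue;
            pool.execute(new Runnable() {
                public void run() {
                    // Download the page on a pooled worker thread
                    new DownLoadFile().downloadFile(visitUrl);
                }
            });
        }
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.HOURS);
    }
}

The pool caps concurrency at ten downloads, and the JVM reuses the worker threads instead of paying thread-creation cost per page.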
The next step is to clean and de-noise the downloaded pages:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class FileUtil {

    private static String filePath = "temp"; // directory holding the downloaded HTML files

    public static File[] getAllFiles(String filePath) {
        File root = new File(filePath);
        File[] files = root.listFiles();
        return files;
    }

    public static String openFile(File fileName, String encode) {
        try {
            BufferedReader bis = new BufferedReader(
                    new InputStreamReader(new FileInputStream(fileName), encode));
            StringBuffer szContent = new StringBuffer();
            String szTemp;
            while ((szTemp = bis.readLine()) != null) {
                szContent.append(szTemp + "\n");
            }
            bis.close();
            return szContent.toString();
        } catch (Exception e) {
            return "";
        }
    }

    public static String getContent(File file) throws ParserException {
        // Matches a timestamp; spaces are stripped below, so date and time end up glued together
        String eL1 = "[0-9]{4}-[0-9]{2}-[0-9]{2}[0-9]{2}:[0-9]{2}:[0-9]{2}";
        // Matches an age such as "35岁" ("岁" = years old)
        String eL2 = "[0-9]{1,2}岁";
        NodeFilter titleFilter = new HasAttributeFilter("class", "fl dib fb");
        NodeFilter infoFilter = new HasAttributeFilter("class", "f12 graydeep Userinfo clearfix pl29");
        // The patient's own description and the help they are seeking
        NodeFilter describeFilter = new HasAttributeFilter("class", "graydeep User_quecol pt10 mt10");
        // Ordinary replies
        NodeFilter answerFilter = new HasAttributeFilter("class", "Doc_dochf mb15 bc");
        // The reply accepted by the patient
        NodeFilter adoptFilter = new HasAttributeFilter("class", "Doc_dochf Best_dochf bc");
        Parser parser1 = new Parser();
        Parser parser2 = new Parser();
        Parser parser3 = new Parser();
        Parser parser5 = new Parser();
        Parser parser6 = new Parser();
        Pattern p1 = Pattern.compile(eL1);
        Pattern p2 = Pattern.compile(eL2);

        String fileContent = FileUtil.openFile(file, "GBK");
        parser1.setInputHTML(fileContent);
        parser2.setInputHTML(fileContent);
        parser3.setInputHTML(fileContent);
        parser5.setInputHTML(fileContent);
        parser6.setInputHTML(fileContent);

        NodeList nodes = new NodeList();
        nodes.add(parser1.extractAllNodesThatMatch(titleFilter));
        nodes.add(parser2.extractAllNodesThatMatch(infoFilter));
        nodes.add(parser3.extractAllNodesThatMatch(describeFilter));
        nodes.add(parser5.extractAllNodesThatMatch(answerFilter));
        nodes.add(parser6.extractAllNodesThatMatch(adoptFilter));

        StringBuffer textLine = new StringBuffer();
        StringBuffer splitLine = new StringBuffer();
        String date = "";
        HtmlParser.totalFileNum++;

        for (int j = 0; j < nodes.size(); j++) {
            Node textNode = (Node) nodes.elementAt(j);
            if (j == 0) { // question title
                textLine.append(HtmlParser.totalFileNum + "|" + textNode.toPlainTextString() + "|");
            } else if (j == 1) { // patient information
                NodeList infoList = textNode.getChildren();
                int nodeNeed = 0;
                for (int m = 0; m < infoList.size(); m++) { // the child nodes contain many blanks
                    Node tmp = (Node) infoList.elementAt(m);
                    String textTmp = tmp.toPlainTextString();
                    if (nodeNeed == 4)
                        break;
                    String trimTextTmp = textTmp.replace("\n", "").replaceAll("\r", "").replaceAll(" ", "");
                    if (trimTextTmp.length() != 0) {
                        Matcher matcher = p1.matcher(trimTextTmp);
                        Matcher matcher2 = p2.matcher(trimTextTmp);
                        if (matcher2.matches()) { // normalize the age field
                            trimTextTmp = trimTextTmp.replaceFirst("岁", "");
                        }
                        if (matcher.matches()) { // keep only the date
                            date = textTmp.replace("\n", "").replaceAll("\r", "");
                        } else {
                            textLine.append(trimTextTmp + "|");
                        }
                        nodeNeed++;
                    }
                }
            } else if (j == 2) { // description of the condition and the help sought
                // "健康咨询描述" = health consultation description
                textLine.append("健康咨询描述:" + textNode.toPlainTextString().replaceAll("\n", "")
                        + "|null|" + date + "|");
            } else if (j >= 3) { // doctors' answers; there may be several
                NodeList docAns = textNode.getChildren();
                // "医生" = doctor
                splitLine.append(textLine.toString() + "医生" + j + "|null|"
                        + docAns.elementAt(1).toPlainTextString().trim().replaceAll("\n", "") + "|"
                        + docAns.elementAt(3).toPlainTextString().trim().replaceAll("\n", "") + "|\n");
            }
        }
        return splitLine.toString();
    }

    public static void writeContent() throws ParserException {
        File[] files = FileUtil.getAllFiles(filePath);
        if (files == null)
            return; // the directory does not exist
        try {
            String path = "data\\data_xywy.txt";
            File dataFile = new File(path);
            if (!dataFile.exists())
                dataFile.createNewFile();
            // true = append to the file instead of overwriting
            FileOutputStream out = new FileOutputStream(dataFile, true);
            for (File file : files) {
                String content = FileUtil.getContent(file);
                if (content.isEmpty())
                    continue; // skip files that yield no parsed content
                System.out.println(HtmlParser.totalFileNum);
                out.write(content.getBytes("utf-8")); // convert to the target character set
            }
            out.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}