抓取口碑网店铺资料

阅读更多

   呵呵,只为自己玩,哈哈。

   技术难度:

   1)快速高效地抓取记录,并支持去重以及以后的增量抓取。

   2)因为口碑网的联系方式是以图片形式展示的,如何批量地完成OCR转换。

 

   本文只是一个实验,不建议使用在项目当中,如下是部分代码。

 

   涉及的开源代码:

   crawler4j

   AspriseOCR

 

   资源包,把crawler4j所有jar包放在你的应用目录中。

 

  • Berkeley DB Java Edition 4.0.71 or higher
  • fastutil 5.1.5
  • DSI Utilities 1.0.10 or higher
  • Apache HttpClient 4.0.1
  • Apache Log4j 1.2.15
  • Apache Commons Logging 1.1.1
  • Apache Commons Codec 1.4

  如上的JAR包都必须包含在你的项目lib当中,否则会出错。另外,下面的代码还用到了JUnit、HTMLParser和Apache Commons Lang,这些JAR包也需要一并加入。

 

  第一步,抓取口碑网店铺信息。
   如下是抓取的部分代码:

  主程序入口:Controller.java

package com.aragon.crawdata;

import org.junit.Test;

import edu.uci.ics.crawler4j.crawler.CrawlController;

/**
 * Entry point of the crawl, written as a JUnit test so it can be launched
 * directly from an IDE or a test runner.
 */
public class Controller {
	/** Seed URL handed to the crawl controller. */
	private static String CRAWSITE = "http://beijing.koubei.com/";

	/**
	 * Creates a {@link CrawlController}, registers the seed URL and starts the
	 * crawl with {@link MyCrawler}.
	 *
	 * @throws IllegalStateException
	 *             if the controller cannot be created or the crawl fails; the
	 *             original exception is kept as the cause
	 */
	@Test
	public void test(){
		try {
			// presumably the folder where crawler4j keeps its crawl state - TODO confirm
			CrawlController controller = new CrawlController("/data/crawl/root");
			controller.addSeed(CRAWSITE);
			// presumably 3 is the number of concurrent crawler instances - TODO confirm
			controller.start(MyCrawler.class, 3);
		} catch (Exception e) {
			// Do not swallow the failure: a test that only prints the stack
			// trace would be reported as passed even though nothing was crawled.
			throw new IllegalStateException("Crawl of " + CRAWSITE + " failed", e);
		}
	}

}

    抓取主程序 MyCrawler.java

package com.aragon.crawdata;

import java.util.ArrayList;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Crawler that follows links below the store section of the site, extracts
 * shop data from every visited page with {@link Template4Koubei} and stores
 * each extracted record through {@link SaveInDb}.
 */
public class MyCrawler extends WebCrawler {
	/** Only URLs starting with this prefix are followed. */
	private static String CRAWSITE = "http://beijing.koubei.com/store/";
	/*
	 * NOTE(review): shouldVisit() lower-cases the URL before matching, while
	 * this pattern contains an upper-case 'I' and is applied with matches()
	 * (whole-string match), so as written it can never match. The intended
	 * exclusion is unclear, so the pattern is left unchanged.
	 */
	Pattern filters = Pattern.compile(".(detail--storeId-*)$");
	SaveInDb savein = new SaveInDb();

	public MyCrawler() {
	}

	/**
	 * Decides whether a discovered link should be crawled.
	 *
	 * @param url
	 *            the discovered link
	 * @return {@code true} when the lower-cased URL is not rejected by
	 *         {@link #filters} and starts with the store prefix
	 */
	public boolean shouldVisit(WebURL url) {
		String href = url.getURL().toLowerCase();
		if (filters.matcher(href).matches()) {
			return false;
		}
		return href.startsWith(CRAWSITE);
	}

	/**
	 * Extracts shop data from a fetched page and inserts it into the
	 * {@code info} table. Pages from which no shop name can be extracted are
	 * skipped. Failures are printed and do not stop the crawl.
	 *
	 * @param page
	 *            the fetched page
	 */
	public void visit(Page page) {
		String url = page.getWebURL().getURL();
		System.out.println("Url:" + url);
		Template4Koubei koubei = new Template4Koubei();
		try {
			CrawDataModel model = koubei.translationData(page.getHTML());
			if (model != null) {
				// Save the record in the database.
				savein.update(buildInsertSql(model));
			}
		} catch (Exception e) {
			// Best effort: one bad page must not abort the whole crawl.
			e.printStackTrace();
		}
	}

	/**
	 * Builds the INSERT statement for one record. Every value is escaped
	 * because it is text scraped from a web page.
	 */
	private static String buildInsertSql(CrawDataModel model) {
		String[] values = { model.getType(), model.getName(),
				model.getAddress(), model.getTel1(), model.getTel2(),
				model.getTel3(), model.getOtherInfo(), model.getOtherType(),
				model.getOwnerName() };
		StringBuilder sql = new StringBuilder(
				"insert into info(type,name,address,tel1,tel2,tel3,otherinfo,othertype,ownername) value (");
		for (int i = 0; i < values.length; i++) {
			if (i > 0) {
				sql.append(',');
			}
			sql.append('\'').append(escapeSql(values[i])).append('\'');
		}
		return sql.append(')').toString();
	}

	/**
	 * Escapes a value for use inside a single-quoted SQL string literal.
	 * Backslashes and single quotes are doubled; this assumes MySQL-style
	 * literals (the statement already uses the MySQL "value" spelling) -
	 * TODO confirm against the actual database. A null value is rendered as
	 * the text "null", exactly as the previous string concatenation did.
	 */
	private static String escapeSql(String value) {
		String text = (value == null) ? "null" : value;
		return text.replace("\\", "\\\\").replace("'", "''");
	}

}

   真正解析口碑网店铺的类Template4Koubei.java

package com.aragon.crawdata;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;

/**
 * Extracts shop information from the HTML of a shop page by looking up
 * elements through their {@code class} attribute and fills a
 * {@link CrawDataModel} with the result.
 */
public class Template4Koubei {
	private static final String NAME_TAG = "title-wrap yk-fix-float";// shop name
	private static final String TYPE_TAG = "detail-info-item";// shop type
	private static final String ADDRESS_TAG = "info yk-fix-float";// address
	private static final String TEL_TAG = "detail-info-item";// phone numbers (rendered as images)
	private static final String MOB_TAG = "detail-info-item";
	private static final String DETAIL_TAG = "detail-intro";
	private static final String OTHERINFO_TAG = "detail-info-item";

	/** Marker text that separates the address from trailing phone text. */
	private static final String PHONE_MARKER = "电话";

	/**
	 * Parses one page into a model.
	 *
	 * @param webHtml
	 *            HTML source of the page
	 * @return the populated model, or {@code null} when no shop name could be
	 *         extracted from the page
	 * @throws Exception
	 *             if the HTML parser fails
	 */
	public CrawDataModel translationData(String webHtml) throws Exception {
		String name = firstText(findCrawDataByCol(webHtml, NAME_TAG));
		if (name == null) {
			// Without a name the page is not treated as a shop page.
			return null;
		}
		CrawDataModel model = new CrawDataModel();
		model.setName(name);

		String type = firstText(findCrawDataByCol(webHtml, TYPE_TAG));
		if (type != null) {
			model.setType(type);
		}

		// Up to three phone image URLs are kept; any further ones are ignored.
		Object[] tel = findCrawDataByImageFilter(webHtml, TEL_TAG);
		if (tel != null) {
			if (tel.length > 0) {
				model.setTel1(StringUtils.deleteWhitespace(tel[0].toString()));
			}
			if (tel.length > 1) {
				model.setTel2(StringUtils.deleteWhitespace(tel[1].toString()));
			}
			if (tel.length > 2) {
				model.setTel3(StringUtils.deleteWhitespace(tel[2].toString()));
			}
		}

		// The first three info items are concatenated into one string.
		Object[] otherType = findCrawDataByCol(webHtml, OTHERINFO_TAG);
		if (otherType != null) {
			StringBuilder joined = new StringBuilder();
			for (int i = 0; i < otherType.length && i < 3; i++) {
				joined.append(StringUtils.deleteWhitespace(otherType[i]
						.toString()));
			}
			model.setOtherType(joined.toString());
		}

		String address = firstText(findCrawDataByCol(webHtml, ADDRESS_TAG));
		if (address != null) {
			model.setAddress(address);
		}

		String detail = firstText(findCrawDataByCol(webHtml, DETAIL_TAG));
		if (detail != null) {
			model.setOtherInfo(detail);
		}

		// When the address text also carries phone text, split it: the part
		// from the marker on replaces otherType, the part before it stays
		// as the address.
		if (model.getAddress() != null) {
			String fullAddress = model.getAddress();
			int markerAt = fullAddress.indexOf(PHONE_MARKER);
			if (markerAt > 0) {
				model.setOtherType(fullAddress.substring(markerAt));
				model.setAddress(fullAddress.substring(0, markerAt));
			}
		}

		return model;
	}

	/**
	 * Returns the first entry of a finder result with all whitespace removed,
	 * or {@code null} when the result is null, empty or starts with null.
	 * Guards against indexing an empty array.
	 */
	private static String firstText(Object[] values) {
		if (values == null || values.length == 0 || values[0] == null) {
			return null;
		}
		return StringUtils.deleteWhitespace(values[0].toString());
	}

	/**
	 * Collects the plain text of every span, table column or div whose
	 * {@code class} attribute equals the given value.
	 *
	 * @param webHtml
	 *            HTML source of the page
	 * @param Tag
	 *            value of the {@code class} attribute to look for
	 * @return the texts in document order (possibly an empty array), or
	 *         {@code null} when no node carries the attribute value
	 * @throws Exception
	 *             if the HTML parser fails
	 */
	public Object[] findCrawDataByCol(String webHtml, String Tag)
			throws Exception {
		List<String> resultTextList = new ArrayList<String>();
		Parser parser = new Parser();
		parser.setInputHTML(webHtml);

		/* Attribute filter: selects nodes by the value of their class attribute. */
		NodeFilter filter = new HasAttributeFilter("class", Tag);

		NodeList nodeList = parser.extractAllNodesThatMatch(filter);
		if (nodeList == null || nodeList.size() == 0) {
			return null;
		}

		Node[] nodes = nodeList.toNodeArray();
		for (int i = 0; i < nodes.length; i++) {
			Node node = nodes[i];
			// Start from scratch for every node so that a node of another tag
			// type does not re-add the text of the previous node.
			String line = null;
			if (node instanceof Span || node instanceof TableColumn
					|| node instanceof Div) {
				line = node.toPlainTextString();
			}
			// StringUtil is a project helper; presumably true for null/blank text.
			if (!StringUtil.isTrimEmpty(line)) {
				resultTextList.add(line);
			}
		}
		return resultTextList.toArray();
	}

	/**
	 * Collects the URLs of all images whose parent tag text contains the given
	 * value (at an index greater than zero).
	 *
	 * @param webHtml
	 *            HTML source of the page
	 * @param Tag
	 *            text that has to occur in the parent tag of an image
	 * @return the image URLs in document order (possibly an empty array), or
	 *         {@code null} when the page contains no image at all
	 * @throws Exception
	 *             if the HTML parser fails
	 */
	public Object[] findCrawDataByImageFilter(String webHtml, String Tag)
			throws Exception {
		List<String> resultTextList = new ArrayList<String>();
		Parser parser = new Parser();
		parser.setInputHTML(webHtml);

		/* Class filter: selects every image tag of the page. */
		NodeFilter filter = new NodeClassFilter(ImageTag.class);

		NodeList nodeList = parser.extractAllNodesThatMatch(filter);
		if (nodeList == null || nodeList.size() == 0) {
			return null;
		}

		Node[] nodes = nodeList.toNodeArray();
		for (int i = 0; i < nodes.length; i++) {
			Node node = nodes[i];
			// Reset per image: otherwise every non-matching image would add
			// the URL of the last matching one again.
			String line = null;
			Node parent = node.getParent();
			String parentText = (parent == null) ? null : parent.getText();
			if (parentText != null && parentText.indexOf(Tag) > 0) {
				ImageTag imageTag = (ImageTag) node;
				line = imageTag.getImageURL();
			}
			if (!StringUtil.isTrimEmpty(line)) {
				resultTextList.add(line);
			}
		}
		return resultTextList.toArray();
	}
}

 Model对象:CrawDataModel.java

package com.aragon.crawdata;

import java.io.Serializable;

/**
 * Plain data holder for one crawled shop record. Every property is optional
 * and stays {@code null} until its setter is called.
 */
public class CrawDataModel implements Serializable{
	/** Record identifier. */
	private Integer id;
	/** Shop type. */
	private String type;
	/** Shop name. */
	private String name;
	/** Shop address. */
	private String address;
	/** First, second and third phone entry. */
	private String tel1;
	private String tel2;
	private String tel3;
	/** Name of the owner. */
	private String ownerName;
	/** Additional free-text information. */
	private String otherInfo;
	/** Additional type information. */
	private String otherType;

	// --- id ---
	public Integer getId() { return this.id; }

	public void setId(Integer id) { this.id = id; }

	// --- type ---
	public String getType() { return this.type; }

	public void setType(String type) { this.type = type; }

	// --- name ---
	public String getName() { return this.name; }

	public void setName(String name) { this.name = name; }

	// --- address ---
	public String getAddress() { return this.address; }

	public void setAddress(String address) { this.address = address; }

	// --- phone entries ---
	public String getTel1() { return this.tel1; }

	public void setTel1(String tel1) { this.tel1 = tel1; }

	public String getTel2() { return this.tel2; }

	public void setTel2(String tel2) { this.tel2 = tel2; }

	public String getTel3() { return this.tel3; }

	public void setTel3(String tel3) { this.tel3 = tel3; }

	// --- owner name ---
	public String getOwnerName() { return this.ownerName; }

	public void setOwnerName(String ownerName) { this.ownerName = ownerName; }

	// --- other info ---
	public String getOtherInfo() { return this.otherInfo; }

	public void setOtherInfo(String otherInfo) { this.otherInfo = otherInfo; }

	// --- other type ---
	public String getOtherType() { return this.otherType; }

	public void setOtherType(String otherType) { this.otherType = otherType; }

}
 

哈哈,如上的步骤就可以把口碑网的店铺信息抓入你的数据库当中,我在恶劣的2M宽带条件下,一个小时抓了2000多条信息,当然了,没有加入去重的功能。

 

第二步:把电话,手机信息转换成指定的数字号码

 

   从口碑网抓取的电话、手机信息都是图片,如何把它们转换成数字号码?我们使用AspriseOCR。网上破解的方法很多,大家可以google一下。

  如下是他的TEST实验。因为我的应用代码太多,所以就粘一个Test,从技术上没有问题。

package com.aragon.crawdata;

import java.io.File;

/**
 * Given the network address of an image, downloads the image, runs OCR on it
 * and returns the recognised content as a string.
 */
public class RecognisePicture {

	/**
	 * Demo entry point: recognises one sample phone-number image.
	 *
	 * @param args
	 *            unused
	 */
	public static void main(String[] args) {
		// URLs scraped from HTML attributes carry "&amp;" instead of "&";
		// undo that escaping before the URL is requested.
		recognise("http://count.koubei.com/showphone/showphone.php?f=jpg&w=96&h=10&bc=255,255,255&fc=0,0,0&fs=10&fn=arial&phone=MTI5NDU5ODg0Mw%3D%3D%23OQ532L8m6okoAzY6".replaceAll("&amp;", "&"));
	}

	/**
	 * Recognises the text shown by the image at the given network address.
	 * The recognised text is also printed to standard output.
	 *
	 * @param fromUrl
	 *            network address of the image
	 * @return the recognised text, or an empty string when the download
	 *         helper returned no file
	 */
	public static String recognise(String fromUrl) {
		String result = "";

		// Download the image to the local disk.
		File file = DownLoadWithUrl.downLoadFile(fromUrl);

		if (file != null) {
			try {
				// Run OCR on the downloaded file.
				result = ParseJPEG_withOCR.getRecogniseStr(file);
			} finally {
				// Always remove the downloaded file, even when OCR fails,
				// so failed recognitions do not leave files behind.
				DownLoadWithUrl.delFile(file);
			}
		}
		System.out.println("输出的电话号码是:"+result);
		return result;
	}
}

如下是实验结果:

输出的电话号码是:01O 51402688
 

哈哈,这样一来。是不是能减少很多MM的工作量呀。enjoy it!

 

 

 

 

 

 

你可能感兴趣的:(Apache,网络应用,log4j,junit,PHP)