分析
新蛋详情页的价格字段是用图片显示的。虽然其它电商都已经认识到这种做法既起不到什么作用、又浪费资源,但新蛋似乎并不这样认为,所以这里尝试爬取一下。
价格字段大概是这个样子:
这种图片非常纯净,识别率可以达到百分之百。
代码实现
这里仍然使用之前写的一个小工具库:https://github.com/CC11001100/commons-simple-character-ocr
首先需要抓取一些图片来生成标注数据,这里选择了智能手机下的前十页,将前十页商品的价格字段图片爬下来生成标注数据,代码如下:
package org.cc11001100.t1;
import cc11001100.ocr.OcrUtil;
import org.apache.http.client.fluent.Request;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Crawler for Newegg China (http://www.newegg.cn/).
 *
 * <p>This version only collects the price images shown on product detail pages so that
 * they can be used as labelled samples for the OCR library.
 *
 * @author CC11001100
 */
public class NeweggCrawler {

    private static final OcrUtil ocrUtil;

    static {
        ocrUtil = new OcrUtil();
    }

    /**
     * Collects price images from the first ten listing pages of the smartphone category.
     *
     * <p>Every product link found on a listing page is handed to a fixed pool of ten worker
     * threads; each worker opens the detail page, downloads the price image and stores it as
     * a PNG named with a random UUID. The method blocks until all workers have finished.
     *
     * @param saveBasePath prefix the generated file name is appended to; it is concatenated
     *                     as-is, so a directory must end with a path separator
     */
    public static void grabTrainImage(String saveBasePath) {
        ExecutorService executorService = Executors.newFixedThreadPool(10);
        String url = "http://www.newegg.cn/SubCategory/1043-%d.htm";
        try {
            for (int i = 1; i <= 10; i++) {
                Document doc = getDocument(String.format(url, i));
                doc.select(".catepro li p.title a").forEach(detailPageLinkElt ->
                        executorService.execute(() ->
                                savePriceImage(detailPageLinkElt.attr("href"), saveBasePath)));
            }
        } finally {
            // Always stop accepting work: otherwise a failure while reading the listing pages
            // would leave the non-daemon pool threads alive and the JVM could never exit.
            executorService.shutdown();
        }
        try {
            executorService.awaitTermination(10, TimeUnit.DAYS);
        } catch (InterruptedException e) {
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Downloads the price image of one product detail page and writes it below
     * {@code saveBasePath}. Pages without a price image, and images that cannot be
     * downloaded or decoded, are reported on stderr and skipped.
     */
    private static void savePriceImage(String detailPageUrl, String saveBasePath) {
        Document detailPage = getDocument(detailPageUrl);
        // "godds" is not a typo here: the site's own class name is spelled that way.
        String imgLink = detailPage.select(".godds_info_data img[src~=PriceImage]").attr("src");
        if (imgLink.isEmpty()) {
            System.err.println("No price image found on: " + detailPageUrl);
            return;
        }
        byte[] imgBytes = download(imgLink);
        try {
            // ImageIO.read returns null (it does not throw) when no reader understands the
            // bytes, e.g. for the empty array download() yields after all retries failed.
            BufferedImage img = ImageIO.read(new ByteArrayInputStream(imgBytes));
            if (img == null) {
                System.err.println("Could not decode price image: " + imgLink);
                return;
            }
            String savePath = saveBasePath + UUID.randomUUID().toString() + ".png";
            ImageIO.write(img, "png", new File(savePath));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads {@code url} and parses the response as a gb2312 encoded HTML page.
     * A failed download yields an empty document.
     */
    private static Document getDocument(String url) {
        byte[] responseBytes = download(url);
        try {
            return Jsoup.parse(new String(responseBytes, "gb2312"));
        } catch (UnsupportedEncodingException e) {
            // Without the charset no page can ever be decoded; fail with the real cause
            // instead of handing a null string to the parser.
            throw new IllegalStateException("gb2312 charset is not available", e);
        }
    }

    /**
     * Fetches {@code url} with up to three attempts.
     *
     * @return the response body, or an empty array when the url is blank or every attempt failed
     */
    private static byte[] download(String url) {
        if (url == null || url.isEmpty()) {
            // Nothing to request; skip the three pointless attempts.
            return new byte[0];
        }
        for (int i = 0; i < 3; i++) {
            try {
                return Request.Get(url).execute().returnContent().asBytes();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return new byte[0];
    }

    /**
     * Grabs the raw price images and then runs the OCR library's {@code init} step on them.
     * NOTE(review): init presumably turns the raw images into the character samples that are
     * labelled by hand afterwards - confirm against the OCR library.
     */
    public static void main(String[] args) {
        grabTrainImage("E:/test/crawler/newegg/raw/");
        new OcrUtil().init("E:/test/crawler/newegg/raw/", "E:/test/crawler/newegg/char/");
    }
}
所有的价格图片都是由下面这些字符组成的:
手动将每张图片的文件名修改为图片所表示的意思:
时间有限,只增加一个测试方法用来测试结果是否正确,完整代码如下:
package org.cc11001100.t1;
import cc11001100.ocr.OcrUtil;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.http.client.fluent.Request;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Crawler for Newegg China (http://www.newegg.cn/).
 *
 * <p>Product prices are rendered as images on the detail pages. This class can collect
 * such images as OCR samples and can read the price of a single product via OCR.
 *
 * @author CC11001100
 */
public class NeweggCrawler {

    private static final OcrUtil ocrUtil;

    static {
        ocrUtil = new OcrUtil();
        // NOTE(review): presumably loads the hand-labelled character samples used for
        // recognition - confirm against the OCR library.
        ocrUtil.loadDictionaryMap("E:/test/crawler/newegg/char/");
    }

    /**
     * Collects price images from the first ten listing pages of the smartphone category.
     *
     * <p>Every product link found on a listing page is handed to a fixed pool of ten worker
     * threads; each worker opens the detail page, downloads the price image and stores it as
     * a PNG named with a random UUID. The method blocks until all workers have finished.
     *
     * @param saveBasePath prefix the generated file name is appended to; it is concatenated
     *                     as-is, so a directory must end with a path separator
     */
    public static void grabTrainImage(String saveBasePath) {
        ExecutorService executorService = Executors.newFixedThreadPool(10);
        String url = "http://www.newegg.cn/SubCategory/1043-%d.htm";
        try {
            for (int i = 1; i <= 10; i++) {
                Document doc = getDocument(String.format(url, i));
                doc.select(".catepro li p.title a").forEach(detailPageLinkElt ->
                        executorService.execute(() ->
                                savePriceImage(detailPageLinkElt.attr("href"), saveBasePath)));
            }
        } finally {
            // Always stop accepting work: otherwise a failure while reading the listing pages
            // would leave the non-daemon pool threads alive and the JVM could never exit.
            executorService.shutdown();
        }
        try {
            executorService.awaitTermination(10, TimeUnit.DAYS);
        } catch (InterruptedException e) {
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Downloads the price image of one product detail page and writes it below
     * {@code saveBasePath}. Pages without a price image, and images that cannot be
     * downloaded or decoded, are reported on stderr and skipped.
     */
    private static void savePriceImage(String detailPageUrl, String saveBasePath) {
        Document detailPage = getDocument(detailPageUrl);
        // "godds" is not a typo here: the site's own class name is spelled that way.
        String imgLink = detailPage.select(".godds_info_data img[src~=PriceImage]").attr("src");
        if (imgLink.isEmpty()) {
            System.err.println("No price image found on: " + detailPageUrl);
            return;
        }
        byte[] imgBytes = download(imgLink);
        try {
            // ImageIO.read returns null (it does not throw) when no reader understands the
            // bytes, e.g. for the empty array download() yields after all retries failed.
            BufferedImage img = ImageIO.read(new ByteArrayInputStream(imgBytes));
            if (img == null) {
                System.err.println("Could not decode price image: " + imgLink);
                return;
            }
            String savePath = saveBasePath + UUID.randomUUID().toString() + ".png";
            ImageIO.write(img, "png", new File(savePath));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads title and price of one product; used to check that the OCR result is correct.
     *
     * <p>The price is recognised from the price image. When the image is missing, cannot be
     * decoded, or the recognised text is not a number, the problem is reported on stderr and
     * the result simply carries no {@code price} entry.
     *
     * @param detailPageUrl url of the product detail page
     * @return JSON object with {@code title} and, when recognition succeeded, {@code price}
     */
    public static JSONObject parse(String detailPageUrl) {
        JSONObject product = new JSONObject();
        Document doc = getDocument(detailPageUrl);

        String imgLink = doc.select(".godds_info_data img[src~=PriceImage]").attr("src");
        if (imgLink.isEmpty()) {
            System.err.println("No price image found on: " + detailPageUrl);
        } else {
            byte[] imgBytes = download(imgLink);
            try {
                // null means the bytes are not a decodable image (e.g. the download failed).
                BufferedImage img = ImageIO.read(new ByteArrayInputStream(imgBytes));
                if (img == null) {
                    System.err.println("Could not decode price image: " + imgLink);
                } else {
                    String priceText = ocrUtil.ocr(img);
                    if (priceText == null) {
                        System.err.println("OCR produced no text for: " + imgLink);
                    } else {
                        product.put("price", Double.parseDouble(priceText));
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            } catch (NumberFormatException e) {
                // A misread image must not prevent the title from being returned.
                System.err.println("OCR result is not a number for: " + imgLink);
                e.printStackTrace();
            }
        }

        String productTitle = doc.select("#productTitle").text();
        product.put("title", productTitle);
        return product;
    }

    /**
     * Downloads {@code url} and parses the response as a gb2312 encoded HTML page.
     * A failed download yields an empty document.
     */
    private static Document getDocument(String url) {
        byte[] responseBytes = download(url);
        try {
            return Jsoup.parse(new String(responseBytes, "gb2312"));
        } catch (UnsupportedEncodingException e) {
            // Without the charset no page can ever be decoded; fail with the real cause
            // instead of handing a null string to the parser.
            throw new IllegalStateException("gb2312 charset is not available", e);
        }
    }

    /**
     * Fetches {@code url} with up to three attempts.
     *
     * @return the response body, or an empty array when the url is blank or every attempt failed
     */
    private static byte[] download(String url) {
        if (url == null || url.isEmpty()) {
            // Nothing to request; skip the three pointless attempts.
            return new byte[0];
        }
        for (int i = 0; i < 3; i++) {
            try {
                return Request.Get(url).execute().returnContent().asBytes();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return new byte[0];
    }

    /**
     * Parses one sample product page and prints the result as pretty-printed JSON.
     */
    public static void main(String[] args) {
        // One-time preparation steps (collecting and initialising the samples), kept for reference:
        // grabTrainImage("E:/test/crawler/newegg/raw/");
        // new OcrUtil().init("E:/test/crawler/newegg/raw/", "E:/test/crawler/newegg/char/");
        String url = "http://www.newegg.cn/Product/A28-032-7Q5.htm";
        System.out.println(JSON.toJSONString(parse(url), true));
    }
}