Background:
We have JD (京东) third-level category names but no matching icons; the goal is to take each third-level category name, find an image that matches it, and use that image as the category's icon.
Technology choices:
jsoup for HTML parsing, with fetching done by a small hand-rolled crawler built on Apache Commons HttpClient; the CharsetDetector and RegexRule utility classes are adapted from WebCollector.
Implementation steps:
1. Send a request to the seed (root) URL.
2. Receive the response page data.
3. Parse the page data and extract the image links (a minimal end-to-end sketch follows this list).
4. Download and persist the images.
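Before the full code, here is a minimal, self-contained sketch of these four steps using only jsoup. The keyword in the seed URL and the output file names are placeholders; the full implementation below uses HttpClient and the helper classes instead of letting jsoup fetch directly.
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class MiniImageCrawler {
    public static void main(String[] args) throws Exception {
        // 1. send a request to the seed URL (placeholder keyword)
        String seed = "https://so.m.jd.com/ware/search.action?keyword=example";
        // 2. get the response page; jsoup fetches and parses in one call
        Document doc = Jsoup.connect(seed).timeout(5000).get();
        // 3. parse the page and extract absolute image URLs
        int i = 0;
        for (Element img : doc.select("img[src]")) {
            String src = img.attr("abs:src");
            if (src.isEmpty()) {
                continue;
            }
            // 4. download and persist each image (naive ".jpg" naming, just for the sketch)
            try (InputStream in = new URL(src).openStream()) {
                Files.copy(in, Paths.get("img_" + (i++) + ".jpg"), StandardCopyOption.REPLACE_EXISTING);
            }
        }
    }
}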
Implementation code
1. Package structure overview
2. Code listings
Links.java: stores the URLs that have already been visited and the queue of URLs waiting to be visited.
package com.etoak.crawl.link;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
/*
* Links:
* 1: stores the URLs that have already been visited and the queue of URLs waiting to be visited.
* */
public class Links {
//visited URLs; a Set guarantees no URL is recorded twice
private static Set<String> visitedUrlSet = new HashSet<String>();
//unvisited URLs; a LinkedList queue 1) fixes the visit order and 2) is kept free of duplicates
private static LinkedList<String> unVisitedUrlQueue = new LinkedList<String>();
//number of URLs already visited
public static int getVisitedUrlNum() {
return visitedUrlSet.size();
}
//record a URL as visited
public static void addVisitedUrlSet(String url) {
visitedUrlSet.add(url);
}
//remove a URL from the visited set
public static void removeVisitedUrlSet(String url) {
visitedUrlSet.remove(url);
}
//get the queue of unvisited URLs
public static LinkedList<String> getUnVisitedUrlQueue() {
return unVisitedUrlQueue;
}
//add a URL to the unvisited queue, guaranteeing each URL is queued (and visited) at most once
public static void addUnvisitedUrlQueue(String url) {
if (url != null && !url.trim().equals("") && !visitedUrlSet.contains(url) && !unVisitedUrlQueue.contains(url)){
unVisitedUrlQueue.add(url);
}
}
//take the next unvisited URL from the head of the queue (null if the queue is empty)
public static Object removeHeadOfUnVisitedUrlQueue() {
return unVisitedUrlQueue.isEmpty() ? null : unVisitedUrlQueue.removeFirst();
}
//check whether the unvisited queue is empty
public static boolean unVisitedUrlQueueIsEmpty() {
return unVisitedUrlQueue.isEmpty();
}
}
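A quick usage sketch of the queue semantics (assuming the class above): duplicate URLs and already-visited URLs are dropped silently.
Links.addUnvisitedUrlQueue("https://www.jd.com/a.html");
Links.addUnvisitedUrlQueue("https://www.jd.com/a.html"); // ignored: already queued
String url = (String) Links.removeHeadOfUnVisitedUrlQueue();
Links.addVisitedUrlSet(url);
Links.addUnvisitedUrlQueue(url); // ignored: already visited
System.out.println(Links.getVisitedUrlNum()); // prints 1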
Page.java: holds the content of a fetched response.
package com.etoak.crawl.page;
import com.etoak.crawl.util.CharsetDetector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.UnsupportedEncodingException;
/*
* Page
* 1: holds the content of a fetched response;
* */
public class Page {
private byte[] content ;
private String html ; //page source as a string
private Document doc ;//parsed DOM document
private String charset ;//character encoding
private String url ;//URL of the page
private String contentType ;// content type
public Page(byte[] content , String url , String contentType){
this.content = content ;
this.url = url ;
this.contentType = contentType ;
}
public String getCharset() {
return charset;
}
public String getUrl(){return url ;}
public String getContentType(){ return contentType ;}
public byte[] getContent(){ return content ;}
/**
* Returns the page source as a string.
*
* @return the page source as a string
*/
public String getHtml() {
if (html != null) {
return html;
}
if (content == null) {
return null;
}
if(charset==null){
charset = CharsetDetector.guessEncoding(content); // guess the character encoding from the content bytes
}
try {
this.html = new String(content, charset);
return html;
} catch (UnsupportedEncodingException ex) {
ex.printStackTrace();
return null;
}
}
/*
* Returns the parsed DOM document.
* */
public Document getDoc(){
if (doc != null) {
return doc;
}
try {
this.doc = Jsoup.parse(getHtml(), url);
return doc;
} catch (Exception ex) {
ex.printStackTrace();
return null;
}
}
}
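A small sketch of how Page works (the bytes and URL here are made up): getHtml() guesses the charset on first use, and getDoc() parses the HTML with the page URL as base URI so relative links can be resolved.
byte[] bytes = "<html><body><img src='/icon.png'></body></html>".getBytes(java.nio.charset.StandardCharsets.UTF_8);
Page page = new Page(bytes, "https://www.jd.com/", "text/html; charset=utf-8");
System.out.println(page.getHtml());                                      // decoded with the guessed charset
System.out.println(page.getDoc().select("img").first().attr("abs:src")); // https://www.jd.com/icon.png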
PageParserTool.java: helpers for parsing the HTML page.
package com.etoak.crawl.page;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
public class PageParserTool {
/* Select elements from the page with a CSS selector */
public static Elements select(Page page , String cssSelector) {
return page.getDoc().select(cssSelector);
}
/*
* Get the element at the given index among those matched by the CSS selector;
* a negative index counts from the end.
* */
public static Element select(Page page , String cssSelector, int index) {
Elements eles = select(page , cssSelector);
int realIndex = index;
if (index < 0) {
realIndex = eles.size() + index;
}
return eles.get(realIndex);
}
/**
* Get the links from the elements matched by the selector. The cssSelector must locate the
* concrete link elements: e.g. to extract all hyperlinks inside the div with id "content",
* define cssSelector as "div[id=content] a".
* The results go into a Set to avoid duplicates.
* @param cssSelector
* @return set of absolute link URLs
*/
public static Set<String> getLinks(Page page , String cssSelector) {
Set<String> links = new HashSet<String>();
Elements es = select(page , cssSelector);
Iterator<Element> iterator = es.iterator();
while(iterator.hasNext()) {
Element element = iterator.next();
if ( element.hasAttr("href") ) {
links.add(element.attr("abs:href"));
}else if( element.hasAttr("src") ){
links.add(element.attr("abs:src"));
}
}
return links;
}
/**
* Collect the given attribute from every element on the page that matches the CSS selector.
* For example, getAttrs(page, "img[src]", "abs:src") returns the absolute links of all images on the page.
* @param cssSelector
* @param attrName
* @return list of attribute values
*/
public static ArrayList<String> getAttrs(Page page , String cssSelector, String attrName) {
ArrayList<String> result = new ArrayList<String>();
Elements eles = select(page ,cssSelector);
for (Element ele : eles) {
if (ele.hasAttr(attrName)) {
result.add(ele.attr(attrName));
}
}
return result;
}
}
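Typical calls against a fetched page; the page is assumed to come from RequestAndResponseTool (defined below) and the selectors are examples:
Page page = RequestAndResponseTool.sendRequstAndGetResponse("https://so.m.jd.com/ware/search.action?keyword=example");
// absolute URLs of all images on the page
ArrayList<String> imageUrls = PageParserTool.getAttrs(page, "img[src]", "abs:src");
// hyperlinks inside the div with id "content" (the selector from the javadoc example)
Set<String> contentLinks = PageParserTool.getLinks(page, "div[id=content] a");
// first and last image element; a negative index counts from the end
Element firstImg = PageParserTool.select(page, "img[src]", 0);
Element lastImg  = PageParserTool.select(page, "img[src]", -1);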
RequestAndResponseTool.java: sends the HTTP request and wraps the response.
package com.etoak.crawl.page;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import java.io.IOException;
public class RequestAndResponseTool {
public static Page sendRequstAndGetResponse(String url) {
Page page = null;
// 1. create an HttpClient instance and set its parameters
HttpClient httpClient = new HttpClient();
// HTTP connection timeout: 5s
httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
// 2. create the GetMethod and set its parameters
GetMethod getMethod = new GetMethod(url);
// GET request (socket) timeout: 5s
getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
// retry handler for failed requests
getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
// 3. execute the HTTP GET request
try {
int statusCode = httpClient.executeMethod(getMethod);
// check the response status code
if (statusCode != HttpStatus.SC_OK) {
System.err.println("Method failed: " + getMethod.getStatusLine());
}
// 4. read the HTTP response content
byte[] responseBody = getMethod.getResponseBody(); // response body as a byte array
String contentType = getMethod.getResponseHeader("Content-Type").getValue(); // response content type
page = new Page(responseBody,url,contentType); // wrap the response as a Page
} catch (HttpException e) {
// fatal protocol error: the URL or the returned content is malformed
System.out.println("Please check your provided http address!");
e.printStackTrace();
} catch (IOException e) {
// network error
e.printStackTrace();
} finally {
// release the connection
getMethod.releaseConnection();
}
return page;
}
}
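Usage is a single static call; the 5-second timeouts above are the only tuning. A null check is worthwhile because the method returns null when the request fails:
Page page = RequestAndResponseTool.sendRequstAndGetResponse("https://www.jd.com/");
if (page != null && page.getHtml() != null) {
    System.out.println(page.getContentType());   // e.g. text/html; charset=utf-8
    System.out.println(page.getHtml().length()); // size of the decoded source
}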
CharsetDetector.java: character-set detection.
/*
* Copyright (C) 2014 hu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package com.etoak.crawl.util;
import org.mozilla.universalchardet.UniversalDetector;
import java.io.UnsupportedEncodingException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Automatic character-set detection.
*
* @author hu
*/
public class CharsetDetector {
//encoding-detection code borrowed from Nutch
private static final int CHUNK_SIZE = 2000;
private static Pattern metaPattern = Pattern.compile(
"<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
Pattern.CASE_INSENSITIVE);
private static Pattern charsetPattern = Pattern.compile(
"charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
private static Pattern charsetPatternHTML5 = Pattern.compile(
"<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
Pattern.CASE_INSENSITIVE);
//guess the encoding from the HTML meta tags (approach borrowed from Nutch)
private static String guessEncodingByNutch(byte[] content) {
int length = Math.min(content.length, CHUNK_SIZE);
String str = "";
try {
str = new String(content, "ascii");
} catch (UnsupportedEncodingException e) {
return null;
}
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;
if (metaMatcher.find()) {
Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
if (charsetMatcher.find()) {
encoding = new String(charsetMatcher.group(1));
}
}
if (encoding == null) {
metaMatcher = charsetPatternHTML5.matcher(str);
if (metaMatcher.find()) {
encoding = new String(metaMatcher.group(1));
}
}
if (encoding == null) {
if (length >= 3 && content[0] == (byte) 0xEF
&& content[1] == (byte) 0xBB && content[2] == (byte) 0xBF) {
encoding = "UTF-8";
} else if (length >= 2) {
if (content[0] == (byte) 0xFF && content[1] == (byte) 0xFE) {
encoding = "UTF-16LE";
} else if (content[0] == (byte) 0xFE
&& content[1] == (byte) 0xFF) {
encoding = "UTF-16BE";
}
}
}
return encoding;
}
/**
* Guess the likely charset from a byte array; returns UTF-8 if detection fails.
*
* @param bytes the bytes to inspect
* @return the likely charset, or UTF-8 if detection fails
*/
public static String guessEncodingByMozilla(byte[] bytes) {
String DEFAULT_ENCODING = "UTF-8";
UniversalDetector detector = new UniversalDetector(null);
detector.handleData(bytes, 0, bytes.length);
detector.dataEnd();
String encoding = detector.getDetectedCharset();
detector.reset();
if (encoding == null) {
encoding = DEFAULT_ENCODING;
}
return encoding;
}
/**
* Guess the likely charset from a byte array; returns UTF-8 if detection fails.
* @param content the bytes to inspect
* @return the likely charset, or UTF-8 if detection fails
*/
public static String guessEncoding(byte[] content) {
String encoding;
try {
encoding = guessEncodingByNutch(content);
} catch (Exception ex) {
return guessEncodingByMozilla(content);
}
if (encoding == null) {
encoding = guessEncodingByMozilla(content);
return encoding;
} else {
return encoding;
}
}
}
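A small sketch of the detection order (meta tag first, then the Mozilla detector, with UTF-8 as the fallback); the sample strings are made up:
public class CharsetDetectorDemo {
    public static void main(String[] args) throws Exception {
        // charset declared in an HTML5 meta tag -> found by the Nutch-style regex
        byte[] gbkPage = "<html><head><meta charset=\"gbk\"></head><body>帆布鞋</body></html>".getBytes("GBK");
        System.out.println(CharsetDetector.guessEncoding(gbkPage)); // expected: gbk

        // no meta tag -> falls through to juniversalchardet, or to the UTF-8 default
        byte[] rawUtf8 = "帆布鞋".getBytes("UTF-8");
        System.out.println(CharsetDetector.guessEncoding(rawUtf8)); // expected: UTF-8
    }
}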
FileTool.java: saves the fetched pages/files to local disk.
package com.etoak.crawl.util;
import com.etoak.crawl.page.Page;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
/* This class saves the fetched pages/files to local disk. */
public class FileTool {
private static String dirPath;
/**
* Derive the local file name from the URL and the content type
* (contentType is getMethod.getResponseHeader("Content-Type").getValue()),
* replacing characters that are not valid in file names.
*/
private static String getFileNameByUrl(String url, String contentType) {
//strip the http:// or https:// scheme
url = url.replaceFirst("^https?://", "");
//text/html content
if (contentType.indexOf("html") != -1) {
url = url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
return url;
}
//other content, e.g. application/pdf or image/jpeg: use the subtype as the extension
else {
return url.replaceAll("[\\?/:*|<>\"]", "_") + "." +
contentType.substring(contentType.lastIndexOf("/") + 1);
}
}
/*
* Create the output directory (a "temp" directory under the classpath root) if it does not exist.
* */
private static void mkdir() {
if (dirPath == null) {
dirPath = Class.class.getClass().getResource("/").getPath() + "temp" + File.separator;
}
File fileDir = new File(dirPath);
if (!fileDir.exists()) {
fileDir.mkdir();
}
}
/**
* Save the page bytes to a local file; the file name is derived from the URL and content type.
*/
public static void saveToLocal(Page page) {
saveToLocal(page, "");
}
/**
* Save the page bytes to a local file, prefixing the file name with the (decoded) category name
* passed in by the crawler.
*/
public static void saveToLocal(Page page, String name) {
mkdir();
String fileName = name + getFileNameByUrl(page.getUrl(), page.getContentType()) ;
String filePath = dirPath + fileName ;
byte[] data = page.getContent();
try {
DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(filePath)));
out.write(data);
out.flush();
out.close();
System.out.println("File " + fileName + " has been saved to " + filePath);
} catch (IOException e) {
e.printStackTrace();
}
}
}
RegexRule.java: regex rules for URL filtering.
/*
* Copyright (C) 2014 hu
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
package com.etoak.crawl.util;
import java.util.ArrayList;
import java.util.regex.Pattern;
/**
*
* @author hu
*/
public class RegexRule {
public RegexRule(){
}
public RegexRule(String rule){
addRule(rule);
}
public RegexRule(ArrayList<String> rules){
for (String rule : rules) {
addRule(rule);
}
}
public boolean isEmpty(){
return positive.isEmpty();
}
private ArrayList<String> positive = new ArrayList<String>();
private ArrayList<String> negative = new ArrayList<String>();
/**
* Add a regex rule. There are two kinds of rules: positive and negative.
* A URL satisfies the rule set when it 1) matches at least one positive rule and 2) matches no negative rule.
* Positive example: "+a.*c" is a positive rule whose regex is a.*c; the leading plus marks it as positive.
* Negative example: "-a.*c" is a negative rule whose regex is a.*c; the leading minus marks it as negative.
* If a rule starts with neither plus nor minus, it is treated as positive and the whole string is the regex,
* e.g. "a.*c" is a positive rule with regex a.*c.
* @param rule the regex rule
* @return this
*/
public RegexRule addRule(String rule) {
if (rule.length() == 0) {
return this;
}
char pn = rule.charAt(0);
String realrule = rule.substring(1);
if (pn == '+') {
addPositive(realrule);
} else if (pn == '-') {
addNegative(realrule);
} else {
addPositive(rule);
}
return this;
}
/**
* Add a positive regex rule.
* @param positiveregex
* @return this
*/
public RegexRule addPositive(String positiveregex) {
positive.add(positiveregex);
return this;
}
/**
* Add a negative regex rule.
* @param negativeregex
* @return this
*/
public RegexRule addNegative(String negativeregex) {
negative.add(negativeregex);
return this;
}
/**
* Check whether the input string satisfies the regex rules.
* @param str the input string
* @return whether the input string satisfies the rules
*/
public boolean satisfy(String str) {
for (String nregex : negative) {
if (Pattern.matches(nregex, str)) {
return false;
}
}
int count = 0;
for (String pregex : positive) {
if (Pattern.matches(pregex, str)) {
count++;
}
}
return count > 0;
}
}
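RegexRule is not wired into MyCrawler below, but it is the natural place to hang URL filtering. A usage sketch (the patterns are examples, not the article's actual rules):
RegexRule rule = new RegexRule();
rule.addRule("+https://so\\.m\\.jd\\.com/ware/.*"); // positive: JD mobile search pages
rule.addRule("-.*login.*");                         // negative: drop anything login-related

System.out.println(rule.satisfy("https://so.m.jd.com/ware/search.action?keyword=x")); // true
System.out.println(rule.satisfy("https://so.m.jd.com/ware/login.action"));            // false (negative rule)
System.out.println(rule.satisfy("https://www.jd.com/"));                              // false (no positive match)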
MyCrawler.java: the main crawler class.
package com.etoak.crawl.main;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSON;
import com.etoak.crawl.link.LinkFilter;
import com.etoak.crawl.link.Links;
import com.etoak.crawl.page.Page;
import com.etoak.crawl.page.PageParserTool;
import com.etoak.crawl.page.RequestAndResponseTool;
import com.etoak.crawl.util.FileTool;
public class MyCrawler {
/**
* Initialize the URL queue with the seed URLs.
*
* @param seeds seed URLs
*/
private void initCrawlerWithSeeds(String[] seeds) {
for (int i = 0; i < seeds.length; i++){
Links.addUnvisitedUrlQueue(seeds[i]);
}
}
/**
* Crawling process.
*
* @param seeds seed URLs
* @param name URL-encoded category name, used to name the saved files
*/
public void crawling(String[] seeds , String name) {
//initialize the URL queue with the seeds
initCrawlerWithSeeds(seeds);
//define a filter that only accepts links starting with https://www.jd.com
LinkFilter filter = new LinkFilter() {
public boolean accept(String url) {
if (url.startsWith("https://www.jd.com"))
return true;
else
return false;
}
};
//crawl loop: m starts at 1 and is updated below to the number of image links found on the page
int m = 1;
for (int i = 0; i < m; i++) {
//take the first URL from the unvisited queue
String visitUrl = (String) Links.removeHeadOfUnVisitedUrlQueue();
if (visitUrl == null){
continue;
}
//fetch the page for this URL
Page page = RequestAndResponseTool.sendRequstAndGetResponse(visitUrl);
Links.addVisitedUrlSet(visitUrl); //mark the URL as visited
//process the page: select elements from the DOM
System.out.println(page);
Elements es = PageParserTool.select(page,"img[src]");
if(!es.isEmpty()){
System.out.println("All img[src] elements on the page: ");
System.out.println(es);
}
//collect the absolute URLs of the images
ArrayList<String> links = PageParserTool.getAttrs(page,"img[src]","abs:src");
m = links.size();
try {
FileTool.saveToLocal(page,URLDecoder.decode(name, "utf-8"));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
if(links!=null&&links.size()>3){
Links.addUnvisitedUrlQueue(links.get(3)); //queue the 4th image as the candidate icon
System.out.println("Newly queued crawl paths: " + links);
}
}
}
//main entry point
public static void main(String[] args) {
String as ="['帆布鞋']";
//parse the JSON array of category names; in the full version the list is read from Redis (see the commented block below)
List<String> list = JSON.parseArray(as, String.class);
/*JSONObject json = RedisUtils.getObject("JDClassCid3");
if(json!=null){
list = (List<String>) json.get("list");
}else{
list = ShoppingGuideUtils.getClasss11();
Map<String, Object> map = new HashMap<String, Object>();
map.put("list", list);
RedisUtils.setObjectMap("JDClassCid3", map, RedisUtils.EXRP_DAY);
}*/
System.out.println(list);
if(list!=null&&list.size()>0){
String name= "";
for (int i = 0; i < list.size(); i++) {
MyCrawler crawler = new MyCrawler();
name = list.get(i).toString();
try {
if(URLDecoder.decode(name, "utf-8").contains("二手")){ //skip second-hand ("二手") categories
continue;
}
} catch (UnsupportedEncodingException e2) {
e2.printStackTrace();
}
try {
name = URLEncoder.encode(name, "utf-8");
} catch (UnsupportedEncodingException e1) {
e1.printStackTrace();
}
crawler.crawling(new String[]{"https://so.m.jd.com/ware/search.action?keyword="+name+"&searchFrom=category&sf=1&as=1"},name);
try {
System.out.println(list.size() + " categories in total === current category " + URLDecoder.decode(name, "utf-8") + " is number " + i);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
}
}
/*MyCrawler crawler = new MyCrawler();
String name= "ETC";
try {
if(URLDecoder.decode(name, "utf-8").contains("二手")){
System.out.println("jies");
}
} catch (UnsupportedEncodingException e2) {
e2.printStackTrace();
}
try {
name =URLEncoder.encode(name, "utf-8");
} catch (UnsupportedEncodingException e1) {
e1.printStackTrace();
}
crawler.crawling(new String[]{"https://so.m.jd.com/ware/search.action?keyword="+name+"&searchFrom=category&sf=1&as=1"},name); */
}
}
The seed URL I chose is JD's mobile search endpoint. The hard part is picking a suitable image: from the collection of img elements in the HTML you need the one that best matches the category, which took several batch crawl runs to tune. Ambiguous category names, such as "苹果" (which can mean either the fruit or the Apple brand), also need special handling.
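The article does not show the matching logic itself; as one possible (hypothetical) heuristic, you could score each img element by whether its alt text contains the decoded category name and fall back to the first product image otherwise. A sketch of such a helper, written against the Page and PageParserTool classes above:
// Hypothetical helper, not the article's actual matching code.
public static String pickBestImage(Page page, String categoryName) {
    Elements imgs = PageParserTool.select(page, "img[src]");
    String fallback = null;
    for (Element img : imgs) {
        String src = img.attr("abs:src");
        if (src.isEmpty()) {
            continue;
        }
        if (fallback == null) {
            fallback = src; // first image on the page as a last resort
        }
        // alt text mentioning the category name is a strong hint of a real match
        if (img.attr("alt").contains(categoryName)) {
            return src;
        }
    }
    return fallback;
}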
Run results
Main reference: Luo Gang (罗刚), 《自己动手写网络爬虫》 (Write Your Own Web Crawler).