I started dabbling in crawlers around the middle of last year, though only superficially. Along the way I wrote a Zhihu crawler, a targeted scraper for tech news sites, and a crawler that can, in principle, crawl the entire web.
Today I'd like to share that third one: the early design ideas, how it is implemented, and the code.
In theory a crawler is very simple: it follows the hyperlinks on each page it fetches to jump to the next page and grab it too, which is exactly where the "web" in World Wide Web comes from.
I will walk through the implementation and the reasoning step by step, so it is clear what each step does and how to do it.
The crawler can be broken into six parts (a short sketch of how they fit together follows the list):
1. Downloader: the foundation of the crawler
2. Link parser: extracts the hyperlinks from a document
3. Link queues: manage the URLs (two parts: 1. already-crawled URLs, 2. URLs waiting to be crawled, which also handles de-duplication)
4. Page analyzer: strips out the useful information
5. Storage: saves the page information (for easy demonstration I generate an HTML file here; you could just as well persist it some other way)
6. Task dispatcher: coordinates the modules above
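Before walking through each part, here is a simplified, single-threaded sketch of how the six parts cooperate. It is only for orientation: the class names are the ones introduced in the sections below (so it compiles only once those classes exist), and it leaves out the multithreading and depth filtering that the real dispatcher adds.
- package com.search.demo;
- import java.util.Set;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import com.search.split.HrefOfPage;
- import com.search.split.PageTitle;
- import com.search.sprider.Sprider;
- import com.search.url.UrlQueue;
- import com.search.url.VisitedUrlQueue;
- public class CrawlLoopSketch {
-     public static void main(String[] args) {
-         // Part 6 (dispatcher): seed the queue and drive the loop
-         UrlQueue.addElem("http://www.ifanr.com");
-         while (!UrlQueue.isEmpty()) {
-             // Part 3 (queues): take a URL and skip anything already visited
-             String url = UrlQueue.outElem();
-             if (url == null || VisitedUrlQueue.isContains(url)) {
-                 continue;
-             }
-             // Part 1 (downloader): fetch the raw HTML
-             String html = Sprider.get(url);
-             VisitedUrlQueue.addElem(url);
-             if (html == null) {
-                 continue;
-             }
-             Document doc = Jsoup.parse(html);
-             // Parts 4 and 5 (analyzer + storage): here we only print the title
-             System.out.println(PageTitle.printTitle(doc));
-             // Part 2 (link parser): extract new links and feed them back into the queue
-             Set<String> links = HrefOfPage.printHref(doc);
-             for (String link : links) {
-                 if (!VisitedUrlQueue.isContains(link)) {
-                     UrlQueue.addElem(link);
-                 }
-             }
-         }
-     }
- }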
1. Downloader: I went with Apache's HttpClient (there are other good options as well; pick whichever you prefer).
- package com.search.sprider;
- import java.io.IOException;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpStatus;
- import org.apache.http.ParseException;
- import org.apache.http.client.ClientProtocolException;
- import org.apache.http.client.config.RequestConfig;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.util.EntityUtils;
- /**
- * Downloads the raw HTML of a web page.
- * @author zhuGe
- */
- public class Sprider {
- public static String get(String url) {
- CloseableHttpClient httpClient = HttpClients.createDefault();
- // Create the HttpGet request
- HttpGet httpGet;
- try {
- httpGet = new HttpGet(url);
- } catch (Exception e1) {
- // Invalid URL, skip it
- return null;
- }
- // Set the request headers
- httpHeader(httpGet);
- // Set the connection and socket timeouts (milliseconds)
- RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(2000).setConnectTimeout(2000).build();
- httpGet.setConfig(requestConfig);
- String download = null;
- try {
- // Execute the GET request
- CloseableHttpResponse response = httpClient.execute(httpGet);
- // Read the response entity
- HttpEntity entity = response.getEntity();
- // Only keep the body when the server answered 200 OK
- if (entity != null && response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
- download = EntityUtils.toString(entity);
- }
- response.close();
- } catch (ClientProtocolException e) {
- e.printStackTrace();
- return null;
- } catch (ParseException e) {
- e.printStackTrace();
- return null;
- } catch (IOException e) {
- e.printStackTrace();
- return null;
- } finally {
- // Close the client and release its resources
- try {
- httpClient.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- return download;
- }
- // Set the request headers
- public static void httpHeader(HttpGet httpGet){
- httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
- httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
- httpGet.setHeader("Accept-Encoding", "gzip, deflate");
- httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
- httpGet.setHeader("Connection", "keep-alive");
- httpGet.setHeader("Referer", "http://www.baidu.com/s?tn=monline_5_dg&bs=httpclient4+MultiThreadedHttpConnectionManager");
- httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
- }
- }
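To try the downloader on its own, a minimal usage sketch could look like the following (the demo class and the URL are just placeholders for illustration):
- package com.search.demo;
- import com.search.sprider.Sprider;
- public class SpriderDemo {
-     public static void main(String[] args) {
-         // A null result means the request failed or did not return 200 OK
-         String html = Sprider.get("http://www.ifanr.com");
-         if (html != null) {
-             // Print just the first part of the page to confirm the download worked
-             System.out.println(html.substring(0, Math.min(200, html.length())));
-         }
-     }
- }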
2. Link parser: jsoup combined with a regular expression (the DOM tree makes extraction easier; you could also use just a regex or just jsoup >_< an early coding slip that will be cleaned up in the next version).
- package com.search.split;
- import java.util.HashSet;
- import java.util.Set;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- * Link extractor.
- * @author zhuGe
- */
- public class HrefOfPage {
- /**
- * Collects all qualifying links from a parsed page.
- * @param doc the parsed document
- * @return the href values of all <a> links that start with http://
- */
- public static Set<String> printHref(Document doc){
- Set<String> aHref = new HashSet<String>();
- // Only absolute http:// links are kept (compile the pattern once, outside the loop)
- Pattern p = Pattern.compile("(http://.+)");
- // Walk every <a> element and collect the matching href values
- Elements aS = doc.getElementsByTag("a");
- for (Element element : aS) {
- String href = element.attr("href");
- Matcher m = p.matcher(href);
- while(m.find()){
- aHref.add(m.group(0));
- }
- }
- return aHref;
- }
- }
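To make the filtering rule concrete, here is a small hedged example (the HTML snippet and the demo class are made up): relative links and non-http:// schemes do not survive the regex.
- package com.search.demo;
- import java.util.Set;
- import org.jsoup.Jsoup;
- import com.search.split.HrefOfPage;
- public class HrefDemo {
-     public static void main(String[] args) {
-         String html = "<a href='http://www.ifanr.com/about'>kept</a>"
-                 + "<a href='/local/path'>dropped (relative)</a>"
-                 + "<a href='https://example.com'>dropped (not http://)</a>";
-         Set<String> links = HrefOfPage.printHref(Jsoup.parse(html));
-         // Expected output: [http://www.ifanr.com/about]
-         System.out.println(links);
-     }
- }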
3. Link queue (to be crawled): a LinkedList works well here (it can be used as a queue, which makes managing the URLs easy).
- package com.search.url;
- import java.util.LinkedList;
- public class UrlQueue {
- /** Queue of URLs waiting to be crawled */
- public static LinkedList<String> urlQueue = new LinkedList<String>();
-
- /** Upper bound on how many URLs the queue should hold */
- public static final int MAX_SIZE = 10000;
-
- public synchronized static void addElem(String url)
- {
- urlQueue.add(url);
- }
-
- public synchronized static String outElem()
- {
- // Another thread may have emptied the queue since the caller checked
- if(urlQueue.isEmpty()){
- return null;
- }
- String outUrl = urlQueue.removeFirst();
- // Drop any duplicate copies of this URL that are still queued
- while(urlQueue.contains(outUrl)){
- urlQueue.remove(outUrl);
- }
- return outUrl;
- }
- public synchronized static boolean isEmpty()
- {
- return urlQueue.isEmpty();
- }
-
- }
3. Link queue (already crawled): a HashSet (a Set removes duplicates automatically).
- package com.search.url;
- import java.util.HashSet;
- /**
- * Queue of URLs that have already been visited.
- * @author zhuGe
- */
- public class VisitedUrlQueue
- {
- public static HashSet<String> visitedUrlQueue = new HashSet<String>();
-
- public synchronized static void addElem(String url)
- {
- visitedUrlQueue.add(url);
- }
-
- public synchronized static boolean isContains(String url)
- {
- return visitedUrlQueue.contains(url);
- }
-
- public synchronized static int size()
- {
- return visitedUrlQueue.size();
- }
- }
4. Page analyzer: jsoup again (parts 2 and 4 are kept separate to make later maintenance easier; only the page title is extracted here, but you can extract whatever you need).
- package com.search.split;
- import org.jsoup.nodes.Document;
- import org.jsoup.select.Elements;
- public class PageTitle {
- public static String printTitle(Document doc){
- // Return the text of the page's <title> element
- Elements title = doc.getElementsByTag("title");
- return title.text();
- }
- }
5. Storage: an output stream writes the data into a generated HTML page. 6. Task dispatcher: multiple threads work together to improve throughput (a depth filter is added to keep control over how deep the crawl goes).
- package com.search.tread;
- import java.io.BufferedWriter;
- import java.io.FileWriter;
- import java.io.IOException;
- import java.util.Set;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import com.search.split.HrefOfPage;
- import com.search.split.PageTitle;
- import com.search.sprider.Sprider;
- import com.search.url.UrlQueue;
- import com.search.url.VisitedUrlQueue;
- import com.search.util.Depth;
- /**
- * @author zhuGe
- * @date 2016-01-17
- */
- public class UrlTread implements Runnable{
- @Override
- public void run() {
- while(!UrlQueue.isEmpty()){
- String url = UrlQueue.outElem();
- // Another thread may have drained the queue between the check and the take
- if(url == null){
- continue;
- }
- System.out.println("Took from queue: " + url);
- String context = null;
- if(!VisitedUrlQueue.isContains(url)){
- context = Sprider.get(url);
- }
- if(context != null){
- // Extract the title and links from the downloaded page
- addHref(context, url);
- }
- VisitedUrlQueue.addElem(url);
- }
- }
- /**
- * Extracts the page's links and writes out its title.
- * @param context the downloaded HTML
- * @param url the page's own URL
- */
- public void addHref(String context,String url){
- Document doc = Jsoup.parse(context);
- // Collect all the links on the page
- Set<String> hrefSet = HrefOfPage.printHref(doc);
- // Grab the page title
- String title = PageTitle.printTitle(doc);
- System.out.println(Thread.currentThread().getName());
- String html = ("<li><a href='"+url+"'>"+title+"</a></li>\n");
- // Write the entry to the output file
- outFile(html);
- System.out.println(html);
- // Apply the depth filter
- if(hrefSet != null){
- hrefSet = Depth.depth(hrefSet, 1);
- }
- // Add the surviving links to the to-be-crawled queue
- for (String string : hrefSet) {
- if(!VisitedUrlQueue.isContains(string)){// skip links that were already visited
- System.out.println("Queued: " + string);
- UrlQueue.addElem(string);
- }else{
- System.out.println("Duplicate: " + string);
- }
- }
- }
- public void outFile(String html){
- // Append the entry to the generated HTML page (try-with-resources closes the writer)
- try (BufferedWriter out = new BufferedWriter(new FileWriter("d://test.html", true))) {
- out.write(html);
- out.flush();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
Other extensions
The depth controller
- package com.search.util;
- import java.util.HashSet;
- import java.util.Set;
- /**
- * Filters links by their depth.
- * @author zhuGe
- */
- public class Depth {
- /**
- * @param hrefSet the links whose depth should be checked
- * @param depth keep only links whose depth is at most this value
- */
- public static Set<String> depth(Set<String> hrefSet, int depth){
- Set<String> depthHrefSet = new HashSet<String>();
- for (String href : hrefSet) {
- String[] str = href.split("/");
- // Link depth: "http://site.com" -> 1, "http://site.com/a" -> 2, and so on
- int idepth = str.length - 2;
- if(idepth <= depth){
- // Strip a trailing slash so the same page is not queued twice
- if(href.lastIndexOf("/") == href.length()-1){
- depthHrefSet.add(href.substring(0, href.length()-1));
- }else{
- depthHrefSet.add(href);
- }
- }
- }
- return depthHrefSet;
- }
- }
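As a quick illustration of the rule: with the filter set to 1, as UrlTread uses it above, essentially only site front pages survive (the URLs below are just examples):
- package com.search.demo;
- import java.util.HashSet;
- import java.util.Set;
- import com.search.util.Depth;
- public class DepthDemo {
-     public static void main(String[] args) {
-         Set<String> links = new HashSet<String>();
-         links.add("http://www.ifanr.com");         // depth 1 -> kept
-         links.add("http://www.ifanr.com/");        // depth 1 -> kept, trailing slash stripped
-         links.add("http://www.ifanr.com/app/123"); // depth 3 -> dropped
-         // Expected output: [http://www.ifanr.com]
-         System.out.println(Depth.depth(links, 1));
-     }
- }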
Startup entry point (a sleep is inserted between thread starts so that, while the queue still holds only a few links, no thread finds itself without work).
- package com.search.control;
- import com.search.tread.UrlTread;
- import com.search.url.UrlQueue;
- public class controlCentre {
- public static void main(String[] args) {
- UrlQueue.addElem("http://www.ifanr.com");
- UrlQueue.addElem("http://www.leiphone.com");
- UrlQueue.addElem("http://www.huxiu.com");
- UrlTread[] t = new UrlTread[8];
- for(int i=0;i<t.length;i++){
- t[i] = new UrlTread();
- try {
- // Stagger the thread starts so the queue has time to fill up before each worker begins
- Thread.sleep(2000);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- new Thread(t[i],"Spider #"+i).start();
- }
- }
- }
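The sleep-based startup works, but it is a bit fragile. One direction I would consider for a later version (just a sketch, not part of the current code) is to keep the stagger but let an ExecutorService manage the workers, which also gives a clean way to wait until the queue is drained:
- package com.search.control;
- import java.util.concurrent.ExecutorService;
- import java.util.concurrent.Executors;
- import java.util.concurrent.TimeUnit;
- import com.search.tread.UrlTread;
- import com.search.url.UrlQueue;
- public class PooledControlCentre {
-     public static void main(String[] args) throws InterruptedException {
-         UrlQueue.addElem("http://www.ifanr.com");
-         UrlQueue.addElem("http://www.leiphone.com");
-         UrlQueue.addElem("http://www.huxiu.com");
-         // A fixed pool replaces the hand-rolled thread array
-         ExecutorService pool = Executors.newFixedThreadPool(8);
-         for (int i = 0; i < 8; i++) {
-             // Keep the stagger so early workers still find links in the queue
-             Thread.sleep(2000);
-             pool.submit(new UrlTread());
-         }
-         // Stop accepting new tasks and wait for the workers to finish
-         pool.shutdown();
-         pool.awaitTermination(1, TimeUnit.HOURS);
-     }
- }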
The code still needs polishing (this is only the foundation of a simple crawler, though in theory it can already crawl the entire web). Leave your e-mail address in a comment if you would like the source code.