从头学习爬虫(三十一)实战篇----动漫之家漫画(Java实现)

本文主要用Java selenium实现点击打开漫画

如果有selenium配置问题请前往从头学习爬虫(十)进阶篇----selenium回顾

未使用框架

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

public class GaReiZeroSpiderX{

	public static void main(String[] args) {
		//主页
		String url="https://manhua.dmzj.com/shiling";
		//线程数
		int threadsize=10;
		//延迟
		long sleeptime=5000;
		//获取列表页
		List itemList=getListPage(url);
		//获取图片地址
		List imgList=getListImg(itemList);
		//多线程下载
		DownLoadImg(imgList,threadsize,sleeptime);
	}


	private static List getListImg(List itemList) {
		List listImg=new ArrayList<>();
		if(itemList==null) {
			return null;
		}
		//配置驱动
		System.getProperties().setProperty("webdriver.chrome.driver","D:\\newChromeDriver\\chromedriver_win32\\chromedriver.exe");
    	ChromeOptions options = new ChromeOptions();
    	//配置浏览器位置
    	options.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
    	//无头模式 59版本以上才可以
    	options.addArguments("test-type"); //ignore certificate errors
        options.addArguments("headless");// headless mode
        options.addArguments("disable-gpu");
        //没啥用 本来可以用于页面显示模式设置
        options.addArguments("Cookie:display_mode=1");
    	WebDriver driver = new ChromeDriver(options);
		for (String url : itemList) {
			url="https://manhua.dmzj.com"+url;
	    	driver.get(url);
	    	WebElement webElement = driver.findElement(By.xpath("/html"));
			String content = webElement.getAttribute("outerHTML");
			Html html=new Html(content);
			String title=html.xpath("//title/text()").toString().split("-")[0];
			List s=html.xpath("//div[@class='btmBtnBox']/select/option").nodes();
			for (Selectable selectable : s) {
                            //每一话的标题           每一页    图片地址
                             listImg.add(title+"___"+selectable.xpath("/option/text()")+"___"+"https:"+selectable.xpath("/option/@value"));
			}
		}
		//关闭窗口
		driver.close();
		//关闭进程
		driver.quit();
		return listImg;
	}

	private static List getListPage(String url) {
		CloseableHttpResponse response = null;
		try{
			CloseableHttpClient httpClient = HttpClients.createDefault();
			RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(1000).setConnectionRequestTimeout(1000).setSocketTimeout(1000).setRedirectsEnabled(true).build();
			HttpGet httpGet = new HttpGet(url);	
			httpGet.setConfig(requestConfig);
			httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36");
			response =httpClient.execute(httpGet);
			if (response.getStatusLine().getStatusCode() != 200) {
				System.out.println("request url failed, http code=" + response.getStatusLine().getStatusCode());
				return  null;
			}else{
				HttpEntity entity1 = response.getEntity();
				String resultStr = EntityUtils.toString(entity1, "utf-8");
				Html html=new Html(resultStr);
/*				List list=new ArrayList<>();
				list.add(html.xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").toString());*/
				return html.xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").all();
			}
		} catch (Exception e) {
			return  null;
		} finally {
			if (response != null){ 
				try {
					response.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			
		}
	}
	private static void DownLoadImg(List imgList, int threadsize, long sleeptime) {
		int count=0;
		int size=imgList.size();
		ExecutorService fixedThreadPool = Executors.newFixedThreadPool(threadsize);
		CompletionService cs = new ExecutorCompletionService(fixedThreadPool);
		for (String url : imgList) {
			final String url1 = url;
			cs.submit(new Callable() {
				public String call() throws Exception {
					try {
						Thread.sleep(sleeptime);
						return down(url1);
					} catch (InterruptedException e) {
						System.out.println("线程异常");
						return "error_"+"url1";
					}
				}


			});
		}
		for (String url : imgList) {
			try {
				String a = cs.take().get();
				if(a!=null) {
					count++;
				}
			} catch (Exception e) {
				e.printStackTrace();
			}finally {
				if(count==size) {
					System.out.println("over");
				}else {
					System.out.println(count+"/"+size);
				}
			}
		}
		fixedThreadPool.shutdown();
	}


	protected static  String down(String url) {
			try {
				url=url.replace(" ", "");
				File dest1 = new File("D:/manhua");
				if (!dest1.exists() && !dest1.isDirectory()) {
					dest1.mkdir();
				}
				File dest2 = new File("D:/manhua/" + url.split("___")[0]);
				if (!dest2.exists() && !dest2.isDirectory()) {
					dest2.mkdir();
				}
				File dest = new File("D:/manhua/" + url.split("___")[0] + "/" + url.split("___")[1] + "."
						+ url.split("___")[2].split("\\.")[url.split("___")[2].split("\\.").length- 1]);
				
				if (!dest.exists()) {
						dest.createNewFile();
				}
				//接收字节输入流
				InputStream is;
				//字节输出流
				FileOutputStream fos = new FileOutputStream(dest);
				URL temp;
				String imgurl=url.split("___")[2];
					temp = new URL(imgurl.trim());
					 HttpURLConnection uc=(HttpURLConnection) temp.openConnection();
					uc.addRequestProperty("User-Agent",
							"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");
					//必须加refer 防封 这个比较烂 写成百度地址也可以
					uc.addRequestProperty("Referer", "https://manhua.dmzj.com/");
					is=uc.getInputStream();
					//为字节输入流加缓冲
					BufferedInputStream bis = new BufferedInputStream(is);
					//为字节输出流加缓冲
					BufferedOutputStream bos = new BufferedOutputStream(fos);
					int length;
					byte[] bytes = new byte[1024 * 20];
					while ((length = bis.read(bytes, 0, bytes.length)) != -1) {
						fos.write(bytes, 0, length);
					}
					bos.close();
					fos.close();
					bis.close();
					is.close();
					return "success_"+"url1";
			} catch (Exception e) {
				e.printStackTrace();
				return "error_"+"url1";
			}
		
	}


	
}

webmagic框架

spider

import java.util.ArrayList;
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

public class GaReiZeroSpider implements PageProcessor{
    static List imgurl=new ArrayList<>();
    private Site site =Site.me();

    @Override
    public Site getSite() {
        return site ;
    }

    @Override
    public void process(Page page) {
        if(page.getUrl().toString().equals("https://manhua.dmzj.com/shiling")) {
            List pageUrl=page.getHtml().xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").all();
            for (String string : pageUrl) {
                Request request=new Request("https://manhua.dmzj.com"+string);
                request.addHeader("Cookie", "display_mode=1");
                page.addTargetRequest(request);
            }
        }else {
            String title=page.getHtml().xpath("//title/text()").toString().split("-")[0];
            List s=page.getHtml().xpath("//div[@class='btmBtnBox']/select/option").nodes();
            for (Selectable selectable : s) {
                imgurl.add(title+"___"+selectable.xpath("/option/text()")+"___"+"https:"+selectable.xpath("/option/@value"));
            }
            page.putField("imgurl", imgurl);
        }
        
    }
    public static void main(String[] args) {
        Spider.create(new GaReiZeroSpider()).downloader(new GaReiZeroDownloader()).addPipeline(new GaReiZeroPipline()).addUrl("https://manhua.dmzj.com/shiling").start();
    
    }
    
    
}

downloader

import java.io.Closeable;
import java.io.IOException;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;

/**
 * WebMagic downloader that renders every request in headless Chrome through Selenium and
 * returns the resulting DOM as the page content.
 *
 * <p>A new browser is started for every request and quit afterwards; nothing is shared
 * between calls.
 */
public class GaReiZeroDownloader implements Downloader, Closeable{

	/** Nothing to release: every {@link #download} call quits the browser it started. */
	@Override
	public void close() throws IOException {

	}

	/**
	 * Loads the request URL in headless Chrome and wraps the rendered HTML in a {@link Page}.
	 *
	 * @param request request whose URL is opened
	 * @param task the crawl task; not used here
	 * @return page carrying the rendered HTML, the URL and the originating request
	 */
	@Override
	public Page download(Request request, Task task) {
		// Location of the chromedriver executable
		System.getProperties().setProperty("webdriver.chrome.driver","D:\\newChromeDriver\\chromedriver_win32\\chromedriver.exe");
		ChromeOptions options = new ChromeOptions();
		// Location of the Chrome binary
		options.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
		options.addArguments("test-type"); //ignore certificate errors
		options.addArguments("headless");// headless mode
		options.addArguments("disable-gpu");
		options.addArguments("Cookie:display_mode=1");
		WebDriver driver = new ChromeDriver(options);
		try {
			driver.get(request.getUrl());
			WebElement webElement = driver.findElement(By.xpath("/html"));
			String content = webElement.getAttribute("outerHTML");
			Page page = new Page();
			page.setRawText(content);
			page.setHtml(new Html(content, request.getUrl()));
			page.setUrl(new PlainText(request.getUrl()));
			page.setRequest(request);
			return page;
		} finally {
			// quit() closes all windows and ends the driver process, even when loading the page failed
			driver.quit();
		}
	}

	/** The thread count is ignored; every call works with its own browser. */
	@Override
	public void setThread(int threadNum) {

	}

}

pipeline

/**
 * WebMagic pipeline that downloads the images listed under the result field {@code "imgurl"}.
 *
 * <p>Every entry is a record of the form {@code chapterTitle___pageLabel___imageUrl}; the
 * image is saved as {@code D:/manhua/<chapterTitle>/<pageLabel>.<extension>}.
 */
public class GaReiZeroPipline implements Pipeline{

	/**
	 * Downloads all image records of one processed page, if it has any.
	 *
	 * @param resultItems extraction results of the page
	 * @param task the crawl task; not used here
	 */
	@Override
	public void process(ResultItems resultItems, Task task) {
		try {
			List<String> imgurl=resultItems.get("imgurl");
			if(imgurl!=null && !imgurl.isEmpty()) {
				DownLoadImg(imgurl,5,500);
			}
		} catch (Exception e) {
			// Report the failure instead of swallowing it; the crawl itself continues
			e.printStackTrace();
		}
	}

	/**
	 * Downloads every record on a fixed-size thread pool and prints progress as tasks finish.
	 *
	 * @param imgList records of the form {@code title___pageLabel___imageUrl}; may be {@code null} or empty
	 * @param threadsize number of worker threads
	 * @param sleeptime milliseconds each task sleeps before downloading
	 */
	private  void DownLoadImg(List<String> imgList, int threadsize, final long sleeptime) {
		if(imgList==null || imgList.isEmpty()) {
			return;
		}
		int size=imgList.size();
		int finished=0;
		int failed=0;
		ExecutorService fixedThreadPool = Executors.newFixedThreadPool(threadsize);
		try {
			CompletionService<String> cs = new ExecutorCompletionService<>(fixedThreadPool);
			for (final String record : imgList) {
				cs.submit(new Callable<String>() {
					@Override
					public String call() {
						try {
							Thread.sleep(sleeptime);
							return down(record);
						} catch (InterruptedException e) {
							// Keep the interrupt visible to the pool
							Thread.currentThread().interrupt();
							System.out.println("线程异常");
							return "error_"+record;
						}
					}
				});
			}
			// Exactly one result is taken per submitted task
			for (int i = 0; i < size; i++) {
				try {
					String result = cs.take().get();
					if(result==null || !result.startsWith("success_")) {
						failed++;
					}
				} catch (InterruptedException e) {
					Thread.currentThread().interrupt();
					fixedThreadPool.shutdownNow();
					return;
				} catch (Exception e) {
					failed++;
					e.printStackTrace();
				}
				finished++;
				if(finished==size) {
					System.out.println("over");
				}else {
					System.out.println(finished+"/"+size);
				}
			}
			if(failed>0) {
				System.out.println("failed: "+failed+"/"+size);
			}
		} finally {
			fixedThreadPool.shutdown();
		}
	}

	/**
	 * Downloads one image to {@code D:/manhua/<title>/<pageLabel>.<extension>}.
	 *
	 * @param url record of the form {@code title___pageLabel___imageUrl}; all spaces are removed first
	 * @return {@code "success_" + record} when the file was written, otherwise {@code "error_" + record}
	 */
	protected   String down(String url) {
		String record = url;
		try {
			record = url.replace(" ", "");
			String[] parts = record.split("___");
			if (parts.length < 3) {
				System.out.println("malformed record: " + record);
				return "error_"+record;
			}
			String imgurl = parts[2].trim();
			// The file extension is whatever follows the last dot of the image address
			String[] dotted = imgurl.split("\\.");
			String extension = dotted[dotted.length - 1];

			File chapterDir = new File("D:/manhua/" + parts[0]);
			// mkdirs() also creates D:/manhua; the second isDirectory() covers another thread winning the race
			if (!chapterDir.isDirectory() && !chapterDir.mkdirs() && !chapterDir.isDirectory()) {
				throw new IllegalStateException("cannot create directory " + chapterDir);
			}
			File dest = new File(chapterDir, parts[1] + "." + extension);

			HttpURLConnection uc=(HttpURLConnection) new URL(imgurl).openConnection();
			uc.addRequestProperty("User-Agent",
					"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");
			// A Referer header is required or the image host rejects the request
			uc.addRequestProperty("Referer", "https://manhua.dmzj.com/");
			// The input is opened first, so a refused request does not leave an empty file behind
			try (BufferedInputStream bis = new BufferedInputStream(uc.getInputStream());
					BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(dest))) {
				byte[] bytes = new byte[1024 * 20];
				int length;
				while ((length = bis.read(bytes, 0, bytes.length)) != -1) {
					bos.write(bytes, 0, length);
				}
			}
			return "success_"+record;
		} catch (Exception e) {
			e.printStackTrace();
			return "error_"+record;
		}

	}
}

downloader 没有复用 WebDriver(每次下载都会重新启动一个浏览器实例),建议改造为复用同一个实例

欢迎加群313557283(刚创建),小白互相学习~

 

你可能感兴趣的:(网络爬虫)