本文主要用Java selenium实现点击打开漫画
如果有selenium配置问题请前往从头学习爬虫(十)进阶篇----selenium回顾
未使用框架
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
/**
 * Framework-free crawler: reads the chapter list of one comic with Apache
 * HttpClient, renders every chapter page in headless Chrome (Selenium) to
 * collect the image URLs, then downloads the images on a thread pool.
 *
 * <p>A "download entry" is the string {@code title___pageLabel___imageUrl}.
 * Entries are produced by {@link #getListImg(List)} and consumed by
 * {@link #down(String)}.
 */
public class GaReiZeroSpiderX {

    /** Scheme and host prepended to the relative chapter links of the index page. */
    private static final String SITE_ROOT = "https://manhua.dmzj.com";
    /** Separator between title, page label and image URL inside a download entry. */
    private static final String FIELD_SEPARATOR = "___";
    /** Directory under which one sub-directory per chapter title is created. */
    private static final String DOWNLOAD_ROOT = "D:/manhua";
    /** Timeouts for image downloads, so a stalled connection cannot block the pool forever. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    private static final int READ_TIMEOUT_MS = 30000;

    public static void main(String[] args) {
        // Index page of the comic
        String url = "https://manhua.dmzj.com/shiling";
        // Number of download threads
        int threadsize = 10;
        // Delay (ms) each download task waits before it starts
        long sleeptime = 5000;
        // Relative links of all chapters
        List<String> itemList = getListPage(url);
        // Download entries (title___pageLabel___imageUrl) for every page of every chapter
        List<String> imgList = getListImg(itemList);
        // Multi-threaded download
        DownLoadImg(imgList, threadsize, sleeptime);
    }

    /**
     * Opens every chapter in headless Chrome and builds one download entry per
     * {@code <option>} of the page selector found in the rendered HTML.
     *
     * @param itemList relative chapter links; may be {@code null}
     * @return the download entries; empty (never {@code null}) when there is nothing to visit
     */
    private static List<String> getListImg(List<String> itemList) {
        List<String> listImg = new ArrayList<>();
        if (itemList == null || itemList.isEmpty()) {
            return listImg;
        }
        // Location of the chromedriver executable
        System.setProperty("webdriver.chrome.driver", "D:\\newChromeDriver\\chromedriver_win32\\chromedriver.exe");
        ChromeOptions options = new ChromeOptions();
        // Location of the Chrome binary
        options.setBinary("C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe");
        // Headless mode needs Chrome 59 or newer
        options.addArguments("test-type"); // original author's note: ignore certificate errors
        options.addArguments("headless");
        options.addArguments("disable-gpu");
        // Original author's note: has no real effect; was meant to select the page display mode
        options.addArguments("Cookie:display_mode=1");
        WebDriver driver = new ChromeDriver(options);
        try {
            for (String path : itemList) {
                driver.get(SITE_ROOT + path);
                WebElement root = driver.findElement(By.xpath("/html"));
                Html html = new Html(root.getAttribute("outerHTML"));
                String rawTitle = html.xpath("//title/text()").toString();
                if (rawTitle == null) {
                    // Without a title there is no folder name for the chapter; skip it.
                    System.out.println("no title found, skipped: " + path);
                    continue;
                }
                // Text before the first '-' of the <title>; limit 2 guarantees element 0 exists.
                String title = rawTitle.split("-", 2)[0];
                List<Selectable> pages = html.xpath("//div[@class='btmBtnBox']/select/option").nodes();
                for (Selectable page : pages) {
                    // chapter title ___ page label ___ image address
                    listImg.add(title + "___" + page.xpath("/option/text()") + "___" + "https:" + page.xpath("/option/@value"));
                }
            }
        } finally {
            // quit() closes every window and ends the browser process, even when a page failed.
            driver.quit();
        }
        return listImg;
    }

    /**
     * Fetches the index page and extracts the chapter links.
     *
     * @param url address of the comic's index page
     * @return the {@code href} values of the chapter list; empty (never {@code null})
     *         when the request fails or the status is not 200
     */
    private static List<String> getListPage(String url) {
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            RequestConfig requestConfig = RequestConfig.custom()
                    .setConnectTimeout(1000)
                    .setConnectionRequestTimeout(1000)
                    .setSocketTimeout(1000)
                    .setRedirectsEnabled(true)
                    .build();
            HttpGet httpGet = new HttpGet(url);
            httpGet.setConfig(requestConfig);
            httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36");
            try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                int status = response.getStatusLine().getStatusCode();
                if (status != 200) {
                    System.out.println("request url failed, http code=" + status);
                    return new ArrayList<>();
                }
                HttpEntity entity = response.getEntity();
                String body = EntityUtils.toString(entity, "utf-8");
                return new Html(body).xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").all();
            }
        } catch (Exception e) {
            // Report the cause instead of failing silently, then continue with no chapters.
            e.printStackTrace();
            return new ArrayList<>();
        }
    }

    /**
     * Downloads every entry on a fixed thread pool and prints progress after each
     * finished task ({@code done/total}, then {@code over} for the last one).
     *
     * @param imgList    download entries; {@code null} or empty means nothing to do
     * @param threadsize number of worker threads
     * @param sleeptime  delay in milliseconds each task waits before downloading
     */
    private static void DownLoadImg(List<String> imgList, int threadsize, long sleeptime) {
        if (imgList == null || imgList.isEmpty()) {
            return;
        }
        int size = imgList.size();
        int finished = 0;
        int succeeded = 0;
        ExecutorService fixedThreadPool = Executors.newFixedThreadPool(threadsize);
        try {
            CompletionService<String> cs = new ExecutorCompletionService<>(fixedThreadPool);
            for (String entry : imgList) {
                cs.submit(() -> {
                    try {
                        Thread.sleep(sleeptime);
                        return down(entry);
                    } catch (InterruptedException e) {
                        // Keep the interrupt visible to the pool.
                        Thread.currentThread().interrupt();
                        System.out.println("线程异常");
                        return "error_" + entry;
                    }
                });
            }
            for (int i = 0; i < size; i++) {
                try {
                    String result = cs.take().get();
                    if (result != null && result.startsWith("success_")) {
                        succeeded++;
                    }
                } catch (InterruptedException e) {
                    // Stop waiting and cancel outstanding tasks when this thread is interrupted.
                    Thread.currentThread().interrupt();
                    fixedThreadPool.shutdownNow();
                    break;
                } catch (Exception e) {
                    e.printStackTrace();
                }
                finished++;
                System.out.println(finished == size ? "over" : finished + "/" + size);
            }
            if (succeeded < size) {
                System.out.println("failed or unfinished downloads: " + (size - succeeded) + "/" + size);
            }
        } finally {
            fixedThreadPool.shutdown();
        }
    }

    /**
     * Downloads one image to {@code D:/manhua/<title>/<pageLabel>.<extension>}.
     *
     * @param url a download entry of the form {@code title___pageLabel___imageUrl};
     *            spaces are removed before it is parsed
     * @return {@code "success_" + url} when the file was written, otherwise {@code "error_" + url}
     */
    protected static String down(String url) {
        try {
            String[] parts = url.replace(" ", "").split(FIELD_SEPARATOR);
            if (parts.length < 3) {
                System.out.println("malformed download entry: " + url);
                return "error_" + url;
            }
            String imgUrl = parts[2].trim();
            // File extension = text after the last '.' of the image URL
            String[] dotted = imgUrl.split("\\.");
            String extension = dotted[dotted.length - 1];

            File chapterDir = new File(DOWNLOAD_ROOT, safeName(parts[0]));
            // mkdirs() also returns false when another worker created the directory first.
            if (!chapterDir.mkdirs() && !chapterDir.isDirectory()) {
                System.out.println("cannot create directory: " + chapterDir);
                return "error_" + url;
            }
            File dest = new File(chapterDir, safeName(parts[1] + "." + extension));

            HttpURLConnection uc = (HttpURLConnection) new URL(imgUrl).openConnection();
            try {
                uc.setConnectTimeout(CONNECT_TIMEOUT_MS);
                uc.setReadTimeout(READ_TIMEOUT_MS);
                uc.addRequestProperty("User-Agent",
                        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");
                // Original author's note: a Referer header is required to avoid being blocked;
                // the check is weak, a Baidu address works as well.
                uc.addRequestProperty("Referer", "https://manhua.dmzj.com/");
                // The input stream is opened first, so a failed request does not create an empty file.
                try (BufferedInputStream in = new BufferedInputStream(uc.getInputStream());
                     BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(dest))) {
                    byte[] buffer = new byte[1024 * 20];
                    int length;
                    while ((length = in.read(buffer, 0, buffer.length)) != -1) {
                        out.write(buffer, 0, length);
                    }
                }
            } finally {
                uc.disconnect();
            }
            return "success_" + url;
        } catch (Exception e) {
            e.printStackTrace();
            return "error_" + url;
        }
    }

    /** Replaces path separators and characters not allowed in Windows file names with '_'. */
    private static String safeName(String name) {
        return name.replaceAll("[\\\\/:*?\"<>|]", "_");
    }
}
webmagic框架
spider
import java.util.ArrayList;
import java.util.List;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
/**
 * webmagic page processor for one comic: the index page queues every chapter
 * page, and each chapter page yields download entries of the form
 * {@code title___pageLabel___imageUrl} in the result field {@code "imgurl"}.
 */
public class GaReiZeroSpider implements PageProcessor {

    /** Index page of the comic; also the start URL used by {@link #main(String[])}. */
    private static final String INDEX_URL = "https://manhua.dmzj.com/shiling";
    /** Scheme and host prepended to the relative chapter links. */
    private static final String SITE_ROOT = "https://manhua.dmzj.com";

    /**
     * Every entry collected so far, across all chapters. Kept for compatibility;
     * it is a plain {@link ArrayList} with no synchronization of its own.
     */
    static List<String> imgurl = new ArrayList<>();

    private Site site = Site.me();

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Handles the index page (queues chapter requests) or a chapter page
     * (publishes that chapter's download entries).
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (INDEX_URL.equals(page.getUrl().toString())) {
            List<String> chapterPaths = page.getHtml().xpath("//div[@class='cartoon_online_border']/ul/li/a/@href").all();
            for (String path : chapterPaths) {
                Request request = new Request(SITE_ROOT + path);
                request.addHeader("Cookie", "display_mode=1");
                page.addTargetRequest(request);
            }
            return;
        }

        String rawTitle = page.getHtml().xpath("//title/text()").toString();
        if (rawTitle == null) {
            // No title means no folder name for the chapter; publish nothing for this page.
            System.out.println("no title found, skipped: " + page.getUrl());
            return;
        }
        // Text before the first '-' of the <title>; limit 2 guarantees element 0 exists.
        String title = rawTitle.split("-", 2)[0];

        // Only this page's entries go into the result field. Publishing the shared,
        // ever-growing list would hand all earlier chapters to the pipeline again.
        List<String> pageEntries = new ArrayList<>();
        List<Selectable> options = page.getHtml().xpath("//div[@class='btmBtnBox']/select/option").nodes();
        for (Selectable option : options) {
            pageEntries.add(title + "___" + option.xpath("/option/text()") + "___" + "https:" + option.xpath("/option/@value"));
        }
        imgurl.addAll(pageEntries);
        page.putField("imgurl", pageEntries);
    }

    public static void main(String[] args) {
        Spider.create(new GaReiZeroSpider())
                .downloader(new GaReiZeroDownloader())
                .addPipeline(new GaReiZeroPipline())
                .addUrl(INDEX_URL)
                .start();
    }
}
downloader
import java.io.Closeable;
import java.io.IOException;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
/**
 * webmagic {@link Downloader} that renders each request in a fresh headless
 * Chrome instance (Selenium) and returns the rendered HTML as the page content.
 */
public class GaReiZeroDownloader implements Downloader, Closeable {

    /** Location of the chromedriver executable. */
    private static final String DRIVER_PATH = "D:\\newChromeDriver\\chromedriver_win32\\chromedriver.exe";
    /** Location of the Chrome binary. */
    private static final String CHROME_BINARY = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe";

    /** Nothing to release: every {@link #download(Request, Task)} call quits its own browser. */
    @Override
    public void close() throws IOException {
    }

    /**
     * Loads the request URL in headless Chrome and wraps the resulting
     * {@code outerHTML} of the {@code <html>} element in a {@link Page}.
     *
     * @param request the request to render; only its URL is used
     * @param task    unused
     * @return a page carrying the rendered HTML, the URL and the request
     */
    @Override
    public Page download(Request request, Task task) {
        System.setProperty("webdriver.chrome.driver", DRIVER_PATH);
        ChromeOptions options = new ChromeOptions();
        options.setBinary(CHROME_BINARY);
        options.addArguments("test-type"); // original author's note: ignore certificate errors
        options.addArguments("headless");
        options.addArguments("disable-gpu");
        options.addArguments("Cookie:display_mode=1");
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get(request.getUrl());
            WebElement root = driver.findElement(By.xpath("/html"));
            String content = root.getAttribute("outerHTML");
            Page page = new Page();
            page.setRawText(content);
            page.setHtml(new Html(content, request.getUrl()));
            page.setUrl(new PlainText(request.getUrl()));
            page.setRequest(request);
            return page;
        } finally {
            // quit() closes every window and ends the browser process, even when
            // loading or reading the page failed; otherwise the browser is left running.
            driver.quit();
        }
    }

    /** The thread count is ignored; a new browser is started per download. */
    @Override
    public void setThread(int threadNum) {
    }
}
pipeline
/**
 * webmagic pipeline that downloads the images listed in the result field
 * {@code "imgurl"}. Each element is a download entry of the form
 * {@code title___pageLabel___imageUrl}.
 *
 * <p>NOTE(review): this listing shows no import statements; the class needs the
 * java.io, java.net, java.util and java.util.concurrent types it names, plus
 * webmagic's Pipeline, ResultItems and Task.
 */
public class GaReiZeroPipline implements Pipeline {

    /** Separator between title, page label and image URL inside a download entry. */
    private static final String FIELD_SEPARATOR = "___";
    /** Directory under which one sub-directory per chapter title is created. */
    private static final String DOWNLOAD_ROOT = "D:/manhua";
    /** Timeouts for image downloads, so a stalled connection cannot block the pool forever. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    private static final int READ_TIMEOUT_MS = 30000;

    /**
     * Downloads all entries of the {@code "imgurl"} field with 5 threads and a
     * 500 ms delay per task. Pages without that field are ignored.
     *
     * @param resultItems the fields extracted for one page
     * @param task        unused
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        try {
            List<String> imgurl = resultItems.get("imgurl");
            if (imgurl != null && !imgurl.isEmpty()) {
                DownLoadImg(imgurl, 5, 500);
            }
        } catch (Exception e) {
            // Keep the crawl running, but report the failure instead of hiding it.
            e.printStackTrace();
        }
    }

    /**
     * Downloads every entry on a fixed thread pool and prints progress after each
     * finished task ({@code done/total}, then {@code over} for the last one).
     *
     * @param imgList    download entries; {@code null} or empty means nothing to do
     * @param threadsize number of worker threads
     * @param sleeptime  delay in milliseconds each task waits before downloading
     */
    private void DownLoadImg(List<String> imgList, int threadsize, long sleeptime) {
        if (imgList == null || imgList.isEmpty()) {
            return;
        }
        int size = imgList.size();
        int finished = 0;
        int succeeded = 0;
        ExecutorService fixedThreadPool = Executors.newFixedThreadPool(threadsize);
        try {
            CompletionService<String> cs = new ExecutorCompletionService<>(fixedThreadPool);
            for (String entry : imgList) {
                cs.submit(() -> {
                    try {
                        Thread.sleep(sleeptime);
                        return down(entry);
                    } catch (InterruptedException e) {
                        // Keep the interrupt visible to the pool.
                        Thread.currentThread().interrupt();
                        System.out.println("线程异常");
                        return "error_" + entry;
                    }
                });
            }
            for (int i = 0; i < size; i++) {
                try {
                    String result = cs.take().get();
                    if (result != null && result.startsWith("success_")) {
                        succeeded++;
                    }
                } catch (InterruptedException e) {
                    // Stop waiting and cancel outstanding tasks when this thread is interrupted.
                    Thread.currentThread().interrupt();
                    fixedThreadPool.shutdownNow();
                    break;
                } catch (Exception e) {
                    e.printStackTrace();
                }
                finished++;
                System.out.println(finished == size ? "over" : finished + "/" + size);
            }
            if (succeeded < size) {
                System.out.println("failed or unfinished downloads: " + (size - succeeded) + "/" + size);
            }
        } finally {
            fixedThreadPool.shutdown();
        }
    }

    /**
     * Downloads one image to {@code D:/manhua/<title>/<pageLabel>.<extension>}.
     *
     * @param url a download entry of the form {@code title___pageLabel___imageUrl};
     *            spaces are removed before it is parsed
     * @return {@code "success_" + url} when the file was written, otherwise {@code "error_" + url}
     */
    protected String down(String url) {
        try {
            String[] parts = url.replace(" ", "").split(FIELD_SEPARATOR);
            if (parts.length < 3) {
                System.out.println("malformed download entry: " + url);
                return "error_" + url;
            }
            String imgUrl = parts[2].trim();
            // File extension = text after the last '.' of the image URL
            String[] dotted = imgUrl.split("\\.");
            String extension = dotted[dotted.length - 1];

            File chapterDir = new File(DOWNLOAD_ROOT, safeName(parts[0]));
            // mkdirs() also returns false when another worker created the directory first.
            if (!chapterDir.mkdirs() && !chapterDir.isDirectory()) {
                System.out.println("cannot create directory: " + chapterDir);
                return "error_" + url;
            }
            File dest = new File(chapterDir, safeName(parts[1] + "." + extension));

            HttpURLConnection uc = (HttpURLConnection) new URL(imgUrl).openConnection();
            try {
                uc.setConnectTimeout(CONNECT_TIMEOUT_MS);
                uc.setReadTimeout(READ_TIMEOUT_MS);
                uc.addRequestProperty("User-Agent",
                        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");
                // Original author's note: a Referer header is required to avoid being blocked;
                // the check is weak, a Baidu address works as well.
                uc.addRequestProperty("Referer", "https://manhua.dmzj.com/");
                // The input stream is opened first, so a failed request does not create an empty file.
                try (BufferedInputStream in = new BufferedInputStream(uc.getInputStream());
                     BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(dest))) {
                    byte[] buffer = new byte[1024 * 20];
                    int length;
                    while ((length = in.read(buffer, 0, buffer.length)) != -1) {
                        out.write(buffer, 0, length);
                    }
                }
            } finally {
                uc.disconnect();
            }
            return "success_" + url;
        } catch (Exception e) {
            e.printStackTrace();
            return "error_" + url;
        }
    }

    /** Replaces path separators and characters not allowed in Windows file names with '_'. */
    private static String safeName(String name) {
        return name.replaceAll("[\\\\/:*?\"<>|]", "_");
    }
}