今天接着上一篇博客继续研究爬虫,本篇演示一个完整的流程:登录 → 跳转列表页 → 获取列表页所有数据 → 翻页。
本人没有对数据进行处理和入库,需要的小伙伴可以按照自己的需求编写代码,编写位置见代码中的 //TODO: 注释。
注意:代码里所有涉及地址的地方都需要按照个人项目进行修改。如有疑问请留言(本人也是刚刚开始研究,大家可以一起研究讨论)。
下面我就把代码与大家分享一下。
上代码:
package com.zhaxd.web.utils;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
public class VerificationUtil {
/**
*
* @param args
* @author zhangjiaqiang
*
*/
public static void main(String[] args) throws IOException, TesseractException, InterruptedException {
// TODO Auto-generated method stub
System.getProperties().setProperty("webdriver.chrome.driver", "D:\\chromedriver.exe");
WebDriver chrome = new ChromeDriver();
//全局的等待设置,每次执行diver,都会等待2秒
chrome.manage().timeouts().implicitlyWait(2, TimeUnit.SECONDS);
//进入项目登录页--登录页地址
chrome.get("http://10.10.10.163:8080/unityplatform/loginController.do?login");
//初始化httpclient
DefaultHttpClient httpClient = new DefaultHttpClient();
// 获得 Cookie
Set
StringBuffer tmpcookies = new StringBuffer();
for (Cookie c : cookies) {
tmpcookies.append(c.toString() + ";");
}
//访问验证码图片
HttpPost httpPost = new HttpPost("http://10.10.10.163:8080/unityplatform/randCodeImage?a=" + new Date().getTime());
//携带cookie
httpPost.setHeader("cookie",tmpcookies.toString());
HttpResponse response = httpClient.execute(httpPost);
InputStream a = response.getEntity().getContent();
File targetFile = new File("d:/selenium/tt.jpg");
OutputStream outStream = new FileOutputStream(targetFile);
byte[] buffer = new byte[8 * 1024];
int bytesRead;
//表示从InputStream中读取度一个数知组的数道据,如果返回-1 则表示数据读版取完成了
while ((bytesRead = a.read(buffer)) != -1) {
//生成文件
outStream.write(buffer, 0, bytesRead);
}
outStream.close(); // 关闭输出流
//tess4j解析验证码图片
Tesseract tesseract = new Tesseract();
tesseract.setDatapath("D:/Tess4J/tessdata");
String text2= tesseract.doOCR(new File("D:/selenium/tt.jpg"));
WebElement randCodeInput = chrome.findElement(By.id("randCode"));
WebElement usernameInput = chrome.findElement(By.id("userNameFake"));
usernameInput.sendKeys("admin");
WebElement passwordInput = chrome.findElement(By.id("passwordFake"));
passwordInput.sendKeys("admin");
String replaceAll = text2.replaceAll("\r", "").replaceAll("\n", "").replaceAll(" ", "");
System.out.println("======"+replaceAll + "=====");
randCodeInput.sendKeys(replaceAll);
Thread.sleep(2000);
WebElement loginButton = chrome.findElement(By.xpath("//*[@id=\"btnLogin\"]"));
try {
loginButton.click();
Thread.sleep(5000);
//http://10.10.10.163:8080/unityplatform/mutiLangController.do?mutiLang
//http://10.10.10.163:8080/unityplatform/userController.do?user
//这个地方其实应该走数据库把所有的配置地址拿出来,进行循环访问
//跳转页面
chrome.get("http://10.10.10.163:8080/unityplatform/mutiLangController.do?mutiLang");
Thread.sleep(3000);
tableValue(chrome,By.cssSelector(".datagrid-btable tbody>tr"));
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public static boolean aa(WebDriver chrome) throws InterruptedException{
boolean aa=false;
WebElement dis = chrome.findElement(By.xpath("/html/body/div[2]/div[2]/div[3]/table/tbody/tr/td[10]/a"));
String bbb=dis.getAttribute("class");
//判断
if(!bbb.contains("disabled")){
Thread.sleep(3000);
dis.click();
Thread.sleep(3000);
tableValue(chrome,By.cssSelector(".datagrid-btable tbody>tr"));
aa=true;
}else{
aa=false;
}
return aa;
}
public static boolean bb(WebDriver chrome) throws InterruptedException{
boolean next=aa(chrome);
if(next){
bb(chrome);
}else{
Thread.sleep(3000);
chrome.close();
chrome.quit();
}
return true;
}
public static void tableValue(WebDriver driver,By selector) throws InterruptedException{
//获取定位table下的所有tr标签
List
for(WebElement row : rows2){
row.click();
//遍历点击查看详情
WebElement c00=driver.findElement(By.xpath("//*[@id=\"mutiLangListtb\"]/div[2]/span/a[3]/span/span"));
c00.click();
//TODO:去页面数据 存库
Thread.sleep(2000);
//点击关闭
WebElement cxx=driver.findElement(By.xpath("/html/body/div[2]/table/tbody/tr[2]/td[2]/div/table/tbody/tr[1]/td/div/div[2]/a[4]"));
cxx.click();
}
//去点击下一页
bb(driver);
}
}
Selenium 的录制和回放功能也值得学习(同事推荐),我自己还没有看,之后再研究一下。