// Note: image recognition (captcha OCR) requires the Tess4J jars; see the net.sourceforge.tess4j imports below.
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.Point;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.stereotype.Service;
import com.alibaba.fastjson.JSONObject;
import com.zt.framework.common.tools.uuid.UUIDGenerator;
import com.zt.spider.core.core.service.ICrawlService;
import com.zt.spider.soa.monitor.bean.SpiderMonitorBean;
import com.zt.spider.soa.queue.bean.CoreRule;
import com.zt.spider.soa.queue.bean.DataQueue;
import cn.dreampie.orm.Record;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.util.LoadLibs;
/**
 * Crawl service that drives a headless Chrome session against a captcha-protected query page.
 *
 * <p>The flow is: load the task rule for the queued task instance, open the page, screenshot and
 * OCR the captcha image, submit the query, then walk the result pages and insert or update one
 * row of {@code zts_nc_jzazdwcbjyzt} per table row found under {@code .m-table}.
 */
public class ZtsJxsjzazDwcbjyztServiceImpl implements ICrawlService {

    /** System property Selenium reads to locate the chromedriver binary. */
    private static final String CHROME_DRIVER_PROPERTY = "webdriver.chrome.driver";

    /** Fallback chromedriver location, used only when the property is not already configured. */
    private static final String DEFAULT_CHROME_DRIVER_PATH = "D:/Java/selenium/selenium/chromedriver.exe";

    /** Temporary file the cropped captcha image is written to before OCR. */
    private static final String CAPTCHA_IMAGE_PATH = "D:\\code.jpg";

    /** Timestamp format stored in the {@code create_time} column. */
    private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /** Table the crawled rows are persisted into. */
    private static final String TARGET_TABLE = "zts_nc_jzazdwcbjyzt";

    /** Number of {@code td} cells a data row must have to be processed. */
    private static final int RESULT_COLUMN_COUNT = 7;

    /**
     * Runs one crawl for the given queue entry.
     *
     * <p>Unlike the previous implementation, failures are no longer silently discarded: the
     * browser is always closed in a {@code finally} block and the exception is propagated to the
     * caller, as the method signature already allows.
     *
     * @param dataQueue         queue entry; only its task instance id is read here
     * @param spiderMonitorBean not used by this implementation
     * @param coreRule          not used by this implementation
     * @throws Exception if the task rule cannot be loaded, the page cannot be driven, or
     *                   persistence fails
     */
    @Override
    public void addRunSpider(DataQueue dataQueue, SpiderMonitorBean spiderMonitorBean, CoreRule coreRule)
            throws Exception {
        // Respect an externally configured driver path; fall back to the historical default.
        if (System.getProperty(CHROME_DRIVER_PROPERTY) == null) {
            System.setProperty(CHROME_DRIVER_PROPERTY, DEFAULT_CHROME_DRIVER_PATH);
        }
        ChromeOptions chromeOptions = new ChromeOptions();
        chromeOptions.addArguments("--headless");
        ChromeDriver webDriver = new ChromeDriver(chromeOptions);
        try {
            String taskInstanceId = dataQueue.getTaskInstanceId();
            JSONObject ruleJson = loadTaskRule(taskInstanceId);
            // Number of consecutive already-known rows after which an "other" crawl stops.
            int repeatCount = Integer.parseInt(ruleJson.get("repeatTime").toString());
            // String.valueOf avoids a NullPointerException when the key is absent.
            boolean otherCrawler = "other".equals(String.valueOf(ruleJson.get("crawler")));

            // Open the page.
            String url = "XXXXXXXX";
            webDriver.manage().window().maximize();
            webDriver.get(url);
            Thread.sleep(1000);

            submitCaptchaQuery(webDriver);

            // Total number of result pages.
            int pageCount = resolvesTotalPageCount(webDriver);
            int pageIndex = 1;
            if (otherCrawler) {
                pageIndex = Integer.parseInt(ruleJson.get("beginPage").toString());
                if (pageCount <= pageIndex) {
                    pageIndex = 1;
                }
            }

            // The repeat counter starts at 1 and is reset to 1 whenever a new row is inserted.
            int repeatTime = 1;
            while (true) {
                Document document = Jsoup.parse(webDriver.getPageSource());
                repeatTime = saveRows(document.select(".m-table tr"), taskInstanceId, url, repeatTime);
                if (pageIndex >= pageCount) {
                    break;
                }
                if (otherCrawler && repeatTime >= repeatCount) {
                    break;
                }
                pageIndex++;
                WebElement nextPage = webDriver.findElement(By.className("pagenxt"));
                // Pause before clicking "next page".
                Thread.sleep(3000);
                nextPage.click();
                // NOTE(review): the page source is read right after the click; if the site loads
                // the next page asynchronously a wait may be needed here - confirm against the site.
            }
        } finally {
            // Always release the browser, whatever happened above.
            webDriver.quit();
        }
    }

    /**
     * Loads and parses the JSON task rule belonging to a task instance.
     *
     * @param taskInstanceId id of the task instance being executed
     * @return the parsed {@code taskRule} JSON
     * @throws IllegalStateException if no task row or no {@code taskRule} value is found
     */
    private static JSONObject loadTaskRule(String taskInstanceId) {
        Record findRed = new Record("tb_spider_task");
        List<Record> listR = findRed.find(
                "select * from tb_spider_task_instance sti, tb_spider_task st where sti.id = ? and sti.task_id = st.id",
                new Object[] { taskInstanceId });
        if (listR == null || listR.isEmpty()) {
            throw new IllegalStateException("No spider task found for task instance " + taskInstanceId);
        }
        Object taskRule = listR.get(0).get("taskRule");
        if (taskRule == null) {
            throw new IllegalStateException("Spider task for instance " + taskInstanceId + " has no taskRule");
        }
        return JSONObject.parseObject(taskRule.toString());
    }

    /**
     * Screenshots the captcha, OCRs it, types the result into the {@code code} input and clicks
     * the query button. The temporary captcha image is removed afterwards even if a step fails.
     *
     * @param webDriver browser positioned on the query page
     * @throws InterruptedException if the thread is interrupted during the pause before clicking
     */
    private static void submitCaptchaQuery(ChromeDriver webDriver) throws InterruptedException {
        try {
            // Capture the captcha image and recognise its text.
            screenshots(webDriver, CAPTCHA_IMAGE_PATH);
            String code = getImgContent(CAPTCHA_IMAGE_PATH);
            // Enter the captcha.
            WebElement codeInput = webDriver.findElement(By.id("code"));
            codeInput.sendKeys(code);
            Thread.sleep(3000);
            // Run the query.
            WebElement findButton = webDriver.findElement(By.className("u-btn-c1"));
            findButton.click();
        } finally {
            // deleteQuietly does not throw when the screenshot was never written.
            FileUtils.deleteQuietly(new File(CAPTCHA_IMAGE_PATH));
        }
    }

    /**
     * Inserts or updates one database row per result-table row.
     *
     * @param rows           {@code tr} elements of the result table
     * @param taskInstanceId id stored with newly inserted rows
     * @param url            source url stored with newly inserted rows
     * @param repeatTime     current repeat counter
     * @return the updated counter: reset to 1 after an insert, incremented after each update
     */
    private static int saveRows(Elements rows, String taskInstanceId, String url, int repeatTime) {
        Record record = new Record(TARGET_TABLE);
        for (Element row : rows) {
            // Skip the header row.
            if (row.text().contains("企业中文")) {
                continue;
            }
            Elements cells = row.select("td");
            // Skip rows that do not carry the full set of data cells instead of failing on get(i).
            if (cells.size() < RESULT_COLUMN_COUNT) {
                continue;
            }
            String qyzwmc = cells.get(0).text();
            String qyywmc = cells.get(1).text();
            String qydz = cells.get(2).text();
            String byjzt = cells.get(3).text();
            String lxdh = cells.get(4).text();
            String lxcz = cells.get(5).text();
            String yzbm = cells.get(6).text();
            String now = DateFormatUtils.format(new Date(), TIMESTAMP_PATTERN);

            // Bind the scraped name as a parameter: it is untrusted text and may contain quotes.
            Long existing = record.countBy("qyzwmc=?", new Object[] { qyzwmc });
            if (existing == null || existing == 0L) {
                Record addRed = record.set("id", UUIDGenerator.getUUID())
                        .set("create_time", now)
                        .set("task_instance_id", taskInstanceId).set("url", url).set("zsbh", "")
                        .set("qyzwmc", qyzwmc).set("qyywmc", qyywmc).set("qydz", qydz).set("jyfw", byjzt)
                        .set("lxdh", lxdh).set("lxcz", lxcz).set("yzbm", yzbm);
                record.save(addRed);
                repeatTime = 1;
            } else {
                repeatTime++;
                // Column values first, then the where-clause parameter. The key column itself is
                // not rewritten (the old column list named qyywmc twice by mistake).
                Object[] updateParams = new Object[] { now, qyywmc, qydz, byjzt, lxdh, lxcz, yzbm, qyzwmc };
                record.updateColsBy("create_time,qyywmc,qydz,jyfw,lxdh,lxcz,yzbm", "qyzwmc=?", updateParams);
            }
        }
        return repeatTime;
    }

    /**
     * Resolves the total number of result pages from the {@code m-page-total-num} element.
     *
     * @param driver browser showing the result list
     * @return the parsed page count, or 1 when the element text is blank or not a number
     */
    private static int resolvesTotalPageCount(ChromeDriver driver) {
        String str = driver.findElement(By.className("m-page-total-num")).getText();
        if (StringUtils.isNotBlank(str)) {
            try {
                return Integer.parseInt(str.trim());
            } catch (NumberFormatException e) {
                System.err.println("Unparsable total page count '" + str + "', assuming 1 page");
            }
        }
        return 1;
    }

    /**
     * Takes a full screenshot, crops it to the captcha image and saves the crop locally as JPEG.
     *
     * <p>WebDriver and I/O failures are reported on stderr and otherwise ignored (best effort);
     * in that case no file is written.
     *
     * @param driver     browser driver
     * @param imgAddress path the cropped image is written to
     */
    private static void screenshots(WebDriver driver, String imgAddress) {
        try {
            WebElement img = driver.findElement(By.className("m-table2")).findElement(By.tagName("tbody"))
                    .findElement(By.tagName("img"));
            // Position and size of the captcha image on the page.
            Point location = img.getLocation();
            Dimension size = img.getSize();
            // OutputType.BYTES returns the whole screenshot as a byte array.
            byte[] screenshotAs = ((TakesScreenshot) driver).getScreenshotAs(OutputType.BYTES);
            BufferedImage originalImage = ImageIO.read(new ByteArrayInputStream(screenshotAs));
            if (originalImage == null) {
                System.err.println("Captcha screenshot could not be decoded");
                return;
            }
            // Crop the sub-image where the captcha is located.
            BufferedImage croppedImage = originalImage.getSubimage(location.getX(), location.getY(),
                    size.getWidth(), size.getHeight());
            // The JPEG writer cannot encode images with an alpha channel, so flatten onto an RGB
            // canvas before writing.
            BufferedImage rgbImage = new BufferedImage(croppedImage.getWidth(), croppedImage.getHeight(),
                    BufferedImage.TYPE_INT_RGB);
            java.awt.Graphics2D graphics = rgbImage.createGraphics();
            try {
                graphics.drawImage(croppedImage, 0, 0, java.awt.Color.WHITE, null);
            } finally {
                graphics.dispose();
            }
            // ImageIO.write returns false when no suitable writer handled the image.
            if (!ImageIO.write(rgbImage, "jpg", new File(imgAddress))) {
                System.err.println("No image writer available to save captcha to " + imgAddress);
            }
        } catch (WebDriverException | IOException e) {
            System.err.println("Failed to capture captcha image: " + e);
        }
    }

    /**
     * Runs Tesseract OCR on an image and returns the recognised text with all whitespace removed.
     *
     * @param imgPath path of the image to recognise
     * @return the recognised text, or an empty string when OCR fails
     */
    public static String getImgContent(String imgPath) {
        String content = "";
        File imageFile = new File(imgPath);
        ITesseract instance = new Tesseract();
        File tessDataFolder = LoadLibs.extractTessResources("tessdata");
        // The English language data is used to recognise the captcha characters.
        instance.setLanguage("eng");
        instance.setDatapath(tessDataFolder.getAbsolutePath());
        try {
            String recognised = instance.doOCR(imageFile);
            if (recognised != null) {
                // Captcha codes contain no whitespace; drop line breaks and stray spaces from OCR.
                content = recognised.replaceAll("\\s+", "");
            }
        } catch (TesseractException e) {
            System.err.println("Captcha OCR failed for " + imgPath + ": " + e.getMessage());
        }
        return content;
    }
}