爬虫之截取验证码解析验证码

 其中图片识别所需jar包为:

       
       
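<dependency>
    <groupId>net.java.dev.jna</groupId>
    <artifactId>jna</artifactId>
    <version>4.1.0</version>
</dependency>

<dependency>
    <groupId>net.sourceforge.tess4j</groupId>
    <artifactId>tess4j</artifactId>
    <version>2.0.1</version>
    <exclusions>
        <exclusion>
            <groupId>com.sun.jna</groupId>
            <artifactId>jna</artifactId>
        </exclusion>
    </exclusions>
</dependency>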

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.List;

import javax.imageio.ImageIO;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.Point;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.stereotype.Service;

import com.alibaba.fastjson.JSONObject;
import com.zt.framework.common.tools.uuid.UUIDGenerator;
import com.zt.spider.core.core.service.ICrawlService;
import com.zt.spider.soa.monitor.bean.SpiderMonitorBean;
import com.zt.spider.soa.queue.bean.CoreRule;
import com.zt.spider.soa.queue.bean.DataQueue;

import cn.dreampie.orm.Record;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.util.LoadLibs;

/**
 * Crawl service that opens a captcha-protected listing page in headless Chrome, solves the
 * captcha with Tesseract OCR, and stores every row of the paginated result table.
 *
 * <p>Flow: load the task rule from the database, open the page, crop the captcha image out of a
 * full screenshot, OCR it, submit the query, then walk the result pages and insert or update one
 * database row per table row (keyed by the company's Chinese name, column {@code qyzwmc}).
 */
public class ZtsJxsjzazDwcbjyztServiceImpl implements ICrawlService {

	/** Location of the chromedriver binary handed to Selenium. */
	private static final String CHROME_DRIVER_PATH = "D:/Java/selenium/selenium/chromedriver.exe";

	/** Scratch file holding the cropped captcha image between capture and OCR. */
	private static final String CAPTCHA_IMAGE_PATH = "D:\\code.jpg";

	/** Format of the {@code create_time} column. */
	private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";

	/** Table that receives the crawled rows. */
	private static final String TARGET_TABLE = "zts_nc_jzazdwcbjyzt";

	/** Number of {@code td} cells a data row must have to be stored. */
	private static final int EXPECTED_CELL_COUNT = 7;

	/**
	 * Runs one crawl for the task instance referenced by {@code dataQueue}.
	 *
	 * <p>The browser is always closed, and any failure (missing task rule, unreadable captcha,
	 * missing page element, database error) is propagated to the caller instead of being
	 * silently discarded.
	 *
	 * @param dataQueue         queue entry; only its task instance id is read here
	 * @param spiderMonitorBean not used by this implementation
	 * @param coreRule          not used by this implementation
	 * @throws Exception if any step of the crawl fails
	 */
	@Override
	public void addRunSpider(DataQueue dataQueue, SpiderMonitorBean spiderMonitorBean, CoreRule coreRule)
			throws Exception {
		System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);
		ChromeOptions chromeOptions = new ChromeOptions();
		chromeOptions.addArguments("--headless");
		ChromeDriver webDriver = new ChromeDriver(chromeOptions);
		try {
			String taskInstanceId = dataQueue.getTaskInstanceId();
			JSONObject ruleJson = loadTaskRule(taskInstanceId);
			// Number of already-known rows after which an "other" crawl stops early.
			int repeatCount = Integer.parseInt(ruleJson.get("repeatTime").toString());
			// getString returns null for a missing key, so a rule without "crawler" is simply not "other".
			boolean otherMode = "other".equals(ruleJson.getString("crawler"));

			// Open the page.
			String url = "XXXXXXXX";
			webDriver.manage().window().maximize();
			webDriver.get(url);
			Thread.sleep(1000);

			submitCaptchaAndSearch(webDriver);

			int pageCount = resolvesTotalPageCount(webDriver);
			int pageIndex = 1;
			if (otherMode) {
				// NOTE(review): beginPage only seeds the page counter; the browser is never
				// navigated to that page, so crawling still starts at the first result page.
				pageIndex = Integer.parseInt(ruleJson.get("beginPage").toString());
				if (pageCount <= pageIndex) {
					pageIndex = 1;
				}
			}

			// Starts at 1, is reset to 1 by every insert and incremented by every update.
			int repeatTime = 1;
			while (true) {
				repeatTime = storePage(webDriver.getPageSource(), taskInstanceId, url, repeatTime);
				if (pageIndex >= pageCount) {
					break;
				}
				if (otherMode && repeatTime >= repeatCount) {
					break;
				}
				pageIndex++;
				WebElement nextPage = webDriver.findElement(By.className("pagenxt"));
				// Pause before moving on to the next page.
				Thread.sleep(3000);
				nextPage.click();
			}
		} finally {
			// Always release the browser process, on success and on failure alike.
			webDriver.quit();
		}
	}

	/**
	 * Loads and parses the JSON task rule that belongs to a task instance.
	 *
	 * @param taskInstanceId id of the task instance
	 * @return the parsed {@code taskRule} JSON
	 * @throws IllegalStateException if no task is found for the instance id
	 */
	private static JSONObject loadTaskRule(String taskInstanceId) {
		Record taskQuery = new Record("tb_spider_task");
		List<Record> tasks = taskQuery.find(
				"select * from tb_spider_task_instance sti, tb_spider_task st where sti.id = ? and sti.task_id = st.id",
				new Object[] { taskInstanceId });
		if (tasks == null || tasks.isEmpty()) {
			throw new IllegalStateException("No spider task found for task instance " + taskInstanceId);
		}
		String taskRules = tasks.get(0).get("taskRule").toString();
		return JSONObject.parseObject(taskRules);
	}

	/**
	 * Captures the captcha, recognises it, types it into the form and clicks the search button.
	 * The scratch image is removed afterwards whether or not the step succeeded.
	 *
	 * @param webDriver browser showing the query form
	 * @throws IOException           if the captcha image cannot be captured or written
	 * @throws InterruptedException  if the thread is interrupted while pausing
	 * @throws IllegalStateException if OCR yields no text
	 */
	private static void submitCaptchaAndSearch(ChromeDriver webDriver) throws IOException, InterruptedException {
		try {
			// Crop the captcha image out of a screenshot and run OCR on it.
			screenshots(webDriver, CAPTCHA_IMAGE_PATH);
			String code = getImgContent(CAPTCHA_IMAGE_PATH);
			if (StringUtils.isBlank(code)) {
				// Submitting an empty captcha cannot succeed; fail with a clear cause instead.
				throw new IllegalStateException("Captcha OCR returned no text for " + CAPTCHA_IMAGE_PATH);
			}
			// Type the captcha.
			WebElement codeInput = webDriver.findElement(By.id("code"));
			codeInput.sendKeys(code);
			Thread.sleep(3000);
			// Run the query.
			WebElement findButton = webDriver.findElement(By.className("u-btn-c1"));
			findButton.click();
		} finally {
			// Best-effort cleanup: never fails, even when the image was never written.
			FileUtils.deleteQuietly(new File(CAPTCHA_IMAGE_PATH));
		}
	}

	/**
	 * Parses one result page and inserts or updates a database row for each data row.
	 *
	 * @param html           page source of the current result page
	 * @param taskInstanceId id stored with newly inserted rows
	 * @param url            source url stored with newly inserted rows
	 * @param repeatTime     counter value before this page
	 * @return the counter after this page: reset to 1 by each insert, incremented by each update
	 */
	private static int storePage(String html, String taskInstanceId, String url, int repeatTime) {
		Document document = Jsoup.parse(html);
		Elements rows = document.select(".m-table tr");
		Record record = new Record(TARGET_TABLE);
		for (Element row : rows) {
			// Skip the header row.
			if (row.text().contains("企业中文")) {
				continue;
			}
			Elements cells = row.select("td");
			// Skip rows that do not carry the full set of columns instead of failing on an index.
			if (cells.size() < EXPECTED_CELL_COUNT) {
				continue;
			}
			String qyzwmc = cells.get(0).text();
			String qyywmc = cells.get(1).text();
			String qydz = cells.get(2).text();
			String byjzt = cells.get(3).text();
			String lxdh = cells.get(4).text();
			String lxcz = cells.get(5).text();
			String yzbm = cells.get(6).text();
			String now = DateFormatUtils.format(new Date(), TIMESTAMP_PATTERN);

			// Scraped text is bound as a parameter, never concatenated into the SQL.
			Long existing = record.countBy("qyzwmc=?", qyzwmc);
			if (existing == null || existing == 0) {
				Record addRed = record.set("id", UUIDGenerator.getUUID())
						.set("create_time", now)
						.set("task_instance_id", taskInstanceId).set("url", url).set("zsbh", "")
						.set("qyzwmc", qyzwmc).set("qyywmc", qyywmc).set("qydz", qydz).set("jyfw", byjzt)
						.set("lxdh", lxdh).set("lxcz", lxcz).set("yzbm", yzbm);
				record.save(addRed);
				repeatTime = 1;
			} else {
				repeatTime++;
				// Eight values for the eight columns, followed by the value for the WHERE placeholder.
				Object[] params = new Object[] { now, qyzwmc, qyywmc, qydz, byjzt, lxdh, lxcz, yzbm, qyzwmc };
				record.updateColsBy("create_time,qyzwmc,qyywmc,qydz,jyfw,lxdh,lxcz,yzbm", "qyzwmc=?", params);
			}
		}
		return repeatTime;
	}

	/**
	 * Reads the total number of result pages from the page.
	 *
	 * @param driver browser showing the result list
	 * @return the number in the {@code m-page-total-num} element, or 1 when that element is blank
	 * @throws NumberFormatException if the element text is not blank and not an integer
	 */
	private static int resolvesTotalPageCount(ChromeDriver driver) {
		String str = driver.findElement(By.className("m-page-total-num")).getText();
		if (StringUtils.isNotBlank(str)) {
			// Trim so surrounding whitespace in the element text does not break parsing.
			return Integer.parseInt(str.trim());
		}
		return 1;
	}

	/**
	 * Takes a screenshot, crops the captcha image out of it and saves the crop as a JPEG.
	 *
	 * @param driver     browser driver
	 * @param imgAddress path of the file to write
	 * @throws IOException if the screenshot cannot be decoded or the crop cannot be written
	 */
	private static void screenshots(WebDriver driver, String imgAddress) throws IOException {
		WebElement img = driver.findElement(By.className("m-table2")).findElement(By.tagName("tbody"))
				.findElement(By.tagName("img"));
		// Position and size of the captcha element.
		Point location = img.getLocation();
		Dimension size = img.getSize();
		// OutputType.BYTES returns the screenshot as an in-memory byte array.
		byte[] screenshotBytes = ((TakesScreenshot) driver).getScreenshotAs(OutputType.BYTES);
		BufferedImage originalImage = ImageIO.read(new ByteArrayInputStream(screenshotBytes));
		if (originalImage == null) {
			// ImageIO.read returns null when no registered reader understands the data.
			throw new IOException("Screenshot could not be decoded as an image");
		}
		// Cut out the region occupied by the captcha element.
		BufferedImage croppedImage = originalImage.getSubimage(location.getX(), location.getY(), size.getWidth(),
				size.getHeight());
		// ImageIO.write returns false (without throwing) when no writer accepts the image.
		if (!ImageIO.write(croppedImage, "jpg", new File(imgAddress))) {
			throw new IOException("No JPEG writer could encode the captcha image for " + imgAddress);
		}
	}

	/**
	 * Runs OCR on an image and returns the recognised text with all whitespace removed.
	 *
	 * @param imgPath path of the image file
	 * @return the recognised characters, or an empty string when recognition fails
	 */
	public static String getImgContent(String imgPath) {
		ITesseract instance = new Tesseract();
		File tessDataFolder = LoadLibs.extractTessResources("tessdata");
		// The English model is used because it recognises digits fairly accurately.
		instance.setLanguage("eng");
		instance.setDatapath(tessDataFolder.getAbsolutePath());

		try {
			String raw = instance.doOCR(new File(imgPath));
			// Drop every whitespace character (spaces, \r, \n), not only \n: a captcha has none.
			return raw == null ? "" : raw.replaceAll("\\s+", "");
		} catch (TesseractException ignored) {
			// Deliberate: callers receive an empty string when recognition fails.
			return "";
		}
	}

}

 

你可能感兴趣的:(爬虫之截取验证码解析验证码)