WebMagic + Selenium(IE 或 Chrome)+ Java + 百度 OCR 识别验证码,实现模拟登录

首先,如果是 Maven 项目,需要在 pom.xml 里添加以下依赖:

<!-- Selenium包 -->
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>3.3.1</version>
</dependency>


<!-- 百度OCR包,如不需要识别验证码可不导入 -->
<dependency>
    <groupId>com.baidu.aip</groupId>
    <artifactId>java-sdk</artifactId>
    <version>4.4.1</version>
</dependency>

其次,按需求下载 IEDriverServer 或 ChromeDriver 驱动。

接下来进入正题,开始撸代码.

public class DemoPageProcessor implements PageProcessor {
	//配置抓取时间、重试次数等
    private Site site = Site.me().setRetryTimes(5).setTimeOut(3000).setSleepTime(1000);
    
    //用来存储cookie信息
    private Set cookies;
    
	@Override
	public void process(Page page) {
		//设置POST请求
		Request request = new Request("http://172.24.30.137/wallinfo/queryList.xhtml?ids=&datas=&street=&createtimeBegin=&createtimeEnd=&busst=&maiindustry=&page=1&rows=10000");
		//只有POST请求才可以添加附加参数
		request.setMethod(HttpConstant.Method.POST);
		// 开始执行
		try {
		    Spider.create(new kaiqiangdadongDataPageProcessor()).addRequest(request).addPipeline(new kaiqiangdadongDataPipeline()).addPipeline(new ConsolePipeline()).thread(5).run();
		} catch (Exception e) {
		    e.printStackTrace();
		}*/
	}


	//使用 Selenium 来模拟用户的登录获取cookie信息
    public void login() throws InterruptedException, IOException {
    	
        //设置驱动程序路径
        //Chrome
        System.setProperty("webdriver.chrome.driver","D:\\chromedriver.exe");
        WebDriver driver = new ChromeDriver();
        //IE 8-10
    	System.setProperty("webdriver.ie.driver","D:\\IEDriverServer.exe");
        WebDriver driver = new InternetExplorerDriver();
        
        //登陆地址
        driver.get("http://xxx");
        
        //设置等待时间,防止页面没有渲染完成导致抓取元素失败
        Thread.sleep(3000);
        
        //定义验证码变量
        String verify = null;
    
        //寻找账号编辑框
        driver.findElement(By.id("账号编辑框ID")).clear();
        driver.findElement(By.id("账号编辑框ID")).sendKeys("登陆账号");
        
        //寻找密码编辑框
        driver.findElement(By.id("密码编辑框ID")).clear();
        driver.findElement(By.id("密码编辑框ID")).sendKeys("登陆密码");
        
        //创建一个时间戳,防止验证码图片文件重名
    	String timestamp = System.currentTimeMillis()+"";

        //寻找验证码容器
    	WebElement ele = driver.findElement(By.id("验证码容器ID"));

        //创建一个快照
        File screenshot = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
    
        //读取截图
        BufferedImage fullImg = ImageIO.read(screenshot);
        
        //获取页面上元素的位置
        org.openqa.selenium.Point point= ele.getLocation();
 
        //获取元素宽高  
        int eleWidth= ele.getSize().getWidth();  
        int eleHeight= ele.getSize().getHeight();

        //裁剪整个页面截图只得到元素截图  
        BufferedImage eleScreenshot= fullImg.getSubimage(point.getX(), point.getY(), eleWidth, eleHeight);  
        ImageIO.write(eleScreenshot, "png", screenshot);
  
        //将验证码截图保存到本地  
        File screenshotLocation = new File("E:/"+timestamp+".jpg");
        FileUtils.copyFile(screenshot, screenshotLocation);  
    	String otherHost = "https://aip.baidubce.com/rest/2.0/ocr/v1/webimage";
    	// 本地图片路径
    	String filePath = "E:\\"+timestamp+".jpg";
    	try {
            byte[] imgData = FileUtil.readFileByBytes(filePath);
    	    String imgStr = Base64Util.encode(imgData);
    	    String params = URLEncoder.encode("image", "UTF-8") + 
                            "=" + URLEncoder.encode(imgStr, "UTF-8");              

    	//线上环境access_token有过期时间,客户端可自行缓存,过期后重新获取
    	             
    	String accessToken = AipOcrInit.getAuth("API_KEY", "SECRET_KEY");
                              
    	String result = HttpUtil.post(otherHost, accessToken, params);
    	BaiDuOCRBean baiDuOCRBean = 
                   com.alibaba.fastjson.JSONObject.toJavaObject(JSON.parseObject(result),
                   BaiDuOCRBean.class);

    	List list = baiDuOCRBean.getWords_result();
    	    		for (int i = 0; i < list.size(); i++) {
    	    			System.out.println(list.get(i).getWords());
    	    			verify = list.get(i).getWords().replace(".","");
    	    		}
    	        } catch (Exception e) {
    	            e.printStackTrace();
    	        }
        //寻找验证码编辑框
        driver.findElement(By.id("验证码编辑框ID")).clear();
        driver.findElement(By.id("验证码编辑框ID")).sendKeys(verify);

        //模拟点击登录按钮
        driver.findElement(By.className("登陆按钮ID")).click();

        //获取cookie信息
        cookies = driver.manage().getCookies();
        //driver.close(); 
    }
	@Override
	public Site getSite() {
		//将获取到的cookie信息添加到webmagic中
        for (Cookie cookie : cookies) { 
            site.addCookie(cookie.getName().toString(),cookie.getValue().toString());
        }

        return site.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
              AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");           
	}
	
	public static void main(String[] args) throws InterruptedException, IOException{
		DemoPageProcessor dpp = new DemoPageProcessor ();

        //调用selenium,进行模拟登录
        dpp.login();
      
      	// 开始执行
      	try {
      		    Spider.create(new  DemoPageProcessor()).addUrl("xxx").addPipeline(new 
                DemoPageProcessor()).addPipeline(new ConsolePipeline()).thread(5).run();
      		} catch (Exception e) {
      		    e.printStackTrace();
      		}
    }



}

如有错误欢迎指正.

你可能感兴趣的:(爬虫)