WebDriver 登录,Jsoup 抓取内容

1. 环境

pom:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>mybatis</groupId>
	<artifactId>test</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>test</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	</properties>

	<!-- Adds the mybatis-generator plugin. -->
	<!-- To run it, enter "mybatis-generator:generate" in the Goals box. -->
	<build>
		<plugins>
			<plugin>
				<groupId>org.mybatis.generator</groupId>
				<artifactId>mybatis-generator-maven-plugin</artifactId>
				<version>1.3.2</version>
				<configuration>
					<verbose>true</verbose>
					<overwrite>true</overwrite>
				</configuration>
			</plugin>
		</plugins>
	</build>


	<dependencies>
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.38</version>
		</dependency>

		<dependency>
			<groupId>org.mybatis</groupId>
			<artifactId>mybatis</artifactId>
			<version>3.3.1</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>3.12</version>
		</dependency>

		<dependency>
			<groupId>commons-logging</groupId>
			<artifactId>commons-logging</artifactId>
			<version>1.2</version>
		</dependency>
		<dependency>
			<groupId>net.sourceforge.jexcelapi</groupId>
			<artifactId>jxl</artifactId>
			<version>2.6.12</version>
		</dependency>

		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.5.2</version>
		</dependency>

		<!-- junit is declared exactly once. A second declaration of the same
			groupId:artifactId (previously 3.8.1 with test scope) makes Maven emit a
			duplicate-declaration warning and only the last one takes effect, so the
			4.12 entry below is the one that was already being resolved. -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
		</dependency>
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.8.3</version>
		</dependency>
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-server</artifactId>
			<version>2.53.0</version>
		</dependency>
		<dependency>
			<groupId>log4j</groupId>
			<artifactId>log4j</artifactId>
			<version>1.2.17</version>
		</dependency>


	</dependencies>

</project>

2. 初始化WebDriver的类 DriverFactory.java

package test;

import java.util.Arrays;

import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.DesiredCapabilities;

/**
 * Creates the Chrome WebDriver instance used by the crawler.
 */
public class DriverFactory {

	/** Name of the system property that holds the chromedriver executable path. */
	private static final String DRIVER_PROPERTY = "webdriver.chrome.driver";

	/** Fallback chromedriver location, applied only when the property is not already set. */
	private static final String DEFAULT_DRIVER_PATH =
			"C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe";

	/**
	 * Starts a new Chrome browser session.
	 *
	 * <p>If the {@code webdriver.chrome.driver} system property has already been set
	 * (for example with {@code -Dwebdriver.chrome.driver=...}) that value is kept;
	 * otherwise the built-in default path is used.
	 *
	 * @return a newly started {@link ChromeDriver}; the caller is responsible for
	 *         calling {@code quit()} on it
	 */
	public static ChromeDriver create() {
		String configuredPath = System.getProperty(DRIVER_PROPERTY);
		if (configuredPath == null || configuredPath.isEmpty()) {
			System.setProperty(DRIVER_PROPERTY, DEFAULT_DRIVER_PATH);
		}

		// The options object is the only configuration actually handed to the driver.
		ChromeOptions options = new ChromeOptions();
		options.addArguments("--test-type", "--start-maximized");
		return new ChromeDriver(options);
	}

}



3. 西祠胡同的登录抓取类 XiciLogin2.java

//import java.io.File;
package test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.AfterClass;

import org.junit.BeforeClass;

import org.junit.Test;
import org.openqa.jetty.http.SSORealm;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.Platform;

import org.openqa.selenium.WebDriver;

import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;

import org.openqa.selenium.remote.DesiredCapabilities;

import mx4j.log.Log;

/**
 * Crawls a xici.net board with a Selenium-driven Chrome browser and parses the
 * rendered pages with jsoup.
 *
 * <p>The flow in {@link #main(String[])} is: read the board's page count, load each
 * list page, collect the detail-page links on it, then open each detail page and
 * print its title.
 */
public class XiciLogin2 {

	/** Prefix added to the relative hrefs found on list pages. */
	private static final String BASE_URL = "http://www.xici.net";

	/** Maximum number of times one list page is loaded before giving up on it. */
	private static final int MAX_CRAWL_ATTEMPTS = 5;

	/** Upper bound on the number of list pages crawled in one run. */
	private static final int MAX_PAGES = 15;

	public Logger log = Logger.getLogger(XiciLogin2.class);
	public static Set<Cookie> cookies = new HashSet<Cookie>();
	public static ChromeDriver driver = DriverFactory.create();

	/**
	 * Collects the detail-page links found on one list page.
	 *
	 * <p>The page is reloaded up to {@value #MAX_CRAWL_ATTEMPTS} times while no link
	 * is found; the result of the last attempt is returned.
	 *
	 * @param url list-page URL to load
	 * @return absolute detail-page URLs; empty (never {@code null}) when none were found
	 */
	public List<String> crawlSource(String url) {
		log.info("开始抓: " + url);
		List<String> sourceUrls = new ArrayList<String>();
		for (int attempt = 1; attempt <= MAX_CRAWL_ATTEMPTS; attempt++) {
			sourceUrls = extractDetailLinks(url);
			System.out.println(sourceUrls.size());
			if (!sourceUrls.isEmpty()) {
				break;
			}
			if (attempt < MAX_CRAWL_ATTEMPTS) {
				System.out.println("抓不到啦~ 重新抓一下");
			}
		}
		return sourceUrls;
	}

	/**
	 * Loads {@code url} in the browser once and extracts the detail-page links from
	 * the rows of the {@code table#board_t} element.
	 */
	private List<String> extractDetailLinks(String url) {
		List<String> links = new ArrayList<String>();
		driver.get(url);
		Document document = Jsoup.parse(driver.getPageSource());

		Elements rows = document.select("table#board_t tbody tr");
		if (rows.isEmpty()) {
			System.out.println(url + "中没有详细页链接~~");
			return links;
		}
		for (Element row : rows) {
			if (row.select("td").isEmpty()) {
				continue;
			}
			String href = row.select("td a[onclick=this.parentNode.className = 'visited';]").attr("href");
			// Compare by content, not by reference; rows without a matching link are skipped.
			if (href == null || href.isEmpty()) {
				continue;
			}
			links.add(BASE_URL + href);
		}
		return links;
	}

	/**
	 * Opens a detail page and prints its title (the first {@code div#doc_tit h1}).
	 * An empty line is printed when the page has no such element.
	 *
	 * @param url detail-page URL to load
	 */
	public void crawlTarget(String url) {
		driver.get(url);
		Document document = Jsoup.parse(driver.getPageSource());
		System.out.println("抓" + url + "的标题");
		Element element = document.select("div#doc_tit h1").first();
		if (element != null) {
			System.out.println("标题:" + element.text());
		} else {
			System.out.println("");
		}
	}

	/**
	 * Crawls the list pages of one board and prints the title of every detail page.
	 *
	 * <p>The number of pages crawled is the board's reported page count capped at
	 * {@value #MAX_PAGES}; when the page count cannot be determined,
	 * {@value #MAX_PAGES} pages are attempted. The browser is always closed on exit.
	 */
	public static void main(String[] args) {
		PropertyConfigurator.configure("log4j.properties");

		XiciLogin2 xc = new XiciLogin2();
		String site = "http://www.xici.net/b1513005/";

		try {
			// Highest page number reported by the entry page (0 when unknown).
			int maxPageNum = xc.getMaxPageNum(site);
			int lastPage = maxPageNum > 0 ? Math.min(maxPageNum, MAX_PAGES) : MAX_PAGES;

			for (int page = 1; page <= lastPage; page++) {
				String sourceUrl = site + page;
				System.out.println("分页: " + sourceUrl);

				List<String> targetsList = xc.crawlSource(sourceUrl);
				if (targetsList.isEmpty()) {
					System.out.println("没抓到详细页!!");
				} else {
					for (String target : targetsList) {
						try {
							xc.crawlTarget(target);
						} catch (RuntimeException e) {
							// One broken detail page must not stop the rest of the run.
							xc.log.error("Failed to crawl detail page " + target, e);
						}
						// Pause between requests to avoid hammering the site.
						Thread.sleep(3000);
					}
				}
				Thread.sleep(3500);
			}
		} catch (InterruptedException e) {
			// Restore the interrupt flag so the caller can observe the interruption.
			Thread.currentThread().interrupt();
			xc.log.warn("Crawl interrupted", e);
		} catch (RuntimeException e) {
			xc.log.error("Crawl aborted", e);
		} finally {
			// Always close the browser so no Chrome/chromedriver process is left behind.
			driver.quit();
		}
	}

	/**
	 * Reads the highest page number of a board from the text of its
	 * {@code div#page} element, taking whatever sits between "共" and "页".
	 *
	 * @param site URL of the board's entry page
	 * @return the page count, or {@code 0} when the page cannot be fetched, has no
	 *         pager element, or the pager text does not contain a number
	 */
	public int getMaxPageNum(String site) {
		try {
			Document document = Jsoup.connect(site).get();
			Element pager = document.select("div#page").first();
			if (pager == null) {
				log.warn("No pager element (div#page) found on " + site);
				return 0;
			}

			String s = pager.text();
			int start = s.indexOf("共");
			if (start >= 0) {
				s = s.substring(start + 1);
				int end = s.indexOf("页");
				if (end >= 0) {
					s = s.substring(0, end);
				}
			}
			// Surrounding whitespace would make Integer.parseInt fail.
			s = s.trim();
			System.out.println(s);
			return Integer.parseInt(s);
		} catch (IOException e) {
			log.error("Could not fetch " + site, e);
		} catch (NumberFormatException e) {
			log.warn("Pager text on " + site + " does not contain a page count", e);
		}
		return 0;
	}

	/**
	 * Logs in to xici.net through the browser and stores the session cookies in
	 * {@link #cookies}.
	 *
	 * <p>Credentials are not kept in source. They are read from the system properties
	 * {@code xici.username} / {@code xici.password}, falling back to the environment
	 * variables {@code XICI_USERNAME} / {@code XICI_PASSWORD}.
	 *
	 * @throws IllegalStateException if either credential is not configured
	 * @throws Exception if the login page cannot be driven
	 */
	public void xiciLogin() throws Exception {
		String username = readSetting("xici.username", "XICI_USERNAME");
		String password = readSetting("xici.password", "XICI_PASSWORD");

		driver.get("http://account.xici.net/login");
		WebElement user = driver.findElement(By.name("username"));
		WebElement pwa = driver.findElement(By.name("password"));

		// Clear both fields before typing.
		user.clear();
		pwa.clear();

		user.sendKeys(username);
		pwa.sendKeys(password);

		// Click the login button.
		driver.findElement(By.xpath("html/body/div[3]/div[2]/div[2]/form/div[4]/button")).click();
		System.out.println(driver.getTitle());

		cookies = driver.manage().getCookies();
		for (Cookie cookie2 : cookies) {
			driver.manage().addCookie(cookie2);
		}
		// Cookie values are session secrets, so only the count is reported.
		log.info("Captured " + cookies.size() + " cookies after login");
	}

	/**
	 * Returns a required setting from a system property, falling back to an
	 * environment variable.
	 */
	private static String readSetting(String property, String envVar) {
		String value = System.getProperty(property);
		if (value == null || value.isEmpty()) {
			value = System.getenv(envVar);
		}
		if (value == null || value.isEmpty()) {
			throw new IllegalStateException(
					"Missing setting: pass -D" + property + "=... or set the " + envVar + " environment variable");
		}
		return value;
	}

}













你可能感兴趣的:(WebDriver 登录,Jsoup 抓取内容)