pom:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>mybatis</groupId>
  <artifactId>test</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>test</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <!-- Adds the mybatis-generator plugin. -->
  <!-- To run it, enter "mybatis-generator:generate" in the Goals field. -->
  <build>
    <plugins>
      <plugin>
        <groupId>org.mybatis.generator</groupId>
        <artifactId>mybatis-generator-maven-plugin</artifactId>
        <version>1.3.2</version>
        <configuration>
          <verbose>true</verbose>
          <overwrite>true</overwrite>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.38</version>
    </dependency>
    <dependency>
      <groupId>org.mybatis</groupId>
      <artifactId>mybatis</artifactId>
      <version>3.3.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>3.12</version>
    </dependency>
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.2</version>
    </dependency>
    <dependency>
      <groupId>net.sourceforge.jexcelapi</groupId>
      <artifactId>jxl</artifactId>
      <version>2.6.12</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <!--
      junit was previously declared twice (3.8.1 with test scope, and 4.12).
      Dependency declarations must be unique and only the later one took
      effect, so the single 4.12 declaration is kept with its scope unchanged.
    -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.3</version>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-server</artifactId>
      <version>2.53.0</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
  </dependencies>
</project>
package test; import java.util.Arrays; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.remote.DesiredCapabilities; public class DriverFactory { public static ChromeDriver create() { // TODO Auto-generated method stub String chromdriver = "C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe"; System.setProperty("webdriver.chrome.driver", chromdriver); ChromeOptions options = new ChromeOptions(); DesiredCapabilities capabilities = DesiredCapabilities.chrome(); capabilities.setCapability("chrome.switches", Arrays.asList("--start-maximized")); options.addArguments("--test-type", "--start-maximized"); ChromeDriver driver = new ChromeDriver(options); return driver; } }
//import java.io.File; package test; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Properties; import java.util.Set; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.openqa.jetty.http.SSORealm; import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.Platform; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.ie.InternetExplorerDriver; import org.openqa.selenium.remote.DesiredCapabilities; import mx4j.log.Log; public class XiciLogin2 { public Logger log = Logger.getLogger(Main.class); public static Set<Cookie> cookies = new HashSet<Cookie>(); public static ChromeDriver driver = DriverFactory.create(); /** * 抓取到每一个分页上所有详细页链接 * * @param url */ public List<String> crawlSource(String url) { int time = 1; // System.out.println("开始抓: " + url); log.info("开始抓: " + url); List<String> sourceUrls = new ArrayList<String>(); String baseUrl = "http://www.xici.net"; driver.get(url); Document document = Jsoup.parse(driver.getPageSource()); // WebElement webElement = // driver.findElement(By.xpath(".//*[@id='board_t']/tbody/tr/td[2]/a")); // 选出所有详细页链接 Elements elements = document.select("table#board_t tbody tr"); // System.out.println(elements); if (elements != null) { for (Element element : elements) { if (element.select("td").isEmpty()) { continue; } String targets = element.select("td a[onclick=this.parentNode.className = 'visited';]").attr("href"); if (targets == "" || targets == null) { continue; } targets = baseUrl + targets; // System.out.println(targets); 
sourceUrls.add(targets); } } else { System.out.println(url + "中没有详细页链接~~"); } System.out.println(sourceUrls.size()); if (sourceUrls.size() == 0 && time <= 5) { System.out.println("抓不到啦~ 重新抓一下"); crawlSource(url); time++; } return sourceUrls; } /** * 解析详细页 出东西 */ public void crawlTarget(String url) { driver.get(url); Document document = Jsoup.parse(driver.getPageSource()); System.out.println("抓" + url + "的标题"); // 取标题 Element element = document.select("div#doc_tit h1").first(); if (element != null) { System.out.println("标题:" + element.text()); } else { System.out.println(""); } } public static void main(String[] args) { PropertyConfigurator.configure("log4j.properties"); XiciLogin2 xc = new XiciLogin2(); String site = "http://www.xici.net/b1513005/"; try { // xc.xiciLogin(); // xc.crawlSource("http://www.xici.net/b1402132/1"); // xc.crawlTarget("http://www.xici.net/d191739198.htm"); // xc.getMaxPageNum("http://www.xici.net/b1468535/"); int page = 1; // 获取入口页最大分页页码数 int maxPageNum = xc.getMaxPageNum(site); // 根据最大分页数挨个抓取各个分页 do { // 拿到要抓取得分页url sourceUrl String sourceUrl = site + page; System.out.println("分页: " + sourceUrl); // 抓取分页 解析出目标页列表 List<String> targetsList = xc.crawlSource(sourceUrl); // 抓目标页 if (targetsList.isEmpty()) { System.out.println("没抓到详细页!!"); } else { for (String target : targetsList) { try { xc.crawlTarget(target); Thread.sleep(3000); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } } page++; Thread.sleep(3500); } while (page <= 15); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * 获取当前入口site的最大分页数 * * @param url * @return */ public int getMaxPageNum(String site) { Document document = null; int maxPageNum = 0; try { document = Jsoup.connect(site).get(); Element element = document.select("div#page").first(); String s = element.text(); if (s.contains("共")) { s = s.split("共")[1]; s = s.split("页")[0]; } System.out.println(s); maxPageNum = Integer.parseInt(s); } catch 
(IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return maxPageNum; } public void xiciLogin() throws Exception { System.setProperty("webdriver.chrome.driver", "C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe"); driver.get("http://account.xici.net/login"); WebElement user = driver.findElement(By.name("username")); WebElement pwa = driver.findElement(By.name("password")); // 分别将用户名和密码文本框清空 user.clear(); pwa.clear(); // 输入用户名和密码 user.sendKeys("18611693832"); pwa.sendKeys("zero911108"); // 找到登陆按钮点击 // driver.findElement(By.name("TANGRAM__PSP_3__submit")).click(); driver.findElement(By.xpath("html/body/div[3]/div[2]/div[2]/form/div[4]/button")).click(); // 输出title System.out.println(driver.getTitle()); cookies = driver.manage().getCookies(); System.out.println(cookies); for (Cookie cookie2 : cookies) { driver.manage().addCookie(cookie2); System.out.println(cookie2); } // 能打开15页说明登陆成功 // driver.get("http://www.xici.net/b1402132/15"); } }