第一次发博客,如有雷同纯属故意。
最近需要抓取一个网站的页面数据,在网上查了很多资料,最后选择用 Selenium 和 jsoup。
闲话少说,开工:
要抓取数据,先得登录网站。登录流程如下:
首先是jar仓库:
代码:
/**
 * Opens the site in Chrome as an already-logged-in user.
 *
 * <p>Flow: start a chromedriver service, open the login page so the browser
 * is on the target domain, replace the browser's cookies with the ones
 * returned by {@link #getLoginCookeis()}, then navigate to the page that
 * requires a login. The browser session and the driver service are always
 * shut down before this method returns.
 *
 * <p>Failures are logged rather than thrown: if the driver service cannot be
 * started, or the login request yields no cookie map, the method returns
 * without visiting the protected page.
 */
public static void getIndex2() {
    // A single, explicitly managed chromedriver process. The executable path
    // is given here, so the "webdriver.chrome.driver" system property is not
    // needed when the driver is built from this service.
    ChromeDriverService service = new ChromeDriverService.Builder()
            .usingDriverExecutable(new File("./driver/chromedriver.exe"))
            .usingAnyFreePort()
            .build();
    try {
        service.start();
    } catch (IOException ex) {
        Logger.getLogger(kechengbiaoIndex.class.getName()).log(Level.SEVERE, null, ex);
        // Without a running driver service there is nothing to drive.
        return;
    }

    try {
        // Bind the browser to the service started above; a no-arg
        // ChromeDriver would launch a second, unmanaged chromedriver process.
        WebDriver webDriver = new ChromeDriver(service);
        try {
            // The login page must be loaded first so that cookies can be
            // set for this site's domain.
            webDriver.get("http://www.*****.com/loginnew.asp");
            webDriver.manage().deleteAllCookies();

            // Cookies of a session that was logged in through jsoup.
            Map<?, ?> cookies = getLoginCookeis();
            if (cookies == null) {
                Logger.getLogger(kechengbiaoIndex.class.getName())
                        .log(Level.SEVERE, "Login request failed; no cookies available for the browser");
                return;
            }

            // Copy every login cookie into the browser session.
            for (Map.Entry<?, ?> entry : cookies.entrySet()) {
                Cookie cookie = new Cookie(String.valueOf(entry.getKey()), String.valueOf(entry.getValue()));
                webDriver.manage().addCookie(cookie);
            }

            // Page that is only reachable after login.
            webDriver.get("http://www.******.com/***.asp");
            //TODO: drive the page through webDriver and parse the data
        } finally {
            // quit() ends the whole session (all windows plus the driver
            // connection); close() would only close the current window.
            webDriver.quit();
        }
    } finally {
        // Always release the chromedriver process, even on failure.
        service.stop();
    }
}
/** Timeout, in milliseconds, handed to the jsoup connection for the login request. */
private static final int TIMEOUT = 10000;
/** User-Agent header sent with the login request (a desktop Chrome signature). */
private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36";
/**
 * Address the login request is posted to.
 * NOTE(review): the current value is placeholder text, not a real URL - replace it
 * with the endpoint the site's login form submits to.
 */
private static final String URL="登陆请求发送的url";
/**
 * Posts the login form with jsoup and returns the cookies of the response.
 *
 * <p>The form fields and the cookies sent along with the request are
 * hard-coded below; they have to mirror what the browser sends when logging
 * in manually (inspect the real login request to obtain them).
 * NOTE(review): the account name, password and session cookie values are
 * embedded in source - consider loading them from configuration instead.
 *
 * @return the cookies carried by the login response, or {@code null} if the
 *         request failed with an {@link IOException} (the error is logged)
 */
public static Map<String, String> getLoginCookeis() {
    // Form fields submitted by the login page.
    Map<String, String> formData = new HashMap<String, String>();
    formData.put("username", "用户名");
    formData.put("password", "密码");
    formData.put("Action", "Login");
    formData.put("Submit.x", "40");
    formData.put("Submit.y", "15");

    // Cookies sent together with the login request.
    Map<String, String> requestCookies = new HashMap<String, String>();
    requestCookies.put("ASPSESSIONIDASBTBDDT", "ACABMBFDKBGHOLHBHMKKMHLA");
    requestCookies.put("Sailing", "Skin=");
    requestCookies.put("wwwkechengbiaonetjecas", "UserName=%D5%C5%C8%CB%C0%FB&AdminLoginCode=&AdminName=%D5%C5%C8%CB%C0%FB&LastPassword=v0rdu3g775Uqy735&UnreadMsg=&UserPassword=877b0591474be1fb&RndPassword=v0rdu3g775Uqy735&AdminPassword=877b0591474be1fb");

    try {
        // Send the login request.
        Connection.Response response = Jsoup.connect(URL)
                .postDataCharset("GB2312") // charset used to encode the form data
                .data(formData)            // request parameters
                .userAgent(USER_AGENT)
                .cookies(requestCookies)   // cookies sent with the request
                .timeout(TIMEOUT)          // timeout
                .method(Connection.Method.POST)
                .execute();
        // Cookies of the now logged-in session.
        return response.cookies();
    } catch (IOException ex) {
        Logger.getLogger(KechengbiaoLogin.class.getName()).log(Level.SEVERE, null, ex);
        // null tells the caller that the login request itself failed.
        return null;
    }
}