JAVA爬虫爬取携程酒店数据selenium实现

  在爬取携程的时候碰到很多的壁垒,接下来分析所有过程

  1.根据以往经验最初想到用jsoup去解析每个HTML元素,然后拿到酒店数据,然后发现解析HTML根本拿不到id为hotel_list的div,所以也就无法通过静态的HTML去获取数据

JAVA爬虫爬取携程酒店数据selenium实现_第1张图片

  可以看到标签里面根本就是没有数据的,因为这里的数据是动态加载的,所以无法直接拿取,接下来采用动态方式拿取

JAVA爬虫爬取携程酒店数据selenium实现_第2张图片 

2.第一种方法就不行, 于是疯狂查博文,找到了携程动态数据的接口,在AjaxHotelList.aspx里我找到了酒店,里面有HTML的代码拼接,数据都在这里了,怎么拿取呢?

 JAVA爬虫爬取携程酒店数据selenium实现_第3张图片

 模拟post请求,然后拿数据

 在发送请求的时候注意下图红框中的信息。请求头必须要加上来源信息和浏览器信息。发送的参数就是Form Data里的数据,可以只传部分数据。

JAVA爬虫爬取携程酒店数据selenium实现_第4张图片 

请求发送后,很遗憾还是没有拿到数据,可能是一些加密的处理。

3.虽然模拟请求拿不到数据,但是大致方向还是找到了,还剩下一种办法,就是用selenium自动化测试框架模拟浏览器,从浏览器页面中拿取数据。(由于能力有限,并没有破解汉字识别验证码,这里用人工验证代替)

准备:

        下载 selenium

        下载Chromedriver(这里需要与自己的Chrome浏览器版本相对应,我下载的是当时最新的版本,后面放出链接)

 

上代码

import com.nf.xiecheng.entyty.Hotel;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.awt.*;
import java.awt.event.KeyEvent;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class SelectFlight {
    // Screen resolution of the machine running the browser; used for the slider verification.
    private final int MAX_X=2560;
    private final int MAX_Y=1408;
    // Screen position of the slider-verification widget once webdriver has started the browser.
    private final int TARGET_X=MAX_X-733;
    private final int TARGET_Y=MAX_Y-477;
    // Hotel entries that main prints; typed with Hotel so it can be iterated without casts
    // (a raw List here makes "for (Hotel ph : hotelList)" a compile error).
    private static List<Hotel> hotelList = new ArrayList<Hotel>();
    /**
     * Drives a Chrome browser through Selenium: opens the Ctrip hotel site, logs in, runs the
     * hotel search, pages forward and prints every entry held in {@code hotelList}.
     *
     * <p>The login name and password are read from the {@code CTRIP_LOGIN_NAME} and
     * {@code CTRIP_PASSWORD} environment variables so that no credentials are kept in source.
     * The chromedriver location can be supplied with {@code -Dwebdriver.chrome.driver=...};
     * the built-in path is only a fallback for when that property is not set.
     *
     * @param args unused
     * @throws InterruptedException  if the thread is interrupted during one of the fixed waits
     * @throws IllegalStateException if either credential environment variable is missing
     */
    public static  void main(String args[]) throws InterruptedException {
        String loginName = System.getenv("CTRIP_LOGIN_NAME");
        String password = System.getenv("CTRIP_PASSWORD");
        if (loginName == null || loginName.isEmpty() || password == null || password.isEmpty()) {
            // Fail before a browser is launched so nothing is left running.
            throw new IllegalStateException(
                    "Set the CTRIP_LOGIN_NAME and CTRIP_PASSWORD environment variables before running");
        }

        // Location of the chromedriver executable; only used when the caller did not provide one.
        if (System.getProperty("webdriver.chrome.driver") == null) {
            System.setProperty("webdriver.chrome.driver","D:\\myporject\\IDEworkspace\\chromedriver.exe");
        }

        SelectFlight s = new SelectFlight();
        WebDriver webDriver = new ChromeDriver();
        try {
            webDriver.get("https://hotels.ctrip.com");
            Thread.sleep(1000);

            // Go to the login page. By.className works on every WebDriver implementation,
            // unlike the ChromeDriver-specific findElementByClassName helper.
            WebElement login = webDriver.findElement(By.className("person-text"));
            login.click();

            // Fill in the login form.
            Thread.sleep(1000);
            WebElement phone = webDriver.findElement(By.id("nloginname"));
            phone.sendKeys(loginName);
            WebElement passw = webDriver.findElement(By.id("npwd"));
            passw.sendKeys(password);

            // Slider verification.
            s.Robotcheck();
            // Wait 10 seconds so a person can solve the character-recognition captcha by hand.
            Thread.sleep(10000);

            // Submit the login form.
            WebElement nsubmit = webDriver.findElement(By.id("nsubmit"));
            nsubmit.click();

            // Start the hotel search.
            Thread.sleep(2000);
            WebElement btnSearch = webDriver.findElement(By.id("btnSearch"));
            btnSearch.click();
            Thread.sleep(5000);
            Thread.sleep(1000);

            // Click the next-page link, then click it once more through getNextPage.
            // NOTE(review): this advances twice before anything is scraped - confirm intended.
            WebElement nextPage = webDriver.findElement(By.id("downHerf"));
            nextPage.click();
            s.getNextPage(webDriver, "downHerf");

            // Collect the hotel information from the page now displayed.
            s.getHotelMassge(webDriver);

            // Iterate as Object so this compiles whether or not hotelList carries a type argument.
            for (Object hotel : hotelList) {
                System.out.println(hotel);
            }
        } finally {
            // quit() closes every window and ends the driver process, even after a failure.
            webDriver.quit();
        }
    }

    //获取下一页
    /**
     * Looks up the element whose id is {@code nextPage}, clicks it and returns it.
     *
     * @param webDriver driver of the browser session to act on
     * @param nextPage  id attribute of the element to click
     * @return the element that was clicked
     */
    public WebElement getNextPage(WebDriver webDriver,String nextPage){
        final By pagerLocator = By.id(nextPage);
        final WebElement pagerLink = webDriver.findElement(pagerLocator);
        pagerLink.click();
        return pagerLink;
    }
    //填入酒店信息
    public void getHotelMassge(WebDriver webDriver){
        WebElement hotel_list = webDriver.findElement(By.id("hotel_list"));
        List hotel_item = hotel_list.findElements(By.className("hotel_item"));
        System.err.println(hotel_item.size());
        Iterator it = hotel_item.iterator();
        while (it.hasNext()){
            Hotel entry = new Hotel();
            WebElement hotel = it.next();
            //酒店名称
            WebElement hotel_name = hotel.findElement(By.className("hotel_name"));
            WebElement a = hotel_name.findElement(By.tagName("a"));
            entry.setName(a.getAttribute("title"));
            //id
            String id = hotel_name.getAttribute("data-id");
            entry.setId(id);
            //酒店地址
            WebElement hotel_item_htladdress = hotel.findElement(By.className("hotel_item_htladdress"));
            List a_area = hotel_item_htladdress.findElements(By.tagName("a"));
            StringBuffer areabuffer = new StringBuffer();
            for(int i = 0; i 
   
import com.nf.xiecheng.entyty.Hotel;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import java.awt.*;
import java.awt.event.KeyEvent;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class SelectFlight {
    // Screen resolution of the machine running the browser; used for the slider verification.
    private final int MAX_X=2560;
    private final int MAX_Y=1408;
    // Screen position of the slider-verification widget once webdriver has started the browser.
    private final int TARGET_X=MAX_X-733;
    private final int TARGET_Y=MAX_Y-477;
    // Hotel entries that main prints; typed with Hotel so it can be iterated without casts
    // (a raw List here makes "for (Hotel ph : hotelList)" a compile error).
    private static List<Hotel> hotelList = new ArrayList<Hotel>();
    public static  void main(String args[]) throws InterruptedException {
        SelectFlight s = new SelectFlight();
        System.setProperty("webdriver.chrome.driver","D:\\myporject\\IDEworkspace\\chromedriver.exe");//chromedriver驱动地址,自己所放入的目录
        WebDriver webDriver = new ChromeDriver();
        webDriver.get("https://hotels.ctrip.com");
        Thread.sleep(1000);
        //跳转登陆页面
        WebElement login = ((ChromeDriver) webDriver).findElementByClassName("person-text");
        login.click();
        //登陆信息
        Thread.sleep(1000);
        WebElement phone = webDriver.findElement(By.id("nloginname"));
        phone.sendKeys("13647610831");
        WebElement passw = webDriver.findElement(By.id("npwd"));
        passw.sendKeys("a96968426");
        //滑块验证
        s.Robotcheck();
        //睡眠10秒,用于人工验证汉字识别
        Thread.sleep(10000);
        //点击登陆
        WebElement nsubmit = webDriver.findElement(By.id("nsubmit"));
        nsubmit.click();
        //点击酒店搜索
        Thread.sleep(2000);
        WebElement btnSearch = webDriver.findElement(By.id("btnSearch"));
        btnSearch.click();
        Thread.sleep(5000);
        //进入主页
        String pageSource = webDriver.getPageSource();
        Thread.sleep(1000);
        WebElement nextPage = webDriver.findElement(By.id("downHerf"));
        nextPage.click();
        //下一页
        WebElement downHerfa = s.getNextPage(webDriver, "downHerf");
        s.getHotelMassge(webDriver);//获取酒店信息
        for (Hotel ph:hotelList
             ) {
            System.out.println(ph.toString());
        }
        webDriver.close();
        webDriver.quit();

    }

    //获取下一页
    /**
     * Looks up the element whose id is {@code nextPage}, clicks it and returns it.
     *
     * @param webDriver driver of the browser session to act on
     * @param nextPage  id attribute of the element to click
     * @return the element that was clicked
     */
    public WebElement getNextPage(WebDriver webDriver,String nextPage){
        final By pagerLocator = By.id(nextPage);
        final WebElement pagerLink = webDriver.findElement(pagerLocator);
        pagerLink.click();
        return pagerLink;
    }
    //填入酒店信息
    public void getHotelMassge(WebDriver webDriver){
        WebElement hotel_list = webDriver.findElement(By.id("hotel_list"));
        List hotel_item = hotel_list.findElements(By.className("hotel_item"));
        System.err.println(hotel_item.size());
        Iterator it = hotel_item.iterator();
        while (it.hasNext()){
            Hotel entry = new Hotel();
            WebElement hotel = it.next();
            //酒店名称
            WebElement hotel_name = hotel.findElement(By.className("hotel_name"));
            WebElement a = hotel_name.findElement(By.tagName("a"));
            entry.setName(a.getAttribute("title"));
            //id
            String id = hotel_name.getAttribute("data-id");
            entry.setId(id);
            //酒店地址
            WebElement hotel_item_htladdress = hotel.findElement(By.className("hotel_item_htladdress"));
            List a_area = hotel_item_htladdress.findElements(By.tagName("a"));
            StringBuffer areabuffer = new StringBuffer();
            for(int i = 0; i 
  

 

 

 

 

你可能感兴趣的:(爬虫,java,java)