jsoup + json 解析网页

阅读更多
package com.teamdev.jxbrowser.chromium.demo_lingshui.baidunuomi.goods;

import java.awt.BorderLayout;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;

import javax.swing.JFrame;
import javax.swing.WindowConstants;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.hyjx.common.CommonUtil;
import com.hyjx.orclJdbcUtil.JDBCUtils;
import com.teamdev.jxbrowser.chromium.Browser;
import com.teamdev.jxbrowser.chromium.BrowserPreferences;
import com.teamdev.jxbrowser.chromium.JSValue;
import com.teamdev.jxbrowser.chromium.LoggerProvider;
import com.teamdev.jxbrowser.chromium.events.FinishLoadingEvent;
import com.teamdev.jxbrowser.chromium.events.LoadAdapter;
import com.teamdev.jxbrowser.chromium.swing.BrowserView;
/**
* 百度糯米(陵水市) plat_code(010) 美食
* @author 1
*
*/
public class ls_baidunuomi_goods_meishi{
public static void main(String[] args) throws Exception {


java.sql.Connection conOrcale  = null;
try {

conOrcale =    JDBCUtils.getConnection();

} catch (SQLException e1) {
e1.printStackTrace();
}
String sql = null;
PreparedStatement ps = null;
//创建添加sql
try{
sql = "insert into ls_nm_shop_good   "+
"  (good_id, good_name, good_url)  "+
"  values " +
"( ?   ,      ? ,     ?   )  ";
  
ps = conOrcale.prepareStatement(sql);
}catch (Exception e) {
e.printStackTrace();
}
String good_name="";
String good_url="";

       Document doc = null;
try{
doc = Jsoup.connect("https://lingshui.nuomi.com/326-page1?#j-sort-bar").userAgent("Mozilla")
.header("method", "GET")
.header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
.header("Accept-Encoding:","gzip, deflate, sdch")
.header("Accept-Language","zh-CN,zh;q=0.8")
.header("Cache-Control","max-age=0")
.header("Connection","keep-alive")
.header("Host","lingshui.nuomi.com")
.header("Upgrade-Insecure-Requests","1")
.header("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
.ignoreContentType(true).timeout(200000).data()
.get();
             } catch (Exception e2) {
e2.printStackTrace();
}
String text = doc.html();
Document document = Jsoup.parse(text);
        String pag= document.getElementsByAttributeValue("class","page-number").text();
        int page_totle =Integer.parseInt(pag.substring(pag.length()-1));
        System.out.println(page_totle);
        //循环翻页
       
        for(int i=1;i<=page_totle;i++){
        //睡眠2秒
        try {
Thread.sleep(2000);
        } catch (InterruptedException e1) {
    e1.printStackTrace();
    }
       
       
        doc = null;
     try{
     doc = Jsoup.connect("https://lingshui.nuomi.com/326-page"+i+"?#j-sort-bar").userAgent("Mozilla")
     .header("method", "GET")
     .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
     .header("Accept-Encoding:","gzip, deflate, sdch")
     .header("Accept-Language","zh-CN,zh;q=0.8")
     .header("Cache-Control","max-age=0")
     .header("Connection","keep-alive")
     .header("Host","lingshui.nuomi.com")
     .header("Upgrade-Insecure-Requests","1")
     .header("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
     .ignoreContentType(true).timeout(200000).data()
     .get();
                  } catch (Exception e2) {
     e2.printStackTrace();
     }
     text = doc.html();
     document = Jsoup.parse(text);
     Elements elements= document.getElementsByAttributeValue("class","itemlist clearfix").select("li");
     System.out.println("elements.size()"+elements.size());
    
     for(Element e : elements){
     Document parse = Jsoup.parse(e.html());
       good_name = parse.getElementsByAttributeValue("class","title").select("h4").text();
       good_url ="https:"+parse.getElementsByAttributeValue("class","contentbox").select("a").attr("href");
       System.out.println("good_name:"+good_name);
     System.out.println("good_url:"+good_url);
     try {
             //  (good_id, good_name, good_url)
     ps.setString(1,CommonUtil.getUUID32());
     ps.setString(2,good_name);
     ps.setString(3,good_url);
     ps.executeUpdate();
     } catch (Exception e1) {
     // TODO Auto-generated catch block
     e1.printStackTrace();
     }
     }
    
/**
* 第一次加载
* */     
     if(i==1){
        doc = null;
     try{
     doc = Jsoup.connect("https://lingshui.nuomi.com/326?async_load_page=2&_=1477897343482").userAgent("Mozilla")
     .header("method", "GET")
     .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
     .header("Accept-Encoding:","gzip, deflate, sdch")
     .header("Accept-Language","zh-CN,zh;q=0.8")
     .header("Cache-Control","max-age=0")
     .header("Connection","keep-alive")
     .header("Host","lingshui.nuomi.com")
     .header("Upgrade-Insecure-Requests","1")
     .header("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
     .ignoreContentType(true).timeout(200000).data()
     .get();
                  } catch (Exception e2) {
     e2.printStackTrace();
     }
     text = doc.html();
     document = Jsoup.parse(text);
     elements= document.select("li");
     System.out.println("elements.size()"+elements.size());
    
     for(Element e : elements){
     Document parse = Jsoup.parse(e.html());
       good_name = parse.getElementsByAttributeValue("class","title").select("h4").text();
       good_url ="https:"+parse.getElementsByAttributeValue("class","contentbox").select("a").attr("href");
       System.out.println("good_name:"+good_name);
     System.out.println("good_url:"+good_url);
     try {
             //  (good_id, good_name, good_url)
     ps.setString(1,CommonUtil.getUUID32());
     ps.setString(2,good_name);
     ps.setString(3,good_url);
     ps.executeUpdate();
     } catch (Exception e1) {
     // TODO Auto-generated catch block
     e1.printStackTrace();
     }
     }
     doc = null;
     try{
     doc = Jsoup.connect("https://lingshui.nuomi.com/326?async_load_page=3&_=1477897343484").userAgent("Mozilla")
     .header("method", "GET")
     .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
     .header("Accept-Encoding:","gzip, deflate, sdch")
     .header("Accept-Language","zh-CN,zh;q=0.8")
     .header("Cache-Control","max-age=0")
     .header("Connection","keep-alive")
     .header("Host","lingshui.nuomi.com")
     .header("Upgrade-Insecure-Requests","1")
     .header("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
     .ignoreContentType(true).timeout(200000).data()
     .get();
                  } catch (Exception e2) {
     e2.printStackTrace();
     }
     text = doc.html();
     document = Jsoup.parse(text);
       elements= document.select("li");
     System.out.println("elements.size()"+elements.size());
    
     for(Element e : elements){
     Document parse = Jsoup.parse(e.html());
       good_name = parse.getElementsByAttributeValue("class","title").select("h4").text();
       good_url ="https:"+parse.getElementsByAttributeValue("class","contentbox").select("a").attr("href");
       System.out.println("good_name:"+good_name);
     System.out.println("good_url:"+good_url);
     try {
             //  (good_id, good_name, good_url)
     ps.setString(1,CommonUtil.getUUID32());
     ps.setString(2,good_name);
     ps.setString(3,good_url);
     ps.executeUpdate();
     } catch (Exception e1) {
     // TODO Auto-generated catch block
     e1.printStackTrace();
     }
     }
    
        }
     System.out.println("第"+i+"页");
        }
       
       
      

      
       
}
}

你可能感兴趣的:(html,jsoup)