jsoup + json 解析网页

package com.teamdev.jxbrowser.chromium.demo_lingshui.baidunuomi.goods;

import java.awt.BorderLayout;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;

import javax.swing.JFrame;
import javax.swing.WindowConstants;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.hyjx.common.CommonUtil;
import com.hyjx.orclJdbcUtil.JDBCUtils;
import com.teamdev.jxbrowser.chromium.Browser;
import com.teamdev.jxbrowser.chromium.BrowserPreferences;
import com.teamdev.jxbrowser.chromium.JSValue;
import com.teamdev.jxbrowser.chromium.LoggerProvider;
import com.teamdev.jxbrowser.chromium.events.FinishLoadingEvent;
import com.teamdev.jxbrowser.chromium.events.LoadAdapter;
import com.teamdev.jxbrowser.chromium.swing.BrowserView;
/**
 * Scrapes the Baidu Nuomi food listing for Lingshui (plat_code 010) and stores the
 * name and URL of every listed deal in the {@code ls_nm_shop_good} table.
 *
 * <p>The listing is paginated ({@code 326-page1}, {@code 326-page2}, ...). The first
 * page additionally has two lazily loaded chunks ({@code async_load_page=2} and
 * {@code async_load_page=3}) that are fetched right after it.
 *
 * @author 1
 */
public class ls_baidunuomi_goods_meishi {

    /** Insert statement; the three parameters are good_id, good_name and good_url. */
    private static final String INSERT_SQL = "insert into ls_nm_shop_good "
            + " (good_id, good_name, good_url) "
            + " values "
            + "( ? , ? , ? ) ";

    private static final String LISTING_URL_PREFIX = "https://lingshui.nuomi.com/326-page";
    private static final String LISTING_URL_SUFFIX = "?#j-sort-bar";

    /** Lazily loaded chunks that belong to the first listing page. */
    private static final String[] FIRST_PAGE_ASYNC_URLS = {
        "https://lingshui.nuomi.com/326?async_load_page=2&_=1477897343482",
        "https://lingshui.nuomi.com/326?async_load_page=3&_=1477897343484",
    };

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";

    /** Request timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 200000;

    /** Pause between two listing page requests, in milliseconds. */
    private static final long PAGE_DELAY_MILLIS = 2000L;

    /**
     * Crawls all listing pages and inserts one row per deal.
     *
     * @param args unused
     * @throws Exception if the database connection cannot be opened, the insert statement
     *         cannot be prepared, or the first listing page cannot be downloaded; failures
     *         on later pages or on single rows are reported and skipped
     */
    public static void main(String[] args) throws Exception {
        // Opening the database first makes a broken connection fail immediately instead of
        // surfacing later as a NullPointerException on every insert. Both resources are
        // closed when the crawl ends, whether it succeeds or not.
        try (java.sql.Connection db = JDBCUtils.getConnection();
                PreparedStatement insert = db.prepareStatement(INSERT_SQL)) {

            // The first page is needed both for the page count and for its items,
            // so it is downloaded once and reused.
            Document firstPage = fetch(listingUrl(1));
            int pageTotal = parsePageTotal(
                    firstPage.getElementsByAttributeValue("class", "page-number").text());
            System.out.println(pageTotal);

            for (int page = 1; page <= pageTotal; page++) {
                Document listing = firstPage;
                if (page > 1) {
                    if (!pauseBetweenPages()) {
                        break; // interrupted: stop crawling, the interrupt flag is preserved
                    }
                    listing = fetchOrNull(listingUrl(page));
                }
                if (listing != null) {
                    saveGoods(
                            listing.getElementsByAttributeValue("class", "itemlist clearfix").select("li"),
                            insert);
                }

                if (page == 1) {
                    for (String asyncUrl : FIRST_PAGE_ASYNC_URLS) {
                        Document chunk = fetchOrNull(asyncUrl);
                        if (chunk != null) {
                            saveGoods(chunk.select("li"), insert);
                        }
                    }
                }
                System.out.println("第" + page + "页");
            }
        }
    }

    /** Builds the URL of the given 1-based listing page. */
    private static String listingUrl(int page) {
        return LISTING_URL_PREFIX + page + LISTING_URL_SUFFIX;
    }

    /**
     * Downloads and parses one page with browser-like request headers.
     *
     * @throws java.io.IOException if the request fails
     */
    private static Document fetch(String url) throws java.io.IOException {
        return Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                // The header name used to carry a trailing colon ("Accept-Encoding:"), which is
                // not a valid field name, and it advertised sdch, which nothing here decodes.
                // NOTE(review): assumes the bundled jsoup decodes gzip responses - confirm.
                .header("Accept-Encoding", "gzip")
                .header("Accept-Language", "zh-CN,zh;q=0.8")
                .header("Cache-Control", "max-age=0")
                .header("Connection", "keep-alive")
                .header("Host", "lingshui.nuomi.com")
                .header("Upgrade-Insecure-Requests", "1")
                .ignoreContentType(true)
                .timeout(TIMEOUT_MILLIS)
                .get();
    }

    /** Like {@link #fetch(String)}, but reports a failure and returns {@code null} so the crawl can go on. */
    private static Document fetchOrNull(String url) {
        try {
            return fetch(url);
        } catch (java.io.IOException e) {
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Sleeps between two page requests.
     *
     * @return {@code false} if the thread was interrupted while sleeping
     */
    private static boolean pauseBetweenPages() {
        try {
            Thread.sleep(PAGE_DELAY_MILLIS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Extracts the page count from the pager text by reading the last run of digits in it.
     * Reading only the final character would truncate counts of ten or more
     * (for example "12" would become 2).
     *
     * @return the parsed page count, or 1 when the text contains no digits
     */
    private static int parsePageTotal(String pagerText) {
        int end = pagerText.length();
        while (end > 0 && !Character.isDigit(pagerText.charAt(end - 1))) {
            end--;
        }
        int start = end;
        while (start > 0 && Character.isDigit(pagerText.charAt(start - 1))) {
            start--;
        }
        if (start == end) {
            return 1; // no pager found: only the page that was already fetched exists
        }
        return Integer.parseInt(pagerText.substring(start, end));
    }

    /**
     * Prefixes the scheme to a link unless it already carries one.
     * NOTE(review): the site's links appear to be protocol-relative ("//host/path") - confirm.
     */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return "https:" + href;
    }

    /**
     * Reads name and link from every list item and inserts one row per item.
     * Items with neither a title nor a link are skipped; a failing insert is reported
     * and does not stop the remaining items.
     */
    private static void saveGoods(Elements items, PreparedStatement insert) {
        System.out.println("elements.size()" + items.size());

        for (Element item : items) {
            String goodName = item.getElementsByAttributeValue("class", "title").select("h4").text();
            String href = item.getElementsByAttributeValue("class", "contentbox").select("a").attr("href");
            if (goodName.isEmpty() && href.isEmpty()) {
                continue; // not a deal entry; would otherwise be stored as the URL "https:"
            }
            String goodUrl = toAbsoluteUrl(href);
            System.out.println("good_name:" + goodName);
            System.out.println("good_url:" + goodUrl);
            try {
                // (good_id, good_name, good_url)
                insert.setString(1, CommonUtil.getUUID32());
                insert.setString(2, goodName);
                insert.setString(3, goodUrl);
                insert.executeUpdate();
            } catch (SQLException e) {
                System.err.println("Failed to insert " + goodUrl);
                e.printStackTrace();
            }
        }
    }
}

你可能感兴趣的:(爬虫学习)