import com.mongodb.BasicDBObject import com.mongodb.DBCollection import org.jsoup.Jsoup import org.jsoup.nodes.Document import org.jsoup.nodes.Element import org.jsoup.select.Elements public class ZhongYuan { public static final DBCollection test = MongoUtils.getCollectionByName("name", "table", "port") public static final DBCollection html = MongoUtils.getCollectionByName("name", "table", "port") public static void main(String[] args){ // 循环遍历页面进行数据爬去 for(int i = 500 ; i<598 ;i++) { String url = "http://sh.centanet.com/xiaoqu/g"+i+"/"; String result = RequestUtil.doGet(url, "GBK"); Document doc = Jsoup.parse(result); //页面加载完成后对document进行处理,获取自己有用的数据 parseList(doc); System.out.println("page=====>"+i); } } private static void parseList(Document doc){ Elements elements = doc.select("div.house-listBox>div"); int j = 0; for(Element element : elements){ String name = element.select(".house-title a").first().text(); html.save(new BasicDBObject("name",name).append("html",element.toString())) String regionstr = element.select("div>div>p").first().text().replace(' ','-'); String region = regionstr.split("-")[0]; String address = null; if(regionstr.split("-").length>1) { address = regionstr.split("-")[1] + regionstr.split("-")[2]; } else { address = regionstr.split("-")[1]; } String price = element.select("div>div").last().select("p").first().text(); test.insert(new BasicDBObject("city","上海").append("region",region).append("name",name) .append("avg_price",price)); System.out.println(name); j++; } System.out.println(j); } private static void parseList1(Document doc) { Elements elements = doc.select("div.section>ul>li"); String name = null; String region = null; String price = null; for (Element element : elements) { if (element.toString().contains("room-img")) { name = element.select("h5.room-name a").first().text(); Elements datas = element.select("p"); int i = 0; for (Element data : datas) { i++; if (i == 2) { price = data.text(); } if (i == 4) { region = data.text(); } } 
System.out.println(name + price + region); test.insert(new BasicDBObject("city","上海").append("region",region).append("name",name) .append("avg_price",price)); } } } }
GET 请求相关的逻辑我自己封装了一个工具类(RequestUtil),代码见下文,可以参考。上面的 String result = RequestUtil.doGet(url, "GBK"); 这一句调用的就是这个工具类;这里也可以直接使用 jsoup 自带的请求方法(例如 Jsoup.connect(url).get())。
/** * 发送get请求 * @param url * @return */ public static String doGet(String url) { return doGet(url, null, "UTF-8", false); } public static String doGet(String url, boolean encodeUrl) { return doGet(url, null, "UTF-8", encodeUrl); } public static String doGet(String url, String charset) { return doGet(url, null, charset, true); } public static String doGet(String url, Mapheaders) { return doGet(url, headers, "UTF-8", true); } public static String doGet(final String url, Map headers, String charset, boolean encodeUrl) { CloseableHttpClient client = HttpClients .custom() .setUserAgent(USERAGENT_CHROME) .build(); CloseableHttpResponse response = null; String result = null; String requestUrl = url; try { if(encodeUrl) { requestUrl = encodingUrl(url, charset); } HttpGet httpGet = new HttpGet(requestUrl); // RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(3000).setConnectTimeout(3000).build();//设置请求和传输超时时间 // httpGet.setConfig(requestConfig); if(headers != null) { for(Map.Entry entry : headers.entrySet()) { httpGet.addHeader(entry.getKey(), entry.getValue()); } } response = client.execute(httpGet); int statusCode = response.getStatusLine().getStatusCode(); if(statusCode == 200) { result = EntityUtils.toString(response.getEntity(), charset); } } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if(response != null) { try { response.close(); } catch (IOException e) { } } if(client != null) { try { client.close(); } catch (IOException e) { } } } return result; }