最近老有人要我抓取页面的数据,大部分人是要客户资料来开发新客户。但对不了解 coding 的人来说,只能一条一条地复制、粘贴,很浪费时间,所以做了个简单的 demo,用到 HttpClient、Jsoup、POI。
public class GetHouseData {
public final static String BASE_URL="http://example.com";
public static void main(String[] args) {
//第一页到第三页
getHoustInfoLink(3);
}
/**
*
* @param n 第一页到第n页
*/
public static void getHoustInfoLink(int n) {
DefaultHttpClient httpclient = new DefaultHttpClient();
Workbook wb = new HSSFWorkbook();
try {
HttpRequestRetryHandler myRetryHandler = new HttpRequestRetryHandler() {
public boolean retryRequest(IOException exception, int executionCount,
HttpContext context) {
System.out.println("尝试连接次数:-------:" + executionCount);
if (executionCount >= 5) {
// 如果超过最大重试次数,那么就不要继续了
return false;
}
if (exception instanceof NoHttpResponseException) {
// 如果服务器丢掉了连接,那么就重试
return true;
}
if (exception instanceof SSLHandshakeException) {
// 不要重试SSL握手异常
return false;
}
HttpRequest request = (HttpRequest) context
.getAttribute(ExecutionContext.HTTP_REQUEST);
boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);
if (idempotent) {
// 如果请求被认为是幂等的,那么就重试
return true;
}
return false;
}
};
for(int pageIndex =1;pageIndex<=n;pageIndex++){
httpclient.setHttpRequestRetryHandler(myRetryHandler);
List<NameValuePair> formparams = new
ArrayList<NameValuePair>();
formparams.add(new BasicNameValuePair("__EVENTARGUMENT", String.valueOf(pageIndex)));
formparams.add(new BasicNameValuePair("__EVENTTARGET", "AspNetPager1"));
formparams.add(new BasicNameValuePair("ddlistOrder", "1"));
UrlEncodedFormEntity urlEntity = new UrlEncodedFormEntity(formparams, "UTF-8");
HttpPost httppost = new HttpPost("BASE_URL");
httppost.setEntity(urlEntity);
ResponseHandler<String> responseHandler = new BasicResponseHandler();
String responseBody = httpclient.execute(httppost, responseHandler);
Document doc = Jsoup.parse(responseBody);
Elements elements = doc.select(".modou .k2 .leb_mod");
JSONArray arr = new JSONArray();
for(int i =0 ;i<elements.size();i++){
JSONObject obj = new JSONObject();
Element element = elements.get(i);
//楼盘图片地址
String imgURL= element.select(".mod_1 img").attr("src");
//楼盘名称
String houstName = element.select(".mod_2 .zuti .a1").text();
//楼盘价格
String price = element.select(".mod_2 .zuti .b1").text();
//更新时间
String updatedTime = element.select(".mod_2 .zuti .c1").text();
//销售电话
String sellPhone = element.select(".mod_2 .dizi .a1 span").text();
element.select(".mod_2 .dizi .b1 span").remove();
String houstDeveloper = element.select(".mod_2 .dizi .b1").eq(0).text();
String address = element.select(".mod_2 .dizi .b1").eq(1).text();
obj.put("imgURL", imgURL);
obj.put("houstName", houstName);
obj.put("price", price);
obj.put("updatedTime", updatedTime);
obj.put("sellPhone", sellPhone);
obj.put("houstDeveloper", houstDeveloper);
obj.put("address", address);
arr.add(obj);
}
//Workbook wb = new XSSFWorkbook();
Sheet sheet = wb.createSheet("HouseList-page-"+pageIndex);
// Create a new font and alter it.
Font font = wb.createFont();
font.setFontName("Times New Roman");
CellStyle style = wb.createCellStyle();
style.setFont(font);
int i =0;
for (Object object : arr) {
object =arr.get(i);
JSONObject js =(JSONObject) object;
Row row = sheet.createRow((short)i);
Cell cell0 = row.createCell(0);
Cell cell1 = row.createCell(1);
Cell cell2 = row.createCell(2);
Cell cell3 = row.createCell(3);
Cell cell4 = row.createCell(4);
Cell cell5 = row.createCell(5);
Cell cell6 = row.createCell(6);
cell0.setCellStyle(style);
cell0.setCellValue(js.get("imgURL").toString());
cell1.setCellStyle(style);
cell1.setCellValue(js.get("houstName").toString());
cell2.setCellValue(js.get("price").toString());
cell3.setCellValue(js.get("updatedTime").toString());
cell4.setCellValue(js.get("sellPhone").toString());
cell5.setCellValue(js.get("houstDeveloper").toString());
cell6.setCellValue(js.get("address").toString());
i++;
}
System.out.println("--导入中...........页面:第"+pageIndex+"页");
//睡眠一秒
Thread.sleep(1000);
}
FileOutputStream fileOut = new FileOutputStream("house-"+System.currentTimeMillis()+".xls");
wb.write(fileOut);
fileOut.close();
// System.out.println(elements.html());
// Struts2Utils.getResponse().setCharacterEncoding("UTF-8");
} catch (Exception e) {
e.printStackTrace();
} finally {
httpclient.getConnectionManager().shutdown();
}
}
}