公司实习导师布置一项用HttpClient抓取网页表格的实践作业,当然了,肯定是python抓比较简单,可能由于我是“java开发实习生”,总之,就有这么一个作业来练练手啦
第一次实践,直接在main函数里面进行实现。
导入依赖,其中jsoup和httpcomponents是必须的。
<!-- HTML parsing (https://mvnrepository.com/artifact/org.jsoup/jsoup) -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<!-- JSON serialisation: the crawler imports com.alibaba.fastjson.JSON, which no other
     dependency listed here provides. 1.2.83 is chosen because earlier 1.2.x releases
     have published deserialisation vulnerabilities; only JSON.toJSONString is used. -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.83</version>
</dependency>
<dependency>
    <groupId>org.seleniumhq.selenium</groupId>
    <artifactId>selenium-java</artifactId>
    <version>3.3.1</version>
</dependency>
<!-- Excel (.xlsx) export -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.17</version>
</dependency>
<!-- HTTP client used to download the page -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.5</version>
</dependency>
但由此图可见,直接用HttpClient请求原页面并没有抓到表格内容。了解后发现,表格是通过iframe嵌入的,这在效果上相当于一种反爬机制。由于只是第一次练手,就没有继续深究:本来应该先解析出iframe的src,再把它作为url去抓取,我偷了个懒,直接把src里的地址(即“http://www.safe.gov.cn/AppStructured/hlw/RMBQuery.do”)写死为待抓取页面的url,等以后学习了再改进。
package com.crawler;
/**
 * One row of the RMB central parity rate table: the quotation date followed by
 * one value per currency column.
 *
 * <p>Every value is stored as a plain {@code String}; this class performs no
 * parsing or validation. Field and accessor names are kept exactly as the
 * column codes used by the rest of the crawler, even where they differ from
 * current ISO 4217 codes (for example {@code LUF} is used for the ruble column
 * and {@code PLZ}/{@code DKR} for the zloty and Danish krone columns).
 */
public class RMBCentralParityPOJO {
    private String date; // quotation date
    private String USD;  // US dollar
    private String EUR;  // euro
    private String JPY;  // Japanese yen
    private String HKD;  // Hong Kong dollar
    private String GBP;  // pound sterling
    private String MYR;  // ringgit
    private String LUF;  // ruble
    private String AUD;  // Australian dollar
    private String CAD;  // Canadian dollar
    private String NZD;  // New Zealand dollar
    private String SGD;  // Singapore dollar
    private String CHF;  // Swiss franc
    private String ZAR;  // rand
    private String KRW;  // Korean won
    private String AED;  // dirham
    private String SAR;  // riyal
    private String HUF;  // forint
    private String PLZ;  // zloty
    private String DKR;  // Danish krone
    private String SEK;  // Swedish krona
    private String NOK;  // Norwegian krone
    private String TRY;  // lira
    private String PHP;  // peso
    private String THB;  // Thai baht

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getUSD() {
        return USD;
    }

    public void setUSD(String USD) {
        this.USD = USD;
    }

    public String getEUR() {
        return EUR;
    }

    public void setEUR(String EUR) {
        this.EUR = EUR;
    }

    public String getJPY() {
        return JPY;
    }

    public void setJPY(String JPY) {
        this.JPY = JPY;
    }

    public String getHKD() {
        return HKD;
    }

    public void setHKD(String HKD) {
        this.HKD = HKD;
    }

    public String getGBP() {
        return GBP;
    }

    public void setGBP(String GBP) {
        this.GBP = GBP;
    }

    public String getMYR() {
        return MYR;
    }

    public void setMYR(String MYR) {
        this.MYR = MYR;
    }

    public String getLUF() {
        return LUF;
    }

    public void setLUF(String LUF) {
        this.LUF = LUF;
    }

    public String getAUD() {
        return AUD;
    }

    public void setAUD(String AUD) {
        this.AUD = AUD;
    }

    public String getCAD() {
        return CAD;
    }

    public void setCAD(String CAD) {
        this.CAD = CAD;
    }

    public String getNZD() {
        return NZD;
    }

    public void setNZD(String NZD) {
        this.NZD = NZD;
    }

    public String getSGD() {
        return SGD;
    }

    public void setSGD(String SGD) {
        this.SGD = SGD;
    }

    public String getCHF() {
        return CHF;
    }

    public void setCHF(String CHF) {
        this.CHF = CHF;
    }

    public String getZAR() {
        return ZAR;
    }

    public void setZAR(String ZAR) {
        this.ZAR = ZAR;
    }

    public String getKRW() {
        return KRW;
    }

    public void setKRW(String KRW) {
        this.KRW = KRW;
    }

    public String getAED() {
        return AED;
    }

    public void setAED(String AED) {
        this.AED = AED;
    }

    public String getSAR() {
        return SAR;
    }

    public void setSAR(String SAR) {
        this.SAR = SAR;
    }

    public String getHUF() {
        return HUF;
    }

    public void setHUF(String HUF) {
        this.HUF = HUF;
    }

    public String getPLZ() {
        return PLZ;
    }

    public void setPLZ(String PLZ) {
        this.PLZ = PLZ;
    }

    public String getDKR() {
        return DKR;
    }

    public void setDKR(String DKR) {
        this.DKR = DKR;
    }

    public String getSEK() {
        return SEK;
    }

    public void setSEK(String SEK) {
        this.SEK = SEK;
    }

    public String getNOK() {
        return NOK;
    }

    public void setNOK(String NOK) {
        this.NOK = NOK;
    }

    public String getTRY() {
        return TRY;
    }

    public void setTRY(String TRY) {
        this.TRY = TRY;
    }

    public String getPHP() {
        return PHP;
    }

    public void setPHP(String PHP) {
        this.PHP = PHP;
    }

    public String getTHB() {
        return THB;
    }

    public void setTHB(String THB) {
        this.THB = THB;
    }

    /**
     * Renders every column as {@code NAME='value'}, in table order. Unset
     * columns appear as the text {@code null}.
     */
    @Override
    public String toString() {
        return "RMBCentralParityPOJO{" +
                "date='" + date + '\'' +
                ", USD='" + USD + '\'' +
                ", EUR='" + EUR + '\'' +
                ", JPY='" + JPY + '\'' +
                ", HKD='" + HKD + '\'' +
                ", GBP='" + GBP + '\'' +
                ", MYR='" + MYR + '\'' +
                ", LUF='" + LUF + '\'' +
                ", AUD='" + AUD + '\'' +
                ", CAD='" + CAD + '\'' +
                ", NZD='" + NZD + '\'' +
                ", SGD='" + SGD + '\'' +
                ", CHF='" + CHF + '\'' +
                ", ZAR='" + ZAR + '\'' +
                ", KRW='" + KRW + '\'' +
                ", AED='" + AED + '\'' +
                ", SAR='" + SAR + '\'' +
                ", HUF='" + HUF + '\'' +
                ", PLZ='" + PLZ + '\'' +
                ", DKR='" + DKR + '\'' +
                ", SEK='" + SEK + '\'' +
                ", NOK='" + NOK + '\'' +
                ", TRY='" + TRY + '\'' +
                ", PHP='" + PHP + '\'' +
                ", THB='" + THB + '\'' +
                '}';
    }
}
```
package com.crawler;
import com.alibaba.fastjson.JSON;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Downloads the RMB central parity rate page, extracts the header and data
 * rows of its table with jsoup, prints the rows as JSON and exports them to an
 * {@code .xlsx} workbook.
 */
public class GetExcelByUrl {

    /** Number of columns in the rate table: the date plus 24 currency columns. */
    private static final int COLUMN_COUNT = 25;

    /**
     * Entry point: fetch the page, parse it and write the Excel file.
     *
     * <p>I/O failures are reported on standard error; nothing is written when
     * the server does not answer with HTTP 200 or sends no body.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpGet httpGet = new HttpGet("http://www.safe.gov.cn/AppStructured/hlw/RMBQuery.do");
        System.out.println(httpGet.getRequestLine());

        // try-with-resources releases the client (and its connections) on every path.
        try (CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build()) {
            HttpResponse httpResponse = closeableHttpClient.execute(httpGet);
            HttpEntity entity = httpResponse.getEntity();
            // Do not parse an error page as if it were the rate table.
            if (httpResponse.getStatusLine().getStatusCode() != 200 || entity == null) {
                System.err.println("Unexpected response: " + httpResponse.getStatusLine());
                return;
            }

            // EntityUtils.toString reads the body to the end and closes its stream.
            String response = EntityUtils.toString(entity, "UTF-8");
            Document document = Jsoup.parse(response);

            // Header cells carry the CSS class "table_head".
            String[] titles = textsOf(document.getElementsByClass("table_head"));

            // Index 0 holds the header row; the data rows follow it.
            List<RMBCentralParityPOJO> rmbCentralParityPOJOS = new ArrayList<RMBCentralParityPOJO>();
            rmbCentralParityPOJOS.add(toPojo(titles));

            // Each data row carries the CSS class "first"; its <td> children are the cells.
            for (Element rowElement : document.getElementsByClass("first")) {
                rmbCentralParityPOJOS.add(toPojo(textsOf(rowElement.select("td"))));
            }

            String rmbCentralParityListJson = JSON.toJSONString(rmbCentralParityPOJOS);
            System.out.println(rmbCentralParityListJson);
            toExcel("E:\\rmbrate\\myrmb", rmbCentralParityPOJOS, titles);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the rows to {@code exportFilePath + ".xlsx"}.
     *
     * @param exportFilePath target path without the {@code .xlsx} extension
     * @param rmbCentralParityPOJOList rows to export; element 0 is the header
     *        row and is skipped because the header is written from {@code titles}
     * @param titles header texts, one per column
     */
    private static void toExcel(String exportFilePath, List<RMBCentralParityPOJO> rmbCentralParityPOJOList, String[] titles) {
        File file = new File(exportFilePath + ".xlsx");
        try (XSSFWorkbook xssfWorkbook = new XSSFWorkbook()) {
            XSSFSheet sheet = xssfWorkbook.createSheet();

            // Sheet row 0: column headers.
            XSSFRow headerRow = sheet.createRow(0);
            for (int i = 0; i < titles.length; i++) {
                XSSFCell cell = headerRow.createCell(i);
                cell.setCellValue(titles[i]);
            }

            // Sheet row i holds list element i (element 0 is the header, already written).
            for (int i = 1; i < rmbCentralParityPOJOList.size(); i++) {
                String[] cells = toCells(rmbCentralParityPOJOList.get(i));
                XSSFRow newRow = sheet.createRow(i);
                for (int j = 0; j < cells.length; j++) {
                    XSSFCell newCell = newRow.createCell(j);
                    newCell.setCellValue(cells[j]);
                }
            }

            // openOutputStream creates missing parent directories and the file itself,
            // so no prior createNewFile() call is needed (it would fail when the
            // directory does not exist yet).
            try (FileOutputStream stream = FileUtils.openOutputStream(file)) {
                xssfWorkbook.write(stream);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies the text of up to {@link #COLUMN_COUNT} elements into a
     * fixed-width array; missing trailing columns stay {@code null} and any
     * surplus elements are ignored instead of overflowing the array.
     */
    private static String[] textsOf(Elements elements) {
        String[] texts = new String[COLUMN_COUNT];
        int count = Math.min(elements.size(), COLUMN_COUNT);
        for (int i = 0; i < count; i++) {
            texts[i] = elements.get(i).text();
        }
        return texts;
    }

    /** Builds a row object from {@link #COLUMN_COUNT} cell texts given in table order. */
    private static RMBCentralParityPOJO toPojo(String[] cells) {
        RMBCentralParityPOJO pojo = new RMBCentralParityPOJO();
        pojo.setDate(cells[0]);
        pojo.setUSD(cells[1]);
        pojo.setEUR(cells[2]);
        pojo.setJPY(cells[3]);
        pojo.setHKD(cells[4]);
        pojo.setGBP(cells[5]);
        pojo.setMYR(cells[6]);
        pojo.setLUF(cells[7]);
        pojo.setAUD(cells[8]);
        pojo.setCAD(cells[9]);
        pojo.setNZD(cells[10]);
        pojo.setSGD(cells[11]);
        pojo.setCHF(cells[12]);
        pojo.setZAR(cells[13]);
        pojo.setKRW(cells[14]);
        pojo.setAED(cells[15]);
        pojo.setSAR(cells[16]);
        pojo.setHUF(cells[17]);
        pojo.setPLZ(cells[18]);
        pojo.setDKR(cells[19]);
        pojo.setSEK(cells[20]);
        pojo.setNOK(cells[21]);
        pojo.setTRY(cells[22]);
        pojo.setPHP(cells[23]);
        pojo.setTHB(cells[24]);
        return pojo;
    }

    /** Inverse of {@link #toPojo(String[])}: the row's values in table order. */
    private static String[] toCells(RMBCentralParityPOJO pojo) {
        return new String[] {
            pojo.getDate(),
            pojo.getUSD(),
            pojo.getEUR(),
            pojo.getJPY(),
            pojo.getHKD(),
            pojo.getGBP(),
            pojo.getMYR(),
            pojo.getLUF(),
            pojo.getAUD(),
            pojo.getCAD(),
            pojo.getNZD(),
            pojo.getSGD(),
            pojo.getCHF(),
            pojo.getZAR(),
            pojo.getKRW(),
            pojo.getAED(),
            pojo.getSAR(),
            pojo.getHUF(),
            pojo.getPLZ(),
            pojo.getDKR(),
            pojo.getSEK(),
            pojo.getNOK(),
            pojo.getTRY(),
            pojo.getPHP(),
            pojo.getTHB()
        };
    }
}