最近博主在做一个气象项目,需要实时爬取水文网的水文数据。博主会的是Java,可是上网一搜,爬虫的博客基本上都是用Python写的。因此,博主下定决心写一篇用Java实现爬虫的博客,希望能帮到大家,喜欢的朋友可以点个赞哦。
首先你需要找到请求的网址路径,以及要分析你所需要的网页数据。
请求网址:
需要爬取的数据:
分析网页源码:
这里看到第一个请求参数 __VIEWSTATE 是很长的一串,不得不佩服给政府做网站的技术人员啊。它其实是 ASP.NET 页面自动生成的隐藏字段,保存的是经过 Base64 编码的页面状态;后来还发现这个值每次请求后都会变,所以翻页时必须带上上一次响应里返回的最新值,客观上也起到了反爬虫的作用(但是再难的问题都难不倒博主)。
话不多说直接上代码:
这是爬取数据的业务类:
package com.spd.service;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.spd.dao.GetShuiWenDataDao;
import com.spd.dao.impl.GetShuiWenDataDaoImpl;
import com.spd.pojo.ReservoirWater;
import com.spd.pojo.River;
public class GetShuiWenDataService {
private static GetShuiWenDataDao dao=new GetShuiWenDataDaoImpl();
public static Log Logger=LogFactory.getLog(GetShuiWenDataDao.class);
//陕西水文信息网址
private static String url = "http://www.shxsw.com.cn";
//获取数据的地址
private static String hedaoAction = "iframe/hdsqxx_list.aspx";
public static void main(String[] args) {
List hedaoList= getRiverInfoList(url+"/" + hedaoAction);
System.out.println(hedaoList.size());
if (hedaoList!=null&&hedaoList.size()>0) {
dao.addHeDaoData(hedaoList);
}
}
/**
* 河道水情信息获取
* @param dataUrl
* @return
*/
public static List getRiverInfoList (String dataUrl) {
Long startDate=System.currentTimeMillis();
Document doc = null;
List riversList=new ArrayList<>();
Map zhanmingAndShiDateMap=new LinkedHashMap();
String VIEWSTATE="/wEPDwUKMTU1NzczODAwMA9kFgICAw9kFgQCAQ8WAh4LXyFJdGVtQ291bnQCDxYeZg9kFgJmDxUIIOa4reaysyAgICAgICAgICAgICAgICAgICAgICAgICAgIemtj+WutuWgoSAgICAgICAgICAgICAgICAgICAgICAgIAsyMuaXpSAxOOaXtgY0OTQuNjYDMzM5A+iQvQgyNTAwLjAwMAg1MDAwLjAwMGQCAQ9kFgJmDxUIIOaxieaxnyAgICAgICAgICAgICAgICAgICAgICAgICAgIOeZveaysyAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE45pe2BjE3Mi43OAM2NTID5raoCTEzMDAwLjAwMAkyMjUwMC4wMDBkAgIPZBYCZg8VCCHljJfmtJvmsrMgICAgICAgICAgICAgICAgICAgICAgICAg54q2IOWktCAgICAgICAgICAgICAgICAgICAgICAgICALMjLml6UgMTjml7YGMzYyLjU2AzEwNwPmtqgIMjUwMC4wMDAIMzMwMC4wMDBkAgMPZBYCZg8VCCDkuLnmsZ8gICAgICAgICAgICAgICAgICAgICAgICAgICHov4fpo47mpbwgICAgICAgICAgICAgICAgICAgICAgICALMjLml6UgMTjml7YFOTMuMzUEMTYuMQPlubMAAGQCBA9kFgJmDxUIIOaxieaxnyAgICAgICAgICAgICAgICAgICAgICAgICAgIOefs+aziSAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE45pe2BjM2MS41NQAD5raoCTEyMDAwLjAwMAkxNTAwMC4wMDBkAgUPZBYCZg8VCCDkuLnmsZ8gICAgICAgICAgICAgICAgICAgICAgICAgICDkuLnlh6QgICAgICAgICAgICAgICAgICAgICAgICAgIAsyMuaXpSAxOOaXtgY1NDYuOTcEMTYuMgPokL0HODAwLjAwMAgxNTAwLjAwMGQCBg9kFgJmDxUIIOa4reaysyAgICAgICAgICAgICAgICAgICAgICAgICAgIOS4tOa9vCAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE45pe2BTM1Mi43AzM2OAPmtqgIMzAwMC4wMDAIODAwMC4wMDBkAgcPZBYCZg8VCCDms77msrMgICAgICAgICAgICAgICAgICAgICAgICAgICHlvKDlrrblsbEgICAgICAgICAgICAgICAgICAgICAgICALMjLml6UgMTfml7YGNDIzLjQ4AzYyMwPokL0IMzAwMC4wMDAINjAwMC4wMDBkAggPZBYCZg8VCCDms77msrMgICAgICAgICAgICAgICAgICAgICAgICAgICHlvKDlrrblsbEgICAgICAgICAgICAgICAgICAgICAgICALMjLml6UgMTfml7YFNDIzLjUDNjEwA+W5swgzMDAwLjAwMAg2MDAwLjAwMGQCCQ9kFgJmDxUIIOa4reaysyAgICAgICAgICAgICAgICAgICAgICAgICAgIOWSuOmYsyAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BjM4Mi4xMgMyNDgD5raoCDMwMDAuMDAwCDUwMDAuMDAwZAIKD2QWAmYPFQgh5YyX5rSb5rKzICAgICAgICAgICAgICAgICAgICAgICAgIOeKtiDlpLQgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BjM2Mi41MQMxMDED5raoCDI1MDAuMDAwCDMzMDAuMDAwZAILD2QWAmYPFQgg5Li55rGfICAgICAgICAgICAgICAgICAgICAgICAgICAh6L+H6aOO5qW8ICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BTkzLjM1BDE2LjED5raoAABkAg
wPZBYCZg8VCCDkuLnmsZ8gICAgICAgICAgICAgICAgICAgICAgICAgICDkuLnlh6QgICAgICAgICAgICAgICAgICAgICAgICAgIAsyMuaXpSAxN+aXtgY1NDYuOTgEMTcuMQPokL0HODAwLjAwMAgxNTAwLjAwMGQCDQ9kFgJmDxUIIOaxieaxnyAgICAgICAgICAgICAgICAgICAgICAgICAgIOefs+aziSAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BTM2MS40AAPlubMJMTIwMDAuMDAwCTE1MDAwLjAwMGQCDg9kFgJmDxUIIOaxieaxnyAgICAgICAgICAgICAgICAgICAgICAgICAgIOeZveaysyAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BjE3Mi43NgM2NDID5raoCTEzMDAwLjAwMAkyMjUwMC4wMDBkAgMPDxYEHhBDdXJyZW50UGFnZUluZGV4Ag4eC1JlY29yZGNvdW50AtYCZGRk";
int num=1;
Date now=new Date();
while(num<=30){
try {
Connection con = Jsoup.connect(dataUrl).userAgent("Mozilla/5.0").timeout(3000);
con.data("__EVENTTARGET","pager");
con.data("__EVENTARGUMENT", num+"");
con.data("__VIEWSTATE",VIEWSTATE);
doc=con.post();
} catch (IOException e) {
e.printStackTrace();
}
VIEWSTATE=doc.getElementById("__VIEWSTATE").val();
Elements elements = doc.getElementsByTag("tr");
SimpleDateFormat fro1=new SimpleDateFormat("dd日 HH时");
for (int j=1; j
River river=new River();
String rivers[]=elements.get(j).toString().replace("", "").replace(" ", "").replace("", "").replace(" ", "").split(" " );
String riverName=rivers[0].trim();
String zhanName=rivers[1].trim();
String date=rivers[2].trim();
String shuiWei=rivers[3].trim();
String liuliang=rivers[4].trim();
String shuishi=rivers[5].trim();
String jingjieliangliang=rivers[6].trim();
String baozhengliuliang=rivers[7].trim();
//封装数据
river.setRiverName(riverName);
river.setZhanName(zhanName);
//"--"标识无效数据
river.setWaterLevel(StringUtil.isBlank(shuiWei)?"--":shuiWei);
river.setTraffic(StringUtil.isBlank(liuliang)?"--":liuliang);
river.setAlertTraffic((StringUtil.isBlank(jingjieliangliang)?"--":jingjieliangliang));
river.setEnsureTraffic((StringUtil.isBlank(baozhengliuliang)?"--":baozhengliuliang));
river.setShuiShi(shuishi);
try {
river.setDate(fro1.parse(date));
} catch (Exception e) {
e.printStackTrace();
}
//对今日重复数据进行过滤
if (!date.equals(zhanmingAndShiDateMap.get(date+zhanName))) {
riversList.add(river);
zhanmingAndShiDateMap.put(date+zhanName, date);
}
}
num++;
}
Long endDate=System.currentTimeMillis();
Logger.info(new SimpleDateFormat("yyyy月MM月dd日 HH时mm分").format(now)+"共爬取爬取"+riversList.size()+"河道水情信息,共耗时"+(endDate-startDate)/1000+"秒");
return riversList;
}
}
这是将数据存入数据库的DAO实现类:
package com.spd.dao.impl;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.commons.beanutils.BeanUtils;
import com.spd.dao.GetShuiWenDataDao;
import com.spd.pojo.ReservoirWater;
import com.spd.pojo.River;
import com.spd.util.JdbcUtil;
public class GetShuiWenDataDaoImpl implements GetShuiWenDataDao {
@Override
public void addHeDaoData(List list1) {
for (int i = 0; i < list1.size(); i++) {
River river=(River) list1.get(i);
String date=new SimpleDateFormat("yyyy-MM").format(now)+"-"+river.getDate().getDate()+" "+river.getDate().getHours()+":00:00";
String sql1="select * from t_river_info where zhanName=? and date=?";
Object[] params1={river.getZhanName(),date};
List riList =JdbcUtil.executeDQL(River.class, sql1, params1);
if (riList!=null&&riList.size()==0) {
String sql2="insert into t_river_info (id,riverName,zhanName,date,waterLevel,traffic,shuiShi,alertTraffic,ensureTraffic,create_time) " +
"values(null,?,?,?,?,?,?,?,?,now())";
Object[] params2={river.getRiverName(),river.getZhanName(),date,river.getWaterLevel(),river.getTraffic(),river.getShuiShi(),river.getAlertTraffic(),river.getEnsureTraffic()};
JdbcUtil.executeDML(sql2, params2);
}
}
}
}
}
dao的接口:
package com.spd.dao;
import java.util.List;
/**
 * Persistence contract for scraped hydrology data.
 */
public interface GetShuiWenDataDao {

    /**
     * Stores a batch of river readings.
     *
     * @param list the readings to store
     */
    void addHeDaoData(List list);

}
pojo类:
package com.spd.pojo;
import java.io.Serializable;
import java.util.Date;
public class River implements Serializable {
private String riverName;
private String zhanName;
private Date date;
//水位
private String waterLevel;
//流量
private String traffic;
private String shuiShi;
//警戒流量
private String alertTraffic;
//保证流量
private String ensureTraffic;
public River() {
super();
}
public String getRiverName() {
return riverName;
}
public void setRiverName(String riverName) {
this.riverName = riverName;
}
public String getZhanName() {
return zhanName;
}
public void setZhanName(String zhanName) {
this.zhanName = zhanName;
}
public Date getDate() {
return date;
}
public void setDate(Date date) {
this.date = date;
}
public String getWaterLevel() {
return waterLevel;
}
public void setWaterLevel(String waterLevel) {
this.waterLevel = waterLevel;
}
public String getTraffic() {
return traffic;
}
public void setTraffic(String traffic) {
this.traffic = traffic;
}
public String getShuiShi() {
return shuiShi;
}
public void setShuiShi(String shuiShi) {
this.shuiShi = shuiShi;
}
public String getAlertTraffic() {
return alertTraffic;
}
public void setAlertTraffic(String alertTraffic) {
this.alertTraffic = alertTraffic;
}
public String getEnsureTraffic() {
return ensureTraffic;
}
public void setEnsureTraffic(String ensureTraffic) {
this.ensureTraffic = ensureTraffic;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result
+ ((alertTraffic == null) ? 0 : alertTraffic.hashCode());
result = prime * result + ((date == null) ? 0 : date.hashCode());
result = prime * result
+ ((ensureTraffic == null) ? 0 : ensureTraffic.hashCode());
result = prime * result
+ ((riverName == null) ? 0 : riverName.hashCode());
result = prime * result + ((shuiShi == null) ? 0 : shuiShi.hashCode());
result = prime * result + ((traffic == null) ? 0 : traffic.hashCode());
result = prime * result
+ ((waterLevel == null) ? 0 : waterLevel.hashCode());
result = prime * result
+ ((zhanName == null) ? 0 : zhanName.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
River other = (River) obj;
if (alertTraffic == null) {
if (other.alertTraffic != null)
return false;
} else if (!alertTraffic.equals(other.alertTraffic))
return false;
if (date == null) {
if (other.date != null)
return false;
} else if (!date.equals(other.date))
return false;
if (ensureTraffic == null) {
if (other.ensureTraffic != null)
return false;
} else if (!ensureTraffic.equals(other.ensureTraffic))
return false;
if (riverName == null) {
if (other.riverName != null)
return false;
} else if (!riverName.equals(other.riverName))
return false;
if (shuiShi == null) {
if (other.shuiShi != null)
return false;
} else if (!shuiShi.equals(other.shuiShi))
return false;
if (traffic == null) {
if (other.traffic != null)
return false;
} else if (!traffic.equals(other.traffic))
return false;
if (waterLevel == null) {
if (other.waterLevel != null)
return false;
} else if (!waterLevel.equals(other.waterLevel))
return false;
if (zhanName == null) {
if (other.zhanName != null)
return false;
} else if (!zhanName.equals(other.zhanName))
return false;
return true;
}
@Override
public String toString() {
return "River [riverName=" + riverName + ", zhanName=" + zhanName
+ ", date=" + date + ", waterLevel=" + waterLevel
+ ", traffic=" + traffic + ", shuiShi=" + shuiShi
+ ", alertTraffic=" + alertTraffic + ", ensureTraffic="
+ ensureTraffic + "]";
}
}
jdbc操作的工具类是博主的精华,呈上。
jdbc的工具类:
package com.spd.util;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.commons.beanutils.BeanUtils;
/**
* 调用JDBC的公共类
* @author Administrator
*
*/
public class JdbcUtil {
//声明一些变量存放配置信息
private static String className = "";
private static String url = "";
private static String user = "";
private static String password = "";
/**
* 在类被加载的时候执行,而且只执行一次
*/
static {
try {
//获取配置文件信息
Properties properties = new Properties();
properties.load(JdbcUtil.class.getClassLoader().getResourceAsStream("jdbcUtil.properties"));
//开始获取配置的初始值
String datatype = properties.getProperty("datatype");
className = properties.getProperty(datatype + "Driver");
url = properties.getProperty(datatype + "Url");
user = properties.getProperty(datatype + "User");
password = properties.getProperty(datatype + "Password");
//加载驱动
Class.forName(className);
} catch (IOException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
}
/**
* 获取proxool连接对象
* @return
*/
public static Connection getProxoolConnection(){
Connection conn = null;
try {
Class.forName("org.logicalcobwebs.proxool.ProxoolDriver");
conn = DriverManager.getConnection("proxool.test");
conn.setAutoCommit(false);
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
return conn;
}
/**
* 创建一个连接
* @return
*/
public static Connection createConnection() {
//声明一个连接
Connection connection = null;
try {
//获取一个连接
connection = DriverManager.getConnection(url, user, password);
//将事务的自动提交关闭,每次都需要手动提交
connection.setAutoCommit(false);
} catch (SQLException e) {
System.err.println("创建连接失败,请检查连接参数:url[" + url + "]user[" + user + "]password[" + password + "]");
e.printStackTrace();
}
return connection;
}
/**
* 创建发送器
* @param connection
* @return
*/
public static Statement createStatement(Connection connection) {
Statement statement = null;
try {
statement = connection.createStatement();
} catch (SQLException e) {
e.printStackTrace();
}
return statement;
}
/**
* 创建预处理发送器
* @param connection
* @param sql
* @return
*/
public static PreparedStatement createPreparedStatement(Connection connection, CharSequence sql) {
PreparedStatement preparedStatement = null;
try {
preparedStatement = connection.prepareStatement(sql.toString());
} catch (SQLException e) {
e.printStackTrace();
}
return preparedStatement;
}
/**
* 关闭连接释放资源
* @param connection
* @param statement
* @param resultSet
*/
public static void closeAll(Connection connection, Statement statement, ResultSet resultSet) {
if (resultSet != null) {
try {
resultSet.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
if (statement != null) {
try {
statement.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
if (connection != null) {
try {
connection.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
/**
* 将结果集合里面的数据存放至List
* @param resultSet
* @param clazz
*/
public static List resultSet2List(ResultSet resultSet, Class clazz) {
//声明一个List
List list = new ArrayList();
try {
//获取当前类所有的属性
Field[] dfields = clazz.getDeclaredFields();
//首先遍历resultSet
while (resultSet.next()) {
//创建一个对象
T bean = clazz.getConstructor().newInstance();
//遍历属性
for (int i = 0; i < dfields.length; i++) {
//获取属性的名字
String fieldName = dfields[i].getName();
//获取属性对应的值
Object fieldValue = resultSet.getObject(fieldName);
//获取属性的set方法
String methodName = "set" + fieldName.toUpperCase().substring(0, 1) + fieldName.substring(1);
//获取方法
Method setMethod = clazz.getMethod(methodName, dfields[i].getType());
//要将值设置给对象
setMethod.invoke(bean, fieldValue);
}
//然后将对象添加到List
list.add(bean);
}
} catch (Exception e) {
e.printStackTrace();
}
return list;
}
/**
* 将结果集合里面的数据存放至对象
* @param resultSet
* @param clazz
*/
public static T resultSet2bean(ResultSet resultSet, Class clazz) {
//声明一个List
List list = resultSet2List(resultSet, clazz);
if (list != null && list.size() >= 1) {
return list.get(0);
}
return null;
}
/**
* 给sql变量赋值
* @param sql
* @param preparedStatement
* @param params
*/
public static void setParams(String sql,
PreparedStatement preparedStatement, Object... params) {
// 遍历参数
for (int i = 0; i < params.length; i++) {
try {
preparedStatement.setObject(i + 1, params[i]);
} catch (SQLException e) {
e.printStackTrace();
}
}
}
/**
* 查询操作
* @param cls
* @param sql
* @param params
* @return
*/
public static List executeDQL(Class cls, String sql,
Object... params) {
// 声明返回值对象
List list = new ArrayList();
// 声明连接
Connection connection = null;
PreparedStatement preparedStatement = null;
ResultSet resultSet = null;
try {
// 获得一个Connection对象
connection = createConnection();
// 创建命令发送器
preparedStatement = createPreparedStatement(connection, sql);
// 给sql变量赋值
setParams(sql, preparedStatement, params);
// 利用命令发送器执行sql语句并接收结果集
resultSet = preparedStatement.executeQuery();
// 获得数据库表的字段信息
ResultSetMetaData resultSetMetaData = resultSet.getMetaData();
/**
* rsmd.getColumnLabel(i).toLowerCase():获取字段名称
* rsmd.getColumnCount():获取字段的个数
*/
while (resultSet.next()) {
// 创建当前对象
T bean = cls.newInstance();
for (int i = 1; i < resultSetMetaData.getColumnCount(); i++) {
BeanUtils.setProperty(bean, resultSetMetaData.getColumnLabel(i), resultSet.getObject(i));
}
list.add(bean);
}
} catch (Exception e) {
e.printStackTrace();
}
// 创建对象
return list;
}
/**
* 更新操作
* @param sql
* @param params
* @return
*/
public static boolean executeDML(String sql,Object[] params){
//声明连接
Connection connection = null;
PreparedStatement preparedStatement = null;
try {
//创建连接
connection = JdbcUtil.createConnection();
//创建命令发送器
preparedStatement = JdbcUtil.createPreparedStatement(connection, sql);
for(int i=0;i1, params[i]);
}
//利用命令发送器执行SQL语句并且获取结果集
int count = preparedStatement.executeUpdate();
//判断是否插入成功
if(count >0){
connection.commit();
return true;
}else {
connection.rollback();
}
} catch (SQLException e) {
e.printStackTrace();
}finally{//关闭资源
JdbcUtil.closeAll(connection, preparedStatement, null);
}
return false;
}
}
数据库配置文件:
##choose which database
datatype=mysql
##mysql
mysqlDriver=com.mysql.jdbc.Driver
mysqlUrl=jdbc:mysql://localhost:3306/*****?useUnicode=true&characterEncoding=utf-8
mysqlUser=****
mysqlPassword=****
这是项目结构:
将项目导出为可执行jar包,再用Windows的"任务计划程序"设置为每小时执行一次该jar包,就可以完成定时抓取网页数据。
对于别的网站也是可行,主要是分析和爬取数据的步骤。喜欢的小伙伴可以试一下,如果觉得还行,给博主点一波赞哦。