利用java定时爬取网页数据

最近博主在做一个气象项目,需要实时爬取水文网的水文数据。博主会的是Java,可是上网一搜,爬虫的博客基本上都是用的Python。因此,博主下定决心写一篇用Java爬虫的博客,希望能帮到大家,喜欢的朋友可以点个赞哦。
首先你需要找到请求的网址路径,以及要分析你所需要的网页数据。
请求网址:

利用java定时爬取网页数据_第1张图片
需要爬取的数据:
利用java定时爬取网页数据_第2张图片
利用java定时爬取网页数据_第3张图片
分析网页源码:
利用java定时爬取网页数据_第4张图片
利用java定时爬取网页数据_第5张图片
这里看到第一个请求参数发现这么长的一串,不得不佩服给政府做网站的技术人员啊,这是一个安全加密请求数据,后来还发现这个是会变的,反爬虫(但是再难的问题都难不倒博主)。
话不多说直接上代码:

这是爬取数据的业务类:

package com.spd.service;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;


import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import com.spd.dao.GetShuiWenDataDao;
import com.spd.dao.impl.GetShuiWenDataDaoImpl;
import com.spd.pojo.ReservoirWater;
import com.spd.pojo.River;

public class GetShuiWenDataService {

    private static GetShuiWenDataDao dao=new GetShuiWenDataDaoImpl();

    public static Log Logger=LogFactory.getLog(GetShuiWenDataDao.class);
        //陕西水文信息网址
        private static String url = "http://www.shxsw.com.cn";
        //获取数据的地址
        private static String hedaoAction = "iframe/hdsqxx_list.aspx";

        public static void main(String[] args) {
            List hedaoList= getRiverInfoList(url+"/" + hedaoAction);
            System.out.println(hedaoList.size());
            if (hedaoList!=null&&hedaoList.size()>0) {
                dao.addHeDaoData(hedaoList);
            }
        }
        /**
         * 河道水情信息获取
         * @param dataUrl
         * @return
         */
        public static List getRiverInfoList (String dataUrl) {
            Long startDate=System.currentTimeMillis();
              Document doc = null;
              List riversList=new ArrayList<>();
              Map zhanmingAndShiDateMap=new LinkedHashMap();
              String VIEWSTATE="/wEPDwUKMTU1NzczODAwMA9kFgICAw9kFgQCAQ8WAh4LXyFJdGVtQ291bnQCDxYeZg9kFgJmDxUIIOa4reaysyAgICAgICAgICAgICAgICAgICAgICAgICAgIemtj+WutuWgoSAgICAgICAgICAgICAgICAgICAgICAgIAsyMuaXpSAxOOaXtgY0OTQuNjYDMzM5A+iQvQgyNTAwLjAwMAg1MDAwLjAwMGQCAQ9kFgJmDxUIIOaxieaxnyAgICAgICAgICAgICAgICAgICAgICAgICAgIOeZveaysyAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE45pe2BjE3Mi43OAM2NTID5raoCTEzMDAwLjAwMAkyMjUwMC4wMDBkAgIPZBYCZg8VCCHljJfmtJvmsrMgICAgICAgICAgICAgICAgICAgICAgICAg54q2IOWktCAgICAgICAgICAgICAgICAgICAgICAgICALMjLml6UgMTjml7YGMzYyLjU2AzEwNwPmtqgIMjUwMC4wMDAIMzMwMC4wMDBkAgMPZBYCZg8VCCDkuLnmsZ8gICAgICAgICAgICAgICAgICAgICAgICAgICHov4fpo47mpbwgICAgICAgICAgICAgICAgICAgICAgICALMjLml6UgMTjml7YFOTMuMzUEMTYuMQPlubMAAGQCBA9kFgJmDxUIIOaxieaxnyAgICAgICAgICAgICAgICAgICAgICAgICAgIOefs+aziSAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE45pe2BjM2MS41NQAD5raoCTEyMDAwLjAwMAkxNTAwMC4wMDBkAgUPZBYCZg8VCCDkuLnmsZ8gICAgICAgICAgICAgICAgICAgICAgICAgICDkuLnlh6QgICAgICAgICAgICAgICAgICAgICAgICAgIAsyMuaXpSAxOOaXtgY1NDYuOTcEMTYuMgPokL0HODAwLjAwMAgxNTAwLjAwMGQCBg9kFgJmDxUIIOa4reaysyAgICAgICAgICAgICAgICAgICAgICAgICAgIOS4tOa9vCAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE45pe2BTM1Mi43AzM2OAPmtqgIMzAwMC4wMDAIODAwMC4wMDBkAgcPZBYCZg8VCCDms77msrMgICAgICAgICAgICAgICAgICAgICAgICAgICHlvKDlrrblsbEgICAgICAgICAgICAgICAgICAgICAgICALMjLml6UgMTfml7YGNDIzLjQ4AzYyMwPokL0IMzAwMC4wMDAINjAwMC4wMDBkAggPZBYCZg8VCCDms77msrMgICAgICAgICAgICAgICAgICAgICAgICAgICHlvKDlrrblsbEgICAgICAgICAgICAgICAgICAgICAgICALMjLml6UgMTfml7YFNDIzLjUDNjEwA+W5swgzMDAwLjAwMAg2MDAwLjAwMGQCCQ9kFgJmDxUIIOa4reaysyAgICAgICAgICAgICAgICAgICAgICAgICAgIOWSuOmYsyAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BjM4Mi4xMgMyNDgD5raoCDMwMDAuMDAwCDUwMDAuMDAwZAIKD2QWAmYPFQgh5YyX5rSb5rKzICAgICAgICAgICAgICAgICAgICAgICAgIOeKtiDlpLQgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BjM2Mi41MQMxMDED5raoCDI1MDAuMDAwCDMzMDAuMDAwZAILD2QWAmYPFQgg5Li55rGfICAgICAgICAgICAgICAgICAgICAgICAgICAh6L+H6aOO5qW8ICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BTkzLjM1BDE2
LjED5raoAABkAgwPZBYCZg8VCCDkuLnmsZ8gICAgICAgICAgICAgICAgICAgICAgICAgICDkuLnlh6QgICAgICAgICAgICAgICAgICAgICAgICAgIAsyMuaXpSAxN+aXtgY1NDYuOTgEMTcuMQPokL0HODAwLjAwMAgxNTAwLjAwMGQCDQ9kFgJmDxUIIOaxieaxnyAgICAgICAgICAgICAgICAgICAgICAgICAgIOefs+aziSAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BTM2MS40AAPlubMJMTIwMDAuMDAwCTE1MDAwLjAwMGQCDg9kFgJmDxUIIOaxieaxnyAgICAgICAgICAgICAgICAgICAgICAgICAgIOeZveaysyAgICAgICAgICAgICAgICAgICAgICAgICAgCzIy5pelIDE35pe2BjE3Mi43NgM2NDID5raoCTEzMDAwLjAwMAkyMjUwMC4wMDBkAgMPDxYEHhBDdXJyZW50UGFnZUluZGV4Ag4eC1JlY29yZGNvdW50AtYCZGRk";
              int num=1;
              Date now=new Date();
             while(num<=30){
              try {
                  Connection con = Jsoup.connect(dataUrl).userAgent("Mozilla/5.0").timeout(3000);
                 con.data("__EVENTTARGET","pager");
                 con.data("__EVENTARGUMENT", num+"");
                 con.data("__VIEWSTATE",VIEWSTATE);
                  doc=con.post();
              } catch (IOException e) {
                  e.printStackTrace();
              }
              VIEWSTATE=doc.getElementById("__VIEWSTATE").val();
              Elements elements = doc.getElementsByTag("tr");
              SimpleDateFormat fro1=new SimpleDateFormat("dd日 HH时");
              for (int j=1; j
                  River river=new River();
                  String rivers[]=elements.get(j).toString().replace("", "").replace("", "").replace("", "").replace("", "").split("");
                      String riverName=rivers[0].trim();
                      String zhanName=rivers[1].trim();
                      String date=rivers[2].trim();
                      String shuiWei=rivers[3].trim();
                      String liuliang=rivers[4].trim();
                      String shuishi=rivers[5].trim();
                      String jingjieliangliang=rivers[6].trim();
                      String baozhengliuliang=rivers[7].trim();
                            //封装数据
                          river.setRiverName(riverName);
                          river.setZhanName(zhanName);
                          //"--"标识无效数据
                          river.setWaterLevel(StringUtil.isBlank(shuiWei)?"--":shuiWei);
                          river.setTraffic(StringUtil.isBlank(liuliang)?"--":liuliang);
                          river.setAlertTraffic((StringUtil.isBlank(jingjieliangliang)?"--":jingjieliangliang));
                          river.setEnsureTraffic((StringUtil.isBlank(baozhengliuliang)?"--":baozhengliuliang));
                          river.setShuiShi(shuishi);
                          try {
                            river.setDate(fro1.parse(date));
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                          //对今日重复数据进行过滤
                          if (!date.equals(zhanmingAndShiDateMap.get(date+zhanName))) {
                              riversList.add(river);
                              zhanmingAndShiDateMap.put(date+zhanName, date);
                        }
            }
              num++;
             }
             Long endDate=System.currentTimeMillis();
             Logger.info(new SimpleDateFormat("yyyy月MM月dd日 HH时mm分").format(now)+"共爬取爬取"+riversList.size()+"河道水情信息,共耗时"+(endDate-startDate)/1000+"秒");
        return riversList;
        }

}

这是将数据存入数据库的DAO实现类:

package com.spd.dao.impl;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.beanutils.BeanUtils;

import com.spd.dao.GetShuiWenDataDao;
import com.spd.pojo.ReservoirWater;
import com.spd.pojo.River;
import com.spd.util.JdbcUtil;

public class GetShuiWenDataDaoImpl implements GetShuiWenDataDao {

    @Override
    public void addHeDaoData(List list1) {
            for (int i = 0; i < list1.size(); i++) {
                River river=(River) list1.get(i);

                String date=new SimpleDateFormat("yyyy-MM").format(now)+"-"+river.getDate().getDate()+" "+river.getDate().getHours()+":00:00";
                String sql1="select * from t_river_info where zhanName=? and date=?";
                Object[] params1={river.getZhanName(),date};
                List riList =JdbcUtil.executeDQL(River.class, sql1, params1);

                if (riList!=null&&riList.size()==0) {
                    String sql2="insert into t_river_info (id,riverName,zhanName,date,waterLevel,traffic,shuiShi,alertTraffic,ensureTraffic,create_time) " +
                            "values(null,?,?,?,?,?,?,?,?,now())";
                    Object[] params2={river.getRiverName(),river.getZhanName(),date,river.getWaterLevel(),river.getTraffic(),river.getShuiShi(),river.getAlertTraffic(),river.getEnsureTraffic()};
                    JdbcUtil.executeDML(sql2, params2);
                }

            }
        }

    }



}

dao的接口:

package com.spd.dao;

import java.util.List;

public interface GetShuiWenDataDao {

    /**
     * Stores a batch of crawled river readings.
     *
     * <p>The GetShuiWenDataDaoImpl shown in this article casts every element
     * to River and inserts it into t_river_info unless a row with the same
     * station name and time already exists.
     *
     * @param list the readings to store
     */
    void addHeDaoData(List list);
}

pojo类:

package com.spd.pojo;

import java.io.Serializable;
import java.util.Date;

public class River implements Serializable {
    private String riverName;
    private String zhanName;
    private Date date;
    //水位
    private String waterLevel;
    //流量
    private String traffic;
    private String shuiShi;
    //警戒流量
    private String alertTraffic;
    //保证流量
    private String ensureTraffic;
    public River() {
        super();
    }
    public String getRiverName() {
        return riverName;
    }
    public void setRiverName(String riverName) {
        this.riverName = riverName;
    }
    public String getZhanName() {
        return zhanName;
    }
    public void setZhanName(String zhanName) {
        this.zhanName = zhanName;
    }
    public Date getDate() {
        return date;
    }
    public void setDate(Date date) {
        this.date = date;
    }
    public String getWaterLevel() {
        return waterLevel;
    }
    public void setWaterLevel(String waterLevel) {
        this.waterLevel = waterLevel;
    }
    public String getTraffic() {
        return traffic;
    }
    public void setTraffic(String traffic) {
        this.traffic = traffic;
    }
    public String getShuiShi() {
        return shuiShi;
    }
    public void setShuiShi(String shuiShi) {
        this.shuiShi = shuiShi;
    }
    public String getAlertTraffic() {
        return alertTraffic;
    }
    public void setAlertTraffic(String alertTraffic) {
        this.alertTraffic = alertTraffic;
    }
    public String getEnsureTraffic() {
        return ensureTraffic;
    }
    public void setEnsureTraffic(String ensureTraffic) {
        this.ensureTraffic = ensureTraffic;
    }
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result
                + ((alertTraffic == null) ? 0 : alertTraffic.hashCode());
        result = prime * result + ((date == null) ? 0 : date.hashCode());
        result = prime * result
                + ((ensureTraffic == null) ? 0 : ensureTraffic.hashCode());
        result = prime * result
                + ((riverName == null) ? 0 : riverName.hashCode());
        result = prime * result + ((shuiShi == null) ? 0 : shuiShi.hashCode());
        result = prime * result + ((traffic == null) ? 0 : traffic.hashCode());
        result = prime * result
                + ((waterLevel == null) ? 0 : waterLevel.hashCode());
        result = prime * result
                + ((zhanName == null) ? 0 : zhanName.hashCode());
        return result;
    }
    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        River other = (River) obj;
        if (alertTraffic == null) {
            if (other.alertTraffic != null)
                return false;
        } else if (!alertTraffic.equals(other.alertTraffic))
            return false;
        if (date == null) {
            if (other.date != null)
                return false;
        } else if (!date.equals(other.date))
            return false;
        if (ensureTraffic == null) {
            if (other.ensureTraffic != null)
                return false;
        } else if (!ensureTraffic.equals(other.ensureTraffic))
            return false;
        if (riverName == null) {
            if (other.riverName != null)
                return false;
        } else if (!riverName.equals(other.riverName))
            return false;
        if (shuiShi == null) {
            if (other.shuiShi != null)
                return false;
        } else if (!shuiShi.equals(other.shuiShi))
            return false;
        if (traffic == null) {
            if (other.traffic != null)
                return false;
        } else if (!traffic.equals(other.traffic))
            return false;
        if (waterLevel == null) {
            if (other.waterLevel != null)
                return false;
        } else if (!waterLevel.equals(other.waterLevel))
            return false;
        if (zhanName == null) {
            if (other.zhanName != null)
                return false;
        } else if (!zhanName.equals(other.zhanName))
            return false;
        return true;
    }
    @Override
    public String toString() {
        return "River [riverName=" + riverName + ", zhanName=" + zhanName
                + ", date=" + date + ", waterLevel=" + waterLevel
                + ", traffic=" + traffic + ", shuiShi=" + shuiShi
                + ", alertTraffic=" + alertTraffic + ", ensureTraffic="
                + ensureTraffic + "]";
    }



}

jdbc操作的工具类是博主的精华,呈上。
jdbc的工具类:

package com.spd.util;

import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.Method;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.commons.beanutils.BeanUtils;



/**
 * 调用JDBC的公共类
 * @author Administrator
 *
 */
public class JdbcUtil {

    //声明一些变量存放配置信息
    private static String className = "";
    private static String url = "";
    private static String user = "";
    private static String password = "";

    /**
     * 在类被加载的时候执行,而且只执行一次
     */
    static {
        try {
            //获取配置文件信息
            Properties properties = new Properties();
            properties.load(JdbcUtil.class.getClassLoader().getResourceAsStream("jdbcUtil.properties"));
            //开始获取配置的初始值
            String datatype = properties.getProperty("datatype");
            className = properties.getProperty(datatype + "Driver");
            url = properties.getProperty(datatype + "Url");
            user = properties.getProperty(datatype + "User");
            password = properties.getProperty(datatype + "Password");
            //加载驱动
            Class.forName(className);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * 获取proxool连接对象
     * @return
     */
    public static Connection getProxoolConnection(){
        Connection conn = null;
        try {
            Class.forName("org.logicalcobwebs.proxool.ProxoolDriver");
            conn = DriverManager.getConnection("proxool.test");
            conn.setAutoCommit(false);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }

        return conn;
    }


    /**
     * 创建一个连接
     * @return
     */
    public static Connection createConnection() {
        //声明一个连接
        Connection connection = null;
        try {
            //获取一个连接
            connection = DriverManager.getConnection(url, user, password);
            //将事务的自动提交关闭,每次都需要手动提交
            connection.setAutoCommit(false);

        } catch (SQLException e) {
            System.err.println("创建连接失败,请检查连接参数:url[" + url + "]user[" + user + "]password[" + password + "]");
            e.printStackTrace();
        }
        return connection;
    }

    /**
     * 创建发送器
     * @param connection
     * @return
     */
    public static Statement createStatement(Connection connection) {
        Statement statement = null;
        try {
            statement = connection.createStatement();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return statement;
    }

    /**
     * 创建预处理发送器
     * @param connection
     * @param sql
     * @return
     */
    public static PreparedStatement createPreparedStatement(Connection connection, CharSequence sql) {
        PreparedStatement preparedStatement = null;
        try {
            preparedStatement = connection.prepareStatement(sql.toString());
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return preparedStatement;
    }

    /**
     * 关闭连接释放资源
     * @param connection
     * @param statement
     * @param resultSet
     */
    public static void closeAll(Connection connection, Statement statement, ResultSet resultSet) {
        if (resultSet != null) {
            try {
                resultSet.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * 将结果集合里面的数据存放至List
     * @param resultSet
     * @param clazz
     */
    public static  List resultSet2List(ResultSet resultSet, Class clazz) {
        //声明一个List
        List list = new ArrayList();
        try {
            //获取当前类所有的属性
            Field[] dfields = clazz.getDeclaredFields();
            //首先遍历resultSet
            while (resultSet.next()) {
                //创建一个对象
                T bean = clazz.getConstructor().newInstance();
                //遍历属性
                for (int i = 0; i < dfields.length; i++) {
                    //获取属性的名字
                    String fieldName = dfields[i].getName();
                    //获取属性对应的值
                    Object fieldValue = resultSet.getObject(fieldName);
                    //获取属性的set方法
                    String methodName = "set" + fieldName.toUpperCase().substring(0, 1) + fieldName.substring(1);
                    //获取方法
                    Method setMethod = clazz.getMethod(methodName, dfields[i].getType());
                    //要将值设置给对象
                    setMethod.invoke(bean, fieldValue);
                }
                //然后将对象添加到List
                list.add(bean);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return list;
    }


    /**
     * 将结果集合里面的数据存放至对象
     * @param resultSet
     * @param clazz
     */
    public static  T resultSet2bean(ResultSet resultSet, Class clazz) {
        //声明一个List
        List list = resultSet2List(resultSet, clazz);
        if (list != null && list.size() >= 1) {
            return list.get(0);
        }
        return null;
    }

    /**
     * 给sql变量赋值
     * @param sql
     * @param preparedStatement
     * @param params
     */
    public static void setParams(String sql,
            PreparedStatement preparedStatement, Object... params) {
        // 遍历参数
        for (int i = 0; i < params.length; i++) {
            try {
                preparedStatement.setObject(i + 1, params[i]);
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * 查询操作
     * @param cls
     * @param sql
     * @param params
     * @return
     */
    public static  List executeDQL(Class cls, String sql,
            Object... params) {
        // 声明返回值对象
        List list = new ArrayList();
        // 声明连接
        Connection connection = null;
        PreparedStatement preparedStatement = null;
        ResultSet resultSet = null;
        try {
            // 获得一个Connection对象
            connection = createConnection();
            // 创建命令发送器
            preparedStatement = createPreparedStatement(connection, sql);
            // 给sql变量赋值
            setParams(sql, preparedStatement, params);
            // 利用命令发送器执行sql语句并接收结果集
            resultSet = preparedStatement.executeQuery();
            // 获得数据库表的字段信息
            ResultSetMetaData resultSetMetaData = resultSet.getMetaData();

            /**
             * rsmd.getColumnLabel(i).toLowerCase():获取字段名称
             * rsmd.getColumnCount():获取字段的个数
             */
            while (resultSet.next()) {
                // 创建当前对象
                T bean = cls.newInstance();
                for (int i = 1; i < resultSetMetaData.getColumnCount(); i++) {
                    BeanUtils.setProperty(bean, resultSetMetaData.getColumnLabel(i), resultSet.getObject(i));
                }
                list.add(bean);

            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // 创建对象

        return list;
    }
    /**
     * 更新操作
     * @param sql
     * @param params
     * @return
     */
    public static boolean executeDML(String sql,Object[] params){


        //声明连接
            Connection connection = null;
            PreparedStatement preparedStatement = null;

        try {
            //创建连接
            connection = JdbcUtil.createConnection();

            //创建命令发送器

            preparedStatement = JdbcUtil.createPreparedStatement(connection, sql);

            for(int i=0;i1, params[i]);
            }


            //利用命令发送器执行SQL语句并且获取结果集
            int count = preparedStatement.executeUpdate();
            //判断是否插入成功
            if(count >0){
                connection.commit();
                return true;
            }else {
                connection.rollback();
            }

        } catch (SQLException e) {
            e.printStackTrace();
        }finally{//关闭资源
            JdbcUtil.closeAll(connection, preparedStatement, null);
        }

        return false;
    }


}

数据库配置文件:

##choose which database
datatype=mysql
##mysql
mysqlDriver=com.mysql.jdbc.Driver
mysqlUrl=jdbc:mysql://localhost:3306/*****?useUnicode=true&characterEncoding=utf-8
mysqlUser=****
mysqlPassword=****

这是项目结构:
利用java定时爬取网页数据_第6张图片
将项目导出为可执行jar包,用windows定时任务器每小时执行jar包,就可以完成定时抓取网页数据。
对于别的网站也是可行,主要是分析和爬取数据的步骤。喜欢的小伙伴可以试一下,如果觉得还行,给博主点一波赞哦。

你可能感兴趣的:(Java,爬虫)