JAVA 爬取指定网站的数据并存入MySQL数据库中 maven +httpclient+jsoup+mysql

            最近在做一个小项目,因为要用到数据爬取,所以研究了好多天,在这里分享一下自己的方法。

目录结构:

JAVA 爬取指定网站的数据并存入MySQL数据库中 maven +httpclient+jsoup+mysql_第1张图片

自己创建maven工程,导入相关依赖:pom.xml




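<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>maven</groupId>
  <artifactId>maven</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>war</packaging>

  <name>maven Maven Webapp</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.3</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.18</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.3</version>
    </dependency>

  </dependencies>

  <build>
    <finalName>maven</finalName>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
        <!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_war_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.7.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.20.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-war-plugin</artifactId>
          <version>3.2.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>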

StockTest类:

import java.io.IOException;
import java.sql.*;
import org.apache.http.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls a news-list page and stores each table row (title, link, publish time)
 * in the MySQL {@code news} table.
 *
 * @author
 * @date 2018/12/6
 */
public class StockTest {

    /**
     * Downloads the target page and hands its HTML to {@link #parserHtml(String)}.
     *
     * @param args unused
     * @throws IOException if the page cannot be downloaded
     */
    public static void main(String[] args) throws ParseException, IOException {
        String content = StockUtils.getHtmlByUrl("https:***需要爬取的网页****", "utf-8");
        parserHtml(content);
    }

    /**
     * Extracts title, link and publish time from every {@code <tr>} of the news table,
     * prints them and inserts them into the database.
     *
     * @param content HTML source of the page; {@code null} is ignored
     */
    public static void parserHtml(String content) throws ParseException, IOException {
        // StockUtils.getHtmlByUrl returns null when the response carries no entity;
        // Jsoup.parse must not be handed a null document.
        if (content == null) {
            return;
        }
        Document doc = Jsoup.parse(content);
        // NOTE(review): the class names below are specific to the crawled page; an earlier
        // variant of this code used "winstyle214" - confirm against the target page's markup.
        Elements rows = doc.getElementsByClass("winstyle614").select("tr");
        for (Element row : rows) {
            // The anchors of the row carry both the headline text and the link target.
            Elements anchors = row.select("a");
            String title = anchors.text();
            System.out.println("新闻标题:" + title);
            String url = anchors.attr("href");
            System.out.println("新闻链接:" + url);
            // The publish time sits in a dedicated span.
            String time = row.select("span[class=timestyle67214]").text();
            System.out.println("发布时间:" + time);
            insert(title, url, time);
        }
    }

    /**
     * Inserts one news record into the {@code news} table, using the connection settings
     * read from {@code jdbc.properties}.
     *
     * @param title news headline
     * @param urll  link of the news entry
     * @param date1 publish time, stored as text
     */
    private static void insert(String title, String urll, String date1) {
        PropertiesUtil.loadFile("jdbc.properties");
        String driver = PropertiesUtil.getPropertyValue("driver");
        String url = PropertiesUtil.getPropertyValue("url");
        String username = PropertiesUtil.getPropertyValue("username");
        String password = PropertiesUtil.getPropertyValue("password");

        try {
            Class.forName(driver);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            return;
        }

        String sql = "insert into news(title,urll,date1) value(?,?,?)";
        // try-with-resources closes the statement before the connection, even on failure.
        try (Connection con = DriverManager.getConnection(url, username, password);
             PreparedStatement pstm = con.prepareStatement(sql)) {
            pstm.setString(1, title);
            pstm.setString(2, urll);
            pstm.setString(3, date1);
            // Without this call the parameters are bound but the row is never written.
            pstm.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}

StockUtils类:

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
/**
 * 传递网页链接
 * 返回网页源码
 * @author
 *
 */
public class StockUtils {
    //第一次获取网页源码
    public static String getHtmlByUrl(final String url, final String charset) throws IOException {
        /*RequestConfig defaultRequestConfig = RequestConfig.custom()
                  .setConnectTimeout(5000)
                  .setConnectionRequestTimeout(5000)
                  .build();*/

        //CloseableHttpClient httpclient = HttpClients.custom().setMaxConnTotal(800).setMaxConnPerRoute(800).setDefaultRequestConfig(defaultRequestConfig).build();
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            HttpGet httpget = new HttpGet(url);
            //System.out.println("executing request " + httpget.getURI());
            ResponseHandler responseHandler = new ResponseHandler() {
                public String handleResponse(final HttpResponse response) throws ClientProtocolException, IOException {
                    int status = response.getStatusLine().getStatusCode();
                    //System.out.println("========responseStatusCode:"+status + "  "+url);
                    if (status == 200) {
                        HttpEntity entity = response.getEntity();
                        if (entity == null) {
                            System.out.println("========entity is null:" + status + "  " + url);
                            return null;
                        } else {
                            String content = EntityUtils.toString(entity);
                            if (charset != null) {
                                content = new String(content.getBytes("ISO-8859-1"), charset);
                            }
                            return content;
                        }
                    } else {
                        throw new ClientProtocolException("Unexpected response status: " + status);
                    }
                }
            };
            String responseBody = httpclient.execute(httpget, responseHandler);

            return responseBody;

        } catch (ClientProtocolException e) {
            System.out.println("========ClientProtocolException====" + e.getMessage() + "  " + url);
            //e.printStackTrace();
            return getHtmlByUrl(url, charset);
        } catch (IOException e) {
            System.out.println("========IOException====" + e.getMessage() + "  " + url);
            //e.printStackTrace();
            return getHtmlByUrl(url, charset);
        } finally {
            httpclient.close();
        }
    }
}

配置文件    jdbc.properties(maven 工程和JAVA工程连接数据库还是有所不同的,具体自己百度):

driver=com.mysql.jdbc.Driver
url=jdbc:mysql://localhost:3306/zgschool?useUnicode=true&characterEncoding=utf-8&useSSL=false
username=root
password=root

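配置该文件后,还需要自己提供一个 PropertiesUtil 工具类来读取它:上面的代码调用了 PropertiesUtil.loadFile("jdbc.properties") 和 PropertiesUtil.getPropertyValue(key) 这两个静态方法,本文没有给出它的实现。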

至此,大功告成。。

你可能感兴趣的:(JAVA)