java+jsoup实现简单的爬虫 简单爬取百度实时热点

架构:Maven + MyBatis + MySQL + Mapper + Jsoup

先上整体架子 

java+jsoup实现简单的爬虫 简单爬取百度百度实时热点_第1张图片

数据库表设计

java+jsoup实现简单的爬虫 简单爬取百度百度实时热点_第2张图片

 

 下面就开始上代码了

Day01_BaiduNewsCrawler

package edu.xawl.main;

import edu.xawl.mapper.BaiduNewsMapper;
import edu.xawl.po.BaiduNews;
import edu.xawl.utils.MybatisHelper;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls the Baidu "top" ranking pages with Jsoup and stores every ranking row
 * through {@link BaiduNewsMapper}.
 *
 * <p>The entry page is stored first under the type {@code "实时热点"}; afterwards each
 * category link found in {@code #flist} is fetched and stored under the link's title.
 */
public class Day01_BaiduNewsCrawler {

    /** Host prepended to the relative category links found on the entry page. */
    private static final String BASE_URL = "http://top.baidu.com";

    /**
     * Runs the crawl inside a single SQL session.
     *
     * <p>The session is committed only when every page was processed; on any failure it
     * is rolled back, and it is always closed.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or a row cannot be inserted
     */
    public static void main(String[] args) throws Exception {
        SqlSession sqlsessionLocal = MybatisHelper.getSqlsessionLocal();
        try {
            // Mapper for the table the rows are written to.
            BaiduNewsMapper baiduNewsMapper = sqlsessionLocal.getMapper(BaiduNewsMapper.class);

            // Entry page of the crawl.
            String url = BASE_URL + "/buzz?b=1";
            Document document = Jsoup.connect(url).get();
            getElmentAndInsert(document, baiduNewsMapper, "实时热点");

            // Category list. The loop starts at index 2, i.e. the first two <li>
            // entries are skipped (presumably non-category entries — confirm against the page).
            Elements lis = document.select("#flist li");
            for (int i = 2; i < lis.size(); i++) {
                Elements links = lis.get(i).select("a");
                String title = links.attr("title");
                String relativeHref = links.attr("href");
                if (relativeHref.isEmpty()) {
                    // No link in this entry: substring(1) would throw, so skip it.
                    continue;
                }
                // Drop the first character of the href before appending it to the host.
                String href = BASE_URL + relativeHref.substring(1);
                Document categoryPage = Jsoup.connect(href).get();
                getElmentAndInsert(categoryPage, baiduNewsMapper, title);
            }

            sqlsessionLocal.commit();
        } catch (Exception e) {
            // Do not leave a half-written crawl behind.
            sqlsessionLocal.rollback();
            throw e;
        } finally {
            sqlsessionLocal.close();
        }
    }

    /**
     * Extracts the ranking rows from {@code document} and inserts one {@link BaiduNews}
     * per row.
     *
     * <p>Rows whose {@code .last} cell does not contain a plain integer are skipped.
     *
     * @param document        parsed ranking page
     * @param baiduNewsMapper mapper used to insert the rows
     * @param type            value stored in the {@code type} property of every inserted row
     */
    public static void getElmentAndInsert(Document document, BaiduNewsMapper baiduNewsMapper, String type) {
        Elements trs = document.select("#main > div.mainBody > div > table tr");
        for (Element tr : trs) {
            String keyword = tr.select(".list-title").text();
            String clazz = tr.select(".tc").text();
            String tempNum = tr.select(".last").text();

            int num;
            try {
                num = Integer.parseInt(tempNum);
            } catch (NumberFormatException ignored) {
                // Not a data row (no numeric value in the .last cell): nothing to store.
                continue;
            }

            BaiduNews baiduNews = new BaiduNews();
            baiduNews.setKeyword(keyword);
            baiduNews.setType(type);
            baiduNews.setClazz(clazz);
            baiduNews.setSerchNm(num);
            baiduNewsMapper.insert(baiduNews);
        }
    }

}

pom.xml



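<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>edu.xawl</groupId>
    <artifactId>crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.20</version>
        </dependency>

        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>3.4.6</version>
        </dependency>

        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper</artifactId>
            <version>4.0.4</version>
        </dependency>

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.2</version>
        </dependency>
    </dependencies>
</project>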

BaiduNewsMapper

mapper中实现了单表的增删改查,我们直接继承就可以使用

package edu.xawl.mapper;

import edu.xawl.po.BaiduNews;
import tk.mybatis.mapper.common.Mapper;

/**
 * Mapper for {@link BaiduNews}.
 *
 * <p>All single-table operations are inherited from the generic {@code Mapper}; the type
 * argument tells it which entity this mapper works on, so it must not be left raw.
 */
public interface BaiduNewsMapper extends Mapper<BaiduNews> {
}

BaiduNews   po对象

package edu.xawl.po;

import lombok.Data;

import javax.persistence.Column;
import javax.persistence.Table;

/**
 * Persistent object for one crawled ranking entry, mapped to the {@code baidu_news} table.
 *
 * <p>Accessors are not written by hand; the class relies on Lombok's {@code @Data}.
 */
@Data
@Table(name = "baidu_news")
public class BaiduNews {
    // NOTE(review): presumably the primary key; no @Id annotation is declared on any
    // field — confirm whether the mapper needs one.
    private Integer id;
    // Keyword text of the entry.
    private String keyword;
    // Label of the ranking list the entry was taken from.
    private String type;
    // Text of the entry's ".tc" cell on the crawled page.
    private String clazz;
    // Numeric value of the entry, stored in column "search_num".
    @Column(name = "search_num")
    private Integer serchNm;
}

MybatisHelper

package edu.xawl.utils;

import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import tk.mybatis.mapper.common.Mapper;
import tk.mybatis.mapper.common.MySqlMapper;
import tk.mybatis.mapper.entity.Config;
import tk.mybatis.mapper.mapperhelper.MapperHelper;


/**
 * Creates the shared {@link SqlSessionFactory} once, when the class is loaded, and hands
 * out sessions from it.
 *
 * <p>The factory is built from {@code mybatis-config.xml} using the environment named
 * {@code "local"}, and the generic {@code Mapper} / {@code MySqlMapper} interfaces are
 * registered on its configuration.
 */
public class MybatisHelper {

    private static final SqlSessionFactory sqlSessionFactory = buildSessionFactory();

    /**
     * Builds and configures the session factory.
     *
     * <p>Any failure is rethrown with its cause, so class initialisation fails at the real
     * problem instead of leaving a {@code null} factory behind that would only surface
     * later as a {@code NullPointerException} in {@link #getSqlsessionLocal()}.
     */
    private static SqlSessionFactory buildSessionFactory() {
        try {
            SqlSessionFactory factory = new SqlSessionFactoryBuilder()
                    .build(Resources.getResourceAsReader("mybatis-config.xml"), "local");

            MapperHelper mapperHelper = new MapperHelper();
            Config config = new Config();
            config.setEnableMethodAnnotation(true);
            mapperHelper.setConfig(config);
            mapperHelper.registerMapper(Mapper.class);
            mapperHelper.registerMapper(MySqlMapper.class);
            // The configuration is available from the factory itself; no session has to
            // be opened just to reach it.
            mapperHelper.processConfiguration(factory.getConfiguration());

            return factory;
        } catch (Exception e) {
            throw new IllegalStateException("Failed to initialise MyBatis from mybatis-config.xml", e);
        }
    }

    /**
     * Opens a new session on the shared factory.
     *
     * @return a new {@link SqlSession}; the caller is responsible for committing and closing it
     */
    public static SqlSession getSqlsessionLocal() {
        return sqlSessionFactory.openSession();
    }

}

mybatis-config.xml




    
    
        
        
    

    
        
    

    
    
        
            
            
                
                
                
                
            
        
    

    
        
    


ok,完成

你可能感兴趣的:(java)