Java爬虫初体验:简单抓取IT之家热评(整合Spring Boot+Elasticsearch+Redis+Mybatis)

爬取主程序

使用Jsoup解析网页源代码

@Component
public class WebCrawler {

    private static final String encoding = "utf-8";

    @Autowired
    private HotCommentMapper hotCommentMapper;
    @Autowired
    private RedisService redisService;
    @Autowired
    private EsService esService;

    private static boolean done = false;
    private static final int THREAD_NUM = 15;
    private static AtomicInteger page = new AtomicInteger(0);
    private static List breakpoints;

    /**
     * 定时爬取更新
     */
    //@Scheduled(initialDelay = 1000, fixedRate = 1000*60*60*24*3)
    public void start(){
        done = false;
        System.out.println("开始爬取:"+System.currentTimeMillis());
        for (int i = 0;inew Thread(new Runnable() {
                @Override
                public void run() {
                    while (!done) {
                        int p = page.incrementAndGet();
                        crawl(p);
                    }
                    System.out.println(Thread.currentThread().getName()+":结束:"+System.currentTimeMillis());
                }
            },"Thread--"+i).start();
        }
    }

    public synchronized void stop(){
        done = true;
        redisService.listRemove("ithome:breakpoints");
        redisService.listAdd("ithome:breakpoints",breakpoints);
    }

    /**
     * @param page :页码
     */
    public void crawl(int page){
        String url = "https://www.ithome.com/ithome/getajaxdata.aspx?" +
                "page="+page+"&type=indexpage&randnum="+Math.random();
        String src = getHtmlSrc(url,encoding);
        List links = getArticleLinks(src);
        if (links.size()<=0){
            stop();
            return ;
        }
        //不知还有没更好的方法判断最近一次抓取的位置?
        if(redisService.containsValue("ithome:breakpoints",links)){
            stop();
            return ;
        }
        //保存第一页链接做结束点
        if (page == 1){
            breakpoints = links;
        }
        for (String link:links){
            parseAndSaveHotComments(link);
        }
    }

    /**
     *
     * @param url
     * @param encoding 编码
     * @return 网页源代码
     */
    public String getHtmlSrc(String url,String encoding){
        StringBuilder src = new StringBuilder();
        InputStreamReader isr = null;
        try {
            URL urlObj = new URL(url);//建立网络链接
            URLConnection urlConn = urlObj.openConnection();//打开链接
            isr = new InputStreamReader(urlConn.getInputStream(),encoding);//建立文件输入流
            BufferedReader reader = new BufferedReader(isr);//建立缓冲
            String line = null;
            while ((line = reader.readLine())!=null){
                src.append(line);
            }
        }catch (Exception e){
            e.printStackTrace();
        }finally {
            try {
                if (isr != null){
                    isr.close();
                }
            }catch (Exception e){
                e.printStackTrace();
            }
        }
        return src.toString();
    }

    /**
     * @param srcCode
     * @return 解析源代码,获取文章链接
     */
    public List getArticleLinks(String srcCode){
        List links = new ArrayList();
        Document document = Jsoup.parse(srcCode);
        Elements articleEls = document.select("h2>a");

        for (Element el:articleEls){
            String href = el.attr("href");
            links.add(href);
        }
        return links;
    }

    /**
     *
     * @param articleHref 文章链接
     * @description 使用Jsoup解析热评内容并保存
     */
    public void parseAndSaveHotComments(String articleHref){
        String articlePage = getHtmlSrc(articleHref,encoding);
        Document document = Jsoup.parse(articlePage);
        Element iframeEl = document.getElementById("ifcomment");
        if(iframeEl == null) {
            return ;
        }
        String commentHref = iframeEl.attr("src");//评论页面URL

        //获取文章ID
        document = Jsoup.parse(getHtmlSrc("http:"+commentHref,encoding));
        Element articleIdInput = document.getElementById("newsid");
        String articleId = articleIdInput.attr("value");

        //获取热评数据并解析
        String link = "http://dyn.ithome.com/ithome/getajaxdata.aspx?newsID="+articleId+"&type=hotcomment";
        String hotCommentPage = getHtmlSrc(link,encoding);
        document = Jsoup.parse(hotCommentPage);
        Elements hotCommentEls = document.select("li.entry");

        HotComment hotComment = null;
        for (Element el:hotCommentEls){
            hotComment = new HotComment();
            String  commontId = el.attr("cid");
            String user = el.select("strong.nick a").text();
            String comment = el.getElementsByTag("P").text();
            int up = getNumber(el.select("a.s").text());
            int down = getNumber(el.select("a.a").text());
            String posandtime = el.select("span.posandtime").text();
            String mobile = el.select("span.mobile a").text();

            hotComment = new HotComment();
            hotComment.setCommentId(commontId);
            hotComment.setArticleUrl(articleHref);
            hotComment.setUser(user);
            hotComment.setComment(comment);
            hotComment.setUp(up);
            hotComment.setDown(down);
            hotComment.setPosandtime(posandtime);
            hotComment.setMobile(mobile);

            hotCommentMapper.addHotComment(hotComment);//保存数据至数据库,这里保不保存其实都可以
            esService.addHotComment(hotComment);//添加索引
            if(hotComment.getUp()>=2500){
                redisService.rankAdd("ithome:hotrank",hotComment);//缓存大于2500个赞的热评
            }

            //System.out.println(hotComment.toString());
        }
    }

    /**
     *
     * @param str
     * @return 解析"()"中的数字
     */
    public int getNumber(String str){
        Pattern pattern = Pattern.compile("(?<=\\()(.+?)(?=\\))");
        Matcher matcher = pattern.matcher(str);

        if(matcher.find()){
            return Integer.parseInt(matcher.group());
        }
        return 0;
    }

//    public static void main(String [] args){
//        new WebCrawler().start();
//    }
}

整合Elasticsearch

相关依赖:


        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>

application.yml配置:

spring:
  data:
  ##elasticsearch配置
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300

代码部分:

Repository

/**
 * Elasticsearch repository for {@link HotComment} documents, keyed by their {@code Long} id.
 */
public interface EsRepository extends ElasticsearchRepository<HotComment,Long>{
    /**
     * Finds the hot comments whose {@code user} field matches the given value.
     *
     * @param user user name to search for
     * @return the matching comments
     */
    List<HotComment> findByUser(String user);
}

实体

/**
 * A single hot comment scraped from an article; mapped to the Elasticsearch index
 * {@code hotcomments}.
 */
@Document(indexName="hotcomments",type="hotcomment",indexStoreType="fs",shards=5,replicas=1,refreshInterval="-1")
public class HotComment implements Serializable{

    private static final long serialVersionUID = -4249699545233058684L;

    /** Identifier of this record. */
    @Id
    private Long id;
    /** Comment id as found on the source page. */
    private String commentId;
    /** Name of the commenting user. */
    private String user;
    /** Text of the comment. */
    private String comment;
    /** Number of up-votes. */
    private int up;
    /** Number of down-votes. */
    private int down;
    /** Location and time text shown with the comment. */
    private String posandtime;
    /** Device text shown with the comment. */
    private String mobile;
    /** URL of the article the comment belongs to. */
    private String articleUrl;

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getCommentId() {
        return commentId;
    }

    public void setCommentId(String commentId) {
        this.commentId = commentId;
    }

    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getComment() {
        return comment;
    }

    public void setComment(String comment) {
        this.comment = comment;
    }

    public int getUp() {
        return up;
    }

    public void setUp(int up) {
        this.up = up;
    }

    public int getDown() {
        return down;
    }

    public void setDown(int down) {
        this.down = down;
    }

    public String getPosandtime() {
        return posandtime;
    }

    public void setPosandtime(String posandtime) {
        this.posandtime = posandtime;
    }

    public String getMobile() {
        return mobile;
    }

    public void setMobile(String mobile) {
        this.mobile = mobile;
    }

    public String getArticleUrl() {
        return articleUrl;
    }

    public void setArticleUrl(String articleUrl) {
        this.articleUrl = articleUrl;
    }

    /**
     * Returns all fields as {@code HotComment{name='value', ...}}.
     */
    @Override
    public String toString() {
        return "HotComment{" +
                "id='" + id + '\'' +
                // The separator before commentId was missing, fusing it with the id field.
                ", commentId='" + commentId + '\'' +
                ", user='" + user + '\'' +
                ", comment='" + comment + '\'' +
                ", up=" + up +
                ", down=" + down +
                ", posandtime='" + posandtime + '\'' +
                ", mobile='" + mobile + '\'' +
                ", articleUrl='" + articleUrl + '\'' +
                '}';
    }
}

Service

/**
 * Service wrapping {@link EsRepository}: indexes hot comments and searches them by user.
 */
@Service
public class EsService {
    @Autowired
    private EsRepository esRepository;

    /**
     * Saves the comment through the Elasticsearch repository.
     *
     * @param hotComment comment to index
     */
    public void addHotComment(HotComment hotComment){
        esRepository.save(hotComment);
    }

    /**
     * Searches comments by user name; results are cached in the {@code ithome:hotcomments}
     * cache under the key {@code ithome:user:<user>}.
     *
     * @param user user name to search for
     * @return the matching comments
     */
    @Cacheable(value = "ithome:hotcomments", key = "'ithome:user:'+#user")
    public List<HotComment> findByUser(String user){
        return esRepository.findByUser(user);
    }
}

整合Redis

相关依赖


        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>

相关配置

application.yml配置

spring:
  ##redis配置
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0

缓存相关配置

/**
 * Redis-backed cache configuration: key generator, cache manager and a template that stores
 * values as JSON.
 */
@Configuration
@EnableCaching
public class RedisConfig {

    /**
     * Builds cache keys by concatenating the target class name, the method name and the string
     * form of every argument.
     */
    @Bean
    public KeyGenerator keyGenerator(){
        return new KeyGenerator(){

            @Override
            public Object generate(Object o, Method method, Object... objects) {
                StringBuilder sb = new StringBuilder();
                sb.append(o.getClass().getName());
                sb.append(method.getName());
                for(Object obj : objects){
                    // append(Object) renders null as "null" instead of throwing.
                    sb.append(obj);
                }
                return sb.toString();
            }
        };
    }

    /**
     * Creates the cache manager and gives the {@code ithome:hotcomments} cache a 24-hour
     * expiry (the value is in seconds).
     */
    @Bean
    public CacheManager cacheManager(RedisTemplate redisTemplate){
        RedisCacheManager redisCacheManager = new RedisCacheManager(redisTemplate);
        Map<String, Long> expires = new HashMap<>();
        expires.put("ithome:hotcomments",60*60*24L);
        // Without this call the per-cache expiry above was silently ignored.
        redisCacheManager.setExpires(expires);
        return redisCacheManager;
    }

    /**
     * Creates a template with String keys whose values are serialized as JSON by Jackson.
     */
    @Bean
    public RedisTemplate redisTemplate(RedisConnectionFactory factory){
        StringRedisTemplate template = new StringRedisTemplate(factory);
        Jackson2JsonRedisSerializer<Object> jsonSerializer = new Jackson2JsonRedisSerializer<>(Object.class);
        ObjectMapper om = new ObjectMapper();
        om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);
        // NOTE(review): default typing embeds class names in the JSON so values can be read back
        // as their concrete type; it is unsafe if untrusted parties can write to this Redis.
        om.enableDefaultTyping(ObjectMapper.DefaultTyping.NON_FINAL);
        jsonSerializer.setObjectMapper(om);
        template.setValueSerializer(jsonSerializer);
        template.afterPropertiesSet();

        return template;
    }
}

代码部分:

Service

/**
 * Thin service over {@code RedisTemplate} for the hot-comment ranking (sorted set) and the
 * crawler breakpoints (list).
 */
@Service
public class RedisService {

    // Kept as a raw type so that injection resolves exactly as before.
    @Autowired
    private RedisTemplate redisTemplate;

    /**
     * Adds a hot comment to the sorted set, scored by its up-vote count.
     *
     * @param key        sorted-set key
     * @param hotComment comment to add
     */
    @SuppressWarnings("unchecked") // raw RedisTemplate
    public void rankAdd(String key, HotComment hotComment){
        redisTemplate.opsForZSet().add(key,hotComment,hotComment.getUp());
    }

    /**
     * Returns the {@code top} highest-scored members of the sorted set, best first.
     *
     * <p>The previous implementation used an ascending {@code range(0, top)}, which returned
     * the lowest-scored members and one element too many.
     *
     * @param key sorted-set key
     * @param top maximum number of members to return
     * @return at most {@code top} members; empty when {@code top <= 0}
     */
    public Set rankGet(String key,int top){
        if (top <= 0) {
            return java.util.Collections.emptySet();
        }
        return redisTemplate.opsForZSet().reverseRange(key,0,top - 1);
    }

    /**
     * Appends the values to the list at {@code key}; used to save the position of the most
     * recent crawl. A {@code null} or empty list is ignored.
     *
     * @param key    list key
     * @param values values to append
     */
    @SuppressWarnings("unchecked") // raw RedisTemplate
    public void listAdd(String key,List values){
        if (values == null || values.isEmpty()) {
            return;
        }
        redisTemplate.opsForList().rightPushAll(key,values);
    }

    /**
     * Deletes the given key.
     *
     * @param key key to delete
     */
    @SuppressWarnings("unchecked") // raw RedisTemplate
    public void listRemove(String key){
        redisTemplate.delete(key);
    }

    /**
     * Tells whether the list stored at {@code key} contains any of the given values; used to
     * detect that the crawler reached the previous end position.
     *
     * @param key    list key
     * @param values candidate values
     * @return {@code true} if at least one value is present in the stored list
     */
    public boolean containsValue(String key,List values){
        if (values == null || values.isEmpty()) {
            return false;
        }
        List stored = redisTemplate.opsForList().range(key,0,-1);
        if (stored == null || stored.isEmpty()) {
            return false;
        }
        for (Object val : values){
            if(stored.contains(val)){
                return true;
            }
        }
        return false;
    }
}

整合Mybatis

依赖配置


        
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.1</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.24</version>
        </dependency>

数据源配置

/**
 * Declares the JDBC data source and enables MyBatis mapper scanning.
 */
@Configuration
@MapperScan(basePackages = "com.crazy.ithomecrawler.mybatis.mapper")
public class DatabaseConfig {

    /**
     * Builds a Druid connection pool for the local {@code ithome} MySQL database.
     * Driver, URL and credentials are hard-coded here.
     *
     * @return the configured data source
     */
    @Bean
    public DataSource druidDataSource() {
        DruidDataSource pool = new DruidDataSource();
        pool.setDriverClassName("com.mysql.jdbc.Driver");
        pool.setUrl("jdbc:mysql://localhost:3306/ithome");
        pool.setUsername("root");
        pool.setPassword("mysql");
        return pool;
    }
}

代码部分:

Mapper

/**
 * MyBatis mapper for the {@code hot_comment} table.
 */
public interface HotCommentMapper {

    /**
     * Inserts one hot comment; the generated key of column {@code id} is written back to the
     * {@code id} property of the argument.
     *
     * @param hotComment comment to insert
     */
    @Options(useGeneratedKeys = true,keyProperty = "id",keyColumn = "id")
    @Insert("INSERT INTO hot_comment(vCommentId,vUser,vComment,iUp,iDown,vPosandTime,vMobile,vArticleUrl) VALUES(#{commentId},#{user},#{comment},#{up},#{down},#{posandtime},#{mobile},#{articleUrl})")
    void addHotComment(HotComment hotComment);
}

Controller

/**
 * Web endpoints under {@code /ithome}; both render the {@code search} view with a
 * {@code comments} model attribute.
 */
@Controller
@RequestMapping("/ithome")
public class HotCommentController {

    @Autowired
    private RedisService redisService;
    @Autowired
    private EsService esService;

    /**
     * Home page: shows the ranking read from the {@code ithome:hotrank} sorted set.
     *
     * @return the {@code search} view populated with the ranked comments
     */
    @GetMapping("/index")
    public ModelAndView index() {
        ModelAndView view = new ModelAndView("search");
        view.addObject("comments", redisService.rankGet("ithome:hotrank", 50));
        return view;
    }

    /**
     * Search page: looks up comments by user name.
     *
     * @param keyword user name taken from the path
     * @return the {@code search} view populated with the matching comments
     */
    @GetMapping("/search/{keyword}")
    public ModelAndView search(@PathVariable("keyword") String keyword) {
        ModelAndView view = new ModelAndView("search");
        view.addObject("comments", esService.findByUser(keyword));
        return view;
    }
}

主程序

/**
 * Spring Boot entry point; also enables scheduling and Elasticsearch repositories.
 */
@EnableScheduling
@EnableElasticsearchRepositories
@SpringBootApplication
public class IthomecrawlerApplication {

    /**
     * Boots the application context.
     *
     * @param args command-line arguments passed on to Spring Boot
     */
    public static void main(final String[] args) {
        SpringApplication.run(IthomecrawlerApplication.class, args);
    }
}

完整application.yml文件

#端口号
server:
  port: 8081

spring:
  data:
  ##elasticsearch配置
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
  ##redis配置
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
  ##freemarker配置
  freemarker:
  ##是否允许属性覆盖
    allow-request-override: false
    allow-session-override: false
    cache: true
    check-template-location: true
    content-type: text/html
  ##暴露request属性
    expose-request-attributes: false
    expose-session-attributes: false
    expose-spring-macro-helpers: false
    suffix: .ftl
    template-loader-path: classpath:/templates/
    request-context-attribute: request
    settings:
      classic_compatible: true
      locale: zh_CN
      date_format: yyyy-MM-dd
      time_format: HH:mm:ss
      datetime_format: yyyy-MM-dd HH:mm:ss

完整pom.xml文件


<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>comcrazy</groupId>
    <artifactId>ithomecrawler</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>ithomecrawler</name>
    <description>ITHome Crawler.</description>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.6.RELEASE</version>
        <relativePath/>
    </parent>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>

        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.1</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.24</version>
        </dependency>

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>


        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-freemarker</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <finalName>ithomecrawler</finalName>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>


</project>

完整代码

你可能感兴趣的:(Backend,Mybatis,Redis,Freemarker,Spring,Boot,Elasticsearch)