@Component
public class WebCrawler {
private static final String encoding = "utf-8";
@Autowired
private HotCommentMapper hotCommentMapper;
@Autowired
private RedisService redisService;
@Autowired
private EsService esService;
private static boolean done = false;
private static final int THREAD_NUM = 15;
private static AtomicInteger page = new AtomicInteger(0);
private static List breakpoints;
/**
* 定时爬取更新
*/
//@Scheduled(initialDelay = 1000, fixedRate = 1000*60*60*24*3)
public void start(){
done = false;
System.out.println("开始爬取:"+System.currentTimeMillis());
for (int i = 0;inew Thread(new Runnable() {
@Override
public void run() {
while (!done) {
int p = page.incrementAndGet();
crawl(p);
}
System.out.println(Thread.currentThread().getName()+":结束:"+System.currentTimeMillis());
}
},"Thread--"+i).start();
}
}
public synchronized void stop(){
done = true;
redisService.listRemove("ithome:breakpoints");
redisService.listAdd("ithome:breakpoints",breakpoints);
}
/**
* @param page :页码
*/
public void crawl(int page){
String url = "https://www.ithome.com/ithome/getajaxdata.aspx?" +
"page="+page+"&type=indexpage&randnum="+Math.random();
String src = getHtmlSrc(url,encoding);
List links = getArticleLinks(src);
if (links.size()<=0){
stop();
return ;
}
//不知还有没更好的方法判断最近一次抓取的位置?
if(redisService.containsValue("ithome:breakpoints",links)){
stop();
return ;
}
//保存第一页链接做结束点
if (page == 1){
breakpoints = links;
}
for (String link:links){
parseAndSaveHotComments(link);
}
}
/**
*
* @param url
* @param encoding 编码
* @return 网页源代码
*/
public String getHtmlSrc(String url,String encoding){
StringBuilder src = new StringBuilder();
InputStreamReader isr = null;
try {
URL urlObj = new URL(url);//建立网络链接
URLConnection urlConn = urlObj.openConnection();//打开链接
isr = new InputStreamReader(urlConn.getInputStream(),encoding);//建立文件输入流
BufferedReader reader = new BufferedReader(isr);//建立缓冲
String line = null;
while ((line = reader.readLine())!=null){
src.append(line);
}
}catch (Exception e){
e.printStackTrace();
}finally {
try {
if (isr != null){
isr.close();
}
}catch (Exception e){
e.printStackTrace();
}
}
return src.toString();
}
/**
* @param srcCode
* @return 解析源代码,获取文章链接
*/
public List getArticleLinks(String srcCode){
List links = new ArrayList();
Document document = Jsoup.parse(srcCode);
Elements articleEls = document.select("h2>a");
for (Element el:articleEls){
String href = el.attr("href");
links.add(href);
}
return links;
}
/**
*
* @param articleHref 文章链接
* @description 使用Jsoup解析热评内容并保存
*/
public void parseAndSaveHotComments(String articleHref){
String articlePage = getHtmlSrc(articleHref,encoding);
Document document = Jsoup.parse(articlePage);
Element iframeEl = document.getElementById("ifcomment");
if(iframeEl == null) {
return ;
}
String commentHref = iframeEl.attr("src");//评论页面URL
//获取文章ID
document = Jsoup.parse(getHtmlSrc("http:"+commentHref,encoding));
Element articleIdInput = document.getElementById("newsid");
String articleId = articleIdInput.attr("value");
//获取热评数据并解析
String link = "http://dyn.ithome.com/ithome/getajaxdata.aspx?newsID="+articleId+"&type=hotcomment";
String hotCommentPage = getHtmlSrc(link,encoding);
document = Jsoup.parse(hotCommentPage);
Elements hotCommentEls = document.select("li.entry");
HotComment hotComment = null;
for (Element el:hotCommentEls){
hotComment = new HotComment();
String commontId = el.attr("cid");
String user = el.select("strong.nick a").text();
String comment = el.getElementsByTag("P").text();
int up = getNumber(el.select("a.s").text());
int down = getNumber(el.select("a.a").text());
String posandtime = el.select("span.posandtime").text();
String mobile = el.select("span.mobile a").text();
hotComment = new HotComment();
hotComment.setCommentId(commontId);
hotComment.setArticleUrl(articleHref);
hotComment.setUser(user);
hotComment.setComment(comment);
hotComment.setUp(up);
hotComment.setDown(down);
hotComment.setPosandtime(posandtime);
hotComment.setMobile(mobile);
hotCommentMapper.addHotComment(hotComment);//保存数据至数据库,这里保不保存其实都可以
esService.addHotComment(hotComment);//添加索引
if(hotComment.getUp()>=2500){
redisService.rankAdd("ithome:hotrank",hotComment);//缓存大于2500个赞的热评
}
//System.out.println(hotComment.toString());
}
}
/**
*
* @param str
* @return 解析"()"中的数字
*/
public int getNumber(String str){
Pattern pattern = Pattern.compile("(?<=\\()(.+?)(?=\\))");
Matcher matcher = pattern.matcher(str);
if(matcher.find()){
return Integer.parseInt(matcher.group());
}
return 0;
}
// public static void main(String [] args){
// new WebCrawler().start();
// }
}
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
spring:
  data:
    ##elasticsearch配置
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
/**
 * Spring Data Elasticsearch repository for {@link HotComment} documents keyed by {@code Long}.
 */
public interface EsRepository extends ElasticsearchRepository<HotComment,Long>{
    /**
     * Finds comments by their {@code user} field (query derived from the method name).
     *
     * @param user user name to search for
     * @return the matching comments
     */
    public List<HotComment> findByUser(String user);
}
/**
 * A hot comment scraped from an article; mapped to the {@code hotcomments} index.
 */
@Document(indexName="hotcomments",type="hotcomment",indexStoreType="fs",shards=5,replicas=1,refreshInterval="-1")
public class HotComment implements Serializable{
    private static final long serialVersionUID = -4249699545233058684L;

    @Id
    private Long id;            // hot comment number
    private String commentId;
    private String user;        // user
    private String comment;     // content
    private int up;             // up-vote count
    private int down;           // down-vote count
    private String posandtime;  // location and time
    private String mobile;      // device
    private String articleUrl;  // URL of the source article

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public String getCommentId() {
        return commentId;
    }

    public void setCommentId(String commentId) {
        this.commentId = commentId;
    }

    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getComment() {
        return comment;
    }

    public void setComment(String comment) {
        this.comment = comment;
    }

    public int getUp() {
        return up;
    }

    public void setUp(int up) {
        this.up = up;
    }

    public int getDown() {
        return down;
    }

    public void setDown(int down) {
        this.down = down;
    }

    public String getPosandtime() {
        return posandtime;
    }

    public void setPosandtime(String posandtime) {
        this.posandtime = posandtime;
    }

    public String getMobile() {
        return mobile;
    }

    public void setMobile(String mobile) {
        this.mobile = mobile;
    }

    public String getArticleUrl() {
        return articleUrl;
    }

    public void setArticleUrl(String articleUrl) {
        this.articleUrl = articleUrl;
    }

    /**
     * Returns all fields in {@code name='value'} form, separated by {@code ", "}.
     */
    @Override
    public String toString() {
        return "HotComment{" +
                "id='" + id + '\'' +
                // The separator before commentId was missing, fusing it with the id field.
                ", commentId='" + commentId + '\'' +
                ", user='" + user + '\'' +
                ", comment='" + comment + '\'' +
                ", up=" + up +
                ", down=" + down +
                ", posandtime='" + posandtime + '\'' +
                ", mobile='" + mobile + '\'' +
                ", articleUrl='" + articleUrl + '\'' +
                '}';
    }
}
/**
 * Service facade over {@code EsRepository} for indexing and searching hot comments.
 */
@Service
public class EsService {
    @Autowired
    private EsRepository esRepository;

    /**
     * Saves the comment through the repository.
     *
     * @param hotComment comment to index
     */
    public void addHotComment(HotComment hotComment){
        esRepository.save(hotComment);
    }

    /**
     * Searches comments by user; the result is cached in the {@code ithome:hotcomments} cache
     * under the key {@code ithome:user:<user>}.
     *
     * @param user user name to search for
     * @return the comments returned by the repository
     */
    @Cacheable(value = "ithome:hotcomments", key = "'ithome:user:'+#user")
    public List<HotComment> findByUser(String user){
        return esRepository.findByUser(user);
    }
}
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
spring:
  ##redis配置
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
/**
 * Redis cache configuration: key generator, cache manager and a JSON-serializing template.
 */
@Configuration
@EnableCaching
public class RedisConfig {

    /**
     * Key generator that concatenates the target class name, the method name and the string
     * form of every argument.
     */
    @Bean
    public KeyGenerator keyGenerator(){
        return new KeyGenerator(){
            @Override
            public Object generate(Object o, Method method, Object... objects) {
                StringBuilder sb = new StringBuilder();
                sb.append(o.getClass().getName());
                sb.append(method.getName());
                for(Object obj : objects){
                    // append(Object) renders null as "null" instead of throwing on toString().
                    sb.append(obj);
                }
                return sb.toString();
            }
        };
    }

    /**
     * Cache manager backed by the given template, with a 24-hour expiration (in seconds) for
     * the {@code ithome:hotcomments} cache.
     */
    @Bean
    public CacheManager cacheManager(RedisTemplate redisTemplate){
        RedisCacheManager redisCacheManager = new RedisCacheManager(redisTemplate);
        // redisCacheManager.setDefaultExpiration(60*60*24); // default expiration, in seconds
        Map<String, Long> expires = new HashMap<>();
        expires.put("ithome:hotcomments",60*60*24L);
        // The per-cache expirations were built but never handed to the manager.
        redisCacheManager.setExpires(expires);
        return redisCacheManager;
    }

    /**
     * String-keyed template whose values are serialized as JSON by Jackson.
     */
    @Bean
    public RedisTemplate redisTemplate(RedisConnectionFactory factory){
        StringRedisTemplate template = new StringRedisTemplate(factory);
        Jackson2JsonRedisSerializer<Object> jackson2JsonRedisSerializer =
                new Jackson2JsonRedisSerializer<>(Object.class);
        ObjectMapper om = new ObjectMapper();
        om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);
        // NOTE(review): default typing embeds class names in the stored JSON; this is only safe
        // while the Redis contents are trusted - confirm for this deployment.
        om.enableDefaultTyping(ObjectMapper.DefaultTyping.NON_FINAL);
        jackson2JsonRedisSerializer.setObjectMapper(om);
        template.setValueSerializer(jackson2JsonRedisSerializer);
        template.afterPropertiesSet();
        return template;
    }
}
/**
 * Redis helpers for the hot-comment ranking (sorted set) and the crawl breakpoint list.
 */
@Service
public class RedisService {
    @Autowired
    private RedisTemplate redisTemplate;

    /**
     * Adds a hot comment to the sorted set, scored by its up-vote count.
     *
     * @param key        sorted-set key
     * @param hotComment comment to add
     */
    public void rankAdd(String key, HotComment hotComment){
        ZSetOperations zSetOperations = redisTemplate.opsForZSet();
        zSetOperations.add(key,hotComment,hotComment.getUp());
    }

    /**
     * Returns the {@code top} highest-scored members of the sorted set, best first.
     *
     * @param key sorted-set key
     * @param top number of records to return
     * @return at most {@code top} members; an empty set when {@code top <= 0}
     */
    public Set rankGet(String key,int top){
        if (top <= 0) {
            return java.util.Collections.emptySet();
        }
        ZSetOperations zSetOperations = redisTemplate.opsForZSet();
        // range() is ascending and its end index is inclusive, which yielded the lowest
        // top+1 entries; the ranking needs the highest `top` entries.
        return zSetOperations.reverseRange(key,0,top-1);
    }

    /**
     * Appends the values to the list at {@code key}; used to store where the last crawl ended.
     * Does nothing when {@code values} is {@code null} or empty.
     *
     * @param key    list key
     * @param values values to append
     */
    public void listAdd(String key,List<String> values){
        if (values == null || values.isEmpty()) {
            // Nothing to push.
            return;
        }
        ListOperations listOperations = redisTemplate.opsForList();
        listOperations.rightPushAll(key,values);
    }

    /**
     * Deletes the given key.
     *
     * @param key key to delete
     */
    public void listRemove(String key){
        redisTemplate.delete(key);
    }

    /**
     * Tells whether any of {@code values} is present in the list stored at {@code key}; used to
     * detect that the crawl reached its end position.
     *
     * @param key    list key
     * @param values candidates to look for
     * @return {@code true} when at least one candidate is in the stored list
     */
    public boolean containsValue(String key,List<String> values){
        if (values == null || values.isEmpty()) {
            return false;
        }
        ListOperations listOperations = redisTemplate.opsForList();
        List list = listOperations.range(key,0,-1);
        if (list == null || list.isEmpty()) {
            return false;
        }
        for (String val : values){
            if(list.contains(val)){
                return true;
            }
        }
        return false;
    }
}
<dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>1.3.1</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <scope>runtime</scope>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
    <version>1.0.24</version>
</dependency>
/**
 * Database configuration: registers the Druid data source and scans the MyBatis mappers.
 */
@Configuration
@MapperScan(basePackages = "com.crazy.ithomecrawler.mybatis.mapper")
public class DatabaseConfig {

    /**
     * Builds the Druid-backed data source for the local {@code ithome} MySQL database.
     *
     * @return the configured data source
     */
    @Bean
    public DataSource druidDataSource(){
        // NOTE(review): connection settings and credentials are literals in the code.
        final DruidDataSource druid = new DruidDataSource();
        druid.setDriverClassName("com.mysql.jdbc.Driver");
        druid.setUrl("jdbc:mysql://localhost:3306/ithome");
        druid.setUsername("root");
        druid.setPassword("mysql");
        return druid;
    }
}
/**
 * MyBatis mapper for the {@code hot_comment} table.
 */
public interface HotCommentMapper {

    /**
     * Inserts one hot comment; the generated key is written back to the {@code id} property.
     *
     * @param hotComment comment to insert
     */
    @Options(useGeneratedKeys = true, keyProperty = "id", keyColumn = "id")
    @Insert("INSERT INTO hot_comment(vCommentId,vUser,vComment,iUp,iDown,vPosandTime,vMobile,vArticleUrl) VALUES(#{commentId},#{user},#{comment},#{up},#{down},#{posandtime},#{mobile},#{articleUrl})")
    void addHotComment(HotComment hotComment);
}
/**
 * Web endpoints under {@code /ithome}; both render the {@code search} view.
 */
@Controller
@RequestMapping("/ithome")
public class HotCommentController {

    @Autowired
    private RedisService redisService;

    @Autowired
    private EsService esService;

    /**
     * Home page: exposes the result of the {@code ithome:hotrank} ranking query as
     * {@code comments}.
     *
     * @return the {@code search} view with its model
     */
    @GetMapping("/index")
    public ModelAndView index(){
        return new ModelAndView("search")
                .addObject("comments", redisService.rankGet("ithome:hotrank",50));
    }

    /**
     * Search: exposes the comments found for the given user keyword as {@code comments}.
     *
     * @param keyword value taken from the request path
     * @return the {@code search} view with its model
     */
    @GetMapping("/search/{keyword}")
    public ModelAndView search(@PathVariable("keyword") String keyword){
        return new ModelAndView("search")
                .addObject("comments", esService.findByUser(keyword));
    }
}
/**
 * Spring Boot entry point; enables scheduling and Elasticsearch repositories.
 */
@EnableScheduling
@EnableElasticsearchRepositories
@SpringBootApplication
public class IthomecrawlerApplication {

    /**
     * Boots the application context with the given command-line arguments.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) {
        new SpringApplication(IthomecrawlerApplication.class).run(args);
    }
}
#端口号
server:
  port: 8081
spring:
  data:
    ##elasticsearch配置
    elasticsearch:
      cluster-name: elasticsearch
      cluster-nodes: localhost:9300
  ##redis配置
  redis:
    database: 0
    host: localhost
    port: 6379
    password: redis
    pool:
      max-active: 15
      max-wait: 1
      max-idle: 0
    timeout: 0
  ##freemarker配置
  freemarker:
    ##是否允许属性覆盖
    allow-request-override: false
    allow-session-override: false
    cache: true
    check-template-location: true
    content-type: text/html
    ##暴露request属性
    expose-request-attributes: false
    expose-session-attributes: false
    expose-spring-macro-helpers: false
    suffix: .ftl
    template-loader-path: classpath:/templates/
    request-context-attribute: request
    settings:
      classic_compatible: true
      locale: zh_CN
      date_format: yyyy-MM-dd
      time_format: HH:mm:ss
      datetime_format: yyyy-MM-dd HH:mm:ss
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>comcrazy</groupId>
    <artifactId>ithomecrawler</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>ithomecrawler</name>
    <description>ITHome Crawler.</description>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>1.5.6.RELEASE</version>
        <relativePath/>
    </parent>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>1.3.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.0.24</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-freemarker</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <finalName>ithomecrawler</finalName>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
完整代码