rank/ITEYEBlogSimilarChecker.java at master · ysc/rank · GitHub

我们如何应对这样的商业广告呢?基本思路如下:

    1、当管理员发现一篇博文为黑博文时,人工确认。

    2、将人工确认的黑博文保存到黑博文数据库。

    3、当有新博文发表时,和黑博文数据库进行相似度计算,如果相似度超过预设的阈值,则拒绝发表博文。

下面是黑博文判断程序的详细判断过程,先上最终结果:

 

判定相似性的方式一:简单共有词

阈值=Math.min(339, 340)*0.8=271.2

待发表博文和黑博文共有的词数:339

因为待发表博文和黑博文共有的词数339 大于 阈值:271.2

所以判断为 相似 ,拒绝发表!

 

判定相似性的方式二:余弦相似度

待发表博文和黑博文的余弦夹角值:0.9977658868305056

因为待发表博文和黑博文的余弦夹角值0.9977658868305056大于或等于阈值:0.8

所以判断为 相似 ,拒绝发表!

 

 

/**
  *
  * APDPlat - Application Product Development Platform
  * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
  *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  *
  */
   
  package org.seo.rank.impl;
   
  import org.apdplat.word.WordSegmenter;
  import org.apdplat.word.segmentation.Word;
  import org.jsoup.Jsoup;
  import org.jsoup.nodes.Document;
  import org.jsoup.nodes.Element;
  import org.jsoup.select.Elements;
  import org.seo.rank.SimilarChecker;
  import org.seo.rank.list.DynamicIp;
  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;
   
  import java.math.BigDecimal;
  import java.util.*;
  import java.util.concurrent.atomic.AtomicInteger;
   
  /**
  * ITEYE博文相似性检测
  * @author 杨尚川
  */
  public class ITEYEBlogSimilarChecker implements SimilarChecker{
  private static final Logger LOGGER = LoggerFactory.getLogger(ITEYEBlogSimilarChecker.class);
  private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
  private static final String ENCODING = "gzip, deflate";
  private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
  private static final String CONNECTION = "keep-alive";
  private static final String REFERER = "http://yangshangchuan.iteye.com";
  private static final String HOST = "yangshangchuan.iteye.com";
  private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
  private static final String BLOG_CSS_PATH = "html body div#page div#content.clearfix div#main div.blog_main";
  private static final String BLOG_TITLE_CSS_PATH = "div.blog_title";
  private static final String BLOG_CONTENT_CSS_PATH = "div#blog_content.blog_content";
  private static final float THRESHOLD_RATE = 0.8F;
   
  @Override
  public boolean isSimilar(String url1, String url2) {
  return similarScore(url1, url2)>=THRESHOLD_RATE;
  }
  @Override
  public double similarScore(String url1, String url2) {
  Blog blog1 = getBlog(url1);
  if(blog1!=null) {
  Blog blog2 = getBlog(url2);
  if(blog2!=null) {
  double score = score(blog1, blog2);
  //取两位小数
  score = (int)(score*100)/(double)100;
  return score;
  }
  }
  return 0;
  }
   
  private double score(Blog blog1, Blog blog2){
  //分词
  List<Word> blog1Words = WordSegmenter.seg(blog1.getTitle()+"\n"+blog1.getContent());
  List<Word> blog2Words = WordSegmenter.seg(blog2.getTitle()+"\n"+blog2.getContent());
  //词频统计
  Map<Word, AtomicInteger> blog1WordsFre = frequence(blog1Words);
  Map<Word, AtomicInteger> blog2WordsFre = frequence(blog2Words);
  //输出详细信息
  if(LOGGER.isDebugEnabled()){
  showDetail(blog1, blog1Words, blog1WordsFre);
  showDetail(blog2, blog2Words, blog2WordsFre);
  }
  //使用简单共有词判定
  return simpleScore(blog1WordsFre, blog2WordsFre);
  //使用余弦相似度判定
  //return cosScore(blog1WordsFre, blog2WordsFre);
  }
   
  /**
  * 判定相似性的方式一:简单共有词
  * @param blog1WordsFre
  * @param blog2WordsFre
  * @return
  */
  private double simpleScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre){
  //判断有几个相同的词
  AtomicInteger intersectionLength = new AtomicInteger();
  blog1WordsFre.keySet().forEach(word -> {
  if (blog2WordsFre.keySet().contains(word)) {
  intersectionLength.incrementAndGet();
  }
  });
  LOGGER.info("网页1有的词数:" + blog1WordsFre.size());
  LOGGER.info("网页2有的词数:" + blog2WordsFre.size());
  LOGGER.info("网页1和2共有的词数:" + intersectionLength.get());
  double score = intersectionLength.get()/(double)Math.min(blog1WordsFre.size(), blog2WordsFre.size());
  LOGGER.info("相似度分值="+intersectionLength.get()+"/(double)Math.min("+blog1WordsFre.size()+", "+blog2WordsFre.size()+")="+score);
  return score;
  }
   
  /**
  *
  * 判定相似性的方式二:余弦相似度
  * 余弦夹角原理:
  * 向量a=(x1,y1),向量b=(x2,y2)
  * a.b=x1x2+y1y2
  * |a|=根号[(x1)^2+(y1)^2],|b|=根号[(x2)^2+(y2)^2]
  * a,b的夹角的余弦cos=a.b/|a|*|b|=(x1x2+y1y2)/根号[(x1)^2+(y1)^2]*根号[(x2)^2+(y2)^2]
  * @param blog1WordsFre
  * @param blog2WordsFre
  */
  private double cosScore(Map<Word, AtomicInteger> blog1WordsFre, Map<Word, AtomicInteger> blog2WordsFre){
  Set<Word> words = new HashSet<>();
  words.addAll(blog1WordsFre.keySet());
  words.addAll(blog2WordsFre.keySet());
  //向量的维度为words的大小,每一个维度的权重是词频,注意的是,中文分词的时候已经去了停用词
  //a.b
  AtomicInteger ab = new AtomicInteger();
  //|a|
  AtomicInteger aa = new AtomicInteger();
  //|b|
  AtomicInteger bb = new AtomicInteger();
  //计算
  words
  .stream()
  .forEach(word -> {
  AtomicInteger x1 = blog1WordsFre.get(word);
  AtomicInteger x2 = blog2WordsFre.get(word);
  if(x1!=null && x2!=null) {
  //x1x2
  int oneOfTheDimension = x1.get() * x2.get();
  //+
  ab.addAndGet(oneOfTheDimension);
  }
  if(x1!=null){
  //(x1)^2
  int oneOfTheDimension = x1.get() * x1.get();
  //+
  aa.addAndGet(oneOfTheDimension);
  }
  if(x2!=null){
  //(x2)^2
  int oneOfTheDimension = x2.get() * x2.get();
  //+
  bb.addAndGet(oneOfTheDimension);
  }
  });
   
  double aaa = Math.sqrt(aa.get());
  double bbb = Math.sqrt(bb.get());
  //使用BigDecimal保证精确计算浮点数
  BigDecimal aabb = BigDecimal.valueOf(aaa).multiply(BigDecimal.valueOf(bbb));
  double cos = ab.get()/aabb.doubleValue();
  return cos;
  }
   
  private void showDetail(Blog blog, List<Word> blogWords, Map<Word, AtomicInteger> blogWordsFre){
  LOGGER.debug("博文URL:");
  LOGGER.debug("\t"+blog.getUrl());
  LOGGER.debug("博文标题:");
  LOGGER.debug("\t"+blog.getTitle());
  LOGGER.debug("博文内容:");
  LOGGER.debug("\t"+blog.getContent());
  LOGGER.debug("博文长度:"+blog.getContent().length());
  LOGGER.debug("博文分词结果:");
  LOGGER.debug("\t" + blogWords);
  LOGGER.debug("博文词频统计:");
  AtomicInteger c = new AtomicInteger();
  blogWordsFre
  .entrySet()
  .stream()
  .sorted((a,b)->b.getValue().get()-a.getValue().get())
  .forEach(e->LOGGER.debug("\t"+c.incrementAndGet()+""+e.getKey()+"="+e.getValue()));
  }
   
  private Map<Word, AtomicInteger> frequence(List<Word> words){
  Map<Word, AtomicInteger> fre =new HashMap<>();
  words.forEach(word->{
  fre.putIfAbsent(word, new AtomicInteger());
  fre.get(word).incrementAndGet();
  });
  return fre;
  }
   
  private Blog getBlog(String url) {
  try {
  String html = getHtml(url);
  Document doc = Jsoup.parse(html);
  Elements elements = doc.select(BLOG_CSS_PATH);
  String title = null;
  String content = null;
  for(Element element : elements){
  Elements ts = element.select(BLOG_TITLE_CSS_PATH);
  if(ts.size()==1){
  title = ts.get(0).text();
  }
  ts = element.select(BLOG_CONTENT_CSS_PATH);
  if(ts.size()==1){
  content = ts.get(0).text();
  }
  }
  if(title!=null && content!=null){
  Blog blog = new Blog();
  blog.setUrl(url);
  blog.setTitle(title);
  blog.setContent(content);
  return blog;
  }
  } catch (Exception e) {
  LOGGER.error("获取博文失败", e);
  }
  return null;
  }
  private String getHtml(String url){
  String html = getHtmlInternal(url);
  int times = 1;
  while (html==null && times<4){
  times++;
  //使用新的IP地址
  DynamicIp.toNewIp();
  html = getHtmlInternal(url);
  }
  times = 1;
  //LOGGER.debug("获取到的HTML:" +html);
  while((html.contains("非常抱歉,来自您ip的请求异常频繁")
  || html.contains("请您点击按钮解除封锁")
  || html.contains("请输入以下验证码"))
  && times<4){
  times++;
  //使用新的IP地址
  DynamicIp.toNewIp();
  html = getHtmlInternal(url);
  }
  return html;
  }
  private String getHtmlInternal(String url) {
  try {
  return Jsoup.connect(url)
  .header("Accept", ACCEPT)
  .header("Accept-Encoding", ENCODING)
  .header("Accept-Language", LANGUAGE)
  .header("Connection", CONNECTION)
  .header("Referer", REFERER)
  .header("Host", HOST)
  .header("User-Agent", USER_AGENT)
  .header("X-Forwarded-For", getRandomIp())
  .header("Proxy-Client-IP", getRandomIp())
  .header("WL-Proxy-Client-IP", getRandomIp())
  .ignoreContentType(true)
  .timeout(30000)
  .get().html();
  } catch (Exception e) {
  LOGGER.error("获取博文失败", e);
  }
  return null;
  }
  private String getRandomIp(){
  int first = new Random().nextInt(254)+1;
  //排除A类私有地址0.0.0.0--10.255.255.255
  while(first==10){
  first = new Random().nextInt(254)+1;
  }
  int second = new Random().nextInt(254)+1;
  //排除B类私有地址172.16.0.0--172.31.255.255
  while(first==172 && (second>=16 && second<=31)){
  first = new Random().nextInt(254)+1;
  second = new Random().nextInt(254)+1;
  }
  //排除C类私有地址192.168.0.0--192.168.255.255
  while(first==192 && second==168){
  first = new Random().nextInt(254)+1;
  second = new Random().nextInt(254)+1;
  }
  int third = new Random().nextInt(254)+1;
  int forth = new Random().nextInt(254)+1;
  return first+"."+second+"."+second+"."+forth;
  }
  private static class Blog{
  private String url;
  private String title;
  private String content;
   
  public String getUrl() {
  return url;
  }
   
  public void setUrl(String url) {
  this.url = url;
  }
   
  public String getTitle() {
  return title;
  }
   
  public void setTitle(String title) {
  this.title = title;
  }
   
  public String getContent() {
  return content;
  }
   
  public void setContent(String content) {
  this.content = content;
  }
  }
   
  public static void main(String[] args) {
  SimilarChecker similarChecker = new ITEYEBlogSimilarChecker();
  double score = similarChecker.similarScore("http://baidu-27233181.iteye.com/blog/2200707",
  "http://baidu-27233181.iteye.com/blog/2200706");
  LOGGER.info("相似度分值:"+score);
  }
  }

阅读全文……

你可能感兴趣的:(java,算法)