字符串多模式匹配算法:关键字过滤技术

1.MutiPatternParser.java
package com.mutiplepatternmatch;
import java.util.Vector;
/**
 *
 * @author Daniel
 *
 */
public class MutiPatternParser {
 public MutiPatternParser() {
 }
 private boolean initFlag = false;
 private UnionPatternSet unionPatternSet = new UnionPatternSet();
 private int maxIndex = (int) java.lang.Math.pow(2, 16);
 private int shiftTable[] = new int[maxIndex];
 public Vector<AtomicPattern> hashTable[] = new Vector[maxIndex];
 private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet();
 public boolean addFilterKeyWord(String keyWord, int level) {
  if (initFlag == true)
   return false;
  UnionPattern unionPattern = new UnionPattern();
  String []strArray = keyWord.split(" ");
  for (int i = 0; i < strArray.length; i++) {
   Pattern pattern = new Pattern(strArray[i]);
   AtomicPattern atomicPattern = new AtomicPattern(pattern);
   unionPattern.addNewAtomicPattrn(atomicPattern);
   unionPattern.setLevel(level);
   atomicPattern.setBelongUnionPattern(unionPattern);
  }
  tmpUnionPatternSet.addNewUnionPattrn(unionPattern);
  return true;
 }
 private boolean isValidChar(char ch) {
  if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z')
    || (ch >= 'a' && ch <= 'z'))
   return true;
  if ((ch >= 0x4e00 && ch <= 0x7fff) || (ch >= 0x8000 && ch <= 0x952f))
   return true;//简体中文汉字编码
  return false;
 }
 public String parse(String content, Vector<Integer> levelSet) {
  if (initFlag == false)
   init();
  Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
  String preContent = preConvert(content);
  for (int i = 0; i < preContent.length();) {
   char checkChar = preContent.charAt(i);
   if (shiftTable[checkChar] == 0) {
    Vector<AtomicPattern> tmpAps = new Vector<AtomicPattern>();
    tmpAps = findMathAps(preContent.substring(0, i + 1),
      hashTable[checkChar]);
    aps.addAll(tmpAps);
    i++;
   } else
    i = i + shiftTable[checkChar];
  }
  parseAtomicPatternSet(aps, levelSet);
  return content;
 }
 private void parseAtomicPatternSet(Vector<AtomicPattern> aps,
   Vector<Integer> levelSet) {
  while (aps.size() > 0) {
   AtomicPattern ap = aps.get(0);
   UnionPattern up = ap.belongUnionPattern;
   if (up.isIncludeAllAp(aps) == true) {
    levelSet.add(new Integer(up.getLevel()));
   }
   aps.remove(0);
  }
 }
 private Vector<AtomicPattern> findMathAps(String src,
   Vector<AtomicPattern> destAps) {
  Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
  for (int i = 0; i < destAps.size(); i++) {
   AtomicPattern ap = destAps.get(i);
   if (ap.findMatchInString(src) == true)
   
    aps.add(ap);
  }
  return aps;
 }
 private String preConvert(String content) {
  String retStr = new String();
  for (int i = 0; i < content.length(); i++) {
   char ch = content.charAt(i);
   if (this.isValidChar(ch) == true) {
    retStr = retStr + ch;
   }
  }
  return retStr;
 }
 // shift table and hash table of initialize
 private void init() {
  initFlag = true;
  for (int i = 0; i < maxIndex; i++)
   hashTable[i] = new Vector<AtomicPattern>();
  shiftTableInit();
  hashTableInit();
 }
 public void clear() {
  tmpUnionPatternSet.clear();
  initFlag = false;
 }
 private void shiftTableInit() {
  for (int i = 0; i < maxIndex; i++)
   shiftTable[i] = 2;
  Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();
  for (int i = 0; i < upSet.size(); i++) {
   Vector<AtomicPattern> apSet = upSet.get(i).getSet();
   for (int j = 0; j < apSet.size(); j++) {
    AtomicPattern ap = apSet.get(j);
    Pattern pattern = ap.getPattern();
    if (shiftTable[pattern.charAtEnd(1)] != 0)
     shiftTable[pattern.charAtEnd(1)] = 1;
    if (shiftTable[pattern.charAtEnd(0)] != 0)
     shiftTable[pattern.charAtEnd(0)] = 0;
   }
  }
 }
 private void hashTableInit() {
  Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();
  for (int i = 0; i < upSet.size(); i++) {
   Vector<AtomicPattern> apSet = upSet.get(i).getSet();
   for (int j = 0; j < apSet.size(); j++) {
    AtomicPattern ap = apSet.get(j);
    Pattern pattern = ap.getPattern();
    if (pattern.charAtEnd(0) != 0) {
     hashTable[pattern.charAtEnd(0)].add(ap);
    }
   }
  }
 }
}
class Pattern { // string
 Pattern(String str) {
  this.str = str;
 }
 public char charAtEnd(int index) {
  if (str.length() > index) {
   return str.charAt(str.length() - index - 1);
  } else
   return 0;
 }
 public String str;
 public String getStr() {
  return str;
 };
}
class AtomicPattern {
 public boolean findMatchInString(String str) {
  if (this.pattern.str.length() > str.length())
   return false;
  int beginIndex = str.length() - this.pattern.str.length();
  String eqaulLengthStr = str.substring(beginIndex);
  if (this.pattern.str.equalsIgnoreCase(eqaulLengthStr))
   return true;
  return false;
 }
 AtomicPattern(Pattern pattern) {
  this.pattern = pattern;
 };
 private Pattern pattern;
 public UnionPattern belongUnionPattern;
 public UnionPattern getBelongUnionPattern() {
  return belongUnionPattern;
 }
 public void setBelongUnionPattern(UnionPattern belongUnionPattern) {
  this.belongUnionPattern = belongUnionPattern;
 }
 public Pattern getPattern() {
  return pattern;
 }
 public void setPattern(Pattern pattern) {
  this.pattern = pattern;
 }
}
class SameAtomicPatternSet {
 SameAtomicPatternSet() {
  SAPS = new Vector<AtomicPattern>();
 };
 public Vector<AtomicPattern> SAPS;
}
class UnionPattern { // union string
 UnionPattern() {
  this.apSet = new Vector<AtomicPattern>();
 }
 public Vector<AtomicPattern> apSet;
 public void addNewAtomicPattrn(AtomicPattern ap) {
  this.apSet.add(ap);
 }
 public Vector<AtomicPattern> getSet() {
  return apSet;
 }
 public boolean isIncludeAllAp(Vector<AtomicPattern> inAps) {
  if (apSet.size() > inAps.size())
   return false;
  for (int i = 0; i < apSet.size(); i++) {
   AtomicPattern ap = apSet.get(i);
   if (isInAps(ap, inAps) == false)
    return false;
  }
  return true;
 }
 private boolean isInAps(AtomicPattern ap, Vector<AtomicPattern> inAps) {
  for (int i = 0; i < inAps.size(); i++) {
   AtomicPattern destAp = inAps.get(i);
   if (ap.getPattern().str.equalsIgnoreCase(destAp.getPattern().str) == true)
    return true;
  }
  return false;
 }
 public void setLevel(int level) {
  this.level = level;
 }
 public int getLevel() {
  return this.level;
 }
 private int level;
}
class UnionPatternSet { // union string set
 UnionPatternSet() {
  this.unionPatternSet = new Vector<UnionPattern>();
 }
 public void addNewUnionPattrn(UnionPattern up) {
  this.unionPatternSet.add(up);
 }
 public Vector<UnionPattern> unionPatternSet;
 public Vector<UnionPattern> getSet() {
  return unionPatternSet;
 }
 public void clear() {
  unionPatternSet.clear();
 }
}
 
2.TxtReader.java
package com.mutiplepatternmatch;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
/**
 *
 * @author Daniel
 *
 */
public class TxtReader implements Serializable{
 
 public TxtReader() {
  super();
 }
 public static BufferedReader keywordReader(String fileName){
  File file=new File(fileName);
  BufferedReader br=null;
  try {
   FileInputStream in=new FileInputStream(file);
   InputStreamReader inReader = new InputStreamReader(in, "UTF-8");
   
   br=new BufferedReader(inReader);
   
  } catch (FileNotFoundException e) {
   System.out.println("你想加载的文件没有找到!!!");
   e.printStackTrace();
  } catch (UnsupportedEncodingException e) {
   System.out.println("你指定的编码类型不支持哦!!!");
   e.printStackTrace();
  }
   return br;
  
  
 }
}
 
3.FilterTest.java
package com.mutiplepatternmatch;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Vector;
public class FilterTest {
 public static void main(String args[]) {
  MutiPatternParser filterEngine = new MutiPatternParser();
  BufferedReader brKeyword = TxtReader
    .keywordReader("file/illegalkeyword.txt");
  BufferedReader brArticle = TxtReader.keywordReader("file/article.txt");
  String keyword = null;
  String article = null;
  long startTime=System.currentTimeMillis();
  StringBuffer buffer=new StringBuffer();
  Vector<Integer> levelSet = new Vector<Integer>();
  try {
   while ((keyword = brKeyword.readLine()) != null) {
    filterEngine.addFilterKeyWord(keyword, 1);
   }
   while ((article = brArticle.readLine()) != null) {
    buffer.append(article);
   }
  } catch (IOException e) {
   // TODO Auto-generated catch block
   System.out.println("读取文件IO异常!!!");
   e.printStackTrace();
  }
   String content=filterEngine.parse(buffer.toString(), levelSet);
   System.out.println("article.txt test:levelSet.size="
   + levelSet.size()+"\n this article's content: "+content);
  
   long endTime=System.currentTimeMillis();
  
   System.out.println("Cost:"+(endTime-startTime)+"ms");
    
  /* 清除过滤算法引擎,带来的后果是引擎中将找不到任何的字符,测试代码,用来重载关键字时使用! */
  filterEngine.clear();
  levelSet.clear();
  filterEngine.parse("我我我 你你你 他他他", levelSet);
  System.out.println("test:levelSet.size=" + levelSet.size());// 测试正确结果是找不到匹配字符
  levelSet.clear();
  filterEngine.parse("a'b'c a b c", levelSet);
  System.out.println("test:tz levelSet.size=" + levelSet.size());// 测试正确结果是找不到匹配字符*/
 }
}
 ps:09年网上掀起了敏感字词过滤热,一时想到就动手干起来了,此算法借鉴经典的WM算法改写而成,不足之处还需优化和改进,测试时只对屏蔽的illegal keywords 进行了个数统计!
还有两个txt文件不能发出来(illegalkeyword.txt,article.txt)这两个文件包含收集的大量脏字和敏感不法字词,故不能发布!

你可能感兴趣的:(算法,J#,UP)