import java.lang.reflect.Array; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.Set; /** * 文本坐标记用的 * tagBegin是开始标记 * tagEnd 是结束标记 * 用了二分法查找来确定单词 * content 是传入的正文 * 正文可以传多次 * 标记词语也可以传多次 * @author Ansj * */ public class TagWord { private String tagBegin; private String tagEnd; Branch frontbegin = null; Set<String> keyWords = new HashSet<String>(); public TagWord(String begin, String end) { this.tagBegin = begin; this.tagEnd = end; } public TagWord addKeyWords(String[] keyWord) { if (keyWord.length > 0) { for (int i = 0; i < keyWord.length; i++) { this.keyWords.add(keyWord[i].trim()); } } return this; } // 是否发现词 boolean findWord = false; public String getTagContent(String content) { if (content == null || content.trim().length() == 0 || keyWords.size() == 0) { return content; } this.frontbegin = new MakeLibrary().getStringTree(this.keyWords); if(frontbegin==null){ return content ; } char[] chars = content.toCharArray(); // 正文 StringBuilder sb = new StringBuilder(); WoodInterface head = this.frontbegin; int start = 0; int end = 1; int index = 0; boolean isBack = false ; int length = chars.length ; // 此处是正向最大匹配 for (int i = 0; i < length; i++) { index++ ; head = head.get(chars[i]) ; if(head==null){ if(isBack){ sb.append(tagBegin).append(chars,start,end).append(tagEnd) ; start = start+end ; i = start-1 ; isBack = false ; }else{ sb.append(chars,start,end) ; i = start ; start++ ; } head = this.frontbegin ; index = 0 ; end = 1 ; continue ; } switch (head.getStatus()) { case 1: break ; case 2: end = index ; isBack = true ; break ; case 3: sb.append(tagBegin).append(chars,start,index).append(tagEnd) ; start = start+index ; index= 0 ; end = 1 ; isBack = false ; head = this.frontbegin; break ; } } return sb.toString(); } public static void main(String[] args) { String[] keyWords = {"中华人民共和国","孙健","伟大","人民", "中华","万岁" }; long start = System.currentTimeMillis(); for (int i = 0; i < 1; i++) { String str = new TagWord("<begin>", "<end>").addKeyWords(keyWords) .getTagContent( "中华人民共和国是一个伟大的民族我们有振兴民族的需要孙健万岁 . 中 国 万万岁哈哈 。"); System.out.println(str); } System.out.println(System.currentTimeMillis() - start); } } class MakeLibrary { public MakeLibrary() { } // 是否有下一个 private static boolean hasNext = true; // 是否是一个词 private static boolean isWords = true; Iterator<String> it = null; public Branch getStringTree(Set<String> keyWords) { it = keyWords.iterator(); Branch head = new Branch('h', 0, 0); Branch branch = head ; while (it.hasNext()) { char[] chars = it.next().toCharArray(); for (int i = 0; i < chars.length; i++) { if (chars.length == (i + 1)) { isWords = true; hasNext = false; } else { isWords = false; hasNext = true; } int status = 1; if (isWords && hasNext) { status = 2; } if (!isWords && hasNext) { status = 1; } if (isWords && !hasNext) { status = 3; } branch.add(new Branch(chars[i], status, 0)); branch = (Branch) branch.get(chars[i]); } branch = head ; } return head; } } interface WoodInterface { public WoodInterface add(WoodInterface branch) ; public WoodInterface get(char c) ; public boolean contains(char c) ; public int compareTo(char c) ; public boolean equals(char c) ; public byte getStatus() ; public char getC() ; public void setStatus(int status) ; public byte getNature() ; public void setNature(byte nature) ; } class Branch implements WoodInterface { /** * status 此字的状态1,继续 2,是个词语但是还可以继续 ,3确定 * nature 词语性质 * 0.未知 . 1是姓 . 2 是职位名称 3 是数量级的词 . 4 是数字词语 5 是标点 */ WoodInterface[] branches = new WoodInterface[0]; private char c; // 状态 private byte status = 1; // 索引 private short index = -1; // 词性 private byte nature = 0; // 单独查找出来的对象 WoodInterface branch = null; public WoodInterface add(WoodInterface branch) { if ((this.branch=this.get(branch.getC()))!=null) { switch (branch.getStatus()) { case 1: if(this.branch.getStatus()==2){ this.branch.setStatus(2) ; } if(this.branch.getStatus()==3){ this.branch.setStatus(2) ; } break; case 2: this.branch.setStatus(2) ; case 3: if(this.branch.getStatus()==2){ this.branch.setStatus(2) ; } if(this.branch.getStatus()==1){ this.branch.setStatus(2) ; } } this.branch.setNature(branch.getNature()) ; return this.branch; } index++; if ((index + 1) > branches.length) { branches = Arrays.copyOf(branches, index + 1); } branches[index] = branch; AnsjArrays.sort(branches); return branch; } public Branch(char c, int status, int nature) { this.c = c; this.status = (byte) status; this.nature = (byte) nature; } int i = 0; public WoodInterface get(char c) { int i = AnsjArrays.binarySearch(branches, c); if (i > -1) { return branches[i]; } return null; } public boolean contains(char c) { if (AnsjArrays.binarySearch(branches, c) > -1) { return true; } else { return false; } } public int compareTo(char c) { if (this.c > c) { return 1; }else if (this.c < c) { return -1; }else return 0 ; } public boolean equals(char c) { if (this.c == c) { return true; } else { return false; } } @Override public int hashCode() { // TODO Auto-generated method stub return c; } public byte getStatus() { return status; } public void setStatus(int status) { this.status = (byte) status; } public char getC() { return this.c; } public byte getNature() { return nature; } public void setNature(byte nature) { this.nature = nature; } } class AnsjArrays { private static final int INSERTIONSORT_THRESHOLD = 7; /** * 二分法查找.摘抄了jdk的东西..只不过把他的自动装箱功能给去掉了 * * @param branches * @param c * @return */ public static int binarySearch(WoodInterface[] branches, char c) { int high = branches.length - 1; if (branches.length < 1) { return high; } int low = 0; while (low <= high) { int mid = (low + high) >>> 1; int cmp = branches[mid].compareTo(c); if (cmp < 0) low = mid + 1; else if (cmp > 0) high = mid - 1; else return mid; // key found } return -1; // key not found. } public static void sort(WoodInterface[] a) { WoodInterface[] aux = (WoodInterface[])a.clone(); mergeSort(aux, a, 0, a.length, 0); } public static void sort(WoodInterface[] a, int fromIndex, int toIndex) { rangeCheck(a.length, fromIndex, toIndex); WoodInterface[] aux = copyOfRange(a, fromIndex, toIndex); mergeSort(aux, a, fromIndex, toIndex, -fromIndex); } private static void rangeCheck(int arrayLen, int fromIndex, int toIndex) { if (fromIndex > toIndex) throw new IllegalArgumentException("fromIndex(" + fromIndex + ") > toIndex(" + toIndex + ")"); if (fromIndex < 0) throw new ArrayIndexOutOfBoundsException(fromIndex); if (toIndex > arrayLen) throw new ArrayIndexOutOfBoundsException(toIndex); } private static void mergeSort(WoodInterface[] src, WoodInterface[] dest, int low, int high, int off) { int length = high - low; // Insertion sort on smallest arrays if (length < INSERTIONSORT_THRESHOLD) { for (int i = low; i < high; i++) for (int j = i; j > low && (dest[j - 1]).compareTo(dest[j].getC()) > 0; j--) swap(dest, j, j - 1); return; } // Recursively sort halves of dest into src int destLow = low; int destHigh = high; low += off; high += off; int mid = (low + high) >>> 1; mergeSort(dest, src, low, mid, -off); mergeSort(dest, src, mid, high, -off); // If list is already sorted, just copy from src to dest. This is an // optimization that results in faster sorts for nearly ordered lists. if (src[mid - 1].compareTo(src[mid].getC()) <= 0) { System.arraycopy(src, low, dest, destLow, length); return; } // Merge sorted halves (now in src) into dest for (int i = destLow, p = low, q = mid; i < destHigh; i++) { if (q >= high || p < mid && src[p].compareTo(src[q].getC()) <= 0) dest[i] = src[p++]; else dest[i] = src[q++]; } } /** * Swaps x[a] with x[b]. */ private static void swap(WoodInterface[] x, int a, int b) { WoodInterface t = x[a]; x[a] = x[b]; x[b] = t; } public static <T> T[] copyOfRange(T[] original, int from, int to) { return copyOfRange(original, from, to, (Class<T[]>) original.getClass()); } public static <T, U> T[] copyOfRange(U[] original, int from, int to, Class<? extends T[]> newType) { int newLength = to - from; if (newLength < 0) throw new IllegalArgumentException(from + " > " + to); T[] copy = ((Object) newType == (Object) Object[].class) ? (T[]) new Object[newLength] : (T[]) Array .newInstance(newType.getComponentType(), newLength); System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength)); return copy; } }