mysql对比字符串相似度_对比俩个字符串的相似度

package com.opslab.util.algorithmImpl;

import com.opslab.util.CharsetUtil;

import com.opslab.util.SysUtil;

import java.io.ByteArrayInputStream;

import java.io.InputStream;

import java.io.UnsupportedEncodingException;

/**

* 对比俩个字符串的相似度

*/

public class StringImpl {

//第一种实现方式

private static String longestCommonSubstring(String strA, String strB) {

char[] chars_strA = strA.toCharArray();

char[] chars_strB = strB.toCharArray();

int m = chars_strA.length;

int n = chars_strB.length;

int[][] matrix = new int[m + 1][n + 1];

for (int i = 1; i <= m; i++) {

for (int j = 1; j <= n; j++) {

if (chars_strA[i - 1] == chars_strB[j - 1])

matrix[i][j] = matrix[i - 1][j - 1] + 1;

else

matrix[i][j] = Math.max(matrix[i][j - 1], matrix[i - 1][j]);

}

}

char[] result = new char[matrix[m][n]];

int currentIndex = result.length - 1;

while (matrix[m][n] != 0) {

if (matrix[n] == matrix[n - 1])

n--;

else if (matrix[m][n] == matrix[m - 1][n])

m--;

else {

result[currentIndex] = chars_strA[m - 1];

currentIndex--;

n--;

m--;

}

}

return new String(result);

}

private static boolean charReg(char charValue) {

return (charValue >= 0x4E00 && charValue <= 0X9FA5) || (charValue >= 'a' && charValue <= 'z') || (charValue >= 'A' && charValue <= 'Z') || (charValue >= '0' && charValue <= '9');

}

private static String removeSign(String str) {

StringBuffer sb = new StringBuffer();

for (char item : str.toCharArray()){

if (charReg(item)) {

sb.append(item);

}

}

return sb.toString();

}

/**

* 快速比较俩个字符串的相似度

*

* @param strA 较长的字符串

* @param strB 较短的字符串

* @return 俩个字符串的相似度

*

summary

:较长的字符串放到前面有助于提交效率

*/

public static double SimilarDegree(String strA, String strB) {

String newStrA = removeSign(strA);

String newStrB = removeSign(strB);

int temp = Math.max(newStrA.length(), newStrB.length());

int temp2 = longestCommonSubstring(newStrA, newStrB).length();

return temp2 * 1.0 / temp;

}

//第二种实现方式

private static int compare(String str, String target) {

int d[][]; // 矩阵

int n = str.length();

int m = target.length();

int i; // 遍历str的

int j; // 遍历target的

char ch1; // str的

char ch2; // target的

int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1

if (n == 0) {

return m;

}

if (m == 0) {

return n;

}

d = new int[n + 1][m + 1];

for (i = 0; i <= n; i++) { // 初始化第一列

d[i][0] = i;

}

for (j = 0; j <= m; j++) { // 初始化第一行

d[0][j] = j;

}

for (i = 1; i <= n; i++) { // 遍历str

ch1 = str.charAt(i - 1);

// 去匹配target

for (j = 1; j <= m; j++) {

ch2 = target.charAt(j - 1);

if (ch1 == ch2) {

temp = 0;

} else {

temp = 1;

}

// 左边+1,上边+1, 左上角+temp取最小

d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + temp);

}

}

return d[n][m];

}

private static int min(int one, int two, int three) {

return (one = one < two ? one : two) < three ? one : three;

}

/**

* 获取字符串的相似度

*

* @param str

* @param target

* @return

*/

public static double SimilarityRatio(String str, String target) {

return 1 - (double) compare(str, target) / Math.max(str.length(), target.length());

}

/**

* 获取字符串编码

*

* @param str 需要处理的字符串

*/

public static String simpleEncoding(String str) {

try{

byte[] bs = str.getBytes(SysUtil.JVM_ENCODING);

if(str.equals(new String(bs,CharsetUtil.UTF_8))){

return CharsetUtil.UTF_8;

}

if(str.equals(new String(bs,CharsetUtil.GBK))){

return CharsetUtil.GBK;

}

if(str.equals(new String(bs,"ISO-8859-1"))){

return "ISO-8859-1";

}

}catch(UnsupportedEncodingException e) {

System.out.println("111111111");

e.printStackTrace();

}

String encode = "GB2312";

try {

if (str.equals(new String(str.getBytes(encode), encode))) {

return encode;

}

} catch (UnsupportedEncodingException e) {

e.printStackTrace();

}

encode = "ISO-8859-1";

try {

if (str.equals(new String(str.getBytes(encode), encode))) {

return encode;

}

} catch (UnsupportedEncodingException exception1) {

exception1.printStackTrace();

}

encode = "UTF-8";

try {

if (str.equals(new String(str.getBytes(encode), encode))) {

return encode;

}

} catch (UnsupportedEncodingException exception1) {

exception1.printStackTrace();

}

encode = "GBK";

try {

if (str.equals(new String(str.getBytes(encode), encode))) {

return encode;

}

} catch (UnsupportedEncodingException exception1) {

exception1.printStackTrace();

}

return "";

}

}

你可能感兴趣的:(mysql对比字符串相似度)