算法任务:
1. 给定一个文件,统计这个文件中所有字符的相对频率(相对频率就是该字符出现的概率,即该字符出现次数除以字符总个数),并计算该文件的熵。
2. 给定另外一个文件,按上述同样的方法计算字符分布的概率,然后计算两个文件中的字符分布的KL距离。
(熵和KL距离都是NLP自然语言处理中术语,仅仅是涉及到一两个公式而已,不影响您对代码的理解,so just try!)
说明:
1. 给定的文件可以是两个中文文件或两个英文文件,也可以是两个中英文混合文件。对于中文,计算字符,对于英文,计算词。
2. 有效字符不包括空格、换行符和标点符号。
3.将中文字符、英文单词、其他非有效字符及其出现次数,分别写入三个文件中。
4.代码用java完成。
文章的重点:
1.如何判断一个字符是汉字,而不是ASCII、标点、日文、阿拉伯文……
2. 了解汉字是如何编码的。“UTF-8”很可能要花你一整个下午才能真正弄明白。
推荐博文:http://www.cnblogs.com/chenwenbiao/archive/2011/08/11/2134503.html
3.正则表达式。对于计算机科班出身的人应该不陌生,在此我就不造次了。
代码如下:
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Statistics over one UTF-8 (BOM-prefixed) text file: counts each Chinese
 * character and each English word, derives the per-symbol relative frequency,
 * and computes the Shannon entropy of that distribution (in nats — natural
 * log, matching the original implementation). Punctuation (anything that is
 * neither a CJK ideograph nor an A-Z/a-z word) is filtered into its own table.
 * The punctuation table and the word table are written to sibling files
 * derived from the input path.
 */
public class NLPFileUnit {
    /** Occurrence count of each Chinese character or English word. */
    public HashMap<String, Integer> WordOccurrenceNumber;
    /** Relative frequency (count / total symbols) of each symbol. */
    public HashMap<String, Float> WordProbability;
    /** Symbols screened out as punctuation (neither CJK nor English). */
    public HashMap<String, Integer> Punctuations;
    /** Entropy of the symbol distribution, in nats. */
    public float entropy;
    private String filePath;

    // Patterns are compiled once; String.matches() in a loop would
    // recompile the regex on every call.
    private static final Pattern EN_WORD_PATTERN = Pattern.compile("([A-Za-z]+)");
    private static final Pattern CN_PATTERN =
            Pattern.compile("\\p{InCJK Unified Ideographs}");
    private static final Pattern EN_PATTERN = Pattern.compile("[A-Za-z]+");

    /**
     * Builds all statistics for {@code filePath} and writes the punctuation
     * and word tables next to the input file.
     *
     * @param filePath path to a UTF-8 file that starts with a BOM
     * @throws Exception if the file is missing, not BOM-prefixed UTF-8,
     *         or the side-car files cannot be written
     */
    public NLPFileUnit(String filePath) throws Exception {
        this.filePath = filePath;
        WordOccurrenceNumber = createHash(createReader(filePath));
        Punctuations = filterPunctuation(WordOccurrenceNumber);
        WordProbability = calProbability(WordOccurrenceNumber);
        this.entropy = calEntropy(this.WordProbability);

        System.out.println("all punctuations were saved at "
                + filePath.replace(".", "_punctuation.") + "!");
        this.saveFile(Punctuations, filePath.replace(".", "_punctuation."));
        System.out.println("all words(En & Ch) were saved at "
                + filePath.replace(".", "_AllWords.") + "!");
        this.saveFile(this.WordOccurrenceNumber, filePath.replace(".", "_AllWords."));
    }

    /**
     * Extracts every English word ([A-Za-z]+ run) from the file at
     * {@code path} and adds its occurrence count into {@code hash}.
     *
     * @param hash accumulator map (counts are merged, not replaced)
     * @param path file to scan; read as UTF-8 — the class only accepts
     *             UTF-8 input, so the platform-default FileReader of the
     *             original code was a latent mojibake bug
     * @throws Exception on I/O failure
     */
    public void getEnWords(HashMap<String, Integer> hash, String path) throws Exception {
        StringBuilder content = new StringBuilder();
        // try-with-resources: the original leaked the reader on exception;
        // StringBuilder avoids O(n^2) String concatenation in the loop
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                content.append(line);
            }
        }
        Matcher matcher = EN_WORD_PATTERN.matcher(content);
        while (matcher.find()) {
            hash.merge(matcher.group(), 1, Integer::sum);
        }
    }

    /**
     * A symbol counts as punctuation when it is neither a CJK unified
     * ideograph nor a run of ASCII letters.
     */
    private boolean isPunctuation(String tmp) {
        return !(CN_PATTERN.matcher(tmp).matches()
                || EN_PATTERN.matcher(tmp).matches());
    }

    /**
     * Checks for the UTF-8 byte-order mark (0xEF 0xBB 0xBF). Always consumes
     * exactly three bytes — the original short-circuited on the first
     * mismatch, leaving the stream position inconsistent. Note: a BOM is
     * optional in UTF-8; BOM-less UTF-8 files are rejected by this class.
     */
    private boolean isUTF8(FileInputStream fs) throws Exception {
        int b0 = fs.read();
        int b1 = fs.read();
        int b2 = fs.read();
        return b0 == 0xEF && b1 == 0xBB && b2 == 0xBF;
    }

    /**
     * Sequence length in bytes of a UTF-8 character, derived from its lead
     * byte (RFC 3629): 0xF0-0xF7 -> 4, 0xE0-0xEF -> 3 (most CJK),
     * 0xC0-0xDF -> 2, otherwise 1 (ASCII). The original used strict
     * {@code >} comparisons, misclassifying the exact boundary bytes
     * 0xF0, 0xE0 and 0xC0.
     */
    private int getlength(byte b) {
        int v = b & 0xff; // widen to unsigned 0..255
        if (v >= 0xF0) {
            return 4;
        } else if (v >= 0xE0) {
            return 3;
        } else if (v >= 0xC0) {
            return 2;
        }
        return 1;
    }

    /**
     * Reads one UTF-8-encoded character from the stream, using the lead
     * byte to decide how many continuation bytes follow.
     *
     * @return the decoded character, or {@code null} at end of stream
     */
    private String readUnit(FileInputStream fs) throws Exception {
        // EOF must be tested on the int before narrowing: the original cast
        // first, so a (never-valid but possible) 0xFF byte looked like EOF.
        int first = fs.read();
        if (first == -1)
            return null;
        byte b = (byte) first;
        int len = getlength(b);
        byte[] units = new byte[len];
        units[0] = b;
        for (int i = 1; i < len; i++) {
            units[i] = (byte) fs.read();
        }
        return new String(units, StandardCharsets.UTF_8);
    }

    /**
     * Reads every character (Chinese, punctuation, single ASCII letters)
     * into a count map, then merges in whole English words via
     * {@link #getEnWords}.
     *
     * @param inputStream positioned just past the BOM; closed here
     * @throws IOException if {@code inputStream} is null, i.e. the file was
     *         not accepted as UTF-8 (the original NPE'd on readUnit(null))
     */
    private HashMap<String, Integer> createHash(FileInputStream inputStream)
            throws Exception {
        if (inputStream == null) {
            throw new IOException(
                    "only UTF-8 files with a BOM are supported: " + filePath);
        }
        HashMap<String, Integer> hash = new HashMap<String, Integer>();
        try {
            String key;
            while ((key = readUnit(inputStream)) != null) {
                hash.merge(key, 1, Integer::sum);
            }
        } finally {
            inputStream.close();
        }
        getEnWords(hash, this.filePath);
        return hash;
    }

    /**
     * Opens the file and verifies the UTF-8 BOM.
     *
     * @return the stream positioned after the BOM, or {@code null} if the
     *         file is not BOM-prefixed UTF-8 (the stream is closed first —
     *         the original leaked it)
     */
    private FileInputStream createReader(String path) throws Exception {
        FileInputStream in = new FileInputStream(path);
        if (!isUTF8(in)) {
            in.close();
            return null;
        }
        return in;
    }

    /**
     * Moves every punctuation entry out of {@code hash} into the returned
     * map; {@code hash} is mutated in place via Iterator.remove.
     */
    private HashMap<String, Integer> filterPunctuation(
            HashMap<String, Integer> hash) {
        HashMap<String, Integer> puncs = new HashMap<String, Integer>();
        Iterator<Entry<String, Integer>> iterator = hash.entrySet().iterator();
        while (iterator.hasNext()) {
            Entry<String, Integer> entry = iterator.next();
            if (isPunctuation(entry.getKey())) {
                puncs.put(entry.getKey(), entry.getValue());
                iterator.remove();
            }
        }
        return puncs;
    }

    /**
     * Relative frequency of each symbol: count / total. An empty map yields
     * an empty result (no division occurs).
     */
    private HashMap<String, Float> calProbability(HashMap<String, Integer> hash) {
        float count = countWords(hash);
        HashMap<String, Float> prob = new HashMap<String, Float>();
        for (Entry<String, Integer> entry : hash.entrySet()) {
            prob.put(entry.getKey(), entry.getValue() / count);
        }
        return prob;
    }

    /**
     * Dumps {@code hash} (via its toString form, as the original did) into
     * the file at {@code path}.
     */
    private void saveFile(HashMap<String, Integer> hash, String path)
            throws Exception {
        try (FileWriter fw = new FileWriter(path)) {
            fw.write(hash.toString());
        }
    }

    /** Total number of symbol occurrences in {@code hash}. */
    private int countWords(HashMap<String, Integer> hash) {
        int count = 0;
        for (Entry<String, Integer> entry : hash.entrySet()) {
            count += entry.getValue();
        }
        return count;
    }

    /**
     * Shannon entropy H = -sum(p * ln p) of the distribution. Uses the
     * natural log (nats), as the original did; divide by ln 2 for bits.
     */
    private float calEntropy(HashMap<String, Float> hash) {
        float entropy = 0;
        for (Float prob : hash.values()) {
            entropy -= prob * Math.log(prob);
        }
        return entropy;
    }
}

/**
 * Command-line driver: computes the entropy of two files and the KL
 * divergence between their symbol distributions.
 *
 * Package-private: two public top-level classes in a single source file do
 * not compile; within the package nothing else changes and main still runs.
 */
class NLPWork {

    /**
     * KL divergence D(u1 || u2) over the symbols present in both files.
     * Symbols missing from u2 are skipped (the original behavior) rather
     * than contributing an infinite term.
     */
    public static float calKL(NLPFileUnit u1, NLPFileUnit u2) {
        HashMap<String, Float> hash1 = u1.WordProbability;
        HashMap<String, Float> hash2 = u2.WordProbability;
        float KLdistance = 0;
        for (Entry<String, Float> entry : hash1.entrySet()) {
            Float value2 = hash2.get(entry.getKey());
            if (value2 != null) {
                Float value1 = entry.getValue();
                KLdistance += value1 * Math.log(value1 / value2);
            }
        }
        return KLdistance;
    }

    /**
     * Prompts for two file paths on stdin, then prints the KL divergence
     * and both entropies. Side-car word/punctuation files are written next
     * to each input file.
     */
    public static void main(String[] args) throws IOException, Exception {
        System.out.println("Now only UTF8 encoded file is supported!!!");
        System.out.println("PLS input file 1 path:");
        BufferedReader cin = new BufferedReader(
                new InputStreamReader(System.in));
        String file1 = cin.readLine();
        System.out.println("PLS input file 2 path:");
        String file2 = cin.readLine();
        NLPFileUnit u1 = null;
        NLPFileUnit u2 = null;
        try {
            u1 = new NLPFileUnit(file1);
            u2 = new NLPFileUnit(file2);
        } catch (FileNotFoundException e) {
            System.out.println("File Not Found!!");
            e.printStackTrace();
            return;
        }
        float KLdistance = calKL(u1, u2);
        System.out.println("KLdistance is :" + KLdistance);
        System.out.println("File 1 Entropy: " + u1.entropy);
        System.out.println("File 2 Entropy: " + u2.entropy);
    }
}
计算结果: