java词频统计

package com.cmcm.goods_classification;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Word-frequency counter: loads a stop-word list, walks a data folder via
 * {@link FileProcess#readFolder(String)}, tallies every non-stop-word token
 * and writes the words ordered by descending frequency via
 * {@link FileProcess#writeFile(List)}.
 *
 * <p>All state is held in static fields, so the class is not thread-safe.
 */
public class WordProcess {
	
	private static final String DATASOURCEPATH = "D://mallet_data//0DataSource//Watches_Child//Sports_Watches";
	private static final String STOPWORDSPATH = "C://mallet-2.0.7//stoplists//en.txt";
	public static final String RESULTPATH = "D://automotives//result.txt";
	/** Word -> number of occurrences seen so far. */
	private static Map<String, Integer> dataHash = new HashMap<String, Integer>();
	/** Words that must never be counted. */
	private static Set<String> stopWordsSet = new HashSet<String>();
	
	/**
	 * Entry point: load stop words, count words in every file under the data
	 * folder, sort by frequency and write the result file.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) throws Exception {
		loadStopWords();
		FileProcess.readFolder(DATASOURCEPATH);
		List<Map.Entry<String, Integer>> dataList = hashSort();
		FileProcess.writeFile(dataList);
	}
	
	/**
	 * Reads one text file, normalises it and adds its words to the frequency table.
	 *
	 * @param textPath path of the file to read
	 */
	public static void pruneText(String textPath) {
		// Locale.ROOT keeps lower-casing independent of the platform locale
		// (e.g. the Turkish dotless i), so tokens match the stop-word list.
		String text = FileProcess.readFile(textPath).toLowerCase(java.util.Locale.ROOT);
		// Collapse every run of characters that are not letters, digits or
		// apostrophes (this includes all whitespace) into a single space.
		// The negation '^' must sit INSIDE the brackets; outside it is a
		// start-of-input anchor and nothing would be stripped.
		text = text.replaceAll("[^a-zA-Z0-9']+", " ");
		String[] words = text.trim().split("\\s+");
		getFrequency(words);
	}
	
	/**
	 * Adds the given tokens to the frequency table. Empty tokens and stop
	 * words are skipped; every other token has its count incremented
	 * (starting from 1 on first sight).
	 *
	 * @param words tokens to count; {@code null} is treated as no tokens
	 */
	public static void getFrequency(String[] words) {
		if (words == null) {
			return;
		}
		for (String key : words) {
			// split() yields an empty token for blank/empty input.
			if (key == null || key.length() == 0 || stopWordsSet.contains(key)) {
				continue;
			}
			Integer current = dataHash.get(key);
			dataHash.put(key, current == null ? 1 : current + 1);
		}
	}
	
	/**
	 * Returns the frequency table as a list sorted by descending count.
	 * Entries with equal counts compare as equal, so their relative order is
	 * whatever the map iteration produced (the sort is stable).
	 *
	 * @return a new list of the map entries, most frequent first
	 */
	public static List<Map.Entry<String, Integer>> hashSort() {
		List<Map.Entry<String, Integer>> sorted = new ArrayList<Map.Entry<String, Integer>>(dataHash.entrySet());
		Collections.sort(sorted, new Comparator<Map.Entry<String, Integer>>() {
			public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
				Integer left = o1.getValue();
				Integer right = o2.getValue();
				// Null counts sort last; the comparator must return 0 for
				// equal elements or the sort may reject it as inconsistent.
				if (left == null && right == null) {
					return 0;
				}
				if (left == null) {
					return 1;
				}
				if (right == null) {
					return -1;
				}
				return right.compareTo(left);
			}
		});
		return sorted;
	}
	
	/**
	 * Loads the whitespace-separated stop-word list from {@code STOPWORDSPATH}
	 * into the stop-word set and prints the number of tokens read.
	 */
	public static void loadStopWords() {
		String stopWordsText = FileProcess.readFile(STOPWORDSPATH);
		// "\\s+" already covers tabs, carriage returns and newlines.
		String[] words = stopWordsText.split("\\s+");
		System.out.println(words.length);
		for (String word : words) {
			if (word.length() > 0) {
				stopWordsSet.add(word);
			}
		}
	}
}

 

package com.cmcm.goods_classification;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

public class FileProcess {
	
	/**
	 * read all file in folder
	 * @param path
	 */
	public static void readFolder(String path) {
		int fileNum = 0, folderNum = 0;
		File file = new File(path);
		if (file.exists()) {
			LinkedList<File> list = new LinkedList<File>();
			File[] files = file.listFiles();
			for (File file2 : files) {
				if (file2.isDirectory()) {//System.out.println("DIR : " + file2.getAbsolutePath());
					list.add(file2);
					folderNum++;
				} else {
					System.out.println("FILE: " + file2.getAbsolutePath());
					WordProcess.pruneText(file2.getAbsolutePath());
					fileNum++;
				}
			}
			File temp_file;
			while (!list.isEmpty()) {
				temp_file = list.removeFirst();
				files = temp_file.listFiles();
				for (File file2 : files) {
					if (file2.isDirectory()) {//System.out.println("DIR : " + file2.getAbsolutePath());
						list.add(file2);
						folderNum++;
					} else {
						System.out.println("FILE: " + file2.getAbsolutePath());
						fileNum++;
						WordProcess.pruneText(file2.getAbsolutePath());
					}
				}
			}
		} else {
			System.out.println("File is not exist!");
		}
		System.out.println("    num dir is: " + folderNum + "\n    num file is: "+ fileNum);
	}
	
	/**
	 * read content from filePath and return content
	 * @param filePath
	 */
	public static String readFile(String filePath) {
		File file = new File(filePath);
		StringBuffer result = new StringBuffer();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader(file));
			String tempString = null;
			while ((tempString = reader.readLine()) != null) {
				result.append(" ");
				result.append(tempString);
			}
			reader.close();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e1) {
				}
			}			
		}		
		return result.toString();
	}

	/**
	 * write content into filePath
	 * @param dataList
	 */
	public static void writeFile(List<Map.Entry<String, Integer>> dataList) {
		System.out.println("start write word and frequency");
		int size = dataList.size();
		File file = null;
		FileWriter fileWrite = null;
		PrintWriter pw = null;
		int count = 1;
		try {
			// if file exist ,append ; if not, create 
			file = new File(WordProcess.RESULTPATH);
			fileWrite = new FileWriter(file, true);
			pw = new PrintWriter(fileWrite);
			for (int i = 0; i < size; i++) {
				String word = dataList.get(i).getKey();
				int frequency = dataList.get(i).getValue();
//				System.out.println(word + " : " + frequency);
				pw.print(word);
				pw.print(" ");
				pw.print(count++);
				pw.print(" ");
				pw.print(frequency);
				pw.println();
			}
			pw.flush();
			fileWrite.flush();			
		} catch(IOException e) {
			e.printStackTrace();
		}finally{
			try {
				pw.close();
				fileWrite.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
			System.out.println("end write word and frequency");
		}
	}
}

 

你可能感兴趣的:(词频统计,词频,递归文件夹,hash排序,读写文件)