Hadoop 读取文本内容

// Load a UTF-8 text file from the job's FileSystem into wordSet, one entry per line.
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.get(conf);

// The path is hard-coded here; an alternative is to read it from the job
// configuration, e.g. new Path(conf.get("emotionPath")).
FSDataInputStream fin = null;
BufferedReader in = null;
try {
	// Open inside the try block so the raw stream is still released if
	// wrapping it in a reader fails.
	fin = fs.open(new Path("/user/lvxinjian/negative.txt"));
	in = new BufferedReader(new InputStreamReader(fin, "UTF-8"));

	String line;
	while ((line = in.readLine()) != null) {
		wordSet.add(line);
	}
	System.out.println(wordSet.size());
} finally {
	// Closing the reader also closes the underlying stream; fall back to
	// closing the raw stream only when the reader was never created.
	if (in != null) {
		in.close();
	} else if (fin != null) {
		fin.close();
	}
}


/**
 * Resolves integer word indexes found in a tab-separated text file back to
 * their words, using a dictionary stored in several SequenceFile parts, and
 * saves the resulting "word&lt;TAB&gt;value" lines to a local file.
 */
public class GetSentenceWithPos {

	/** SequenceFile parts of the dictionary; each record is (Text word, IntWritable index). */
	private static final String[] DICTIONARY_PARTS = {
		"/user/lvxinjian/tfidf/mediafile/dictionary.file-0",
		"/user/lvxinjian/tfidf/mediafile/dictionary.file-1",
		"/user/lvxinjian/tfidf/mediafile/dictionary.file-2"
	};

	/** Text file whose lines are expected to look like "index&lt;TAB&gt;value". */
	private static final String WORD_INFO_PATH = "/user/lvxinjian/showTfidf49AllData/part-r-00000";

	/** Local file the resolved lines are written to. */
	private static final String OUTPUT_PATH = "./2013052802.txt";

	/**
	 * Loads the dictionary, reads the index/value lines, replaces each index
	 * by its word and saves the result. Lines that do not split into exactly
	 * two tab-separated fields, or whose index is not in the dictionary, are
	 * skipped.
	 *
	 * @throws IOException if any of the input files cannot be read
	 * @throws NumberFormatException if the first field of a two-field line is not an integer
	 */
	public void read () throws IOException
	{
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(conf);

		// index -> word dictionary, merged from all parts
		HashMap<Integer, String> wordList = new HashMap<Integer, String>();
		for (int i = 0; i < DICTIONARY_PARTS.length; i++) {
			System.out.println("load dictionary " + i + "...");
			loadDictionaryPart(fs, conf, new Path(DICTIONARY_PARTS[i]), wordList);
		}

		System.out.println("load wordindex_count...");
		ArrayList<String> wordInfo = readLines(fs, new Path(WORD_INFO_PATH)); // MapReduce output lines
		System.out.println("sizef:\t" + wordInfo.size());

		System.out.println("get word ...");
		ArrayList<String> lstResult = new ArrayList<String>();
		int count = 0;
		for (String str : wordInfo) {
			// progress indicator every 1000 input lines
			if (count % 1000 == 0) {
				System.out.println(count);
			}
			count++;

			String[] arr = str.split("\t");
			if (arr.length != 2) {
				continue;
			}
			// Parse and look up once; dictionary values are never null, so a
			// null result means the index is unknown.
			String word = wordList.get(Integer.parseInt(arr[0]));
			if (word != null) {
				lstResult.add(word + "\t" + arr[1]);
			}
		}

		System.out.println("saving....");
		FileTool.SaveListToFile(lstResult, OUTPUT_PATH, false, Charset.forName("utf-8"));
	}

	/**
	 * Reads one SequenceFile part and adds every (index, word) record to the
	 * given dictionary. The reader is always closed, even on failure.
	 */
	private static void loadDictionaryPart(FileSystem fs, Configuration conf, Path path,
			HashMap<Integer, String> dictionary) throws IOException
	{
		SequenceFile.Reader reader = null;
		try {
			reader = new SequenceFile.Reader(fs, path, conf);
			// key and value types must match the types stored in the file being read
			Text key = new Text();
			IntWritable val = new IntWritable();
			while (reader.next(key, val)) {
				dictionary.put(val.get(), key.toString());
			}
		} finally {
			IOUtils.closeStream(reader);
		}
	}

	/**
	 * Reads a UTF-8 text file from the given FileSystem and returns its lines
	 * in order. The underlying stream is always closed.
	 */
	private static ArrayList<String> readLines(FileSystem fs, Path path) throws IOException
	{
		ArrayList<String> lines = new ArrayList<String>();
		FSDataInputStream fin = fs.open(path);
		BufferedReader in = null;
		try {
			in = new BufferedReader(new InputStreamReader(fin, "UTF-8"));
			String line;
			while ((line = in.readLine()) != null) {
				lines.add(line);
			}
		} finally {
			// Closing the reader closes the wrapped stream; close the raw
			// stream directly only if the reader was never created.
			if (in != null) {
				IOUtils.closeStream(in);
			} else {
				IOUtils.closeStream(fin);
			}
		}
		return lines;
	}

	/**
	 * Command-line entry point: runs {@link #read()} and prints the stack
	 * trace if an I/O error occurs.
	 */
	static public void main(String [] args)
	{
		try {
			GetSentenceWithPos getSentenceWithPos = new GetSentenceWithPos();
			getSentenceWithPos.read();
		} catch (IOException e) {
			// Report the failure; there is nothing to recover at this level.
			e.printStackTrace();
		}
	}

}


你可能感兴趣的:(随笔小记)