海量数据处理

1.一个文件中,存储有10亿个单词(数字+字母组成,每个单词小于16Byte),每行一个,求出现频率最高的10个单词。

算法一:分而治之 + Hash

  • 10亿个单词,不能完全加载到内存中处理
  • 采用“分而治之”的思想,按照单词的hash值,将单词存储在多个文件中
  • 对于每一个小文件,可以构建一个单词为key,出现次数为value的Hash map,同时记录当前出现次数最高的10个单词
  • 可以得到多个小文件中的出现次数最多的10个单词,再依据常规的排序算法得到总体上出现次数最多的10个单词
//算法一:“分而治之 + Hash”的实现
package com.yunfeng;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
  
public class BigDataSort {
    /** Number of partition files the big input is hashed into. */
    public final static int FILE_NUM = 10;

    /**
     * Returns the (up to) 10 most frequent words in a single partition file.
     * <p>
     * Builds a word-to-count map, sorts the entries by descending frequency
     * (via {@link WordEntity}'s natural ordering), and keeps the first 10.
     *
     * @param file one partition file, one word per line
     * @return the top-10 entries; empty on I/O failure (errors are printed)
     */
    public static Set<WordEntity> getTop10Word(File file) {
        Map<String, Integer> counts = new HashMap<>();
        Set<WordEntity> top10 = new TreeSet<>();

        // try-with-resources closes the reader even on exception
        // (the original leaked it).
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            String word;
            while ((word = br.readLine()) != null) {
                Integer prev = counts.get(word);
                counts.put(word, prev == null ? 1 : prev + 1);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            return top10;
        } catch (IOException e) {
            e.printStackTrace();
            return top10;
        }

        // TreeSet orders WordEntity by descending count, then key.
        Set<WordEntity> sorted = new TreeSet<>();
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            sorted.add(new WordEntity(entry.getKey(), entry.getValue()));
        }

        int taken = 0;
        for (WordEntity w : sorted) {
            top10.add(w);
            if (++taken == 10) {
                break;
            }
        }
        return top10;
    }

    /**
     * Splits the big input file into {@link #FILE_NUM} partition files
     * ("file_0" .. "file_9") so each partition fits in memory. Lines are
     * routed by {@code hashCode() % FILE_NUM}, which guarantees all
     * occurrences of the same word land in the same partition.
     *
     * @param file path of the big input file, one word per line
     * @return the partition files (entries may be null if opening failed early)
     */
    public static File[] cutFileByHash(String file) {
        File[] files = new File[FILE_NUM];
        BufferedWriter[] writers = new BufferedWriter[FILE_NUM];

        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            for (int i = 0; i < FILE_NUM; i++) {
                files[i] = new File("file_" + i);
                writers[i] = new BufferedWriter(new FileWriter(files[i]));
            }

            String line;
            while ((line = br.readLine()) != null) {
                // mod first, then abs: the result is already in (-FILE_NUM, FILE_NUM)
                int bucket = Math.abs(line.hashCode() % FILE_NUM);
                writers[bucket].write(line);
                writers[bucket].newLine();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Null-check each writer: if opening failed partway through, some
            // slots are still null (the original NPE'd here). Close every
            // writer individually so one failure doesn't leak the rest.
            for (BufferedWriter w : writers) {
                if (w != null) {
                    try {
                        w.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        return files;
    }

    /**
     * Entry point: partitions "123.txt", takes the per-partition top-10 sets,
     * merges them in one ordered set, and prints the overall top 10.
     */
    public static void main(String[] args) {
        File[] files = cutFileByHash("123.txt");

        // Merging per-partition top-10s is sound because hashing puts all
        // occurrences of a word into a single partition.
        Set<WordEntity> merged = new TreeSet<>();
        for (File file : files) {
            merged.addAll(getTop10Word(file));
        }

        int printed = 0;
        for (WordEntity w : merged) {
            System.out.println("The top ten word is :" + w.getKey()
            + ";frequence is " + w.getCount());
            if (++printed == 10) {
                break;
            }
        }
    }
}

class WordEntity implements Comparable {//单词实体
    private String key;
    private Integer count;
    public WordEntity (String key, Integer count) {
        this.key = key;
        this.count = count;
    }
    
    public int compareTo(WordEntity o) {
        int cmp = count.intValue() - o.count.intValue();
            return (cmp == 0 ? key.compareTo(o.key) : -cmp);
    }
    
    public String getKey(){
        return key; 
    }
    
    public Integer getCount(){
        return count;   
    }
}

你可能感兴趣的:(海量数据处理)