工作一年多了,一直在断断续续地学习使用 Hadoop。它提供的 Map 和 Reduce 数据处理引擎能够帮助我们方便地处理大数据集,HDFS 分布式文件系统可以帮助我们冗余地存储大数据集。这么好的一门技术,应该是要好好学习的。
        最近一直在看 一书,里面介绍了许多问题的解决办法,总觉得读书应该记些笔记,这里就写点什么吧!

       首先,需要准备一个数据集,供以后的数据处理算法使用:

  1. 这个数据集包含10000行数据,其中的每一行都是 json 字符串
  2. 每个 json 字符串中包含一个用户的四个基本信息(id, userName, sex, age)

       接下来,生成这个数据集:

package hadoop_design.mock_user_info;

/**
 * Basic user information.
 *
 * <p>A mutable bean with four properties, each exposed through a
 * getter/setter pair: {@code id}, {@code userName}, {@code age} and
 * {@code sex}.
 *
 * Created by zhanghu on 16/8/27.
 */
public class UserBean {

    /** Identifier; intended to hold md5(userName + age + sex). */
    private String id;

    /** Display name of the user. */
    private String userName;

    /** Age of the user. */
    private int age;

    /** Gender flag: 1 means male, 0 means female. */
    private int sex;

    /**
     * @return the identifier, or {@code null} if none has been set
     */
    public String getId() {
        return this.id;
    }

    /**
     * @param userId the identifier to store
     */
    public void setId(String userId) {
        id = userId;
    }

    /**
     * @return the user name, or {@code null} if none has been set
     */
    public String getUserName() {
        return this.userName;
    }

    /**
     * @param userName the user name to store
     */
    public void setUserName(String userName) {
        this.userName = userName;
    }

    /**
     * @return the stored age
     */
    public int getAge() {
        return this.age;
    }

    /**
     * @param age the age to store
     */
    public void setAge(int age) {
        this.age = age;
    }

    /**
     * @return the gender flag (1 = male, 0 = female)
     */
    public int getSex() {
        return this.sex;
    }

    /**
     * @param sex the gender flag to store (1 = male, 0 = female)
     */
    public void setSex(int sex) {
        this.sex = sex;
    }
}
package hadoop_design.mock_user_info;

import net.sf.json.JSONObject;

/**
 * Json 工具类
 * Created by zhanghu on 16/8/27.
 */
public class JsonUtils {

    public static String objectToJsonString(Object object) {
        JSONObject json = JSONObject.fromObject(object);
        return json.toString();
    }
}
package hadoop_design.mock_user_info;

import java.util.Random;

/**
 * Static helper methods backed by one shared {@link Random} instance.
 * Reference: http://www.cnblogs.com/dongliyang/archive/2013/04/01/2994554.html
 * Created by zhanghu on 16/8/27.
 */
public final class StdRandom {

    // Shared generator; replaced whenever the seed is reset.
    private static Random random;
    // Seed the current generator was created from.
    private static long seed;

    // Initialise the seed from the wall clock and build the generator from it.
    static {
        seed = System.currentTimeMillis();
        random = new Random(seed);
    }

    // Private constructor: this class is not meant to be instantiated.
    private StdRandom() {}

    /**
     * Sets the seed and replaces the generator with a new one built from it.
     *
     * @param s seed for the random number generator
     */
    public static void setSeed(long s) {
        seed = s;
        random = new Random(seed);
    }

    /**
     * Returns the seed the current generator was created from.
     *
     * @return the seed value
     */
    public static long getSeed() {
        return seed;
    }

    /**
     * Returns a random real number in [0, 1).
     *
     * @return the random value
     */
    public static double uniform() {
        return random.nextDouble();
    }

    /**
     * Returns a random integer in [0, N).
     *
     * @param N exclusive upper bound
     * @return the random value
     * @throws IllegalArgumentException if {@code N} is not positive
     *         (raised by {@link Random#nextInt(int)})
     */
    public static int uniform(int N) {
        return random.nextInt(N);
    }

    /**
     * Returns a random real number in [0, 1); same as {@link #uniform()}.
     *
     * @return the random value
     */
    public static double random() {
        return uniform();
    }

    /**
     * Returns a random integer in [a, b).
     *
     * <p>Ranges whose width fits in an {@code int} draw exactly one value via
     * {@link #uniform(int)}. Ranges wider than {@code Integer.MAX_VALUE}
     * (where {@code b - a} overflows) are handled by redrawing full-range
     * ints until one lands inside the range.
     *
     * @param a inclusive lower bound
     * @param b exclusive upper bound
     * @return the random value
     * @throws IllegalArgumentException if {@code a >= b}
     */
    public static int uniform(int a, int b) {
        // Compare the bounds directly: judging by the sign of b - a is wrong
        // once the subtraction overflows.
        if (a >= b) {
            throw new IllegalArgumentException(
                    "invalid range: [" + a + ", " + b + ")");
        }

        int span = b - a;
        if (span > 0) {
            return a + uniform(span);
        }

        // b - a overflowed, so the range covers more than half of all ints;
        // each draw is accepted with probability above 1/2.
        int candidate;
        do {
            candidate = random.nextInt();
        } while (candidate < a || candidate >= b);
        return candidate;
    }

    /**
     * Returns a random real number computed as {@code a + uniform() * (b - a)}.
     *
     * @param a lower bound
     * @param b upper bound
     * @return the random value
     */
    public static double uniform(double a, double b) {
        return a + uniform() * (b - a);
    }
}
package hadoop_design.mock_user_info;

/**
 * String helper utilities.
 * Created by zhanghu on 16/8/27.
 */
public class StringUtils {

    // Candidate characters: the ten digits, then A-Z, then a-z (62 in total).
    private static final String ALPHABET =
            "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    /**
     * Builds a random string that contains at least one digit, one
     * upper-case letter and one lower-case letter.
     *
     * @param len length of the string; must not be less than 3
     * @return the random string
     * @throws IllegalArgumentException if {@code len} is less than 3
     */
    public static String randomStr(int len) {
        if (len < 3) {
            throw new IllegalArgumentException("字符串长度不能小于3");
        }

        char[] buf = new char[len];

        // The first three slots guarantee one digit, one upper-case and one
        // lower-case letter.
        buf[0] = (char) ('0' + StdRandom.uniform(0, 10));
        buf[1] = (char) ('A' + StdRandom.uniform(0, 26));
        buf[2] = (char) ('a' + StdRandom.uniform(0, 26));

        // Fill the remaining slots with random characters from the alphabet.
        int pos = 3;
        while (pos < len) {
            buf[pos++] = ALPHABET.charAt(StdRandom.uniform(0, ALPHABET.length()));
        }

        // Shuffle: swap each position with a random position at or after it,
        // so the guaranteed characters do not stay at the front.
        for (int left = 0; left < len; left++) {
            int pick = left + StdRandom.uniform(len - left);
            char held = buf[pick];
            buf[pick] = buf[left];
            buf[left] = held;
        }

        return String.valueOf(buf);
    }
}
package hadoop_design.mock_user_info;

import org.apache.commons.codec.digest.DigestUtils;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Main program: writes the mock user data set to a file.
 * Created by zhanghu on 16/8/27.
 */
public class Main {

    // Output file, created relative to the working directory.
    private static final String OUTPUT_FILE = "user.data";
    // Number of JSON lines to generate.
    private static final int RECORD_COUNT = 10000;
    // A progress message is printed every this many lines.
    private static final int PROGRESS_INTERVAL = 1000;

    /**
     * Builds one random user and returns it as a JSON string.
     *
     * <p>The user name is a random string of 10 to 20 characters, the age is
     * drawn from [0, 100) and the sex from {0, 1}. The id is the MD5 hex
     * digest of {@code userName + age + sex}.
     *
     * @return the JSON string for the generated user
     */
    private static String generateData() {
        UserBean userBean = new UserBean();
        userBean.setUserName(StringUtils.randomStr(StdRandom.uniform(10, 21)));
        userBean.setAge(StdRandom.uniform(0, 100));
        userBean.setSex(StdRandom.uniform(0, 2));

        // NOTE(review): the fields are concatenated without a delimiter, so
        // different (userName, age, sex) triples can produce the same id.
        String md5Bean = userBean.getUserName() + userBean.getAge() + userBean.getSex();
        userBean.setId(DigestUtils.md5Hex(md5Bean));
        return JsonUtils.objectToJsonString(userBean);
    }

    /**
     * Writes {@code RECORD_COUNT} generated users to {@code OUTPUT_FILE},
     * one JSON string per line.
     *
     * @param args unused
     * @throws IOException if the file cannot be opened or written
     */
    public static void main(String[] args) throws IOException {

        File file = new File(OUTPUT_FILE);

        // try-with-resources closes (and therefore flushes) the writer even
        // when a write fails part-way through.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(file))) {
            for (int i = 0; i < RECORD_COUNT; ++i) {
                if (i % PROGRESS_INTERVAL == 0) {
                    System.out.println("mock data line : " + i);
                }
                out.write(generateData());
                out.newLine();
            }
        }
    }
}

        OK, 利用上面的程序,我得到了一个包含 10000 行用户信息的 json 文本文件,内容类似于下面这样:

{"age":48,"id":"7a8bd2dc862f8ce972292474f2f3bc56","sex":1,"userName":"dHI3w56HNTiQh"}
{"age":18,"id":"fbcf2df050aa2da3c678dcb0a02bda2d","sex":1,"userName":"Xh7mU53Ba7JZ"}
{"age":70,"id":"4808f32ecbbe21b93882bb44973e7bea","sex":1,"userName":"aLV5E156YdJ"}
{"age":57,"id":"a9863ef325a6ca91f2554e8f4874d424","sex":1,"userName":"CwQ43w548IS"}
{"age":71,"id":"e4e7724632feefc514902d0849a86d6b","sex":1,"userName":"sz9hcdCZnkVXC3x"}
{"age":26,"id":"25ae8b3ab30f11a267939fec7177f829","sex":1,"userName":"cA17IpnzzPFMv4"}
{"age":58,"id":"52a7cb852583fe183d300de9c6de0efa","sex":1,"userName":"9b8v3HFIaNqsIyC2a97"}
{"age":55,"id":"71fd9c057f2c60b99021ce8a353c5cb0","sex":0,"userName":"9OqAJlyZVKgpV"}
{"age":25,"id":"6b758fb1a4e78930ea919074d5abb172","sex":0,"userName":"OvHcn61daoXTu"}
{"age":90,"id":"02883e46b66fd848075401ae205b0896","sex":0,"userName":"4kyHB1s5v6nQ049"}

        下面,开始使用这些数据吧!

你可能感兴趣的:(序)