JobConf conf = new JobConf(SamplerTest.class);
conf.setJobName("Sampler");
FileInputFormat.setInputPaths(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));

// Client-side sampling: rate 0.1, at most 1000 samples drawn from at most 10
// splits (roughly one record every totalRecords*0.1; the extraction order of
// the samples is shuffled). The sampler runs entirely on the client.
//
// In theory the K,V of RandomSampler<K, V> should match the InputFormat's map
// input types, but there appears to be a quirk: sampler.getSample(inf, conf)
// returns the sampled list via toArray(), which yields an Object[] — so K must
// be Object or a ClassCastException is thrown. V is unused during sampling.
RandomSampler<Object, Text> sampler =
        new RandomSampler<Object, Text>(0.1, 1000, 10);

// conf.getInputFormat() returns the configured InputFormat; when none is set
// (as here) it defaults to TextInputFormat. The cast below must agree with the
// K,V that getSample(inf, conf) is expected to produce.
final InputFormat<Object, Text> inputFormat =
        (InputFormat<Object, Text>) conf.getInputFormat();

Object[] sampledKeys = sampler.getSample(inputFormat, conf);
for (Object sampledKey : sampledKeys) {
    // TextInputFormat keys are byte offsets, hence LongWritable.
    LongWritable offset = (LongWritable) sampledKey;
    System.out.println(offset);
}
return 0;
public class MyMap extends MapReduceBase implementsMapper < LongWritable , Text , LongWritable , IntWritable > {@Overridepublic void map ( LongWritable key , Text val ,OutputCollector < LongWritable , IntWritable > output , Reporter reporter )throws IOException {output . collect ( key , new IntWritable ( 1 ));}}
public class MyRed extends MapReduceBase implementsReducer < LongWritable , IntWritable , Text , NullWritable > {@Overridepublic void reduce ( LongWritable key , Iterator < IntWritable > values ,OutputCollector < Text , NullWritable > output , Reporter reporter )throws IOException {while ( values . hasNext ()){values . next ();output . collect ( new Text ( key . toString ()), NullWritable . get ());}}}
public class MyPartitions implements Partitioner < LongWritable , IntWritable > {@Overridepublic void configure ( JobConf job ) {}@Overridepublic int getPartition ( LongWritable key , IntWritable value ,int numPartitions ) {//PartitionsUtils.findPartitions的作用就是依据分布式缓存数据,按key进行分区,具体代码就不贴了,具体操作如下//获取分布式缓存文件中的数据,取得对应的数字,这些数字按从小到达顺序排列,返回其插值序数。比如:3,3,9。那么当key<3时返回0,key=3时返回1,3<key<=9时返回2,key>9时返回3。return PartitionsUtils.findPartitions(key,value,numPartitions);}}
public class SamplerTest3 extends Configured implements Tool {public int run ( String [] args ) throws Exception {int reduceTasks = 4 ;JobConf conf = new JobConf ( SamplerTest3 . class );conf . setJobName ( "Sampler" );FileInputFormat . setInputPaths ( conf , new Path ( args [ 0 ]));FileOutputFormat . setOutputPath ( conf , new Path ( args [ 1 ]));//MyTextInputFormat是自定义的一个InputFormat,该InputFormat与TextInputFormat的区别是key就是Long.parseLong(value.toString());conf.setInputFormat(MyTextInputFormat.class);conf.setOutputFormat(TextOutputFormat.class);conf . setOutputKeyClass ( Text . class );//因为是返回原数据中已排好序的数字,而这些数据是来自key的,无须value,所以value类型为NullWritable即可conf.setOutputValueClass(NullWritable.class);conf.setMapOutputKeyClass(LongWritable.class);conf.setMapOutputValueClass(IntWritable.class);conf . setReducerClass ( MyRed . class );conf . setMapperClass ( MyMap . class );conf . setNumReduceTasks ( reduceTasks );conf . setPartitionerClass ( MyPartitions . class );//进行采样RandomSampler<Object, Text> sampler = new RandomSampler<Object, Text>(0.1, 1000, 10);final InputFormat<Object,Text> inf = (InputFormat<Object,Text>) conf.getInputFormat();Object[] keys = sampler.getSample(inf, conf);Path catchPath = new Path("/cache");URI partitionUri = new URI(catchPath.toString()+ "#myCache");FileSystem fs = FileSystem . get ( conf );FSDataOutputStream out = fs . create ( catchPath );//MakeCacheFile方法为写入分布式缓存文件内容。//内容如下:keys为取到的一系列的数字样本。这里将这些样本数据按从小到大进行排序,然后平分成n段,共reduceTasks段//取右端的数,存入分布式缓存中。最后一段不用存//比如采样的数字样本是1112|2222|2367|7788 分成了4段,取右端的数就是2,2,7,8。再去除最后一个数最终得2,2,7,存入分布式缓存中MakeCacheFile(out,keys,reduceTasks);IOUtils.closeStream(out);//添加到分布式缓存DistributedCache.addCacheFile(partitionUri, conf);DistributedCache.createSymlink(conf);JobClient.runJob(conf);return 0;}public static void main ( String [] args ) throws Exception {System . exit ( ToolRunner . run ( new SamplerTest3 (), args ));}}
public class SamplerTest4 extends Configured implements Tool {public int run ( String [] args ) throws Exception {JobConf conf = new JobConf ( SamplerTest4 . class );conf . setJobName ( "Sampler" );FileInputFormat . setInputPaths ( conf , new Path ( args [ 0 ]));FileOutputFormat . setOutputPath ( conf , new Path ( args [ 1 ]));//MyTextInputFormat和例2一样conf.setInputFormat(MyTextInputFormat.class);conf.setOutputFormat(TextOutputFormat.class);conf . setOutputKeyClass ( Text . class );conf . setOutputValueClass ( NullWritable . class );conf . setMapOutputKeyClass ( LongWritable . class );conf . setMapOutputValueClass ( IntWritable . class );//MyRed,MyMap和例2一样conf.setReducerClass(MyRed.class);conf.setMapperClass(MyMap.class);conf . setNumReduceTasks ( 4 );conf . setPartitionerClass ( TotalOrderPartitioner . class );RandomSampler < Object , Text > sampler = new RandomSampler < Object , Text >( 0.1 , 1000 , 10 );//告诉hadoop分布式缓存文件放在哪里好Path catchPath = new Path("/partitionFile");TotalOrderPartitioner.setPartitionFile(conf, catchPath);//自动生成缓存文件InputSampler.writePartitionFile(conf, sampler);URI partitionUri = new URI ( catchPath . toString ()+ "#_partitions" );//添加到分布式缓存DistributedCache.addCacheFile(partitionUri, conf);DistributedCache.createSymlink(conf);JobClient.runJob(conf);return 0;}public static void main(String[] args) throws Exception {System.exit(ToolRunner.run(new SamplerTest4(), args));}}