# Required environment versions

```properties
jdk=1.8
scala.version=2.11.0
spark.version=2.3.2
hadoop.version=2.7.2
```
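The code itself does not pin these versions anywhere; a minimal `build.sbt` sketch matching them could look like the following (the exact artifact set is an assumption, `spark-core` and `hadoop-client` are the usual minimum):

```scala
// build.sbt -- a minimal sketch matching the versions above (assumed, not from the original project)
scalaVersion := "2.11.0"

libraryDependencies ++= Seq(
  "org.apache.spark"  %% "spark-core"    % "2.3.2",
  "org.apache.hadoop" %  "hadoop-client" % "2.7.2"
)
```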
```scala
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

object Main {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("test")
      .setMaster("local[2]")
      // IntWritable and Text are not Java-serializable, so register them with Kryo.
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[IntWritable], classOf[Text]))
    val sc = new SparkContext(sparkConf)
    process(sc)
    sc.stop()
  }

  def process(sc: SparkContext): Unit = {
    // Old mapred API: MultipleTextOutputFormat routes each record to a file named after its key.
    val output = "output"
    val buffer = new ArrayBuffer[(String, String)]
    for (_ <- 1 to 1000) {
      val key = (Math.random() * 10).toInt.toString
      buffer += ((key, s"value_$key"))
    }
    sc.makeRDD(buffer)
      .saveAsHadoopFile(output, classOf[String], classOf[String], classOf[RDDMultipleTextOutputFormat])

    // New mapreduce API: a custom FileOutputFormat maps keys 0-9 to file0..file9.
    val output1 = "output1"
    val buffer1 = new ArrayBuffer[(IntWritable, Text)]
    for (_ <- 1 to 1000) {
      val key = (Math.random() * 10).toInt
      buffer1 += ((new IntWritable(key), new Text(s"value_$key")))
    }
    sc.makeRDD(buffer1)
      .saveAsNewAPIHadoopFile(output1, classOf[IntWritable], classOf[Text], classOf[MultipleFileOutputFormat])
  }
}
```
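In local mode both jobs write to plain directories under the working directory. A small, self-contained check (purely illustrative; `CheckOutput` is not part of the original code) lists what each job produced:

```scala
// Hypothetical helper: list the per-key files each job wrote (local mode only).
object CheckOutput {
  def main(args: Array[String]): Unit = {
    for (dir <- Seq("output", "output1")) {
      println(s"--- $dir ---")
      Option(new java.io.File(dir).listFiles())
        .getOrElse(Array.empty)
        .map(_.getName)
        .sorted
        .foreach(println)
    }
  }
}
```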
```scala
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat

// Old-API output format: the string returned here becomes the file name under the
// output directory, so all records sharing a key end up in one file per key.
class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[String, String] {
  override def generateFileNameForKeyValue(key: String, value: String, name: String): String = key
}
```
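As written, each output line still carries the `key<TAB>` prefix that TextOutputFormat emits. A common variant, shown here as a sketch rather than part of the original code, also overrides `generateActualKey` so the key is used only for routing (a `null` key makes the underlying line writer emit the value alone):

```scala
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat

// Sketch: route by key, but keep the key out of the file contents.
class KeyRoutedValueOnlyOutputFormat extends MultipleTextOutputFormat[String, String] {
  override def generateFileNameForKeyValue(key: String, value: String, name: String): String = key

  // A null key tells TextOutputFormat's line writer to skip the "key<TAB>" prefix.
  override def generateActualKey(key: String, value: String): String = null
}
```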
```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
// New-API output format: a fixed map from key (0-9) to an output file name.
public class MultipleFileOutputFormat extends FileOutputFormat<IntWritable, Text> {

    private static final Map<Integer, String> FILENAMES = new HashMap<Integer, String>() {
        {
            put(0, "file0");
            put(1, "file1");
            put(2, "file2");
            put(3, "file3");
            put(4, "file4");
            put(5, "file5");
            put(6, "file6");
            put(7, "file7");
            put(8, "file8");
            put(9, "file9");
        }
    };
    protected static final String CODESET = "utf-8";
    protected static final byte[] newline;

    static {
        try {
            newline = "\n".getBytes(CODESET);
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("failed to get newline bytes", e);
        }
    }
    // Resolve a file name against the task's work path so the committer can
    // move it into the final output directory on commit.
    public Path getPathForWorkFile(TaskAttemptContext context, String prefix) throws IOException {
        FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context);
        return new Path(committer.getWorkPath(), prefix);
    }
    @Override
    public RecordWriter<IntWritable, Text> getRecordWriter(TaskAttemptContext job) throws IOException {
        final TaskAttemptContext myJob = job;
        final Configuration conf = job.getConfiguration();
        final FileSystem fs = FileSystem.get(conf);
        return new RecordWriter<IntWritable, Text>() {
            // One lazily created writer per key; keys are assumed to be 0-9.
            private final LineRecorderWriter[] outWriter = new LineRecorderWriter[FILENAMES.size()];

            @Override
            public void write(IntWritable key, Text value) throws IOException, InterruptedException {
                if (outWriter[key.get()] == null) {
                    Path filePath = getPathForWorkFile(myJob, FILENAMES.get(key.get()));
                    outWriter[key.get()] = new LineRecorderWriter(fs, filePath);
                }
                outWriter[key.get()].write(key, value);
            }

            @Override
            public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                for (LineRecorderWriter writer : outWriter) {
                    if (writer != null) {
                        writer.close(context);
                    }
                }
            }
        };
    }
    // Writes the value (followed by a newline) to a single file; the key is
    // only used for routing and is not written out.
    public static class LineRecorderWriter extends RecordWriter<IntWritable, Text> {
        private final FSDataOutputStream out;

        public LineRecorderWriter(FileSystem fs, Path path) throws IOException {
            out = fs.create(path);
        }

        @Override
        public void write(IntWritable key, Text value) throws IOException, InterruptedException {
            out.write(value.toString().getBytes(CODESET));
            out.write(newline);
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            out.close();
        }
    }
}
```
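To sanity-check the routing end to end, a short sketch (again illustrative only, assuming local mode and that the job above has already run) reads one of the generated files back and asserts every line matches its key:

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Sketch: every line in output1/file3 should be "value_3" if routing worked.
object VerifyRouting {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("verify").setMaster("local[2]"))
    try {
      val lines = sc.textFile("output1/file3")
      assert(lines.collect().forall(_ == "value_3"), "file3 should only contain value_3")
      println("routing OK")
    } finally {
      sc.stop()
    }
  }
}
```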