1 /* 2 matrix-matrix multiplication on Hadoop 3 4 A x B = C 5 constraint: A, B, C must be of the same size 6 7 I use this to evaluate the efficiency of Hadoop for matrix multiplication, 8 so I really don't care to handle non-square matrices. 9 10 ===Data preparation==== 11 Matrix data must be stored in a file on Hadoop. 12 Line number must be appended to the beginning of each line. 13 For example, the following represents a 4x4 matrix: 14 15 0 18 20 16 14 16 1 17 12 11 19 17 2 10 17 11 19 18 3 14 17 20 10 19 20 Left (A in this example) matrix should be stored in file "left"; 21 Right (B in this example) matrix should be stored in file "right"; 22 I use filenames to distinguish input data. 23 24 Place "left" and "right" in the same folder (let's call it "input") 25 26 ====Run the program==== 27 > hadoop jar matrixmul.jar MatrixMul input output 8 2 28 29 results will be placed in "output" folder on HDFS. 30 8: all matrices are 8x8 31 2: every partitioned block is of size 2x2 32 33 ===Read the results=== 34 Given the above sample command, we multiply two 8x8 matrices, 35 in many 2x2 blocks. So, that the resulted C matrix has 16 blocks. 36 37 In the output folder, there will be 16 separate files: 38 part-r-00000, part-r-00001, ... part-r-00015 39 40 Every file stores one block in C. In this example, every block 41 has 2 rows and 2 columns. 42 43 These files are organized in "row"-order. 44 45 ===Algorithm=== 46 Mappers read input data. 47 Every reducer processes one block of the resulted matrix. 48 49 */ 50 import java.io.IOException; 51 import java.util.StringTokenizer; 52 53 import org.apache.hadoop.conf.Configuration; 54 import org.apache.hadoop.fs.Path; 55 import org.apache.hadoop.io.IntWritable; 56 import org.apache.hadoop.io.LongWritable; 57 import org.apache.hadoop.io.Text; 58 import org.apache.hadoop.mapreduce.Job; 59 import org.apache.hadoop.mapreduce.Mapper; 60 import org.apache.hadoop.mapreduce.Reducer; 61 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 62 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 63 import org.apache.hadoop.mapreduce.lib.input.FileSplit; 64 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 65 import org.apache.hadoop.util.GenericOptionsParser; 66 67 public class MatrixMul { 68 69 public static class MyMapper extends Mapper<LongWritable, Text, IntWritable, Text>{ 70 71 private String filename=null; 72 private boolean isLeftMatrix=false; 73 private int totalSize, partSize, npart; 74 75 private boolean isLeft(){return isLeftMatrix;} 76 protected void setup(Context context) throws IOException, InterruptedException{ 77 //get filename 78 FileSplit fileSplit = (FileSplit)context.getInputSplit(); 79 filename = fileSplit.getPath().getName(); 80 if("left".equalsIgnoreCase(filename)) 81 isLeftMatrix=true; 82 else 83 isLeftMatrix=false; 84 85 //get how size and partition information 86 Configuration conf=context.getConfiguration(); 87 totalSize=conf.getInt("matrix-mul-totalsize", -1); 88 partSize=conf.getInt("matrix-mul-partsize", -1); 89 npart=conf.getInt("matrix-mul-npart", -1); 90 if(totalSize<0 || partSize<0 || npart<0){ 91 System.out.println("Error in setup of MyMapper."); 92 System.exit(1); 93 } 94 } 95 96 public void map(LongWritable key, Text value, Context context 97 ) throws IOException, InterruptedException { 98 String line=value.toString(); 99 String[] strs=line.split(" "); 100 if(strs.length!=totalSize+1){ 101 System.out.println("Error in map of Mapper."); 102 System.out.println(strs.length+"___"+totalSize); 103 System.out.println("line is: "+line); 104 System.exit(1); 105 } 106 int linenum=Integer.parseInt(strs[0]); 107 int[] numbers=new int[totalSize]; 108 for(int i=0;i<totalSize;i++) 109 numbers[i]=Integer.parseInt(strs[i+1]); 110 int part_hor=linenum/partSize; //horizontal partitioned id 111 int prev_part_ver=-1; 112 String msg=null; 113 for(int i=0;i<totalSize;i++){ 114 int part_ver=i/partSize; //vertical partition number 115 if(part_ver!=prev_part_ver){ 116 if(msg!=null){ 117 int baselinenum = part_hor * partSize; 118 int old=part_ver; 119 part_ver=prev_part_ver; 120 if(isLeft()){ 121 String toSend="l:"+(linenum - baselinenum)+":"+part_ver+"#"+msg; 122 System.out.println("left "+linenum+","+part_ver+" "+msg); 123 for(int k=0;k<npart;k++){ 124 int dest=part_hor * npart + k; 125 context.write(new IntWritable(dest), new Text(toSend)); 126 } 127 }else{ 128 String toSend="r:"+(linenum - baselinenum)+":"+part_hor+"#"+msg; 129 System.out.println("right "+part_ver+":"+linenum+" "+msg); 130 for(int k=0;k<npart;k++){ 131 int dest=k * npart + part_ver; 132 context.write(new IntWritable(dest), new Text(toSend)); 133 } 134 } 135 part_ver=old; 136 } 137 msg=null; 138 prev_part_ver=part_ver; 139 } 140 if(msg==null) 141 msg=""+strs[i+1]; 142 else 143 msg+=" "+strs[i+1]; 144 } 145 if(msg!=null){ //almost the same code 146 int part_ver=npart-1; 147 int baselinenum = part_hor * partSize; 148 if(isLeft()){ 149 String toSend="l:"+(linenum - baselinenum)+":"+part_ver+"#"+msg; 150 System.out.println("left "+linenum+","+part_ver+" "+msg); 151 for(int k=0;k<npart;k++){ 152 int dest=part_hor * npart + k; 153 context.write(new IntWritable(dest), new Text(toSend)); 154 } 155 }else{ 156 String toSend="r:"+(linenum - baselinenum)+":"+part_hor+"#"+msg; 157 System.out.println("right "+part_ver+":"+linenum+" "+msg); 158 for(int k=0;k<npart;k++){ 159 int dest=k * npart + part_ver; //has to be the last part 160 context.write(new IntWritable(dest), new Text(toSend)); 161 } 162 } 163 } 164 } 165 } 166 167 public static class MyReducer extends Reducer<IntWritable, Text, Text, Text> { 168 169 private int totalSize, partSize, npart; 170 int[][] left=null; 171 int[][] right=null; 172 protected void setup(Context context) throws IOException, InterruptedException{ 173 //get how # of partitions 174 Configuration conf=context.getConfiguration(); 175 totalSize=conf.getInt("matrix-mul-totalsize", -1); 176 partSize=conf.getInt("matrix-mul-partsize", -1); 177 npart=conf.getInt("matrix-mul-npart", -1); 178 if(totalSize<0 || partSize<0 || npart<0){ 179 System.out.println("Error in setup of MyReducer."); 180 System.exit(1); 181 } 182 left=new int[partSize][totalSize]; 183 right=new int[totalSize][partSize]; 184 } 185 public void reduce(IntWritable key, Iterable<Text> values, Context context 186 ) throws IOException, InterruptedException { 187 int sum = 0; 188 for (Text val : values) { 189 String line=val.toString(); 190 String[] meta_val=line.split("#"); 191 String[] metas=meta_val[0].split(":"); 192 String[] numbers=meta_val[1].split(" "); 193 194 int baselinenum=Integer.parseInt(metas[1]); 195 int blkindex=Integer.parseInt(metas[2]); 196 if("l".equalsIgnoreCase(metas[0])){ //from left matrix 197 int start=blkindex * partSize; 198 for(int i=0;i<partSize; i++) 199 left[baselinenum][start+i]=Integer.parseInt(numbers[i]); 200 }else{ 201 int rowindex=blkindex*partSize + baselinenum; 202 for(int i=0;i<partSize; i++) 203 right[rowindex][i]=Integer.parseInt(numbers[i]); 204 } 205 } 206 } 207 protected void cleanup(Context context) throws IOException, InterruptedException { 208 //now let's do the calculation 209 int[][] res=new int[partSize][partSize]; 210 for(int i=0;i<partSize;i++) 211 for(int j=0;j<partSize;j++) 212 res[i][j]=0; 213 for(int i=0;i<partSize;i++){ 214 for(int k=0;k<totalSize;k++){ 215 for(int j=0;j<partSize;j++){ 216 res[i][j]+=left[i][k]*right[k][j]; 217 } 218 } 219 } 220 for(int i=0;i<partSize;i++){ 221 String output=null; 222 for(int j=0;j<partSize;j++){ 223 if(output==null) 224 output=""+res[i][j]; 225 else 226 output+=" "+res[i][j]; 227 } 228 context.write(new Text(output), null); 229 } 230 } 231 } 232 public static void main(String[] args) throws Exception { 233 Configuration conf = new Configuration(); 234 if (args.length != 4) { 235 System.err.println("Usage: MatrixMul input-dir output-dir total-size part-size"); 236 System.exit(2); 237 } 238 int totalsize=Integer.parseInt(args[2]); 239 int partsize=Integer.parseInt(args[3]); 240 if(totalsize==0 || partsize==0 || partsize>totalsize){ 241 System.out.println("Invalid total-size or part-size"); 242 System.exit(1); 243 } 244 conf.setInt("matrix-mul-totalsize", totalsize); //the matrix is 'totalsize' by 'totalsize' 245 conf.setInt("matrix-mul-partsize", partsize); //every block is 'partsize' by 'partsize' 246 int npart=totalsize/partsize; 247 if(npart*partsize<totalsize) 248 npart++; 249 conf.setInt("matrix-mul-npart", npart); //number of parts on one dimension 250 Job job = new Job(conf, "matrix-mul"); 251 job.setJarByClass(MatrixMul.class); 252 job.setMapperClass(MyMapper.class); 253 job.setReducerClass(MyReducer.class); 254 job.setNumReduceTasks(npart*npart); 255 256 job.setOutputKeyClass(IntWritable.class); 257 job.setOutputValueClass(Text.class); 258 259 //FileInputFormat.addInputPath(job, new Path(args[0])); 260 TextInputFormat.addInputPath(job, new Path(args[0])); //need to read a complete line 261 FileOutputFormat.setOutputPath(job, new Path(args[1])); 262 job.waitForCompletion(true) ; 263 } 264 }