背帆

大数据竞赛MR培训与题型

MapReduce编程模板

1.自定义Mapper类，继承Mapper类并重写map方法：

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;

/**
 * Word-count mapper: splits every input line on commas and emits
 * {@code (token, 1)} for each resulting token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // A line such as "aa,bb,cc" yields the tokens aa, bb and cc.
        final String[] tokens = value.toString().split(",");
        for (int i = 0; i < tokens.length; i++) {
            // The token is the output key; a single occurrence counts as 1.
            context.write(new Text(tokens[i]), new IntWritable(1));
        }
    }
}
//hadoop对应java数据类型

Java	Hadoop
int	IntWritable
long	LongWritable
string	Text
byte	ByteWritable
double	DoubleWritable
float	FloatWritable
boolean	BooleanWritable
null	NullWritable

自己定义的需要序列化和反序列化可以通过实现 Writable接口来使用。

在重写map方法时，如果中间处理数据时将类型转化为Java的数据类型，将结果写入上下文对象Context，要重新转为Hadoop的类型。

2.自定义Reducer类，继承Reducer类并重写reduce方法

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
 
import java.io.IOException;
 
/**
 * Word-count reducer: adds up all counts received for one word and emits
 * {@code (word, total)}.
 */
public class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // The framework has already grouped the map output by key, so `values`
        // holds every count emitted for this one word, e.g. {"aa": [1, 1, 1, 1, 1]}.
        // Adding those counts together gives the number of occurrences of the word.
        int total = 0;
        for (IntWritable count : values) {
            total = total + count.get();
        }
        IntWritable result = new IntWritable(total);
        context.write(key, result);
    }
}

3.Driver 主入口，整合mapper和reducer

(1) 配置conf并开启一个job

(2) 指定mapper类和reducer类

(3) 设置map输出key value的类型和设置reduce输出key value的类型

(4) 创建输入流FileInputFormat设置输入的hdfs的指定位置

(5) 创建输出流FileOutputFormat，将结果输出到hdfs的指定位置

(6) job提交语句：job.waitForCompletion(true) ，true表示需要打印日志

/**
 * Driver template: configures the job, wires the mapper and reducer, sets the
 * input/output paths and submits the job.
 *
 * <p>The process exit status reflects the job result (0 on success, 1 on
 * failure) so that calling scripts can detect a failed job.
 *
 * @param args command-line arguments (unused)
 * @throws IOException            if HDFS or job submission fails
 * @throws InterruptedException   if the wait for job completion is interrupted
 * @throws ClassNotFoundException if a job class cannot be loaded
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        // Default HDFS address.
        conf.set("fs.defaultFS", "hdfs://master:9000");
        // Parameter for submitting a job from Windows (cross-platform submission).
        // conf.set("mapreduce.app-submission.cross-platform","true");
        conf.set("mapred.job.tracker", "master:54311");
        // User the job runs as.
        System.setProperty("HADOOP_USER_NAME", "root");

        // Create the job and wire mapper / reducer.
        // NOTE(review): the map/reduce output classes below must match the generic
        // parameters of CountMapper / CountReducer, which are not shown here — confirm.
        Job job = Job.getInstance(conf);
        job.setJarByClass(test.class);
        job.setMapperClass(CountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(CountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Configure the input and output paths.
        FileInputFormat.addInputPath(job,new Path("hdfs://master:9000/data/stu_score_sub.csv"));
        Path out = new Path("hdfs://master:9000/output");
        // The job fails if the output directory already exists, so remove it first.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out)){
            fs.delete(out,true);
        }
        FileOutputFormat.setOutputPath(job,out);

        // true = print progress; propagate the job result as the exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

MapReduce各个情景实战

1.多个输入

求每个同学每科成绩的总分
chinese.txt

english.txt

math.txt

Student.java

注意：序列化/反序列化机制：当自定义了一个类之后，如果想让它的对象在Hadoop中进行传输，就需要这个类实现Hadoop提供的Writable接口。实现时只需要在write方法中按顺序写出各个字段，并在readFields方法中按相同的顺序读回，即可完成序列化/反序列化。

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

/**
 * Per-student score record that can be shipped between map and reduce tasks.
 *
 * <p>Serialization order is name, chinese, english, math; {@link #readFields}
 * reads the fields back in exactly that order.
 */
public class Student implements Writable{
	private String name;
	private Integer chinese;
	private Integer english;
	private Integer math;

	public String getName() {
		return this.name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public Integer getChinese() {
		return this.chinese;
	}

	public void setChinese(Integer chinese) {
		this.chinese = chinese;
	}

	public Integer getEnglish() {
		return this.english;
	}

	public void setEnglish(Integer english) {
		this.english = english;
	}

	public Integer getMath() {
		return this.math;
	}

	public void setMath(Integer math) {
		this.math = math;
	}

	/** Serializes the fields in declaration order. */
	@Override
	public void write(DataOutput output) throws IOException {
		output.writeUTF(this.name);
		output.writeInt(this.chinese);
		output.writeInt(this.english);
		output.writeInt(this.math);
	}

	/** Deserializes the fields in the order {@link #write} produced them. */
	@Override
	public void readFields(DataInput input) throws IOException {
		this.name = input.readUTF();
		this.chinese = input.readInt();
		this.english = input.readInt();
		this.math = input.readInt();
	}

	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("Student [name=");
		text.append(this.name);
		text.append(", chinese=").append(this.chinese);
		text.append(", english=").append(this.english);
		text.append(", math=").append(this.math);
		text.append("]");
		return text.toString();
	}
}

Mapper代码

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * Maps one score line to {@code (name, Student)}.
 *
 * <p>Each input line has the form {@code "<month> <name> <score>"}, for example
 * {@code "1 zhang 89"}. The subject is not part of the line; it is derived from
 * the name of the file the current split belongs to (chinese.txt, math.txt or
 * english.txt). The emitted {@link Student} carries the score in the matching
 * subject and 0 in the other two, so the reducer can simply add the records up.
 *
 * <p>Lines with fewer than three fields and lines from any other file are skipped.
 */
public class ScoreMapper extends Mapper<LongWritable, Text, Text, Student> {
	@Override
	protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Student>.Context context) throws IOException, InterruptedException {
		// Parse the line first: the score is needed by every branch below.
		String[] datas = value.toString().split(" ");
		if (datas.length < 3) {
			// Blank or malformed line: nothing to emit.
			return;
		}
		String name = datas[1];
		int score = Integer.parseInt(datas[2]);

		// All subjects start at 0 so that every field is set before serialization.
		Student student = new Student();
		student.setName(name);
		student.setChinese(0);
		student.setEnglish(0);
		student.setMath(0);

		// The file name tells which subject this score belongs to.
		FileSplit split = (FileSplit) context.getInputSplit();
		String fileName = split.getPath().getName();
		if ("chinese.txt".equals(fileName)) {
			student.setChinese(score);
		} else if ("math.txt".equals(fileName)) {
			student.setMath(score);
		} else if ("english.txt".equals(fileName)) {
			student.setEnglish(score);
		} else {
			// Not one of the three score files (e.g. a stray file in the input directory).
			return;
		}
		context.write(new Text(name), student);
	}
}

上面用到的FileSplit类用法

	// Illustrative overview of FileSplit accessors. In a real mapper the split is
	// obtained with (FileSplit) context.getInputSplit(), as ScoreMapper does,
	// rather than constructed directly.
	FileSplit fs = new FileSplit();
	String pathname=fs.getPath().getName(); // last component of the split's path, i.e. the file name (e.g. "chinese.txt")
    int depth = fs.getPath().depth();       // depth of the path — presumably the number of path components; TODO confirm
    fs.getClass(); // runtime class of this object
    long length = fs.getLength(); // length reported by the split (the original note calls it the file length) — TODO confirm split size vs. whole file
    SplitLocationInfo[] locationInfo =fs.getLocationInfo(); // location info for the split
    String[] locations = fs.getLocations(); // locations of the split — presumably host names; TODO confirm

Reducer代码

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Adds up, per student, the scores of every subject across all records that
 * share the student's name and emits a single combined {@link Student}.
 */
public class ScoreReducer extends Reducer<Text, Student, Text, Student> {
	@Override
	protected void reduce(Text key, Iterable<Student> values, Reducer<Text, Student, Text, Student>.Context context) throws IOException, InterruptedException {
		int chineseTotal = 0;
		int englishTotal = 0;
		int mathTotal = 0;
		for (Student record : values) {
			chineseTotal += record.getChinese();
			englishTotal += record.getEnglish();
			mathTotal += record.getMath();
		}

		// One output record per student, keyed by the student's name.
		Student summary = new Student();
		summary.setName(key.toString());
		summary.setChinese(chineseTotal);
		summary.setEnglish(englishTotal);
		summary.setMath(mathTotal);
		context.write(key, summary);
	}
}

Driver代码

/**
 * Driver for the multi-input score job: wires {@link ScoreMapper} and
 * {@link ScoreReducer}, reads every file under the score directory and writes
 * the per-student totals to the result directory.
 */
public class ScoreDriver {
	/**
	 * Configures and submits the job, then exits with 0 on success or 1 on failure.
	 *
	 * @param args command-line arguments (unused)
	 * @throws ClassNotFoundException if a job class cannot be loaded
	 * @throws IOException            if HDFS or job submission fails
	 * @throws InterruptedException   if the wait for job completion is interrupted
	 */
	public static void main(String[] args) throws ClassNotFoundException,IOException,InterruptedException {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(ScoreDriver.class);
		job.setMapperClass(ScoreMapper.class);
		job.setReducerClass(ScoreReducer.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Student.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Student.class);
		// Read every file under the input directory. The result directory lives
		// inside it, so it must not exist yet when the job starts.
		FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.76.131:9000/score"));
		FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.76.131:9000/score/result"));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

2.排序

根据电影热度对电影排序
惊天破 72
机械师2 83
奇异博士 67
但丁密码 79
比利林恩的中场战事 84
侠探杰克:永不回头 68
龙珠Z:复活的弗利萨 79
长城 56

Mapper 排序是根据KEY值进行排序的，所以 PO类作为KEY值

MovieBean.java

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Movie with its popularity score, usable as a MapReduce key.
 *
 * <p>Keys are sorted by the framework using {@link #compareTo}, which orders
 * movies by popularity from highest to lowest.
 */
public class MovieBean implements WritableComparable<MovieBean>{
	private String name;
	private Integer hotNum;
	public String getName() {return name;}
	public void setName(String name) {this.name = name;}
	public Integer getHotNum() {return hotNum;}
	public void setHotNum(Integer hotNum) {this.hotNum = hotNum;}
	/** Deserializes the fields in the order {@link #write} produced them. */
	@Override
	public void readFields(DataInput input) throws IOException {
		this.name = input.readUTF();
		this.hotNum = input.readInt();
	}
	/** Serializes name, then popularity. */
	@Override
	public void write(DataOutput output) throws IOException {
		output.writeUTF(this.name);
		output.writeInt(this.hotNum);
	}
	@Override
	public String toString() {
		return "MovieBean [name=" + name + ", hotNum=" + hotNum + "]";
	}
	/**
	 * Orders movies by popularity, highest first.
	 *
	 * <p>The arguments are swapped (other first, this second) to get descending
	 * order. {@code Integer.compare} is used instead of subtraction because a
	 * difference of two ints can overflow and flip the sign of the result.
	 */
	@Override
	public int compareTo(MovieBean o) {
		return Integer.compare(o.getHotNum(), this.getHotNum());
	}
}

实现WritableComparable接口，重写 compareTo（）函数，定义比较结果

Mapper代码

/**
 * Turns each {@code "<name> <hotNum>"} line into a {@link MovieBean} key so
 * that the framework's key sort orders the movies; no value is needed.
 */
public class SortMapper extends Mapper<LongWritable, Text, MovieBean, NullWritable> {
    @Override
    protected void map(LongWritable key,Text value,Mapper<LongWritable,Text,MovieBean, NullWritable>.Context context)	throws IOException, InterruptedException {
        final String[] fields = value.toString().split(" ");
        final String movieName = fields[0];
        final Integer popularity = Integer.valueOf(fields[1]);

        MovieBean movie = new MovieBean();
        movie.setName(movieName);
        movie.setHotNum(popularity);
        context.write(movie, NullWritable.get());
    }
}

Driver代码

/**
 * Driver for the movie sort job. Only a mapper is configured; the ordering
 * comes from the sort on the {@link MovieBean} map output key.
 */
public class SortDriver {
    public static void main(String[] args) throws IllegalArgumentException,IOException, ClassNotFoundException, InterruptedException {
        Job sortJob = Job.getInstance(new Configuration());
        sortJob.setJarByClass(SortDriver.class);

        // Mapper and its output types.
        sortJob.setMapperClass(SortMapper.class);
        sortJob.setMapOutputKeyClass(MovieBean.class);
        sortJob.setMapOutputValueClass(NullWritable.class);

        // Input directory and output directory.
        Path input = new Path("hdfs://192.168.76.131:9000/sort");
        Path output = new Path("hdfs://192.168.76.131:9000/sort/result");
        FileInputFormat.setInputPaths(sortJob, input);
        FileOutputFormat.setOutputPath(sortJob, output);

        sortJob.waitForCompletion(true);
    }
}

3.多层MR处理多 Job 串联

一个稍复杂点的处理逻辑往往需要多个 MapReduce 程序串联处理，多 job 的串联可以借助 MapReduce 框架的 JobControl 实现

在第一层MR处理基础上
添加第二个JOB处理第一个JOB的运行结果
例子：
计算每人3个月的总收入并排序

第一个MR：计算每人的总收入
第二个MR：按照收入进行排序Mapper

以下有两个 MapReduce 任务，分别是 Flow的 SumMR 和 SortMR，其中有依赖关系：SumMR 的输出是 SortMR 的输入，所以 SortMR 的启动得在 SumMR 完成之后

    /**
     * Runs two dependent jobs through JobControl: the sum job first, then the
     * sort job, which reads the sum job's output directory.
     *
     * <p>Exits with 0 when both jobs succeed and 1 when any job failed; a sort
     * job that could not run because the sum job failed also counts as failed.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if job setup fails or the wait loop is interrupted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Job 1: total per key.
        Job jobsum = Job.getInstance(conf);
        jobsum.setJarByClass(RunManyJobMR.class);
        jobsum.setMapperClass(FlowSumMapper.class);
        jobsum.setReducerClass(FlowSumReducer.class);
        jobsum.setMapOutputKeyClass(Text.class);
        jobsum.setMapOutputValueClass(Flow.class);
        // NOTE(review): a combiner must emit the map output types (Text, Flow), but the
        // job output value class is Text. If FlowSumReducer emits Text values it cannot
        // also serve as the combiner — confirm against FlowSumReducer.
        jobsum.setCombinerClass(FlowSumReducer.class);
        jobsum.setOutputKeyClass(Text.class);
        jobsum.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(jobsum, "d:/flow/input");
        FileOutputFormat.setOutputPath(jobsum, new Path("d:/flow/output12"));

        // Job 2: sort, consuming the output of job 1.
        Job jobsort = Job.getInstance(conf);
        jobsort.setJarByClass(RunManyJobMR.class);
        jobsort.setMapperClass(FlowSortMapper.class);
        jobsort.setReducerClass(FlowSortReducer.class);
        jobsort.setMapOutputKeyClass(Flow.class);
        jobsort.setMapOutputValueClass(Text.class);
        jobsort.setOutputKeyClass(NullWritable.class);
        jobsort.setOutputValueClass(Flow.class);
        FileInputFormat.setInputPaths(jobsort, "d:/flow/output12");
        FileOutputFormat.setOutputPath(jobsort, new Path("d:/flow/sortoutput12"));

        ControlledJob sumcj = new ControlledJob(jobsum.getConfiguration());
        ControlledJob sortcj = new ControlledJob(jobsort.getConfiguration());

        sumcj.setJob(jobsum);
        sortcj.setJob(jobsort);
        // Declare the dependency: the sort job starts only after the sum job succeeded.
        sortcj.addDependingJob(sumcj);

        JobControl jc = new JobControl("flow sum and sort");
        jc.addJob(sumcj);
        jc.addJob(sortcj);

        // JobControl is a Runnable; it schedules the jobs from its own thread.
        Thread jobThread = new Thread(jc);
        jobThread.start();
        while(!jc.allFinished()){
            Thread.sleep(500);
        }
        // "Finished" includes failed jobs, so check for failures explicitly.
        boolean succeeded = jc.getFailedJobList().isEmpty();
        jc.stop();
        System.exit(succeeded ? 0 : 1);
    }

4.TopN算法-自定义 GroupComparator

输入文件格式

algorithm,liuyifei,75,85,62,48,54,96,15
computer,huangjiaju,85,75,86,85,85
english,liuyifei,76,95,86,74,68,74,48
english,huangdatou,48,58,67,86,15,33,85
algorithm,huanglei,76,95,86,74,68,74,48
algorithm,huangjiaju,85,75,86,85,85,74,86
computer,huangdatou,48,58,67,86,15,33,85

输出文件格式

k=3, 按课程分4个文件,每个文件保存平均成绩前3的人名和平均成绩

algorithm huangjiaju的成绩:62.0
algorithm liutao的成绩:56.57
algorithm huanglei的成绩:55.89

实现Comparable接口的比较类MyCom

/**
 * Pair of student name and average score, naturally ordered by average score
 * in ascending order.
 */
static class MyCom implements Comparable<MyCom>{
        // Student name.
        private String tname;
        // Average score of that student.
        private Double tscore;

        public String getTname() {
            return this.tname;
        }

        public void setTname(String tname) {
            this.tname = tname;
        }

        public Double getTscore() {
            return this.tscore;
        }

        public void setTscore(Double tscore) {
            this.tscore = tscore;
        }

        /** Compares by average score only; the name does not take part. */
        @Override
        public int compareTo(MyCom o) {
            final Double otherScore = o.getTscore();
            return Double.compare(this.tscore, otherScore);
        }
    }

Map代码

/**
 * Maps a line {@code "<course>,<name>,<score>,<score>,..."} to
 * {@code (course, "<name>,<average>")}, where the average is taken over all
 * scores on the line and formatted with two decimals.
 *
 * <p>Lines that do not contain a course, a name and at least one score are skipped.
 */
static class TopMaper extends Mapper<LongWritable,Text,Text,Text> {
    // Input is <byte offset, line>; output is <course, "name,average">.
    private Text mkey=new Text();
    private Text mvalue=new Text();
    // Two digits after the decimal point.
    private final DecimalFormat df=new DecimalFormat("0.00");
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        System.out.println("map");
        // Split the line on commas.
        String[] lines=value.toString().split(",");
        // Validate before touching any index: course, name and one score are required,
        // otherwise there is nothing to average.
        if (lines.length<3){
            return;
        }
        // The first field is the course and becomes the output key.
        mkey.set(lines[0]);
        // Every field from index 2 onwards is a score.
        double sum=0;
        for (int i=2;i<lines.length;i++){
            sum+=Double.parseDouble(lines[i]);
        }
        // Divide by the number of scores, not by the number of fields.
        int scoreCount=lines.length-2;
        // Output value is "name,average".
        mvalue.set(lines[1]+","+df.format(sum/scoreCount));
        context.write(mkey,mvalue);
    }
}

Reduce代码

/**
 * For one course, emits the (at most) three students with the highest average
 * score, best first. Input values have the form {@code "<name>,<average>"}.
 */
static class TopReduceer extends Reducer<Text,Text,Text,Text> {
    private Text rvalue=new Text();
    @Override
    protected void reduce(Text mkey, Iterable<Text> iter, Context context) throws IOException, InterruptedException {
        System.out.println("reduce");
        // Collect every (name, average) pair of this course.
        List<MyCom> slist=new ArrayList<>();
        for (Text it:iter){
            String[] lines = it.toString().split(",");
            MyCom c=new MyCom();
            c.setTname(lines[0]);
            c.setTscore(Double.valueOf(lines[1]));
            slist.add(c);
        }
        // Sort ascending by average, then reverse to get descending order.
        Collections.sort(slist);
        Collections.reverse(slist);
        // A course may have fewer than three students; never read past the end of the list.
        int limit=Math.min(3, slist.size());
        for (int k=0;k<limit;k++){
            MyCom s = slist.get(k);
            rvalue.set(s.getTname() + "的成绩:" + s.getTscore());
            context.write(mkey, rvalue);
        }
    }
}

执行主方法

/**
 * Driver of the top-N job: wires {@code TopMaper} and {@code TopReduceer},
 * uses four reduce tasks and exits with the job result.
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Job topJob = Job.getInstance(new Configuration());
    // Class that contains the main method.
    topJob.setJarByClass(Topk.class);

    // Map side.
    topJob.setMapperClass(TopMaper.class);
    topJob.setMapOutputKeyClass(Text.class);
    topJob.setMapOutputValueClass(Text.class);

    // Reduce side.
    topJob.setReducerClass(TopReduceer.class);
    topJob.setOutputKeyClass(Text.class);
    topJob.setOutputValueClass(Text.class);

    // Four reduce tasks instead of the default of one. No Partitioner is set here,
    // so records are partitioned by the map output key, one partition per reduce task.
    topJob.setNumReduceTasks(4);

    // Input file and output directory.
    Path input = new Path("d:\\mr\\input\\grade.txt");
    Path output = new Path("d:\\mr\\outtopk");
    FileInputFormat.addInputPath(topJob, input);
    FileOutputFormat.setOutputPath(topJob, output);

    boolean succeeded = topJob.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
}

5.全局计数器

以下是一个利用全局计数器来统计一个目录下所有文件出现的单词总数和总行数

package com.mr.counter;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Map-only job that uses job counters to count the total number of lines and
 * the total number of space-separated words in all files of a directory.
 */
public class CounterWordCount {
    /** Counters maintained by the mapper. */
    enum CouterWordCountC{COUNT_WORDS, COUNT_LINES}

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job counterJob = Job.getInstance(configuration);
        counterJob.setJarByClass(CounterWordCount.class);
        counterJob.setMapperClass(WCCounterMapper.class);
        counterJob.setMapOutputKeyClass(Text.class);
        counterJob.setMapOutputValueClass(LongWritable.class);
        // Map-only job: the counters are the result, no reduce phase is needed.
        counterJob.setNumReduceTasks(0);

        FileInputFormat.setInputPaths(counterJob, new Path("d:/wordcount/input"));

        // Remove a previous output directory so the job can be rerun.
        Path target = new Path("d:/wordcount/output");
        FileSystem fileSystem = FileSystem.get(configuration);
        if (fileSystem.exists(target)) {
            fileSystem.delete(target, true);
        }
        FileOutputFormat.setOutputPath(counterJob, target);

        System.exit(counterJob.waitForCompletion(true) ? 0 : 1);
    }

    /** Increments the line counter once per record and the word counter once per token. */
    private static class WCCounterMapper extends Mapper<LongWritable, Text, Text,
            LongWritable>{
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Text input is read line by line, so one map call corresponds to one line.
            context.getCounter(CouterWordCountC.COUNT_LINES).increment(1L);
            final int tokenCount = value.toString().split(" ").length;
            for (int i = 0; i < tokenCount; i++) {
                // One increment per token of the line.
                context.getCounter(CouterWordCountC.COUNT_WORDS).increment(1L);
            }
        }
    }
}

6.MapJoin

MapJoin 适用于有一份数据较小的连接情况。做法是直接把该小份数据直接全部加载到内存当中，按链接关键字建立索引。然后大份数据就作为 MapTask 的输入，对 map()方法的每次输入都去内存当中直接去匹配连接。然后把连接结果按 key 输出，这种方法要使用 hadoop 中的 DistributedCache 把小份数据分布到各个计算节点，每个 maptask 执行任务的节点都需要加载该数据到内存，并且按连接关键字建立索引
现有两份数据 movies.dat 和 ratings.dat

数据样式分别为：

Movies.dat ---- 字段含义：movieid, moviename, movietype

1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance

Ratings.dat ---- 字段含义：userid, movieid, rate, timestamp

1::1193::5::978300760
1::661::3::978302109
1::914::3::978301968

Select * from movie a join ratings b on a.movieid = b.movieid

现要求对两表进行连接，要求输出最终的结果有以上六个字段：

movieid, userid, rate, moviename, movietype, timestamp

第一步：封装 MovieRate,方便数据的排序和序列化

package com.mr.mymapjoin;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * One joined record of a rating and its movie: movie id, user id, rate, movie
 * name, movie type and timestamp. Usable as a MapReduce key.
 *
 * <p>Ordering is by movie id and then by user id, both in descending string order.
 */
public class MovieRate implements WritableComparable<MovieRate>{
    private String movieid;
    private String userid;
    private int rate;
    private String movieName;
    private String movieType;
    private long ts;

    /** No-argument constructor. */
    public MovieRate() {}

    public MovieRate(String movieid, String userid, int rate, String movieName,
                     String movieType, long ts) {
        this.movieid = movieid;
        this.userid = userid;
        this.rate = rate;
        this.movieName = movieName;
        this.movieType = movieType;
        this.ts = ts;
    }

    public String getMovieid() {
        return this.movieid;
    }

    public void setMovieid(String movieid) {
        this.movieid = movieid;
    }

    public String getUserid() {
        return this.userid;
    }

    public void setUserid(String userid) {
        this.userid = userid;
    }

    public int getRate() {
        return this.rate;
    }

    public void setRate(int rate) {
        this.rate = rate;
    }

    public String getMovieName() {
        return this.movieName;
    }

    public void setMovieName(String movieName) {
        this.movieName = movieName;
    }

    public String getMovieType() {
        return this.movieType;
    }

    public void setMovieType(String movieType) {
        this.movieType = movieType;
    }

    public long getTs() {
        return this.ts;
    }

    public void setTs(long ts) {
        this.ts = ts;
    }

    /** Tab-separated line: movieid, userid, rate, movieName, movieType, ts. */
    @Override
    public String toString() {
        StringBuilder row = new StringBuilder();
        row.append(movieid).append("\t");
        row.append(userid).append("\t");
        row.append(rate).append("\t");
        row.append(movieName).append("\t");
        row.append(movieType).append("\t");
        row.append(ts);
        return row.toString();
    }

    /** Serializes the fields; {@link #readFields} reads them back in the same order. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.movieid);
        out.writeUTF(this.userid);
        out.writeInt(this.rate);
        out.writeUTF(this.movieName);
        out.writeUTF(this.movieType);
        out.writeLong(this.ts);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        movieid = in.readUTF();
        userid = in.readUTF();
        rate = in.readInt();
        movieName = in.readUTF();
        movieType = in.readUTF();
        ts = in.readLong();
    }

    /** The other object is the receiver of each comparison, which yields descending order. */
    @Override
    public int compareTo(MovieRate mr) {
        final int byMovie = mr.getMovieid().compareTo(this.movieid);
        if (byMovie != 0) {
            return byMovie;
        }
        // Same movie: fall back to the user id.
        return mr.getUserid().compareTo(this.userid);
    }
}

第二步：编写 MapReduce 程序

package com.mr.mymapjoin;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Map-side join of ratings with movies.
 *
 * <p>The small movies file is distributed as a cache file and loaded into an
 * in-memory map keyed by movie id in {@code setup}. The large ratings file is
 * the job input; every rating is joined with its movie in {@code map}. The job
 * is map-only.
 *
 * <p>Arguments: {@code args[0]} = small input (movies), {@code args[1]} = large
 * input (ratings), {@code args[2]} = output directory.
 */
public class MovieRatingMapJoinMR {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        System.setProperty("HADOOP_USER_NAME","hadoop");
        Job job = Job.getInstance(conf);
		// job.setJarByClass(MovieRatingMapJoinMR.class);
        job.setJar("/home/hadoop/mrmr.jar");
        job.setMapperClass(MovieRatingMapJoinMRMapper.class);
        job.setMapOutputKeyClass(MovieRate.class);
        job.setMapOutputValueClass(NullWritable.class);
		// job.setReducerClass(MovieRatingMapJoinMReducer.class);
		// job.setOutputKeyClass(MovieRate.class);
		// job.setOutputValueClass(NullWritable.class);
        // Map-only job: the join happens entirely in the mapper.
        job.setNumReduceTasks(0);
        String minInput = args[0];
        String maxInput = args[1];
        String output = args[2];
        FileInputFormat.setInputPaths(job, new Path(maxInput));
        // Remove a previous output directory so the job can be rerun.
        Path outputPath = new Path(output);
        FileSystem fs = FileSystem.get(conf);
        if(fs.exists(outputPath)){
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);
        // Ship the small file to every task as a cache file.
        URI uri = new Path(minInput).toUri();
        job.addCacheFile(uri);
        boolean status = job.waitForCompletion(true);
        System.exit(status?0:1);
    }
    /** Joins each rating line with the movie loaded from the cache file. */
    static class MovieRatingMapJoinMRMapper extends Mapper<LongWritable,Text,MovieRate,NullWritable>{
        // Every movie parsed from the small file, keyed by movie id.
        private static Map<String, Movie> movieMap = new HashMap<String, Movie>();
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Path[] localCacheFilePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            String myfilePath = localCacheFilePaths[0].toString();
            System.out.println(myfilePath);
            URI[] cacheFiles = context.getCacheFiles();
            System.out.println(cacheFiles[0].toString());
            // try-with-resources closes the reader even when reading or parsing fails.
            try (BufferedReader br = new BufferedReader(new FileReader(myfilePath))) {
                // Each line is one movie: movieid::moviename::movietype
                String line;
                while ((line = br.readLine()) != null) {
                    String[] splits = line.split("::");
                    if (splits.length < 3) {
                        // Blank or malformed movie line: cannot be indexed.
                        continue;
                    }
                    movieMap.put(splits[0], new Movie(splits[0], splits[1], splits[2]));
                }
            }
        }
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException,
                InterruptedException {
            // Rating line: userid::movieid::rate::timestamp
            String[] splits = value.toString().split("::");
            String userid = splits[0];
            String movieid = splits[1];
            int rate = Integer.parseInt(splits[2]);
            long ts = Long.parseLong(splits[3]);
            Movie movie = movieMap.get(movieid);
            if (movie == null) {
                // Inner join: a rating without a matching movie produces no output.
                return;
            }
            MovieRate mr = new MovieRate(movieid, userid, rate, movie.getMovieName(),
                    movie.getMovieType(), ts);
            context.write(mr, NullWritable.get());
        }
    }
}

7.最简单的wordcount

测试数据：

zhangyong zhangrui zhangqin
zhangyong zhangrui zhangqin
zhangyong zhangrui zhangqin

mapper类

/**
 * Word-count mapper: splits each line on single spaces and emits
 * {@code (word, 1)} for every token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The source data separates words with a space, so split the line on it.
        final String[] words = value.toString().split(" ");
        for (int i = 0; i < words.length; i++) {
            Text word = new Text(words[i]);
            LongWritable one = new LongWritable(1);
            context.write(word, one);
        }
    }
}

reducer类

/**
 * Word-count reducer: adds up the counts emitted for one word and writes
 * {@code (word, total)}.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Running total of the occurrences of this word.
        long occurrences = 0;
        for (LongWritable count : values) {
            occurrences = occurrences + count.get();
        }
        // Key is the word, value is how often the map phase emitted it.
        LongWritable total = new LongWritable(occurrences);
        context.write(key, total);
    }
}

Driver类

/**
 * Driver of the word-count job: wires {@link WordCountMapper} and
 * {@link WordCountReducer} and submits the job.
 */
public class WordCountDriver {
    /**
     * Configures and submits the job, then exits with 0 on success or 1 on failure.
     *
     * @param args command-line arguments (unused)
     * @throws IOException            if HDFS or job submission fails
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if the wait for job completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Default configuration.
        Configuration conf = new Configuration ();
        // Job object representing this MapReduce run.
        Job job = Job.getInstance (conf);
        // Entry class of the program.
        job.setJarByClass (WordCountDriver.class);
        // Mapper and reducer of the job.
        job.setMapperClass (WordCountMapper.class);
        job.setReducerClass (WordCountReducer.class);
        // Mapper output types.
        job.setMapOutputKeyClass (Text.class);
        job.setMapOutputValueClass (LongWritable.class);
        // Reducer output types.
        job.setOutputKeyClass (Text.class);
        job.setOutputValueClass (LongWritable.class);
        // Directory to analyse and directory for the result.
        FileInputFormat.setInputPaths (job, new Path ("hdfs://anshun115:9000/mapreduce"));
        FileOutputFormat.setOutputPath (job, new Path ("hdfs://anshun115:9000/result/mapreduce"));
        // Report the job result through the exit status instead of returning silently.
        System.exit (job.waitForCompletion (true) ? 0 : 1);
    }
}

8.求每年的最高温度

测试数据：

2329999919500515070000
9909999919500515120022
9909999919500515180011
9509999919490324120111
6509999919490324180078

代码

/**
 * Extracts the year (characters 8-11) and the temperature (characters 18-21)
 * from a fixed-width record and emits {@code (year, temperature)}.
 */
public class HeightMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        final String record = value.toString ();
        // Year field of the record.
        final String yearField = record.substring (8, 12);
        // Temperature field, parsed from its digits.
        final int temperature = Integer.parseInt (record.substring (18, 22));

        Text outKey = new Text (yearField);
        LongWritable outValue = new LongWritable (temperature);
        context.write (outKey, outValue);
    }
}

/**
 * Emits, for each year, the highest temperature among the values received.
 */
public class HeightReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    public void reduce(Text year, Iterable<LongWritable> t, Context context) throws IOException, InterruptedException {
        // Start below every possible value: starting at 0 would report 0 for a
        // year in which all temperatures are negative.
        long max = Long.MIN_VALUE;
        for (LongWritable data : t) {
            max = Math.max (max, data.get ());
        }
        context.write (year, new LongWritable (max));
    }
}

/**
 * Driver of the temperature job: wires {@link HeightMapper} and
 * {@link HeightReducer} and submits the job.
 */
public class HeightDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job temperatureJob = Job.getInstance (new Configuration ());
        temperatureJob.setJarByClass (HeightDriver.class);

        // Map side and its output types.
        temperatureJob.setMapperClass (HeightMapper.class);
        temperatureJob.setMapOutputKeyClass (Text.class);
        temperatureJob.setMapOutputValueClass (LongWritable.class);

        // Reduce side and the final output types.
        temperatureJob.setReducerClass (HeightReducer.class);
        temperatureJob.setOutputKeyClass (Text.class);
        temperatureJob.setOutputValueClass (LongWritable.class);

        // Input directory and result directory.
        Path input = new Path ("hdfs://anshun115:9000/wendu/");
        Path output = new Path ("hdfs://anshun115:9000/result/wendu");
        FileInputFormat.setInputPaths (temperatureJob, input);
        FileOutputFormat.setOutputPath (temperatureJob, output);

        temperatureJob.waitForCompletion (true);
    }
}

9.分区多路输出

测试数据：

13901000123 zs bj 343
13202111011 ww sh 456
13901000123 zs bj 1024
13207551234 ls sz 758

Partitioner类

/**
 * Routes records by address: "bj" to partition 0, "sh" to partition 1 and
 * everything else (including a missing address) to partition 2.
 *
 * <p>The result is folded into the configured number of partitions, so the
 * partitioner never returns an out-of-range index when the job runs with fewer
 * than three reduce tasks. With three or more the mapping above is unchanged.
 */
public class AddPartitioner extends Partitioner<Text, PartFlowBean> {
    @Override
    public int getPartition(Text text, PartFlowBean flowBean, int numPartitioner) {
        String addr = flowBean.getAddr();
        int partition;
        // Literal-first equals keeps a null address from throwing.
        if ("bj".equals(addr)) {
            partition = 0; // part-r-00000
        } else if ("sh".equals(addr)) {
            partition = 1; // part-r-00001
        } else {
            partition = 2; // part-r-00002
        }
        return partition % numPartitioner;
    }
}

编写MR

/**
 * Parses a line {@code "<phone> <name> <addr> <flow>"} into a
 * {@link PartFlowBean} and emits it keyed by the name.
 */
public class PartFlowMapper extends Mapper<LongWritable, Text, Text, PartFlowBean> {
    @Override
    public void map(LongWritable key, Text value, Context context) throws
            IOException, InterruptedException {
        // Example: "13901000123 zk bj 343" gives
        //   fields[0] = phone = 13901000123
        //   fields[1] = name  = zk
        //   fields[2] = addr  = bj
        //   fields[3] = flow  = 343
        final String[] fields = value.toString ().split (" ");

        PartFlowBean bean = new PartFlowBean ();
        bean.setPhone (fields[0]);
        bean.setName (fields[1]);
        bean.setAddr (fields[2]);
        bean.setFlow (Integer.parseInt (fields[3]));

        Text nameKey = new Text (bean.getName ());
        context.write (nameKey, bean);
    }
}

/**
 * Merges all records of one name into a single {@link PartFlowBean}: phone,
 * name and address are taken from the records, the flow values are added up.
 */
public class PartFlowReducer extends Reducer<Text, PartFlowBean, PartFlowBean,
        NullWritable> {
    @Override
    public void reduce(Text key, Iterable<PartFlowBean> values, Context
            context) throws IOException, InterruptedException {
        PartFlowBean result = new PartFlowBean ();
        for (PartFlowBean value : values) {
            // Identity fields are overwritten on every record, so the last one seen wins.
            result.setPhone (value.getPhone ());
            result.setName (value.getName ());
            result.setAddr (value.getAddr ());
            // Accumulate the flow on the result bean.
            result.setFlow (result.getFlow () + value.getFlow ());
        }
        context.write (result, NullWritable.get ());
    }
}

/**
 * Job entry point for the partitioned flow example: wires
 * {@link PartFlowMapper}, {@link PartFlowReducer} and {@link AddPartitioner}
 * together and runs the job.
 */
public class PartFlowDriver {

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args unused; input and output paths are hard-coded
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(PartFlowDriver.class);
        job.setMapperClass(PartFlowMapper.class);
        job.setReducerClass(PartFlowReducer.class);

        // The next two settings are both required: without them the custom
        // partitioning does not take effect.
        job.setPartitionerClass(AddPartitioner.class);
        // One reduce task per partition index returned by AddPartitioner.
        job.setNumReduceTasks(3);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PartFlowBean.class);
        job.setOutputKeyClass(PartFlowBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("hdfs://anshun115:9000/partition"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://anshun115:9000/result/partition"));

        // Propagate the job outcome so scripts calling this driver can detect failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

运行结果：

part-r-00000

FlowBean{phone='13901000123', name='zs', addr='bj', flow=1367}

part-r-00001

FlowBean{phone='13202111011', name='ww', addr='sh', flow=456}

part-r-00002

FlowBean{phone='13207551234', name='ls', addr='sz', flow=758}

10.分区并全排序

82 239 231
23 22 213
123 232 124
213 3434 232
4546 565 123
231 231

Partitioner类

/**
 * @Author zhangyong
 * @Date 2020/4/14 9:39
 * @Version 1.0
 * Total-order partitioning.
 * Splits the numbers of the input above into three files by magnitude:
 * 0-99 go to file 1,
 * 100-999 go to file 2,
 * everything else (1000 and above, and negative numbers) goes to file 3.
 */
public class AutoPartitioner extends Partitioner<IntWritable, IntWritable> {

    /**
     * Chooses the partition from the numeric value of the key.
     *
     * @param key           the number being routed
     * @param value         map output value (not used for routing)
     * @param numPartitions number of reduce tasks configured for the job (not consulted)
     * @return 0 for 0-99, 1 for 100-999, 2 otherwise
     */
    @Override
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
        int number = key.get();
        // Plain range checks give the same answer as matching the decimal
        // string against one-, two- and three-digit patterns, without
        // compiling regular expressions for every record.
        if (number >= 0 && number <= 99) {
            return 0;
        }
        if (number >= 100 && number <= 999) {
            return 1;
        }
        return 2;
    }
}

11.推荐认识好友

测试数据：

tom rose
tom jim
tom smith
tom lucy
rose tom
rose lucy
rose smith
jim tom

第一个mapper类

/**
 * First-pass mapper of the friend recommendation example.
 *
 * <p>The input key/value types are dictated by the text input; the output is
 * the (name, friend) relationship read from each line.
 */
public class OneFriendMapper extends Mapper<LongWritable, Text, Text, Text> {

    /**
     * Emits (name, friend) for a line of the form {@code name friend}.
     *
     * @param key     byte offset of the line in the input split (unused)
     * @param value   the raw input line
     * @param context sink for the (name, friend) pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split once and reuse both tokens.
        String[] tokens = value.toString().split(" ");
        if (tokens.length < 2) {
            // Blank or incomplete line: nothing to emit, and indexing it
            // would throw ArrayIndexOutOfBoundsException.
            return;
        }
        String name = tokens[0];
        String friend = tokens[1];
        context.write(new Text(name), new Text(friend));
    }
}

第一个reducer类

/**
 * First-pass reducer of the friend recommendation example.
 *
 * <p>Its input types mirror the mapper output. Each output record is a
 * "a-b" pair (smaller name first) tagged with 1 when the two are direct
 * friends, or 2 when they merely share a common friend.
 */
public class OneFriendReducer extends Reducer<Text, Text, Text, IntWritable> {

    /**
     * @param key     a person's name
     * @param values  every friend listed for that person
     * @param context sink for the tagged pairs
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        ArrayList<String> acquaintances = new ArrayList<>();

        // Direct friendships: one record per listed friend, names in sorted order.
        for (Text value : values) {
            String friend = value.toString();
            acquaintances.add(friend);
            String owner = key.toString();
            String pair = owner.compareTo(friend) < 0
                    ? owner + "-" + friend
                    : friend + "-" + owner;
            context.write(new Text(pair), new IntWritable(1));
        }

        // Possible acquaintances: any two of this person's friends, emitted
        // only in the ordering where the first name sorts before the second.
        for (String first : acquaintances) {
            for (String second : acquaintances) {
                if (first.compareTo(second) >= 0) {
                    continue;
                }
                context.write(new Text(first + "-" + second), new IntWritable(2));
            }
        }
    }
}

第二个mapper类

/**
 * Second-pass mapper: re-reads the tab-separated "pair<TAB>depth" records and
 * emits them as (pair, depth).
 */
public class TwoFriendMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * @param key     byte offset of the line in the input split (unused)
     * @param value   one line holding the pair and its relationship depth
     * @param context sink for the (pair, depth) record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] columns = value.toString().split("\t");
        // First column: the "a-b" relationship.
        Text pair = new Text(columns[0]);
        // Second column: the relationship depth.
        IntWritable depth = new IntWritable(Integer.parseInt(columns[1]));
        context.write(pair, depth);
    }
}

第二个reducer类

/**
 * Second-pass reducer: outputs only the pairs that are never direct friends,
 * i.e. the "people you may know" suggestions.
 */
public class TwoFriendReducer extends Reducer<Text, IntWritable, Text, NullWritable> {

    /**
     * Writes the pair when none of its depth values equals 1.
     *
     * @param key     an "a-b" pair
     * @param values  every depth recorded for that pair (1 = direct friends)
     * @param context sink for the suggested pairs
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Stays true only while no direct-friend marker (1) has been seen.
        boolean suggest = true;
        for (IntWritable value : values) {
            if (value.get() == 1) {
                suggest = false;
                break; // one direct link is enough to rule the pair out
            }
        }
        if (suggest) {
            context.write(key, NullWritable.get());
        }
    }
}

Driver类

public class FriendDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    //设置第一轮MapReduce的相应处理类与输入输出
    Job job1 = Job.getInstance(conf);

    job1.setJarByClass(FriendDriver.class);

    job1.setMapperClass(OneFriendMapper.class);
    job1.setReducerClass(OneFriendReducer.class);

    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);

    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(IntWritable.class);

    //设置路径（传输、结果）
    FileInputFormat.setInputPaths(job1, new Path("hdfs://anshun115:9000/friend"));
    FileOutputFormat.setOutputPath(job1, new Path("hdfs://anshun115:9000/result/friend"));

    //如果第一轮MapReduce完成再做这里的代码
    if (job1.waitForCompletion(true)) {
        Job job2 = Job.getInstance(conf);
        // 设置第二个Job任务的Mapper
        job2.setMapperClass(TwoFriendMapper.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(IntWritable.class);
        // 设置第二个Job任务的Reducer
        job2.setReducerClass(TwoFriendReducer.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(NullWritable.class);
        /**
         * 设置第二个Job任务是输入输出路径。
         * 此处的输入路径是第一个job任务的输出路径
         * 注意设置路径时，里面传入的job应该是当前的job任务，如下所示，应该是job2。
         * 如果写成前面的job任务名称，在运行时则会爆出错误，提示路径不存在。
         */
        FileInputFormat.setInputPaths(job2, new Path("hdfs://anshun115:9000/result/friend"));
        FileOutputFormat.setOutputPath(job2, new Path("hdfs://anshun115:9000/result/friend2"));
        // 此处提交任务时，注意用的是job2。
        job2.waitForCompletion(true);
	}
}

12.自定义文件名并多路输出

public class two {
    public static class TWOMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, Text>.Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(",");
            // 课程
            String clazz = split[0];
            // 姓名
            String name = split[1];
            // 总分
            double zf = 0;
            for (int i = 2; i < split.length; i++) {
                // 分数
                double fen = Double.parseDouble(split[i]);
                zf += fen;
            }
            //平均分
            double v = zf / (split.length - 2);
            BigDecimal bd = new BigDecimal(v);
            BigDecimal bd1 = bd.setScale(2, BigDecimal.ROUND_HALF_UP);
            context.write(new LongWritable(bd1.longValue()),new Text(clazz+","+name));
        }
    }
    public static class TWOReducer extends Reducer<LongWritable,Text,Text, NullWritable> {
        private MultipleOutputs<Text,NullWritable> mos;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            //在setup运行时，重新初始化这个类
            mos = new MultipleOutputs<Text,NullWritable>(context);
        }
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            mos.close();
        }
        @Override
        protected void reduce(LongWritable key, Iterable<Text> values, Reducer<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
            for (Text value: values) {
                // 课程
                String clazz = value.toString().split(",")[0];
                // 姓名
                String name = value.toString().split(",")[1];
                if (clazz.equals("computer"))
                mos.write("computer",clazz+","+name+","+key,NullWritable.get());
                if (clazz.equals("english"))
                    mos.write("english",clazz+","+name+","+key,NullWritable.get());
                if (clazz.equals("algorithm"))
                    mos.write("algorithm",clazz+","+name+","+key,NullWritable.get());
                if (clazz.equals("math"))
                    mos.write("math",clazz+","+name+","+key,NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        FileInputFormat.addInputPath(job,new Path("hdfs://master:9000/data/stu_score_sub.csv"));
        Path out = new Path("hdfs://master:9000/output2");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out)){
            fs.delete(out,true);
        }
        //配置输入输出的路径
        FileOutputFormat.setOutputPath(job,out);
        MultipleOutputs.addNamedOutput(job, "computer", TextOutputFormat.class, Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, "english", TextOutputFormat.class, Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, "algorithm", TextOutputFormat.class, Text.class, NullWritable.class);
        MultipleOutputs.addNamedOutput(job, "math", TextOutputFormat.class, Text.class, NullWritable.class);
        job.waitForCompletion(true);
    }
}

你可能感兴趣的:(大数据,大数据,mr,hadoop)

如何解决 NPM proxy，当我们在终端nodejs应用程序时出现代理相关报错
Thisisaproblemrelatedtonetworkconnectivity.npmERR!networkInmostcasesyouarebehindaproxyorhavebadnetworksettings.在使用npminstall下载包的时候总是报以下错误:在控制台或VisualStudioCode终端中运行以下命令：npmconfigrmproxynpmconfigrmhttp
数字孪生技术为UI前端注入新活力：实现产品设计的沉浸式体验 ui设计前端开发老司机 ui
hello宝子们...我们是艾斯视觉擅长ui设计、前端开发、数字孪生、大数据、三维建模、三维动画10年+经验!希望我的分享能帮助到您!如需帮助可以评论关注私信我们一起探讨!致敬感谢感恩!一、引言：从“平面交互”到“沉浸体验”的UI革命当用户在电商APP中翻看3D家具模型却无法感知其与自家客厅的匹配度，当设计师在2D屏幕上绘制汽车内饰却难以预判实际乘坐体验——传统UI设计的“平面化、静态化、割裂感”
提升企业级数据处理效率！TDengine 四个集群优化点详解 TDengine （老段） TDengine 运维大数据数据库物联网时序数据库服务器运维 tdengine
为了帮助企业更好地进行大数据处理，我们在此前TDengine3.x系列版本中进行了几项与集群相关的优化和新功能开发，以提升集群的稳定性和在异常情况下的恢复能力。这些优化包括clusterID隔离、leaderrebalance、raftlearner和restorednode。本文将对这几项重要优化进行详细阐述，以解答企业在此领域的疑问，并帮助大家更好地应对相关挑战。clusterID隔离问题fi
redis管道 -redis pipeline -redis pipelining shuair redis redis bootstrap 数据库
redis管道文档redis单机安装redis常用的五种数据类型redis数据类型-位图bitmapredis数据类型-基数统计HyperLogLogredis数据类型-地理空间GEOredis数据类型-流Streamredis数据类型-位域bitfieldredis持久化-RDBredis持久化-AOFredis持久化-RDB+AOF混合模式redis事务官方文档官网操作命令指南页面：https
three前置课程知识
学习中文网(1.threejs文件包下载和目录简介|Three.js中文网)threejs官方文件包所有版本：https://github.com/mrdoob/three.js/releases更新迭代较快，要选择对应版本使用---下载zip压缩包Threejs官网中文文档链接：https://threejs.org/docs/index.html#manual/zh/重要的内容docs包:文档
中国银联豪掷1亿采购海光C86架构服务器信创新态势海光芯片 C86 国产芯片海光信息
近日，中国银联国产服务器采购大单正式敲定，基于海光C86架构的服务器产品中标，项目金额超过1亿元。接下来，C86服务器将用于支撑中国银联的虚拟化、大数据、人工智能、研发测试等技术场景，进一步提升其业务处理能力、用户服务效率和信息安全水平。作为我国重要的银行卡组织和金融基础设施，中国银联在全球183个国家和地区设有银联受理网络，境内外成员机构超过2600家，是世界三大银行卡品牌之一。此次中国银联发力
全面探索Kafka：架构、应用与流处理
Kafka：企业级消息系统与流处理平台的深度解析ApacheKafka作为分布式流处理平台，广泛应用于大数据处理和实时分析领域。本文将基于其官方文档，详细探讨Kafka的核心功能、应用场景以及如何进行有效管理。背景简介Kafka作为高吞吐量的消息系统，支持企业级的发布-订阅模式。它能够处理大量实时数据，并支持高并发读写操作。本文将依据Kafka官方文档的内容，逐层深入，从入门到高级应用，帮助读者全
Flink时间窗口详解 bxlj_jcj Flink flink 大数据
一、引言在大数据流处理的领域中，Flink的时间窗口是一项极为关键的技术，想象一下，你要统计一个电商网站每小时的订单数量。由于订单数据是持续不断产生的，这就形成了一个无界数据流。如果没有时间窗口的概念，你就需要处理无穷无尽的数据，难以进行有效的统计分析。而时间窗口的作用，就是将这无界的数据流按照时间维度切割成一个个有限的“数据块”，方便我们对这些数据进行处理和分析。比如，我们可以定义一个1小时的时
探索实时流处理的未来：Kafka Streams 深度指南秋或依
探索实时流处理的未来：KafkaStreams深度指南项目介绍欢迎进入KafkaStreams：实时流处理的世界！这不仅仅是一本书，更是一个通往流处理领域深层奥秘的门户。由PrashantPandey编著，这本书以ApacheKafka2.1中的KafkaStreams库为核心，为读者铺就了一条从理解基础概念到熟练掌握KafkaStreams编程的路径。无论是软件工程师、数据架构师，还是对大数据处
Elasticsearch搜索引擎存储：从原理到实践的全景解析 Python×CATIA工业智造搜索引擎 elasticsearch 大数据
引言在大数据时代，数据规模呈指数级增长，传统数据库的模糊查询、实时分析能力逐渐成为瓶颈。Elasticsearch（简称ES）凭借其分布式架构、实时搜索和灵活的数据分析能力，成为企业级搜索与存储的核心引擎。截至2025年，ES在全球日志分析、电商搜索、实时监控等场景的市场占有率超过60%。本文将从存储架构、核心技术、应用场景及优化策略四个维度，深入解析Elasticsearch的设计哲学与实践价值
【Kafka专栏 13】Kafka的消息确认机制：不是所有的“收到”都叫“确认”！
作者名称：夏之以寒作者简介：专注于Java和大数据领域，致力于探索技术的边界，分享前沿的实践和洞见文章专栏：夏之以寒-kafka专栏专栏介绍：本专栏旨在以浅显易懂的方式介绍Kafka的基本概念、核心组件和使用场景，一步步构建起消息队列和流处理的知识体系，无论是对分布式系统感兴趣，还是准备在大数据领域迈出第一步，本专栏都提供所需的一切资源、指导，以及相关面试题，立刻免费订阅，开启Kafka学习之旅！
Hive简介
文章目录Hive简介Hive特点Hive和RDBMS的对比Hive的架构Hive的数据组织Hive数据类型Hive简介1、Hive由Facebook实现并开源2、是基于Hadoop的一个数据仓库工具3、可以将结构化的数据映射为一张数据库表4、并提供HQL(HiveSQL)查询功能5、底层数据是存储在HDFS上6、Hive的本质是将SQL语句转换为MapReduce任务运行7、使不熟悉MapRedu
C语言学生成绩管理系统<；自创>；(功能7有小错误,但可运行） han_xue_feng java
腾讯云加速企业和个人开发创新公开直播预告直播预告：07/18(周四)15:00-16:00随着人工智能与大模型的蓬勃发展，我们正步入一个由技微信实习第一天周五入职，早上早早来到了公司，发现好多人都没上班，到十点才陆陆续续有人来，办理完入职后，mentor中联夏令营遗憾没有入选不过hr的回复真的很好，辛苦啦#提前批简历挂麻了怎么办##机械制造投递记录#大数据开发的工作有点过于简单了吧sq大数据开发的
Python爬虫：从图片或扫描文档中提取文字数据的完整指南 Python爬虫项目 2025年爬虫实战项目 python 爬虫开发语言数据挖掘 c++
1.引言随着大数据技术的不断进步，图像数据逐渐成为了许多行业中重要的数据源之一。图像中不仅包含了丰富的视觉信息，还可能蕴含着大量的文字数据。对于科研、企业、政府等多个领域而言，如何从图片或扫描文档中提取出有价值的文字信息是一个亟待解决的问题。在这一过程中，OCR（OpticalCharacterRecognition，光学字符识别）技术成为了解决这一问题的重要工具。在本文中，我们将探讨如何使用Py
【C语言经典面试题】memcpy函数有没有更高效的拷贝实现方法？架构师李肯嵌入式物联网开发进阶 c语言面试性能优化
【C语言经典面试题】memcpy函数有没有更高效的拷贝实现方法？我相信大部分初中级C程序员在面试的过程中，可能都被问过关于memcpy函数的问题，甚至需要手撕memcpy。本文从另一个角度带你领悟一下memcpy的面试题，你可以看看是否能接得住？文章目录1写在前面2源码实现2.1函数申明2.2简单的功能实现2.3满足大数据量拷贝的功能实现3源码测试4小小总结5更多分享1写在前面假如你遇到下面的面试
python基于Hadoop的NBA球员大数据分析与可视化系统
目录技术栈介绍具体实现截图系统设计研究方法：设计步骤设计流程核心代码部分展示研究方法详细视频演示试验方案论文大纲源码获取/详细视频演示技术栈介绍Django-SpringBoot-php-Node.js-flask本课题的研究方法和研究步骤基本合理，难度适中，本选题是学生所学专业知识的延续，符合学生专业发展方向，对于提高学生的基本知识和技能以及钻研能力有益。该学生能够在预定时间内完成该课题的设计。
大数据技术之集群数据迁移
dfs.namenode.rpc-address.nameservice1.namenode30hadoop104:8020dfs.namenode.rpc-address.nameservice1.namenode37hadoop106:8020dfs.namenode.http-address.nameservice1.namenode30hadoop104:9870dfs.namenode.
HIVE（二） 2301_78012738 hive 数据仓库
目录访问HIVE的三种方式DDLDML数据操作向表中装载数据数据导出常用函数Like和RLike分组Join排序分区表和分桶表访问HIVE的三种方式启动Hive命令，CtrlC退出客户端，执行测试语句，与sql一致[wyc@hadoop102hive]$bin/hive经验小结：在hive中执行语句报错：ExecutionError,returncode2fromorg.apache.hadoop
Oracle EMCC 13.5 集群安装部署指南 Lucifer三思而后行 DBA 实战系列 oracle 数据库
大家好，这里是DBA学习之路，专注于提升数据库运维效率。目录前言第一阶段：OMR集群部署1.1OracleRAC环境准备1.2数据库版本验证1.3EMCC专用数据库优化第二阶段：ACFS集群文件系统构建2.1存储层配置配置multipath多路径配置UDEV设备绑定2.2ACFS文件系统创建使用ASMCA创建磁盘组创建ACFSVolume挂载点准备和文件系统创建第三阶段：OMS集群部署3.1环境准
OpenCV图片操作100例：从入门到精通指南（1）总有刁民想爱朕ha opencv 计算机视觉人工智能
OpenCV图片操作100例：从入门到精通指南本文整理了100个OpenCV实用技巧，涵盖图像处理各个领域，助你轻松掌握计算机视觉核心技能！一、入门必备：基础操作1.图像读写与显示importcv2#读取图像（BGR格式）img=cv2.imread('image.jpg')#显示图像cv2.imshow('示例图片',img)cv2.waitKey(0)#按任意键退出cv2.destroyAll
如何通过YashanDB优化企业大数据处理流程数据库
在当今数据驱动的商业环境中，企业面临着巨大的数据处理挑战。性能瓶颈、数据一致性问题和可扩展性需求使得大数据处理成为一项复杂任务。作为一种新兴的数据库管理系统，YashanDB以其独特的架构设计和强大的数据处理能力，在解决这些挑战方面提供了有效的手段。本文旨在探讨如何利用YashanDB优化大数据处理流程，为企业提供高效、可靠的解决方案。YashanDB的体系架构与部署形态YashanDB支持多种部
AI在垂直领域的深度应用：医疗、金融与自动驾驶的革新之路
AI在垂直领域的深度应用：医疗、金融与自动驾驶的革新之路一、医疗领域：AI驱动的精准诊疗与效率提升1.医学影像诊断AI算法通过深度学习技术，已实现对X光、CT、MRI等影像的快速分析，辅助医生检测癌症、骨折等疾病。例如，GoogleDeepMind的AI系统在乳腺癌筛查中，误检率比人类专家低9.4%；中国的推想医疗AI系统可在20秒内完成肺部CT扫描分析，为急诊救治争取黄金时间。2.药物研发传统药
Pandas 学习教程 _pass_ Data-Alaysis pandas 信息可视化
目录定义基本操作一维数组操作二维数组操作数据选择过滤数据处理数据清洗数据转换数据分析排序分组聚合数据透视表高级操作合并数据时间序列处理自定义函数调用数据可视化集成数据导出和导入大数据分块处理定义全称：'paneldata'and'pythondataanalysis'Analy:Series(一维数据)、DataFrame(二维数据)主要应用：数据清洗：处理缺失数据、重复数据等数据转换：改变数据的
如何通过YashanDB提升客户体验数据库
如何优化查询速度？这是许多企业在使用数据库技术时常常会遇到的问题。查询速度的快慢直接影响到用户的体验，尤其是在大数据量和高并发的使用场景中。顾客期望迅速获取信息，若响应时间过长，可能导致客户流失。因此，优化数据库的性能成为提升客户体验的关键举措之一。YashanDB作为一种高性能的数据库技术架构，提供了多种优化机制，以提升系统的查询速度和整体处理能力。多种部署架构YashanDB支持多种部署架构，
如何通过YashanDB数据库实现企业级数据分区管理？数据库
在当今大数据时代，企业面临着海量数据的管理和优化访问的问题。如何有效地组织和划分庞大的数据集，以提升查询性能和运维效率，成为数据库系统设计的核心挑战。数据分区技术作为解决大规模数据处理的关键手段，能够显著减少无关数据的访问，优化资源利用率。本文聚焦于YashanDB数据库，详细解析其数据分区管理的实现机制及应用，为企业级应用提供高效、灵活的数据分区解决方案。YashanDB中的数据分区基础Yash
centos7下安装 mysql5.7 ammengke mysql 数据库服务器
在CentOS7中默认安装有MariaDB，这个是MySQL的分支，但为了需要，还是要在系统中安装MySQL，而且安装完成之后可以直接覆盖掉MariaDB。1.下载并安装MySQL官方的YumRepository1[root@BrianZhu/]#wget-i-chttp://dev.mysql.com/get/mysql57-community-release-el7-10.noarch.rpm
国产开源高性能对象存储RustFS保姆级上手指南光爷不秃对象存储 rust 国产开源软件 rust 云计算开源软件 github 开源数据仓库 database
在云计算与大数据爆发的时代，企业和开发者对存储方案的要求愈发严苛——不仅要能扛住海量数据的读写压力，还得兼顾安全性、可扩展性和兼容性。今天给大家介绍一款基于Rust语言开发的开源分布式对象存储系统——RustFS，它不仅是MinIO的国产化优秀替代方案，更是AI、大数据和云原生场景的理想之选。本文将从基础介绍到实战操作，带大家快速上手这款"优雅的存储解决方案"。一、RustFS核心特性解析Rust
【TCP/IP】15. 超文本传输协议
15.超文本传输协议15.超文本传输协议15.1统一资源定位符（URL）15.2超文本传输协议（HTTP）15.3HTTP信息的一般格式15.4HTTP请求报文15.5HTTP响应报文本章要点15.超文本传输协议15.1统一资源定位符（URL）URL（UniformResourceLocator）是标识Web资源位置的统一格式，俗称“网址”，用于指定访问资源的方式和位置。URL的完整格式格式：协议
通过YashanDB提升大数据处理能力的指南数据库
数据的急剧增长给数据库技术领域带来了诸多挑战，包括性能瓶颈、数据一致性问题及处理效率低下等。为了应对这些挑战，企业需采取有效的技术手段来提升大数据处理能力。YashanDB作为一款高性能的数据库产品，通过其先进的体系架构、优化的数据存储形式以及强大的并发控制能力，有效地提升了大数据环境下的处理性能。本文旨在为技术人员和决策者提供深入的技术分析和可操作的建议，通过YashanDB的功能特性来实现大数
Java多线程实战指南：从基础到高并发的核心技术解析添砖Java中 java python 开发语言 spring boot spring cloud spring
一、为什么必须掌握多线程？在单核CPU时代，多线程主要用于提高程序响应速度；在如今的多核处理器时代，多线程已成为榨干硬件性能的必备技能。无论是高并发Web服务器、实时数据处理系统，还是游戏引擎，都离不开多线程技术的支撑。典型案例：电商秒杀系统：1秒内处理10万+请求大数据处理：并行计算TB级数据金融交易系统：毫秒级订单撮合二、线程创建的四大核心方式1.继承Thread类（不推荐）classMyTh
C/C++Win32编程基础详解视频下载择善Zach 编程 C++Win32
课题视频：C/C++Win32编程基础详解视频知识：win32窗口的创建 windows事件机制主讲：择善Uncle老师学习交流群：386620625 验证码：625 --
Guava Cache使用笔记 bylijinnan java guava cache
1.Guava Cache的get/getIfPresent方法当参数为null时会抛空指针异常我刚开始使用时还以为Guava Cache跟HashMap一样，get(null)返回null。实际上Guava整体设计思想就是拒绝null的，很多地方都会执行com.google.common.base.Preconditions.checkNotNull的检查。 2.Guava
解决ora-01652无法通过128（在temp表空间中） 0624chenhong oracle
解决ora-01652无法通过128（在temp表空间中）扩展temp段的过程一个sql语句后，大约花了10分钟，好不容易有一个结果，但是报了一个ora-01652错误，查阅了oracle的错误代码说明：意思是指temp表空间无法自动扩展temp段。这种问题一般有两种原因：一是临时表空间空间太小，二是不能自动扩展。分析过程：既然是temp表空间有问题，那当
Struct在jsp标签不懂事的小屁孩 struct
非UI标签介绍：控制类标签： 1：程序流程控制标签 if elseif else <s:if test="isUsed"> <span class="label label-success">True</span> </
按对象属性排序换个号韩国红果果 JavaScript 对象排序
利用JavaScript进行对象排序，根据用户的年龄排序展示 <script> var bob={ name;bob, age:30 } var peter={ name;peter, age:30 } var amy={ name;amy, age:24 } var mike={ name;mike, age:29 } var john={
大数据分析让个性化的客户体验不再遥远蓝儿唯美数据分析
顾客通过多种渠道制造大量数据，企业则热衷于利用这些信息来实现更为个性化的体验。分析公司Gartner表示，高级分析会成为客户服务的关键，但是大数据分析的采用目前仅局限于不到一成的企业。挑战在于企业还在努力适应结构化数据，疲于根据自身的客户关系管理（CRM）系统部署有效的分析框架，以及集成不同的内外部信息源。然而，面对顾客通过数字技术参与而产生的快速变化的信息，企业需要及时作出反应。要想实
java笔记4 a-john java
操作符 1，使用java操作符操作符接受一个或多个参数，并生成一个新值。参数的形式与普通的方法调用不用，但是效果是相同的。加号和一元的正号（+）、减号和一元的负号（-）、乘号（*）、除号（/）以及赋值号（=）的用法与其他编程语言类似。操作符作用于操作数，生成一个新值。另外，有些操作符可能会改变操作数自身的
从裸机编程到嵌入式Linux编程思想的转变------分而治之：驱动和应用程序 aijuans 嵌入式学习
笔者学习嵌入式Linux也有一段时间了，很奇怪的是很多书讲驱动编程方面的知识，也有很多书将ARM9方面的知识，但是从以前51形式的（对寄存器直接操作，初始化芯片的功能模块）编程方法，和思维模式，变换为基于Linux操作系统编程，讲这个思想转变的书几乎没有，让初学者走了很多弯路，撞了很多难墙。笔者因此写上自己的学习心得，希望能给和我一样转变
在springmvc中解决FastJson循环引用的问题 asialee 循环引用 fastjson
我们先来看一个例子： package com.elong.bms; import java.io.OutputStream; import java.util.HashMap; import java.util.Map; import co
ArrayAdapter和SimpleAdapter技术总结百合不是茶 android SimpleAdapter ArrayAdapter 高级组件基础
ArrayAdapter比较简单，但它只能用于显示文字。而SimpleAdapter则有很强的扩展性，可以自定义出各种效果 ArrayAdapter;的数据可以是数组或者是队列 // 获得下拉框对象 AutoCompleteTextView textview = (AutoCompleteTextView) this
九封信 bijian1013 人生励志
有时候，莫名的心情不好，不想和任何人说话，只想一个人静静的发呆。有时候，想一个人躲起来脆弱，不愿别人看到自己的伤口。有时候，走过熟悉的街角，看到熟悉的背影，突然想起一个人的脸。有时候，发现自己一夜之间就长大了。 2014，写给人
Linux下安装MySQL Web 管理工具phpMyAdmin sunjing PHP Install phpMyAdmin
PHP http://php.net/ phpMyAdmin http://www.phpmyadmin.net Error compiling PHP on CentOS x64 一、安装Apache 请参阅http://billben.iteye.com/admin/blogs/1985244 二、安装依赖包 sudo yum install gd
分布式系统理论 bit1129 分布式
FLP One famous theory in distributed computing, known as FLP after the authors Fischer, Lynch, and Patterson, proved that in a distributed system with asynchronous communication and process crashes,
ssh2整合(spring+struts2+hibernate)-附源码白糖_ eclipse spring Hibernate mysql 项目管理
最近抽空又整理了一套ssh2框架，主要使用的技术如下： spring做容器，管理了三层(dao,service,actioin)的对象 struts2实现与页面交互(MVC)，自己做了一个异常拦截器，能拦截Action层抛出的异常 hibernate与数据库交互 BoneCp数据库连接池，据说比其它数据库连接池快20倍，仅仅是据说 MySql数据库项目用eclipse
treetable bug记录 braveCS table
// 插入子节点删除再插入时不能正常显示。修改： //不知改后有没有错，先做个备忘 Tree.prototype.removeNode = function(node) { // Recursively remove all descendants of +node+ this.unloadBranch(node); // Remove
编程之美-电话号码对应英语单词 bylijinnan java 算法编程之美
import java.util.Arrays; public class NumberToWord { /** * 编程之美电话号码对应英语单词 * 题目： * 手机上的拨号盘，每个数字都对应一些字母，比如2对应ABC，3对应DEF.........，8对应TUV，9对应WXYZ， * 要求对一段数字，输出其代表的所有可能的字母组合
jquery ajax读书笔记 chengxuyuancsdn jQuery ajax
1、jsp页面 <%@ page language="java" import="java.util.*" pageEncoding="GBK"%> <% String path = request.getContextPath(); String basePath = request.getScheme()
JWFD工作流拓扑结构解析伪码描述算法 comsci 数据结构算法工作活动 J#
对工作流拓扑结构解析感兴趣的朋友可以下载附件，或者下载JWFD的全部代码进行分析 /* 流程图拓扑结构解析伪码描述算法 public java.util.ArrayList DFS(String graphid, String stepid, int j)
oracle I/O 从属进程 daizj oracle
I/O 从属进程　　I/O从属进程用于为不支持异步I/O的系统或设备模拟异步I/O.例如，磁带设备(相当慢)就不支持异步I/O.通过使用I/O 从属进程，可以让磁带机模仿通常只为磁盘驱动器提供的功能。就好像支持真正的异步I/O 一样，写设备的进程(调用者)会收集大量数据，并交由写入器写出。数据成功地写出时，写入器(此时写入器是I/O 从属进程，而不是操作系统)会通知原来的调用者，调用者则会
高级排序:希尔排序 dieslrae 希尔排序
public void shellSort(int[] array){ int limit = 1; int temp; int index; while(limit <= array.length/3){ limit = limit * 3 + 1;
初二下学期难记忆单词 dcj3sjt126com english word
kitchen 厨房 cupboard 厨柜 salt 盐 sugar 糖 oil 油 fork 叉；餐叉 spoon 匙；调羹 chopsticks 筷子 cabbage 卷心菜；洋白菜 soup 汤 Italian 意大利的 Indian 印度的 workplace 工作场所 even 甚至；更 Italy 意大利 laugh 笑 m
Go语言使用MySQL数据库进行增删改查 dcj3sjt126com mysql
目前Internet上流行的网站构架方式是LAMP，其中的M即MySQL, 作为数据库，MySQL以免费、开源、使用方便为优势成为了很多Web开发的后端数据库存储引擎。MySQL驱动Go中支持MySQL的驱动目前比较多，有如下几种，有些是支持database/sql标准，而有些是采用了自己的实现接口,常用的有如下几种: http://code.google.c...o-mysql-dri
git命令 shuizhaosi888 git
---------------设置全局用户名： git config --global user.name "HanShuliang" //设置用户名 git config --global user.email "[email protected]" //设置邮箱 ---------------查看环境配置 git config --li
qemu-kvm 网络 nat模式 (四) haoningabc kvm qemu
qemu-ifup-NAT #!/bin/bash BRIDGE=virbr0 NETWORK=192.168.122.0 GATEWAY=192.168.122.1 NETMASK=255.255.255.0 DHCPRANGE=192.168.122.2,192.168.122.254 TFTPROOT= BOOTP= function check_bridge()
不要让未来的你，讨厌现在的自己 jingjing0907 生活奋斗工作梦想
故事one 　23岁，他大学毕业，放弃了父母安排的稳定工作，独闯京城，在家小公司混个小职位，工作还算顺手，月薪三千，混了混，混走了一年的光阴。　　　　24岁，有了女朋友，从二环12人的集体宿舍搬到香山民居，一间平房，二人世界，爱爱爱。偶然约三朋四友，打扑克搓麻将，日子快乐似神仙；　　　　25岁，出了几次差，调了两次岗，薪水涨了不过百，生猛狂飙的物价让现实血淋淋，无力为心爱银儿购件大牌
枚举类型详解一路欢笑一路走 enum 枚举详解 enumset enumMap
枚举类型详解一.Enum详解 1.1枚举类型的介绍 JDK1.5加入了一个全新的类型的”类”—枚举类型，为此JDK1.5引入了一个新的关键字enum,我们可以这样定义一个枚举类型。 Demo:一个最简单的枚举类 public enum ColorType { RED
第11章动画效果（上） onestopweb 动画
index.html <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/
Eclipse中jsp、js文件编辑时，卡死现象解决汇总 ljf_home eclipse jsp卡死 js卡死
使用Eclipse编辑jsp、js文件时，经常出现卡死现象，在网上百度了N次，经过N次优化调整后，卡死现象逐步好转，具体那个方法起到作用，不太好讲。将所有用过的方法罗列如下： 1、取消验证 windows–>perferences–>validation 把除了manual 下面的全部点掉，build下只留 classpath dependency Valida
MySQL编程中的6个重要的实用技巧 tomcat_oracle mysql
每一行命令都是用分号(;)作为结束对于MySQL，第一件你必须牢记的是它的每一行命令都是用分号(;)作为结束的，但当一行MySQL被插入在PHP代码中时，最好把后面的分号省略掉，例如： mysql_query("INSERT INTO tablename(first_name,last_name)VALUES('$first_name',$last_name')");
zoj 3820 Building Fire Stations(二分+bfs) 阿尔萨斯 Build
题目链接：zoj 3820 Building Fire Stations 题目大意：给定一棵树，选取两个建立加油站，问说所有点距离加油站距离的最大值的最小值是多少，并且任意输出一种建立加油站的方式。解题思路：二分距离判断，判断函数的复杂度是o(n)，这样的复杂度应该是o(nlogn)，即使常数系数偏大，但是居然跑了4.5s，也是醉了。判断函数里面做了3次bfs，但是每次bfs节点最多

大数据竞赛MR培训与题型

MapReduce编程模板

1.自定义Mapper类继承Mapper类并重写map方法：

2.自定义Reducer类继承Reducer，并重写reduce方法

3.Driver 主入口，整合mapper和reducer

MapReduce各个情景实战

1.多个输入

2.排序

3.多层MR处理 多 Job 串联

4.TopN算法-自定义 GroupComparator

5.全局计数器

6.MapJoin

7.最简单的wordcount

8.求温度平均值

9.分区多路输出

10.分区并全排序

11.推荐认识好友

12.自定义文件名并多路输出

你可能感兴趣的:(大数据,大数据,mr,hadoop)

3.多层MR处理多 Job 串联