Implementing Basic User Data Statistics with MapReduce


Table of Contents

    • Implementing Basic User Data Statistics with MapReduce
      • Base Data
    • Business Analysis Points
      • Business Development Workflow
    • Creating the Hive Tables
    • Developing the MR Jobs in Java
      • Configuring pom.xml
      • Developing com.lh.banksys.mr.AgeStatJob
      • Developing com.lh.banksys.mr.EducationStatJob
      • Uploading the Packaged JAR to node1 and Running It
      • Loading the Data into Hive


Implementing Basic User Data Statistics with MapReduce

Base Data

The project's raw data schema is as follows:

create table ods_t_bank(
id INT COMMENT 'auto-increment primary key',
age INT COMMENT 'age',
job STRING COMMENT 'job type',
marital STRING COMMENT 'marital status',
education STRING COMMENT 'education level',
credit STRING COMMENT 'has credit card',
housing STRING COMMENT 'has housing loan',
loan STRING COMMENT 'has personal loan',
contact STRING COMMENT 'contact method',
month_of_year STRING COMMENT 'month',
day_of_week STRING COMMENT 'day of week',
duration INT COMMENT 'contact duration',
campaign INT COMMENT 'number of contacts during this campaign',
pdays INT COMMENT 'days since the previous contact',
previous INT COMMENT 'number of earlier contacts with this client',
poutcome STRING COMMENT 'outcome of the previous marketing campaign',
emp_var_rate DOUBLE COMMENT 'employment variation rate',
cons_price_idx DOUBLE COMMENT 'consumer price index',
cons_conf_idx DOUBLE COMMENT 'consumer confidence index',
euribor3m DOUBLE COMMENT '3-month Euribor rate',
nr_employed DOUBLE COMMENT 'number of employees',
y TINYINT COMMENT 'has a term deposit'
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';

The raw data lives in a MySQL database and can be imported into Hive with Sqoop or DataX; that step will be covered in a follow-up post.
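
For a rough idea of what the Sqoop route looks like, here is a minimal sketch. The MySQL host, database, source table name, and credentials below are placeholders for illustration, not values from this project; adjust them and the target Hive database to your environment.

# Hypothetical Sqoop import: pull the raw bank table from MySQL into the ods_t_bank Hive table.
# All connection details are placeholders.
sqoop import \
  --connect jdbc:mysql://mysql-host:3306/bank_db \
  --username bank_user \
  --password '******' \
  --table t_bank \
  --hive-import \
  --hive-table ods_t_bank \
  --fields-terminated-by '\t' \
  --num-mappers 1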


Business Analysis Points

Based on the raw data, the basic user statistics to compute are:

  • Education-level analysis
  • Age analysis (a Hive cross-check query for these buckets is sketched after this list)
    • age under 40 -> young
    • age between 40 and 60 -> middle-aged
    • age over 60 -> elderly
  • Other analysis items (extend freely based on the data)
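
As a cross-check for the MR results later in this post, the same age buckets can also be computed directly in Hive. This is only a sketch: it assumes the cleaned table dwd.dwd_t_bank (the MR input used below) carries the same age column as ods_t_bank, and the boundaries follow the Java code (<= 40 young, 41-60 middle, > 60 old).

# Hedged sanity check: count customers per age bucket straight from the dwd table.
hive -e "
SELECT CASE
         WHEN age <= 40 THEN 'young'
         WHEN age <= 60 THEN 'middle'
         ELSE 'old'
       END AS age_group,
       COUNT(*) AS num
FROM dwd.dwd_t_bank
GROUP BY CASE
           WHEN age <= 40 THEN 'young'
           WHEN age <= 60 THEN 'middle'
           ELSE 'old'
         END;
"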

Business Development Workflow

MR development flowchart:


Creating the Hive Tables

-- The field delimiter here must match what the Java MR jobs write ('\t')
drop table if exists dws.dws_t_bank_age;
CREATE TABLE dws.dws_t_bank_age
(age string,
num int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS TEXTFILE;
drop table if exists dws.dws_t_bank_edu;
create table dws.dws_t_bank_edu(
education string,
num int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n' STORED AS TEXTFILE;
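
These DDL statements can be run from Zeppelin, the Hive CLI, or Beeline. A minimal Beeline sketch, assuming the dws database already exists, HiveServer2 is reachable on node1:10000 (an assumption), and the statements above are saved to a file named create_dws_tables.sql (a hypothetical name):

# Execute the dws DDL through Beeline; adjust the JDBC URL to your cluster.
beeline -u jdbc:hive2://node1:10000 -f create_dws_tables.sql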


Developing the MR Jobs in Java

Develop the MR jobs in IDEA Community Edition.

Configuring pom.xml


<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.huike</groupId>
    <artifactId>bank_v2</artifactId>
    <version>1.2</version>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <hadoop.version>2.7.3</hadoop.version>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.4.8</spark.version>
        <java.version>1.8</java.version>
        <encoding>UTF-8</encoding>
    </properties>

    <dependencies>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-annotations</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-api</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.49</version>
        </dependency>

        <dependency>
            <groupId>io.netty</groupId>
            <artifactId>netty-all</artifactId>
            <version>4.1.50.Final</version>
        </dependency>

        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.6.7</version>
        </dependency>

        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.6.7</version>
        </dependency>

        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>portable-1.7.7</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <!-- Java compiler plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.0</version>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Scala compiler plugin -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.1</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Assembly plugin: builds a jar that bundles all dependencies -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.5.5</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
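
With the assembly plugin bound to the package phase, building the runnable fat JAR is a standard Maven build. A minimal sketch, run from the project root (a local Maven installation is assumed):

# Produces target/bank_v2-1.2.jar and target/bank_v2-1.2-jar-with-dependencies.jar;
# the latter is the one submitted to Hadoop later on.
mvn clean package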

Developing com.lh.banksys.mr.AgeStatJob

package com.lh.banksys.mr;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @Classname AgeStatJob
 * @Description Age-bucket statistical analysis
 * @Date 2021/10/14 19:14
 * @Created by Tiger_Li
 */

public class AgeStatJob {
    public static class AgeStatMap extends Mapper<Object, Text, Text, IntWritable> {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // dwd_t_bank is stored with Hive's default '\001' field delimiter
            String[] words = StringUtils.split(value.toString(), '\001');
            // the original != "" was a reference comparison; isNotBlank also covers empty/whitespace values
            if (words.length > 0 && StringUtils.isNotBlank(words[0])) {
                int age = Integer.parseInt(words[0]);
                if (age <= 40) {
                    context.write(new Text("young"), new IntWritable(1));
                } else if (age > 40 && age <= 60) {
                    context.write(new Text("middle"), new IntWritable(1));
                } else {
                    context.write(new Text("old"), new IntWritable(1));
                }
            }
        }
    }

    public static class AgeStatReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        protected void reduce(Text key, Iterable<IntWritable> values,Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for(IntWritable i : values){
                sum += i.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) {
        Configuration config = new Configuration();
        try {
            FileSystem fs = FileSystem.get(config);
            Job job = Job.getInstance(config);
            job.setJobName("AgeStatJob");

            job.setJarByClass(AgeStatJob.class);
            job.setMapperClass(AgeStatMap.class);
            job.setReducerClass(AgeStatReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            job.setNumReduceTasks(1);

            FileInputFormat.addInputPath(job, new Path("hdfs://lh1/user/hive/warehouse/dwd.db/dwd_t_bank"));
            Path outPath = new Path("hdfs://lh1/Java/projects/t_bank_age_dws_temp");
            if(fs.exists(outPath)){
                fs.delete(outPath, true);
            }
            FileOutputFormat.setOutputPath(job, outPath);
            boolean result = job.waitForCompletion(true);
            if (result) {
                System.out.println("Job completed successfully!");
            } else {
                System.out.println("Job failed!");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}


Developing com.lh.banksys.mr.EducationStatJob

package com.lh.banksys.mr;


import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


/**
 * @Classname EducationStatJob
 * @Description Education-level statistics
 * @Date 2021/10/14 19:14
 * @Created by Tiger_Li
 */
public class EducationStatJob {

    public static class EducationStatMap extends Mapper<Object, Text, Text, IntWritable> {
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // dwd_t_bank is stored with Hive's default '\001' field delimiter; index 3 is read as the education level
            String[] words = StringUtils.split(value.toString(), '\001');
            // guard against short/malformed rows before indexing into the array
            if (words.length > 3 && StringUtils.isNotBlank(words[3])) {
                context.write(new Text(words[3]), new IntWritable(1));
            }
        }
    }

    public static class EducationStatReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        protected void reduce(Text key, Iterable<IntWritable> values,Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for(IntWritable i : values){
                sum += i.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) {
        Configuration config = new Configuration();
        try {
            FileSystem fs = FileSystem.get(config);
            Job job = Job.getInstance(config);
            job.setJobName("EducationStatJob");

            job.setJarByClass(EducationStatJob.class);
            job.setMapperClass(EducationStatMap.class);
            job.setReducerClass(EducationStatReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            job.setNumReduceTasks(1);

            FileInputFormat.addInputPath(job, new Path("hdfs://lh1/user/hive/warehouse/dwd.db/dwd_t_bank"));
            Path outPath = new Path("hdfs://lh1/Java/projects/t_bank_edu_dws_temp");
            if(fs.exists(outPath)){
                fs.delete(outPath, true);
            }
            FileOutputFormat.setOutputPath(job, outPath);
            boolean result = job.waitForCompletion(true);
            if (result) {
                System.out.println("Job completed successfully!");
            } else {
                System.out.println("Job failed!");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}


Uploading the Packaged JAR to node1 and Running It

# submit the two MR jobs
/opt/soft_installed/hadoop-2.7.3/bin/hadoop jar /home/lh/bank_v2-1.2.jar com.lh.banksys.mr.AgeStatJob
/opt/soft_installed/hadoop-2.7.3/bin/hadoop jar /home/lh/bank_v2-1.2.jar com.lh.banksys.mr.EducationStatJob

# or submit the jar-with-dependencies build instead
[root@master lh]# /opt/soft_installed/hadoop-2.7.3/bin/hadoop jar /home/lh/bank_v2-1.2-jar-with-dependencies.jar com.lh.banksys.mr.AgeStatJob

# check the output directories on HDFS

[root@master ~]# hdfs dfs -ls /Java/projects/
Found 2 items
drwxr-xr-x   - root supergroup          0 2022-09-27 06:10 /Java/projects/t_bank_age_dws_temp
drwxr-xr-x   - root supergroup          0 2022-09-27 06:23 /Java/projects/t_bank_edu_dws_temp
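
Before loading into Hive, it can help to peek at what the reducer actually wrote; with a single reducer the output lands in part-r-00000 as tab-separated key/count pairs, which is the same '\t' layout the dws tables expect. A quick check:

# Print the age statistics produced by AgeStatJob
hdfs dfs -cat /Java/projects/t_bank_age_dws_temp/part-r-00000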



Loading the Data into Hive

Run the following statements in Zeppelin to load the data into Hive:

-- done
load data inpath 'hdfs://lh1/Java/projects/t_bank_age_dws_temp' overwrite into table dws.dws_t_bank_age;
load data inpath 'hdfs://lh1/Java/projects/t_bank_edu_dws_temp' overwrite into table dws.dws_t_bank_edu;

select * from dws.dws_t_bank_age


