Word-Pair WordCount in Spark with Java

    • Requirements
    • Implementation
      • Start Hadoop and Spark
      • Build the jar
      • HDFS setup
      • Run with spark-submit
      • View the results
    • Code

Requirements

  Use Spark to count word-pair occurrences among the words of each line of a document. Punctuation must be stripped, and all uppercase letters converted to lowercase. (A small code sketch of this normalization follows the example below.)

  For example, suppose the original document is:

a a, A b
a b c

then the processed result is:

(a a) 2
(a b) 2
(a c) 1
(b a) 4
(b c) 1
(c a) 1
(c b) 1
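
To make the normalization step concrete, here is a minimal sketch of what it amounts to, using the same regular expression as the full job listed at the end of this post (the variable names here are only for illustration):

String line = "a a, A b";
// Lowercase the line, strip digits and punctuation, then split on spaces
String normalized = line.toLowerCase().replaceAll("[\\d\\pP\\p{Punct}]", "");
String[] words = normalized.split(" ");   // ["a", "a", "a", "b"]

Word pairs are then formed from every ordered pair of distinct positions on the normalized line; pairs of identical words are only counted once per line.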

Implementation

Start Hadoop and Spark

Go to the Hadoop installation directory and run the startup script:

$ sbin/start-all.sh

Go to the Spark installation directory and run the startup script:

$ sbin/start-all.sh

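To confirm that the daemons came up, you can list the running JVM processes with jps; on a typical single-node setup this should show the Hadoop processes (NameNode, DataNode, and so on) together with the Spark Master and Worker:

$ jps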

Build the jar

Package the program into a jar file for submission; a sketch of one way to do this is shown below.
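
How the jar is built depends on the project setup, which is not shown in this post. Assuming a Maven project, for example, something along the following lines would produce the jar used in the spark-submit step (the jar name under target/ depends on your pom.xml):

$ mvn clean package
$ cp target/wordcountspark.jar /Users/liuqi/Desktop/wordcountspark.jar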

HDFS setup

Create a directory on HDFS and put the input file under it:

$ hadoop dfs -mkdir -p /wordcount2/input
$ hadoop dfs -put /Users/liuqi/Desktop/input2.txt /wordcount2/input
$ bin/hdfs dfs -ls /wordcount2/input
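
To double-check the upload, you can print the file back out of HDFS:

$ bin/hdfs dfs -cat /wordcount2/input/input2.txt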


Run with spark-submit

Submit the Spark job:

$ bin/spark-submit --class WordCount --num-executors 2 --executor-memory 6g --executor-cores 4 /Users/liuqi/Desktop/wordcountspark.jar /wordcount2/output
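
After the job finishes you can verify that the output was written (the output path hdfs://localhost:54310/wordcount2/output is hard-coded in WordCount.java below):

$ bin/hdfs dfs -ls /wordcount2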


When the job completes, the pair counts are printed to the console.
Note: if anything goes wrong along the way, delete the corresponding files and repeat the steps, for example:

$ bin/hdfs dfs -rm -r /wordcount2/input

View the results

Save the results to the local filesystem:

$ hadoop dfs -getmerge /wordcount2/output /Users/liuqi/Desktop/wordcount2/


Code

Note: only the Java source is shown here; the complete project can be downloaded from my CSDN blog.

WordCount.java:

import scala.Tuple2;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
/** 
 *  
 * WordCount 
 * @author 刘琦
 */ 

public final class WordCount {
  private static final Pattern SPACE = Pattern.compile(" ");
  public static void main(String[] args) throws Exception {
    if (args.length < 1) {
      System.err.println("Usage: WordCount ");
      System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("WordCount");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile("hdfs://localhost:54310/wordcount2/input");

    JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {

        public Iterator<String> call(String s) throws Exception {
            // Convert the line to lowercase and strip digits and punctuation
            String newStr = s.toLowerCase().replaceAll("[\\d\\pP\\p{Punct}]", "");
            String[] wordResult = SPACE.split(newStr);
            List<String> wordNewResult = new ArrayList<String>();
            String[][] result = new String[wordResult.length][2];
            for (int i = 0; i < wordResult.length; i++){
                result[i][0] = wordResult[i];
                result[i][1] = "0";
            }

            // Process the words of this line: pair each word with every other word on the line.
            // A flag of "1" in result[i][1] means this word already appeared earlier on the line,
            // so identical words only produce pairs once per line.
            for (int i = 0; i < wordResult.length; i++){
                for (int j = 0; j < wordResult.length; j++){
                    if (i == j){
                        continue;
                    } else if (result[i][1].equals("1")){
                        // This word appeared earlier on the line; just flag later duplicates
                        // instead of emitting their pairs again
                        if (result[i][0].equals(result[j][0])){
                            result[j][1] = "1";
                        }
                    } else {
                        // First occurrence of this word on the line
                        if (!result[i][0].equals(result[j][0])){
                            // Different words: emit the pair
                            wordNewResult.add("(" + result[i][0] + "  " + result[j][0] + ")");
                        } else {
                            // Same word, not counted yet: flag it and emit the pair once
                            result[j][1] = "1";
                            wordNewResult.add("(" + result[i][0] + "  " + result[j][0] + ")");
                        }
                    }
                }
            }

            return wordNewResult.iterator();
        }

      });

      JavaPairRDD<String, Integer> ones = words.mapToPair(new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String s) {
          return new Tuple2<String, Integer>(s, 1);
        }
      });

      JavaPairRDD<String, Integer> counts = ones.reduceByKey(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer i1, Integer i2) {
          return i1 + i2;
        }
      });

      // Create the HDFS output file and open a stream to it
      HdfsOperate.openHdfsFile("hdfs://localhost:54310/wordcount2/output");

      List<Tuple2<String, Integer>> output = counts.collect();
      for (Tuple2<String, Integer> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
        HdfsOperate.writeString(tuple._1() + ": " + tuple._2());
      }
      ctx.stop();
      // Close the HDFS output stream
      HdfsOperate.closeHdfsFile();
    }

  }

HdfsOperate.java:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URI;
/**
 * Write data to HDFS using Hadoop's FileSystem API
 */
public class HdfsOperate implements Serializable{

    private static Logger logger = LoggerFactory.getLogger(HdfsOperate.class);
    private static Configuration conf = new Configuration();
    private static BufferedWriter writer = null;

    // Create a new file at the target HDFS path and obtain an output stream
    public static void openHdfsFile(String path) throws Exception {
        FileSystem fs = FileSystem.get(URI.create(path),conf);
        writer = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(path))));
        if(null!=writer){
            logger.info("[HdfsOperate]>> initialize writer succeed!");
        }
    }

    // Write a line of data to the HDFS file
    public static void writeString(String line) {
        try {
            writer.write(line + "\n");
        } catch (Exception e) {
            logger.error("[HdfsOperate]>> write a line error:", e);
        }
    }

    // Close the HDFS output stream
    public static void closeHdfsFile() {
        try {
            if (null != writer) {
                writer.close();
                logger.info("[HdfsOperate]>> closeHdfsFile close writer succeed!");
            }
            else{
                logger.error("[HdfsOperate]>> closeHdfsFile writer is null");
            }
        }catch(Exception e){
            logger.error("[HdfsOperate]>> closeHdfsFile close hdfs error:" + e);
        }
    }

}
