Big Data Learning Notes and Summary

Preparation needed beforehand:
1. Create a CentOS virtual machine in VMware
2. Install SecureCRT
3. Install Eclipse
4. Configure the JDK

Assignment 1: Set up a Hadoop pseudo-distributed cluster on CentOS

Edit the Hadoop configuration files in the virtual machine:
①core-site.xml


<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://bigdata128:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/opt/module/hadoop-2.7.3/tmp</value>
    </property>
</configuration>

②hdfs-site.xml

		 
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>bigdata128:50090</value>
    </property>
</configuration>

③mapred-site.xml (this file does not exist by default, so copy it from the template first)
cp mapred-site.xml.template mapred-site.xml

			 
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>

④yarn-site.xml

			 
<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>bigdata128</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
</configuration>

⑤ Add bigdata128 to the slaves file.
⑥ Edit /etc/hosts and add the IP mapping, for example:
192.168.60.132 bigdata128
Then reboot the virtual machine.
⑦ Format the NameNode: hdfs namenode -format
⑧ Run start-all.sh to start the pseudo-distributed cluster. When startup finishes, run jps; if NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager are all running, the pseudo-distributed setup succeeded.

Assignment 2: Implement upload and download with HDFS

Program details:
①HDFSDownload

package hdfs.files;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import java.io.*;

public class HDFSDownload {
    // Declare the input and output streams
    private static InputStream input;
    private static OutputStream output;

    public static void main(String[] args) throws IOException {
        // Act as the root HDFS user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Create the HDFS client connection
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);
        // Create the output stream for the local file
        output = new FileOutputStream("E:\\download.txt");
        // Open the input stream for the file on HDFS
        input = client.open(new Path("/aadir/upload1.txt"));
        // Copy the file from HDFS to the local file
        byte[] buffer = new byte[1024];
        int len = 0;
        while ((len = input.read(buffer)) != -1) {
            output.write(buffer, 0, len);
        }
        // Flush so no output data is lost
        output.flush();
        // Alternatively, an IOUtils helper can do the copy
        //IOUtils.copy(input, output);
        // Close the streams
        input.close();
        output.close();
        System.out.println("Done!");
    }
}

②HDFSFilelfExist

package hdfs.files;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
public class HDFSFilelfExist {
    public static void main(String[] args) throws IOException {
        // Act as the root HDFS user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Create the HDFS client connection
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);
        // The path to check
        String fileName = "/aadir/aaout.txt";
        // Check whether the file exists
        if (client.exists(new Path(fileName))) {
            System.out.println("The file exists!");
        } else {
            System.out.println("The file does not exist!");
        }
    }
    
}

③HDFSMKdir

package hdfs.files;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFSMKdir {
    public static void main(String[] args) throws IOException {
        // Act as the root HDFS user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Create the HDFS client connection
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);
        // Create /aadir under the HDFS root directory
        client.mkdirs(new Path("/aadir"));
        // Close the client connection
        client.close();
        // Print a success message
        System.out.println("successfully!");
    }
}

④HDFSUpload

package hdfs.files;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

public class HDFSUpload {
    // Declare the input and output streams
    private static InputStream input;
    private static OutputStream output;

    public static void main(String[] args) throws IOException {
        // Act as the root HDFS user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Create the HDFS client connection
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);
        // Create the input stream for the local file
        input = new FileInputStream("E:\\upload.txt");
        // Create the output stream for the file on HDFS
        output = client.create(new Path("/aadir/upload1.txt"));
        // Copy the local file to HDFS
        byte[] buffer = new byte[1024];
        int len = 0;
        while ((len = input.read(buffer)) != -1) {
            output.write(buffer, 0, len);
        }
        // Flush so no output data is lost
        output.flush();
        // Alternatively, an IOUtils helper can do the copy
        //IOUtils.copy(input, output);
        // Close the streams
        input.close();
        output.close();
        System.out.println("Done!");
    }
}
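
As a side note, the manual stream copy in HDFSUpload and HDFSDownload can also be replaced by FileSystem's built-in copy helpers. The sketch below is not part of the original assignment: the class name HDFSCopyHelpers is made up for illustration, it reuses the same address and paths assumed above, and when run from Windows the copyToLocalFile call may additionally require the Hadoop winutils binaries.

package hdfs.files;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFSCopyHelpers {
    public static void main(String[] args) throws IOException {
        // Act as the root HDFS user, same as in the programs above
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.60.130:9000");
        FileSystem client = FileSystem.get(conf);

        // Upload: local E:\upload.txt -> HDFS /aadir/upload1.txt
        client.copyFromLocalFile(new Path("E:\\upload.txt"), new Path("/aadir/upload1.txt"));

        // Download: HDFS /aadir/upload1.txt -> local E:\download.txt (also writes a .crc checksum file)
        client.copyToLocalFile(new Path("/aadir/upload1.txt"), new Path("E:\\download.txt"));

        client.close();
        System.out.println("Done!");
    }
}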

After starting the pseudo-distributed cluster, check the results of these programs in the NameNode web UI at http://IP:50070.

Assignment 3: Implement MapReduce WordCount in Java

Program details:

package hdfs.files;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountDriver {
	
	public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
		   
		 protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

		   String line = value.toString();
		 
		   String[] words=line.split(" ");
		  
		   for(String w:words) {
		  
		   context.write(new Text(w), new IntWritable(1));
		   }
		  }
	}

	public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
		  protected void reduce(Text Key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
			    
			    int sum=0;
			   
			    for(IntWritable v:values) {
			      sum +=v.get();
			    }
			    context.write(Key, new IntWritable(sum));
			  }
		}

	public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
		
		  System.setProperty("HADOOP_USER_NAME", "root");
		 
		  Configuration conf=new Configuration();
		  Job job=Job.getInstance(conf);
		 
		  job.setJarByClass(WordCountDriver.class);
		
		  job.setMapperClass(WordCountMapper.class);
	
		  job.setReducerClass(WordCountReducer.class);
		
		  job.setMapOutputKeyClass(Text.class);
		  job.setMapOutputValueClass(IntWritable.class);
	
		  job.setOutputKeyClass(Text.class);
		  job.setOutputValueClass(IntWritable.class);
		
		  FileInputFormat.setInputPaths(job, new Path("/usr/local/hdfs/input/cc.txt"));
		
		  FileOutputFormat.setOutputPath(job, new Path("/usr/local/hdfs/output"));
		
		  Boolean rs=job.waitForCompletion(true);
		
		  System.exit(rs?0:1);
	}
}
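
One optional addition, not in the original program: because summing counts is associative and the reducer's input and output types match, the same reducer class can also be registered as a combiner to cut down shuffle traffic. The line below would go into main() next to setReducerClass; treat it as a suggestion rather than part of the assignment.

// Optional: reuse the reducer as a map-side combiner (valid because addition is associative)
job.setCombinerClass(WordCountReducer.class);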

Assignment 4: Install and configure HBase

Download the package from the official mirror: https://mirrors.tuna.tsinghua.edu.cn/apache/hbase/stable/
Upload the package to the virtual machine.
Extract it with the tar command.
Edit /etc/profile and add the following:

export HBASE_HOME=[HBase install directory]
export PATH=$HBASE_HOME/bin:$PATH

Run the hbase command to check that the installation succeeded.

Edit hbase-env.sh:

export JAVA_HOME=[JDK install path]
export HBASE_CLASSPATH=[HBase install directory]
export HBASE_MANAGES_ZK=true

Edit hbase-site.xml:




<configuration>
    <property>
        <name>hbase.rootdir</name>
        <value>hdfs://bigdata128:9000/hbase</value>
    </property>
    <property>
        <name>hbase.cluster.distributed</name>
        <value>true</value>
    </property>
    <property>
        <name>hbase.zookeeper.quorum</name>
        <value>localhost</value>
    </property>
</configuration>

Start HBase: start-hbase.sh

The corresponding Java source (HBase's ConnectionFactory class, included for reference):

/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.client;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.concurrent.ExecutorService;

import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.security.UserProvider;


/**
 * A non-instantiable class that manages creation of {@link Connection}s.
 * Managing the lifecycle of the {@link Connection}s to the cluster is the responsibility of
 * the caller.
 * From a {@link Connection}, {@link Table} implementations are retrieved
 * with {@link Connection#getTable(TableName)}. Example:
 * 
 * Connection connection = ConnectionFactory.createConnection(config);
 * Table table = connection.getTable(TableName.valueOf("table1"));
 * try {
 *   // Use the table as needed, for a single operation and a single thread
 * } finally {
 *   table.close();
 *   connection.close();
 * }
 * 
 * Similarly, {@link Connection} also returns {@link Admin} and {@link RegionLocator}
 * implementations.
 *
 * This class replaces {@link HConnectionManager}, which is now deprecated.
 * @see Connection
 * @since 0.99.0
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class ConnectionFactory {

  /** No public c.tors */
  protected ConnectionFactory() {
  }

  /**
   * Create a new Connection instance using default HBaseConfiguration. Connection
   * encapsulates all housekeeping for a connection to the cluster. All tables and interfaces
   * created from returned connection share zookeeper connection, meta cache, and connections
   * to region servers and masters.
   * The caller is responsible for calling {@link Connection#close()} on the returned
   * connection instance.
   *
   * Typical usage:
   *
   *   Connection connection = ConnectionFactory.createConnection();
   *   Table table = connection.getTable(TableName.valueOf("mytable"));
   *   try {
   *     table.get(...);
   *     ...
   *   } finally {
   *     table.close();
   *     connection.close();
   *   }
   *
   * @return Connection object for conf
   */
  public static Connection createConnection() throws IOException {
    return createConnection(HBaseConfiguration.create(), null, null);
  }

  /**
   * Create a new Connection instance using the passed conf instance. Usage is the same
   * as for {@link #createConnection()}.
   *
   * @param conf configuration
   * @return Connection object for conf
   */
  public static Connection createConnection(Configuration conf) throws IOException {
    return createConnection(conf, null, null);
  }

  /**
   * Create a new Connection instance using the passed conf instance and thread pool.
   *
   * @param conf configuration
   * @param pool the thread pool to use for batch operations
   * @return Connection object for conf
   */
  public static Connection createConnection(Configuration conf, ExecutorService pool)
      throws IOException {
    return createConnection(conf, pool, null);
  }

  /**
   * Create a new Connection instance using the passed conf instance and user.
   *
   * @param conf configuration
   * @param user the user the connection is for
   * @return Connection object for conf
   */
  public static Connection createConnection(Configuration conf, User user)
      throws IOException {
    return createConnection(conf, null, user);
  }

  /**
   * Create a new Connection instance using the passed conf instance, thread pool and user.
   *
   * @param conf configuration
   * @param user the user the connection is for
   * @param pool the thread pool to use for batch operations
   * @return Connection object for conf
   */
  public static Connection createConnection(Configuration conf, ExecutorService pool, User user)
      throws IOException {
    if (user == null) {
      UserProvider provider = UserProvider.instantiate(conf);
      user = provider.getCurrent();
    }
    return createConnection(conf, false, pool, user);
  }

  static Connection createConnection(final Configuration conf, final boolean managed,
      final ExecutorService pool, final User user) throws IOException {
    String className = conf.get(HConnection.HBASE_CLIENT_CONNECTION_IMPL,
        ConnectionManager.HConnectionImplementation.class.getName());
    Class<?> clazz = null;
    try {
      clazz = Class.forName(className);
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
    try {
      // Default HCM#HCI is not accessible; make it so before invoking.
      Constructor<?> constructor = clazz.getDeclaredConstructor(Configuration.class,
          boolean.class, ExecutorService.class, User.class);
      constructor.setAccessible(true);
      return (Connection) constructor.newInstance(conf, managed, pool, user);
    } catch (Exception e) {
      throw new IOException(e);
    }
  }
}
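
The class above is HBase's own ConnectionFactory source, shown for reference. For a client-side picture of how it is actually used against this setup, here is a minimal sketch that creates a table, writes one cell, and reads it back. The table name test, the column family cf, and the quorum host bigdata128 are assumptions made for illustration (the quorum must point at the host running the ZooKeeper started by HBASE_MANAGES_ZK=true); the HTableDescriptor/HColumnDescriptor calls match the HBase 1.x client used in these notes.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseClientDemo {
    public static void main(String[] args) throws IOException {
        // Point the client at the ZooKeeper instance managed by HBase itself
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "bigdata128"); // assumption: same host as the pseudo-distributed setup

        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            // Create the table if it is not there yet (table name and family are placeholders)
            TableName tableName = TableName.valueOf("test");
            if (!admin.tableExists(tableName)) {
                HTableDescriptor desc = new HTableDescriptor(tableName);
                desc.addFamily(new HColumnDescriptor("cf"));
                admin.createTable(desc);
            }

            try (Table table = connection.getTable(tableName)) {
                // Write one cell
                Put put = new Put(Bytes.toBytes("row1"));
                put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("col"), Bytes.toBytes("value1"));
                table.put(put);

                // Read it back
                Result result = table.get(new Get(Bytes.toBytes("row1")));
                System.out.println(Bytes.toString(
                        result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("col"))));
            }
        }
    }
}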

Assignment 5: Install Redis

Reference: https://www.cnblogs.com/renzhicai/p/7773080.html

Assignment 6: Install and use Hive

Part 1: Install MySQL
Download the repository package: wget http://dev.mysql.com/get/mysql-community-release-el7-5.noarch.rpm
Install the repository package: rpm -ivh mysql-community-release-el7-5.noarch.rpm
Install the server: yum install mysql-community-server
Restart the MySQL service: service mysqld restart
Log in: mysql -u root
Set a password for the root user: mysql> set password for 'root'@'localhost' = password('root');
Add the encoding setting to /etc/my.cnf: [mysql] default-character-set=utf8
grant all privileges on *.* to 'root'@'%' identified by 'root';
flush privileges;  -- reload the privileges

Part 2: Install and configure Hive
Download the package from the official site http://mirror.bit.edu.cn/apache/hive/ and upload it to the virtual machine.
Extract it into the target directory /opt/module.
Edit /etc/profile and add HIVE_HOME with the Hive install path.
Run source /etc/profile to apply the change.
Configure hive-env.sh:

cp hive-env.sh.template  hive-env.sh
Set the Hadoop install path:
HADOOP_HOME=/opt/module/hadoop-2.7.3
Set the path to Hive's conf directory:
export HIVE_CONF_DIR=/opt/module/hive/conf

Configure hive-site.xml:

 
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://127.0.0.1:3306/hive?characterEncoding=UTF-8&amp;serverTimezone=GMT%2B8</value>
    <description>
      JDBC connect string for a JDBC metastore.
      To use SSL to encrypt/authenticate the connection, provide database-specific SSL flag in the connection URL.
      For example, jdbc:postgresql://myhost/db?ssl=true for postgres database.
    </description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.cj.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>Username to use against metastore database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
    <description>password to use against metastore database</description>
  </property>
  <property>
    <name>hive.exec.local.scratchdir</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/tmp/${user.name}</value>
    <description>Local scratch space for Hive jobs</description>
  </property>
  <property>
    <name>hive.downloaded.resources.dir</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/iotmp/${hive.session.id}_resources</value>
    <description>Temporary local directory for added resources in the remote file system.</description>
  </property>
  <property>
    <name>hive.querylog.location</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/iotmp/${system:user.name}</value>
    <description>Location of Hive run time structured log file</description>
  </property>
  <property>
    <name>hive.server2.logging.operation.log.location</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/iotmp/${system:user.name}/operation_logs</value>
    <description>Top level directory where operation logs are stored if logging functionality is enabled</description>
  </property>
  <property>
    <name>hive.server2.thrift.bind.host</name>
    <value>bigdata</value>
    <description>Bind host on which to run the HiveServer2 Thrift service.</description>
  </property>
  <property>
    <name>system:java.io.tmpdir</name>
    <value>/usr/local/hive/apache-hive-2.3.4-bin/iotmp</value>
  </property>
</configuration>

Initialize the metastore: schematool -dbType mysql -initSchema

Write the WordCount program (upload a file to HDFS first):

[root@bigdata ~]# vim 1.txt
[root@bigdata ~]# hdfs dfs -mkdir /input
[root@bigdata ~]# hdfs dfs -put 1.txt /input
[root@bigdata ~]# hdfs dfs -ls /input
1. create table words(line string);
2. load data inpath '/input/1.txt' overwrite into table words;
3. create table wordcount as select word, count(1) as count from (select explode(split(line,' ')) as word from words) w group by word order by word;
4. select * from wordcount;
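
The same result can also be read from a Java program over JDBC instead of the Hive CLI. This is only a sketch under two assumptions not covered by the notes above: HiveServer2 has been started (for example with the hiveserver2 command) and listens on its default port 10000 on the host bigdata configured in hive-site.xml, and the hive-jdbc driver is on the classpath.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HiveWordCountQuery {
    public static void main(String[] args) throws Exception {
        // HiveServer2 JDBC driver (from the hive-jdbc artifact)
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection conn = DriverManager.getConnection(
                     "jdbc:hive2://bigdata:10000/default", "root", "");
             Statement stmt = conn.createStatement();
             // Read back the table produced by the HiveQL steps above
             ResultSet rs = stmt.executeQuery("SELECT * FROM wordcount")) {
            while (rs.next()) {
                System.out.println(rs.getString(1) + "\t" + rs.getLong(2));
            }
        }
    }
}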

Assignment 7: Install Spark and implement WordCount in Scala and Java

Part 1: Install Scala
1. Download Scala from the official site: scala-2.12.8.tgz
https://www.scala-lang.org/download/
2. tar -zxvf scala-2.12.8.tgz -C /opt/module
3. mv scala-2.12.8 scala
4. Test the installation: scala -version
5. Start the REPL: scala

Part 2: Install Spark
1. Download Spark from the official site: spark-2.4.2-bin-hadoop2.7.tgz
https://www.apache.org/dyn/closer.lua/spark/spark-2.4.2/spark-2.4.2-bin-hadoop2.7.tgz
2. Extract the package.
3. Start the Hadoop environment first: start-all.sh
4. Start the Spark environment:
go into SPARK_HOME/sbin and run start-all.sh
/opt/module/spark/sbin/start-all.sh

Part 3: Set up Spark in pseudo-distributed mode
Configure spark-env.sh:

export JAVA_HOME=/usr/java/jdk1.8.0_211-amd64
export SCALA_HOME=/usr/share/scala
export HADOOP_HOME=/usr/local/hadoop/hadoop-2.7.7
export HADOOP_CONF_DIR=/usr/local/hadoop/hadoop-2.7.7/etc/hadoop
export SPARK_MASTER_HOST=bigdata
export SPARK_MASTER_PORT=7077
export  LD_LIBRARY_PATH=$HADOOP_HOME/lib/native

Configure /etc/profile:

export JAVA_HOME=/usr/java/jdk1.8.0_211-amd64
export HADOOP_HOME=/usr/local/hadoop/hadoop-2.7.7
export HBASE_HOME=/usr/local/hbase/hbase-1.4.9
export HIVE_HOME=/usr/local/hive/apache-hive-2.3.4-bin
export SPARK_HOME=/usr/local/spark/spark-2.4.2-bin-hadoop2.7
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native

Run source /etc/profile to apply the changes.
Go into Spark's sbin directory and run start-all.sh to start Spark: ./start-all.sh
Run spark-shell to enter the Spark shell.

Part 4: Install sbt
Reference: http://dblab.xmu.edu.cn/blog/1307-2/

Part 5: Word count on a local file (in spark-shell)

val textFile = sc.textFile("file:///usr/local/spark/mycode/wordcount/word.txt")
val wordCount = textFile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey((a, b) => a + b)
wordCount.collect()

Part 6: WordCount as a standalone Scala program

spark-submit --class "WordCount"  /usr/local/spark/mycode/wordcount/target/scala-2.11/simple-project_2.11-4.1.jar

The Scala program:

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object WordCount {
    def main(args: Array[String]) {
        val inputFile =  "file:///usr/local/spark/mycode/wordcount/word.txt"
        val conf = new SparkConf().setAppName("WordCount").setMaster("local[2]")
        val sc = new SparkContext(conf)
                val textFile = sc.textFile(inputFile)
                val wordCount = textFile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey((a, b) => a + b)
                wordCount.foreach(println)
    }
}

Part 7: WordCount as a Java program
Program details:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
 
import java.util.Arrays;
 
public class JavaWordCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Spark WordCount written by java!");
 
        JavaSparkContext sc = new JavaSparkContext(conf);
        
        JavaRDD<String> textFile = sc.textFile("hdfs:///user/hadoop/word.txt");
        JavaPairRDD<String, Integer> counts = textFile
                .flatMap(s -> Arrays.asList(s.split(" ")).iterator())
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((a, b) -> a + b);
        counts.saveAsTextFile("hdfs:///user/hadoop/writeback");
        sc.close();
    }
}

Related dependencies (pom.xml):


<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>Spark</groupId>
  <artifactId>SPARK</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <dependencies>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>2.4.1</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <appendAssemblyId>false</appendAssemblyId>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
          <archive>
            <manifest>
              <mainClass>JavaWordCount</mainClass>
            </manifest>
          </archive>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>assembly</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>8</source>
          <target>8</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
    

After packaging the jar and uploading it to CentOS, submit it with:

spark-submit --class spark.JavaWordCount --master spark://bigdata:7077 /usr/local/spark/mycode/sparkt-wordcount.jar

