Big Data Spark SQL imooc Log Analysis

Summary of the first five chapters


Environment variables
/etc/profile
JAVA_HOME=/usr/local/src/java/jdk
CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
HIVE_HOME=/usr/local/src/app/hive-1.1.0-cdh5.12.0
SPARK_HOME=/usr/local/src/app/spark-2.2.0-bin-2.6.0-cdh5.12.0
HADOOP_HOME=/usr/local/src/app/hadoop-2.6.0-cdh5.12.0
PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$JAVA_HOME/bin:$PATH
export JAVA_HOME  PATH CLASSPATH SPARK_HOME HADOOP_HOME HIVE_HOME

Passwordless SSH login
$ ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
$ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
$ chmod 0600 ~/.ssh/authorized_keys
vim /etc/ssh/ssh_config
StrictHostKeyChecking no
UserKnownHostsFile /dev/null
systemctl restart sshd.service

Configure Hadoop
hadoop-env.sh
export JAVA_HOME=/usr/local/src/java/jdk
etc/hadoop/core-site.xml:
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:8020</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/usr/local/src/app/tmp</value>
    </property>
</configuration>


etc/hadoop/hdfs-site.xml:
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>


etc/hadoop/mapred-site.xml:
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>



etc/hadoop/yarn-site.xml:
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
</configuration>

Start Hadoop
$ bin/hdfs namenode -format
$ sbin/start-dfs.sh
$ sbin/stop-dfs.sh
$ bin/hdfs dfs -put etc/hadoop input
$ bin/hdfs dfs -mkdir /user
$ bin/hdfs dfs -mkdir /user/<username>
$ sbin/start-yarn.sh
$ sbin/stop-yarn.sh



Configure Hive
yum -y install perl
yum -y install libaio
yum -y install autoconf
cd /usr

vim /etc/my.cnf
log-error=/var/lib/mysql/error.log
pid-file=/var/lib/mysql/p.pid
cd /var/lib/mysql/
service mysql start
service mysql stop
mysqladmin -u root password "root"
mysql -uroot -proot
grant all on *.* to 'root'@'%' identified by 'root';

hive-env.sh
HADOOP_HOME=/usr/local/src/app/hadoop-2.6.0-cdh5.12.0

Copy mysql-connector-java-5.1.42-bin.jar into $HIVE_HOME/lib

hive-site.xml
<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://localhost:3306/sparksql?createDatabaseIfNotExist=true</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>root</value>
    </property>
</configuration>


create table hive_wordcount(context string);
load data local inpath '/usr/local/src/app/hello.txt' into table hive_wordcount;
select word, count(1) from hive_wordcount  lateral view explode(split(context, ',')) wc as word group by word;
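The same word count can also be run from spark-shell once Spark is wired to the Hive metastore (see the Spark section below); a minimal DataFrame sketch, assuming the hive_wordcount table above already exists:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, explode, split}

// Minimal sketch: the same word count through the DataFrame API.
// Assumes the SparkSession is built with Hive support and hive_wordcount exists.
val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

spark.table("hive_wordcount")
  .select(explode(split(col("context"), ",")).as("word"))
  .groupBy("word")
  .count()
  .show()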


dept.txt
10,ACCOUNTING,  NEW YORK
20,RESEARCH ,DALLAS
30,SALES    ,CHICAGO
40,OPERATIONS,  BOSTON

emp.txt
7369,SMITH,CLERK,7902,1980-12-17,800.00,20
7499,ALLEN,SALESMAN,7698,1981-2-20,1600.00,30

create table emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
create table dept(
deptno int,
dname string,
location string
)ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

load data local inpath '/usr/local/src/dept.txt' into table dept;
load data local inpath '/usr/local/src/emp.txt' into table emp;

select  deptno, count(1) from emp group by deptno;



Configure Spark
Copy mysql-connector-java-5.1.42-bin.jar into spark/jars
Copy hive-site.xml into spark/conf
spark-env.sh
export JAVA_HOME=/usr/local/src/java/jdk
SPARK_MASTER_HOST=localhost
SPARK_WORKER_CORES=2
SPARK_WORKER_INSTANCES=1
SPARK_WORKER_MEMORY=2G

conf/slaves (one worker hostname per line; localhost for a single-node setup)
sbin/start-all.sh
bin/spark-shell --help
bin/spark-shell --master local[2]
bin/spark-shell --master spark://localhost:7077

val file = spark.sparkContext.textFile("file:///home/hadoop/data/wc.txt")
val wordCounts = file.flatMap(line => line.split(",")).map(word => (word, 1)).reduceByKey(_ + _)
wordCounts.collect
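To keep the result instead of just collecting it, the pair RDD can be sorted and written back out; a minimal sketch (the output path is an assumption, not from the course):

// Sort by count descending and write one "word,count" line per record.
// The output directory must not already exist.
wordCounts
  .sortBy(_._2, ascending = false)
  .map { case (word, count) => s"$word,$count" }
  .saveAsTextFile("file:///home/hadoop/data/wc_out")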



scala>
spark.sql("show tables").show
spark.sql("select * from emp e join dept d on e.deptno=d.deptno").show
select * from emp e join dept d on e.deptno=d.deptno;
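The same join can be written with the DataFrame API instead of SQL; a minimal sketch using the emp and dept Hive tables created earlier:

// DataFrame-API version of the emp/dept join (same tables as the SQL above).
val emp  = spark.table("emp")
val dept = spark.table("dept")

emp.join(dept, emp("deptno") === dept("deptno"))
  .select(emp("empno"), emp("ename"), dept("dname"), dept("location"))
  .show()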


bin/spark-sql --master local[2]
select * from emp e join dept d on e.deptno=d.deptno;
create table t(key string, value string);
show tables;
exit;

create table t(key string, value string);
explain extended select a.key*(2+3), b.value from  t a join t b on a.key = b.key and a.key > 3;
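The same plan can also be inspected from spark-shell; a small sketch assuming the table t above exists:

// Print the parsed, analyzed, optimized and physical plans for the query.
spark.sql(
  "select a.key*(2+3), b.value from t a join t b on a.key = b.key and a.key > 3"
).explain(true)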



./sbin/start-thriftserver.sh \
  --hiveconf hive.server2.thrift.port=14000 \
  --master local[2]

./bin/beeline  -u jdbc:hive2://localhost:14000 -n ecs-404b
show tables;
select * from emp e join dept d on e.deptno=d.deptno;
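Besides beeline, any JDBC client can talk to the Thrift server; a minimal Scala sketch, assuming the Hive JDBC driver is on the classpath and reusing the port (14000) and user (ecs-404b) from the commands above:

import java.sql.DriverManager

// Connects to the Spark Thrift server started above and runs the same join.
// org.apache.hive.jdbc.HiveDriver must be on the classpath (hive-jdbc dependency).
object ThriftServerQuery {
  def main(args: Array[String]): Unit = {
    Class.forName("org.apache.hive.jdbc.HiveDriver")
    val conn = DriverManager.getConnection("jdbc:hive2://localhost:14000", "ecs-404b", "")
    try {
      val stmt = conn.createStatement()
      val rs = stmt.executeQuery(
        "select e.empno, e.ename, d.dname from emp e join dept d on e.deptno = d.deptno")
      while (rs.next()) {
        println(s"${rs.getInt(1)}\t${rs.getString(2)}\t${rs.getString(3)}")
      }
    } finally {
      conn.close()
    }
  }
}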
