0、将以下写入到 /etc/profile 最后
export JAVA_HOME=/usr/java/latest/jdk1.8.0_161
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
export HADOOP_HOME=/home/ubuntu/hadoop-2.2.0
PATH=/home/ubuntu/bin:/home/ubuntu/.local/bin:/usr/java/latest/jdk1.8.0_161/bin:/home/ubuntu/bin:/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/bin:/home/ubuntu/hadoop-2.2.0/bin:/home/ubuntu/apache-hive-1.2.2-bin/bin
export HIVE_HOME=/home/ubuntu/apache-hive-1.2.2-bin
PATH=/home/ubuntu/bin:/home/ubuntu/.local/bin:/home/ubuntu/bin:/home/ubuntu/.local/bin:/usr/java/latest/jdk1.8.0_161/bin:/home/ubuntu/bin:/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/bin:/home/ubuntu/hadoop-2.2.0/bin:/bin
hadoop学习笔记:
1、wordcount
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.2.0.jar wordcount WorkSpace/hadoop WorkSpace/result
hdfs dfs -cat WorkSpace/result/*
2、hadoop-2.2.0 需和 hive-1.2.2搭配
3、创建hive的目录及赋权
sudo mkdir -p /user/hive/warehouse
sudo chmod a+rwx /user/hive/warehouse
4、进入hive的命令
hive
hive-site.xml配置
hive.metastore.warehouse.dir
/user/hive/warehouse
Local or HDFS directory where Hive keeps table contents.
hive.metastore.local
true
Use false if a production metastore server is used.
javax.jdo.option.ConnectionURL
jdbc:derby:;databaseName=/user/hive/metastore_db;create=true
The JDBC connection URL.
====
hive.metastore.warehouse.dir
/user/hive/warehouse
Local or HDFS directory where Hive keeps table contents.
hive.metastore.local
true
Use false if a production metastore server is used.
javax.jdo.option.ConnectionURL
jdbc:mysql://db1.mydomain.pvt/hive_db?createDatabaseIfNotExist=true
The JDBC connection URL.
javax.jdo.option.ConnectionDriverName
com.mysql.jdbc.Driver
javax.jdo.option.ConnectionUserName
database_user
javax.jdo.option.ConnectionPassword
database_pass
INSERT INTO table_name ( field1, field2,...fieldN )
VALUES
( value1, value2,...valueN );
hive -e "select * from table limit x";
hive -S -e "select * from table limit x" > /tmp/myquery
hive -S -e "set" | grep warehouse
5、从文件执行hive查询:
hive -f WorkSpace/hives/file/withqueries.hql
source WorkSpace/hives/file/withqueries.hql;
hive -e "load data local inpath '/home/ubuntu/WorkSpace/hives/file/myfile' into table src";
6、hive里执行shell命令
在命令前加 ! 命令后加 ;
如:
hive> ! /bin/echo "what up dog";
"what up dog"
hive> ! pwd;
/home/ubuntu
7、hive里使用 hadoop里的 dfs命令
hive> dfs -ls / ;
8、hive脚本中如何进行注释
前加 -- 进行注释, 如
-- this is description
select * from who;
9、hive对文件的分隔符等级
\n > ^A > ^B > ^C (^A即 Ctrl+A)
另外可自定义分隔符
FIELDS TERMINATED BY '\001'
COLLECTION ITEMS TERMINATED BY '\002'
MAP KEYS TERMINATED BY '\003'
LINES TERMINATED BY '\n'
10、HiveQL数据定义
10.1 Hive中的数据库,hive中的数据库本质上仅仅是表的一个目录或者命名空间。
10.2 创建一个数据库 create database if not exists financials;
show databases;
正则表达式筛选显示
show databases like 'h.*';
数据库的文件目录名是以.db结尾的
创建数据库的默认路径是 /user/hive/warehouse 即开始时定义的目录
可修改默认位置: create database financials location '/my/preferred/directory';
描述数据库 describe database financials;
financials holds ......
10.3 使用某个数据库可以用 use
use financials;
10.4 显示表 show tables;
10.5 删除数据库
drop database if exists financials;
hive 不允许删除有表的数据库的,必须先删除表然后再删除数据库。或者 在语句最后加上CASCADE字段
drop database if exists financials cascade;
10.6 修改数据库,可修改 dbproperties属性,元数据信息是不可更改的.
alter database financials set dbproperties ('edited-by'='Joe Dba');
11、创建表
11.1 可以像MySQL一样创建数据表,而且hive支持更多的数据格式 如 array、map等
11.2 hive创建表 可以拷贝一张已经存在的表,如
create table if not exists mydb.employees2 like mydb.employees;
11.3 查看表信息 describe extended mydb.employees;
11.4 创建外部表
create external table if not exists stocks(.....) row format delimited fields terminated by ',' location '/data/stocks';
11.5 分区表 partitioned by
create table employees(...) partitioned by (country string, state string);
分区 最重要的原因就是为了更快地查询
显示所有分区,显示分区 show partitions table;
显示个别分区,show partitions table partition(country='US');
给表添加分区 alter table ... add partition
11.6 删除表
drop table if exists employees;
11.7 修改表 用alter
11.7.1 表重命名 alter table log_message rename to logmsgs;
11.7.2 增加、修改、删除表分区
alter table log_messages add if not exists partition(year=2011,month=1,day=1) location '...'; 新增
alter table log_messages partition(year=2011,month=1,day=1) set location '...'; 修改路径
alter table log_messages drop if exists partition(year=2011,month=1,day=1); 删除
12 数据操作
12.1 装载数据
load data local inpath '${env:HOME}/california-employees' overwrite into table employees partition (country='xx', state='xx');
12.2 通过查询插入语句
insert overwrite table employees partition (country='xx',state='xx') select * from staged_employees se where se.cnty='xx' and se.st = 'xx';
用overwrite会覆盖掉原有数据, 如果用into 会追加到表里
12.3 动态分区
insert overwrite table employees partition (country, state) select ..., se.cnty, se.st from staged_employees se;
可以静态与动态结合 使用, 在句尾加 where se.cnty='us', 注意 静态分区键必须出现在动态分区键之前
12.4 创建表并加载数据
create table ca_employees as select name, salary, address from employees where state='ca';
12.5 导出数据
如果数据文件是需要的格式,可直接拷贝文件 hadoop fs -cp source_path target_path
格式不同,则可如下操作
insert overwrite local directory '/tmp/ca_employees' select name, salary, address from employees where state='CA';
having 是 group by 下的子语句
13 调优
13.1 explain 解释字段 explain extended 会输出更多的信息
在查询语句前加explain
14 压缩
14.1 sequence file格式的文件压缩可进行分割
在创建表时加 stored as sequencefile
create table a (n int) stored as sequencefile;
15 开发
==============================================================
python 机器学习
安装pip
$ wget https://bootstrap.pypa.io/get-pip.py
$ sudo python get-pip.py
$ pip -V #查看pip版本
sudo pip install pandas
sudo apt-get install libatlas-base-dev gfortran //这一步是后面安装scipy所必需的
sudo pip install scipy
sudo apt-get install libxml2-dev libxslt1-dev python-dev
sudo apt-get install libevent-dev
sudo pip install lxml
sudo pip install matplotlib
sudo pip install numpy
sudo pip install seaborn
sudo pip install statsmodels
sudo pip install scikit-learn
===================================================================
ubuntu 下安装Python-tk
1. tk开发包: sudo apt-get install tk-dev
2. Python的tk模块:sudo apt-get install python-tk
========python 升级3====
1、安装Python3 sudo apt-get install python3
2、备份Python2 sudo mv python python.bak
3、设置Python3关联 sudo ln -s /usr/bin/python3.5 /usr/bin/python
4、查看版本 python -V
======安装python3-tk
2、sudo apt-get update
3、sudo apt-get install python3-tk
4、sudo apt-get install -f