On the replication factor
The replication factor is decided by the client-side parameter dfs.replication (priority: conf.set > a custom configuration file on the classpath > the hdfs-default.xml bundled in the jar).
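As a minimal sketch of that priority order (the address hdfs://master:9000, the user hadoop, and the value 2 are placeholders, not prescribed above), a value set through conf.set overrides anything coming from the configuration files:
// Imports as in the SimpleHdfsDemo class below
Configuration conf = new Configuration();
// Highest priority: overrides dfs.replication from hdfs-site.xml / hdfs-default.xml
conf.set("dfs.replication", "2");
// Files written through this client will be created with 2 replicas
FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), conf, "hadoop");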
1. Overview
2. HDFS file-read diagram
3. How the NameNode manages metadata
4. The HDFS Java client
hdfs dfsadmin -report prints the status of the cluster.
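A similar capacity summary can also be read from the Java client; the following is a minimal sketch (not from the original notes) using FileSystem.getStatus(), assuming the fs handle created in the demo class below:
// Roughly the capacity portion of `hdfs dfsadmin -report`, via the client API
FsStatus status = fs.getStatus();
System.out.println("capacity: " + status.getCapacity());
System.out.println("used: " + status.getUsed());
System.out.println("remaining: " + status.getRemaining());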
package hadoop.hdfs;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.util.Iterator;
import java.util.Map;
/**
 * @program: bigdata
 * @package: hadoop.hdfs
 * @filename: SimpleHdfsDemo.java
 * @create: 2019.09.22.20.19
 * @author: Administrator
 * @description:
 */
public class SimpleHdfsDemo {
FileSystem fs=null;
Configuration conf=null;
@Before
public void init() throws Exception {
conf=new Configuration();
// conf.set("fs.defaultFS", "hdfs://master:9000");
// With that approach the user must be specified at run time with -DHADOOP_USER_NAME=hadoop
fs = FileSystem.get(new URI("hdfs://192.168.186.5:9000"), conf, "hadoop");
}
/*
 * Upload a file
 */
@Test
public void testUpload() throws Exception {
fs.copyFromLocalFile(
new Path("D:\\大数据\\大数据全套 (已分享)\\文档资料\\day06\\day06\\hadoop2.4.1集群搭建.txt"),
new Path("/test1.txt"));
fs.close();
}
/*
 * Download a file
 */
@Test
public void testDownload() throws IOException {
fs.copyToLocalFile(new Path("/test1.txt"), new Path("d:\\"));
fs.close();
}
/*
 * Print the configuration parameters
 */
@Test
public void testConf(){
// Iterate over all key/value pairs currently loaded in the Configuration
Iterator<Map.Entry<String, String>> it = conf.iterator();
while (it.hasNext()){
Map.Entry<String, String> next = it.next();
System.out.println(next.getKey()+":"+next.getValue());
}
}
@Test
public void testMkdir() throws IOException {
boolean mkdirs = fs.mkdirs(new Path("/testMkdir"));
System.out.println(mkdirs);
fs.close();
}
@Test
public void testDelete() throws IOException {
//第二个参数表示是否递归删除
boolean delete = fs.delete(new Path("/testMkdir"), true);
System.out.println(delete);
}
/*
 * Recursively list files
 * With large amounts of data an iterator is returned, because an iterator does not need to hold all the data in memory
 */
@Test
public void testLs() throws IOException {
RemoteIterator<LocatedFileStatus> ls = fs.listFiles(new Path("/"), true);
while (ls.hasNext()){
LocatedFileStatus next = ls.next();
System.out.println("blocksize" + next.getBlockSize());
System.out.println("owner" + next.getOwner());
System.out.println("replication" + next.getReplication());
System.out.println("Permission" + next.getPermission());
System.out.println("name" + next.getPath().getName());
System.out.println("------------");
/*
 * Block location info for the file
 */
BlockLocation[] bl = next.getBlockLocations();
for (BlockLocation b:bl ){
String[] hosts = b.getHosts();
b.getOffset();
}
}
}
/*
 * Lists the directory contents without recursing
 */
@Test
public void testLs2() throws IOException {
FileStatus[] f = fs.listStatus(new Path("/"));
for (FileStatus file:f){
System.out.println(file.getPath().getName());
}
}
}
2. Accessing HDFS through streams
package hadoop.hdfs;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Before;
import org.junit.Test;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
/**
 * @program: bigdata
 * @package: hadoop.hdfs
 * @filename: HdfsStreamAccess.java
 * @create: 2019.09.23.09.21
 * @author: Administrator
 * @description: Operate on HDFS files through streams; this makes it possible to read data from a specified offset
 */
public class HdfsStreamAccess {
FileSystem fs=null;
Configuration conf=null;
@Before
public void init() throws Exception {
conf=new Configuration();
fs=FileSystem.get(new URI("hdfs://192.168.186.5:9000"),conf,"hadoop");
}
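/*
 * The original notes break off after this point. The two test methods below are a minimal
 * sketch (added, not from the original) of stream-based access: uploading through fs.create(),
 * and reading from a specified offset through fs.open() + seek(). Paths, file names, and the
 * offset value are placeholders.
 */
@Test
public void testStreamUpload() throws IOException {
// Copy a local file into HDFS through an output stream (overwrite if it exists)
FSDataOutputStream out = fs.create(new Path("/stream.txt"), true);
FileInputStream in = new FileInputStream("d:/up.txt");
IOUtils.copy(in, out);
in.close();
out.close();
}
/*
 * Read an HDFS file starting at a given offset and save it locally
 */
@Test
public void testRandomRead() throws IOException {
FSDataInputStream in = fs.open(new Path("/stream.txt"));
in.seek(12); // skip the first 12 bytes
FileOutputStream out = new FileOutputStream("d:/down.txt");
IOUtils.copy(in, out);
in.close();
out.close();
}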
}
5. Scheduled log-collection script
#!/bin/bash
#set java env
export JAVA_HOME=/home/hadoop/app/jdk1.7.0_51
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#set hadoop env
export HADOOP_HOME=/home/hadoop/app/hadoop-2.6.4
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
# Problems with version 1:
# The files get uploaded to the Hadoop cluster, but the original files are still there. How should they be handled?
# The log files are all named xxxx.log1; uploading them again fails because the file already exists on HDFS. How should that be handled?
# How the problems of version 1 are solved
# Directory where the log files are written
log_src_dir=/home/hadoop/logs/log/
# Staging directory for files waiting to be uploaded
log_toupload_dir=/home/hadoop/logs/toupload/
# Root path on HDFS where the log files are uploaded
hdfs_root_dir=/data/clickLog/20151226/
# Print environment variable info
echo "envs: hadoop_home: $HADOOP_HOME"
# Read the log directory and check whether there are files that need to be uploaded
echo "log_src_dir:"$log_src_dir
ls $log_src_dir | while read fileName
do
if [[ "$fileName" == access.log.* ]]; then
# if [ "access.log" = "$fileName" ];then
date=$(date +%Y_%m_%d_%H_%M_%S)
# Move the file to the staging directory and rename it
# Print info
echo "moving $log_src_dir$fileName to $log_toupload_dir"xxxxx_click_log_$fileName"$date"
mv $log_src_dir$fileName $log_toupload_dir"xxxxx_click_log_$fileName"$date
# Append the path of the file to be uploaded to a list file named willDoing
echo $log_toupload_dir"xxxxx_click_log_$fileName"$date >> $log_toupload_dir"willDoing."$date
fi
done
# Find the willDoing list files (skip those already marked _COPY_ or _DONE_)
ls $log_toupload_dir | grep will | grep -v "_COPY_" | grep -v "_DONE_" | while read line
do
# Print info
echo "toupload is in file:"$line
# Rename the willDoing list file to willDoing_COPY_
mv $log_toupload_dir$line $log_toupload_dir$line"_COPY_"
# Read the willDoing_COPY_ list file (one path per line); here $line is the path of one file waiting to be uploaded
cat $log_toupload_dir$line"_COPY_" | while read line
do
# Print info
echo "puting...$line to hdfs path.....$hdfs_root_dir"
hadoop fs -put $line $hdfs_root_dir
done
mv $log_toupload_dir$line"_COPY_" $log_toupload_dir$line"_DONE_"
done