Example code for writing a file to HDFS:
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(file), conf);
Path path = new Path(file);
FSDataOutputStream out = fs.create(path); // create the file
out.write("hello".getBytes("UTF-8"));
out.close();
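For completeness, a fully runnable version of the same snippet, as a minimal sketch (the URI is illustrative; try-with-resources guarantees close() runs even on error, and StandardCharsets.UTF_8 avoids the checked UnsupportedEncodingException):
import java.net.URI;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsWriteExample {
    public static void main(String[] args) throws Exception {
        String file = "hdfs://localhost:9000/tmp/hello.txt"; // illustrative URI
        Configuration conf = new Configuration();
        // try-with-resources closes the stream, which triggers the flush path traced below
        try (FileSystem fs = FileSystem.get(URI.create(file), conf);
             FSDataOutputStream out = fs.create(new Path(file))) {
            out.write("hello".getBytes(StandardCharsets.UTF_8));
        }
    }
}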
Now trace the call out.write("hello".getBytes("UTF-8")). FSDataOutputStream's write method first invokes the inherited FilterOutputStream write method:
// FilterOutputStream
public void write(byte b[]) throws IOException {
    write(b, 0, b.length);
}
Execution then enters DataOutputStream:
public synchronized void write(byte b[], int off, int len)
        throws IOException
{
    out.write(b, off, len);
    incCount(len);
}
From there it reaches FSDataOutputStream's inner class PositionCache:
public void write(byte[] b, int off, int len) throws IOException {
    this.out.write(b, off, len);
    this.position += (long)len;
    if (this.statistics != null) {
        this.statistics.incrementBytesWritten((long)len);
    }
}
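PositionCache is just a counting decorator: it delegates the real write and then bumps a byte counter so getPos() never has to ask the underlying stream. A minimal sketch of the same pattern (hypothetical class, not Hadoop code):
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Hypothetical sketch of the PositionCache idea: a pass-through wrapper
// that records how many bytes flowed through, so position lookups are O(1).
class CountingOutputStream extends FilterOutputStream {
    private long position;

    CountingOutputStream(OutputStream out) { super(out); }

    @Override
    public void write(int b) throws IOException {
        out.write(b);             // delegate the real work
        position++;               // then account for it
    }

    @Override
    public void write(byte[] b, int off, int len) throws IOException {
        out.write(b, off, len);
        position += len;
    }

    public long getPos() { return position; }
}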
This lands in the write method of FSOutputSummer, the parent class of DFSOutputStream (DFSOutputStream leaves some of these methods unimplemented and relies on the parent class directly):
@Override
public synchronized void write(byte b[], int off, int len)
        throws IOException {
    checkClosed();
    if (off < 0 || len < 0 || off > b.length - len) {
        throw new ArrayIndexOutOfBoundsException();
    }
    // A neat loop: each iteration offers all remaining data to write1,
    // and write1 consumes at most buf.length bytes per call, 4608 by
    // default (9 chunks of 512 bytes), so n advances until done.
    for (int n = 0; n < len; n += write1(b, off + n, len - n)) {
    }
}
Stepping into FSOutputSummer's write1 method:
private int write1(byte b[], int off, int len) throws IOException {
    if (count == 0 && len >= buf.length) {
        // local buffer is empty and user buffer size >= local buffer size, so
        // simply checksum the user buffer and send it directly to the underlying
        // stream
        final int length = buf.length;
        writeChecksumChunks(b, off, length);
        return length;
    }
    // Our write is only five bytes, so execution takes this path.
    // copy user data to local buffer
    int bytesToCopy = buf.length - count;
    bytesToCopy = (len < bytesToCopy) ? len : bytesToCopy;
    System.arraycopy(b, off, buf, count, bytesToCopy);
    count += bytesToCopy;
    // count is only 5 here, so nothing is flushed.
    // Normally at most 9 chunks are written out at a time, unless a
    // forced flush intervenes.
    if (count == buf.length) {
        // local buffer is full
        flushBuffer();
    }
    return bytesToCopy;
}
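To make the interplay between the outer loop and write1 concrete, here is a standalone simulation with hypothetical numbers (plain Java, not Hadoop code), draining a 10000-byte user buffer against the default 4608-byte local buffer:
public class Write1LoopTrace {
    static final int BUF = 4608;           // 9 chunks * 512 bytes
    static int count = 0;                  // bytes sitting in the local buffer

    // Mimics write1's two paths: full-buffer fast path vs. partial copy.
    static int write1(int len) {
        if (count == 0 && len >= BUF) {
            return BUF;                    // checksummed and sent directly
        }
        int copied = Math.min(len, BUF - count);
        count += copied;
        if (count == BUF) count = 0;       // buffer full -> flushBuffer()
        return copied;
    }

    public static void main(String[] args) {
        int len = 10000, n = 0;
        while (n < len) {                  // same shape as the loop in write(b, off, len)
            int consumed = write1(len - n);
            n += consumed;
            System.out.println("consumed " + consumed + ", total " + n + ", buffered " + count);
        }
    }
}
It prints three iterations, consuming 4608, 4608, then 784 bytes, with the final 784 left buffered. Our 5-byte "hello" likewise stays parked in buf.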
What actually triggers the write to the pipeline is:
out.close();
Besides flushing, close also validates the data blocks and finally releases the lease. The DFSOutputStream close implementation:
@Override
public synchronized void close() throws IOException {
    if (closed) {
        IOException e = lastException.getAndSet(null);
        if (e == null)
            return;
        else
            throw e;
    }
    try {
        // flush first
        flushBuffer(); // flush from all upper layers
        if (currentPacket != null) {
            waitAndQueueCurrentPacket();
        }
        if (bytesCurBlock != 0) {
            // send an empty packet to mark the end of the block
            currentPacket = createPacket(0, 0, bytesCurBlock, currentSeqno++);
            currentPacket.lastPacketInBlock = true;
            currentPacket.syncBlock = shouldSyncBlock;
        }
        flushInternal(); // flush all data to Datanodes
        // get last block before destroying the streamer
        ExtendedBlock lastBlock = streamer.getBlock();
        closeThreads(false);
        // The client sends a completeFile request to the NameNode, which
        // verifies that the block's BlockPoolId is correct.
        completeFile(lastBlock);
        // release the file lease (endFileLease)
        dfsClient.endFileLease(fileId);
    } catch (ClosedChannelException e) {
    } finally {
        closed = true;
    }
}
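Note that close() is the only thing that ships our 5 bytes in this example. If data must be visible or durable before close, FSDataOutputStream also exposes the Syncable calls hflush()/hsync(), which drive the same flush path; a minimal sketch:
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DurableWrite {
    // Sketch: make the bytes durable before continuing, then close.
    static void writeDurably(FileSystem fs, Path path) throws IOException {
        try (FSDataOutputStream out = fs.create(path)) {
            out.write("hello".getBytes(StandardCharsets.UTF_8));
            out.hflush(); // queue and send the current packet; new readers can see it
            out.hsync();  // additionally ask the DataNodes to sync to disk
        }                 // close() then sends the last packet, completeFile, lease release
    }
}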
Back in close, the first call is flushBuffer:
protected synchronized void flushBuffer() throws IOException {
    this.flushBuffer(false, true);
}
Stepping into its implementation:
protected synchronized int flushBuffer(boolean keep, boolean flushPartial) throws IOException {
    int bufLen = this.count;
    int partialLen = bufLen % this.sum.getBytesPerChecksum();
    int lenToFlush = flushPartial ? bufLen : bufLen - partialLen;
    if (lenToFlush != 0) {
        this.writeChecksumChunks(this.buf, 0, lenToFlush);
        if (flushPartial && !keep) {
            this.count = 0;
        } else {
            this.count = partialLen;
            System.arraycopy(this.buf, bufLen - this.count, this.buf, 0, this.count);
        }
    }
    return this.count - (bufLen - lenToFlush);
}
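Plugging our 5-byte write into this method, the arithmetic is tiny; a worked trace with the values inlined (plain Java, not Hadoop code):
public class FlushBufferTrace {
    public static void main(String[] args) {
        int bufLen = 5;                       // count: only "hello" is buffered
        int bytesPerChecksum = 512;
        boolean keep = false, flushPartial = true;   // the close() call path
        int partialLen = bufLen % bytesPerChecksum;  // 5: a partial chunk
        int lenToFlush = flushPartial ? bufLen : bufLen - partialLen; // 5
        int count = (flushPartial && !keep) ? 0 : partialLen;         // 0
        System.out.println("flush " + lenToFlush + " bytes, "
            + (count == 0 ? "buffer emptied" : count + " bytes kept"));
    }
}
So all 5 bytes go out as one partial chunk and the buffer is emptied; with keep=true (as hflush uses), the partial chunk would instead be copied back to the front of buf.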
flushBuffer in turn calls writeChecksumChunks:
private void writeChecksumChunks(byte[] b, int off, int len) throws IOException {
    this.sum.calculateChunkedSums(b, off, len, this.checksum, 0);
    for (int i = 0; i < len; i += this.sum.getBytesPerChecksum()) {
        int chunkLen = Math.min(this.sum.getBytesPerChecksum(), len - i);
        int ckOffset = i / this.sum.getBytesPerChecksum() * this.getChecksumSize();
        this.writeChunk(b, off + i, chunkLen, this.checksum, ckOffset, this.getChecksumSize());
    }
}
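So the buffer is checksummed in 512-byte chunks, each contributing a 4-byte checksum at slot i/bytesPerChecksum. Purely for illustration, here is the same chunking loop using java.util.zip.CRC32 (HDFS actually defaults to CRC32C via DataChecksum):
import java.util.zip.CRC32;

public class ChunkedChecksums {
    public static void main(String[] args) {
        byte[] b = new byte[1300];            // e.g. 3 chunks: 512 + 512 + 276
        int bytesPerChecksum = 512, checksumSize = 4;
        for (int i = 0; i < b.length; i += bytesPerChecksum) {
            int chunkLen = Math.min(bytesPerChecksum, b.length - i);
            int ckOffset = i / bytesPerChecksum * checksumSize; // checksum slot
            CRC32 crc = new CRC32();          // stand-in for CRC32C
            crc.update(b, i, chunkLen);
            System.out.printf("chunk@%d len=%d -> checksum slot %d value=%08x%n",
                i, chunkLen, ckOffset, crc.getValue());
        }
    }
}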
We finally arrive at the subclass DFSOutputStream's writeChunk method:
// writeChunk writes only one chunk at a time
protected synchronized void writeChunk(byte[] b, int offset, int len, byte[] checksum, int ckoff, int cklen) throws IOException {
    this.dfsClient.checkOpen();
    this.checkClosed();
    if (len > this.bytesPerChecksum) {
        throw new IOException("writeChunk() buffer size is " + len + " is larger than supported bytesPerChecksum " + this.bytesPerChecksum);
    } else if (cklen != 0 && cklen != this.getChecksumSize()) {
        throw new IOException("writeChunk() checksum size is supposed to be " + this.getChecksumSize() + " but found to be " + cklen);
    } else {
        // currentPacket is a field; it is reset to null once queued
        if (this.currentPacket == null) {
            this.currentPacket = this.createPacket(this.packetSize, this.chunksPerPacket, this.bytesCurBlock, (long)(this.currentSeqno++));
            if (DFSClient.LOG.isDebugEnabled()) {
                DFSClient.LOG.debug("DFSClient writeChunk allocating new packet seqno=" + this.currentPacket.seqno + ", src=" + this.src + ", packetSize=" + this.packetSize + ", chunksPerPacket=" + this.chunksPerPacket + ", bytesCurBlock=" + this.bytesCurBlock);
            }
        }
        // From here the packet is filled in.
        // Write the checksum, which occupies 4 bytes (cklen = 4).
        // "Writing" really just means copying into the packet's buffer.
        this.currentPacket.writeChecksum(checksum, ckoff, cklen);
        this.currentPacket.writeData(b, offset, len);
        ++this.currentPacket.numChunks;
        this.bytesCurBlock += (long)len;
        if (this.currentPacket.numChunks == this.currentPacket.maxChunks || this.bytesCurBlock == this.blockSize) {
            if (DFSClient.LOG.isDebugEnabled()) {
                DFSClient.LOG.debug("DFSClient writeChunk packet full seqno=" + this.currentPacket.seqno + ", src=" + this.src + ", bytesCurBlock=" + this.bytesCurBlock + ", blockSize=" + this.blockSize + ", appendChunk=" + this.appendChunk);
            }
            this.waitAndQueueCurrentPacket();
            if (this.appendChunk && this.bytesCurBlock % (long)this.bytesPerChecksum == 0L) {
                this.appendChunk = false;
                this.resetChecksumBufSize();
            }
            if (!this.appendChunk) {
                int psize = Math.min((int)(this.blockSize - this.bytesCurBlock), this.dfsClient.getConf().writePacketSize);
                this.computePacketChunkSize(psize, this.bytesPerChecksum);
            }
            // Unless the block boundary has been reached, the following does not execute.
            if (this.bytesCurBlock == this.blockSize) {
                this.currentPacket = this.createPacket(0, 0, this.bytesCurBlock, (long)(this.currentSeqno++));
                this.currentPacket.lastPacketInBlock = true;
                this.currentPacket.syncBlock = this.shouldSyncBlock;
                this.waitAndQueueCurrentPacket();
                this.bytesCurBlock = 0L;
                this.lastFlushOffset = 0L;
            }
        }
    }
}
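Worth a look is how packetSize and chunksPerPacket come out of computePacketChunkSize. A back-of-the-envelope sketch with the usual defaults (64 KB dfs.client-write-packet-size, 512-byte chunks, 4-byte CRC; the packet header is ignored here for simplicity):
public class PacketSizing {
    public static void main(String[] args) {
        int writePacketSize = 65536;          // dfs.client-write-packet-size default
        int bytesPerChecksum = 512;           // dfs.bytes-per-checksum default
        int checksumSize = 4;                 // one CRC per chunk
        int chunkSize = bytesPerChecksum + checksumSize;              // 516
        int chunksPerPacket = Math.max(writePacketSize / chunkSize, 1); // 127
        System.out.println("chunksPerPacket = " + chunksPerPacket
            + ", packet body = " + chunksPerPacket * chunkSize + " bytes");
    }
}
So a full packet carries roughly 127 chunks, about 63.5 KB of data plus checksums.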
writeData simply copies the data into the designated position in buf: the checksums are all placed together at the front, and the data itself goes at the end.
void writeData(byte[] inarray, int off, int len) {
    if (dataPos + len > buf.length) {
        throw new BufferOverflowException();
    }
    System.arraycopy(inarray, off, buf, dataPos, len);
    dataPos += len;
}
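The packet buffer, then, is a single byte[] laid out as [header][checksums][data] with independent write cursors. A hypothetical miniature of that layout discipline (not the real Packet class):
public class MiniPacket {
    final byte[] buf;
    int checksumPos;       // next free byte in the checksum region
    int dataPos;           // next free byte in the data region

    MiniPacket(int headerLen, int checksumLen, int dataLen) {
        buf = new byte[headerLen + checksumLen + dataLen];
        checksumPos = headerLen;                // checksums right after the header
        dataPos = headerLen + checksumLen;      // data after all checksums
    }

    void writeChecksum(byte[] src, int off, int len) {
        System.arraycopy(src, off, buf, checksumPos, len);
        checksumPos += len;
    }

    void writeData(byte[] src, int off, int len) {
        if (dataPos + len > buf.length) throw new java.nio.BufferOverflowException();
        System.arraycopy(src, off, buf, dataPos, len);
        dataPos += len;
    }
}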
Back in the close method: although the buffered data never met the send condition (this.currentPacket.numChunks == this.currentPacket.maxChunks || this.bytesCurBlock == this.blockSize), waitAndQueueCurrentPacket is triggered anyway, since the stream is shutting down. The DFSOutputStream waitAndQueueCurrentPacket implementation:
private void waitAndQueueCurrentPacket() throws IOException {
    synchronized (dataQueue) {
        try {
            // If queue is full, then wait till we have enough space
            while (!closed && dataQueue.size() + ackQueue.size() > dfsClient.getConf().writeMaxPackets) {
                try {
                    dataQueue.wait();
                } catch (InterruptedException e) {
                    // on interruption, re-interrupt the current thread
                    Thread.currentThread().interrupt();
                    break;
                }
            }
            checkClosed();
            // enqueue the current packet
            queueCurrentPacket();
        } catch (ClosedChannelException e) {
        }
    }
}
private void queueCurrentPacket() {
    synchronized (dataQueue) {
        if (currentPacket == null) return;
        // append to the tail of the queue
        dataQueue.addLast(currentPacket);
        lastQueuedSeqno = currentPacket.seqno;
        if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Queued packet " + currentPacket.seqno);
        }
        // clear the reference
        currentPacket = null;
        // wake up any waiting threads
        dataQueue.notifyAll();
    }
}
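dataQueue/ackQueue plus wait/notifyAll form a classic bounded producer-consumer handoff between writeChunk (the producer) and the DataStreamer thread (the consumer). A stripped-down sketch of that handoff (names illustrative; the real code also moves packets onto ackQueue to await DataNode acknowledgements):
import java.util.ArrayDeque;
import java.util.Deque;

public class MiniStreamer {
    private final Deque<byte[]> dataQueue = new ArrayDeque<>();
    private final int maxPackets = 80;       // cf. writeMaxPackets above

    void queuePacket(byte[] packet) throws InterruptedException {
        synchronized (dataQueue) {
            while (dataQueue.size() >= maxPackets) {
                dataQueue.wait();            // backpressure: the writer blocks
            }
            dataQueue.addLast(packet);
            dataQueue.notifyAll();           // wake the streamer thread
        }
    }

    byte[] takePacket() throws InterruptedException {
        synchronized (dataQueue) {
            while (dataQueue.isEmpty()) {
                dataQueue.wait();            // the streamer waits for data
            }
            byte[] p = dataQueue.removeFirst();
            dataQueue.notifyAll();           // unblock a waiting writer
            return p;
        }
    }
}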
General notes on HDFS file-stream write operations