The core logic lives in the write method in dbms/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp:
void ReplicatedMergeTreeBlockOutputStream::write(const Block & block)
{
    last_block_is_duplicate = false;

    /// Delay (or reject) the insert if there are too many parts pending merge
    storage.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event);

    /// Get a ZooKeeper connection and make sure the session has not expired
    auto zookeeper = storage.getZooKeeper();
    assertSessionIsNotExpired(zookeeper);

    /// If a write quorum is configured, verify its preconditions first
    if (quorum)
        checkQuorumPrecondition(zookeeper);

    /// Split the block into sub-blocks, one per partition; max_parts_per_block caps their number
    auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block);
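    /// Example (illustrative): if an INSERT contains rows for partitions 202001 and 202002,
    /// part_blocks holds two entries, and the loop below turns each one into its own part.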
    for (auto & current_block : part_blocks)
    {
        Stopwatch watch;

        /// Write current_block into a temporary part and compute its checksums
        MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block);

        String block_id;

        /// If deduplication is enabled, derive a deterministic block_id from the part's data checksums
        if (deduplicate)
        {
            SipHash hash;
            part->checksums.computeTotalChecksumDataOnly(hash);
            union
            {
                char bytes[16];
                UInt64 words[2];
            } hash_value;
            hash.get128(hash_value.bytes);

            /// block_id = partition id + the two 64-bit halves of the 128-bit SipHash
            block_id = part->info.partition_id + "_" + toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]);
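            /// Illustrative value: "202001_10498818097400324493_5997217350376184186".
            /// The id is content-derived, so re-inserting identical data reproduces it exactly.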
            LOG_DEBUG(log, "Wrote block with ID '" << block_id << "', " << block.rows() << " rows");
        }
        else
        {
            LOG_DEBUG(log, "Wrote block with " << block.rows() << " rows");
        }

        try
        {
            /// Commit the part: register it in ZooKeeper and add it to the local working set
            commitPart(zookeeper, part, block_id);

            /// Record the insert in the part log, flagging it if it was deduplicated
            int error = (deduplicate && last_block_is_duplicate) ? ErrorCodes::INSERT_WAS_DEDUPLICATED : 0;
            PartLog::addNewPart(storage.global_context, part, watch.elapsed(), ExecutionStatus(error));
        }
        catch (...)
        {
            PartLog::addNewPart(storage.global_context, part, watch.elapsed(), ExecutionStatus::fromCurrentException(__PRETTY_FUNCTION__));
            throw;
        }
    }
}
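To make the dedup key concrete, here is a minimal standalone sketch of how such an id can be assembled from a 128-bit digest. The helper make_block_id and the hard-coded digest are illustrative stand-ins, not ClickHouse code; the real implementation hashes the part's data checksums with SipHash, as shown above.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>

/// Hypothetical helper mirroring the union trick in write():
/// reinterpret a 16-byte digest as two UInt64 words and join them with the partition id.
static std::string make_block_id(const std::string & partition_id, const unsigned char (&digest)[16])
{
    uint64_t words[2];
    std::memcpy(words, digest, sizeof(words));  /// words[0] aliases bytes 0..7, words[1] bytes 8..15
    return partition_id + "_" + std::to_string(words[0]) + "_" + std::to_string(words[1]);
}

int main()
{
    /// An arbitrary digest standing in for SipHash(checksums); identical data
    /// always produces the same digest, hence the same block_id, hence dedup.
    unsigned char digest[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    std::cout << make_block_id("202001", digest) << "\n";
}

Because the id is derived from the content rather than generated randomly, re-inserting the exact same block maps to the same /blocks/<block_id> node in ZooKeeper, which is precisely what commitPart exploits below.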
Now let's look at the commitPart method. It is fairly long, so bear with it:
void ReplicatedMergeTreeBlockOutputStream::commitPart(zkutil::ZooKeeperPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, const String & block_id)
{
    /// Check column names and types, and make sure the ZooKeeper session is still alive
    storage.check(part->getColumns());
    assertSessionIsNotExpired(zookeeper);

    /// Allocate a block number and check for duplicates
    bool deduplicate_block = !block_id.empty();
    String block_id_path = deduplicate_block ? storage.zookeeper_path + "/blocks/" + block_id : "";
    auto block_number_lock = storage.allocateBlockNumber(part->info.partition_id, zookeeper, block_id_path);
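    /// If the /blocks/<block_id> node already exists, allocateBlockNumber returns no lock;
    /// that is what the duplicate branch below reacts to.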
    /// Duplicate block: log it, mark the part, bump the profile event, and return without inserting
    if (!block_number_lock)
    {
        LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it.");
        part->is_duplicate = true;
        last_block_is_duplicate = true;
        ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
        return;
    }

    /// Not a duplicate: proceed with the commit
    Int64 block_number = block_number_lock->getNumber();

    /// Fill in the part's name and details from the allocated block number
    part->info.min_block = block_number;
    part->info.max_block = block_number;
    part->info.level = 0;

    String part_name = part->getNewName(part->info);
    part->name = part_name;

    /// Build the replication log entry that tells the other replicas to fetch this part
    StorageReplicatedMergeTree::LogEntry log_entry;
    log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART;
    log_entry.create_time = time(nullptr);
    log_entry.source_replica = storage.replica_name;
    log_entry.new_part_name = part_name;
    log_entry.quorum = quorum;
    log_entry.block_id = block_id;

    /// Assemble the set of ZooKeeper operations to execute
    Coordination::Requests ops;
    storage.getCommitPartOps(ops, part, block_id_path);

    /// Create the log entry as a persistent sequential node under /log
    ops.emplace_back(zkutil::makeCreateRequest(
        storage.zookeeper_path + "/log/log-",
        log_entry.toString(),
        zkutil::CreateMode::PersistentSequential));
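    /// PersistentSequential makes ZooKeeper append a monotonically increasing suffix, so
    /// entries come out as log-0000000000, log-0000000001, ... and replicas replay them in order.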
    block_number_lock->getUnlockOps(ops);

    /// Quorum-related operations; skipped here, they do not affect the main flow
    if (quorum)
    {
        ......
    }

    /// Wrap the local part addition in a transaction so it can be committed or
    /// rolled back depending on what ZooKeeper reports below
    MergeTreeData::Transaction transaction(storage);

    /// Rename the temporary part to its final name and add it to the working set
    storage.renameTempPartAndAdd(part, nullptr, &transaction);

    Coordination::Responses responses;
    int32_t multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT
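    /// The multi-op is atomic on the ZooKeeper side: either every request in ops
    /// was applied or none was, and multi_code tells us which case we are in.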
    if (multi_code == Coordination::ZOK)
    {
        transaction.commit();

        /// Trigger the part-selection task that runs ahead of merges
        storage.merge_selecting_task->schedule();

        /// The unlock was already part of the multi-op (getUnlockOps), so just mark the lock released
        block_number_lock->assumeUnlocked();
    }
    else if (multi_code == Coordination::ZCONNECTIONLOSS
        || multi_code == Coordination::ZOPERATIONTIMEOUT)
    {
        /// Connection loss or timeout: the write may or may not have reached ZooKeeper.
        /// If it did, writing again would be a problem, so commit locally anyway...
        transaction.commit();

        /// ...and schedule a delayed check to verify whether the part was actually committed
        storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);

        /// The outcome is unknown, so surface that to the client
        throw Exception("Unknown status, client must retry. Reason: " + String(Coordination::errorMessage(multi_code)),
            ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
    }
    else if (Coordination::isUserError(multi_code))
    {
        String failed_op_path = zkutil::KeeperMultiException(multi_code, ops, responses).getPathForFirstFailedOp();

        if (multi_code == Coordination::ZNODEEXISTS && deduplicate_block && failed_op_path == block_id_path)
        {
            /// The block already exists: a concurrent insert won the race, so roll this one back
            LOG_INFO(log, "Block with ID " << block_id << " already exists; ignoring it (removing part " << part->name << ")");
            part->is_duplicate = true;
            transaction.rollback();
            last_block_is_duplicate = true;
            ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
        }
        else if (multi_code == Coordination::ZNODEEXISTS && failed_op_path == quorum_info.status_path)
        {
            /// Another replica has already started a quorum insert (the node exists), so roll back too
            transaction.rollback();
            throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
        }
        else
        {
            /// Any other user error: roll back
            transaction.rollback();
            throw Exception("Unexpected logical error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
                + zkutil::ZooKeeper::error2string(multi_code) + ", path " + failed_op_path,
                ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
        }
    }
    else if (Coordination::isHardwareError(multi_code))
    {
        /// Unrecoverable ZooKeeper/network error: roll back
        transaction.rollback();
        throw Exception("Unrecoverable network error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
            + zkutil::ZooKeeper::error2string(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
    }
    else
    {
        transaction.rollback();
        throw Exception("Unexpected ZooKeeper error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
            + zkutil::ZooKeeper::error2string(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
    }

    if (quorum)
    {
        /// Wait until the required number of replicas have the part
        LOG_TRACE(log, "Waiting for quorum");
        String quorum_status_path = storage.zookeeper_path + "/quorum/status";
        ......
        LOG_TRACE(log, "Quorum satisfied");
    }
}
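The error handling above reduces to a single decision table: keep the local part only when ZooKeeper definitely (ZOK) or possibly (connection loss / operation timeout) applied the multi-op, and roll back in every other case. Below is a condensed sketch of that table; ZkCode and the Transaction struct here are toy stand-ins for the real Coordination error codes and MergeTreeData::Transaction.

#include <stdexcept>

/// Toy stand-ins for the real Coordination codes and MergeTreeData::Transaction.
enum class ZkCode { Ok, ConnectionLoss, OperationTimeout, NodeExists, SessionExpired };

struct Transaction
{
    bool committed = false;
    void commit() { committed = true; }    /// keep the renamed part in the working set
    void rollback() { committed = false; } /// detach and drop the temporary part
};

/// Only a definite or *possible* ZooKeeper success keeps the local part.
void finalize(Transaction & txn, ZkCode code)
{
    if (code == ZkCode::Ok)
    {
        txn.commit();
    }
    else if (code == ZkCode::ConnectionLoss || code == ZkCode::OperationTimeout)
    {
        /// The multi-op may have been applied before the connection dropped;
        /// committing and re-checking the part later is safer than writing twice.
        txn.commit();
        throw std::runtime_error("Unknown status, client must retry");
    }
    else
    {
        /// Duplicate block, quorum conflict, or any other error: undo the local add.
        txn.rollback();
        throw std::runtime_error("Insert rejected or failed");
    }
}

int main()
{
    Transaction txn;
    try { finalize(txn, ZkCode::ConnectionLoss); } catch (...) { /* surfaced to the client */ }
    return txn.committed ? 0 : 1;   /// committed even though the outcome is unknown
}

The asymmetry is deliberate: when the outcome is unknown, the part is kept and re-verified later via enqueuePartForCheck, because blindly writing again could land the same block in ZooKeeper twice if the first attempt did succeed.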