“我背上有个背篓,里面装了很多血泪换来的经验教训,我看着你们在台下嗷嗷待哺想要这个背篓里的东西,但事实上我给不了你们”,实践出真知。
Ugly is easy to identify because the messes always have something in common, but not beauty. -- C++ 之父 Bjarne Stroustrup
代码质量与其整洁度成正比。 -- 《代码整洁之道》作者 Robert C. Martin
1.1 开发
要点1 :语义简单明确
// Returns false only when `value` is negative and is neither THROTL_UNSET
// nor THROTL_NO_LIMIT; returns true otherwise.
// This is the version before the rewrite; the negated compound condition is
// kept as-is because it is the readability problem being illustrated.
bool throttle_is_quota_valid(int64_t value)
{
// A complicated condition.
// Can you say within three seconds which values of `value` are valid?
if (value < 0 && value != THROTL_UNSET && value != THROTL_NO_LIMIT)
{
return false;
}
return true;
}
// Returns true when `value` is non-negative, or equals THROTL_UNSET, or
// equals THROTL_NO_LIMIT.
bool throttle_is_quota_valid(int64_t value)
{
// This is the revised code: the three cases in which `value` is valid are
// obvious at a glance.
return value >= 0 || value == THROTL_UNSET || value == THROTL_NO_LIMIT;
}
要点2 :简洁 ≠ 代码短
// Walks `fileStats`, builds a RecycleFile for each entry and keeps, per
// medium, the earliest non-zero deletion timestamp seen so far.
void RecycleBin::Load(BindCallbackR1<Status>* done)
{
    ......
    FOREACH(iter, fileStats)
    {
        RecycleFile item;
        Status status = ParseDeletedFileName(iter->path, &item.timestamp);
        // Fixed: the closing parenthesis of the condition was missing.
        if (!status.IsOk()) { ...... }
        item.fileName = iter->path;
        item.size = iter->size;
        // A file whose refCount is greater than 1 is counted with physical size 0.
        item.physicalSize = iter->refCount > 1 ? 0 : iter->physicalSize;
        ......
        // This is the code before the change: a single nested conditional
        // expression that is short but hard to read.
        // earliestTimestamp[item.medium] =
        // item.timestamp != 0 && item.timestamp < earliestTimestamp[item.medium] ?
        // item.timestamp : earliestTimestamp[item.medium];
        // }
        // This is the code after the change: longer, but the intent is explicit.
        if (item.timestamp != 0 &&
            item.timestamp < earliestTimestamp[item.medium])
        {
            earliestTimestamp[item.medium] = item.timestamp;
        }
    }
    ......
}
// Runs Check1, Check2 and Check3 in order, returning the first non-OK status;
// when all three pass, calls DoSomeRealWork() and returns OK.
// This variant chains the checks through nested if/else blocks.
Status Foo()
{
Status status = Check1();
if (!status.IsOk())
{
return status;
}
else
{
status = Check2();
if (!status.IsOk())
{
return status;
}
else
{
status = Check3();
if (!status.IsOk())
{
return status;
}
else
{
DoSomeRealWork();
return OK;
// The real work ends up nested four levels deep.
}
}
}
}
// Runs Check1, Check2 and Check3 in order, returning the first non-OK status;
// when all three pass, calls DoSomeRealWork() and returns OK.
// This variant uses early returns, so the main path stays at one nesting level.
Status Foo()
{
Status status = Check1();
if (!status.IsOk())
{
return status;
}
status = Check2();
if (!status.IsOk())
{
return status;
}
status = Check3();
if (!status.IsOk())
{
return status;
}
DoSomeRealWork();
return OK;
}
// RPC handler: validates the request with Check1 and Check2, stores the error
// code in `response` when a check fails, otherwise does the real work.
// Every exit path has to call done->Run() by hand (three call sites).
void Foo(RpcController* ctrl,
const FooRequest* request,
FooResponse* response,
Closure* done)
{
Status status = Check1(request);
if (!status.IsOk())
{
response->set_errorcode(status.Code());
// First call site
done->Run();
return;
}
status = Check2(request);
if (!status.IsOk())
{
response->set_errorcode(status.Code());
// Second call site
done->Run();
return;
}
DoSomeRealWork(...);
// Third call site
done->Run();
}
// RPC handler: validates the request with Check1 and Check2, stores the error
// code in `response` when a check fails, otherwise does the real work.
// The completion callback is handed to a scoped object once, so no exit path
// calls it explicitly.
void Foo(RpcController* ctrl,
const FooRequest* request,
FooResponse* response,
Closure* _done)
{
// Only one place, so no exit path can be forgotten.
// NOTE(review): presumably erpc::ScopedCallback runs `_done` when `done`
// goes out of scope - confirm against its definition.
erpc::ScopedCallback done(_done);
Status status = Check1(request);
if (!status.IsOk())
{
response->set_errorcode(status.Code());
return;
}
status = Check2(request);
if (!status.IsOk())
{
response->set_errorcode(status.Code());
return;
}
DoSomeRealWork(...);
}
// Sorts the sealed-file usage list and marks files for rewrite, in sorted
// order, until either the accumulated size or the number of files considered
// exceeds its configured limit.
void CompactTask::checkFileUtilizationRewrite()
{
    // A plain full sort is used here instead of a more efficient top-K algorithm.
    std::sort(sealedFilesUsage.begin(), sealedFilesUsage.end(), GarbageCollectionCompare);
    int64_t sealedFileMaxSize = INT64_FLAG(lsm_CompactionSealedMaxSize);
    int32_t sealedFileMaxNum = INT32_FLAG(lsm_CompactionSealedMaxFileNum);
    int64_t targetFileSize = 0;
    int32_t sourceFileCnt = 0;
    // The plain sort is simple and clear, and picking the first few out of a
    // few dozen files is not noticeably slow.
    FOREACH(itr, sealedFilesUsage)
    {
        LogicalFileId fileId = itr->fileId;
        const FileUsage* usage = baseMap->GetFileUsage(fileId);
        // The unused `const File* file = fileSet->GetFile(fileId);` lookup was
        // removed: nothing in this loop body read it.
        targetFileSize += usage->blocks * mBlockSize;
        sourceFileCnt++;
        // The file that pushes a limit over is itself not selected.
        if (targetFileSize > sealedFileMaxSize || sourceFileCnt > sealedFileMaxNum)
        {
            break;
        }
        mRewriteSealedFiles[fileId] = true;
    }
    ......
}
// Blocks (via coroutine sleep) until both mWriteQueue and mReadQueue are
// empty, logging the queue sizes and sleeping with a backoff delay between
// checks.
void UserRequestControl::WaitForPendingIOs()
{
erpc::ExponentialBackoff delayTimeBackOff;
// Backoff parameters (base, limit, scale factor) come from flags.
delayTimeBackOff.Reset(
INT64_FLAG(lsm_UnloadWaitingBackoffBaseUs),
INT64_FLAG(lsm_UnloadWaitingBackoffLimitUs),
INT64_FLAG(lsm_UnloadWaitingBackoffScaleFactor));
// Poll until the in-flight requests have returned.
// Exercise: think about how precise synchronization could be implemented
// with a condition variable instead.
while (!mWriteQueue.empty()|| !mReadQueue.empty())
{
uint64_t delayTime = delayTimeBackOff.Next();
PGLOG_INFO(sLogger,
(__FUNCTION__, "Waiting for inflight requests during segment unload")
("Segment", mSegment->GetName())
("Write Requests", mWriteQueue.size())
("ReadRequests", mReadQueue.size())
("DelayTimeInUs", delayTime));
easy_coroutine_usleep(delayTime); // back off and wait
}
}
pthread_mutex_t mutex;
pthread_cond_t nonEmptyCondition;
std::list<Task*> queue;

// Consumer loop: waits until `queue` is non-empty, then pops the first task
// and passes it to consume(). The loop never exits. `mutex` is held for the
// whole loop except while blocked inside pthread_cond_timedwait, so consume()
// runs with the mutex held.
void ConsumerLoop()
{
    pthread_mutex_lock(&mutex);
    while (true)
    {
        while (queue.empty())
        {
            // Uses a timed wait (re-checks the queue at least once per second).
            // pthread_cond_timedwait expects an ABSOLUTE deadline, so build it
            // as "now + 1s". The previous code passed the type name `timespec`
            // instead of a pointer and filled in a relative value.
            // CLOCK_REALTIME assumes the condition variable uses the default clock.
            struct timespec deadline;
            clock_gettime(CLOCK_REALTIME, &deadline);
            deadline.tv_sec += 1;
            pthread_cond_timedwait(&nonEmptyCondition, &mutex, &deadline);
        }
        Task* firstTask = queue.front();
        queue.pop_front();
        consume(firstTask);
    }
    // Not reached: the loop above has no exit.
    pthread_mutex_unlock(&mutex);
}
// load.cpp
// Executes the load task: an initial func() call followed by a fixed list of
// steps, each invoked through RUN_STEP.
Status LoadTask::Execute()
{
Status status;
status = func();
if (!status.IsOk()) { ... }
// Run the following steps serially
RUN_STEP(doPrepareDirs);
...... // a dozen or so more steps
RUN_STEP(doTask);
......
}
// files.cpp
// Starts one doSealFileForLiveDevice call per entry of mActiveFiles, then
// waits for every one of them. Returns OK when all succeeded; otherwise the
// status of the last failing seal in iteration order.
Status FileMap::SealFilesForLiveDevice()
{
    Status result = OK;
    std::vector<SyncClosureR1<Status>*> pendingSeals;
    // Deletes every closure stored in pendingSeals when this function returns.
    STLDeleteElementsGuard<std::vector<SyncClosureR1<Status>*> >
        pendingSealsGuard(&pendingSeals);

    // Seal every file in parallel: one coroutine per active file.
    FOREACH(entry, mActiveFiles)
    {
        File* file = entry->second;
        pendingSeals.push_back(new SyncClosureR1<Status>());
        BindCallbackR1<Status>* onSealed =
            static_cast<BindCallbackR1<Status>*>(pendingSeals.back());
        Closure* sealJob = stone::NewClosure(
            this,
            &FileMap::doSealFileForLiveDevice,
            file,
            onSealed);
        InvokeCoroutineInCurrentThread(sealJob);
    }

    // Collect the results; a later failure overwrites an earlier one.
    FOREACH(pending, pendingSeals)
    {
        (*pending)->Wait();
        if ((*pending)->GetResult0().IsOk())
        {
            continue;
        }
        result = (*pending)->GetResult0();
    }
    return result;
}
1)关键的数据结构,如 数据分片 结构 ;
// stream.h
// Stream class that carries a magic field (mObjectMagic) alongside its
// regular members.
class Stream
{
public:
Stream();
~Stream();
void Read(ReadArgs* args);
......
private:
// Added magic field.
// Usually a uint32 or uint64.
uint64_t mObjectMagic;
......
};
// stream.cpp
// Magic constant definition.
// Pick a value that reads as a recognizable string in a hexdump, so the object
// can be identified quickly when inspecting a coredump in gdb.
// This is the ASCII string "STREAM" ('S'=0x53 'T'=0x54 'R'=0x52 'E'=0x45
// 'A'=0x41 'M'=0x4d), laid out so that it reads in order in memory on a
// little-endian machine; the two most significant bytes are zero padding.
// (The previous value 0x4e4d474553564544 spelled "DEVSEGMN", not "STREAM".)
static const uint64_t STREAM_OBJECT_MAGIC = 0x00004d4145525453ULL;
// Constructor: stamps the object with STREAM_OBJECT_MAGIC.
Stream::Stream()
: mObjectMagic(STREAM_OBJECT_MAGIC) // assigned in the constructor
{
......
}
// Destructor: asserts the magic is still intact, then overwrites it.
Stream::~Stream()
{
// Check and then destroy the magic field in the destructor to guard against
// double-free errors: a second destruction fails the assertion.
easy_assert(mObjectMagic == STREAM_OBJECT_MAGIC);
// FREED_OBJECT_MAGIC is defined outside this listing.
mObjectMagic = FREED_OBJECT_MAGIC;
......
}
// Read entry point that validates the object's magic field before doing
// anything else.
// NOTE(review): this listing uses DeviceSegment / DEVICE_SEGMENT_OBJECT_MAGIC
// while the neighbouring listings declare class Stream / STREAM_OBJECT_MAGIC -
// confirm which naming is intended.
void DeviceSegment::Read(ReadArgs* args)
{
// Check the magic field in important functions to guard against
// use-after-free errors.
easy_assert(mObjectMagic == DEVICE_SEGMENT_OBJECT_MAGIC);
......
}
// Writer that keeps a fixed-size array of stream groups, each with its own
// write-attempt queues and sequence numbers.
class StreamWriter
{
public:
......
private:
// Per-group state: three write-attempt queues plus two sequence numbers.
// NOTE(review): presumably commitSeq is the sequence already committed and
// lastSeq the end of the last queued write - confirm against the writer logic.
struct StreamGroup
{
WriteAttemptList failureQueue;
WriteAttemptList inflightQueue;
WriteAttemptList pendingQueue;
uint64_t commitSeq;
uint64_t lastSeq;
};
// Number of entries of mStreamGroups that sanityCheck() iterates over.
uint32_t mStreamGroupCount;
StreamGroup mStreamGroups[STREAM_GROUP_COUNT];
......
};
void StreamWriter::sanityCheck()
{
for (uint32_t i = 0; i < mStreamGroupCount; i++)
{
// Check that sequence in "failureQueue", "inflightQueue" and "pendingQueue" are ordered.
const StreamGroup* group = &mStreamGroups[i];
uint64_t prevSeq = group->commitSeq;
const WriteAttemptList* queues[] = {
&group->failureQueue,
&group->inflightQueue,
&group->pendingQueue
};
for (size_t k = 0; k < easy_count_of(queues); k++)
{
FOREACH(iter, *queues[k])
{
const WriteRequest* write = iter->write;
PANGU_ASSERT(prevSeq <= write->seq); // SanityCheck
prevSeq = write->seq + write->lbaRange.rangeSize;
}
}
ASSERT(prevSeq == group->lastSeq); // SanityCheck
}
......
}
// Checks every file from FIRST_REAL_FILE_ID upwards. If a file's logical
// length on disk is shorter than the logical length recorded in the index, it
// logs a fatal message, bumps the critical-issue counter and returns
// LSM_FILE_CORRUPTED. Returns OK when no such file is found.
Status LoadTask::doTailScanFiles()
{
    ......
    // NOTE(review): the loop bound uses mFileSet while the lookup uses
    // mDiskFileSet - confirm that both are intended.
    for (id = FIRST_REAL_FILE_ID; id < mFileSet->GetTotalFileCount(); id++)
    {
        File* file = mDiskFileSet->GetFile(id);
        if (file->GetLogicalLength() < logicalLengthInIndex)
        {
            // Fixed: string literals were delimited by typographic quotes,
            // which are not valid C++ string delimiters.
            const char* msg = "BUG!! Found a data on disk with shorter length "
                              "than in map. This is probably caused by length reduction of "
                              "that file.";
            // Log in detail: file name, expected length, actual length, etc.
            PGLOG_FATAL(sLogger, (__FUNCTION__, msg)
                ("Stream", mStream->GetName())
                ("File", file->GetFileName())
                ("FileId", file->GetFileId())
                ("FileLengthOnDisk", file->GetFileLength())
                ("FileLengthInIndex", physicalLengthInIndex)
                ("LogicalLengthOnDisk", file->GetLogicalLength())
                ("LogicalLengthInIndex", logicalLengthInIndex)
                ("MissingSize", physicalLengthInIndex - file->GetFileLength()));
            SERVICE_ADD_COUNTER("LSM:CriticalIssueCount", 1); // triggers a phone alert
            return LSM_FILE_CORRUPTED;
        }
    }
    // Fixed: the success path previously fell off the end of a non-void function.
    return OK;
}
linux/include/linux/jiffies.h
/*
 * Have the 32 bit jiffies value wrap 5 minutes after boot
 * so jiffies wrap bugs show up earlier.
 */
/*
 * These inlines deal with timer wrapping correctly. You are
 * strongly encouraged to use them
 * 1. Because people otherwise forget
 * 2. Because if the timer wrap changes in future you won't have to
 *    alter your driver code.
 *
 * time_after(a,b) returns true if the time a is after time b.
 */
/*
 * Wrap-safe "a is after b" comparison: the unsigned difference (b) - (a) is
 * reinterpreted as signed, so the result stays correct across a counter wrap.
 * The #define line was missing from the listing, leaving a bare expression.
 */
#define time_after(a,b) \
    (typecheck(unsigned long, a) && \
     typecheck(unsigned long, b) && \
     ((long)((b) - (a)) < 0))
// easy/src/io/easy_timer.h
// ----------------------------------------------------------------------------------
// following interface, use easy_timer_sched from th(io thread or worker thread),
// ** DO NOT support async call **
//
int easy_timer_start_on_th(easy_baseth_t *th, easy_timer_t *timer);
int easy_timer_stop_on_th(easy_baseth_t *th, easy_timer_t *timer);
1.2 测试
// Stops and restarts server 0 while a benchmark is running, then asserts that
// IO actually made progress and that the maximum latency stays bounded.
TEST_F(..., SharedDisk_StopOneBs)(...)
{
BenchMarkStart(mOption);
// The fault is injected repeatedly in a for loop
mCluster->StopServer(0);
mCluster->StartServer(0);
// Before the fix the lower-bound assertions below did not exist, so the case
// passed even when every IO failed.
// After a shared disk was opened, a thread deadlock always hung IO; the test
// existed but had no assertion, so the bug slipped through and caused a P1
// incident.
EXPECT_GT(mIoBench->GetLastPrintIops(), 0);
EXPECT_GT(mIoBench->GetMaxLatency(), 0);
// Assertion on the upper bound
EXPECT_GT(20 * 1000000, mIoBench->GetMaxLatency());
// Do something below
}
// Registers a registrant. Rejects the request with SC_RESERVATION_CONFLICT
// once the number of registrants has reached MAX_REGISTRANT_NUM.
// (Signature fixed: a stray ')' preceded the parameter list.)
Status PRConfig::Register(...)
{
    assertIoThread();
    // Before the fix the '=' was missing from this comparison, which led to a
    // server crash.
    if (unlikely(mRegistrants.size() >= MAX_REGISTRANT_NUM))
    {
        LOG_ERROR(...);
        return SC_RESERVATION_CONFLICT;
    }
    // Do something below
}
// Creates WAL writers in a loop: each writer is created through mWalManager,
// opened, and - only when the open succeeded - committed.
void WalStreamWriterPool::tryCreateWalWriter()
{
    AssertCoroutine();
    ASSERT_DEBUG(mIsCreating);
    Status status = OK;
    while (...)
    {
        WalStreamWriter *writer = mWalManager->CreateWalWriter();
        status = writer->Open();
        // Before the fix this Commit step was missing; the unhandled commit
        // failure dropped the WAL file and, in turn, lost data.
        if (status.IsOk())
        {
            status = mWalManager->Commit();
        }
        // Do something below
    }
    // Fixed: the closing brace of the function was missing.
}
// Cancels the RPC bound to this controller. Without a session only an error
// is logged; with a pending RPC the session is flagged for cancellation;
// otherwise the session is cancelled right away.
void RPCController::StartCancel()
{
    if (!_session) {
        easy_error_log(...);
        return;
    }

    if (_pendingRpc == NULL) {
        easy_session_cancel(_session);
        return;
    }

    // Before the fix this assignment was missing, which hung the thread and
    // then IO. Calling StartCancel before the handshake was not covered by
    // tests.
    _session->need_cancel = true;
}
// Seals the underlying table file; on success marks the table as sealed and
// resets mEasyPool. Returns the failing status when the file seal fails.
Status CompressOffsetTable::Seal()
{
// Do something before
status = mTableFile->Seal();
if (!status.IsOk())
{
PGLOG_ERROR(...);
return status;
}
mIsSealed = true;
// Before the fix the reset below was missing: once the file has been fully
// written, clear the cache and release the memory.
mEasyPool.reset();
// Do something below
}
void ActiveManager::SubmitIO(
{
// 【版本兼容性】 SDK 和 Server线程不对齐,旧版本SDK不支持切线程
if (UNLIKELY(GetCurrentThread() != serverThread))
PGLOG_WARNING(... "Server thread mismatch");
response->ErrorCode = SERVER_BUSY;
done->Run();
}
// Asserts, for flat log files only, that the version of the first chunk info
// node does not exceed the master's chunk info version.
void ChunkListAccessor::SetChunkInfoAndLocations()
{
    uint8_t fileFlags = mFileNodePtr->fileFlags;
    bool isLogFile = IsFlatLogFile(fileFlags);
    // [Protocol compatibility] Master and SDK misjudge the fixed length in
    // abnormal scenarios.
    // Files that are not log files pass unconditionally.
    ASSERT(!isLogFile || vecChunkInfoNode[0].version <= masterChunkInfo.version);
    // Do something below
}
// 【API兼容性】 Server 和 Master的错误码不一致,数据分片反复加载/卸载
// Master侧,device_load.cpp
// if(status.Code() == LSM_SEGMENT_EXIST_OTHER_VERSION))
// Server侧,device_load.cpp
// return LSM_NOT_OWN_SEGMENT;
// Resizes the volume to twice DEVICE_SIZE, then retries a 4096-byte write at
// offset 0 every 100 ms until it succeeds.
TEST_F(FastPathSmokeTestFixture, Resize)
{
    // ... Do something
    ResizeVolume(uri, DEVICE_SIZE * 2);
    Status status = OK;
    while (true)
    {
        // State dependency: whether the resize had succeeded was not checked,
        // so the write was wrongly treated as out-of-range IO.
        status = Write(handle, wbuf.get(), 0, 4096);
        if (status.Code() == OK)
        {
            break;
        }
        easy_coroutine_usleep(100*1000);
    }
    // ... Do something
}
// volume_iov_split_test.cpp
// Random iov-split test; draws a random total length in [1, 10 MiB].
TEST(VolumeIovSplitTest, Iovsplit_Random)
{
    // ... Do something
    // Before the change there was no "+ 1": 0 is an invalid random value and
    // made the case fail with low probability.
    // Fixed: the statement was missing its terminating semicolon.
    size_t totalLength = rand() % (10*1024*1024) + 1;
    // ... Do something
}
2.1 Docker单机集群
2.2 本地出包自助E2E
3.1 编写测试样例
3.2 代码门禁说明
4.1 For Submitter
4.2 For Reviewer
5.1 主干开发
列宁:帝国主义是资本主义的最高阶段
南门:主干开发(trunk based development)是持续集成(continuous integration)的最高阶段
5.2 主干/分支发布
6.1 测试脚手架
6.2 环境标准化
7.1 git-poison投毒
7.2 poison发布阻塞
8.1 从开发到上线
8.2 分模块发布
9.1 效率工具和方法
文档类:语雀的在线UML图/流程图/里程碑方便多人共同编辑等、Teambition的项目管理甘特图、 Aone的需求管理和缺陷管理、 离线工具诸如Xmind思维导图/draw.io 流程图/OneNote。
T:Time-bound,有明确的截止期限
Trade-off: 优势和劣势是什么?带来了哪些挑战?
系统实现: 组成部分和关键实现,核心思想和核心组件,灵魂在哪里?
Related Works: 这个问题上还有什么其他的工作?相关系统对比?不同的实现、不同的侧重、不同的思路?
9.2 个人成长和踩坑
代码提交尽量做到原子,即不可分割的特性、修复或者优化,测试代码同生产代码同一个patch提交
实现一个可以运行起来的脚手架,再持续添加内容
新人往往脚踏实地,忘记了仰望星空,只顾着埋头苦干,不思考背后的业务价值,这一锄头,那一铁锹,遍地都是坑,就是不开花,费时费力,成就感低。
Good Case:「在100g网络标卡CX6验证性能时,8 jobs 32 depth iosize 4K场景下,极限IOPS从120万下降至110万,与FIC卡相比性能存在8%差异」
日拱一卒,功不唐捐,共勉。
参考链接:
[1] 接近不可接受的负载边界
https://www.usenix.org/conference/srecon18americas/presentation/schwartz
[2] Software Engineering at Google
https://qiangmzsx.github.io/Software-Engineering-at-Google/#/zh-cn/Chapter-12_Unit_Testing/Chapter-12_Unit_Testing
[3] 测试左移在大型分布式系统中的工程实践
https://mp.weixin.qq.com/s/DSsscC_5ldOTCTbW6u-ubw
[4] Best Practices for Code Review
https://smartbear.com/learn/code-review/best-practices-for-peer-code-review/
[5] Design Docs at Google
https://www.industrialempathy.com/posts/design-docs-at-google/
[6] 提问的智慧
https://github.com/ryanhanwu/How-To-Ask-Questions-The-Smart-Way/blob/main/README-zh_CN.md
《阿里云存储白皮书》
随着阿里云的崛起,集团内部的各种技术开始以阿里云作为唯一出口,阿里云成为阿里巴巴经济体的技术底座,阿里云的“盘古”存储也成为阿里巴巴经济体的存储底盘。用“稳定安全高性能,普惠智能新存储”来形容这本白皮书的内涵最为恰当不过了。“不畏浮云遮望眼,自缘身在最高层。”基于盘古的阿里云存储必将继续引领全球产业进入未来的“新存储”大时代。
点击阅读原文查看详情。