目前分布式链路追踪系统基本都是根据谷歌的《Dapper大规模分布式系统的跟踪系统》这篇论文发展而来,主流的有zipkin,pinpoint,skywalking,cat,jaeger等。
论文链接:https://storage.googleapis.com/pub-tools-public-publication-data/pdf/36356.pdf
一.httpServer实现TracingHandler
// TracingHandler is an http middleware that extracts the upstream trace
// carrier from the request headers, starts a server-side span for the
// request, and injects the span into the request context before invoking
// the next handler.
func TracingHandler(next http.Handler) http.Handler {
	fn := func(w http.ResponseWriter, r *http.Request) {
		carrier, err := trace.Extract(trace.HttpFormat, r.Header)
		// ErrInvalidCarrier means no trace id was set in http header
		if err != nil && err != trace.ErrInvalidCarrier {
			logx.Error(err)
		}

		ctx, span := trace.StartServerSpan(r.Context(), carrier, sysx.Hostname(), r.RequestURI)
		defer span.Finish()

		next.ServeHTTP(w, r.WithContext(ctx))
	}
	return http.HandlerFunc(fn)
}
// StartServerSpan creates a server-side span from the incoming carrier and
// stores it in the returned context under tracespec.TracingKey so that
// downstream calls can fork child spans from it.
func StartServerSpan(ctx context.Context, carrier Carrier, serviceName, operationName string) (
	context.Context, tracespec.Trace) {
	span := newServerSpan(carrier, serviceName, operationName)
	ctx = context.WithValue(ctx, tracespec.TracingKey, span)
	return ctx, span
}
span的关键代码:如果header里无TraceID,则随机生成一个。
// newServerSpan builds a server-side Span. The trace id is taken from the
// carrier when present, otherwise a random id is generated; likewise the
// span id falls back to initSpanId when the carrier does not provide one.
func newServerSpan(carrier Carrier, serviceName, operationName string) tracespec.Trace {
	// fromCarrier returns a getter for key that yields "" on a nil carrier,
	// letting TakeWithPriority fall through to the next candidate.
	fromCarrier := func(key string) func() string {
		return func() string {
			if carrier == nil {
				return ""
			}
			return carrier.Get(key)
		}
	}

	traceId := stringx.TakeWithPriority(fromCarrier(traceIdKey), stringx.RandId)
	spanId := stringx.TakeWithPriority(fromCarrier(spanIdKey), func() string {
		return initSpanId
	})

	return &Span{
		ctx: spanContext{
			traceId: traceId,
			spanId:  spanId,
		},
		serviceName:   serviceName,
		operationName: operationName,
		startTime:     timex.Time(),
		flag:          serverFlag,
	}
}
二.rpcClient
往grpc自带的Interceptor拦截器中注入TracingInterceptor方法。该方法实现根据ctx来生成子span
// buildDialOptions assembles the grpc.DialOption list for the client: the
// built-in unary interceptor chain (tracing, duration, breaker, prometheus,
// timeout), then any user-supplied interceptors, and finally whatever dial
// options were passed via ClientOptions.
func (c *client) buildDialOptions(opts ...ClientOption) []grpc.DialOption {
	var clientOptions ClientOptions
	for _, applyOption := range opts {
		applyOption(&clientOptions)
	}

	options := []grpc.DialOption{
		grpc.WithInsecure(),
		grpc.WithBlock(),
		WithUnaryClientInterceptors(
			clientinterceptors.TracingInterceptor,
			clientinterceptors.DurationInterceptor,
			clientinterceptors.BreakerInterceptor,
			clientinterceptors.PrometheusInterceptor,
			clientinterceptors.TimeoutInterceptor(clientOptions.Timeout),
		),
	}
	// append user-registered interceptors after the built-in chain
	for _, userInterceptor := range c.interceptors {
		options = append(options, WithUnaryClientInterceptors(userInterceptor))
	}

	return append(options, clientOptions.DialOptions...)
}
// TracingInterceptor is a grpc unary client interceptor that forks a child
// span off any span found in ctx, copies the span's key/value pairs into
// the outgoing metadata so the server can continue the trace, and finishes
// the span when the call returns.
func TracingInterceptor(ctx context.Context, method string, req, reply interface{},
	cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
	ctx, span := trace.StartClientSpan(ctx, cc.Target(), method)
	defer span.Finish()

	// flatten the span's attributes into alternating key/value strings
	var pairs []string
	span.Visit(func(key, val string) bool {
		pairs = append(pairs, key, val)
		return true
	})

	return invoker(metadata.AppendToOutgoingContext(ctx, pairs...), method, req, reply, cc, opts...)
}
// StartClientSpan forks a client-side span from the span stored in ctx.
// When ctx carries no span, a no-op span is returned so callers never need
// a nil check.
func StartClientSpan(ctx context.Context, serviceName, operationName string) (context.Context, tracespec.Trace) {
	span, ok := ctx.Value(tracespec.TracingKey).(*Span)
	if !ok {
		return ctx, emptyNoopSpan
	}

	return span.Fork(ctx, serviceName, operationName)
}
// Fork derives a child client-side span from s: the trace id is inherited,
// a new child span id is generated via forkSpanId, and the child is stored
// in the returned context under tracespec.TracingKey.
func (s *Span) Fork(ctx context.Context, serviceName, operationName string) (context.Context, tracespec.Trace) {
	child := &Span{
		ctx: spanContext{
			traceId: s.ctx.traceId,
			spanId:  s.forkSpanId(),
		},
		serviceName:   serviceName,
		operationName: operationName,
		startTime:     timex.Time(),
		flag:          clientFlag,
	}
	return context.WithValue(ctx, tracespec.TracingKey, child), child
}
三.grpcServer
// UnaryTracingInterceptor returns a grpc unary server interceptor that
// restores the trace carried in the incoming metadata and starts a server
// span for the handled method. When metadata is absent or no valid carrier
// can be extracted, the handler simply runs untraced.
func UnaryTracingInterceptor(serviceName string) grpc.UnaryServerInterceptor {
	return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo,
		handler grpc.UnaryHandler) (resp interface{}, err error) {
		md, hasMetadata := metadata.FromIncomingContext(ctx)
		if !hasMetadata {
			// no incoming metadata: nothing to restore
			return handler(ctx, req)
		}

		carrier, extractErr := trace.Extract(trace.GrpcFormat, md)
		if extractErr != nil {
			// no usable carrier: run without tracing
			return handler(ctx, req)
		}

		ctx, span := trace.StartServerSpan(ctx, carrier, serviceName, info.FullMethod)
		defer span.Finish()
		return handler(ctx, req)
	}
}
四.最终效果
逻辑调用关系 httpServer->grpcClient1->grpcServer1->grpcClient2->grpcServer2
httpServer通过LogHandler中间件打印:
grpcServer1通过UnaryStatInterceptor打印:
grpcServer2同理打印
全链路的超时控制。如果在顶端控制10s超时,当grpcServer1耗时3s时,那么接下来的链路只能使用剩下的7s。实现思路也是基于Context的上下文传递,通过WithDeadline来控制。
一.httpServer设置TimeoutHandler中间件
// TimeoutHandler returns an http middleware that bounds request handling
// with http.TimeoutHandler using the given duration; a non-positive
// duration disables the timeout and passes the handler through unchanged.
func TimeoutHandler(duration time.Duration) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		if duration <= 0 {
			return next
		}

		return http.TimeoutHandler(next, duration, reason)
	}
}
二.grpcClient设置TimeoutInterceptor
// TimeoutInterceptor returns a grpc unary client interceptor that bounds
// every call by timeout, further shrunk by any deadline already present on
// the context. A non-positive timeout falls back to defaultTimeout.
func TimeoutInterceptor(timeout time.Duration) grpc.UnaryClientInterceptor {
	if timeout <= 0 {
		timeout = defaultTimeout
	}

	return func(ctx context.Context, method string, req, reply interface{}, cc *grpc.ClientConn,
		invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
		timedCtx, cancel := contextx.ShrinkDeadline(ctx, timeout)
		defer cancel()
		return invoker(timedCtx, method, req, reply, cc, opts...)
	}
}
三.grpcServer设置UnaryTimeoutInterceptor
// UnaryTimeoutInterceptor returns a grpc unary server interceptor that caps
// the handler's context deadline at timeout (or tighter, when the caller
// already attached a closer deadline).
func UnaryTimeoutInterceptor(timeout time.Duration) grpc.UnaryServerInterceptor {
	return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo,
		handler grpc.UnaryHandler) (resp interface{}, err error) {
		boundedCtx, cancel := contextx.ShrinkDeadline(ctx, timeout)
		defer cancel()
		return handler(boundedCtx, req)
	}
}
func ShrinkDeadline(ctx context.Context, timeout time.Duration) (context.Context, func()) {
if deadline, ok := ctx.Deadline(); ok {
leftTime := time.Until(deadline)
if leftTime < timeout {
timeout = leftTime
}
}
return context.WithDeadline(ctx, time.Now().Add(timeout))
}
B站的client实现思路
// Call invokes the named function, waits for it to complete, and returns its error status.
// The call is traced (continuing any trace found in ctx), guarded by a per-method
// circuit breaker, timed for stats, and bounded by the smaller of the configured
// timeout and any deadline already set on ctx.
func (c *Client) Call(ctx context.Context, serviceMethod string, args interface{}, reply interface{}) (err error) {
var (
ok bool
code string
rc *client
call *Call
cancel func()
t trace.Trace
timeout = time.Duration(c.timeout)
)
// fail fast when the underlying rpc client is unusable (ping failed)
if rc, ok = c.client.Load().(*client); !ok || rc == errClient {
xlog.Error("client is errClient (no rpc client) by ping addr(%s) error", c.addr)
return ErrNoClient
}
// trace: continue the caller's trace when present, otherwise start a new
// one, then fork a client span for this call
if t, ok = trace.FromContext(ctx); !ok {
t = trace.New(serviceMethod)
}
t = t.Fork(_family, serviceMethod)
t.SetTag(trace.String(trace.TagAddress, rc.remoteAddr))
defer t.Finish(&err)
// breaker: reject immediately when the per-method circuit is open; the
// deferred onBreaker reports the final outcome back to the breaker
brk := c.breaker.Get(serviceMethod)
if err = brk.Allow(); err != nil {
code = "breaker"
stats.Incr(serviceMethod, code)
return
}
defer c.onBreaker(brk, &err)
// stat: record call latency in ms and, when set, the final result code
now := time.Now()
defer func() {
stats.Timing(serviceMethod, int64(time.Since(now)/time.Millisecond))
if code != "" {
stats.Incr(serviceMethod, code)
}
}()
// timeout: get from conf
// if context > conf use conf else context
// deliver stays true only when the caller's deadline is absent or looser
// than the configured timeout; then a new (tighter) deadline is attached.
deliver := true
if deadline, ok := ctx.Deadline(); ok {
if ctimeout := time.Until(deadline); ctimeout < timeout {
timeout = ctimeout
deliver = false
}
}
if deliver {
ctx, cancel = context.WithTimeout(ctx, timeout)
defer cancel()
}
// propagate routing color and caller ip from the context metadata
color := metadata.String(ctx, metadata.Color)
remoteIP := metadata.String(ctx, metadata.RemoteIP)
// call
call = &Call{
ServiceMethod: serviceMethod,
Args: args,
Reply: reply,
Trace: t,
Color: color,
RemoteIP: remoteIP,
Timeout: timeout,
}
rc.Do(call)
// wait for either the call to complete or the context to expire
select {
case call = <-call.Done:
err = call.Error
code = ecode.Cause(err).Error()
case <-ctx.Done():
err = ecode.Deadline
code = "timeout"
}
return
}
B站的metadata数据
// Keys used to carry request-scoped metadata through the context.
const (
// Network
RemoteIP = "remote_ip"
RemotePort = "remote_port"
ServerAddr = "server_addr"
ClientAddr = "client_addr"
// Router
Color = "color"
// Trace
Trace = "trace"
Caller = "caller"
// Timeout
Timeout = "timeout"
// Dispatch
CPUUsage = "cpu_usage"
Errors = "errors"
Requests = "requests"
// Mirror
Mirror = "mirror"
// Mid
// user id of an external (public-facing) account
Mid = "mid"
// Uid
// user id (user_id) on the internal manager platform
Uid = "uid"
// Username
// username on the LDAP platform
Username = "username"
// Device
Device = "device"
// Cluster cluster info key
Cluster = "cluster"
)
由此可见,B站思路大致也是如此。附上毛剑老师的讲解视频
https://www.bilibili.com/video/BV1At411V7aT?p=5
带来的思考.用户信息等也可通过context传递。流量染色,限流熔断等也是类似思路。
go-micro的实现思路(interface)
Micro通过Wrapper实现了三种trace接口:awsxray、opencensus、opentracing。第一个是亚马逊AWS的X-Ray。
opentracing是一个开源的标准,提供对厂商中立的API,用来向应用程序添加追踪功能并将追踪数据发送到分布式的追踪系统,已经快成为行业标准了。
opencensus是谷歌开源的数据收集和分布式跟踪框架。OpenCensus也是实现了opentracing标准。OpenCensus 不仅提供规范,还提供开发语言的实现,和连接协议,而且它不仅只做追踪,还引入了额外的度量指标。opencensus也支持把数据导出到别的系统做分析。比如zipkin和Prometheus等
opencensus+zipkin
opentracing+zipkin
opentracing+Jaeger
注:zipkin 是 twitter 开源的分布式跟踪系统,并且具有UI界面来显示每个跟踪请求的状态。
docker run -d -p 9411:9411 openzipkin/zipkin
然后浏览器访问host:9411端口,即可看到zipkin的UI界面
我们使用opencensus的trace功能。也是实现了opentracing的标准。具体实现是由opencensus的trace来做的,然后通过zipkin的exporter把trace收集到的数据丢给zipkin。
1.1 需要引入的包
import (
...
"go.opencensus.io/trace"
"go.opencensus.io/exporter/zipkin"
wrapperTrace "github.com/micro/go-plugins/wrapper/trace/opencensus"
openzipkin "github.com/openzipkin/zipkin-go"
zipkinHTTP "github.com/openzipkin/zipkin-go/reporter/http"
...
)
1.2修改order微服务下的main.go
创建TraceBoot方法
// TraceBoot wires opencensus tracing to zipkin: spans are exported to the
// zipkin collector at apiURL and every request is sampled (AlwaysSample is
// fine for demos; use a probability sampler in production).
func TraceBoot() {
	apiURL := "http://192.168.0.111:9411/api/v2/spans"
	serviceName := "go.micro.srv.order"

	// Use the local hostname as the endpoint address. The error was silently
	// dropped before; it is non-fatal (NewEndpoint tolerates an empty
	// hostPort), but it should at least be logged.
	hostPort, err := os.Hostname()
	if err != nil {
		log.Printf("failed to resolve hostname: %v", err)
	}

	localEndpoint, err := openzipkin.NewEndpoint(serviceName, hostPort)
	if err != nil {
		log.Fatalf("Failed to create the local zipkinEndpoint: %v", err)
	}

	reporter := zipkinHTTP.NewReporter(apiURL)
	trace.RegisterExporter(zipkin.NewExporter(reporter, localEndpoint))
	trace.ApplyConfig(trace.Config{DefaultSampler: trace.AlwaysSample()})
}
在func main里引用
// boot trace
TraceBoot()
// New Service
service := grpc.NewService(
micro.Name("go.micro.srv.order"),
micro.Version("latest"),
micro.Broker(b),
micro.WrapHandler(wrapperTrace.NewHandlerWrapper()),
micro.WrapClient(wrapperTrace.NewClientWrapper()),
)
1.3修改order微服务下的handler
将原来handler里的上下文传进去。为了识别是同一个请求,需要把相同的traceId传过去,而traceId保存在上下文环境里。具体操作就是将原来代码里的context.TODO()替换成ctx即可。