与重载配置相同的是我们也需要通过信号来通知server重启,但关键在于平滑重启,如果只是简单的重启,只需要kill掉,然后再拉起即可。平滑重启意味着server升级的时候可以不用停止业务。
我们先来看下GitHub上有没有相应的库可以解决这个问题,然后找到了如下三个库:facebookgo/grace、fvbock/endless 和 jpillora/overseer。
我们分别来学习一下,下面只讲解http server的重启。
我们来分别使用这三个库来做平滑重启的事情,之后来对比其优缺点。
这三个库的官方都给了相应的例子,例子如下:
但三个库官方的例子不太一致,我们来统一一下:
我们参考官方的例子分别来写下用来对比的例子:
package main
import (
	"log"
	"net/http"
	"time"

	"github.com/facebookgo/grace/gracehttp"
)
// main serves the demo handler on ports 5001 and 5002 through gracehttp.
//
// gracehttp.Serve blocks while the servers run and returns an error when
// they cannot be started or stopped cleanly. That error is reported and
// turned into a non-zero exit status instead of being silently discarded, so
// a failed bind (for example, a port already in use) is visible to the
// operator.
func main() {
	err := gracehttp.Serve(
		&http.Server{Addr: ":5001", Handler: newGraceHandler()},
		&http.Server{Addr: ":5002", Handler: newGraceHandler()},
	)
	if err != nil {
		log.Fatalf("gracehttp: %v", err)
	}
}
// newGraceHandler returns a mux with a single route, /sleep.
//
// The route reads the "duration" form value, parses it with
// time.ParseDuration, sleeps for that long and then answers "Hello World".
// A value that does not parse is rejected with status 400 and the parse
// error as the response body.
func newGraceHandler() http.Handler {
	sleepThenGreet := func(rw http.ResponseWriter, req *http.Request) {
		wait, parseErr := time.ParseDuration(req.FormValue("duration"))
		if parseErr == nil {
			// Simulate a slow, in-flight request before replying.
			time.Sleep(wait)
			rw.Write([]byte("Hello World"))
			return
		}
		http.Error(rw, parseErr.Error(), http.StatusBadRequest)
	}

	router := http.NewServeMux()
	router.HandleFunc("/sleep", sleepThenGreet)
	return router
}
package main
import (
"log"
"net/http"
"os"
"sync"
"time"
"github.com/fvbock/endless"
"github.com/gorilla/mux"
)
// handler sleeps for the time given in the "duration" form value and then
// replies "Hello World". If the value cannot be parsed by
// time.ParseDuration, it replies with status 400 and the parse error.
func handler(w http.ResponseWriter, r *http.Request) {
	pause, parseErr := time.ParseDuration(r.FormValue("duration"))
	switch {
	case parseErr != nil:
		http.Error(w, parseErr.Error(), http.StatusBadRequest)
	default:
		// Keep the request in flight for the requested time.
		time.Sleep(pause)
		w.Write([]byte("Hello World"))
	}
}
// main serves the /sleep route on ports 5003 and 5004 through endless, one
// goroutine per port, and exits with status 0 once both servers have
// stopped.
func main() {
	router := mux.NewRouter()
	router.HandleFunc("/sleep", handler)

	var pending sync.WaitGroup

	// serve runs one endless server, logs the error it returns (if any) and
	// the given "stopped" message, then marks itself done.
	serve := func(addr, stoppedMsg string) {
		defer pending.Done()
		if serveErr := endless.ListenAndServe(addr, router); serveErr != nil {
			log.Println(serveErr)
		}
		log.Println(stoppedMsg)
	}

	pending.Add(2)
	go serve(":5003", "Server on 5003 stopped")
	go serve(":5004", "Server on 5004 stopped")
	pending.Wait()

	log.Println("All servers stopped. Exiting.")
	os.Exit(0)
}
package main
import (
"fmt"
"net/http"
"time"
"github.com/jpillora/overseer"
)
// BuildID identifies the build in the program's output. It defaults to "0"
// and is intended to be overridden at link time, e.g.
// go build -ldflags '-X main.BuildID=1'.
// See example.sh for the use-case.
var BuildID = "0"
// prog is the application body: convert your 'main()' into a 'prog(state)'.
// 'prog()' is run in a child process.
//
// It registers a catch-all handler on the default mux that sleeps for the
// "duration" form value (400 on a parse error) and then writes
// "Hello World" followed by a line naming the build and the state ID.
//
// The handler is served on every listener in state.Listeners, so each
// address configured in overseer.Config.Addresses accepts requests.
// Serving only state.Listener would leave the second address bound but
// never accepted. prog returns once every Serve call has returned, and
// each Serve error is printed rather than dropped.
func prog(state overseer.State) {
	fmt.Printf("app#%s (%s) listening...\n", BuildID, state.ID)
	http.Handle("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		duration, err := time.ParseDuration(r.FormValue("duration"))
		if err != nil {
			http.Error(w, err.Error(), 400)
			return
		}
		time.Sleep(duration)
		w.Write([]byte("Hello World"))
		fmt.Fprintf(w, "app#%s (%s) says hello\n", BuildID, state.ID)
	}))

	// One Serve goroutine per listener; the buffered channel lets each of
	// them report its result without blocking.
	errc := make(chan error, len(state.Listeners))
	for _, l := range state.Listeners {
		l := l // per-iteration copy for toolchains older than Go 1.22
		go func() { errc <- http.Serve(l, nil) }()
	}
	for range state.Listeners {
		if err := <-errc; err != nil {
			fmt.Printf("app#%s (%s) serve: %v\n", BuildID, state.ID, err)
		}
	}
	fmt.Printf("app#%s (%s) exiting...\n", BuildID, state.ID)
}
// main is the second entry point, the one that runs the upgrades.
// It is run in the initial process and hands prog to overseer together with
// the two addresses to listen on.
func main() {
	cfg := overseer.Config{
		Program:   prog,
		Addresses: []string{":5005", ":5006"},
		// Optional binary fetcher, left disabled here:
		//Fetcher: &fetcher.File{Path: "my_app_next"},
		Debug: false, // display log of overseer actions
	}
	overseer.Run(cfg)
}
下面给一下操作命令
# Build the project for the first time
go build grace.go
# Run the project; from this point the source can be modified
./grace &
# Send a request; it returns after 60s
curl "http://127.0.0.1:5001/sleep?duration=60s" &
# Build the project again, this time with the new content
go build grace.go
# Restart; 2096 is the pid
kill -USR2 2096
# Request the new API
curl "http://127.0.0.1:5001/sleep?duration=1s"
# Build the project for the first time
go build endless.go
# Run the project; from this point the source can be modified
./endless &
# Send a request; it returns after 60s
curl "http://127.0.0.1:5003/sleep?duration=60s" &
# Build the project again, this time with the new content
go build endless.go
# Restart; 22072 is the pid
kill -1 22072
# Request the new API
curl "http://127.0.0.1:5003/sleep?duration=1s"
# Build the project for the first time
go build -ldflags '-X main.BuildID=1' overseer.go
# Run the project; from this point the source can be modified
./overseer &
# Send a request; it returns after 60s
curl "http://127.0.0.1:5005/sleep?duration=60s" &
# Build the project again with the new content; note the different version number
go build -ldflags '-X main.BuildID=2' overseer.go
# Restart; 28300 is the pid of the main process
kill -USR2 28300
# Request the new API
curl "http://127.0.0.1:5005/sleep?duration=1s"
示例 | 旧API返回值 | 新API返回值 | 旧pid | 新pid | 结论 |
---|---|---|---|---|---|
grace | Hello World | Hello Harry | 2096 | 3100 | 旧API不会断掉,会执行原来的逻辑,pid会变化 |
endless | Hello World | Hello Harry | 22072 | 22365 | 旧API不会断掉,会执行原来的逻辑,pid会变化 |
overseer | Hello World | Hello Harry | 28300 | 28300 | 旧API不会断掉,会执行原来的逻辑,主进程pid不会变化 |
可以看出grace和endless是比较像的。
热重启的原理非常简单,但会涉及一些系统调用以及父子进程之间文件句柄的传递,细节比较多。
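处理过程分为以下几个步骤(对应后面的示例代码):
1. 进程监听重启信号(示例中为SIGUSR2);
2. 收到信号后,用同样的可执行文件启动子进程,并把正在监听的socket文件描述符传递给子进程;
3. 子进程直接在继承来的文件描述符上监听并接收新请求;
4. 父进程停止接收新连接,等待已有请求处理完成(或超时);
5. 父进程退出,重启完成。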
overseer与grace和endless有些不同,主要有两点:
package main
import (
"context"
"errors"
"flag"
"log"
"net"
"net/http"
"os"
"os/exec"
"os/signal"
"syscall"
"time"
)
var (
// server is the HTTP server created in main and shut down by signalHandler.
server *http.Server
// listener is the socket the server accepts on; reload hands a duplicate of
// its file descriptor to the child process.
listener net.Listener
// graceful selects how main obtains the listener: when set, main wraps the
// already-open file descriptor 3 instead of opening a new socket. reload
// passes this flag to the child it starts.
graceful = flag.Bool("graceful", false, "listen on fd open 3 (internal use only)")
)
// handler stands in for a slow request: it blocks for 20 seconds and only
// then writes the response body.
func handler(w http.ResponseWriter, r *http.Request) {
	const delay = 20 * time.Second
	time.Sleep(delay)

	body := []byte("hello world233333!!!!")
	w.Write(body)
}
// main registers the /hello handler, obtains the listening socket, serves on
// it in a background goroutine and then blocks in signalHandler until a
// stop or reload signal has been handled.
//
// With -graceful the socket is the one inherited from the parent as file
// descriptor 3; otherwise a new TCP socket is opened on server.Addr.
func main() {
	flag.Parse()

	http.HandleFunc("/hello", handler)
	server = &http.Server{Addr: ":9999"}

	var err error
	if *graceful {
		log.Print("main: Listening to existing file descriptor 3.")
		// cmd.ExtraFiles: If non-nil, entry i becomes file descriptor 3+i.
		// when we put socket FD at the first entry, it will always be 3(0+3)
		f := os.NewFile(3, "")
		listener, err = net.FileListener(f)
		// FileListener works on its own copy of the descriptor and closing f
		// does not affect the listener, so release f here rather than leak
		// it. A close failure is not actionable; the FileListener error is
		// the one that matters.
		_ = f.Close()
	} else {
		log.Print("main: Listening on a new file descriptor.")
		listener, err = net.Listen("tcp", server.Addr)
	}
	if err != nil {
		log.Fatalf("listener error: %v", err)
	}

	served := make(chan struct{})
	go func() {
		defer close(served)
		// server.Shutdown() stops Serve() immediately, thus server.Serve()
		// should not be in main goroutine. The result is kept in a local
		// variable so this goroutine does not write to main's err.
		serveErr := server.Serve(listener)
		log.Printf("server.Serve err: %v\n", serveErr)
	}()

	signalHandler()
	// Wait for the Serve goroutine so its log line is not lost on exit.
	<-served
	log.Printf("signal end")
}
// reload starts a new copy of the running executable with the -graceful
// flag and passes it the listening socket as the first extra file, which the
// child sees as file descriptor 3.
//
// It returns an error if the current listener is not a *net.TCPListener, if
// its descriptor cannot be duplicated, or if the child cannot be started.
// The child is not waited for.
func reload() error {
	tl, ok := listener.(*net.TCPListener)
	if !ok {
		return errors.New("listener is not tcp listener")
	}

	// File returns a duplicate of the listener's descriptor that the caller
	// must close. The child receives its own copy when it is started, so
	// this process's duplicate is released on return.
	f, err := tl.File()
	if err != nil {
		return err
	}
	defer f.Close()

	cmd := exec.Command(os.Args[0], "-graceful")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	// put socket FD at the first entry
	cmd.ExtraFiles = []*os.File{f}
	return cmd.Start()
}
func signalHandler() {
ch := make(chan os.Signal, 1)
signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR2)
for {
sig := <-ch
log.Printf("signal: %v", sig)
// timeout context for shutdown
ctx, _ := context.WithTimeout(context.Background(), 20*time.Second)
switch sig {
case syscall.SIGINT, syscall.SIGTERM:
// stop
log.Printf("stop")
signal.Stop(ch)
server.Shutdown(ctx)
log.Printf("graceful shutdown")
return
case syscall.SIGUSR2:
// reload
log.Printf("reload")
err := reload()
if err != nil {
log.Fatalf("graceful restart error: %v", err)
}
server.Shutdown(ctx)
log.Printf("graceful reload")
return
}
}
}
代码可参考:https://github.com/CraryPrimitiveMan/go-in-action/tree/master/ch4
父进程退出之后,子进程会挂到1号进程上面。这种情况下使用systemd和supervisord等管理程序会显示进程处于failed的状态。解决这个问题有两个方法: