forkjo/modules/graceful/manager_unix.go
wxiaoguang a8f5449cd9
Avoid unexpected panic in graceful manager (#29629)
There is a fundamental design problem of the "manager" and the "wait
group".
If nothing has started, the "Wait" just panics: sync: WaitGroup is
reused before previous Wait has returned
There is no clear solution besides a complete rewrite of the "manager".

If there are some mistakes in the app.ini, end users would just see the
"panic", but not the real error messages. A real case: #27643

This PR is just a quick fix for the annoying panic problem.

(cherry picked from commit 90a3f2d4b7ed3890d9655c0334444f86d89b7b30)
2024-03-11 23:36:58 +07:00

203 lines
5.5 KiB
Go

// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build !windows
package graceful
import (
"context"
"errors"
"os"
"os/signal"
"runtime/pprof"
"strconv"
"syscall"
"time"
"code.gitea.io/gitea/modules/graceful/releasereopen"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/process"
"code.gitea.io/gitea/modules/setting"
)
// pidMsg builds the "MAINPID=<pid>" notification message for the current process.
func pidMsg() systemdNotifyMsg {
	pid := strconv.Itoa(os.Getpid())
	return systemdNotifyMsg("MAINPID=" + pid)
}
// notify reports status to systemd via the notify protocol by writing msg to
// the connection obtained from getNotifySocket. It is best-effort: when no
// connection is available nothing is sent, and a failed write is only logged.
func (g *Manager) notify(msg systemdNotifyMsg) {
	socket, err := getNotifySocket()
	if err != nil || socket == nil {
		// Either the lookup failed (the err is logged in getNotifySocket)
		// or there is no connection to write to.
		return
	}
	defer socket.Close()

	payload := []byte(msg)
	if _, writeErr := socket.Write(payload); writeErr != nil {
		log.Warn("Failed to notify NOTIFY_SOCKET: %v", writeErr)
	}
}
// start brings the manager up: it announces start-up and the PID through
// notify, launches the signal handler, and spawns a goroutine that waits for
// the servers to be created before closing unused provided listeners and
// sending readyMsg. When setting.StartupTimeout is positive, a second
// goroutine shuts the manager down if start-up does not finish in time.
func (g *Manager) start() {
	// Label this goroutine, and every goroutine it creates, with the
	// graceful-lifecycle manager context; the labels of g.ctx are applied
	// again when start returns.
	pprof.SetGoroutineLabels(g.managerCtx)
	defer pprof.SetGoroutineLabels(g.ctx)

	g.isChild = os.Getenv(listenFDsEnv) != "" && os.Getppid() > 1

	g.notify(statusMsg("Starting Gitea"))
	g.notify(pidMsg())
	go g.handleSignals(g.managerCtx)

	// awaitServers blocks until createServerWaitGroup is released.
	awaitServers := func() {
		// FIXME: there is a fundamental design problem of the "manager" and the "wait group".
		// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
		// There is no clear solution besides a complete rewriting of the "manager"
		defer func() {
			_ = recover()
		}()
		g.createServerWaitGroup.Wait()
	}

	// releaseServerWaiters makes sure createServerWaitGroup stops waiting by
	// calling Done until the counter goes negative. That final call panics,
	// which is what ends the loop; the panic itself is of no interest and is
	// swallowed.
	releaseServerWaiters := func() {
		defer func() {
			_ = recover()
		}()
		for {
			g.createServerWaitGroup.Done()
		}
	}

	// Handle clean up of unused provided listeners and delayed start-up.
	startupDone := make(chan struct{})
	go func() {
		defer close(startupDone)

		// Wait till we're done getting all the listeners and then close the unused ones.
		awaitServers()

		// The error is deliberately dropped: there's not much we can do with
		// it, and it is logged in the CloseProvidedListeners function.
		_ = CloseProvidedListeners()
		g.notify(readyMsg)
	}()

	if setting.StartupTimeout <= 0 {
		// No start-up deadline configured, nothing to watch.
		return
	}

	go func() {
		select {
		case <-startupDone:
			// Start-up completed in time.
		case <-g.IsShutdown():
			releaseServerWaiters()
		case <-time.After(setting.StartupTimeout):
			log.Error("Startup took too long! Shutting down")
			g.notify(statusMsg("Startup took too long! Shutting down"))
			g.notify(stoppingMsg)
			g.doShutdown()
		}
	}()
}
// handleSignals is the manager's signal loop. It registers itself with the
// process manager, subscribes to the signals listed below and dispatches each
// one as it arrives, until ctx is done:
//
//   - SIGHUP: attempt a graceful restart
//   - SIGUSR1: release and reopen the logs
//   - SIGUSR2: hammer immediately
//   - SIGINT, SIGTERM: shut down gracefully
//   - SIGTSTP: only logged
//
// When getWatchdogTimeout reports a usable timeout, watchdogMsg is sent once
// up front and then again every half timeout. When ctx is done a graceful
// shutdown is requested and the loop returns.
func (g *Manager) handleSignals(ctx context.Context) {
	ctx, _, finished := process.GetManager().AddTypedContext(ctx, "Graceful: HandleSignals", process.SystemProcessType, true)
	defer finished()

	signalChannel := make(chan os.Signal, 1)
	signal.Notify(
		signalChannel,
		syscall.SIGHUP,
		syscall.SIGUSR1,
		syscall.SIGUSR2,
		syscall.SIGINT,
		syscall.SIGTERM,
		syscall.SIGTSTP,
	)

	// A nil channel never becomes ready in a select, so the watchdog case
	// below stays inert unless a ticker is actually created.
	var watchdogTick <-chan time.Time
	// time.NewTicker panics on a non-positive interval, so only start the
	// watchdog when half the timeout is a usable duration.
	if interval := getWatchdogTimeout() / 2; interval > 0 {
		g.notify(watchdogMsg)
		ticker := time.NewTicker(interval)
		// Release the ticker once the signal loop returns.
		defer ticker.Stop()
		watchdogTick = ticker.C
	}

	pid := syscall.Getpid()
	for {
		select {
		case sig := <-signalChannel:
			switch sig {
			case syscall.SIGHUP:
				log.Info("PID: %d. Received SIGHUP. Attempting GracefulRestart...", pid)
				g.DoGracefulRestart()
			case syscall.SIGUSR1:
				log.Warn("PID %d. Received SIGUSR1. Releasing and reopening logs", pid)
				g.notify(statusMsg("Releasing and reopening logs"))
				if err := releasereopen.GetManager().ReleaseReopen(); err != nil {
					log.Error("Error whilst releasing and reopening logs: %v", err)
				}
			case syscall.SIGUSR2:
				log.Warn("PID %d. Received SIGUSR2. Hammering...", pid)
				g.DoImmediateHammer()
			case syscall.SIGINT:
				log.Warn("PID %d. Received SIGINT. Shutting down...", pid)
				g.DoGracefulShutdown()
			case syscall.SIGTERM:
				log.Warn("PID %d. Received SIGTERM. Shutting down...", pid)
				g.DoGracefulShutdown()
			case syscall.SIGTSTP:
				log.Info("PID %d. Received SIGTSTP.", pid)
			default:
				log.Info("PID %d. Received %v.", pid, sig)
			}
		case <-watchdogTick:
			g.notify(watchdogMsg)
		case <-ctx.Done():
			log.Warn("PID: %d. Background context for manager closed - %v - Shutting down...", pid, ctx.Err())
			g.DoGracefulShutdown()
			return
		}
	}
}
// errAlreadyForked is returned by doFork when this manager has already forked
// once. It is a sentinel so callers can detect the condition with errors.Is
// instead of comparing error text.
var errAlreadyForked = errors.New("another process already forked. Ignoring this one")

// doFork starts a replacement process via RestartProcess, at most once per
// manager. It returns errAlreadyForked if a fork was already performed, or
// whatever error RestartProcess reports.
func (g *Manager) doFork() error {
	g.lock.Lock()
	if g.forked {
		g.lock.Unlock()
		return errAlreadyForked
	}
	g.forked = true
	g.lock.Unlock()

	g.notify(reloadingMsg)

	// We need to move the file logs to append pids
	setting.RestartLogsWithPIDSuffix()

	_, err := RestartProcess()
	return err
}

// DoGracefulRestart causes a graceful restart.
//
// When setting.GracefulRestartable is set it forks a new process and then
// shuts this one down; if a fork had already happened it hammers immediately
// first, and any other fork error is logged. When restarting is not enabled it
// sends stoppingMsg and shuts down.
func (g *Manager) DoGracefulRestart() {
	if !setting.GracefulRestartable {
		log.Info("PID: %d. Not set restartable. Shutting down...", os.Getpid())
		g.notify(stoppingMsg)
		g.doShutdown()
		return
	}

	log.Info("PID: %d. Forking...", os.Getpid())
	if err := g.doFork(); err != nil {
		if errors.Is(err, errAlreadyForked) {
			g.DoImmediateHammer()
		} else {
			log.Error("Error whilst forking from PID: %d : %v", os.Getpid(), err)
		}
	}

	// doFork calls RestartProcess which starts a new Gitea process, so this parent process needs to exit
	// Otherwise some resources (eg: leveldb lock) will be held by this parent process and the new process will fail to start
	log.Info("PID: %d. Shutting down after forking ...", os.Getpid())
	g.doShutdown()
}