package supervisor

import (
	"context"
	"log/slog"
	"os"
	"os/signal"
	"sync"
	"time"

	"git.gocasts.ir/ebhomengo/niki/logger"
)

// ProcessFunc is a long-running process that listens on finishSignal.
// It notifies the supervisor via terminateChannel when it terminates.
type ProcessFunc func(finishSignal context.Context, processName string, terminateChannel chan<- string) error

var noopProcessFunc = func(finishSignal context.Context, processName string, terminateChannel chan<- string) error {
	return nil
}
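
// An illustrative sketch of the ProcessFunc contract (the worker below is
// hypothetical, not part of this package): do work until finishSignal is
// canceled, then notify the supervisor on terminateChannel and return.
//
//	func worker(finishSignal context.Context, processName string, terminateChannel chan<- string) error {
//		ticker := time.NewTicker(time.Second)
//		defer ticker.Stop()
//
//		for {
//			select {
//			case <-finishSignal.Done():
//				// graceful shutdown requested: report termination to the supervisor
//				terminateChannel <- processName
//
//				return nil
//			case <-ticker.C:
//				// do one unit of work here
//			}
//		}
//	}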

// Supervisor is responsible for managing long-running processes.
// Supervisor is not safe for concurrent use and should be driven from the main goroutine of the application.
type Supervisor struct {
	logger          *slog.Logger
	lock            *sync.Mutex
	processes       map[string]Process
	shutdownSignal  chan os.Signal
	ctx             context.Context
	ctxCancel       context.CancelFunc
	shutdownTimeout time.Duration
	// terminateChannel should be used to notify the supervisor when a process terminates.
	terminateChannel chan string
}

func New(shutdownTimeout time.Duration, l *slog.Logger) *Supervisor {
	ctx, cancel := context.WithCancel(context.Background())

	if l == nil {
		l = logger.L()
	}

	if shutdownTimeout == 0 {
		shutdownTimeout = DefaultGracefulShutdownTimeout
	}

	return &Supervisor{
		lock:            &sync.Mutex{},
		logger:          l.WithGroup(LogNSSupervisor),
		processes:       make(map[string]Process),
		shutdownSignal:  make(chan os.Signal, 1),
		ctx:             ctx,
		ctxCancel:       cancel,
		shutdownTimeout: shutdownTimeout,
		// TODO : how to set terminateChannel buffer?
		//nolint
		terminateChannel: make(chan string, 10),
	}
}

type Process struct {
	name    string
	handler ProcessFunc
	options ProcessOption
	state   ProcessState
}

type ProcessState struct {
	// RecoveredNum counts the number of times the process has been recovered.
	RecoveredNum int
}

type ProcessOption struct {
	Recover         bool
	RecoverInterval time.Duration
	RecoverCount    int
	RetryCount      int
	RetryInterval   time.Duration
	IsFatal         bool
}

const (
	ProcessRetryCount              = 3
	ProcessRetryInterval           = 3 * time.Second
	ProcessRecoverCount            = 10
	ProcessRecoverInterval         = 2 * time.Second
	DefaultGracefulShutdownTimeout = 5 * time.Second
	LogNSSupervisor                = "supervisor"
)

var defaultOptions = ProcessOption{
	Recover:         true,
	RetryInterval:   ProcessRetryInterval,
	RecoverInterval: ProcessRecoverInterval,
	RecoverCount:    ProcessRecoverCount,
	RetryCount:      ProcessRetryCount,
	IsFatal:         true,
}

// Register registers a new process with the supervisor.
func (s *Supervisor) Register(name string, process ProcessFunc, options *ProcessOption) {
	// TODO : don't allow any registration after Start is called using a mutex

	s.warnIfNameAlreadyInUse(name)

	// TODO : validate name
	p := Process{
		name:    name,
		handler: process,
		options: defaultOptions,
		state:   ProcessState{RecoveredNum: 0},
	}

	if options != nil {
		p.options = *options
	}

	s.lock.Lock()
	s.processes[name] = p
	s.lock.Unlock()
}
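
// An illustrative sketch of registering a process with custom options from the
// caller's side (the name "email-worker" and the worker handler are
// hypothetical); passing nil options applies defaultOptions:
//
//	sup.Register("email-worker", worker, &supervisor.ProcessOption{
//		Recover:         true,
//		RecoverInterval: supervisor.ProcessRecoverInterval,
//		RecoverCount:    supervisor.ProcessRecoverCount,
//		RetryCount:      supervisor.ProcessRetryCount,
//		RetryInterval:   supervisor.ProcessRetryInterval,
//		IsFatal:         false,
//	})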

// Start spawns a new goroutine for each registered process.
// The spawned goroutine is responsible for handling panics and restarting the process.
func (s *Supervisor) Start() {
	// TODO : is it viable to use a goroutine pool such as Ants ?
	for name := range s.processes {
		go s.executeProcessWithRetryPolicy(name)
	}
}

func (s *Supervisor) executeProcessWithRetryPolicy(name string) {
	defer func() {
		if r := recover(); r != nil {
			s.logger.Error("recover from panic", slog.String("process_name", name), slog.Any("panic", r))

			if s.isRecoverable(name) {
				s.incRecover(name)
				s.waitForRecover(name)
				s.logger.Info("restart the process", slog.String("process_name", name))

				// spawn a new goroutine instead of recursing to avoid stack growth when the recover count is large
				go s.executeProcessWithRetryPolicy(name)

				return
			}

			s.logger.Info("no more attempts to restart the process", slog.String("process_name", name))

			// check IsFatal before removing the process; removeProcess deletes the entry that holds its options
			fatal := s.isFatal(name)
			s.removeProcess(name)

			if fatal {
				s.logger.Error("can't recover fatal process. exit..", slog.String("process_name", name))
				s.shutdownSignal <- os.Interrupt
			}
		}
	}()

	for i := 1; i <= s.retryCount(name); i++ {
		s.logger.Info("execute process", slog.String("process_name", name))
		f := s.handler(name)
		err := f(s.ctx, name, s.terminateChannel)
		if err != nil {
			s.logger.Error("failed to execute process", slog.String("process_name", name),
				slog.Int("attempt", i), slog.String("error", err.Error()))

			s.waitForRetry(name)

			continue
		}

		// a handler is a long-running process and is not expected to return without an error;
		// it should only return after it receives the shutdown signal
		s.logger.Info("process terminates with no error", slog.String("process_name", name))

		if s.isFatal(name) {
			s.logger.Error("fatal process terminated. exit..", slog.String("process_name", name))
			s.shutdownSignal <- os.Interrupt
		}

		return
	}

	s.logger.Info("no more attempts to execute the process", slog.String("process_name", name))
	s.removeProcess(name)
}

// WaitOnShutdownSignal waits to receive a shutdown signal.
// WaitOnShutdownSignal should not be called from any goroutine other than the application's main goroutine.
func (s *Supervisor) WaitOnShutdownSignal() {
	// TODO : is it necessary to add os.Interrupt to supervisor config?
	signal.Notify(s.shutdownSignal, os.Interrupt)
	<-s.shutdownSignal

	s.gracefulShutdown()
}
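
// A minimal end-to-end sketch of driving the supervisor from an application's
// main goroutine (the worker handler and the 10-second timeout are hypothetical):
//
//	func main() {
//		sup := supervisor.New(10*time.Second, nil)
//		sup.Register("email-worker", worker, nil)
//		sup.Start()
//
//		// blocks until os.Interrupt is received, then shuts down gracefully
//		sup.WaitOnShutdownSignal()
//	}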

func (s *Supervisor) gracefulShutdown() {
	s.logger.Info("shutdown all processes gracefully")

	s.logger.Info("notify all processes (goroutines) to finish their jobs", slog.Duration("shutdown_timeout", s.shutdownTimeout))
	s.ctxCancel()

	forceExitCtx, forceExitCancel := context.WithTimeout(context.Background(), s.shutdownTimeout)
	defer forceExitCancel()

	for {
		select {
		case name := <-s.terminateChannel:
			s.logger.Info("process terminates gracefully", slog.String("process_name", name))
			s.removeProcess(name)

			// return early once every process has terminated instead of waiting out the timeout
			s.lock.Lock()
			remaining := len(s.processes)
			s.lock.Unlock()

			if remaining == 0 {
				s.logger.Info("all processes terminated gracefully")

				return
			}

		case <-forceExitCtx.Done():
			// read the process count under the lock; process goroutines may still mutate the map
			s.lock.Lock()
			unfinished := len(s.processes)
			s.lock.Unlock()

			s.logger.Info("supervisor terminates its job.", slog.Int("number_of_unfinished_processes", unfinished))

			return
		}
	}
}

func (s *Supervisor) removeProcess(name string) {
	s.lock.Lock()
	delete(s.processes, name)
	s.lock.Unlock()
}

func (s *Supervisor) isRecoverable(name string) bool {
	s.lock.Lock()
	defer s.lock.Unlock()

	v, ok := s.processes[name]
	if !ok {
		s.logger.Warn("process doesn't exist", slog.String("process_name", name))

		return false
	}

	return v.options.Recover && v.state.RecoveredNum < v.options.RecoverCount
}

func (s *Supervisor) isFatal(name string) bool {
	s.lock.Lock()
	defer s.lock.Unlock()

	v, ok := s.processes[name]
	if !ok {
		s.logger.Warn("process doesn't exist", slog.String("process_name", name))

		return false
	}

	return v.options.IsFatal
}

func (s *Supervisor) incRecover(name string) {
	s.lock.Lock()
	defer s.lock.Unlock()

	v, ok := s.processes[name]
	if !ok {
		s.logger.Warn("process doesn't exist", slog.String("process_name", name))

		return
	}

	v.state.RecoveredNum++
	s.processes[name] = v
}

func (s *Supervisor) retryCount(name string) int {
	s.lock.Lock()
	defer s.lock.Unlock()

	v, ok := s.processes[name]
	if !ok {
		s.logger.Warn("process doesn't exist", slog.String("process_name", name))

		return -1
	}

	return v.options.RetryCount
}

// nolint
func (s *Supervisor) retryInterval(name string) time.Duration {
	s.lock.Lock()
	defer s.lock.Unlock()

	v, ok := s.processes[name]
	if !ok {
		s.logger.Warn("process doesn't exist", slog.String("process_name", name))

		return -1
	}

	return v.options.RetryInterval
}

func (s *Supervisor) waitForRecover(name string) {
	s.lock.Lock()

	v, ok := s.processes[name]
	if !ok {
		s.logger.Warn("process doesn't exist", slog.String("process_name", name))
		// release the lock before the early return to avoid a deadlock
		s.lock.Unlock()

		return
	}

	t := v.options.RecoverInterval

	// release the lock before sleeping
	s.lock.Unlock()

	time.Sleep(t)
}

func (s *Supervisor) waitForRetry(name string) {
	s.lock.Lock()

	v, ok := s.processes[name]
	if !ok {
		s.logger.Warn("process doesn't exist", slog.String("process_name", name))
		// release the lock before the early return to avoid a deadlock
		s.lock.Unlock()

		return
	}

	t := v.options.RetryInterval

	// release the lock before sleeping
	s.lock.Unlock()

	s.logger.Info("wait to retry process execution after sleep interval",
		slog.String("process_name", name), slog.Duration("interval", t))

	time.Sleep(t)
}

func (s *Supervisor) handler(name string) ProcessFunc {
	s.lock.Lock()
	defer s.lock.Unlock()

	v, ok := s.processes[name]
	if !ok {
		s.logger.Warn("process doesn't exist", slog.String("process_name", name))

		return noopProcessFunc
	}

	return v.handler
}

func (s *Supervisor) warnIfNameAlreadyInUse(name string) {
	s.lock.Lock()
	defer s.lock.Unlock()

	if _, ok := s.processes[name]; ok {
		s.logger.Warn("process name already in use", slog.String("process_name", name))
	}
}