forked from ebhomengo/niki
add supervisor
This commit is contained in:
parent
74c1d223d1
commit
748dee60e2
|
@ -0,0 +1,350 @@
|
|||
package supervisor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"git.gocasts.ir/ebhomengo/niki/logger"
|
||||
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/signal"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ProcessFunc is the signature of a long-running process managed by the supervisor.
//
// finishSignal is the context the process listens on to learn that it has to
// finish. processName is the name the process was registered under.
// terminateChannel is the channel through which the process notifies the
// supervisor when it terminates.
type ProcessFunc func(finishSignal context.Context, processName string, terminateChannel chan<- string) error
|
||||
|
||||
// noopProcessFunc is a process function that ignores all of its arguments and
// returns nil right away. handler hands it out when asked for a process name
// that is not registered.
var noopProcessFunc = func(_ context.Context, _ string, _ chan<- string) error {
	return nil
}
|
||||
|
||||
// Supervisor is responsible to manage long-running processes
|
||||
// Supervisor is not for concurrent use and should be used as the main goroutine of application
|
||||
type Supervisor struct {
|
||||
logger *slog.Logger
|
||||
lock *sync.Mutex
|
||||
processes map[string]Process
|
||||
shutdownSignal chan os.Signal
|
||||
ctx context.Context
|
||||
ctxCancel context.CancelFunc
|
||||
shutdownTimeout time.Duration
|
||||
// terminateChannel should be used to notify supervisor when a process terminates
|
||||
terminateChannel chan string
|
||||
}
|
||||
|
||||
func New(shutdownTimeout time.Duration, l *slog.Logger) *Supervisor {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
if l == nil {
|
||||
l = logger.L()
|
||||
}
|
||||
|
||||
if shutdownTimeout == 0 {
|
||||
shutdownTimeout = DefaultGracefulShutdownTimeout
|
||||
}
|
||||
|
||||
return &Supervisor{
|
||||
lock: &sync.Mutex{},
|
||||
logger: l.WithGroup(LogNSSupervisor),
|
||||
processes: make(map[string]Process),
|
||||
shutdownSignal: make(chan os.Signal, 1),
|
||||
ctx: ctx,
|
||||
ctxCancel: cancel,
|
||||
shutdownTimeout: shutdownTimeout,
|
||||
// TODO : how to set terminateChannel buffer?
|
||||
terminateChannel: make(chan string, 10),
|
||||
}
|
||||
}
|
||||
|
||||
type Process struct {
|
||||
name string
|
||||
handler ProcessFunc
|
||||
options ProcessOption
|
||||
state ProcessState
|
||||
}
|
||||
|
||||
// ProcessState is the runtime bookkeeping the supervisor keeps per process.
type ProcessState struct {
	// RecoveredNum counts how many times the process has been recovered.
	RecoveredNum int
}
|
||||
|
||||
// ProcessOption configures how the supervisor treats a single process.
type ProcessOption struct {
	// Recover tells whether the process is restarted after it panics.
	Recover bool

	// RecoverInterval is the pause before a panicked process is restarted.
	RecoverInterval time.Duration

	// RecoverCount is the maximum number of times the process is recovered.
	RecoverCount int

	// RetryCount is the number of attempts made to execute the process when
	// it returns an error.
	RetryCount int

	// RetryInterval is the pause after a failed attempt.
	RetryInterval time.Duration

	// IsFatal tells whether the supervisor requests a shutdown when the
	// process can not be kept running.
	IsFatal bool
}
|
||||
|
||||
// Defaults used by the supervisor.
const (
	// ProcessRetryCount is the default ProcessOption.RetryCount.
	ProcessRetryCount = 3
	// ProcessRetryInterval is the default ProcessOption.RetryInterval.
	ProcessRetryInterval = 3 * time.Second
	// ProcessRecoverCount is the default ProcessOption.RecoverCount.
	ProcessRecoverCount = 10
	// ProcessRecoverInterval is the default ProcessOption.RecoverInterval.
	ProcessRecoverInterval = 2 * time.Second
	// DefaultGracefulShutdownTimeout is the shutdown timeout New falls back to.
	DefaultGracefulShutdownTimeout = 5 * time.Second
	// LogNSSupervisor is the log group name of the supervisor's logger.
	LogNSSupervisor = "supervisor"
)
|
||||
|
||||
var defaultOptions = ProcessOption{
|
||||
Recover: true,
|
||||
RetryInterval: ProcessRetryInterval,
|
||||
RecoverInterval: ProcessRecoverInterval,
|
||||
RecoverCount: ProcessRecoverCount,
|
||||
RetryCount: ProcessRetryCount,
|
||||
IsFatal: true,
|
||||
}
|
||||
|
||||
// Register registers a new process to supervisor
|
||||
func (s *Supervisor) Register(name string, process ProcessFunc, options *ProcessOption) {
|
||||
// TODO : don't allow any registration after Start is called using a mutex
|
||||
|
||||
s.warnIfNameAlreadyInUse(name)
|
||||
|
||||
// TODO : validate name
|
||||
p := Process{
|
||||
name: name,
|
||||
handler: process,
|
||||
options: defaultOptions,
|
||||
state: ProcessState{RecoveredNum: 0},
|
||||
}
|
||||
|
||||
if options != nil {
|
||||
p.options = *options
|
||||
}
|
||||
|
||||
s.lock.Lock()
|
||||
s.processes[name] = p
|
||||
s.lock.Unlock()
|
||||
}
|
||||
|
||||
// Start spawns a new goroutine for each process
|
||||
// Spawned goroutine is responsible to handle the panics and restart the process
|
||||
func (s *Supervisor) Start() {
|
||||
// TODO : is it viable to use a goroutine pool such as Ants ?
|
||||
for name := range s.processes {
|
||||
go s.executeProcessWithRetryPolicy(name)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Supervisor) executeProcessWithRetryPolicy(name string) {
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
s.logger.Error("recover from panic", slog.String("process_name", name), slog.Any("panic", r))
|
||||
|
||||
if s.isRecoverable(name) {
|
||||
s.incRecover(name)
|
||||
s.waitFoRecover(name)
|
||||
s.logger.Info("restart the process", slog.String("process_name", name))
|
||||
|
||||
// spawn new goroutine to avoid heap/stack memory leak when the recover count is big
|
||||
go s.executeProcessWithRetryPolicy(name)
|
||||
return
|
||||
}
|
||||
|
||||
s.logger.Info("don't try any more to restart the process", slog.String("process_name", name))
|
||||
s.removeProcess(name)
|
||||
|
||||
if s.isFatal(name) {
|
||||
s.logger.Error("can't recover important process. exit..", slog.String("process_name", name))
|
||||
s.shutdownSignal <- os.Interrupt
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
for i := 1; i <= s.retryCount(name); i++ {
|
||||
s.logger.Info("execute process", slog.String("process_name", name))
|
||||
f := s.handler(name)
|
||||
err := f(s.ctx, name, s.terminateChannel)
|
||||
if err != nil {
|
||||
s.logger.Error("failed to execute process", slog.String("process_name", name),
|
||||
slog.Int("attempt", i), slog.String("error", err.Error()))
|
||||
|
||||
s.waitFoRetry(name)
|
||||
continue
|
||||
}
|
||||
|
||||
// don't expect handler return if it hasn't any error because it's long-running process
|
||||
// it should return when receives shutdown signal
|
||||
s.logger.Info("process terminates with no error", slog.String("process_name", name))
|
||||
|
||||
if s.isFatal(name) {
|
||||
s.logger.Error("can't recover important process. exit..", slog.String("process_name", name))
|
||||
s.shutdownSignal <- os.Interrupt
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
s.logger.Info("don't try any more to execute process", slog.String("process_name", name))
|
||||
s.removeProcess(name)
|
||||
}
|
||||
|
||||
// WaitOnShutdownSignal wait to receive shutdown signal.
|
||||
// WaitOnShutdownSignal should not be called in other goroutines except main goroutine of application
|
||||
func (s *Supervisor) WaitOnShutdownSignal() {
|
||||
// TODO : is it necessary to add os.Interrupt to supervisor config?
|
||||
signal.Notify(s.shutdownSignal, os.Interrupt)
|
||||
<-s.shutdownSignal
|
||||
|
||||
s.gracefulShutdown()
|
||||
}
|
||||
|
||||
func (s *Supervisor) gracefulShutdown() {
|
||||
s.logger.Info("shutdown all processes gracefully")
|
||||
|
||||
s.logger.Info("notify all processes (goroutines) to finish their jobs", slog.Duration("shutdown_timeout", s.shutdownTimeout))
|
||||
s.ctxCancel()
|
||||
|
||||
forceExitCtx, forceExitCancel := context.WithTimeout(context.Background(), s.shutdownTimeout)
|
||||
defer forceExitCancel()
|
||||
|
||||
for {
|
||||
select {
|
||||
case name := <-s.terminateChannel:
|
||||
s.logger.Info("process terminates gracefully", slog.String("process_name", name))
|
||||
s.removeProcess(name)
|
||||
|
||||
case <-forceExitCtx.Done():
|
||||
s.logger.Info("supervisor terminates its job.", slog.Int("number_of_unfinished_processes", len(s.processes)))
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Supervisor) removeProcess(name string) {
|
||||
s.lock.Lock()
|
||||
delete(s.processes, name)
|
||||
s.lock.Unlock()
|
||||
}
|
||||
|
||||
func (s *Supervisor) isRecoverable(name string) bool {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
v, ok := s.processes[name]
|
||||
if !ok {
|
||||
s.logger.Warn("process doesn't exist", slog.String("process_name", name))
|
||||
return false
|
||||
}
|
||||
|
||||
if v.options.Recover && v.state.RecoveredNum < v.options.RecoverCount {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (s *Supervisor) isFatal(name string) bool {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
v, ok := s.processes[name]
|
||||
if !ok {
|
||||
s.logger.Warn("process doesn't exist", slog.String("process_name", name))
|
||||
return false
|
||||
}
|
||||
|
||||
return v.options.IsFatal
|
||||
}
|
||||
|
||||
func (s *Supervisor) incRecover(name string) {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
v, ok := s.processes[name]
|
||||
if !ok {
|
||||
s.logger.Warn("process doesn't exist", slog.String("process_name", name))
|
||||
return
|
||||
}
|
||||
|
||||
v.state.RecoveredNum++
|
||||
s.processes[name] = v
|
||||
}
|
||||
|
||||
func (s *Supervisor) retryCount(name string) int {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
v, ok := s.processes[name]
|
||||
if !ok {
|
||||
s.logger.Warn("process doesn't exist", slog.String("process_name", name))
|
||||
return -1
|
||||
}
|
||||
|
||||
return v.options.RetryCount
|
||||
}
|
||||
|
||||
func (s *Supervisor) retryInterval(name string) time.Duration {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
v, ok := s.processes[name]
|
||||
if !ok {
|
||||
s.logger.Warn("process doesn't exist", slog.String("process_name", name))
|
||||
return -1
|
||||
}
|
||||
|
||||
return v.options.RetryInterval
|
||||
}
|
||||
|
||||
func (s *Supervisor) waitFoRecover(name string) {
|
||||
s.lock.Lock()
|
||||
|
||||
v, ok := s.processes[name]
|
||||
if !ok {
|
||||
s.logger.Warn("process doesn't exist", slog.String("process_name", name))
|
||||
return
|
||||
}
|
||||
|
||||
t := v.options.RecoverInterval
|
||||
|
||||
// free lock before sleep
|
||||
s.lock.Unlock()
|
||||
|
||||
time.Sleep(t)
|
||||
}
|
||||
|
||||
func (s *Supervisor) waitFoRetry(name string) {
|
||||
s.lock.Lock()
|
||||
|
||||
v, ok := s.processes[name]
|
||||
if !ok {
|
||||
s.logger.Warn("process doesn't exist", slog.String("process_name", name))
|
||||
return
|
||||
}
|
||||
|
||||
t := v.options.RetryInterval
|
||||
|
||||
// free lock before sleep
|
||||
s.lock.Unlock()
|
||||
|
||||
s.logger.Info("wait to retry execute process after sleep interval",
|
||||
slog.String("process_name", name), slog.Duration("interval",
|
||||
t))
|
||||
|
||||
time.Sleep(t)
|
||||
}
|
||||
|
||||
func (s *Supervisor) handler(name string) ProcessFunc {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
v, ok := s.processes[name]
|
||||
if !ok {
|
||||
s.logger.Warn("process doesn't exist", slog.String("process_name", name))
|
||||
return noopProcessFunc
|
||||
}
|
||||
|
||||
return v.handler
|
||||
}
|
||||
|
||||
func (s *Supervisor) warnIfNameAlreadyInUse(name string) {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
if _, ok := s.processes[name]; ok {
|
||||
s.logger.Warn("process name already in use", slog.String("process_name", name))
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue