Просмотр исходного кода

fix(jobs): isolate per-node background goroutines from panics (#5397)

A panic in a goroutine without a recover takes the whole panel down. The
per-node heartbeat and traffic-sync goroutines run remote network I/O for
each node with no panic isolation, so one misbehaving node could crash the
master.

Add common.GoRecover(name, fn), which runs fn in a goroutine guarded by a
recover that logs the panic with a stack trace instead of crashing, and use
it for the per-node heartbeat, traffic-sync and global-push goroutines. The
deferred WaitGroup/semaphore releases still run during panic unwind, so the
group never stalls. Other background goroutines can adopt the same helper.
n0ctal 1 день назад
Родитель
Коммит
f63ed9f510

+ 15 - 0
internal/util/common/err.go

@@ -4,6 +4,7 @@ package common
 import (
 	"errors"
 	"fmt"
+	"runtime/debug"
 
 	"github.com/mhsanaei/3x-ui/v3/internal/logger"
 )
@@ -30,3 +31,17 @@ func Recover(msg string) any {
 	}
 	return panicErr
 }
+
+// GoRecover starts fn in a new goroutine and returns without waiting for it. A
+// panic raised by fn is recovered inside that goroutine and reported through
+// logger.Error with name, the panic value and a stack trace; it is not re-raised.
+func GoRecover(name string, fn func()) {
+	go func() {
+		defer func() {
+			if r := recover(); r != nil {
+				logger.Error("panic in goroutine", name, ":", r, "\n"+string(debug.Stack()))
+			}
+		}()
+		fn()
+	}()
+}

+ 41 - 0
internal/util/common/gorecover_test.go

@@ -0,0 +1,41 @@
+package common
+
+import (
+	"os"
+	"testing"
+	"time"
+
+	"github.com/mhsanaei/3x-ui/v3/internal/logger"
+	"github.com/op/go-logging"
+)
+
+func TestMain(m *testing.M) {
+	logger.InitLogger(logging.ERROR) // GoRecover reports recovered panics via logger.Error, so set the logger up before any test runs
+	os.Exit(m.Run())
+}
+
+func TestGoRecover_RunsFn(t *testing.T) {
+	done := make(chan struct{})
+	GoRecover("test-run", func() { close(done) }) // fn signals that it ran by closing done
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second): // bound the wait so a fn that never runs fails the test instead of hanging it
+		t.Fatal("fn did not run")
+	}
+}
+
+func TestGoRecover_RecoversPanic(t *testing.T) {
+	done := make(chan struct{})
+	// An unrecovered panic in the goroutine would crash the whole test binary, so reaching the end of this test means GoRecover recovered it.
+	GoRecover("test-panic", func() {
+		defer close(done) // runs while the panic unwinds fn, i.e. before GoRecover's own deferred recover
+		panic("boom")
+	})
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("goroutine did not complete")
+	}
+	// done is closed before GoRecover's deferred recover+log has run; pause briefly so that handler finishes before the test returns.
+	time.Sleep(50 * time.Millisecond)
+}

+ 4 - 2
internal/web/job/node_heartbeat_job.go

@@ -9,6 +9,7 @@ import (
 	"github.com/mhsanaei/3x-ui/v3/internal/database/model"
 	"github.com/mhsanaei/3x-ui/v3/internal/eventbus"
 	"github.com/mhsanaei/3x-ui/v3/internal/logger"
+	"github.com/mhsanaei/3x-ui/v3/internal/util/common"
 	"github.com/mhsanaei/3x-ui/v3/internal/web/service"
 	"github.com/mhsanaei/3x-ui/v3/internal/web/websocket"
 )
@@ -50,11 +51,12 @@ func (j *NodeHeartbeatJob) Run() {
 		}
 		wg.Add(1)
 		sem <- struct{}{}
-		go func(n *model.Node) {
+		n := n
+		common.GoRecover("node-heartbeat:"+n.Name, func() {
 			defer wg.Done()
 			defer func() { <-sem }()
 			j.probeOne(n)
-		}(n)
+		})
 	}
 	wg.Wait()
 

+ 7 - 5
internal/web/job/node_traffic_sync_job.go

@@ -8,10 +8,10 @@ import (
 
 	"github.com/mhsanaei/3x-ui/v3/internal/database/model"
 	"github.com/mhsanaei/3x-ui/v3/internal/logger"
+	"github.com/mhsanaei/3x-ui/v3/internal/util/common"
 	"github.com/mhsanaei/3x-ui/v3/internal/web/runtime"
 	"github.com/mhsanaei/3x-ui/v3/internal/web/service"
 	"github.com/mhsanaei/3x-ui/v3/internal/web/websocket"
-	"github.com/mhsanaei/3x-ui/v3/internal/xray"
 )
 
 const (
@@ -96,11 +96,12 @@ func (j *NodeTrafficSyncJob) Run() {
 		}
 		wg.Add(1)
 		sem <- struct{}{}
-		go func(n *model.Node) {
+		n := n
+		common.GoRecover("node-traffic-sync:"+n.Name, func() {
 			defer wg.Done()
 			defer func() { <-sem }()
 			j.syncOne(mgr, n, doIpSync)
-		}(n)
+		})
 	}
 	wg.Wait()
 
@@ -211,7 +212,8 @@ func (j *NodeTrafficSyncJob) maybePushGlobals(mgr *runtime.Manager, nodes []*mod
 		}
 		wg.Add(1)
 		sem <- struct{}{}
-		go func(n *model.Node, remote *runtime.Remote, traffics []*xray.ClientTraffic) {
+		n, remote, traffics := n, remote, traffics
+		common.GoRecover("node-global-push:"+n.Name, func() {
 			defer wg.Done()
 			defer func() { <-sem }()
 			ctx, cancel := context.WithTimeout(context.Background(), nodeTrafficSyncRequestTimeout)
@@ -225,7 +227,7 @@ func (j *NodeTrafficSyncJob) maybePushGlobals(mgr *runtime.Manager, nodes []*mod
 					logger.Warning("node traffic sync: push globals to", n.Name, "failed:", err)
 				}
 			}
-		}(n, remote, traffics)
+		})
 	}
 	wg.Wait()
 }