فهرست منبع

feat(xray): add tunnel health monitor (#5480)

* feat(xray): add tunnel health monitor

* fix(tunnelmonitor): reuse netproxy client and init logger in tests

Replace the duplicated newHTTPClient/dialContextWithProxy with netproxy.NewHTTPClient, which centralises the http/https/socks5 handling and avoids the dial-goroutine connection leak on context cancellation. Cap failures at the threshold during cooldown so the counter stays a true consecutive-failure count. Add TestMain to initialise the logger and fix the nil-pointer panic in the success-after-failure path.

* fix(tunnelmonitor): observable recovery, signal headroom, and hardening

Address the remaining review findings on the tunnel health monitor:

- Recovery is now synchronous and observable: the callback calls
  server.RestartXray() directly and returns its error instead of just
  enqueuing SIGUSR1, so a failed restart no longer masks as success and
  arms the cooldown while the tunnel is still down.
- Give the OS signal channel headroom (buffer 8) so producers cannot
  starve a SIGTERM/SIGINT out of the single slot.
- Warn at startup when the monitor is enabled without a proxy, since the
  probe then measures host connectivity rather than the xray tunnel.
- Cap failures at the threshold in the nil-recover branch too, matching
  the cooldown cap.
- Document the XUI_TUNNEL_HEALTH_* vars in .env.example and the README.
- Add tests for status-code classification, Normalize bounds, New proxy
  scheme errors, the recovery-error and nil-recover paths, the cooldown
  cap, and Run context cancellation (coverage 90%).

---------

Co-authored-by: Sanaei <[email protected]>
Rick Sanchez 16 ساعت پیش
والد
کامیت
fe025e8af3
5فایلهای تغییر یافته به همراه773 افزوده شده و 1 حذف شده
  1. 13 0
      .env.example
  2. 7 0
      README.md
  3. 271 0
      internal/tunnelmonitor/monitor.go
  4. 454 0
      internal/tunnelmonitor/monitor_test.go
  5. 28 1
      main.go

+ 13 - 0
.env.example

@@ -4,3 +4,16 @@ XUI_LOG_FOLDER=x-ui
 XUI_BIN_FOLDER=x-ui
 XUI_INIT_WEB_BASE_PATH=/
 # XUI_PORT=8080
+
+# Optional tunnel health monitor (disabled by default). It periodically probes a
+# URL and restarts xray-core after repeated failures. Point XUI_TUNNEL_HEALTH_PROXY
+# at a local xray inbound so the probe tests the tunnel; without it the probe only
+# checks host connectivity and a restart will not fix host network issues. A restart
+# drops every connected client.
+# XUI_TUNNEL_HEALTH_MONITOR=true
+# XUI_TUNNEL_HEALTH_PROXY=socks5://127.0.0.1:1080
+# XUI_TUNNEL_HEALTH_URL=https://www.cloudflare.com/cdn-cgi/trace
+# XUI_TUNNEL_HEALTH_INTERVAL=30s
+# XUI_TUNNEL_HEALTH_TIMEOUT=10s
+# XUI_TUNNEL_HEALTH_FAILURES=3
+# XUI_TUNNEL_HEALTH_COOLDOWN=5m

+ 7 - 0
README.md

@@ -146,6 +146,13 @@ docker run -d --cap-add=NET_ADMIN --cap-add=NET_RAW ... ghcr.io/mhsanaei/3x-ui
 | `XUI_ENABLE_FAIL2BAN` | Enable Fail2ban-based IP-limit enforcement | `true` |
 | `XUI_LOG_LEVEL` | Log verbosity (`debug`, `info`, `warning`, `error`) | `info` |
 | `XUI_DEBUG` | Enable debug mode | `false` |
+| `XUI_TUNNEL_HEALTH_MONITOR` | Enable the tunnel health monitor (probes a URL and restarts xray after repeated failures; a restart drops all clients) | `false` |
+| `XUI_TUNNEL_HEALTH_PROXY` | Proxy the probe is sent through; point it at a local xray inbound so the probe tests the tunnel (e.g. `socks5://127.0.0.1:1080`). Empty means the probe only checks host connectivity | — |
+| `XUI_TUNNEL_HEALTH_URL` | URL probed for tunnel health | `https://www.cloudflare.com/cdn-cgi/trace` |
+| `XUI_TUNNEL_HEALTH_INTERVAL` | Interval between probes | `30s` |
+| `XUI_TUNNEL_HEALTH_TIMEOUT` | Per-probe timeout | `10s` |
+| `XUI_TUNNEL_HEALTH_FAILURES` | Consecutive failures before a restart is triggered | `3` |
+| `XUI_TUNNEL_HEALTH_COOLDOWN` | Minimum delay between consecutive restarts | `5m` |
 
 ## Supported Languages
 

+ 271 - 0
internal/tunnelmonitor/monitor.go

@@ -0,0 +1,271 @@
+package tunnelmonitor
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/mhsanaei/3x-ui/v3/internal/logger"
+	"github.com/mhsanaei/3x-ui/v3/internal/util/netproxy"
+)
+
+const (
+	defaultHealthURL        = "https://www.cloudflare.com/cdn-cgi/trace"
+	defaultInterval         = 30 * time.Second
+	defaultTimeout          = 10 * time.Second
+	defaultFailureThreshold = 3
+	defaultCooldown         = 5 * time.Minute
+)
+
+// Config controls the optional tunnel health monitor.
+type Config struct {
+	Enabled          bool
+	URL              string
+	ProxyURL         string
+	Interval         time.Duration
+	Timeout          time.Duration
+	FailureThreshold int
+	Cooldown         time.Duration
+}
+
+// RecoveryFunc performs recovery after the monitor reaches the configured
+// failure threshold. The panel wires this to an Xray core restart.
+type RecoveryFunc func(context.Context) error
+
+// Monitor periodically probes a URL and triggers recovery after repeated
+// failures. It is intentionally independent from panel settings/UI so it can be
+// enabled safely through service environment variables first.
+type Monitor struct {
+	cfg          Config
+	client       *http.Client
+	recover      RecoveryFunc
+	failures     int
+	lastRecovery time.Time
+	now          func() time.Time
+}
+
+// DefaultConfig returns disabled-by-default monitor settings.
+func DefaultConfig() Config {
+	return Config{
+		Enabled:          false,
+		URL:              defaultHealthURL,
+		Interval:         defaultInterval,
+		Timeout:          defaultTimeout,
+		FailureThreshold: defaultFailureThreshold,
+		Cooldown:         defaultCooldown,
+	}
+}
+
+// ConfigFromEnv builds Config from XUI_TUNNEL_HEALTH_* environment variables.
+//
+// Supported variables:
+//   - XUI_TUNNEL_HEALTH_MONITOR=true
+//   - XUI_TUNNEL_HEALTH_URL=https://www.cloudflare.com/cdn-cgi/trace
+//   - XUI_TUNNEL_HEALTH_PROXY=socks5://127.0.0.1:1080
+//   - XUI_TUNNEL_HEALTH_INTERVAL=30s
+//   - XUI_TUNNEL_HEALTH_TIMEOUT=10s
+//   - XUI_TUNNEL_HEALTH_FAILURES=3
+//   - XUI_TUNNEL_HEALTH_COOLDOWN=5m
+func ConfigFromEnv() Config {
+	cfg := DefaultConfig()
+
+	cfg.Enabled = parseBool(os.Getenv("XUI_TUNNEL_HEALTH_MONITOR"))
+	cfg.URL = firstNonEmpty(os.Getenv("XUI_TUNNEL_HEALTH_URL"), cfg.URL)
+	cfg.ProxyURL = strings.TrimSpace(os.Getenv("XUI_TUNNEL_HEALTH_PROXY"))
+	cfg.Interval = parseDurationEnv("XUI_TUNNEL_HEALTH_INTERVAL", cfg.Interval)
+	cfg.Timeout = parseDurationEnv("XUI_TUNNEL_HEALTH_TIMEOUT", cfg.Timeout)
+	cfg.Cooldown = parseDurationEnv("XUI_TUNNEL_HEALTH_COOLDOWN", cfg.Cooldown)
+	cfg.FailureThreshold = parseIntEnv("XUI_TUNNEL_HEALTH_FAILURES", cfg.FailureThreshold)
+
+	return cfg.Normalize()
+}
+
+// Normalize applies safe bounds and defaults.
+func (c Config) Normalize() Config {
+	if strings.TrimSpace(c.URL) == "" {
+		c.URL = defaultHealthURL
+	}
+	c.URL = strings.TrimSpace(c.URL)
+	c.ProxyURL = strings.TrimSpace(c.ProxyURL)
+
+	if c.Interval < time.Second {
+		c.Interval = defaultInterval
+	}
+	if c.Timeout < time.Second {
+		c.Timeout = defaultTimeout
+	}
+	if c.FailureThreshold < 1 {
+		c.FailureThreshold = defaultFailureThreshold
+	}
+	if c.Cooldown < time.Second {
+		c.Cooldown = defaultCooldown
+	}
+
+	return c
+}
+
+// New creates a monitor with an HTTP client based on cfg.
+func New(cfg Config, recover RecoveryFunc) (*Monitor, error) {
+	cfg = cfg.Normalize()
+
+	client, err := netproxy.NewHTTPClient(cfg.ProxyURL, cfg.Timeout)
+	if err != nil {
+		return nil, err
+	}
+
+	return newWithClient(cfg, client, recover), nil
+}
+
+func newWithClient(cfg Config, client *http.Client, recover RecoveryFunc) *Monitor {
+	cfg = cfg.Normalize()
+	if client == nil {
+		client = &http.Client{Timeout: cfg.Timeout}
+	}
+
+	return &Monitor{
+		cfg:     cfg,
+		client:  client,
+		recover: recover,
+		now:     time.Now,
+	}
+}
+
+// Run starts the monitor loop until ctx is cancelled.
+func (m *Monitor) Run(ctx context.Context) {
+	if m == nil || !m.cfg.Enabled {
+		return
+	}
+
+	logger.Info("Tunnel health monitor enabled: ", m.cfg.URL)
+
+	ticker := time.NewTicker(m.cfg.Interval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			logger.Info("Tunnel health monitor stopped")
+			return
+		case <-ticker.C:
+			recovered, err := m.Step(ctx)
+			if err != nil {
+				logger.Warning("Tunnel health monitor check failed: ", err)
+			}
+			if recovered {
+				logger.Warning("Tunnel health monitor triggered Xray restart")
+			}
+		}
+	}
+}
+
+// Step performs one probe and maybe triggers recovery.
+func (m *Monitor) Step(ctx context.Context) (bool, error) {
+	if m == nil {
+		return false, errors.New("nil monitor")
+	}
+
+	if err := m.probe(ctx); err != nil {
+		m.failures++
+
+		if m.failures < m.cfg.FailureThreshold {
+			return false, fmt.Errorf("probe failed %d/%d: %w", m.failures, m.cfg.FailureThreshold, err)
+		}
+
+		now := m.now()
+		if !m.lastRecovery.IsZero() && now.Sub(m.lastRecovery) < m.cfg.Cooldown {
+			m.failures = m.cfg.FailureThreshold
+			return false, fmt.Errorf("probe failed %d/%d; recovery cooldown active: %w", m.failures, m.cfg.FailureThreshold, err)
+		}
+
+		if m.recover == nil {
+			m.failures = m.cfg.FailureThreshold
+			return false, errors.New("recovery function is not configured")
+		}
+
+		if recErr := m.recover(ctx); recErr != nil {
+			return false, fmt.Errorf("recovery failed after probe error %v: %w", err, recErr)
+		}
+
+		m.lastRecovery = now
+		m.failures = 0
+		return true, err
+	}
+
+	if m.failures > 0 {
+		logger.Info("Tunnel health monitor recovered after successful probe")
+	}
+	m.failures = 0
+	return false, nil
+}
+
+func (m *Monitor) probe(ctx context.Context) error {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, m.cfg.URL, nil)
+	if err != nil {
+		return err
+	}
+
+	resp, err := m.client.Do(req)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4096))
+
+	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusBadRequest {
+		return fmt.Errorf("unexpected HTTP status %d", resp.StatusCode)
+	}
+
+	return nil
+}
+
+func parseBool(value string) bool {
+	switch strings.ToLower(strings.TrimSpace(value)) {
+	case "1", "true", "yes", "y", "on", "enable", "enabled":
+		return true
+	default:
+		return false
+	}
+}
+
+func parseDurationEnv(name string, fallback time.Duration) time.Duration {
+	value := strings.TrimSpace(os.Getenv(name))
+	if value == "" {
+		return fallback
+	}
+
+	d, err := time.ParseDuration(value)
+	if err != nil {
+		return fallback
+	}
+
+	return d
+}
+
+func parseIntEnv(name string, fallback int) int {
+	value := strings.TrimSpace(os.Getenv(name))
+	if value == "" {
+		return fallback
+	}
+
+	n, err := strconv.Atoi(value)
+	if err != nil {
+		return fallback
+	}
+
+	return n
+}
+
+func firstNonEmpty(value string, fallback string) string {
+	value = strings.TrimSpace(value)
+	if value == "" {
+		return fallback
+	}
+	return value
+}

+ 454 - 0
internal/tunnelmonitor/monitor_test.go

@@ -0,0 +1,454 @@
+package tunnelmonitor
+
+import (
+	"context"
+	"errors"
+	"net/http"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/mhsanaei/3x-ui/v3/internal/logger"
+	"github.com/op/go-logging"
+)
+
+func TestMain(m *testing.M) {
+	logger.InitLogger(logging.ERROR)
+	m.Run()
+}
+
+type roundTripFunc func(*http.Request) (*http.Response, error)
+
+func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) {
+	return f(req)
+}
+
+func TestMonitorRestartsAfterFailureThreshold(t *testing.T) {
+	cfg := Config{
+		Enabled:          true,
+		URL:              "http://example.test",
+		Interval:         time.Minute,
+		Timeout:          time.Second,
+		FailureThreshold: 2,
+		Cooldown:         time.Minute,
+	}
+
+	client := &http.Client{
+		Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) {
+			return nil, errors.New("tunnel down")
+		}),
+	}
+
+	restarts := 0
+	monitor := newWithClient(cfg, client, func(ctx context.Context) error {
+		restarts++
+		return nil
+	})
+
+	monitor.now = func() time.Time {
+		return time.Unix(100, 0)
+	}
+
+	if recovered, _ := monitor.Step(context.Background()); recovered {
+		t.Fatal("first failure must not trigger recovery")
+	}
+
+	if recovered, _ := monitor.Step(context.Background()); !recovered {
+		t.Fatal("second consecutive failure should trigger recovery")
+	}
+
+	if restarts != 1 {
+		t.Fatalf("expected 1 recovery, got %d", restarts)
+	}
+}
+
+func TestMonitorRespectsRecoveryCooldown(t *testing.T) {
+	cfg := Config{
+		Enabled:          true,
+		URL:              "http://example.test",
+		Interval:         time.Minute,
+		Timeout:          time.Second,
+		FailureThreshold: 1,
+		Cooldown:         time.Minute,
+	}
+
+	client := &http.Client{
+		Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) {
+			return nil, errors.New("tunnel down")
+		}),
+	}
+
+	now := time.Unix(100, 0)
+	restarts := 0
+
+	monitor := newWithClient(cfg, client, func(ctx context.Context) error {
+		restarts++
+		return nil
+	})
+
+	monitor.now = func() time.Time {
+		return now
+	}
+
+	recovered, _ := monitor.Step(context.Background())
+	if !recovered {
+		t.Fatal("first failure should trigger recovery when threshold is 1")
+	}
+
+	recovered, _ = monitor.Step(context.Background())
+	if recovered {
+		t.Fatal("cooldown should suppress immediate second recovery")
+	}
+
+	if restarts != 1 {
+		t.Fatalf("expected 1 recovery during cooldown, got %d", restarts)
+	}
+
+	now = now.Add(time.Minute + time.Second)
+
+	recovered, _ = monitor.Step(context.Background())
+	if !recovered {
+		t.Fatal("recovery should be allowed after cooldown")
+	}
+
+	if restarts != 2 {
+		t.Fatalf("expected 2 recoveries after cooldown, got %d", restarts)
+	}
+}
+
+func TestMonitorSuccessResetsFailures(t *testing.T) {
+	cfg := Config{
+		Enabled:          true,
+		URL:              "http://example.test",
+		Interval:         time.Minute,
+		Timeout:          time.Second,
+		FailureThreshold: 2,
+		Cooldown:         time.Minute,
+	}
+
+	fail := true
+	client := &http.Client{
+		Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) {
+			if fail {
+				return nil, errors.New("tunnel down")
+			}
+
+			return &http.Response{
+				StatusCode: http.StatusOK,
+				Body:       http.NoBody,
+			}, nil
+		}),
+	}
+
+	restarts := 0
+	monitor := newWithClient(cfg, client, func(ctx context.Context) error {
+		restarts++
+		return nil
+	})
+
+	_, _ = monitor.Step(context.Background())
+
+	fail = false
+	if recovered, err := monitor.Step(context.Background()); recovered || err != nil {
+		t.Fatalf("successful probe should not recover or fail, recovered=%v err=%v", recovered, err)
+	}
+
+	fail = true
+	if recovered, _ := monitor.Step(context.Background()); recovered {
+		t.Fatal("failure after success should be counted as first failure again")
+	}
+
+	if restarts != 0 {
+		t.Fatalf("expected no recovery, got %d", restarts)
+	}
+}
+
+func TestConfigFromEnvParsesValues(t *testing.T) {
+	t.Setenv("XUI_TUNNEL_HEALTH_MONITOR", "true")
+	t.Setenv("XUI_TUNNEL_HEALTH_URL", "https://example.com/health")
+	t.Setenv("XUI_TUNNEL_HEALTH_PROXY", "socks5://127.0.0.1:1080")
+	t.Setenv("XUI_TUNNEL_HEALTH_INTERVAL", "15s")
+	t.Setenv("XUI_TUNNEL_HEALTH_TIMEOUT", "3s")
+	t.Setenv("XUI_TUNNEL_HEALTH_FAILURES", "4")
+	t.Setenv("XUI_TUNNEL_HEALTH_COOLDOWN", "2m")
+
+	cfg := ConfigFromEnv()
+
+	if !cfg.Enabled {
+		t.Fatal("expected monitor to be enabled")
+	}
+
+	if cfg.URL != "https://example.com/health" {
+		t.Fatalf("unexpected URL: %s", cfg.URL)
+	}
+
+	if !strings.HasPrefix(cfg.ProxyURL, "socks5://") {
+		t.Fatalf("unexpected proxy URL: %s", cfg.ProxyURL)
+	}
+
+	if cfg.Interval != 15*time.Second {
+		t.Fatalf("unexpected interval: %s", cfg.Interval)
+	}
+
+	if cfg.Timeout != 3*time.Second {
+		t.Fatalf("unexpected timeout: %s", cfg.Timeout)
+	}
+
+	if cfg.FailureThreshold != 4 {
+		t.Fatalf("unexpected threshold: %d", cfg.FailureThreshold)
+	}
+
+	if cfg.Cooldown != 2*time.Minute {
+		t.Fatalf("unexpected cooldown: %s", cfg.Cooldown)
+	}
+}
+
+func failingClient() *http.Client {
+	return &http.Client{
+		Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) {
+			return nil, errors.New("tunnel down")
+		}),
+	}
+}
+
+func statusClient(code int) *http.Client {
+	return &http.Client{
+		Transport: roundTripFunc(func(req *http.Request) (*http.Response, error) {
+			return &http.Response{StatusCode: code, Body: http.NoBody}, nil
+		}),
+	}
+}
+
+func TestProbeStatusCodeClassification(t *testing.T) {
+	cases := []struct {
+		status  int
+		healthy bool
+	}{
+		{199, false},
+		{200, true},
+		{204, true},
+		{301, true},
+		{399, true},
+		{400, false},
+		{404, false},
+		{500, false},
+	}
+
+	for _, tc := range cases {
+		cfg := Config{
+			Enabled:          true,
+			URL:              "http://example.test",
+			Interval:         time.Minute,
+			Timeout:          time.Second,
+			FailureThreshold: 100,
+			Cooldown:         time.Minute,
+		}
+
+		monitor := newWithClient(cfg, statusClient(tc.status), func(ctx context.Context) error {
+			return nil
+		})
+
+		recovered, err := monitor.Step(context.Background())
+		if recovered {
+			t.Fatalf("status %d: unexpected recovery", tc.status)
+		}
+		if tc.healthy && err != nil {
+			t.Fatalf("status %d: expected healthy probe, got error %v", tc.status, err)
+		}
+		if !tc.healthy && err == nil {
+			t.Fatalf("status %d: expected failure, got nil error", tc.status)
+		}
+	}
+}
+
+func TestNormalizeClampsBounds(t *testing.T) {
+	got := Config{
+		URL:              "   ",
+		Interval:         0,
+		Timeout:          500 * time.Millisecond,
+		FailureThreshold: 0,
+		Cooldown:         0,
+	}.Normalize()
+
+	if got.URL != defaultHealthURL {
+		t.Fatalf("URL not defaulted: %q", got.URL)
+	}
+	if got.Interval != defaultInterval {
+		t.Fatalf("Interval not clamped: %s", got.Interval)
+	}
+	if got.Timeout != defaultTimeout {
+		t.Fatalf("Timeout not clamped: %s", got.Timeout)
+	}
+	if got.FailureThreshold != defaultFailureThreshold {
+		t.Fatalf("FailureThreshold not clamped: %d", got.FailureThreshold)
+	}
+	if got.Cooldown != defaultCooldown {
+		t.Fatalf("Cooldown not clamped: %s", got.Cooldown)
+	}
+
+	valid := Config{
+		URL:              "https://example.com/health",
+		Interval:         15 * time.Second,
+		Timeout:          3 * time.Second,
+		FailureThreshold: 5,
+		Cooldown:         2 * time.Minute,
+	}.Normalize()
+
+	if valid.URL != "https://example.com/health" ||
+		valid.Interval != 15*time.Second ||
+		valid.Timeout != 3*time.Second ||
+		valid.FailureThreshold != 5 ||
+		valid.Cooldown != 2*time.Minute {
+		t.Fatalf("valid config was mutated: %+v", valid)
+	}
+}
+
+func TestNewRejectsUnsupportedProxyScheme(t *testing.T) {
+	m, err := New(Config{ProxyURL: "ftp://127.0.0.1:21"}, func(ctx context.Context) error {
+		return nil
+	})
+	if err == nil || m != nil {
+		t.Fatalf("expected error and nil monitor for bad scheme, got m=%v err=%v", m, err)
+	}
+
+	m, err = New(Config{}, func(ctx context.Context) error {
+		return nil
+	})
+	if err != nil || m == nil {
+		t.Fatalf("expected a valid monitor for empty proxy, got m=%v err=%v", m, err)
+	}
+}
+
+func TestMonitorRecoveryErrorDoesNotArmCooldown(t *testing.T) {
+	cfg := Config{
+		Enabled:          true,
+		URL:              "http://example.test",
+		Interval:         time.Minute,
+		Timeout:          time.Second,
+		FailureThreshold: 1,
+		Cooldown:         time.Minute,
+	}
+
+	attempts := 0
+	monitor := newWithClient(cfg, failingClient(), func(ctx context.Context) error {
+		attempts++
+		return errors.New("restart failed")
+	})
+	monitor.now = func() time.Time {
+		return time.Unix(100, 0)
+	}
+
+	recovered, err := monitor.Step(context.Background())
+	if recovered || err == nil {
+		t.Fatalf("failed recovery must report recovered=false with an error, got recovered=%v err=%v", recovered, err)
+	}
+	if !monitor.lastRecovery.IsZero() {
+		t.Fatal("a failed recovery must not arm the cooldown")
+	}
+
+	if _, err := monitor.Step(context.Background()); err == nil {
+		t.Fatal("expected error on the second failing step")
+	}
+	if attempts != 2 {
+		t.Fatalf("recovery should be retried (no cooldown) after a failure, attempts=%d", attempts)
+	}
+}
+
+func TestMonitorNilRecoverStaysBounded(t *testing.T) {
+	cfg := Config{
+		Enabled:          true,
+		URL:              "http://example.test",
+		Interval:         time.Minute,
+		Timeout:          time.Second,
+		FailureThreshold: 2,
+		Cooldown:         time.Minute,
+	}
+
+	monitor := newWithClient(cfg, failingClient(), nil)
+
+	for i := 0; i < 5; i++ {
+		recovered, _ := monitor.Step(context.Background())
+		if recovered {
+			t.Fatal("a nil recovery func must never report recovery")
+		}
+		if monitor.failures > cfg.FailureThreshold {
+			t.Fatalf("failures must stay capped at threshold %d, got %d", cfg.FailureThreshold, monitor.failures)
+		}
+	}
+}
+
+func TestMonitorFailuresCappedDuringCooldown(t *testing.T) {
+	cfg := Config{
+		Enabled:          true,
+		URL:              "http://example.test",
+		Interval:         time.Minute,
+		Timeout:          time.Second,
+		FailureThreshold: 2,
+		Cooldown:         time.Minute,
+	}
+
+	restarts := 0
+	monitor := newWithClient(cfg, failingClient(), func(ctx context.Context) error {
+		restarts++
+		return nil
+	})
+	monitor.now = func() time.Time {
+		return time.Unix(100, 0)
+	}
+
+	monitor.Step(context.Background())
+	if recovered, _ := monitor.Step(context.Background()); !recovered {
+		t.Fatal("expected recovery once the threshold is reached")
+	}
+
+	for i := 0; i < 6; i++ {
+		monitor.Step(context.Background())
+		if monitor.failures > cfg.FailureThreshold {
+			t.Fatalf("failures must never exceed threshold %d during cooldown, got %d", cfg.FailureThreshold, monitor.failures)
+		}
+	}
+
+	if restarts != 1 {
+		t.Fatalf("cooldown should suppress further recoveries, restarts=%d", restarts)
+	}
+}
+
+func TestMonitorRunStopsOnContextCancel(t *testing.T) {
+	cfg := Config{
+		Enabled:          true,
+		URL:              "http://example.test",
+		Timeout:          time.Second,
+		FailureThreshold: 1,
+		Cooldown:         time.Hour,
+	}
+
+	recovered := make(chan struct{})
+	var once sync.Once
+	monitor := newWithClient(cfg, failingClient(), func(ctx context.Context) error {
+		once.Do(func() { close(recovered) })
+		return nil
+	})
+	monitor.cfg.Interval = 5 * time.Millisecond
+
+	ctx, cancel := context.WithCancel(context.Background())
+	done := make(chan struct{})
+	go func() {
+		monitor.Run(ctx)
+		close(done)
+	}()
+
+	select {
+	case <-recovered:
+	case <-time.After(2 * time.Second):
+		cancel()
+		t.Fatal("Run did not trigger recovery within the deadline")
+	}
+
+	cancel()
+	select {
+	case <-done:
+	case <-time.After(2 * time.Second):
+		t.Fatal("Run did not return after context cancellation")
+	}
+}

+ 28 - 1
main.go

@@ -3,6 +3,7 @@
 package main
 
 import (
+	"context"
 	"flag"
 	"fmt"
 	"log"
@@ -18,6 +19,7 @@ import (
 	"github.com/mhsanaei/3x-ui/v3/internal/database"
 	"github.com/mhsanaei/3x-ui/v3/internal/logger"
 	"github.com/mhsanaei/3x-ui/v3/internal/sub"
+	"github.com/mhsanaei/3x-ui/v3/internal/tunnelmonitor"
 	"github.com/mhsanaei/3x-ui/v3/internal/util/crypto"
 	"github.com/mhsanaei/3x-ui/v3/internal/util/sys"
 	"github.com/mhsanaei/3x-ui/v3/internal/web"
@@ -91,7 +93,7 @@ func runWebServer() {
 		return
 	}
 
-	sigCh := make(chan os.Signal, 1)
+	sigCh := make(chan os.Signal, 8)
 	// Trap shutdown signals
 	signal.Notify(sigCh, syscall.SIGHUP, syscall.SIGTERM, sys.SIGUSR1, os.Interrupt)
 	global.SetRestartHook(func() {
@@ -100,6 +102,27 @@ func runWebServer() {
 		default:
 		}
 	})
+
+	var stopTunnelHealthMonitor context.CancelFunc
+	monitorCfg := tunnelmonitor.ConfigFromEnv()
+	if monitorCfg.Enabled {
+		if monitorCfg.ProxyURL == "" {
+			logger.Warning("Tunnel health monitor enabled without XUI_TUNNEL_HEALTH_PROXY: the probe measures host connectivity, not the xray tunnel, so failures will restart xray without fixing host network issues")
+		}
+
+		monitorCtx, cancel := context.WithCancel(context.Background())
+		stopTunnelHealthMonitor = cancel
+
+		monitor, err := tunnelmonitor.New(monitorCfg, func(_ context.Context) error {
+			logger.Warning("Tunnel health monitor threshold reached, restarting xray-core")
+			return server.RestartXray()
+		})
+		if err != nil {
+			logger.Warning("Tunnel health monitor disabled: ", err)
+		} else {
+			go monitor.Run(monitorCtx)
+		}
+	}
 	for {
 		sig := <-sigCh
 
@@ -142,6 +165,10 @@ func runWebServer() {
 			}
 
 		default:
+			if stopTunnelHealthMonitor != nil {
+				stopTunnelHealthMonitor()
+			}
+
 			// --- FIX FOR TELEGRAM BOT CONFLICT (409) on full shutdown ---
 			tgbot.StopBot()
 			// ------------------------------------------------------------