|
|
@@ -13,6 +13,7 @@ import (
|
|
|
"runtime"
|
|
|
"strconv"
|
|
|
"strings"
|
|
|
+ "sync"
|
|
|
"syscall"
|
|
|
"time"
|
|
|
|
|
|
@@ -44,10 +45,64 @@ const (
|
|
|
// devReleaseTag is the fixed-tag rolling pre-release the CI force-moves to the
|
|
|
// newest main commit; the dev update channel installs from it.
|
|
|
devReleaseTag = "dev-latest"
|
|
|
+
|
|
|
+ updateStatePending = "pending"
|
|
|
+ updateStateSuccess = "success"
|
|
|
+ updateStateFailed = "failed"
|
|
|
)
|
|
|
|
|
|
// PanelUpdateStatus describes the result of the most recently launched panel
// self-update as decoded from the update status file.
//
// Fields:
//   - RunID identifies the run the status belongs to, so a caller can tell the
//     update it started apart from a stale result left by an earlier run. It is
//     carried as a decimal string rather than a JSON number because the value
//     is a formatted UnixNano timestamp, which exceeds JavaScript's
//     Number.MAX_SAFE_INTEGER; as a number, two distinct runs could round to
//     the same value on the wire.
//   - State is one of "pending", "success", or "failed".
//   - ExitCode is presumably the updater script's exit status (it is written
//     by the script, not by this file -- confirm against update.sh).
//   - FinishedAt looks like a Unix timestamp in seconds, judging by the
//     example value -- confirm against update.sh.
type PanelUpdateStatus struct {
	RunID      string `json:"runId" example:"1735689600123456789"`
	State      string `json:"state" example:"success"`
	ExitCode   int    `json:"exitCode" example:"0"`
	FinishedAt int64  `json:"finishedAt" example:"1735689612"`
}
|
|
|
+
|
|
|
// releaseCommitRegex matches a "commit=<hash>" marker case-insensitively and
// captures the hash, which must be 7 to 40 hexadecimal digits.
var releaseCommitRegex = regexp.MustCompile(`(?i)commit=([0-9a-f]{7,40})`)
|
|
|
|
|
|
// Single-slot bookkeeping for panel self-updates.
//
// Only one self-update may be in flight at a time: two concurrent update.sh
// runs would race each other extracting the release tarball and swapping the
// service unit. The variables below record who currently holds the slot and
// are all protected by updateMu.
//
// The slot is handed to a new run when any of the following holds:
//
//   - no run holds it;
//   - the holder's own status file reports success or failure. The file's run
//     ID is compared with updateRunID so that a stale file from an even
//     earlier run cannot be mistaken for the holder finishing, and so that a
//     fast failure does not lock out a retry;
//   - the holder is older than updateStaleAfter and the recorded process
//     (updatePID, checked via processAlive) is not confirmed alive. Liveness
//     takes priority over wall-clock time because the updater performs package
//     installation and several downloads, which can legitimately run long on
//     a slow or throttled host;
//   - the holder is older than updateHardCeiling, regardless of liveness, so
//     a run that is alive but hung forever can never block retries
//     permanently.
//
// On the systemd-run launch path no PID is recorded (the observable process,
// systemd-run itself, has already exited when startUpdate returns), so for
// that path staleness is a pure wall-clock heuristic based on
// updateStaleAfter.
var (
	updateMu      sync.Mutex // guards every variable in this block
	updateRunning bool       // true while a run holds the slot
	updateStarted time.Time  // when the current holder acquired the slot
	updateRunID   int64      // run ID of the current holder
	updatePID     int        // PID set by recordUpdatePID; 0 when none is recorded
)
|
|
|
+
|
|
|
// Age thresholds applied by acquireUpdateSlot to a run that holds the update
// slot without having reported a terminal status.
const (
	// updateStaleAfter is the age past which the holder is treated as
	// abandoned, unless its recorded process is confirmed still alive.
	updateStaleAfter = 20 * time.Minute

	// updateHardCeiling is the age past which the slot is granted
	// unconditionally, even if the holder's process is still alive.
	updateHardCeiling = 2 * time.Hour
)
|
|
|
+
|
|
|
func (s *PanelService) RestartPanel(delay time.Duration) error {
|
|
|
go func() {
|
|
|
time.Sleep(delay)
|
|
|
@@ -122,39 +177,77 @@ func getDevUpdateInfo() (*PanelUpdateInfo, error) {
|
|
|
}, nil
|
|
|
}
|
|
|
|
|
|
-// StartUpdate starts the official updater using this panel's own channel setting.
|
|
|
-func (s *PanelService) StartUpdate() error {
|
|
|
+// StartUpdate starts the official updater using this panel's own channel
|
|
|
+// setting. Returns the run ID to pass to GetUpdateStatus so the caller can
|
|
|
+// tell this run's result apart from a stale one.
|
|
|
+func (s *PanelService) StartUpdate() (int64, error) {
|
|
|
return s.startUpdate(devChannelActive())
|
|
|
}
|
|
|
|
|
|
// StartUpdateChannel runs the updater against an explicitly chosen channel,
|
|
|
// overriding the local dev-channel setting. Used by the master node updater so
|
|
|
// a node can be moved to the dev channel from the central panel.
|
|
|
-func (s *PanelService) StartUpdateChannel(dev bool) error {
|
|
|
+func (s *PanelService) StartUpdateChannel(dev bool) (int64, error) {
|
|
|
return s.startUpdate(dev)
|
|
|
}
|
|
|
|
|
|
-func (s *PanelService) startUpdate(useDev bool) error {
|
|
|
+// GetUpdateStatus reports the outcome of the most recently launched panel
|
|
|
+// self-update, as recorded by update.sh's EXIT trap (see the script for why
|
|
|
+// that covers every exit path, not just the happy one). This is a best-effort
|
|
|
+// side channel: a missing or unreadable status file reads as "pending"
|
|
|
+// rather than an error, since the update itself is what matters, not this
|
|
|
+// status file.
|
|
|
+func (s *PanelService) GetUpdateStatus() *PanelUpdateStatus {
|
|
|
+ data, err := os.ReadFile(config.GetUpdateStatusFilePath())
|
|
|
+ if err != nil {
|
|
|
+ return &PanelUpdateStatus{State: updateStatePending}
|
|
|
+ }
|
|
|
+ var status PanelUpdateStatus
|
|
|
+ if err := json.Unmarshal(data, &status); err != nil {
|
|
|
+ return &PanelUpdateStatus{State: updateStatePending}
|
|
|
+ }
|
|
|
+ if status.State != updateStateSuccess && status.State != updateStateFailed {
|
|
|
+ status.State = updateStatePending
|
|
|
+ }
|
|
|
+ return &status
|
|
|
+}
|
|
|
+
|
|
|
+func (s *PanelService) startUpdate(useDev bool) (int64, error) {
|
|
|
+ runID := time.Now().UnixNano()
|
|
|
+ if !acquireUpdateSlot(runID) {
|
|
|
+ return 0, fmt.Errorf("a panel update is already in progress")
|
|
|
+ }
|
|
|
+ launched := false
|
|
|
+ defer func() {
|
|
|
+ if !launched {
|
|
|
+ releaseUpdateSlot()
|
|
|
+ }
|
|
|
+ }()
|
|
|
+
|
|
|
if runtime.GOOS != "linux" {
|
|
|
- return fmt.Errorf("panel web update is supported only on Linux installations")
|
|
|
+ return 0, fmt.Errorf("panel web update is supported only on Linux installations")
|
|
|
}
|
|
|
|
|
|
bash, err := exec.LookPath("bash")
|
|
|
if err != nil {
|
|
|
- return fmt.Errorf("bash is required to run the panel updater: %w", err)
|
|
|
+ return 0, fmt.Errorf("bash is required to run the panel updater: %w", err)
|
|
|
}
|
|
|
|
|
|
scriptPath, err := downloadPanelUpdater()
|
|
|
if err != nil {
|
|
|
- return err
|
|
|
+ return 0, err
|
|
|
}
|
|
|
|
|
|
+ statusFile := config.GetUpdateStatusFilePath()
|
|
|
+
|
|
|
mainFolder, serviceFolder := resolveUpdateFolders()
|
|
|
updateTag := ""
|
|
|
if useDev {
|
|
|
updateTag = devReleaseTag
|
|
|
}
|
|
|
updateScript := fmt.Sprintf("set -e; trap 'rm -f %s' EXIT; %s %s", shellQuote(scriptPath), shellQuote(bash), shellQuote(scriptPath))
|
|
|
+ runIDEnv := "XUI_UPDATE_RUN_ID=" + strconv.FormatInt(runID, 10)
|
|
|
+ statusFileEnv := "XUI_UPDATE_STATUS_FILE=" + statusFile
|
|
|
|
|
|
if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
|
|
|
unitName := fmt.Sprintf("x-ui-web-update-%d", time.Now().Unix())
|
|
|
@@ -163,6 +256,8 @@ func (s *PanelService) startUpdate(useDev bool) error {
|
|
|
"--setenv", "XUI_MAIN_FOLDER="+mainFolder,
|
|
|
"--setenv", "XUI_SERVICE="+serviceFolder,
|
|
|
"--setenv", "XUI_UPDATE_TAG="+updateTag,
|
|
|
+ "--setenv", runIDEnv,
|
|
|
+ "--setenv", statusFileEnv,
|
|
|
bash, "-lc", updateScript,
|
|
|
)
|
|
|
out, err := cmd.CombinedOutput()
|
|
|
@@ -171,12 +266,13 @@ func (s *PanelService) startUpdate(useDev bool) error {
|
|
|
if !strings.Contains(output, "System has not been booted with systemd") &&
|
|
|
!strings.Contains(output, "Failed to connect to bus") {
|
|
|
_ = os.Remove(scriptPath)
|
|
|
- return fmt.Errorf("failed to start panel update job: %w: %s", err, output)
|
|
|
+ return 0, fmt.Errorf("failed to start panel update job: %w: %s", err, output)
|
|
|
}
|
|
|
logger.Warning("systemd-run is unavailable, falling back to detached update process:", output)
|
|
|
} else {
|
|
|
logger.Infof("started panel update job via systemd-run unit %s", unitName)
|
|
|
- return nil
|
|
|
+ launched = true
|
|
|
+ return runID, nil
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -185,17 +281,77 @@ func (s *PanelService) startUpdate(useDev bool) error {
|
|
|
"XUI_MAIN_FOLDER="+mainFolder,
|
|
|
"XUI_SERVICE="+serviceFolder,
|
|
|
"XUI_UPDATE_TAG="+updateTag,
|
|
|
+ runIDEnv,
|
|
|
+ statusFileEnv,
|
|
|
)
|
|
|
setDetachedProcess(cmd)
|
|
|
if err := cmd.Start(); err != nil {
|
|
|
_ = os.Remove(scriptPath)
|
|
|
- return fmt.Errorf("failed to start panel update job: %w", err)
|
|
|
+ return 0, fmt.Errorf("failed to start panel update job: %w", err)
|
|
|
}
|
|
|
if err := cmd.Process.Release(); err != nil {
|
|
|
logger.Warning("failed to release panel update process:", err)
|
|
|
}
|
|
|
logger.Infof("started panel update job with pid %d", cmd.Process.Pid)
|
|
|
- return nil
|
|
|
+ recordUpdatePID(cmd.Process.Pid)
|
|
|
+ launched = true
|
|
|
+ return runID, nil
|
|
|
+}
|
|
|
+
|
|
|
+// acquireUpdateSlot claims the single in-flight-update slot for runID. It
|
|
|
+// refuses while another run is genuinely still in flight, but grants the
|
|
|
+// slot immediately once that run's own status file reports a terminal
|
|
|
+// result (success or failure) -- a fast failure shouldn't force the next
|
|
|
+// attempt to wait out updateStaleAfter for no reason. Past updateStaleAfter
|
|
|
+// with no terminal status yet, it grants the slot anyway UNLESS the process
|
|
|
+// we recorded (updatePID) is confirmed still alive, so a merely-slow run
|
|
|
+// isn't mistaken for a crashed one; past updateHardCeiling it grants the
|
|
|
+// slot unconditionally regardless of liveness, so a truly wedged run can
|
|
|
+// never lock out retries forever.
|
|
|
+func acquireUpdateSlot(runID int64) bool {
|
|
|
+ updateMu.Lock()
|
|
|
+ defer updateMu.Unlock()
|
|
|
+ if updateRunning && !previousRunIsTerminal() {
|
|
|
+ elapsed := time.Since(updateStarted)
|
|
|
+ if elapsed < updateHardCeiling {
|
|
|
+ stale := elapsed >= updateStaleAfter
|
|
|
+ alive := updatePID > 0 && processAlive(updatePID)
|
|
|
+ if !stale || alive {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ updateRunning = true
|
|
|
+ updateStarted = time.Now()
|
|
|
+ updateRunID = runID
|
|
|
+ updatePID = 0
|
|
|
+ return true
|
|
|
+}
|
|
|
+
|
|
|
+// recordUpdatePID notes the PID of the detached update.sh process the
|
|
|
+// current slot is tracking, so a later acquireUpdateSlot call can check
|
|
|
+// whether it is actually still running instead of only how long ago it
|
|
|
+// started. Only reachable for the detached-fallback launch path -- the
|
|
|
+// systemd-run path never learns update.sh's own PID, since the process it
|
|
|
+// directly observes (systemd-run) has already exited by the time it returns.
|
|
|
+func recordUpdatePID(pid int) {
|
|
|
+ updateMu.Lock()
|
|
|
+ updatePID = pid
|
|
|
+ updateMu.Unlock()
|
|
|
+}
|
|
|
+
|
|
|
+// previousRunIsTerminal reports whether the run currently recorded in
|
|
|
+// updateRunID has reached success or failure per its status file. Must be
|
|
|
+// called with updateMu held.
|
|
|
+func previousRunIsTerminal() bool {
|
|
|
+ status := (&PanelService{}).GetUpdateStatus()
|
|
|
+ return status.RunID == strconv.FormatInt(updateRunID, 10) && status.State != updateStatePending
|
|
|
+}
|
|
|
+
|
|
|
+func releaseUpdateSlot() {
|
|
|
+ updateMu.Lock()
|
|
|
+ updateRunning = false
|
|
|
+ updateMu.Unlock()
|
|
|
}
|
|
|
|
|
|
func downloadPanelUpdater() (string, error) {
|
|
|
@@ -230,6 +386,9 @@ func downloadPanelUpdater() (string, error) {
|
|
|
if err != nil {
|
|
|
return "", fmt.Errorf("write panel updater: %w", err)
|
|
|
}
|
|
|
+ if n == 0 {
|
|
|
+ return "", fmt.Errorf("panel updater download is empty")
|
|
|
+ }
|
|
|
if n > maxPanelUpdaterBytes {
|
|
|
return "", fmt.Errorf("panel updater exceeds %d bytes", maxPanelUpdaterBytes)
|
|
|
}
|