Преглед на файлове

Fix IP Limit continuous ban loop from stale DB entries (#4077)

After 60abeaa flipped the excess-IP selector to "oldest wins,
newest loses" (to protect the original/current connections), the
per-client IP table in `inbound_client_ips.ips` never evicted IPs
that stopped connecting. Their stored timestamp stayed ancient, so
on every subsequent run they counted as the "oldest protected"
slot(s) and whichever IP was actually using the config now was
classified as "new excess" and re-banned via fail2ban.

This is exactly the #4077 scenario: two IPs connect once and get
recorded, the ban lifts after the configured duration, the lone
legitimate IP that reconnects gets banned again, and again, and
again — a permanent 3xipl.log loop with no real abuser anywhere.

Fix: when merging the persisted `old` list with the freshly
observed `new` log lines, drop entries whose last-seen timestamp
is older than `ipStaleAfterSeconds` (30 minutes). A client that's
actually still active refreshes its timestamp any time xray emits
a new `accepted` line for a fresh TCP, so the cutoff is far above
even idle streaming sessions; a client that's genuinely gone falls
out of the table in bounded time and frees its slot.

Extracted the merge into `mergeClientIps` so it can be exercised
by unit tests without spinning up the full DB-backed job.

Tests cover:
- stale old entry is dropped (the #4077 regression)
- fresh old entries are still carried forward (access-log rotation
  is still backed by the persisted table)
- newer timestamp wins when the same IP appears in both lists
- a clock-skewed old `new` entry can't resurrect a stale IP
- a zero cutoff never over-evicts

Closes #4077
pwnnex преди 3 дни
родител
ревизия
eef2d311f4
променени са 2 файла, в които са добавени 124 реда и са изтрити 10 реда
  1. 47 10
      web/job/check_client_ip_job.go
  2. 77 0
      web/job/check_client_ip_job_test.go

+ 47 - 10
web/job/check_client_ip_job.go

@@ -35,6 +35,25 @@ var job *CheckClientIpJob
 
 const defaultXrayAPIPort = 62789
 
+// ipStaleAfterSeconds controls how long a client IP kept in the
+// per-client tracking table (model.InboundClientIps.Ips) is considered
+// still "active" before it's evicted during the next scan.
+//
+// Without this eviction, an IP that connected once and then went away
+// keeps sitting in the table with its old timestamp. Because the
+// excess-IP selector sorts ascending ("oldest wins, newest loses") to
+// protect the original/current connections, that stale entry keeps
+// occupying a slot and the IP that is *actually* currently using the
+// config gets classified as "new excess" and banned by fail2ban on
+// every single run — producing the continuous ban loop from #4077.
+//
+// 30 minutes is chosen so an actively-streaming client (where xray
+// emits a fresh `accepted` log line whenever it opens a new TCP) will
+// always refresh its timestamp well within the window, but a client
+// that has really stopped using the config will drop out of the table
+// in a bounded time and free its slot.
+const ipStaleAfterSeconds = int64(30 * 60)
+
 // NewCheckClientIpJob creates a new client IP monitoring job instance.
 func NewCheckClientIpJob() *CheckClientIpJob {
 	job = new(CheckClientIpJob)
@@ -202,6 +221,31 @@ func (j *CheckClientIpJob) processLogFile() bool {
 	return shouldCleanLog
 }
 
+// mergeClientIps combines the persisted (old) and freshly observed (new)
+// IP-with-timestamp lists for a single client into a map. An entry is
+// dropped if its last-seen timestamp is older than staleCutoff.
+//
+// Extracted as a helper so updateInboundClientIps can stay DB-oriented
+// and the merge policy can be exercised by a unit test.
+func mergeClientIps(old, new []IPWithTimestamp, staleCutoff int64) map[string]int64 {
+	ipMap := make(map[string]int64, len(old)+len(new))
+	for _, ipTime := range old {
+		if ipTime.Timestamp < staleCutoff {
+			continue
+		}
+		ipMap[ipTime.IP] = ipTime.Timestamp
+	}
+	for _, ipTime := range new {
+		if ipTime.Timestamp < staleCutoff {
+			continue
+		}
+		if existingTime, ok := ipMap[ipTime.IP]; !ok || ipTime.Timestamp > existingTime {
+			ipMap[ipTime.IP] = ipTime.Timestamp
+		}
+	}
+	return ipMap
+}
+
 func (j *CheckClientIpJob) checkFail2BanInstalled() bool {
 	cmd := "fail2ban-client"
 	args := []string{"-h"}
@@ -310,16 +354,9 @@ func (j *CheckClientIpJob) updateInboundClientIps(inboundClientIps *model.Inboun
 		json.Unmarshal([]byte(inboundClientIps.Ips), &oldIpsWithTime)
 	}
 
-	// Merge old and new IPs, keeping the latest timestamp for each IP
-	ipMap := make(map[string]int64)
-	for _, ipTime := range oldIpsWithTime {
-		ipMap[ipTime.IP] = ipTime.Timestamp
-	}
-	for _, ipTime := range newIpsWithTime {
-		if existingTime, ok := ipMap[ipTime.IP]; !ok || ipTime.Timestamp > existingTime {
-			ipMap[ipTime.IP] = ipTime.Timestamp
-		}
-	}
+	// Merge old and new IPs, evicting entries that haven't been
+	// re-observed in a while. See mergeClientIps / #4077 for why.
+	ipMap := mergeClientIps(oldIpsWithTime, newIpsWithTime, time.Now().Unix()-ipStaleAfterSeconds)
 
 	// Convert back to slice and sort by timestamp (oldest first)
 	// This ensures we always protect the original/current connections and ban new excess ones.

+ 77 - 0
web/job/check_client_ip_job_test.go

@@ -0,0 +1,77 @@
+package job
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestMergeClientIps_EvictsStaleOldEntries(t *testing.T) {
+	// #4077: after a ban expires, a single IP that reconnects used to get
+	// banned again immediately because a long-disconnected IP stayed in the
+	// DB with an ancient timestamp and kept "protecting" itself against
+	// eviction. Guard against that regression here.
+	old := []IPWithTimestamp{
+		{IP: "1.1.1.1", Timestamp: 100},  // stale — client disconnected long ago
+		{IP: "2.2.2.2", Timestamp: 1900}, // fresh — still connecting
+	}
+	new := []IPWithTimestamp{
+		{IP: "2.2.2.2", Timestamp: 2000}, // same IP, newer log line
+	}
+
+	got := mergeClientIps(old, new, 1000)
+
+	want := map[string]int64{"2.2.2.2": 2000}
+	if !reflect.DeepEqual(got, want) {
+		t.Fatalf("stale 1.1.1.1 should have been dropped\ngot:  %v\nwant: %v", got, want)
+	}
+}
+
+func TestMergeClientIps_KeepsFreshOldEntriesUnchanged(t *testing.T) {
+	// Backwards-compat: entries that aren't stale are still carried forward,
+	// so enforcement survives access-log rotation.
+	old := []IPWithTimestamp{
+		{IP: "1.1.1.1", Timestamp: 1500},
+	}
+	got := mergeClientIps(old, nil, 1000)
+
+	want := map[string]int64{"1.1.1.1": 1500}
+	if !reflect.DeepEqual(got, want) {
+		t.Fatalf("fresh old IP should have been retained\ngot:  %v\nwant: %v", got, want)
+	}
+}
+
+func TestMergeClientIps_PrefersLaterTimestampForSameIp(t *testing.T) {
+	old := []IPWithTimestamp{{IP: "1.1.1.1", Timestamp: 1500}}
+	new := []IPWithTimestamp{{IP: "1.1.1.1", Timestamp: 1700}}
+
+	got := mergeClientIps(old, new, 1000)
+
+	if got["1.1.1.1"] != 1700 {
+		t.Fatalf("expected latest timestamp 1700, got %d", got["1.1.1.1"])
+	}
+}
+
+func TestMergeClientIps_DropsStaleNewEntries(t *testing.T) {
+	// A log line with a clock-skewed old timestamp must not resurrect a
+	// stale IP past the cutoff.
+	new := []IPWithTimestamp{{IP: "1.1.1.1", Timestamp: 500}}
+	got := mergeClientIps(nil, new, 1000)
+
+	if len(got) != 0 {
+		t.Fatalf("stale new IP should have been dropped, got %v", got)
+	}
+}
+
+func TestMergeClientIps_NoStaleCutoffStillWorks(t *testing.T) {
+	// Defensive: a zero cutoff (e.g. during very first run on a fresh
+	// install) must not over-evict.
+	old := []IPWithTimestamp{{IP: "1.1.1.1", Timestamp: 100}}
+	new := []IPWithTimestamp{{IP: "2.2.2.2", Timestamp: 200}}
+
+	got := mergeClientIps(old, new, 0)
+
+	want := map[string]int64{"1.1.1.1": 100, "2.2.2.2": 200}
+	if !reflect.DeepEqual(got, want) {
+		t.Fatalf("zero cutoff should keep everything\ngot:  %v\nwant: %v", got, want)
+	}
+}