-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathvalidator.go
More file actions
241 lines (206 loc) · 6.43 KB
/
validator.go
File metadata and controls
241 lines (206 loc) · 6.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
package knownbots
import (
"context"
"fmt"
"net/http"
"net/netip"
"os"
"path/filepath"
"sync/atomic"
"time"
"github.com/cnlangzi/knownbots/asn"
)
// Default settings
const (
FailLRULimit = 1000
SchedulerInterval = 24 * time.Hour
)
// ResultStatus represents the verification result status.
type ResultStatus int
const (
StatusVerified ResultStatus = 1 // IP verified successfully
StatusPending ResultStatus = 2 // RDNS network error, can retry
StatusFailed ResultStatus = 3 // IP not matched, suspected fake bot
StatusUnknown ResultStatus = 0 // Not a bot (normal browser)
)
// Result represents the verification result.
type Result struct {
BotName string `json:"bot_name"`
BotKind BotKind `json:"bot_kind"`
IsBot bool `json:"is_bot"`
Status ResultStatus `json:"status"`
}
// Validator is the core bot verification engine.
type Validator struct {
root string
bots atomic.Pointer[[]*Bot] // []*Bot, atomic for lock-free reads
uaIndex atomic.Pointer[map[byte][]*Bot] // map[byte][]*Bot, byte-level index for UA lookup
asnStore *asn.Store
cancel context.CancelFunc
failLimit int
classifyUA bool
}
// getBots returns the current bots slice atomically.
func (v *Validator) getBots() []*Bot {
return *v.bots.Load()
}
// setBots stores the bots slice atomically and builds the UA index.
func (v *Validator) setBots(bots []*Bot) {
uaIndex := buildUAIndex(bots)
v.bots.Store(&bots)
v.uaIndex.Store(&uaIndex)
}
// New creates a new Validator instance with background scheduler.
func New(opts ...Option) (*Validator, error) {
cfg := Config{
Root: "./bots",
FailLimit: FailLRULimit,
ClassifyUA: false, // Default: skip classifyUA for performance
}
for _, opt := range opts {
opt(&cfg)
}
bots, err := Load(cfg.Root)
if err != nil {
return nil, err
}
// Initialize ASN store first
asnStore := asn.NewStore(cfg.Root)
for _, bot := range bots {
botDir := filepath.Join(cfg.Root, bot.Name)
if err := os.MkdirAll(botDir, 0755); err != nil {
return nil, fmt.Errorf("failed to create cache directory: %w", err)
}
bot.initIPs(filepath.Join(botDir, "ips.txt"))
bot.initASN(asnStore)
bot.initRDNS(filepath.Join(botDir, "rdns.txt"))
bot.fail = NewLRU(cfg.FailLimit)
}
ctx, cancel := context.WithCancel(context.Background())
v := &Validator{
root: cfg.Root,
asnStore: asnStore,
cancel: cancel,
failLimit: cfg.FailLimit,
classifyUA: cfg.ClassifyUA,
}
v.setBots(bots)
go v.startScheduler(ctx)
return v, nil
}
// startScheduler runs background tasks:
// - refreshIPs: download and update IP ranges from official URLs
// - pruneCaches: verify and clean up cached RDNS entries
// - persistCaches: write valid cache entries to persistent storage
func (v *Validator) startScheduler(ctx context.Context) {
httpClient := &http.Client{Timeout: 30 * time.Second}
ticker := time.NewTicker(SchedulerInterval)
defer ticker.Stop()
// Run immediately on start
v.runScheduler(httpClient)
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
v.runScheduler(httpClient)
}
}
}
func (v *Validator) runScheduler(httpClient *http.Client) {
bots := v.getBots()
for _, bot := range bots {
// Update IP ranges
bot.refreshIPs(httpClient, v.root)
// Update ASN data with ASN configured
bot.refreshASN(v.asnStore)
// Prune and persist RDNS caches
bot.refreshRDNS()
}
}
// Validate verifies if the given UserAgent and IP belong to a known bot.
// By default (classifyUA disabled), unknown UAs return IsBot=false for performance.
// When WithClassifyUA() enabled:
// - IsBot: true if UA matches a known bot or is suspicious, false if it's a legitimate browser
// - IsVerified: true if the IP is verified for the bot
// - Status: verified (bot confirmed), failed (bot suspected, IP not verified), or unknown
func (v *Validator) Validate(ua, ip string) Result {
// Step 1: Check if UA matches any known bot (claims to be a known bot)
if bot := v.findBotByUA(ua); bot != nil {
result := v.verifyIP(bot, ip)
result.IsBot = true
return result
}
// Step 2: Classify UA type (single pass)
if v.classifyUA {
switch classifyUA(ua) {
case Browser:
// Valid browser structure → not a bot
return Result{Status: StatusUnknown, BotKind: KindUnknown, IsBot: false}
case Suspicious:
// Claims to be browser but malformed → suspicious bot
return Result{Status: StatusUnknown, BotKind: KindUnknown, IsBot: true}
default:
// Unknown (not browser-like)
return Result{Status: StatusUnknown, BotKind: KindUnknown, IsBot: true}
}
}
// classifyUA disabled (default): unknown UA, assume not a bot
return Result{Status: StatusUnknown, BotKind: KindUnknown, IsBot: false}
}
// verifyIP verifies if the IP belongs to the given bot.
// Verification order: IP ranges → ASN → RDNS (fastest to slowest)
func (v *Validator) verifyIP(bot *Bot, ipStr string) Result {
// Check IP ranges first (fastest, ~200ns)
if bot.ContainsIP(ipStr) {
return Result{BotName: bot.Name, BotKind: bot.Kind, Status: StatusVerified, IsBot: true}
}
// ASN verification (fast after cache load, ~100ns)
if bot.asns != nil {
cache := bot.asns.Load()
if cache != nil {
ip, err := netip.ParseAddr(ipStr)
if err == nil && cache.Contains(ip) {
return Result{BotName: bot.Name, BotKind: bot.Kind, Status: StatusVerified, IsBot: true}
}
}
}
// RDNS verification (50-200ms cold, ~450ns cached)
if bot.RDNS && bot.rdns != nil {
switch bot.VerifyRDNS(ipStr) {
case StatusVerified:
return Result{BotName: bot.Name, BotKind: bot.Kind, Status: StatusVerified, IsBot: true}
case StatusPending:
return Result{BotName: bot.Name, BotKind: bot.Kind, Status: StatusPending, IsBot: true}
default:
return Result{BotName: bot.Name, BotKind: bot.Kind, Status: StatusFailed, IsBot: true}
}
}
// No match found
return Result{BotName: bot.Name, BotKind: bot.Kind, Status: StatusFailed, IsBot: true}
}
// findBotByUA finds a bot by matching the UserAgent marker.
// Uses byte-level index for fast lookup, then validates with word boundary matching.
func (v *Validator) findBotByUA(ua string) *Bot {
if len(ua) == 0 {
return nil
}
index := v.uaIndex.Load()
for i := 0; i < len(ua); i++ {
candidates := (*index)[ua[i]]
if len(candidates) == 0 {
continue
}
for _, bot := range candidates {
if containsWord(ua, bot.UA) {
return bot
}
}
}
return nil
}
// Close stops the scheduler.
func (v *Validator) Close() error {
v.cancel()
return nil
}