diff --git a/HISTORY.md b/HISTORY.md index cc4b6f9..66987bc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ - enhance the `dump` command with a worker pool for concurrency and thread-safe data writing mechanics. - include comprehensive unit test coverage for the validation and concurrency logic in fields_test.go and dump_test.go. - update README.md and README_ZH.md to natively reference the official FOFA API documentation URLs for valid fields. +- optimize `search` command by adding `-bs` parameter to customize batch size for returned data. ## v0.2.28 fix dedup mode diff --git a/README.md b/README.md index abbbfce..11cc5a8 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,7 @@ categories: | urlPrefix | | http:// | URL prefix | | full | | false | Retrieves full data | | uniqByIP | | false | Removes duplicates based on IP | +| batchSize | bs | 1000 | Pagination size per fetch *3 | | workers | | 10 | Number of threads | | rate | | 2 | Query rate per second | | template | | ip={} | Replaces `{}` with content from pipeline input | @@ -158,12 +159,13 @@ categories: *1: When the query contains `cert` and `banner`, the maximum results size setting is 2000 per page. *2: When the query contains `body`, the maximum results size setting is 500 per page. +*3: When the `body` field is included, the default `batchSize` is automatically capped at 500. If the `-bs` parameter is explicitly set, the set value will be used instead. ### `dump` | Parameter | Abbreviation | Default Value | Description | |-------------|--------------|---------------|-----------------------------------------------------------| -| fields | f | ip,port | Fields returned by FOFA, valid fields refer to https://en.fofa.info/api, for dump command refer to https://en.fofa.info/api/batches_pages | +| fields | f | ip,port | Fields returned by FOFA, valid fields refer to https://en.fofa.info/api/batches_pages | | format | | csv | Output format: csv/json/xml | | outFile | o | | Output file. 
If not set, prints to terminal | | inFile | i | | Input file. If not set, reads from pipeline input | @@ -171,7 +173,7 @@ categories: | fixUrl | | false | Combines URLs (e.g., 1.1.1.1,80 becomes http://1.1.1.1) | | urlPrefix | | http:// | URL prefix | | full | | false | Retrieves full data | -| batchSize | bs | 1000 | Number of records to fetch per batch | +| batchSize | bs | 1000 | Number of records to fetch per batch *3 | | batchType | bt | | Batch query type: ip/domain | | workers | | 10 | Number of threads, defaults to 10 when using -i | | rate | | 2 | Query rate per second | @@ -179,6 +181,7 @@ categories: *1: When the query contains `cert` and `banner`, the maximum results size setting is 2000 per page. *2: When the query contains `body`, the maximum results size setting is 500 per page. +*3: When the `body` field is included, the default `batchSize` is automatically capped at 500. If the `-bs` parameter is explicitly set, the set value will be used instead. ### `jsRender` diff --git a/README_ZH.md b/README_ZH.md index 4698a71..8ef992a 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -148,6 +148,7 @@ categories: | urlPrefix | | http:// | url前缀 | | full | | false | 是否调取全量数据 | | uniqByIP | | false | 是否根据ip去重 | +| batchSize | bs | 1000 | 每次拉取的分页大小 *3 | | workers | | 10 | 线程数量 | | rate | | 2 | 每秒查询次数 | | template | | ip={} | 从管道获取输入,输入的内容会替换{} | @@ -161,12 +162,13 @@ categories: *1:当获取字段包含 `cert` 和 `banner` 时,单次查询 size 最大支持 2000。 *2:当获取字段包含 `body` 时,单次查询 size 最大支持 500。 +*3:当获取字段包含 `body` 时,默认的 `batchSize` 会自动限制为 500。如果手动设置了 `-bs` 参数,则以设置的值为准。 ### dump | 参数 | 参数简写 | 默认值 | 简介 | | --------- | -------- | ------- | ----------------------------------------------------- | -| fields | f | ip,port | FOFA返回的字段选择,有效字段参考https://fofa.info/api,dump的参考https://fofa.info/api/batches_pages | +| fields | f | ip,port | FOFA返回的字段选择,有效字段参考https://fofa.info/api/batches_pages | | format | | csv | 输出格式,可以为csv/json/xml | | outFile | o | | 输出文件,如果不设置则终端打印 | | inFile | i | | 输入文件,如果不设置则读取管道输入 | @@ -174,7 +176,7 @@ categories: | fixUrl | | false | 
是否组合url,例如1.1.1.1,80组合为http://1.1.1.1 | | urlPrefix | | http:// | url前缀 | | full | | false | 是否调取全量数据 | -| batchSize | bs | 1000 | 每次拉取多少条数据 | +| batchSize | bs | 1000 | 每次拉取多少条数据 *3 | | batchType | bt | | 批量查询,可以为ip/domain | | workers | | 10 | 线程数量,当使用-i时默认10 | | rate | | 2 | 每秒查询次数 | @@ -182,6 +184,7 @@ categories: *1:当获取字段包含 `cert` 和 `banner` 时,单次查询 size 最大支持 2000。 *2:当获取字段包含 `body` 时,单次查询 size 最大支持 500。 +*3:当获取字段包含 `body` 时,默认的 `batchSize` 会自动限制为 500。如果手动设置了 `-bs` 参数,则以设置的值为准。 ### jsRender diff --git a/USER_GUIDE.md b/USER_GUIDE.md index 3cbed73..44ebb46 100644 --- a/USER_GUIDE.md +++ b/USER_GUIDE.md @@ -635,12 +635,13 @@ $ fofa --version | fields | f | ip,port | FOFA fields to retrieve. [Learn More](https://en.fofa.info/vip) | | format | | csv | Output format: csv/json/xml | | outFile | o | | Output file. If not set, prints to terminal | -| size | s | 100 | Query size. Maximum is 10,000, limited by `deductMode` | +| size | s | 100 | Query size. Maximum is 10,000, limited by `deductMode` | | deductMode | | | Determines consumption of f-points. Uses free quota by default | | fixUrl | | false | Concatenates URLs (e.g., `1.1.1.1,80` → `http://1.1.1.1`) | | urlPrefix | | http:// | URL prefix | | full | | false | Retrieves full data | | uniqByIP | | false | Removes duplicates by IP | +| batchSize | bs | 1000 | Pagination size per fetch *1 | | workers | | 10 | Number of threads | | rate | | 2 | Queries per second | | template | | ip={} | Replaces `{}` with content from pipeline input | @@ -652,6 +653,8 @@ $ fofa --version | headline | | false | Outputs CSV headers (only applicable for CSV format) | | customFields | cf | | use custom fields | | help | h | false | Displays usage instructions | + +*1: When the `body` field is included, the default `batchSize` is automatically capped at 500. If the `-bs` parameter is explicitly set, the set value will be used instead. 
### Dump @@ -665,11 +668,13 @@ $ fofa --version | fixUrl | | false | Concatenates URLs (e.g., `1.1.1.1,80` → `http://1.1.1.1`) | | urlPrefix | | http:// | URL prefix | | full | | false | Retrieves full data | -| batchSize | bs | 1000 | Number of records fetched per batch | +| batchSize | bs | 1000 | Number of records fetched per batch *1 | | batchType | bt | | Batch query type: ip/domain | | customFields | cf | | use custom fields | | help | h | false | Displays usage instructions | +*1: When the `body` field is included, the default `batchSize` is automatically capped at 500. If the `-bs` parameter is explicitly set, the set value will be used instead. + ### jsRender | Parameter | Abbreviation | Default Value | Description | diff --git a/USER_GUIDE_ZH.md b/USER_GUIDE_ZH.md index d833da9..167f738 100644 --- a/USER_GUIDE_ZH.md +++ b/USER_GUIDE_ZH.md @@ -637,6 +637,7 @@ $ fofa --version | urlPrefix | | http:// | url前缀 | | full | | false | 是否调取全量数据 | | uniqByIP | | false | 是否根据ip去重 | +| batchSize | bs | 1000 | 每次拉取的分页大小 *1 | | workers | | 10 | 线程数量 | | rate | | 2 | 每秒查询次数 | | template | | ip={} | 从管道获取输入,输入的内容会替换{} | @@ -649,6 +650,8 @@ $ fofa --version | customFields | cf | | 使用自定义fields字段 | | help | h | false | 使用方法 | +*1:当获取字段包含 `body` 时,默认的 `batchSize` 会自动限制为 500。如果手动设置了 `-bs` 参数,则以设置的值为准。 + ### dump | 参数 | 参数简写 | 默认值 | 简介 | @@ -661,11 +664,13 @@ $ fofa --version | fixUrl | | false | 是否组合url,例如1.1.1.1,80组合为http://1.1.1.1 | | urlPrefix | | http:// | url前缀 | | full | | false | 是否调取全量数据 | -| batchSize | bs | 1000 | 每次拉取多少条数据 | +| batchSize | bs | 1000 | 每次拉取多少条数据 *1 | | batchType | bt | | 批量查询,可以为ip/domain | | customFields | cf | | 使用自定义fields字段 | | help | h | false | 使用方法 | +*1:当获取字段包含 `body` 时,默认的 `batchSize` 会自动限制为 500。如果手动设置了 `-bs` 参数,则以设置的值为准。 + ### jsRender | 参数 | 参数简写 | 默认值 | 简介 | diff --git a/client_test.go b/client_test.go index 8db54da..9318d84 100644 --- a/client_test.go +++ b/client_test.go @@ -149,6 +149,26 @@ var ( 
w.Write([]byte(`{"error":false,"size":470293950,"page":1,"mode":"extended","query":"port=\"80\"","results":[["1.1.1.1:81","1.1.1.1","81"],["1.1.1.1:82","1.1.1.1","82"],["1.1.1.1:83","1.1.1.1","83"],["1.1.1.1:84","1.1.1.1","84"],["1.1.1.1:85","1.1.1.1","85"],["1.1.1.1:86","1.1.1.1","86"],["1.1.1.1:87","1.1.1.1","87"],["1.1.1.1:88","1.1.1.1","88"],["1.1.1.1:89","1.1.1.1","89"],["1.1.1.1:90","1.1.1.1","90"]]}`)) } + return + case "ip,port,body": + // Test for body field with batchSize auto-cap + switch r.FormValue("size") { + case "10": + w.Write([]byte(`{"error":false,"size":470293950,"page":1,"mode":"extended","query":"port=\"80\"","results":[["94.130.128.248","80","test body 1"],["186.6.19.151","80","test body 2"],["72.247.70.195","80","test body 3"],["18.66.199.67","80","test body 4"],["91.122.52.148","80","test body 5"],["113.23.57.252","80","test body 6"],["54.144.154.222","80","test body 7"],["188.223.2.247","80","test body 8"],["50.213.108.254","80","test body 9"],["34.237.16.144","80","test body 10"]]}`)) + case "100": + w.Write([]byte(`{"error":false,"size":470293950,"page":1,"mode":"extended","query":"port=\"80\"","results":[["94.130.128.248","80","body"],["186.6.19.151","80","body"],["72.247.70.195","80","body"],["18.66.199.67","80","body"],["91.122.52.148","80","body"],["113.23.57.252","80","body"],["54.144.154.222","80","body"],["188.223.2.247","80","body"],["50.213.108.254","80","body"],["34.237.16.144","80","body"]]}`)) + default: + w.Write([]byte(`{"error":false,"size":470293950,"page":1,"mode":"extended","query":"port=\"80\"","results":[["94.130.128.248","80","body"]]}`)) + } + return + case "ip,port,host": + // Test for non-body fields + switch r.FormValue("size") { + case "10": + 
w.Write([]byte(`{"error":false,"size":470293950,"page":1,"mode":"extended","query":"port=\"80\"","results":[["94.130.128.248","80","94.130.128.248:80"],["186.6.19.151","80","186.6.19.151:80"],["72.247.70.195","80","72.247.70.195:80"],["18.66.199.67","80","18.66.199.67:80"],["91.122.52.148","80","91.122.52.148:80"],["113.23.57.252","80","113.23.57.252:80"],["54.144.154.222","80","54.144.154.222:80"],["188.223.2.247","80","188.223.2.247:80"],["50.213.108.254","80","50.213.108.254:80"],["34.237.16.144","80","34.237.16.144:80"]]}`)) + default: + w.Write([]byte(`{"error":false,"size":470293950,"page":1,"mode":"extended","query":"port=\"80\"","results":[["94.130.128.248","80","94.130.128.248:80"]]}`)) + } return } case "port=5354": @@ -255,6 +275,8 @@ var ( case "host,ip,port,protocol": data = append([]string{fmt.Sprintf("http://%d.%d.%d.%d", i, i, i, i)}, data...) data = append(data, "http") + case "ip,port,body": + data = append(data, fmt.Sprintf("body content %d", i+j)) } results = append(results, data) diff --git a/cmd/fofa/cmd/dump.go b/cmd/fofa/cmd/dump.go index 9371ce5..a440e0a 100644 --- a/cmd/fofa/cmd/dump.go +++ b/cmd/fofa/cmd/dump.go @@ -92,8 +92,8 @@ var dumpCmd = &cli.Command{ &cli.IntFlag{ Name: "batchSize", Aliases: []string{"bs"}, - Value: 1000, - Usage: "the amount of data contained in each batch", + Value: 0, + Usage: "the amount of data contained in each batch, default 1000", Destination: &batchSize, }, &cli.StringFlag{ diff --git a/cmd/fofa/cmd/search.go b/cmd/fofa/cmd/search.go index ed0fce8..acc651c 100644 --- a/cmd/fofa/cmd/search.go +++ b/cmd/fofa/cmd/search.go @@ -98,6 +98,13 @@ var searchCmd = &cli.Command{ Usage: "search result for over a year", Destination: &full, }, + &cli.IntFlag{ + Name: "batchSize", + Aliases: []string{"bs"}, + Value: 0, + Usage: "amount of data contained in each page batch, default 1000", + Destination: &batchSize, + }, &cli.BoolFlag{ Name: "uniqByIP", Value: false, @@ -351,6 +358,7 @@ func SearchAction(ctx *cli.Context) 
error { DeWildcard: deWildcard, Filter: filter, DedupHost: dedupHost, + BatchSize: batchSize, }) if err != nil { return err diff --git a/host.go b/host.go index e7b4fff..1597302 100644 --- a/host.go +++ b/host.go @@ -5,11 +5,13 @@ import ( "encoding/base64" "errors" "fmt" - "github.com/Knetic/govaluate" - "github.com/expr-lang/expr" + "log" "math" "strconv" "strings" + + "github.com/Knetic/govaluate" + "github.com/expr-lang/expr" ) const ( @@ -55,6 +57,7 @@ type SearchOptions struct { DeWildcard int // number of wildcard domains retained Filter string // filter data by rules DedupHost bool // prioritize subdomain data retention + BatchSize int // custom batch size } // fixHostToUrl 替换host为url @@ -226,11 +229,34 @@ func (c *Client) HostSearch(query string, size int, fields []string, options ... } page := 1 - perPage := int(math.Min(float64(size), 1000)) // 最多一次取1000 - // 一次取所有数据,perPage 默认给 1000 + maxPerPage := 1000 + userSetBatchSize := false + if len(options) > 0 && options[0].BatchSize > 0 { + maxPerPage = options[0].BatchSize + userSetBatchSize = true + if maxPerPage > 10000 { + maxPerPage = 10000 // /search/all api limit + } + } + + for _, f := range fields { + if f == "body" { + if maxPerPage > 500 && !userSetBatchSize { + maxPerPage = 500 + if c.logger != nil { + c.logger.Warnf("fields contains body, change batchSize to %d", maxPerPage) + } + } + break + } + } + + perPage := int(math.Min(float64(size), float64(maxPerPage))) // 最多一次取 maxPerPage + + // 一次取所有数据,perPage 默认给 maxPerPage if size == -1 { - perPage = 1000 + perPage = maxPerPage } hostIndex, protocolIndex, fields, rawFieldSize, err := c.fixUrlCheck(fields, options...) @@ -397,6 +423,7 @@ func (c *Client) HostSearch(query string, size int, fields []string, options ... } res = append(res, results...) 
+ log.Printf("size: %d for query: %s", len(res), query) // 数据填满了,完成 if size != -1 && size <= len(res) { @@ -489,10 +516,28 @@ func (c *Client) DumpSearch(query string, allSize int, batchSize int, fields []s next := "" perPage := batchSize + userSetBatchSize := true + if perPage == 0 { + perPage = 1000 + userSetBatchSize = false + } + if perPage < 1 || perPage > 100000 { return errors.New("batchSize must between 1 and 100000") } + for _, f := range fields { + if f == "body" { + if perPage > 500 && !userSetBatchSize { + perPage = 500 + if c.logger != nil { + c.logger.Warnf("fields contains body, change batchSize to %d", perPage) + } + } + break + } + } + // 确保urlfix开启后带上了protocol字段 hostIndex, protocolIndex, fields, rawFieldSize, err := c.fixUrlCheck(fields, options...) if err != nil { diff --git a/host_test.go b/host_test.go index 1031442..1fbd8ee 100644 --- a/host_test.go +++ b/host_test.go @@ -2,11 +2,13 @@ package gofofa import ( "context" - "github.com/sirupsen/logrus" - "github.com/stretchr/testify/assert" "net/http" "net/http/httptest" + "strings" "testing" + + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" ) func TestClient_HostSearch(t *testing.T) { @@ -421,3 +423,100 @@ func TestClient_DumpSearch(t *testing.T) { }, SearchOptions{FixUrl: true}) assert.NotNil(t, err) } + +func TestClient_BatchSizeWithBodyField(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(queryHander)) + defer ts.Close() + + account := validAccounts[1] + cli, err := NewClient(WithURL(ts.URL + "?email=" + account.Email + "&key=" + account.Key)) + assert.Nil(t, err) + + // Test 1: HostSearch with body field - should auto-cap batchSize at 500 + // The logger should warn about the batchSize change + res, err := cli.HostSearch("port=80", 10, []string{"ip", "port", "body"}) + assert.Nil(t, err) + assert.Equal(t, 10, len(res)) + + // Test 2: HostSearch with body field and explicit batchSize - should use explicit value + res, err = cli.HostSearch("port=80", 10, 
[]string{"ip", "port", "body"}, SearchOptions{BatchSize: 600}) + assert.Nil(t, err) + assert.Equal(t, 10, len(res)) + + // Test 3: HostSearch without body field - should use default batchSize 1000 + res, err = cli.HostSearch("port=80", 10, []string{"ip", "port", "host"}) + assert.Nil(t, err) + assert.Equal(t, 10, len(res)) + + // Test 4: HostSearch with body field and small explicit batchSize + res, err = cli.HostSearch("port=80", 10, []string{"ip", "port", "body"}, SearchOptions{BatchSize: 300}) + assert.Nil(t, err) + assert.Equal(t, 10, len(res)) + + // Test 5: DumpSearch with body field - should auto-cap at 500 + dumpRes := make([][]string, 0) + err = cli.DumpSearch("port=80", 100, 1000, []string{"ip", "port", "body"}, func(i [][]string, i2 int) error { + dumpRes = append(dumpRes, i...) + return nil + }) + assert.Nil(t, err) + + // Test 6: DumpSearch with body field and explicit batchSize + dumpRes = make([][]string, 0) + err = cli.DumpSearch("port=80", 100, 800, []string{"ip", "port", "body"}, func(i [][]string, i2 int) error { + dumpRes = append(dumpRes, i...) 
+ return nil + }) + assert.Nil(t, err) +} + +func TestClient_BatchSizeAutomaticCapping(t *testing.T) { + ts := httptest.NewServer(http.HandlerFunc(queryHander)) + defer ts.Close() + + // 使用高级会员账号,避免 100 条的免费限制 + account := validAccounts[3] + + // 设置日志钩子来捕获警告 + var logs []string + logger := logrus.New() + logger.AddHook(&testHook{f: func(e *logrus.Entry) { + logs = append(logs, e.Message) + }}) + + cli, err := NewClient(WithURL(ts.URL+"?email="+account.Email+"&key="+account.Key), WithLogger(logger)) + assert.Nil(t, err) + + // 测试 1: 包含 body 字段,且不指定 BatchSize (即为 0) + // 预期行为:内部逻辑应该触发警告并设置 batchSize 为 500 + // 这里 size 设置为 10,因为 mock server 对 size=10 有完整的 10 条数据返回 + res, err := cli.HostSearch("port=80", 10, []string{"ip", "port", "body"}, SearchOptions{BatchSize: 0}) + assert.Nil(t, err) + assert.Equal(t, 10, len(res)) + + // 验证是否触发了自动调优 batchSize 的警告 + foundWarning := false + for _, l := range logs { + if strings.Contains(l, "fields contains body, change batchSize to 500") { + foundWarning = true + break + } + } + assert.True(t, foundWarning, "Should have found auto-capping warning in logs") + + // 测试 2: 不包含 body 字段,不指定 BatchSize + // 预期行为:不应该触发 body 相关的自动降级警告 + logs = nil + res, err = cli.HostSearch("port=80", 10, []string{"ip", "port"}, SearchOptions{BatchSize: 0}) + assert.Nil(t, err) + assert.Equal(t, 10, len(res)) + + foundWarning = false + for _, l := range logs { + if strings.Contains(l, "fields contains body, change batchSize to 500") { + foundWarning = true + break + } + } + assert.False(t, foundWarning, "Should NOT have found auto-capping warning for non-body fields") +}