Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7c94f19
Improve error handling in htmlToText function
GDATTACKER-RESEARCHER Nov 17, 2025
d9e8bac
chore(deps): bump golang.org/x/text from 0.30.0 to 0.31.0
dependabot[bot] Nov 17, 2025
b4545c2
chore(deps): bump golang.org/x/net from 0.46.0 to 0.47.0
dependabot[bot] Nov 17, 2025
6454c83
chore(deps): bump github.com/PuerkitoBio/goquery from 1.10.3 to 1.11.0
dependabot[bot] Nov 17, 2025
4ed7148
Merge branch 'dev' into dev
GDATTACKER-RESEARCHER Nov 17, 2025
329c781
Merge branch 'dev' into dependabot/go_modules/dev/golang.org/x/text-0…
Mzack9999 Nov 18, 2025
0336bec
chore(deps): bump the modules group with 5 updates
dependabot[bot] Nov 17, 2025
45d8476
Merge pull request #2312 from projectdiscovery/dependabot/go_modules/…
Mzack9999 Nov 18, 2025
05c6364
better error handling
Mzack9999 Nov 18, 2025
c6948ed
Merge pull request #2309 from GDATTACKER-RESEARCHER/dev
Mzack9999 Nov 18, 2025
0658afd
chore(deps): bump golang.org/x/crypto from 0.44.0 to 0.45.0
dependabot[bot] Nov 20, 2025
0309110
adding panic guard + tests
Mzack9999 Nov 20, 2025
5f13beb
lint
Mzack9999 Nov 20, 2025
9388133
chore(deps): bump github.com/weppos/publicsuffix-go
dependabot[bot] Nov 24, 2025
340eb79
chore(deps): bump the modules group with 11 updates
dependabot[bot] Nov 24, 2025
275d63f
fix test
Mzack9999 Nov 28, 2025
61a791c
Merge pull request #2317 from projectdiscovery/bufgix-2316-stack-over…
Mzack9999 Nov 28, 2025
a0fec14
chore(deps): bump github.com/JohannesKaufmann/html-to-markdown/v2
dependabot[bot] Dec 1, 2025
8b04cd9
chore(deps): bump the modules group with 10 updates
dependabot[bot] Dec 1, 2025
d114722
feat: update `-ldp` option to show default ports in CLI output (#2331)
ehsandeep Dec 6, 2025
599441e
fix: HTML parser panic protection with multiple fallback (#2330)
ehsandeep Dec 6, 2025
a00f9c4
fix: host JSON field now returns hostname instead of IP (#2333)
ehsandeep Dec 6, 2025
f7712f7
version update
ehsandeep Dec 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/httpx/httpx.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package httpx

import (
"context"
"crypto/tls"
"fmt"
"io"
Expand All @@ -25,7 +26,6 @@ import (
pdhttputil "github.com/projectdiscovery/utils/http"
stringsutil "github.com/projectdiscovery/utils/strings"
urlutil "github.com/projectdiscovery/utils/url"
"golang.org/x/net/context"
"golang.org/x/net/http2"
)

Expand Down
135 changes: 126 additions & 9 deletions common/pagetypeclassifier/pagetypeclassifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@ package pagetypeclassifier

import (
_ "embed"
"fmt"
"strings"
"sync"

htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
"github.com/microcosm-cc/bluemonday"
"github.com/projectdiscovery/utils/ml/naive_bayes"
)

Expand All @@ -14,26 +18,139 @@ type PageTypeClassifier struct {
classifier *naive_bayes.NaiveBayesClassifier
}

func New() *PageTypeClassifier {
func New() (*PageTypeClassifier, error) {
classifier, err := naive_bayes.NewClassifierFromFileData(classifierData)
if err != nil {
panic(err)
return nil, err
}
return &PageTypeClassifier{classifier: classifier}
return &PageTypeClassifier{classifier: classifier}, nil
}

func (n *PageTypeClassifier) Classify(html string) string {
text := htmlToText(html)
if text == "" {
text, err := htmlToText(html)
if err != nil || text == "" {
return "other"
}
return n.classifier.Classify(text)
}

func htmlToText(html string) string {
text, err := htmltomarkdown.ConvertString(html)
var (
// sanitizerPolicy is an aggressive bluemonday policy that strips most HTML
// to reduce nesting depth and prevent parser stack overflow
sanitizerPolicy *bluemonday.Policy
sanitizerPolicyOnce sync.Once
)

// getSanitizerPolicy returns an ultra-aggressive HTML sanitizer policy that strips
// almost all elements to minimize nesting depth and prevent parser stack overflow.
func getSanitizerPolicy() *bluemonday.Policy {
sanitizerPolicyOnce.Do(func() {
p := bluemonday.NewPolicy()
// Ultra-aggressive policy: Allow only the most basic text elements
// to minimize nesting and reduce parser stack depth
p.AllowElements("p", "br", "h1", "h2", "h3", "h4", "h5", "h6")
p.AllowElements("strong", "em", "b", "i")
// Remove div, span, ul, ol, li as they can create deep nesting
// No attributes allowed to prevent style-based nesting issues
sanitizerPolicy = p
})
return sanitizerPolicy
}

// htmlToText safely converts HTML to text with layered fallback strategies.
// The 512 node depth limit in golang.org/x/net/html is hardcoded and cannot
// be raised, so the input is tamed before it ever reaches the parser:
//
//  1. cap the input size so huge documents stay cheap to process
//  2. aggressively sanitize with bluemonday to flatten nesting
//  3. convert the sanitized HTML to markdown, guarded by panic recovery
//  4. on any failure, degrade to a crude plain-text extraction
func htmlToText(html string) (text string, err error) {
	// The underlying HTML parser can panic on pathological input; translate
	// any panic into an error so callers always get a (text, err) pair.
	defer func() {
		if r := recover(); r != nil {
			text, err = "", fmt.Errorf("html parser panic: %v", r)
		}
	}()

	// Truncate oversized documents before doing any parsing work.
	const maxHTMLSize = 1 << 20 // 1MB limit
	if len(html) > maxHTMLSize {
		html = html[:maxHTMLSize]
	}

	// Sanitize first with the ultra-aggressive policy to reduce nesting depth.
	cleaned := getSanitizerPolicy().Sanitize(html)
	if cleaned == "" {
		// Sanitizer produced nothing usable: fall back on the raw input.
		return extractPlainText(html), nil
	}

	// Convert the sanitized HTML to markdown; on failure or an empty result,
	// degrade gracefully to naive text extraction of the sanitized form.
	markdown, convErr := htmltomarkdown.ConvertString(cleaned)
	if convErr != nil || markdown == "" {
		return extractPlainText(cleaned), nil
	}
	return markdown, nil
}

// extractPlainText is a last-resort, parser-free text extractor used when
// proper HTML parsing fails due to complexity or nesting depth. It removes
// <script>/<style> blocks (content included), replaces remaining tags with
// spaces, and collapses runs of whitespace into single spaces.
// Matching is byte-oriented and case-sensitive — intentionally crude.
//
// BUG FIX: the previous version returned the intermediate `text` value
// before tag stripping completed, leaving raw tags in the output and making
// the whitespace-cleanup code unreachable; it now returns the cleaned result.
func extractPlainText(html string) string {
	text := html

	// Remove script tags and their content.
	text = stripTagBlock(text, "<script", "</script>")
	// Remove style tags and their content.
	text = stripTagBlock(text, "<style", "</style>")

	// Replace every remaining tag with a space so adjacent words don't fuse.
	// A strings.Builder avoids the quadratic cost of string concatenation.
	var b strings.Builder
	b.Grow(len(text))
	inTag := false
	for _, char := range text {
		switch {
		case char == '<':
			inTag = true
		case char == '>':
			inTag = false
			b.WriteByte(' ')
		case !inTag:
			b.WriteRune(char)
		}
	}

	// Collapse multiple spaces/newlines into single spaces.
	return strings.Join(strings.Fields(b.String()), " ")
}

// stripTagBlock removes every occurrence of openTag...closeTag (tags
// included) from s. An unterminated block is truncated at its opening tag.
func stripTagBlock(s, openTag, closeTag string) string {
	for {
		start := strings.Index(s, openTag)
		if start == -1 {
			return s
		}
		// closeTag is searched relative to start, mirroring the original logic.
		end := strings.Index(s[start:], closeTag)
		if end == -1 {
			return s[:start]
		}
		s = s[:start] + s[start+end+len(closeTag):]
	}
}
105 changes: 97 additions & 8 deletions common/pagetypeclassifier/pagetypeclassifier_test.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
package pagetypeclassifier

import (
"strings"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestPageTypeClassifier(t *testing.T) {

t.Run("test creation of new PageTypeClassifier", func(t *testing.T) {
epc := New()
assert.NotNil(t, epc)
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)
})

t.Run("test classification non error page text", func(t *testing.T) {
epc := New()
assert.Equal(t, "nonerror", epc.Classify(`<!DOCTYPE html>
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)
require.Equal(t, "nonerror", epc.Classify(`<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
Expand All @@ -30,8 +33,10 @@ func TestPageTypeClassifier(t *testing.T) {
})

t.Run("test classification on error page text", func(t *testing.T) {
epc := New()
assert.Equal(t, "error", epc.Classify(`<!DOCTYPE html>
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)
require.Equal(t, "error", epc.Classify(`<!DOCTYPE html>
<html>
<head>
<title>Error 403: Forbidden</title>
Expand All @@ -51,4 +56,88 @@ func TestPageTypeClassifier(t *testing.T) {
</html>
`))
})

t.Run("test resilience with deeply nested HTML", func(t *testing.T) {
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)

// Generate deeply nested HTML that would have exceeded the 512 node stack limit
// With our enhanced sanitization and fallback mechanisms, this should now work
deeplyNestedHTML := "<div>"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "<div><span>"
}
deeplyNestedHTML += "Some text content"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "</span></div>"
}
deeplyNestedHTML += "</div>"

// Should not panic and should successfully classify the content
result := epc.Classify(deeplyNestedHTML)
require.NotEmpty(t, result)
// Should be able to extract and classify the text content
require.NotEqual(t, "", result)
})

t.Run("test htmlToText with deeply nested HTML", func(t *testing.T) {
// Generate deeply nested HTML that would have exceeded the 512 node stack limit
deeplyNestedHTML := "<div>"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "<div><span>"
}
deeplyNestedHTML += "Some text content"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "</span></div>"
}
deeplyNestedHTML += "</div>"

// Should not panic and should successfully extract text with enhanced sanitization
result, err := htmlToText(deeplyNestedHTML)
require.NoError(t, err)
require.NotEmpty(t, result)
require.Contains(t, result, "Some text content")
})

t.Run("test htmlToText with normal HTML", func(t *testing.T) {
normalHTML := `<html><body><h1>Title</h1><p>Some content here</p></body></html>`
result, err := htmlToText(normalHTML)
require.NoError(t, err)
require.NotEmpty(t, result)
})

t.Run("test htmlToText with extremely large HTML", func(t *testing.T) {
// Create a very large HTML document (over 1MB)
largeContent := strings.Repeat("<p>This is a test paragraph with some content. ", 50000)
largeHTML := "<html><body>" + largeContent + "</body></html>"

// Should handle large documents without panic
result, err := htmlToText(largeHTML)
require.NoError(t, err)
require.NotEmpty(t, result)
})

t.Run("test extractPlainText fallback", func(t *testing.T) {
htmlWithScriptAndStyle := `<html>
<head>
<style>body { color: red; }</style>
<script>alert('test');</script>
</head>
<body>
<h1>Title</h1>
<p>Some <strong>important</strong> content here</p>
<div><span>Nested content</span></div>
</body>
</html>`

result := extractPlainText(htmlWithScriptAndStyle)
require.NotEmpty(t, result)
require.Contains(t, result, "Title")
require.Contains(t, result, "important")
require.Contains(t, result, "content")
// Should not contain script or style content
require.NotContains(t, result, "alert")
require.NotContains(t, result, "color: red")
})
}
8 changes: 8 additions & 0 deletions common/stringz/stringz.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@ func AddURLDefaultPort(rawURL string) string {
if err != nil {
return rawURL
}
// Force default port to be added if not present
if u.Port() == "" {
if u.Scheme == urlutil.HTTP {
u.UpdatePort("80")
} else if u.Scheme == urlutil.HTTPS {
u.UpdatePort("443")
}
}
return u.String()
}

Expand Down
Loading
Loading