Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7c94f19
Improve error handling in htmlToText function
GDATTACKER-RESEARCHER Nov 17, 2025
d9e8bac
chore(deps): bump golang.org/x/text from 0.30.0 to 0.31.0
dependabot[bot] Nov 17, 2025
b4545c2
chore(deps): bump golang.org/x/net from 0.46.0 to 0.47.0
dependabot[bot] Nov 17, 2025
6454c83
chore(deps): bump github.com/PuerkitoBio/goquery from 1.10.3 to 1.11.0
dependabot[bot] Nov 17, 2025
4ed7148
Merge branch 'dev' into dev
GDATTACKER-RESEARCHER Nov 17, 2025
329c781
Merge branch 'dev' into dependabot/go_modules/dev/golang.org/x/text-0…
Mzack9999 Nov 18, 2025
0336bec
chore(deps): bump the modules group with 5 updates
dependabot[bot] Nov 17, 2025
45d8476
Merge pull request #2312 from projectdiscovery/dependabot/go_modules/…
Mzack9999 Nov 18, 2025
05c6364
better error handling
Mzack9999 Nov 18, 2025
c6948ed
Merge pull request #2309 from GDATTACKER-RESEARCHER/dev
Mzack9999 Nov 18, 2025
0658afd
chore(deps): bump golang.org/x/crypto from 0.44.0 to 0.45.0
dependabot[bot] Nov 20, 2025
0309110
adding panic guard + tests
Mzack9999 Nov 20, 2025
5f13beb
lint
Mzack9999 Nov 20, 2025
9388133
chore(deps): bump github.com/weppos/publicsuffix-go
dependabot[bot] Nov 24, 2025
340eb79
chore(deps): bump the modules group with 11 updates
dependabot[bot] Nov 24, 2025
275d63f
fix test
Mzack9999 Nov 28, 2025
61a791c
Merge pull request #2317 from projectdiscovery/bufgix-2316-stack-over…
Mzack9999 Nov 28, 2025
a0fec14
chore(deps): bump github.com/JohannesKaufmann/html-to-markdown/v2
dependabot[bot] Dec 1, 2025
8b04cd9
chore(deps): bump the modules group with 10 updates
dependabot[bot] Dec 1, 2025
d114722
feat: update `-ldp` option to show default ports in CLI output (#2331)
ehsandeep Dec 6, 2025
599441e
fix: HTML parser panic protection with multiple fallback (#2330)
ehsandeep Dec 6, 2025
a00f9c4
fix: host JSON field now returns hostname instead of IP (#2333)
ehsandeep Dec 6, 2025
f7712f7
version update
ehsandeep Dec 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/httpx/httpx.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package httpx

import (
"context"
"crypto/tls"
"fmt"
"io"
Expand All @@ -25,7 +26,6 @@ import (
pdhttputil "github.com/projectdiscovery/utils/http"
stringsutil "github.com/projectdiscovery/utils/strings"
urlutil "github.com/projectdiscovery/utils/url"
"golang.org/x/net/context"
"golang.org/x/net/http2"
)

Expand Down
135 changes: 126 additions & 9 deletions common/pagetypeclassifier/pagetypeclassifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@ package pagetypeclassifier

import (
_ "embed"
"fmt"
"strings"
"sync"

htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
"github.com/microcosm-cc/bluemonday"
"github.com/projectdiscovery/utils/ml/naive_bayes"
)

Expand All @@ -14,26 +18,139 @@ type PageTypeClassifier struct {
classifier *naive_bayes.NaiveBayesClassifier
}

func New() *PageTypeClassifier {
func New() (*PageTypeClassifier, error) {
classifier, err := naive_bayes.NewClassifierFromFileData(classifierData)
if err != nil {
panic(err)
return nil, err
}
return &PageTypeClassifier{classifier: classifier}
return &PageTypeClassifier{classifier: classifier}, nil
}

func (n *PageTypeClassifier) Classify(html string) string {
text := htmlToText(html)
if text == "" {
text, err := htmlToText(html)
if err != nil || text == "" {
return "other"
}
return n.classifier.Classify(text)
}

func htmlToText(html string) string {
text, err := htmltomarkdown.ConvertString(html)
var (
// sanitizerPolicy is an aggressive bluemonday policy that strips most HTML
// to reduce nesting depth and prevent parser stack overflow
sanitizerPolicy *bluemonday.Policy
sanitizerPolicyOnce sync.Once
)

// getSanitizerPolicy returns an ultra-aggressive HTML sanitizer policy that strips
// almost all elements to minimize nesting depth and prevent parser stack overflow.
func getSanitizerPolicy() *bluemonday.Policy {
sanitizerPolicyOnce.Do(func() {
p := bluemonday.NewPolicy()
// Ultra-aggressive policy: Allow only the most basic text elements
// to minimize nesting and reduce parser stack depth
p.AllowElements("p", "br", "h1", "h2", "h3", "h4", "h5", "h6")
p.AllowElements("strong", "em", "b", "i")
// Remove div, span, ul, ol, li as they can create deep nesting
// No attributes allowed to prevent style-based nesting issues
sanitizerPolicy = p
})
return sanitizerPolicy
}

// htmlToText safely converts HTML to text with layered fallback strategies.
// The 512 node depth limit in golang.org/x/net/html is hardcoded and cannot
// be raised, so the input is tamed before it ever reaches the parser:
//
//  1. cap the input size so huge documents stay cheap to process
//  2. aggressively sanitize with bluemonday to flatten nesting
//  3. convert the sanitized HTML to markdown, guarded by panic recovery
//  4. on any failure, degrade to a crude plain-text extraction
func htmlToText(html string) (text string, err error) {
	// The underlying HTML parser can panic on pathological input; translate
	// any panic into an error so callers always get a (text, err) pair.
	defer func() {
		if r := recover(); r != nil {
			text, err = "", fmt.Errorf("html parser panic: %v", r)
		}
	}()

	// Truncate oversized documents before doing any parsing work.
	const maxHTMLSize = 1 << 20 // 1MB limit
	if len(html) > maxHTMLSize {
		html = html[:maxHTMLSize]
	}

	// Sanitize first with the ultra-aggressive policy to reduce nesting depth.
	cleaned := getSanitizerPolicy().Sanitize(html)
	if cleaned == "" {
		// Sanitizer produced nothing usable: fall back on the raw input.
		return extractPlainText(html), nil
	}

	// Convert the sanitized HTML to markdown; on failure or an empty result,
	// degrade gracefully to naive text extraction of the sanitized form.
	markdown, convErr := htmltomarkdown.ConvertString(cleaned)
	if convErr != nil || markdown == "" {
		return extractPlainText(cleaned), nil
	}
	return markdown, nil
}

// extractPlainText is a last-resort, parser-free text extractor used when
// proper HTML parsing fails due to complexity or nesting depth. It removes
// <script>/<style> blocks (content included), replaces remaining tags with
// spaces, and collapses runs of whitespace into single spaces.
// Matching is byte-oriented and case-sensitive — intentionally crude.
//
// BUG FIX: the previous version returned the intermediate `text` value
// before tag stripping completed, leaving raw tags in the output and making
// the whitespace-cleanup code unreachable; it now returns the cleaned result.
func extractPlainText(html string) string {
	text := html

	// Remove script tags and their content.
	text = stripTagBlock(text, "<script", "</script>")
	// Remove style tags and their content.
	text = stripTagBlock(text, "<style", "</style>")

	// Replace every remaining tag with a space so adjacent words don't fuse.
	// A strings.Builder avoids the quadratic cost of string concatenation.
	var b strings.Builder
	b.Grow(len(text))
	inTag := false
	for _, char := range text {
		switch {
		case char == '<':
			inTag = true
		case char == '>':
			inTag = false
			b.WriteByte(' ')
		case !inTag:
			b.WriteRune(char)
		}
	}

	// Collapse multiple spaces/newlines into single spaces.
	return strings.Join(strings.Fields(b.String()), " ")
}

// stripTagBlock removes every occurrence of openTag...closeTag (tags
// included) from s. An unterminated block is truncated at its opening tag.
func stripTagBlock(s, openTag, closeTag string) string {
	for {
		start := strings.Index(s, openTag)
		if start == -1 {
			return s
		}
		// closeTag is searched relative to start, mirroring the original logic.
		end := strings.Index(s[start:], closeTag)
		if end == -1 {
			return s[:start]
		}
		s = s[:start] + s[start+end+len(closeTag):]
	}
}
105 changes: 97 additions & 8 deletions common/pagetypeclassifier/pagetypeclassifier_test.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
package pagetypeclassifier

import (
"strings"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestPageTypeClassifier(t *testing.T) {

t.Run("test creation of new PageTypeClassifier", func(t *testing.T) {
epc := New()
assert.NotNil(t, epc)
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)
})

t.Run("test classification non error page text", func(t *testing.T) {
epc := New()
assert.Equal(t, "nonerror", epc.Classify(`<!DOCTYPE html>
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)
require.Equal(t, "nonerror", epc.Classify(`<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
Expand All @@ -30,8 +33,10 @@ func TestPageTypeClassifier(t *testing.T) {
})

t.Run("test classification on error page text", func(t *testing.T) {
epc := New()
assert.Equal(t, "error", epc.Classify(`<!DOCTYPE html>
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)
require.Equal(t, "error", epc.Classify(`<!DOCTYPE html>
<html>
<head>
<title>Error 403: Forbidden</title>
Expand All @@ -51,4 +56,88 @@ func TestPageTypeClassifier(t *testing.T) {
</html>
`))
})

t.Run("test resilience with deeply nested HTML", func(t *testing.T) {
epc, err := New()
require.NoError(t, err)
require.NotNil(t, epc)

// Generate deeply nested HTML that would have exceeded the 512 node stack limit
// With our enhanced sanitization and fallback mechanisms, this should now work
deeplyNestedHTML := "<div>"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "<div><span>"
}
deeplyNestedHTML += "Some text content"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "</span></div>"
}
deeplyNestedHTML += "</div>"

// Should not panic and should successfully classify the content
result := epc.Classify(deeplyNestedHTML)
require.NotEmpty(t, result)
// Should be able to extract and classify the text content
require.NotEqual(t, "", result)
})

t.Run("test htmlToText with deeply nested HTML", func(t *testing.T) {
// Generate deeply nested HTML that would have exceeded the 512 node stack limit
deeplyNestedHTML := "<div>"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "<div><span>"
}
deeplyNestedHTML += "Some text content"
for i := 0; i < 600; i++ {
deeplyNestedHTML += "</span></div>"
}
deeplyNestedHTML += "</div>"

// Should not panic and should successfully extract text with enhanced sanitization
result, err := htmlToText(deeplyNestedHTML)
require.NoError(t, err)
require.NotEmpty(t, result)
require.Contains(t, result, "Some text content")
})

t.Run("test htmlToText with normal HTML", func(t *testing.T) {
normalHTML := `<html><body><h1>Title</h1><p>Some content here</p></body></html>`
result, err := htmlToText(normalHTML)
require.NoError(t, err)
require.NotEmpty(t, result)
})

t.Run("test htmlToText with extremely large HTML", func(t *testing.T) {
// Create a very large HTML document (over 1MB)
largeContent := strings.Repeat("<p>This is a test paragraph with some content. ", 50000)
largeHTML := "<html><body>" + largeContent + "</body></html>"

// Should handle large documents without panic
result, err := htmlToText(largeHTML)
require.NoError(t, err)
require.NotEmpty(t, result)
})

t.Run("test extractPlainText fallback", func(t *testing.T) {
htmlWithScriptAndStyle := `<html>
<head>
<style>body { color: red; }</style>
<script>alert('test');</script>
</head>
<body>
<h1>Title</h1>
<p>Some <strong>important</strong> content here</p>
<div><span>Nested content</span></div>
</body>
</html>`

result := extractPlainText(htmlWithScriptAndStyle)
require.NotEmpty(t, result)
require.Contains(t, result, "Title")
require.Contains(t, result, "important")
require.Contains(t, result, "content")
// Should not contain script or style content
require.NotContains(t, result, "alert")
require.NotContains(t, result, "color: red")
})
}
8 changes: 8 additions & 0 deletions common/stringz/stringz.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,14 @@ func AddURLDefaultPort(rawURL string) string {
if err != nil {
return rawURL
}
// Force default port to be added if not present
if u.Port() == "" {
if u.Scheme == urlutil.HTTP {
u.UpdatePort("80")
} else if u.Scheme == urlutil.HTTPS {
u.UpdatePort("443")
}
}
return u.String()
}

Expand Down
Loading
Loading