feat(convert): add content-focused extraction with boilerplate stripping (#78)

chaliy · web-flow · commit 4162557cd486 · 2026-03-26T22:05:55.000-05:00
## What

Add `strip_boilerplate()` function and `content_focus` request field to reduce token waste from navigation, footers, and sidebars.

## Why

Most web pages are 80%+ boilerplate. Agents waste LLM context tokens on nav menus, footers, and sidebars that are irrelevant to the content they're trying to understand.

## How

- New `content_focus` field on `FetchRequest`: `"main"` strips boilerplate, `"full"` (default) keeps everything
- `strip_boilerplate()` strategy:
  1. If `<main>` or `<article>` exists, extract only that content
  2. If `role="main"` element exists, extract that
  3. Fallback: strip `<nav>`, `<footer>`, `<aside>`, `<header>` and elements with roles `navigation`, `banner`, `contentinfo`, `complementary`
- Applied before HTML→Markdown/Text conversion in DefaultFetcher

## Risk

- Low — opt-in via `content_focus: "main"`, default behavior unchanged
- Handles nested tags correctly

### Checklist

- [x] Unit tests passed (9 strip_boilerplate tests)
- [x] Clippy clean
- [x] Docs build clean

Closes #72
diff --git a/crates/fetchkit/src/convert.rs b/crates/fetchkit/src/convert.rs
@@ -733,6 +733,241 @@ fn extract_meta_tag(tag: &str, meta: &mut PageMetadata) {
     }
 }
 
+/// Strip boilerplate elements from HTML, keeping only main content.
+///
+/// If `<main>`, `<article>`, or a `role="main"` element is present (checked in that order),
+/// returns only its inner content. Otherwise removes `<nav>`, `<footer>`, `<aside>`, `<header>`,
+/// and elements with role `navigation`, `banner`, `contentinfo`, or `complementary`.
+///
+/// # Examples
+///
+/// ```
+/// use fetchkit::strip_boilerplate;
+///
+/// let html = r#"<nav>Menu</nav><main><p>Content</p></main><footer>Footer</footer>"#;
+/// let result = strip_boilerplate(html);
+/// assert!(result.contains("Content"));
+/// assert!(!result.contains("Menu"));
+/// assert!(!result.contains("Footer"));
+/// ```
+pub fn strip_boilerplate(html: &str) -> String {
+    // Strategy: if <main>, <article>, or a role="main" element exists, return just its
+    // inner content. Otherwise, strip known boilerplate elements.
+
+    // Check if there's a <main>, <article>, or role="main" element to focus on
+    if let Some(focused) = extract_main_content(html) {
+        return focused;
+    }
+
+    // Fallback: strip boilerplate elements
+    strip_boilerplate_elements(html)
+}
+
+/// Extract the inner content of `<main>`, else `<article>`, else the first `role="main"` element.
+fn extract_main_content(html: &str) -> Option<String> {
+    // Try <main> first, then <article>
+    for target_tag in &["main", "article"] {
+        if let Some(content) = extract_tag_content(html, target_tag) {
+            return Some(content);
+        }
+    }
+
+    // Try role="main"
+    extract_role_content(html, "main")
+}
+
+/// Extract the inner content of the first occurrence of a given tag.
+fn extract_tag_content(html: &str, target: &str) -> Option<String> {
+    let mut chars = html.chars().peekable();
+    let mut depth = 0i32;
+    let mut capturing = false;
+    let mut output = String::new();
+
+    while let Some(c) = chars.next() {
+        if c == '<' {
+            let mut tag = String::new();
+            while let Some(&next) = chars.peek() {
+                if next == '>' {
+                    chars.next();
+                    break;
+                }
+                tag.push(chars.next().unwrap());
+            }
+
+            let tag_lower = tag.to_lowercase();
+            let is_closing = tag_lower.starts_with('/');
+            let tag_name = if is_closing {
+                tag_lower[1..].split_whitespace().next().unwrap_or("")
+            } else {
+                tag_lower.split_whitespace().next().unwrap_or("")
+            };
+
+            if tag_name == target {
+                if is_closing {
+                    depth -= 1;
+                    if depth == 0 && capturing {
+                        return Some(output);
+                    }
+                } else if !tag.ends_with('/') {
+                    depth += 1;
+                    if depth == 1 && !capturing {
+                        capturing = true;
+                        continue;
+                    }
+                }
+            }
+
+            if capturing {
+                output.push('<');
+                output.push_str(&tag);
+                output.push('>');
+            }
+        } else if capturing {
+            output.push(c);
+        }
+    }
+
+    None
+}
+
+/// Extract content of the first element with a given role attribute.
+fn extract_role_content(html: &str, role: &str) -> Option<String> {
+    let mut chars = html.chars().peekable();
+    let mut capture_tag: Option<String> = None;
+    let mut depth = 0i32;
+    let mut output = String::new();
+
+    while let Some(c) = chars.next() {
+        if c == '<' {
+            let mut tag = String::new();
+            while let Some(&next) = chars.peek() {
+                if next == '>' {
+                    chars.next();
+                    break;
+                }
+                tag.push(chars.next().unwrap());
+            }
+
+            let tag_lower = tag.to_lowercase();
+            let is_closing = tag_lower.starts_with('/');
+            let tag_name = if is_closing {
+                tag_lower[1..].split_whitespace().next().unwrap_or("")
+            } else {
+                tag_lower.split_whitespace().next().unwrap_or("")
+            };
+
+            if let Some(ref target) = capture_tag {
+                if tag_name == target.as_str() {
+                    if is_closing {
+                        depth -= 1;
+                        if depth == 0 {
+                            return Some(output);
+                        }
+                    } else if !tag.ends_with('/') {
+                        depth += 1;
+                    }
+                }
+
+                if depth > 0 {
+                    output.push('<');
+                    output.push_str(&tag);
+                    output.push('>');
+                }
+            } else if !is_closing {
+                // Check for role attribute
+                if let Some(attr_role) = extract_attribute(&tag, "role") {
+                    if attr_role.eq_ignore_ascii_case(role) && !tag.ends_with('/') {
+                        capture_tag = Some(tag_name.to_string());
+                        depth = 1;
+                        continue;
+                    }
+                }
+            }
+        } else if capture_tag.is_some() && depth > 0 {
+            output.push(c);
+        }
+    }
+
+    None
+}
+
+/// Boilerplate tags to strip when no `<main>`, `<article>`, or `role="main"` element is found.
+const BOILERPLATE_TAGS: &[&str] = &["nav", "footer", "aside", "header"];
+
+/// Roles that indicate boilerplate.
+const BOILERPLATE_ROLES: &[&str] = &["navigation", "banner", "contentinfo", "complementary"];
+
+/// Strip known boilerplate elements from HTML.
+fn strip_boilerplate_elements(html: &str) -> String {
+    let mut output = String::new();
+    let mut chars = html.chars().peekable();
+    let mut skip_depth = 0i32;
+    let mut skip_tag: Option<String> = None;
+
+    while let Some(c) = chars.next() {
+        if c == '<' {
+            let mut tag = String::new();
+            while let Some(&next) = chars.peek() {
+                if next == '>' {
+                    chars.next();
+                    break;
+                }
+                tag.push(chars.next().unwrap());
+            }
+
+            let tag_lower = tag.to_lowercase();
+            let is_closing = tag_lower.starts_with('/');
+            let tag_name = if is_closing {
+                tag_lower[1..].split_whitespace().next().unwrap_or("")
+            } else {
+                tag_lower.split_whitespace().next().unwrap_or("")
+            };
+
+            // Track skip state
+            if let Some(ref target) = skip_tag {
+                if tag_name == target.as_str() {
+                    if is_closing {
+                        skip_depth -= 1;
+                        if skip_depth == 0 {
+                            skip_tag = None;
+                            continue;
+                        }
+                    } else if !tag.ends_with('/') {
+                        skip_depth += 1;
+                    }
+                }
+                continue; // Skip everything inside boilerplate
+            }
+
+            // Check if this tag should be skipped
+            if !is_closing && !tag.ends_with('/') {
+                let is_boilerplate_tag = BOILERPLATE_TAGS.contains(&tag_name);
+                let is_boilerplate_role = extract_attribute(&tag, "role")
+                    .map(|r| {
+                        BOILERPLATE_ROLES
+                            .iter()
+                            .any(|br| r.eq_ignore_ascii_case(br))
+                    })
+                    .unwrap_or(false);
+
+                if is_boilerplate_tag || is_boilerplate_role {
+                    skip_tag = Some(tag_name.to_string());
+                    skip_depth = 1;
+                    continue;
+                }
+            }
+
+            output.push('<');
+            output.push_str(&tag);
+            output.push('>');
+        } else if skip_tag.is_none() {
+            output.push(c);
+        }
+    }
+
+    output
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1069,4 +1304,103 @@ mod tests {
         };
         assert!(!meta.is_empty());
     }
+
+    #[test]
+    fn test_strip_boilerplate_extracts_main() {
+        let html = r#"<nav><a href="/">Home</a></nav>
+            <main><p>Important content</p></main>
+            <footer>Copyright 2024</footer>"#;
+        let result = strip_boilerplate(html);
+        assert!(result.contains("Important content"));
+        assert!(!result.contains("Home"));
+        assert!(!result.contains("Copyright"));
+    }
+
+    #[test]
+    fn test_strip_boilerplate_extracts_article() {
+        let html = r#"<nav>Menu</nav>
+            <article><h1>Title</h1><p>Body text</p></article>
+            <aside>Sidebar</aside>"#;
+        let result = strip_boilerplate(html);
+        assert!(result.contains("Title"));
+        assert!(result.contains("Body text"));
+        assert!(!result.contains("Menu"));
+        assert!(!result.contains("Sidebar"));
+    }
+
+    #[test]
+    fn test_strip_boilerplate_main_takes_precedence_over_article() {
+        let html = r#"<main><p>Main content</p></main>
+            <article><p>Article content</p></article>"#;
+        let result = strip_boilerplate(html);
+        assert!(result.contains("Main content"));
+        // Article is outside main, so not included
+        assert!(!result.contains("Article content"));
+    }
+
+    #[test]
+    fn test_strip_boilerplate_fallback_strips_nav_footer_aside() {
+        let html = r#"<div>
+            <nav>Navigation links</nav>
+            <p>Content paragraph</p>
+            <footer>Footer info</footer>
+            <aside>Sidebar widget</aside>
+        </div>"#;
+        let result = strip_boilerplate(html);
+        assert!(result.contains("Content paragraph"));
+        assert!(!result.contains("Navigation links"));
+        assert!(!result.contains("Footer info"));
+        assert!(!result.contains("Sidebar widget"));
+    }
+
+    #[test]
+    fn test_strip_boilerplate_role_navigation() {
+        let html = r#"<div role="navigation">Nav menu</div>
+            <p>Content</p>
+            <div role="contentinfo">Footer stuff</div>"#;
+        let result = strip_boilerplate(html);
+        assert!(result.contains("Content"));
+        assert!(!result.contains("Nav menu"));
+        assert!(!result.contains("Footer stuff"));
+    }
+
+    #[test]
+    fn test_strip_boilerplate_role_main() {
+        let html = r#"<nav>Nav</nav>
+            <div role="main"><p>Main content here</p></div>
+            <footer>Foot</footer>"#;
+        let result = strip_boilerplate(html);
+        assert!(result.contains("Main content here"));
+        assert!(!result.contains("Nav"));
+        assert!(!result.contains("Foot"));
+    }
+
+    #[test]
+    fn test_strip_boilerplate_nested_nav() {
+        let html = r#"<nav><ul><li><a href="/">Home</a></li><li><a href="/about">About</a></li></ul></nav>
+            <p>Page content</p>"#;
+        let result = strip_boilerplate(html);
+        assert!(result.contains("Page content"));
+        assert!(!result.contains("Home"));
+        assert!(!result.contains("About"));
+    }
+
+    #[test]
+    fn test_strip_boilerplate_no_semantic_html() {
+        // No main/article/nav/footer — returns everything
+        let html = "<div><p>Content 1</p></div><div><p>Content 2</p></div>";
+        let result = strip_boilerplate(html);
+        assert!(result.contains("Content 1"));
+        assert!(result.contains("Content 2"));
+    }
+
+    #[test]
+    fn test_strip_boilerplate_preserves_header_inside_main() {
+        let html = r#"<header>Site header</header>
+            <main><header><h1>Article header</h1></header><p>Body</p></main>"#;
+        let result = strip_boilerplate(html);
+        assert!(result.contains("Article header"));
+        assert!(result.contains("Body"));
+        assert!(!result.contains("Site header"));
+    }
 }
diff --git a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs
@@ -10,7 +10,7 @@
 use crate::client::FetchOptions;
 use crate::convert::{
     extract_headings, extract_metadata, filter_excessive_newlines, html_to_markdown, html_to_text,
-    is_html, is_markdown_content_type, is_plain_text_content_type,
+    is_html, is_markdown_content_type, is_plain_text_content_type, strip_boilerplate,
 };
 use crate::error::FetchError;
 use crate::fetchers::Fetcher;
@@ -253,8 +253,9 @@ impl Fetcher for DefaultFetcher {
         // Determine format and convert if needed
         // THREAT[TM-DOS-006]: Conversion input is bounded by max_body_size
         let is_html_content = is_html(&meta.content_type, &content);
+        let wants_main = request.wants_main_content();
 
-        // Extract structured metadata from HTML content
+        // Extract structured metadata from HTML content (before boilerplate stripping)
         let page_metadata = if is_html_content {
             let mut pm = extract_metadata(&content);
             pm.headings = extract_headings(&content);
@@ -277,12 +278,18 @@ impl Fetcher for DefaultFetcher {
                 debug!("Content-type is plain text; skipping HTML conversion");
                 ("text".to_string(), content)
             } else if is_html_content {
+                // Strip boilerplate before conversion if content_focus is "main"
+                let html = if wants_main {
+                    strip_boilerplate(&content)
+                } else {
+                    content
+                };
                 if wants_markdown {
-                    ("markdown".to_string(), html_to_markdown(&content))
+                    ("markdown".to_string(), html_to_markdown(&html))
                 } else if wants_text {
-                    ("text".to_string(), html_to_text(&content))
+                    ("text".to_string(), html_to_text(&html))
                 } else {
-                    ("raw".to_string(), content)
+                    ("raw".to_string(), html)
                 }
             } else {
                 ("raw".to_string(), content)
diff --git a/crates/fetchkit/src/lib.rs b/crates/fetchkit/src/lib.rs
@@ -85,7 +85,9 @@ mod tool;
 mod types;
 
 pub use client::{fetch, fetch_with_options, FetchOptions};
-pub use convert::{extract_headings, extract_metadata, html_to_markdown, html_to_text};
+pub use convert::{
+    extract_headings, extract_metadata, html_to_markdown, html_to_text, strip_boilerplate,
+};
 pub use dns::DnsPolicy;
 pub use error::{FetchError, ToolError};
 pub use fetchers::{
diff --git a/crates/fetchkit/src/types.rs b/crates/fetchkit/src/types.rs