Skip to content

Commit 4162557

Browse files
authored
feat(convert): add content-focused extraction with boilerplate stripping (#78)
## What

Add `strip_boilerplate()` function and `content_focus` request field to reduce token waste from navigation, footers, and sidebars.

## Why

Most web pages are 80%+ boilerplate. Agents waste LLM context tokens on nav menus, footers, and sidebars that are irrelevant to the content they're trying to understand.

## How

- New `content_focus` field on `FetchRequest`: `"main"` strips boilerplate, `"full"` (default) keeps everything
- `strip_boilerplate()` strategy:
  1. If `<main>` or `<article>` exists, extract only that content
  2. If `role="main"` element exists, extract that
  3. Fallback: strip `<nav>`, `<footer>`, `<aside>`, `<header>` and elements with roles `navigation`, `banner`, `contentinfo`, `complementary`
- Applied before HTML→Markdown/Text conversion in DefaultFetcher

## Risk

- Low — opt-in via `content_focus: "main"`, default behavior unchanged
- Handles nested tags correctly

### Checklist

- [x] Unit tests passed (9 strip_boilerplate tests)
- [x] Clippy clean
- [x] Docs build clean

Closes #72
1 parent 26f1347 commit 4162557

File tree

4 files changed

+368
-6
lines changed

4 files changed

+368
-6
lines changed

crates/fetchkit/src/convert.rs

Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,241 @@ fn extract_meta_tag(tag: &str, meta: &mut PageMetadata) {
733733
}
734734
}
735735

736+
/// Strip boilerplate elements from HTML, keeping only main content.
737+
///
738+
/// Removes `<nav>`, `<footer>`, `<aside>`, and elements with
739+
/// `role="navigation"`, `role="banner"`, `role="contentinfo"`.
740+
/// If `<main>` or `<article>` is present, extracts only their content.
741+
///
742+
/// # Examples
743+
///
744+
/// ```
745+
/// use fetchkit::strip_boilerplate;
746+
///
747+
/// let html = r#"<nav>Menu</nav><main><p>Content</p></main><footer>Footer</footer>"#;
748+
/// let result = strip_boilerplate(html);
749+
/// assert!(result.contains("Content"));
750+
/// assert!(!result.contains("Menu"));
751+
/// assert!(!result.contains("Footer"));
752+
/// ```
753+
pub fn strip_boilerplate(html: &str) -> String {
754+
// Strategy: if <main> or <article> exists, extract just that content.
755+
// Otherwise, strip known boilerplate elements.
756+
757+
// Check if there's a <main> or <article> to focus on
758+
if let Some(focused) = extract_main_content(html) {
759+
return focused;
760+
}
761+
762+
// Fallback: strip boilerplate elements
763+
strip_boilerplate_elements(html)
764+
}
765+
766+
/// Extract content from `<main>` or `<article>` tag if present.
767+
fn extract_main_content(html: &str) -> Option<String> {
768+
// Try <main> first, then <article>
769+
for target_tag in &["main", "article"] {
770+
if let Some(content) = extract_tag_content(html, target_tag) {
771+
return Some(content);
772+
}
773+
}
774+
775+
// Try role="main"
776+
extract_role_content(html, "main")
777+
}
778+
779+
/// Extract the inner HTML of the first `target` element in `html`.
///
/// The input is scanned once with a minimal tag tokenizer: everything between
/// a `<` and the next `>` is treated as a tag. Tag names are lower-cased before
/// comparison, so `target` must be given in lower case (e.g. `"main"`).
///
/// Nested elements with the same name are balanced, so the returned string
/// ends at the closing tag that matches the outermost opening tag. The
/// outermost opening and closing tags themselves are not part of the result.
///
/// Returns `None` when no `target` element is opened, or when the opened
/// element is never closed before the end of the input.
fn extract_tag_content(html: &str, target: &str) -> Option<String> {
    let mut chars = html.chars();
    // Number of `target` elements currently open. Capture is active exactly
    // while this is non-zero.
    let mut depth = 0u32;
    let mut output = String::new();

    while let Some(c) = chars.next() {
        if c != '<' {
            if depth > 0 {
                output.push(c);
            }
            continue;
        }

        // Raw tag text between `<` and the next `>` (or the end of the input).
        let tag: String = chars.by_ref().take_while(|&ch| ch != '>').collect();

        let tag_lower = tag.to_lowercase();
        let (is_closing, rest) = match tag_lower.strip_prefix('/') {
            Some(rest) => (true, rest),
            None => (false, tag_lower.as_str()),
        };
        let tag_name = rest.split_whitespace().next().unwrap_or("");

        if tag_name == target {
            if is_closing {
                // A stray closing tag seen before any opening tag must not
                // unbalance the counter; otherwise the real opening tag that
                // follows would never start the capture.
                if depth > 0 {
                    depth -= 1;
                    if depth == 0 {
                        return Some(output);
                    }
                }
            } else if !tag.ends_with('/') {
                depth += 1;
                if depth == 1 {
                    // The outermost opening tag is not part of the content.
                    continue;
                }
            }
        }

        if depth > 0 {
            output.push('<');
            output.push_str(&tag);
            output.push('>');
        }
    }

    None
}
832+
833+
/// Extract content of the first element with a given role attribute.
834+
fn extract_role_content(html: &str, role: &str) -> Option<String> {
835+
let mut chars = html.chars().peekable();
836+
let mut capture_tag: Option<String> = None;
837+
let mut depth = 0i32;
838+
let mut output = String::new();
839+
840+
while let Some(c) = chars.next() {
841+
if c == '<' {
842+
let mut tag = String::new();
843+
while let Some(&next) = chars.peek() {
844+
if next == '>' {
845+
chars.next();
846+
break;
847+
}
848+
tag.push(chars.next().unwrap());
849+
}
850+
851+
let tag_lower = tag.to_lowercase();
852+
let is_closing = tag_lower.starts_with('/');
853+
let tag_name = if is_closing {
854+
tag_lower[1..].split_whitespace().next().unwrap_or("")
855+
} else {
856+
tag_lower.split_whitespace().next().unwrap_or("")
857+
};
858+
859+
if let Some(ref target) = capture_tag {
860+
if tag_name == target.as_str() {
861+
if is_closing {
862+
depth -= 1;
863+
if depth == 0 {
864+
return Some(output);
865+
}
866+
} else if !tag.ends_with('/') {
867+
depth += 1;
868+
}
869+
}
870+
871+
if depth > 0 {
872+
output.push('<');
873+
output.push_str(&tag);
874+
output.push('>');
875+
}
876+
} else if !is_closing {
877+
// Check for role attribute
878+
if let Some(attr_role) = extract_attribute(&tag, "role") {
879+
if attr_role.eq_ignore_ascii_case(role) && !tag.ends_with('/') {
880+
capture_tag = Some(tag_name.to_string());
881+
depth = 1;
882+
continue;
883+
}
884+
}
885+
}
886+
} else if capture_tag.is_some() && depth > 0 {
887+
output.push(c);
888+
}
889+
}
890+
891+
None
892+
}
893+
894+
/// Lower-case element names that the fallback stripping pass removes,
/// together with everything nested inside them.
const BOILERPLATE_TAGS: &[&str] = &["nav", "footer", "aside", "header"];

/// `role` attribute values that mark an element as boilerplate for the
/// fallback stripping pass.
const BOILERPLATE_ROLES: &[&str] = &["navigation", "banner", "contentinfo", "complementary"];
899+
900+
/// Strip known boilerplate elements from HTML.
901+
fn strip_boilerplate_elements(html: &str) -> String {
902+
let mut output = String::new();
903+
let mut chars = html.chars().peekable();
904+
let mut skip_depth = 0i32;
905+
let mut skip_tag: Option<String> = None;
906+
907+
while let Some(c) = chars.next() {
908+
if c == '<' {
909+
let mut tag = String::new();
910+
while let Some(&next) = chars.peek() {
911+
if next == '>' {
912+
chars.next();
913+
break;
914+
}
915+
tag.push(chars.next().unwrap());
916+
}
917+
918+
let tag_lower = tag.to_lowercase();
919+
let is_closing = tag_lower.starts_with('/');
920+
let tag_name = if is_closing {
921+
tag_lower[1..].split_whitespace().next().unwrap_or("")
922+
} else {
923+
tag_lower.split_whitespace().next().unwrap_or("")
924+
};
925+
926+
// Track skip state
927+
if let Some(ref target) = skip_tag {
928+
if tag_name == target.as_str() {
929+
if is_closing {
930+
skip_depth -= 1;
931+
if skip_depth == 0 {
932+
skip_tag = None;
933+
continue;
934+
}
935+
} else if !tag.ends_with('/') {
936+
skip_depth += 1;
937+
}
938+
}
939+
continue; // Skip everything inside boilerplate
940+
}
941+
942+
// Check if this tag should be skipped
943+
if !is_closing && !tag.ends_with('/') {
944+
let is_boilerplate_tag = BOILERPLATE_TAGS.contains(&tag_name);
945+
let is_boilerplate_role = extract_attribute(&tag, "role")
946+
.map(|r| {
947+
BOILERPLATE_ROLES
948+
.iter()
949+
.any(|br| r.eq_ignore_ascii_case(br))
950+
})
951+
.unwrap_or(false);
952+
953+
if is_boilerplate_tag || is_boilerplate_role {
954+
skip_tag = Some(tag_name.to_string());
955+
skip_depth = 1;
956+
continue;
957+
}
958+
}
959+
960+
output.push('<');
961+
output.push_str(&tag);
962+
output.push('>');
963+
} else if skip_tag.is_none() {
964+
output.push(c);
965+
}
966+
}
967+
968+
output
969+
}
970+
736971
#[cfg(test)]
737972
mod tests {
738973
use super::*;
@@ -1069,4 +1304,103 @@ mod tests {
10691304
};
10701305
assert!(!meta.is_empty());
10711306
}
1307+
1308+
#[test]
fn test_strip_boilerplate_extracts_main() {
    // A <main> element is present, so only its inner HTML should survive.
    let page = r#"<nav><a href="/">Home</a></nav>
<main><p>Important content</p></main>
<footer>Copyright 2024</footer>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Important content"));
    for dropped in ["Home", "Copyright"] {
        assert!(!stripped.contains(dropped));
    }
}
1318+
1319+
#[test]
fn test_strip_boilerplate_extracts_article() {
    // Without <main>, the first <article> becomes the content container.
    let page = r#"<nav>Menu</nav>
<article><h1>Title</h1><p>Body text</p></article>
<aside>Sidebar</aside>"#;
    let stripped = strip_boilerplate(page);

    for kept in ["Title", "Body text"] {
        assert!(stripped.contains(kept));
    }
    for dropped in ["Menu", "Sidebar"] {
        assert!(!stripped.contains(dropped));
    }
}
1330+
1331+
#[test]
fn test_strip_boilerplate_main_takes_precedence_over_article() {
    let page = r#"<main><p>Main content</p></main>
<article><p>Article content</p></article>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Main content"));
    // The <article> sits outside <main>, so it must not leak into the result.
    assert!(!stripped.contains("Article content"));
}
1340+
1341+
#[test]
fn test_strip_boilerplate_fallback_strips_nav_footer_aside() {
    // No <main>/<article>: the fallback pass removes boilerplate in place.
    let page = r#"<div>
<nav>Navigation links</nav>
<p>Content paragraph</p>
<footer>Footer info</footer>
<aside>Sidebar widget</aside>
</div>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Content paragraph"));
    for dropped in ["Navigation links", "Footer info", "Sidebar widget"] {
        assert!(!stripped.contains(dropped));
    }
}
1355+
1356+
#[test]
fn test_strip_boilerplate_role_navigation() {
    // Generic elements are removed when their role marks them as boilerplate.
    let page = r#"<div role="navigation">Nav menu</div>
<p>Content</p>
<div role="contentinfo">Footer stuff</div>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Content"));
    for dropped in ["Nav menu", "Footer stuff"] {
        assert!(!stripped.contains(dropped));
    }
}
1366+
1367+
#[test]
fn test_strip_boilerplate_role_main() {
    // role="main" on a generic element acts as the content container.
    let page = r#"<nav>Nav</nav>
<div role="main"><p>Main content here</p></div>
<footer>Foot</footer>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Main content here"));
    for dropped in ["Nav", "Foot"] {
        assert!(!stripped.contains(dropped));
    }
}
1377+
1378+
#[test]
fn test_strip_boilerplate_nested_nav() {
    // Markup nested inside <nav> must be removed along with it.
    let page = r#"<nav><ul><li><a href="/">Home</a></li><li><a href="/about">About</a></li></ul></nav>
<p>Page content</p>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Page content"));
    for dropped in ["Home", "About"] {
        assert!(!stripped.contains(dropped));
    }
}
1387+
1388+
#[test]
fn test_strip_boilerplate_no_semantic_html() {
    // Neither a content container nor boilerplate elements: nothing is lost.
    let stripped = strip_boilerplate("<div><p>Content 1</p></div><div><p>Content 2</p></div>");

    for kept in ["Content 1", "Content 2"] {
        assert!(stripped.contains(kept));
    }
}
1396+
1397+
#[test]
fn test_strip_boilerplate_preserves_header_inside_main() {
    // A <header> nested in <main> is content; the site-level one is not.
    let page = r#"<header>Site header</header>
<main><header><h1>Article header</h1></header><p>Body</p></main>"#;
    let stripped = strip_boilerplate(page);

    for kept in ["Article header", "Body"] {
        assert!(stripped.contains(kept));
    }
    assert!(!stripped.contains("Site header"));
}
10721406
}

crates/fetchkit/src/fetchers/default.rs

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
use crate::client::FetchOptions;
1111
use crate::convert::{
1212
extract_headings, extract_metadata, filter_excessive_newlines, html_to_markdown, html_to_text,
13-
is_html, is_markdown_content_type, is_plain_text_content_type,
13+
is_html, is_markdown_content_type, is_plain_text_content_type, strip_boilerplate,
1414
};
1515
use crate::error::FetchError;
1616
use crate::fetchers::Fetcher;
@@ -253,8 +253,9 @@ impl Fetcher for DefaultFetcher {
253253
// Determine format and convert if needed
254254
// THREAT[TM-DOS-006]: Conversion input is bounded by max_body_size
255255
let is_html_content = is_html(&meta.content_type, &content);
256+
let wants_main = request.wants_main_content();
256257

257-
// Extract structured metadata from HTML content
258+
// Extract structured metadata from HTML content (before boilerplate stripping)
258259
let page_metadata = if is_html_content {
259260
let mut pm = extract_metadata(&content);
260261
pm.headings = extract_headings(&content);
@@ -277,12 +278,18 @@ impl Fetcher for DefaultFetcher {
277278
debug!("Content-type is plain text; skipping HTML conversion");
278279
("text".to_string(), content)
279280
} else if is_html_content {
281+
// Strip boilerplate before conversion if content_focus is "main"
282+
let html = if wants_main {
283+
strip_boilerplate(&content)
284+
} else {
285+
content
286+
};
280287
if wants_markdown {
281-
("markdown".to_string(), html_to_markdown(&content))
288+
("markdown".to_string(), html_to_markdown(&html))
282289
} else if wants_text {
283-
("text".to_string(), html_to_text(&content))
290+
("text".to_string(), html_to_text(&html))
284291
} else {
285-
("raw".to_string(), content)
292+
("raw".to_string(), html)
286293
}
287294
} else {
288295
("raw".to_string(), content)

crates/fetchkit/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,9 @@ mod tool;
8585
mod types;
8686

8787
pub use client::{fetch, fetch_with_options, FetchOptions};
88-
pub use convert::{extract_headings, extract_metadata, html_to_markdown, html_to_text};
88+
pub use convert::{
89+
extract_headings, extract_metadata, html_to_markdown, html_to_text, strip_boilerplate,
90+
};
8991
pub use dns::DnsPolicy;
9092
pub use error::{FetchError, ToolError};
9193
pub use fetchers::{

0 commit comments

Comments
 (0)