@@ -733,6 +733,241 @@ fn extract_meta_tag(tag: &str, meta: &mut PageMetadata) {
733733 }
734734}
735735
736+ /// Strip boilerplate elements from HTML, keeping only main content.
737+ ///
738+ /// Removes `<nav>`, `<footer>`, `<aside>`, and elements with
739+ /// `role="navigation"`, `role="banner"`, `role="contentinfo"`.
740+ /// If `<main>` or `<article>` is present, extracts only their content.
741+ ///
742+ /// # Examples
743+ ///
744+ /// ```
745+ /// use fetchkit::strip_boilerplate;
746+ ///
747+ /// let html = r#"<nav>Menu</nav><main><p>Content</p></main><footer>Footer</footer>"#;
748+ /// let result = strip_boilerplate(html);
749+ /// assert!(result.contains("Content"));
750+ /// assert!(!result.contains("Menu"));
751+ /// assert!(!result.contains("Footer"));
752+ /// ```
753+ pub fn strip_boilerplate ( html : & str ) -> String {
754+ // Strategy: if <main> or <article> exists, extract just that content.
755+ // Otherwise, strip known boilerplate elements.
756+
757+ // Check if there's a <main> or <article> to focus on
758+ if let Some ( focused) = extract_main_content ( html) {
759+ return focused;
760+ }
761+
762+ // Fallback: strip boilerplate elements
763+ strip_boilerplate_elements ( html)
764+ }
765+
766+ /// Extract content from `<main>` or `<article>` tag if present.
767+ fn extract_main_content ( html : & str ) -> Option < String > {
768+ // Try <main> first, then <article>
769+ for target_tag in & [ "main" , "article" ] {
770+ if let Some ( content) = extract_tag_content ( html, target_tag) {
771+ return Some ( content) ;
772+ }
773+ }
774+
775+ // Try role="main"
776+ extract_role_content ( html, "main" )
777+ }
778+
/// Extract the inner HTML of the first `target` element found in `html`.
///
/// `target` is a bare tag name such as `"main"`; it is compared with the tag
/// names in `html` ASCII case-insensitively. Nested `target` elements are
/// balanced, so the returned text ends at the closing tag that matches the
/// first opening tag. Tags written in self-closing form (ending in `/`)
/// never open an element, and closing tags seen while no `target` element is
/// open are ignored.
///
/// Returns `None` when there is no opening `target` tag or it is never
/// closed.
///
/// This is a lightweight scanner rather than an HTML parser: every `<` is
/// taken to start a tag that runs up to the next `>` (or to the end of the
/// input), so a `>` inside an attribute value or a `<` inside script or
/// comment text can confuse it.
fn extract_tag_content(html: &str, target: &str) -> Option<String> {
    // Number of currently open `target` elements.
    let mut depth = 0usize;
    // Byte offset just past the outermost opening tag.
    let mut content_start = 0usize;
    // Byte offset at which scanning resumes.
    let mut cursor = 0usize;

    // `<` and `>` are single-byte ASCII, so every offset computed below lies
    // on a UTF-8 character boundary and the slicing cannot panic.
    while let Some(offset) = html[cursor..].find('<') {
        let tag_start = cursor + offset;
        let inner_start = tag_start + 1;
        // An unterminated final tag extends to the end of the input.
        let inner_end = html[inner_start..]
            .find('>')
            .map_or(html.len(), |len| inner_start + len);
        let tag = &html[inner_start..inner_end];
        cursor = (inner_end + 1).min(html.len());

        let (is_closing, rest) = match tag.strip_prefix('/') {
            Some(rest) => (true, rest),
            None => (false, tag),
        };
        let name = rest.split_whitespace().next().unwrap_or("");
        if !name.eq_ignore_ascii_case(target) {
            continue;
        }

        if is_closing {
            match depth {
                // Stray closing tag before any opening tag: ignore it instead
                // of letting it unbalance the nesting count.
                0 => {}
                // Closes the outermost element: the content is exactly the
                // input between the opening tag and this closing tag.
                1 => return Some(html[content_start..tag_start].to_string()),
                _ => depth -= 1,
            }
        } else if !tag.ends_with('/') {
            if depth == 0 {
                content_start = cursor;
            }
            depth += 1;
        }
    }

    None
}
832+
833+ /// Extract content of the first element with a given role attribute.
834+ fn extract_role_content ( html : & str , role : & str ) -> Option < String > {
835+ let mut chars = html. chars ( ) . peekable ( ) ;
836+ let mut capture_tag: Option < String > = None ;
837+ let mut depth = 0i32 ;
838+ let mut output = String :: new ( ) ;
839+
840+ while let Some ( c) = chars. next ( ) {
841+ if c == '<' {
842+ let mut tag = String :: new ( ) ;
843+ while let Some ( & next) = chars. peek ( ) {
844+ if next == '>' {
845+ chars. next ( ) ;
846+ break ;
847+ }
848+ tag. push ( chars. next ( ) . unwrap ( ) ) ;
849+ }
850+
851+ let tag_lower = tag. to_lowercase ( ) ;
852+ let is_closing = tag_lower. starts_with ( '/' ) ;
853+ let tag_name = if is_closing {
854+ tag_lower[ 1 ..] . split_whitespace ( ) . next ( ) . unwrap_or ( "" )
855+ } else {
856+ tag_lower. split_whitespace ( ) . next ( ) . unwrap_or ( "" )
857+ } ;
858+
859+ if let Some ( ref target) = capture_tag {
860+ if tag_name == target. as_str ( ) {
861+ if is_closing {
862+ depth -= 1 ;
863+ if depth == 0 {
864+ return Some ( output) ;
865+ }
866+ } else if !tag. ends_with ( '/' ) {
867+ depth += 1 ;
868+ }
869+ }
870+
871+ if depth > 0 {
872+ output. push ( '<' ) ;
873+ output. push_str ( & tag) ;
874+ output. push ( '>' ) ;
875+ }
876+ } else if !is_closing {
877+ // Check for role attribute
878+ if let Some ( attr_role) = extract_attribute ( & tag, "role" ) {
879+ if attr_role. eq_ignore_ascii_case ( role) && !tag. ends_with ( '/' ) {
880+ capture_tag = Some ( tag_name. to_string ( ) ) ;
881+ depth = 1 ;
882+ continue ;
883+ }
884+ }
885+ }
886+ } else if capture_tag. is_some ( ) && depth > 0 {
887+ output. push ( c) ;
888+ }
889+ }
890+
891+ None
892+ }
893+
/// Element names removed by the fallback stripping pass, which is used when
/// no `<main>`/`<article>` content was found.
const BOILERPLATE_TAGS: &[&str] = &[
    "nav",
    "footer",
    "aside",
    "header",
];

/// `role` attribute values that mark an element as boilerplate.
const BOILERPLATE_ROLES: &[&str] = &[
    "navigation",
    "banner",
    "contentinfo",
    "complementary",
];
899+
900+ /// Strip known boilerplate elements from HTML.
901+ fn strip_boilerplate_elements ( html : & str ) -> String {
902+ let mut output = String :: new ( ) ;
903+ let mut chars = html. chars ( ) . peekable ( ) ;
904+ let mut skip_depth = 0i32 ;
905+ let mut skip_tag: Option < String > = None ;
906+
907+ while let Some ( c) = chars. next ( ) {
908+ if c == '<' {
909+ let mut tag = String :: new ( ) ;
910+ while let Some ( & next) = chars. peek ( ) {
911+ if next == '>' {
912+ chars. next ( ) ;
913+ break ;
914+ }
915+ tag. push ( chars. next ( ) . unwrap ( ) ) ;
916+ }
917+
918+ let tag_lower = tag. to_lowercase ( ) ;
919+ let is_closing = tag_lower. starts_with ( '/' ) ;
920+ let tag_name = if is_closing {
921+ tag_lower[ 1 ..] . split_whitespace ( ) . next ( ) . unwrap_or ( "" )
922+ } else {
923+ tag_lower. split_whitespace ( ) . next ( ) . unwrap_or ( "" )
924+ } ;
925+
926+ // Track skip state
927+ if let Some ( ref target) = skip_tag {
928+ if tag_name == target. as_str ( ) {
929+ if is_closing {
930+ skip_depth -= 1 ;
931+ if skip_depth == 0 {
932+ skip_tag = None ;
933+ continue ;
934+ }
935+ } else if !tag. ends_with ( '/' ) {
936+ skip_depth += 1 ;
937+ }
938+ }
939+ continue ; // Skip everything inside boilerplate
940+ }
941+
942+ // Check if this tag should be skipped
943+ if !is_closing && !tag. ends_with ( '/' ) {
944+ let is_boilerplate_tag = BOILERPLATE_TAGS . contains ( & tag_name) ;
945+ let is_boilerplate_role = extract_attribute ( & tag, "role" )
946+ . map ( |r| {
947+ BOILERPLATE_ROLES
948+ . iter ( )
949+ . any ( |br| r. eq_ignore_ascii_case ( br) )
950+ } )
951+ . unwrap_or ( false ) ;
952+
953+ if is_boilerplate_tag || is_boilerplate_role {
954+ skip_tag = Some ( tag_name. to_string ( ) ) ;
955+ skip_depth = 1 ;
956+ continue ;
957+ }
958+ }
959+
960+ output. push ( '<' ) ;
961+ output. push_str ( & tag) ;
962+ output. push ( '>' ) ;
963+ } else if skip_tag. is_none ( ) {
964+ output. push ( c) ;
965+ }
966+ }
967+
968+ output
969+ }
970+
736971#[ cfg( test) ]
737972mod tests {
738973 use super :: * ;
@@ -1069,4 +1304,103 @@ mod tests {
10691304 } ;
10701305 assert ! ( !meta. is_empty( ) ) ;
10711306 }
1307+
#[test]
fn test_strip_boilerplate_extracts_main() {
    // A <main> element is present, so only its inner HTML should survive.
    let page = r#"<nav><a href="/">Home</a></nav>
        <main><p>Important content</p></main>
        <footer>Copyright 2024</footer>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Important content"));
    for dropped in ["Home", "Copyright"].iter() {
        assert!(!stripped.contains(*dropped));
    }
}
1318+
#[test]
fn test_strip_boilerplate_extracts_article() {
    // With no <main>, the <article> element becomes the focused content.
    let page = r#"<nav>Menu</nav>
        <article><h1>Title</h1><p>Body text</p></article>
        <aside>Sidebar</aside>"#;
    let stripped = strip_boilerplate(page);

    for kept in ["Title", "Body text"].iter() {
        assert!(stripped.contains(*kept));
    }
    for dropped in ["Menu", "Sidebar"].iter() {
        assert!(!stripped.contains(*dropped));
    }
}
1330+
#[test]
fn test_strip_boilerplate_main_takes_precedence_over_article() {
    let page = r#"<main><p>Main content</p></main>
        <article><p>Article content</p></article>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Main content"));
    // The <article> sits outside <main>, so it must not be included.
    assert!(!stripped.contains("Article content"));
}
1340+
#[test]
fn test_strip_boilerplate_fallback_strips_nav_footer_aside() {
    // No <main>/<article>: the fallback pass removes boilerplate elements.
    let page = r#"<div>
        <nav>Navigation links</nav>
        <p>Content paragraph</p>
        <footer>Footer info</footer>
        <aside>Sidebar widget</aside>
        </div>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Content paragraph"));
    for dropped in ["Navigation links", "Footer info", "Sidebar widget"].iter() {
        assert!(!stripped.contains(*dropped));
    }
}
1355+
#[test]
fn test_strip_boilerplate_role_navigation() {
    // Boilerplate identified by `role` attribute rather than by tag name.
    let page = r#"<div role="navigation">Nav menu</div>
        <p>Content</p>
        <div role="contentinfo">Footer stuff</div>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Content"));
    for dropped in ["Nav menu", "Footer stuff"].iter() {
        assert!(!stripped.contains(*dropped));
    }
}
1366+
#[test]
fn test_strip_boilerplate_role_main() {
    // An element with role="main" is treated as the focused content.
    let page = r#"<nav>Nav</nav>
        <div role="main"><p>Main content here</p></div>
        <footer>Foot</footer>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Main content here"));
    for dropped in ["Nav", "Foot"].iter() {
        assert!(!stripped.contains(*dropped));
    }
}
1377+
#[test]
fn test_strip_boilerplate_nested_nav() {
    // Everything nested inside <nav> must disappear along with it.
    let page = r#"<nav><ul><li><a href="/">Home</a></li><li><a href="/about">About</a></li></ul></nav>
        <p>Page content</p>"#;
    let stripped = strip_boilerplate(page);

    assert!(stripped.contains("Page content"));
    for dropped in ["Home", "About"].iter() {
        assert!(!stripped.contains(*dropped));
    }
}
1387+
#[test]
fn test_strip_boilerplate_no_semantic_html() {
    // Without main/article/nav/footer markup nothing is removed.
    let page = "<div><p>Content 1</p></div><div><p>Content 2</p></div>";
    let stripped = strip_boilerplate(page);

    for kept in ["Content 1", "Content 2"].iter() {
        assert!(stripped.contains(*kept));
    }
}
1396+
#[test]
fn test_strip_boilerplate_preserves_header_inside_main() {
    // A <header> nested in <main> is content; the site-level one is not.
    let page = r#"<header>Site header</header>
        <main><header><h1>Article header</h1></header><p>Body</p></main>"#;
    let stripped = strip_boilerplate(page);

    for kept in ["Article header", "Body"].iter() {
        assert!(stripped.contains(*kept));
    }
    assert!(!stripped.contains("Site header"));
}
10721406}
0 commit comments