@@ -6,7 +6,7 @@ _hypher_ separates words into syllables.
66 efficiently encoded finite automata at build time.
77- Zero load time: Hyphenation automata operate directly over the embedded
88 binary data with no up-front decoding.
9- - No allocations unless when hyphenating very long words (> 41 bytes). You can
9+ - No allocations unless hyphenating very long words (> 45 bytes). You can
1010 disable the `alloc` feature, but then overly long words lead to a panic.
1111- Support for many languages.
1212- No unsafe code, no dependencies, no std.
@@ -40,10 +40,10 @@ assert_eq!(syllables.next(), None);
4040) ]
4141/*!
4242# Languages
43- By default, this crate supports hyphenating more than 30 languages. Embedding
44- automata for all these languages will add ~1.1 MiB to your binary. Alternatively,
45- you can disable support for all languages and manually choose which ones get
46- added:
43+ By default, this crate supports hyphenating more than 30 languages.
44+ Embedding automata for all these languages will add ~1.1 MiB to your binary.
45+ Alternatively, you can disable support for all languages and manually choose
46+ which ones get added:
4747
4848```toml
4949[dependencies]
@@ -60,6 +60,7 @@ extern crate alloc;
6060
6161use core:: fmt:: { self , Debug , Formatter } ;
6262use core:: iter:: FusedIterator ;
63+ use core:: num:: NonZeroU8 ;
6364
6465// Include language data.
6566include ! ( "lang.rs" ) ;
@@ -71,8 +72,8 @@ include!("lang.rs");
7172/// This uses the default [bounds](Lang::bounds) for the language.
7273///
7374/// # Panics
74- /// Panics if the word is more than 41 bytes long and the `alloc` feature is
75- /// disabled.
75+ /// Panics if the word is more than [`MAX_INLINE_SIZE`] bytes long and the `alloc`
76+ /// feature is disabled.
7677///
7778/// # Example
7879/// ```
@@ -89,14 +90,14 @@ pub fn hyphenate(word: &str, lang: Lang) -> Syllables<'_> {
8990 hyphenate_bounded ( word, lang, left_min, right_min)
9091}
9192
92- /// Segment a word into syllables, but forbid breaking betwen the given number
93+ /// Segment a word into syllables, but forbid breaking between the given number
9394/// of chars to each side.
9495///
9596/// Returns an iterator over the syllables.
9697///
9798/// # Panics
98- /// Panics if the word is more than 41 bytes long and the `alloc` feature is
99- /// disabled.
99+ /// Panics if the word is more than [`MAX_INLINE_SIZE`] bytes long and the `alloc`
100+ /// feature is disabled.
100101///
101102/// # Example
102103/// By setting the left bound to three, we forbid the possible break between
@@ -262,22 +263,31 @@ impl ExactSizeIterator for Syllables<'_> {}
262263
263264impl FusedIterator for Syllables < ' _ > { }
264265
266+ /// The maximum size (in bytes) of words that may be hyphenated without
267+ /// allocating.
268+ pub const MAX_INLINE_SIZE : usize = 45 ;
269+ const INLINE_BUF_SIZE : usize = MAX_INLINE_SIZE + 2 ; // +2 for dots
270+
265271/// Storage for and iterator over bytes.
266272#[ derive( Clone ) ]
267273enum Bytes {
268- Array ( core :: array :: IntoIter < u8 , 40 > , usize ) ,
274+ Array ( [ u8 ; INLINE_BUF_SIZE ] , NonZeroU8 ) ,
269275 #[ cfg( feature = "alloc" ) ]
270276 Vec ( alloc:: vec:: IntoIter < u8 > ) ,
271277}
272278
273279impl Bytes {
274280 /// Create zero-initialized bytes.
275281 fn zeros ( len : usize ) -> Self {
276- if len <= 40 {
277- Self :: Array ( [ 0 ; 40 ] . into_iter ( ) , len)
282+ if len <= INLINE_BUF_SIZE {
283+ // `len <= INLINE_BUF_SIZE` here, so `INLINE_BUF_SIZE + 1 - len` is at least 1 and the unwrap cannot fail.
284+ let start = NonZeroU8 :: new ( INLINE_BUF_SIZE as u8 + 1 - len as u8 ) . unwrap ( ) ;
285+ Self :: Array ( [ 0 ; INLINE_BUF_SIZE ] , start)
278286 } else {
279287 #[ cfg( not( feature = "alloc" ) ) ]
280- panic ! ( "hypher: maximum word length is 41 when `alloc` is disabled" ) ;
288+ panic ! (
289+ "hypher: maximum word length is {MAX_INLINE_SIZE} bytes when `alloc` is disabled"
290+ ) ;
281291
282292 #[ cfg( feature = "alloc" ) ]
283293 Self :: Vec ( alloc:: vec![ 0 ; len] . into_iter ( ) )
@@ -287,7 +297,7 @@ impl Bytes {
287297 /// Access the bytes as a slice.
288298 fn as_slice ( & self ) -> & [ u8 ] {
289299 match self {
290- Self :: Array ( iter , len ) => & iter . as_slice ( ) [ .. * len ] ,
300+ Self :: Array ( arr , start ) => & arr [ start . get ( ) as usize - 1 .. ] ,
291301 #[ cfg( feature = "alloc" ) ]
292302 Self :: Vec ( iter) => iter. as_slice ( ) ,
293303 }
@@ -296,7 +306,7 @@ impl Bytes {
296306 /// Access the bytes as a mutable slice.
297307 fn as_mut_slice ( & mut self ) -> & mut [ u8 ] {
298308 match self {
299- Self :: Array ( iter , len ) => & mut iter . as_mut_slice ( ) [ .. * len ] ,
309+ Self :: Array ( arr , start ) => & mut arr [ start . get ( ) as usize - 1 .. ] ,
300310 #[ cfg( feature = "alloc" ) ]
301311 Self :: Vec ( iter) => iter. as_mut_slice ( ) ,
302312 }
@@ -308,10 +318,11 @@ impl Iterator for Bytes {
308318
309319 fn next ( & mut self ) -> Option < Self :: Item > {
310320 match self {
311- Self :: Array ( iter, len) => {
312- if * len > 0 {
313- * len -= 1 ;
314- iter. next ( )
321+ Self :: Array ( arr, start) => {
322+ let index = start. get ( ) as usize - 1 ;
323+ if index < INLINE_BUF_SIZE {
324+ * start = start. saturating_add ( 1 ) ; // Will never reach 255 anyways.
325+ Some ( arr[ index] )
315326 } else {
316327 None
317328 }
@@ -323,7 +334,7 @@ impl Iterator for Bytes {
323334
324335 fn size_hint ( & self ) -> ( usize , Option < usize > ) {
325336 match self {
326- Self :: Array ( _ , len ) => ( * len, Some ( * len) ) ,
337+ Self :: Array ( .. ) => ( self . as_slice ( ) . len ( ) , Some ( self . as_slice ( ) . len ( ) ) ) ,
327338 #[ cfg( feature = "alloc" ) ]
328339 Self :: Vec ( iter) => iter. size_hint ( ) ,
329340 }
@@ -442,7 +453,7 @@ fn is_char_boundary(b: u8) -> bool {
442453
443454#[ cfg( test) ]
444455mod tests {
445- use super :: { hyphenate, Lang } ;
456+ use super :: { hyphenate, Lang , MAX_INLINE_SIZE } ;
446457
447458 #[ allow( unused) ]
448459 use Lang :: * ;
@@ -457,16 +468,39 @@ mod tests {
457468 #[ test]
458469 #[ cfg( feature = "english" ) ]
459470 fn test_empty ( ) {
460- let mut syllables = hyphenate ( "" , Lang :: English ) ;
471+ let mut syllables = hyphenate ( "" , English ) ;
461472 assert_eq ! ( syllables. next( ) , None ) ;
462473 }
463474
464475 #[ test]
465476 #[ cfg( feature = "english" ) ]
466477 fn test_exact ( ) {
467- assert_eq ! ( hyphenate( "" , Lang :: English ) . len( ) , 0 ) ;
468- assert_eq ! ( hyphenate( "hello" , Lang :: English ) . len( ) , 1 ) ;
469- assert_eq ! ( hyphenate( "extensive" , Lang :: English ) . len( ) , 3 ) ;
478+ assert_eq ! ( hyphenate( "" , English ) . len( ) , 0 ) ;
479+ assert_eq ! ( hyphenate( "hello" , English ) . len( ) , 1 ) ;
480+ assert_eq ! ( hyphenate( "extensive" , English ) . len( ) , 3 ) ;
481+ }
482+
483+ const LONG_WORD : & str = "thisisaverylongstringwithanunrealisticwordlengthforenglishbutitmightbepossibleinanotherlanguage" ;
484+
485+ #[ test]
486+ #[ cfg( all( feature = "english" , feature = "alloc" ) ) ]
487+ fn test_alloc ( ) {
488+ assert_eq ! ( hyphenate( & LONG_WORD [ ..MAX_INLINE_SIZE - 1 ] , English ) . len( ) , 13 ) ;
489+ assert_eq ! ( hyphenate( & LONG_WORD [ ..MAX_INLINE_SIZE ] , English ) . len( ) , 12 ) ;
490+ assert_eq ! ( hyphenate( & LONG_WORD [ ..MAX_INLINE_SIZE + 1 ] , English ) . len( ) , 12 ) ;
491+ assert_eq ! ( hyphenate( LONG_WORD , English ) . len( ) , 25 ) ;
492+ }
493+
494+ #[ test]
495+ #[ cfg( all( feature = "english" , not( feature = "alloc" ) ) ) ]
496+ fn test_nonalloc ( ) {
497+ _ = hyphenate ( & LONG_WORD [ ..MAX_INLINE_SIZE ] , English ) . count ( ) ;
498+ }
499+ #[ test]
500+ #[ should_panic]
501+ #[ cfg( all( feature = "english" , not( feature = "alloc" ) ) ) ]
502+ fn test_nonalloc_fail ( ) {
503+ _ = hyphenate ( & LONG_WORD [ ..MAX_INLINE_SIZE + 1 ] , English ) . count ( ) ;
470504 }
471505
472506 #[ test]
0 commit comments