Skip to content

Commit 34637a1

Browse files
committed
Reduce size of Bytes
1 parent 6b40344 commit 34637a1

1 file changed

Lines changed: 60 additions & 26 deletions

File tree

src/lib.rs

Lines changed: 60 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ _hypher_ separates words into syllables.
66
efficiently encoded finite automata at build time.
77
- Zero load time: Hyphenation automata operate directly over the embedded
88
binary data with no up-front decoding.
9-
- No allocations unless when hyphenating very long words (> 41 bytes). You can
9+
- No allocations except when hyphenating very long words (> 45 bytes). You can
1010
disable the `alloc` feature, but then overly long words lead to a panic.
1111
- Support for many languages.
1212
- No unsafe code, no dependencies, no std.
@@ -40,10 +40,10 @@ assert_eq!(syllables.next(), None);
4040
)]
4141
/*!
4242
# Languages
43-
By default, this crate supports hyphenating more than 30 languages. Embedding
44-
automata for all these languages will add ~1.1 MiB to your binary. Alternatively,
45-
you can disable support for all languages and manually choose which ones get
46-
added:
43+
By default, this crate supports hyphenating more than 30 languages.
44+
Embedding automata for all these languages will add ~1.1 MiB to your binary.
45+
Alternatively, you can disable support for all languages and manually choose
46+
which ones get added:
4747
4848
```toml
4949
[dependencies]
@@ -60,6 +60,7 @@ extern crate alloc;
6060

6161
use core::fmt::{self, Debug, Formatter};
6262
use core::iter::FusedIterator;
63+
use core::num::NonZeroU8;
6364

6465
// Include language data.
6566
include!("lang.rs");
@@ -71,8 +72,8 @@ include!("lang.rs");
7172
/// This uses the default [bounds](Lang::bounds) for the language.
7273
///
7374
/// # Panics
74-
/// Panics if the word is more than 41 bytes long and the `alloc` feature is
75-
/// disabled.
75+
/// Panics if the word is more than [`MAX_INLINE_SIZE`] bytes long and the `alloc`
76+
/// feature is disabled.
7677
///
7778
/// # Example
7879
/// ```
@@ -89,14 +90,14 @@ pub fn hyphenate(word: &str, lang: Lang) -> Syllables<'_> {
8990
hyphenate_bounded(word, lang, left_min, right_min)
9091
}
9192

92-
/// Segment a word into syllables, but forbid breaking betwen the given number
93+
/// Segment a word into syllables, but forbid breaking between the given number
9394
/// of chars to each side.
9495
///
9596
/// Returns an iterator over the syllables.
9697
///
9798
/// # Panics
98-
/// Panics if the word is more than 41 bytes long and the `alloc` feature is
99-
/// disabled.
99+
/// Panics if the word is more than [`MAX_INLINE_SIZE`] bytes long and the `alloc`
100+
/// feature is disabled.
100101
///
101102
/// # Example
102103
/// By setting the left bound to three, we forbid the possible break between
@@ -262,22 +263,31 @@ impl ExactSizeIterator for Syllables<'_> {}
262263

263264
impl FusedIterator for Syllables<'_> {}
264265

266+
/// The maximum size (in bytes) of words that may be hyphenated without
267+
/// allocating.
268+
pub const MAX_INLINE_SIZE: usize = 45;
269+
const INLINE_BUF_SIZE: usize = MAX_INLINE_SIZE + 2; // +2 for dots
270+
265271
/// Storage for and iterator over bytes.
266272
#[derive(Clone)]
267273
enum Bytes {
268-
Array(core::array::IntoIter<u8, 40>, usize),
274+
Array([u8; INLINE_BUF_SIZE], NonZeroU8),
269275
#[cfg(feature = "alloc")]
270276
Vec(alloc::vec::IntoIter<u8>),
271277
}
272278

273279
impl Bytes {
274280
/// Create zero-initialized bytes.
275281
fn zeros(len: usize) -> Self {
276-
if len <= 40 {
277-
Self::Array([0; 40].into_iter(), len)
282+
if len <= INLINE_BUF_SIZE {
283+
// Since `len <= INLINE_BUF_SIZE`, `INLINE_BUF_SIZE + 1 - len` is at least 1, so the unwrap cannot fail.
284+
let start = NonZeroU8::new(INLINE_BUF_SIZE as u8 + 1 - len as u8).unwrap();
285+
Self::Array([0; INLINE_BUF_SIZE], start)
278286
} else {
279287
#[cfg(not(feature = "alloc"))]
280-
panic!("hypher: maximum word length is 41 when `alloc` is disabled");
288+
panic!(
289+
"hypher: maximum word length is {MAX_INLINE_SIZE} bytes when `alloc` is disabled"
290+
);
281291

282292
#[cfg(feature = "alloc")]
283293
Self::Vec(alloc::vec![0; len].into_iter())
@@ -287,7 +297,7 @@ impl Bytes {
287297
/// Access the bytes as a slice.
288298
fn as_slice(&self) -> &[u8] {
289299
match self {
290-
Self::Array(iter, len) => &iter.as_slice()[..*len],
300+
Self::Array(arr, start) => &arr[start.get() as usize - 1..],
291301
#[cfg(feature = "alloc")]
292302
Self::Vec(iter) => iter.as_slice(),
293303
}
@@ -296,7 +306,7 @@ impl Bytes {
296306
/// Access the bytes as a mutable slice.
297307
fn as_mut_slice(&mut self) -> &mut [u8] {
298308
match self {
299-
Self::Array(iter, len) => &mut iter.as_mut_slice()[..*len],
309+
Self::Array(arr, start) => &mut arr[start.get() as usize - 1..],
300310
#[cfg(feature = "alloc")]
301311
Self::Vec(iter) => iter.as_mut_slice(),
302312
}
@@ -308,10 +318,11 @@ impl Iterator for Bytes {
308318

309319
fn next(&mut self) -> Option<Self::Item> {
310320
match self {
311-
Self::Array(iter, len) => {
312-
if *len > 0 {
313-
*len -= 1;
314-
iter.next()
321+
Self::Array(arr, start) => {
322+
let index = start.get() as usize - 1;
323+
if index < INLINE_BUF_SIZE {
324+
*start = start.saturating_add(1); // Will never reach 255 anyways.
325+
Some(arr[index])
315326
} else {
316327
None
317328
}
@@ -323,7 +334,7 @@ impl Iterator for Bytes {
323334

324335
fn size_hint(&self) -> (usize, Option<usize>) {
325336
match self {
326-
Self::Array(_, len) => (*len, Some(*len)),
337+
Self::Array(..) => (self.as_slice().len(), Some(self.as_slice().len())),
327338
#[cfg(feature = "alloc")]
328339
Self::Vec(iter) => iter.size_hint(),
329340
}
@@ -442,7 +453,7 @@ fn is_char_boundary(b: u8) -> bool {
442453

443454
#[cfg(test)]
444455
mod tests {
445-
use super::{hyphenate, Lang};
456+
use super::{hyphenate, Lang, MAX_INLINE_SIZE};
446457

447458
#[allow(unused)]
448459
use Lang::*;
@@ -457,16 +468,39 @@ mod tests {
457468
#[test]
458469
#[cfg(feature = "english")]
459470
fn test_empty() {
460-
let mut syllables = hyphenate("", Lang::English);
471+
let mut syllables = hyphenate("", English);
461472
assert_eq!(syllables.next(), None);
462473
}
463474

464475
#[test]
465476
#[cfg(feature = "english")]
466477
fn test_exact() {
467-
assert_eq!(hyphenate("", Lang::English).len(), 0);
468-
assert_eq!(hyphenate("hello", Lang::English).len(), 1);
469-
assert_eq!(hyphenate("extensive", Lang::English).len(), 3);
478+
assert_eq!(hyphenate("", English).len(), 0);
479+
assert_eq!(hyphenate("hello", English).len(), 1);
480+
assert_eq!(hyphenate("extensive", English).len(), 3);
481+
}
482+
483+
const LONG_WORD: &str = "thisisaverylongstringwithanunrealisticwordlengthforenglishbutitmightbepossibleinanotherlanguage";
484+
485+
#[test]
486+
#[cfg(all(feature = "english", feature = "alloc"))]
487+
fn test_alloc() {
488+
assert_eq!(hyphenate(&LONG_WORD[..MAX_INLINE_SIZE - 1], English).len(), 13);
489+
assert_eq!(hyphenate(&LONG_WORD[..MAX_INLINE_SIZE], English).len(), 12);
490+
assert_eq!(hyphenate(&LONG_WORD[..MAX_INLINE_SIZE + 1], English).len(), 12);
491+
assert_eq!(hyphenate(LONG_WORD, English).len(), 25);
492+
}
493+
494+
#[test]
495+
#[cfg(all(feature = "english", not(feature = "alloc")))]
496+
fn test_nonalloc() {
497+
_ = hyphenate(&LONG_WORD[..MAX_INLINE_SIZE], English).count();
498+
}
499+
#[test]
500+
#[should_panic]
501+
#[cfg(all(feature = "english", not(feature = "alloc")))]
502+
fn test_nonalloc_fail() {
503+
_ = hyphenate(&LONG_WORD[..MAX_INLINE_SIZE + 1], English).count();
470504
}
471505

472506
#[test]

0 commit comments

Comments
 (0)