From 30bd3d403ae165d853e8035937a6993409af769d Mon Sep 17 00:00:00 2001
From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
Date: Tue, 13 Jan 2026 11:10:19 -0800
Subject: [PATCH 1/2] Benchmark for shared memory operations

Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
---
 src/hyperlight_host/benches/benchmarks.rs | 59 ++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 2 deletions(-)

diff --git a/src/hyperlight_host/benches/benchmarks.rs b/src/hyperlight_host/benches/benchmarks.rs
index 70b0a0416..3b47781fb 100644
--- a/src/hyperlight_host/benches/benchmarks.rs
+++ b/src/hyperlight_host/benches/benchmarks.rs
@@ -23,12 +23,13 @@ use std::sync::{Arc, Barrier, Mutex};
 use std::thread;
 use std::time::{Duration, Instant};
 
-use criterion::{Criterion, criterion_group, criterion_main};
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
 use flatbuffers::FlatBufferBuilder;
 use hyperlight_common::flatbuffer_wrappers::function_call::{FunctionCall, FunctionCallType};
 use hyperlight_common::flatbuffer_wrappers::function_types::{ParameterValue, ReturnType};
 use hyperlight_common::flatbuffer_wrappers::util::estimate_flatbuffer_capacity;
 use hyperlight_host::GuestBinary;
+use hyperlight_host::mem::shared_mem::ExclusiveSharedMemory;
 use hyperlight_host::sandbox::{MultiUseSandbox, SandboxConfiguration, UninitializedSandbox};
 use hyperlight_testing::sandbox_sizes::{LARGE_HEAP_SIZE, MEDIUM_HEAP_SIZE, SMALL_HEAP_SIZE};
 use hyperlight_testing::{c_simple_guest_as_string, simple_guest_as_string};
@@ -492,6 +493,59 @@ fn sample_workloads_benchmark(c: &mut Criterion) {
     group.finish();
 }
 
+// ============================================================================
+// Benchmark Category: Shared Memory Operations
+// ============================================================================
+
+fn shared_memory_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("shared_memory");
+
+    let sizes: &[(usize, &str)] = &[(1024 * 1024, "1MB"), (64 * 1024 * 1024, "64MB")];
+
+    for &(size, name) in sizes {
+        group.throughput(Throughput::Bytes(size as u64));
+
+        // Benchmark fill
+        group.bench_with_input(BenchmarkId::new("fill", name), &size, |b, &size| {
+            let eshm = ExclusiveSharedMemory::new(size).unwrap();
+            let (mut hshm, _) = eshm.build();
+            b.iter(|| {
+                hshm.fill(0xAB, 0, size).unwrap();
+            });
+        });
+
+        // Benchmark copy_to_slice (read from shared memory)
+        group.bench_with_input(
+            BenchmarkId::new("copy_to_slice", name),
+            &size,
+            |b, &size| {
+                let eshm = ExclusiveSharedMemory::new(size).unwrap();
+                let (hshm, _) = eshm.build();
+                let mut dst = vec![0u8; size];
+                b.iter(|| {
+                    hshm.copy_to_slice(&mut dst, 0).unwrap();
+                });
+            },
+        );
+
+        // Benchmark copy_from_slice (write to shared memory)
+        group.bench_with_input(
+            BenchmarkId::new("copy_from_slice", name),
+            &size,
+            |b, &size| {
+                let eshm = ExclusiveSharedMemory::new(size).unwrap();
+                let (hshm, _) = eshm.build();
+                let src = vec![0xCDu8; size];
+                b.iter(|| {
+                    hshm.copy_from_slice(&src, 0).unwrap();
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
 criterion_group! {
     name = benches;
     config = Criterion::default();
@@ -501,6 +555,7 @@ criterion_group! {
     snapshots_benchmark,
     guest_call_benchmark_large_param,
     function_call_serialization_benchmark,
-    sample_workloads_benchmark
+    sample_workloads_benchmark,
+    shared_memory_benchmark
 }
 criterion_main!(benches);

From 7de43ecca28ace801c5df097682319bc22dcb740 Mon Sep 17 00:00:00 2001
From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
Date: Tue, 13 Jan 2026 11:21:42 -0800
Subject: [PATCH 2/2] Optimize shared memory operations

Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>

Change u64 to u128

Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
---
 src/hyperlight_host/benches/benchmarks.rs |   1 +
 src/hyperlight_host/src/mem/shared_mem.rs | 331 +++++++++++++++++++++-
 2 files changed, 323 insertions(+), 9 deletions(-)

diff --git a/src/hyperlight_host/benches/benchmarks.rs b/src/hyperlight_host/benches/benchmarks.rs
index 3b47781fb..7dc51d299 100644
--- a/src/hyperlight_host/benches/benchmarks.rs
+++ b/src/hyperlight_host/benches/benchmarks.rs
@@ -524,6 +524,7 @@ fn shared_memory_benchmark(c: &mut Criterion) {
                 let mut dst = vec![0u8; size];
                 b.iter(|| {
                     hshm.copy_to_slice(&mut dst, 0).unwrap();
+                    std::hint::black_box(&dst);
                 });
             },
         );
diff --git a/src/hyperlight_host/src/mem/shared_mem.rs b/src/hyperlight_host/src/mem/shared_mem.rs
index 824cfde04..4d04dbe30 100644
--- a/src/hyperlight_host/src/mem/shared_mem.rs
+++ b/src/hyperlight_host/src/mem/shared_mem.rs
@@ -17,6 +17,7 @@ limitations under the License.
 use std::any::type_name;
 use std::ffi::c_void;
 use std::io::Error;
+use std::mem::{align_of, size_of};
 #[cfg(target_os = "linux")]
 use std::ptr::null_mut;
 use std::sync::{Arc, RwLock};
@@ -783,12 +784,39 @@ impl HostSharedMemory {
             .lock
             .try_read()
             .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?;
-        // todo: replace with something a bit more optimized + correct
-        for (i, b) in slice.iter_mut().enumerate() {
+
+        const CHUNK: usize = size_of::<u128>();
+        let len = slice.len();
+        let mut i = 0;
+
+        // Handle unaligned head bytes until we reach u128 alignment.
+        // Note: align_offset can return usize::MAX if alignment is impossible.
+        // In that case, head_len = len via .min(), so we fall back to byte-by-byte
+        // operations for the entire slice.
+        let align_offset = base.align_offset(align_of::<u128>());
+        let head_len = align_offset.min(len);
+        while i < head_len {
+            unsafe {
+                slice[i] = base.add(i).read_volatile();
+            }
+            i += 1;
+        }
+
+        // Read aligned u128 chunks
+        while i + CHUNK <= len {
+            let value = unsafe { (base.add(i) as *const u128).read_volatile() };
+            slice[i..i + CHUNK].copy_from_slice(&value.to_ne_bytes());
+            i += CHUNK;
+        }
+
+        // Handle remaining tail bytes
+        while i < len {
             unsafe {
-                *b = base.wrapping_add(i).read_volatile();
+                slice[i] = base.add(i).read_volatile();
             }
+            i += 1;
         }
+
         drop(guard);
         Ok(())
     }
@@ -802,12 +830,51 @@ impl HostSharedMemory {
             .lock
             .try_read()
             .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?;
-        // todo: replace with something a bit more optimized + correct
-        for (i, b) in slice.iter().enumerate() {
+
+        const CHUNK: usize = size_of::<u128>();
+        let len = slice.len();
+        let mut i = 0;
+
+        // Handle unaligned head bytes until we reach u128 alignment.
+        // Note: align_offset can return usize::MAX if alignment is impossible.
+        // In that case, head_len = len via .min(), so we fall back to byte-by-byte
+        // operations for the entire slice.
+        let align_offset = base.align_offset(align_of::<u128>());
+        let head_len = align_offset.min(len);
+        while i < head_len {
+            unsafe {
+                base.add(i).write_volatile(slice[i]);
+            }
+            i += 1;
+        }
+
+        // Write aligned u128 chunks
+        while i + CHUNK <= len {
+            let chunk: [u8; CHUNK] = slice[i..i + CHUNK].try_into().map_err(|_| {
+                new_error!(
+                    "Failed to convert slice to fixed-size array for u128 chunk: \
+                     expected length {}, got {} (total slice len {}, offset {})",
+                    CHUNK,
+                    slice[i..i + CHUNK].len(),
+                    len,
+                    i,
+                )
+            })?;
+            let value = u128::from_ne_bytes(chunk);
+            unsafe {
+                (base.add(i) as *mut u128).write_volatile(value);
+            }
+            i += CHUNK;
+        }
+
+        // Handle remaining tail bytes
+        while i < len {
             unsafe {
-                base.wrapping_add(i).write_volatile(*b);
+                base.add(i).write_volatile(slice[i]);
             }
+            i += 1;
         }
+
         drop(guard);
         Ok(())
     }
@@ -821,10 +888,40 @@ impl HostSharedMemory {
             .lock
             .try_read()
             .map_err(|e| new_error!("Error locking at {}:{}: {}", file!(), line!(), e))?;
-        // todo: replace with something a bit more optimized + correct
-        for i in 0..len {
-            unsafe { base.wrapping_add(i).write_volatile(value) };
+
+        const CHUNK: usize = size_of::<u128>();
+        let value_u128 = u128::from_ne_bytes([value; CHUNK]);
+        let mut i = 0;
+
+        // Handle unaligned head bytes until we reach u128 alignment.
+        // Note: align_offset can return usize::MAX if alignment is impossible.
+        // In that case, head_len = len via .min(), so we fall back to byte-by-byte
+        // operations for the entire slice.
+        let align_offset = base.align_offset(align_of::<u128>());
+        let head_len = align_offset.min(len);
+        while i < head_len {
+            unsafe {
+                base.add(i).write_volatile(value);
+            }
+            i += 1;
+        }
+
+        // Write aligned u128 chunks
+        while i + CHUNK <= len {
+            unsafe {
+                (base.add(i) as *mut u128).write_volatile(value_u128);
+            }
+            i += CHUNK;
+        }
+
+        // Handle remaining tail bytes
+        while i < len {
+            unsafe {
+                base.add(i).write_volatile(value);
+            }
+            i += 1;
         }
+
         drop(guard);
         Ok(())
     }
@@ -1137,6 +1234,222 @@ mod tests {
         assert_eq!(data, ret_vec);
     }
 
+    /// Tests for the optimized aligned memory operations.
+    /// These tests verify that the u128 chunk optimization works correctly
+    /// for various alignment scenarios and buffer sizes.
+    mod alignment_tests {
+        use super::*;
+
+        const CHUNK_SIZE: usize = 16; // size_of::<u128>()
+
+        /// Test copy operations with all possible starting alignment offsets (0-15)
+        #[test]
+        fn copy_with_various_alignments() {
+            // Use a buffer large enough to test all alignment cases
+            let mem_size: usize = 4096;
+            let eshm = ExclusiveSharedMemory::new(mem_size).unwrap();
+            let (hshm, _) = eshm.build();
+
+            // Test all 16 possible alignment offsets (0 through 15)
+            for start_offset in 0..CHUNK_SIZE {
+                let test_len = 64; // Enough to cover head, aligned chunks, and tail
+                let test_data: Vec<u8> = (0..test_len).map(|i| (i + start_offset) as u8).collect();
+
+                // Write data at the given offset
+                hshm.copy_from_slice(&test_data, start_offset).unwrap();
+
+                // Read it back
+                let mut read_buf = vec![0u8; test_len];
+                hshm.copy_to_slice(&mut read_buf, start_offset).unwrap();
+
+                assert_eq!(
+                    test_data, read_buf,
+                    "Mismatch at alignment offset {}",
+                    start_offset
+                );
+            }
+        }
+
+        /// Test copy operations with lengths smaller than chunk size (< 16 bytes)
+        #[test]
+        fn copy_small_lengths() {
+            let mem_size: usize = 4096;
+            let eshm = ExclusiveSharedMemory::new(mem_size).unwrap();
+            let (hshm, _) = eshm.build();
+
+            for len in 0..CHUNK_SIZE {
+                let test_data: Vec<u8> = (0..len).map(|i| i as u8).collect();
+
+                hshm.copy_from_slice(&test_data, 0).unwrap();
+
+                let mut read_buf = vec![0u8; len];
+                hshm.copy_to_slice(&mut read_buf, 0).unwrap();
+
+                assert_eq!(test_data, read_buf, "Mismatch for length {}", len);
+            }
+        }
+
+        /// Test copy operations with lengths that don't align to chunk boundaries
+        #[test]
+        fn copy_non_aligned_lengths() {
+            let mem_size: usize = 4096;
+            let eshm = ExclusiveSharedMemory::new(mem_size).unwrap();
+            let (hshm, _) = eshm.build();
+
+            // Test lengths like 17, 31, 33, 47, 63, 65, etc.
+            let test_lengths = [17, 31, 33, 47, 63, 65, 100, 127, 129, 255, 257];
+
+            for &len in &test_lengths {
+                let test_data: Vec<u8> = (0..len).map(|i| (i % 256) as u8).collect();
+
+                hshm.copy_from_slice(&test_data, 0).unwrap();
+
+                let mut read_buf = vec![0u8; len];
+                hshm.copy_to_slice(&mut read_buf, 0).unwrap();
+
+                assert_eq!(test_data, read_buf, "Mismatch for length {}", len);
+            }
+        }
+
+        /// Test copy with exactly one chunk (16 bytes)
+        #[test]
+        fn copy_exact_chunk_size() {
+            let mem_size: usize = 4096;
+            let eshm = ExclusiveSharedMemory::new(mem_size).unwrap();
+            let (hshm, _) = eshm.build();
+
+            let test_data: Vec<u8> = (0..CHUNK_SIZE).map(|i| i as u8).collect();
+
+            hshm.copy_from_slice(&test_data, 0).unwrap();
+
+            let mut read_buf = vec![0u8; CHUNK_SIZE];
+            hshm.copy_to_slice(&mut read_buf, 0).unwrap();
+
+            assert_eq!(test_data, read_buf);
+        }
+
+        /// Test fill with various alignment offsets
+        #[test]
+        fn fill_with_various_alignments() {
+            let mem_size: usize = 4096;
+            let eshm = ExclusiveSharedMemory::new(mem_size).unwrap();
+            let (mut hshm, _) = eshm.build();
+
+            for start_offset in 0..CHUNK_SIZE {
+                let fill_len = 64;
+                let fill_value = (start_offset % 256) as u8;
+
+                // Clear memory first
+                hshm.fill(0, 0, mem_size).unwrap();
+
+                // Fill at the given offset
+                hshm.fill(fill_value, start_offset, fill_len).unwrap();
+
+                // Read it back and verify
+                let mut read_buf = vec![0u8; fill_len];
+                hshm.copy_to_slice(&mut read_buf, start_offset).unwrap();
+
+                assert!(
+                    read_buf.iter().all(|&b| b == fill_value),
+                    "Fill mismatch at alignment offset {}",
+                    start_offset
+                );
+            }
+        }
+
+        /// Test fill with lengths smaller than chunk size
+        #[test]
+        fn fill_small_lengths() {
+            let mem_size: usize = 4096;
+            let eshm = ExclusiveSharedMemory::new(mem_size).unwrap();
+            let (mut hshm, _) = eshm.build();
+
+            for len in 0..CHUNK_SIZE {
+                let fill_value = 0xAB;
+
+                hshm.fill(0, 0, mem_size).unwrap(); // Clear
+                hshm.fill(fill_value, 0, len).unwrap();
+
+                let mut read_buf = vec![0u8; len];
+                hshm.copy_to_slice(&mut read_buf, 0).unwrap();
+
+                assert!(
+                    read_buf.iter().all(|&b| b == fill_value),
+                    "Fill mismatch for length {}",
+                    len
+                );
+            }
+        }
+
+        /// Test fill with non-aligned lengths
+        #[test]
+        fn fill_non_aligned_lengths() {
+            let mem_size: usize = 4096;
+            let eshm = ExclusiveSharedMemory::new(mem_size).unwrap();
+            let (mut hshm, _) = eshm.build();
+
+            let test_lengths = [17, 31, 33, 47, 63, 65, 100, 127, 129, 255, 257];
+
+            for &len in &test_lengths {
+                let fill_value = 0xCD;
+
+                hshm.fill(0, 0, mem_size).unwrap(); // Clear
+                hshm.fill(fill_value, 0, len).unwrap();
+
+                let mut read_buf = vec![0u8; len];
+                hshm.copy_to_slice(&mut read_buf, 0).unwrap();
+
+                assert!(
+                    read_buf.iter().all(|&b| b == fill_value),
+                    "Fill mismatch for length {}",
+                    len
+                );
+            }
+        }
+
+        /// Test edge cases: length 0 and length 1
+        #[test]
+        fn copy_edge_cases() {
+            let mem_size: usize = 4096;
+            let eshm = ExclusiveSharedMemory::new(mem_size).unwrap();
+            let (hshm, _) = eshm.build();
+
+            // Length 0
+            let empty: Vec<u8> = vec![];
+            hshm.copy_from_slice(&empty, 0).unwrap();
+            let mut read_buf: Vec<u8> = vec![];
+            hshm.copy_to_slice(&mut read_buf, 0).unwrap();
+            assert!(read_buf.is_empty());
+
+            // Length 1
+            let single = vec![0x42u8];
+            hshm.copy_from_slice(&single, 0).unwrap();
+            let mut read_buf = vec![0u8; 1];
+            hshm.copy_to_slice(&mut read_buf, 0).unwrap();
+            assert_eq!(single, read_buf);
+        }
+
+        /// Test combined: unaligned start + non-aligned length
+        #[test]
+        fn copy_unaligned_start_and_length() {
+            let mem_size: usize = 4096;
+            let eshm = ExclusiveSharedMemory::new(mem_size).unwrap();
+            let (hshm, _) = eshm.build();
+
+            // Start at offset 7 (unaligned), length 37 (not a multiple of 16)
+            let start_offset = 7;
+            let len = 37;
+            let test_data: Vec<u8> = (0..len).map(|i| (i * 3) as u8).collect();
+
+            hshm.copy_from_slice(&test_data, start_offset).unwrap();
+
+            let mut read_buf = vec![0u8; len];
+            hshm.copy_to_slice(&mut read_buf, start_offset).unwrap();
+
+            assert_eq!(test_data, read_buf);
+        }
+    }
+
     /// A test to ensure that, if a `SharedMem` instance is cloned
     /// and _all_ clones are dropped, the memory region will no longer
     /// be valid.
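
As an aside, the head/body/tail split that the patched `copy_to_slice`, `copy_from_slice`, and `fill` all share can be illustrated in isolation. Below is a minimal standalone sketch of the same chunking logic over a plain byte buffer; `fill_bytes` is a hypothetical helper rather than a Hyperlight API, and it uses ordinary (non-volatile) writes since no shared mapping or region lock is involved.

fn fill_bytes(buf: &mut [u8], value: u8) {
    const CHUNK: usize = std::mem::size_of::<u128>();
    let len = buf.len();
    let base = buf.as_mut_ptr();
    let value_u128 = u128::from_ne_bytes([value; CHUNK]);
    let mut i = 0;

    // Head: one byte at a time until the pointer is u128-aligned.
    // align_offset may return usize::MAX; .min(len) keeps the index in
    // bounds and degrades to a pure byte loop in that case.
    let head_len = base.align_offset(std::mem::align_of::<u128>()).min(len);
    while i < head_len {
        unsafe { base.add(i).write(value) };
        i += 1;
    }

    // Body: aligned 16-byte stores.
    while i + CHUNK <= len {
        unsafe { (base.add(i) as *mut u128).write(value_u128) };
        i += CHUNK;
    }

    // Tail: remaining bytes after the last full chunk.
    while i < len {
        unsafe { base.add(i).write(value) };
        i += 1;
    }
}

fn main() {
    // Unaligned start (offset 3) and a length (37) that is not a multiple
    // of 16, exercising head, body, and tail in a single call.
    let mut buf = vec![0u8; 64];
    fill_bytes(&mut buf[3..40], 0xAB);
    assert!(buf[3..40].iter().all(|&b| b == 0xAB));
    assert!(buf[..3].iter().all(|&b| b == 0) && buf[40..].iter().all(|&b| b == 0));
}

The patched methods follow this same control flow, but route every access through `read_volatile`/`write_volatile` on the mapped region while holding the region's read lock.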