diff --git a/cmd/metalprobe/main.go b/cmd/metalprobe/main.go index 18db52795..780274d6c 100644 --- a/cmd/metalprobe/main.go +++ b/cmd/metalprobe/main.go @@ -24,6 +24,8 @@ func main() { var registryClientTimeout time.Duration var LLDPSyncInterval time.Duration var LLDPSyncDuration time.Duration + var cleanDisks bool + var diskCleaningMode string flag.StringVar(®istryURL, "registry-url", "", "Registry URL where the probe will register itself.") flag.StringVar(&serverUUID, "server-uuid", "", "Agent UUID to register with the registry.") @@ -34,6 +36,9 @@ func main() { "Duration of time to wait between lldpctl runs.") flag.DurationVar(&LLDPSyncDuration, "lldp-sync-duration", 30*time.Second, "Timeout for the lldpctl run.") + flag.BoolVar(&cleanDisks, "clean-disks", false, "Enable disk cleaning during discovery.") + flag.StringVar(&diskCleaningMode, "disk-cleaning-mode", "quick", + "Disk cleaning mode: 'quick' or 'secure'.") opts := zap.Options{ Development: true, @@ -53,11 +58,18 @@ func main() { os.Exit(1) } + if diskCleaningMode != "quick" && diskCleaningMode != "secure" { + setupLog.Error(nil, "Invalid disk cleaning mode", + "mode", diskCleaningMode, "valid", []string{"quick", "secure"}) + os.Exit(1) + } + ctx := ctrl.SetupSignalHandler() setupLog.Info("starting registry agent") + agent := probe.NewAgent(setupLog, serverUUID, registryURL, duration, registryClientTimeout, - LLDPSyncInterval, LLDPSyncDuration) + LLDPSyncInterval, LLDPSyncDuration, cleanDisks, diskCleaningMode) if err := agent.Start(ctx); err != nil { setupLog.Error(err, "problem running probe agent") os.Exit(1) diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 18ef345f5..11f12d4ac 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -64,6 +64,8 @@ spec: - /manager args: - --leader-elect + - --enable-disk-cleaning=false + - --disk-cleaning-mode=quick image: controller:latest name: manager ports: diff --git a/internal/controller/conditions.go b/internal/controller/conditions.go index b45781e80..03a62f240 100644 --- a/internal/controller/conditions.go +++ b/internal/controller/conditions.go @@ -31,6 +31,8 @@ const ( ConditionReady = "Ready" // ConditionRetryOfFailedResourceIssued indicates a retry of a failed resource has been issued. ConditionRetryOfFailedResourceIssued = "RetryOfFailedResourceIssued" + // ConditionDiskCleaningCompleted indicates that disk cleaning was completed + ConditionDiskCleaningCompleted = "DiskCleaningCompleted" ) // Shared reason strings used across multiple controllers. diff --git a/internal/controller/server_controller.go b/internal/controller/server_controller.go index 65469bdef..9b04fb6a2 100644 --- a/internal/controller/server_controller.go +++ b/internal/controller/server_controller.go @@ -714,6 +714,7 @@ func (r *ServerReconciler) applyDefaultIgnitionForServer(ctx context.Context, se log.V(1).Info("Applied SSH keypair secret", "SSHKeyPair", sshKeyPairNamespacedName) probeFlags := fmt.Sprintf("--registry-url=%s --server-uuid=%s", registryURL, server.Spec.SystemUUID) + ignitionData, err := r.generateDefaultIgnitionDataForServer(probeFlags, sshPublicKey, password) if err != nil { return fmt.Errorf("failed to generate default ignitionSecret data: %w", err) diff --git a/internal/controller/server_controller_test.go b/internal/controller/server_controller_test.go index 778662bcd..966536eaf 100644 --- a/internal/controller/server_controller_test.go +++ b/internal/controller/server_controller_test.go @@ -248,7 +248,7 @@ var _ = Describe("Server Controller", func() { )) By("Starting the probe agent") - probeAgent := probe.NewAgent(GinkgoLogr, server.Spec.SystemUUID, registryURL, 100*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond) + probeAgent := probe.NewAgent(GinkgoLogr, server.Spec.SystemUUID, registryURL, 100*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond, false, "quick") go func() { defer GinkgoRecover() Expect(probeAgent.Start(ctx)).To(Succeed(), "failed to start probe agent") @@ -444,7 +444,7 @@ var _ = Describe("Server Controller", func() { )) By("Starting the probe agent") - probeAgent := probe.NewAgent(GinkgoLogr, server.Spec.SystemUUID, registryURL, 50*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond) + probeAgent := probe.NewAgent(GinkgoLogr, server.Spec.SystemUUID, registryURL, 50*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond, false, "quick") go func() { defer GinkgoRecover() Expect(probeAgent.Start(ctx)).To(Succeed(), "failed to start probe agent") diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go index 981c47642..45cfb0e63 100644 --- a/internal/controller/suite_test.go +++ b/internal/controller/suite_test.go @@ -216,7 +216,7 @@ func SetupTest(redfishMockServers []netip.AddrPort) *corev1.Namespace { PowerPollingTimeout: 200 * time.Millisecond, BasicAuth: true, }, - DiscoveryTimeout: 30 * time.Second, // Set a short discovery timeout for testing + DiscoveryTimeout: 30 * time.Second, DiscoveryIgnitionPath: filepath.Join("..", "..", "config", "manager", "ignition-template.yaml"), }).SetupWithManager(k8sManager)).To(Succeed()) diff --git a/internal/probe/diskcleaning_darwin.go b/internal/probe/diskcleaning_darwin.go new file mode 100644 index 000000000..76be8a441 --- /dev/null +++ b/internal/probe/diskcleaning_darwin.go @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +//go:build darwin + +package probe + +import ( + "context" + "fmt" +) + +func cleanDisks(_ context.Context, _ string) error { + return fmt.Errorf("disk cleaning is only supported on Linux") +} diff --git a/internal/probe/diskcleaning_linux.go b/internal/probe/diskcleaning_linux.go new file mode 100644 index 000000000..01b750a79 --- /dev/null +++ b/internal/probe/diskcleaning_linux.go @@ -0,0 +1,424 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +//go:build linux + +package probe + +import ( + "context" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "regexp" + "strconv" + "strings" + "sync" + "time" + + "github.com/go-logr/logr" + "github.com/jaypipes/ghw" +) + +const diskCleaningMarkerFile = "/var/run/metal-operator/disk-cleaning-complete" + +// DiskCleaningResult represents the result of cleaning a single disk. +type DiskCleaningResult struct { + DeviceName string + Success bool + Error error + Duration time.Duration +} + +// devicePathRegex validates that device paths match expected format. +// Updated to support multipath/RAID devices (e.g., /dev/mapper/mpatha, /dev/cciss/c0d0) +var devicePathRegex = regexp.MustCompile(`^/dev/[a-zA-Z0-9\-_/]+$`) + +// wasDiskCleaningCompleted checks if disk cleaning was already completed. +func wasDiskCleaningCompleted() bool { + _, err := os.Stat(diskCleaningMarkerFile) + return err == nil +} + +// markDiskCleaningCompleted creates a marker file to indicate disk cleaning was completed. +func markDiskCleaningCompleted() error { + if err := os.MkdirAll(filepath.Dir(diskCleaningMarkerFile), 0755); err != nil { + return fmt.Errorf("failed to create marker directory: %w", err) + } + content := fmt.Sprintf("Disk cleaning completed at %s\n", time.Now().Format(time.RFC3339)) + if err := os.WriteFile(diskCleaningMarkerFile, []byte(content), 0644); err != nil { + return fmt.Errorf("failed to write marker file: %w", err) + } + return nil +} + +// validateDevicePath ensures a device path is safe to use in shell commands. +func validateDevicePath(devicePath string) error { + if !devicePathRegex.MatchString(devicePath) { + return fmt.Errorf("invalid device path format: %s", devicePath) + } + + // Additional safety: ensure it's a block device + fi, err := os.Stat(devicePath) + if err != nil { + return fmt.Errorf("device path does not exist: %w", err) + } + + if fi.Mode()&os.ModeDevice == 0 { + return fmt.Errorf("path is not a device: %s", devicePath) + } + if fi.Mode()&os.ModeCharDevice != 0 { + return fmt.Errorf("path is a character device, not a block device: %s", devicePath) + } + + return nil +} + +// cleanDisks performs disk cleaning based on the specified mode concurrently. +func cleanDisks(ctx context.Context, mode string) error { + log := logr.FromContextOrDiscard(ctx) + + // Check if disk cleaning was already completed + if wasDiskCleaningCompleted() { + log.Info("Disk cleaning already completed, skipping") + return nil + } + + // Validate mode upfront before launching goroutines + if mode != "quick" && mode != "secure" { + return fmt.Errorf("unsupported cleaning mode: %s (must be 'quick' or 'secure')", mode) + } + + log.Info("Starting concurrent disk cleaning", "mode", mode) + + // Get all block devices + blockStorage, err := ghw.Block() + if err != nil { + return fmt.Errorf("failed to enumerate block devices: %w", err) + } + + if len(blockStorage.Disks) == 0 { + log.Info("No disks found to clean") + return nil + } + + var mu sync.Mutex + results := make([]DiskCleaningResult, 0, len(blockStorage.Disks)) + var wg sync.WaitGroup + skippedCount := 0 + + // Limit concurrent disk wipes to avoid overwhelming the system + const maxConcurrentWipes = 4 + semaphore := make(chan struct{}, maxConcurrentWipes) + + for _, disk := range blockStorage.Disks { + ro, err := isReadOnly(disk.Name) + if err != nil { + log.Error(err, "Failed to check read-only status, skipping disk", "disk", disk.Name) + skippedCount++ + continue + } + if ro { + log.Info("Skipping read-only disk", "disk", disk.Name) + skippedCount++ + continue + } + + if disk.IsRemovable { + log.Info("Skipping removable disk", "disk", disk.Name) + skippedCount++ + continue + } + + devicePath := "/dev/" + disk.Name + if _, err := os.Stat(devicePath); err != nil { + log.Error(err, "Device path does not exist, skipping", "disk", disk.Name, "path", devicePath) + skippedCount++ + continue + } + + wg.Add(1) + // Launch each disk wipe in its own goroutine + go func(d *ghw.Disk, path string) { + defer wg.Done() + + // Acquire semaphore to limit concurrency + select { + case semaphore <- struct{}{}: + defer func() { <-semaphore }() + case <-ctx.Done(): + log.Info("Disk cleaning cancelled before starting", "disk", d.Name) + mu.Lock() + results = append(results, DiskCleaningResult{ + DeviceName: d.Name, + Success: false, + Error: ctx.Err(), + Duration: 0, + }) + mu.Unlock() + return + } + + log.Info("Cleaning disk", "disk", d.Name, "model", d.Model, "vendor", d.Vendor, + "size", d.SizeBytes, "removable", d.IsRemovable) + + start := time.Now() + var cleanErr error + + switch mode { + case "quick": + cleanErr = quickCleanDisk(ctx, d.Name, path) + case "secure": + cleanErr = secureCleanDisk(ctx, d.Name, path) + default: + cleanErr = fmt.Errorf("unsupported cleaning mode: %s", mode) + } + + duration := time.Since(start) + result := DiskCleaningResult{ + DeviceName: d.Name, + Success: cleanErr == nil, + Error: cleanErr, + Duration: duration, + } + + if cleanErr != nil { + log.Error(cleanErr, "Failed to clean disk", "disk", d.Name, "duration", duration) + } else { + log.Info("Successfully cleaned disk", "disk", d.Name, "duration", duration) + } + + mu.Lock() + results = append(results, result) + mu.Unlock() + + }(disk, devicePath) + } + + // Wait for all disk wipes to complete + wg.Wait() + + successCount := 0 + for _, r := range results { + if r.Success { + successCount++ + } + } + + log.Info("Disk cleaning completed", + "total", len(blockStorage.Disks), + "processed", len(results), + "success", successCount, + "failed", len(results)-successCount, + "skipped", skippedCount) + + if successCount < len(results) { + return fmt.Errorf("failed to clean %d out of %d disks", len(results)-successCount, len(results)) + } + + if successCount > 0 { + if err := markDiskCleaningCompleted(); err != nil { + log.Error(err, "Failed to mark disk cleaning as completed (will re-run on restart)") + } + } else { + log.Info("No disks were cleaned (all skipped or none found), not marking as completed") + } + + return nil +} + +func quickCleanDisk(ctx context.Context, diskName, devicePath string) error { + log := logr.FromContextOrDiscard(ctx) + + // Add timeout for quick clean operations (10 minutes should be enough) + ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + + if err := validateDevicePath(devicePath); err != nil { + return err + } + + log.V(1).Info("Wiping disk header", "disk", diskName) + if err := wipeDiskRangeNative(ctx, devicePath, 0, 10*1024*1024); err != nil { + return fmt.Errorf("failed to wipe disk header: %w", err) + } + + sizeBytes, err := getDiskSize(ctx, devicePath) + if err != nil { + return fmt.Errorf("failed to get disk size: %w", err) + } + + if sizeBytes > 10*1024*1024 { + log.V(1).Info("Wiping disk footer", "disk", diskName) + offset := sizeBytes - (10 * 1024 * 1024) + if err := wipeDiskRangeNative(ctx, devicePath, offset, 10*1024*1024); err != nil { + return fmt.Errorf("failed to wipe disk footer: %w", err) + } + } + + if err := rereadPartitionTable(ctx, devicePath); err != nil { + log.V(1).Info("Warning: failed to re-read partition table", "error", err) + } + + return nil +} + +func secureCleanDisk(ctx context.Context, diskName, devicePath string) error { + log := logr.FromContextOrDiscard(ctx) + + ctx, cancel := context.WithTimeout(ctx, 24*time.Hour) + defer cancel() + + if err := validateDevicePath(devicePath); err != nil { + return err + } + + if !isRotational(diskName) { + log.V(1).Info("Detected non-rotational flash storage. Using blkdiscard.", "disk", diskName) + if err := executeBlkDiscard(ctx, devicePath, true); err == nil { + return rereadPartitionTable(ctx, devicePath) + } else { + log.Error(err, "blkdiscard failed, falling back to shred/dd", "disk", diskName) + } + } + + if _, err := exec.LookPath("shred"); err == nil { + log.V(1).Info("Using shred for secure wipe", "disk", diskName) + cmd := exec.CommandContext(ctx, "shred", "-n", "1", "-z", "--force", devicePath) + output, err := cmd.CombinedOutput() + if err != nil { + log.Error(err, "shred failed, falling back to dd", "disk", diskName, "output", string(output)) + } else { + if err := rereadPartitionTable(ctx, devicePath); err != nil { + log.V(1).Info("Warning: failed to re-read partition table", "error", err) + } + return nil + } + } + + log.V(1).Info("Using dd for secure wipe", "disk", diskName) + cmd := exec.CommandContext(ctx, "dd", + "if=/dev/urandom", + "of="+devicePath, + "bs=1M") + output, err := cmd.CombinedOutput() + if err != nil { + outputStr := string(output) + if !strings.Contains(outputStr, "No space left on device") { + return fmt.Errorf("dd failed: %w, output: %s", err, outputStr) + } + log.V(1).Info("dd completed (expected 'No space left' at end)", "disk", diskName) + } + + if err := rereadPartitionTable(ctx, devicePath); err != nil { + log.V(1).Info("Warning: failed to re-read partition table", "error", err) + } + + return nil +} + +// wipeDiskRangeNative uses standard Go I/O to avoid integer truncation bugs with `dd seek`. +func wipeDiskRangeNative(ctx context.Context, devicePath string, offset, size int64) error { + f, err := os.OpenFile(devicePath, os.O_WRONLY|os.O_SYNC, 0) + if err != nil { + return err + } + defer func() { + if closeErr := f.Close(); closeErr != nil { + // Log but don't fail - data is already written + } + }() + + if _, err := f.Seek(offset, io.SeekStart); err != nil { + return fmt.Errorf("failed to seek to offset %d: %w", offset, err) + } + + chunkSize := int64(1024 * 1024) // 1MB + zeros := make([]byte, chunkSize) + remaining := size + + for remaining > 0 { + // Respect context cancellation during long writes + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + writeSize := min(chunkSize, remaining) + if _, err := f.Write(zeros[:writeSize]); err != nil { + return fmt.Errorf("failed to write zeros: %w", err) + } + remaining -= writeSize + } + + return nil +} + +// isRotational checks sysfs to determine if the drive is a spinning HDD (true) or SSD/NVMe (false). +func isRotational(diskName string) bool { + // Need just the base name for sysfs, handles paths like /dev/mapper/mpatha properly + baseName := filepath.Base(diskName) + path := fmt.Sprintf("/sys/block/%s/queue/rotational", baseName) + + data, err := os.ReadFile(path) + if err != nil { + // If we can't tell, assume true (rotational) to be safe and use shred + return true + } + + return strings.TrimSpace(string(data)) == "1" +} + +// executeBlkDiscard issues TRIM/UNMAP commands to securely erase flash cells. +func executeBlkDiscard(ctx context.Context, devicePath string, secure bool) error { + args := []string{} + if secure { + args = append(args, "--secure") + } + args = append(args, devicePath) + + cmd := exec.CommandContext(ctx, "blkdiscard", args...) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("blkdiscard failed: %w, output: %s", err, string(output)) + } + return nil +} + +// isReadOnly checks if a disk is read-only (hardware write-protected). +func isReadOnly(diskName string) (bool, error) { + baseName := filepath.Base(diskName) + roPath := fmt.Sprintf("/sys/class/block/%s/ro", baseName) + data, err := os.ReadFile(roPath) + if err != nil { + return false, fmt.Errorf("failed to read ro sysfs attribute: %w", err) + } + + return strings.TrimSpace(string(data)) == "1", nil +} + +func getDiskSize(ctx context.Context, devicePath string) (int64, error) { + cmd := exec.CommandContext(ctx, "blockdev", "--getsize64", devicePath) + output, err := cmd.Output() + if err != nil { + return 0, fmt.Errorf("failed to get disk size: %w", err) + } + + sizeStr := strings.TrimSpace(string(output)) + size, err := strconv.ParseInt(sizeStr, 10, 64) + if err != nil { + return 0, fmt.Errorf("failed to parse disk size: %w", err) + } + + return size, nil +} + +func rereadPartitionTable(ctx context.Context, devicePath string) error { + cmd := exec.CommandContext(ctx, "blockdev", "--rereadpt", devicePath) + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to re-read partition table: %w", err) + } + return nil +} diff --git a/internal/probe/probe.go b/internal/probe/probe.go index 4217756d2..ef92d091d 100644 --- a/internal/probe/probe.go +++ b/internal/probe/probe.go @@ -25,10 +25,13 @@ type Agent struct { log logr.Logger LLDPSyncInterval time.Duration LLDPSyncDuration time.Duration + CleanDisks bool + DiskCleaningMode string + diskCleaningComplete bool } // NewAgent creates a new Agent with the specified system UUID and registry URL. -func NewAgent(log logr.Logger, systemUUID, registryURL string, duration, registryClientTimeout, LLDPSyncInterval, LLDPSyncDuration time.Duration) *Agent { +func NewAgent(log logr.Logger, systemUUID, registryURL string, duration, registryClientTimeout, LLDPSyncInterval, LLDPSyncDuration time.Duration, cleanDisks bool, diskCleaningMode string) *Agent { return &Agent{ log: log, SystemUUID: systemUUID, @@ -37,6 +40,9 @@ func NewAgent(log logr.Logger, systemUUID, registryURL string, duration, registr RegistryClientTimeout: registryClientTimeout, LLDPSyncInterval: LLDPSyncInterval, LLDPSyncDuration: LLDPSyncDuration, + CleanDisks: cleanDisks, + DiskCleaningMode: diskCleaningMode, + diskCleaningComplete: false, } } @@ -112,6 +118,32 @@ func (a *Agent) Init(ctx context.Context) error { return nil } +// PerformDiskCleaning executes disk cleaning if enabled and not already completed. +// Returns nil if cleaning is disabled or already complete. +// Returns error only if cleaning was attempted and failed. +func (a *Agent) PerformDiskCleaning(ctx context.Context) error { + if !a.CleanDisks { + return nil + } + + if a.diskCleaningComplete { + a.log.Info("Disk cleaning already completed, skipping") + return nil + } + + a.log.Info("Starting disk cleaning", "mode", a.DiskCleaningMode) + + // Add logger to context for cleanDisks + ctx = logr.NewContext(ctx, a.log) + if err := cleanDisks(ctx, a.DiskCleaningMode); err != nil { + return err + } + + a.log.Info("Disk cleaning completed successfully") + a.diskCleaningComplete = true + return nil +} + // Start begins the periodic registration process. func (a *Agent) Start(ctx context.Context) error { ticker := time.NewTicker(30 * time.Second) @@ -134,6 +166,11 @@ func (a *Agent) Start(ctx context.Context) error { } a.log.Info("Server registered", "uuid", a.SystemUUID) + // Perform disk cleaning after registration if enabled + if err := a.PerformDiskCleaning(ctx); err != nil { + a.log.Error(err, "Disk cleaning failed, continuing with periodic sync") + } + for { select { case <-ctx.Done(): diff --git a/internal/probe/probe_suite_test.go b/internal/probe/probe_suite_test.go index 69e7781cc..1994dfade 100644 --- a/internal/probe/probe_suite_test.go +++ b/internal/probe/probe_suite_test.go @@ -46,7 +46,7 @@ var _ = BeforeSuite(func() { }).Should(Succeed()) // Initialize your probe server - probeAgent = probe.NewAgent(GinkgoLogr, systemUUID, registryURL, 100*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond) + probeAgent = probe.NewAgent(GinkgoLogr, systemUUID, registryURL, 100*time.Millisecond, 1*time.Second, 50*time.Millisecond, 250*time.Millisecond, false, "quick") go func() { defer GinkgoRecover() Expect(probeAgent.Start(ctx)).To(Succeed(), "failed to start probe agent")