diff --git a/.gitignore b/.gitignore index 7d85b60..45a148e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,19 @@ __pycache__ *.vscode -cfg.yaml +cfg.yam *.pth *.pkl *.pyc *.so *.egg-info +g.txt # Ignore weight files but keep the folder structure weights/* -!weights/.gitkeep output/ -.claude/ \ No newline at end of file +.claude/ +*.ply +data/mono/* +data/pattern_cube/* +reports/* + +*.bak diff --git a/Utils.py b/Utils.py index 1f12387..d3aaaf8 100755 --- a/Utils.py +++ b/Utils.py @@ -1,4 +1,4 @@ -import os, sys, torch, imageio, logging, importlib, argparse +import os, sys, torch, logging, importlib #, argparse, imageio import cv2 import numpy as np import yaml diff --git a/core/foundation_stereo.py b/core/foundation_stereo.py index b1377d4..27f6c1a 100755 --- a/core/foundation_stereo.py +++ b/core/foundation_stereo.py @@ -1,4 +1,4 @@ -import torch,pdb,logging,timm +import torch #,pdb,logging,timm import torch.nn as nn import torch.nn.functional as F import numpy as np diff --git a/core/submodule.py b/core/submodule.py index 6764d64..ce32622 100755 --- a/core/submodule.py +++ b/core/submodule.py @@ -619,7 +619,8 @@ def __init__(self, in_planes, ratio=16): def forward(self, x): avg_out = self.fc(self.avg_pool(x)) - max_out = self.fc(self.max_pool(x)) + #max_out = self.fc(torch.amax(x, dim=(-2, -1), keepdim=True)) + max_out = self.fc(self.max_pool(x)) # niv out = avg_out + max_out return self.sigmoid(out) diff --git a/data/faro/index_0060/img_depth_faro.png b/data/faro/index_0060/img_depth_faro.png new file mode 100644 index 0000000..db7678a Binary files /dev/null and b/data/faro/index_0060/img_depth_faro.png differ diff --git a/data/faro/index_0060/img_depth_rs.png b/data/faro/index_0060/img_depth_rs.png new file mode 100644 index 0000000..1673727 Binary files /dev/null and b/data/faro/index_0060/img_depth_rs.png differ diff --git a/data/faro/index_0060/img_left.png b/data/faro/index_0060/img_left.png new file mode 100644 index 0000000..3a9780d Binary files 
/dev/null and b/data/faro/index_0060/img_left.png differ diff --git a/data/faro/index_0060/img_rgb.png b/data/faro/index_0060/img_rgb.png new file mode 100644 index 0000000..03ff669 Binary files /dev/null and b/data/faro/index_0060/img_rgb.png differ diff --git a/data/faro/index_0060/img_right.png b/data/faro/index_0060/img_right.png new file mode 100644 index 0000000..6ad0743 Binary files /dev/null and b/data/faro/index_0060/img_right.png differ diff --git a/data/faro/index_0441/img_depth_faro.png b/data/faro/index_0441/img_depth_faro.png new file mode 100644 index 0000000..758b0d1 Binary files /dev/null and b/data/faro/index_0441/img_depth_faro.png differ diff --git a/data/faro/index_0441/img_depth_rs.png b/data/faro/index_0441/img_depth_rs.png new file mode 100644 index 0000000..7e57812 Binary files /dev/null and b/data/faro/index_0441/img_depth_rs.png differ diff --git a/data/faro/index_0441/img_left.png b/data/faro/index_0441/img_left.png new file mode 100644 index 0000000..d6f7afc Binary files /dev/null and b/data/faro/index_0441/img_left.png differ diff --git a/data/faro/index_0441/img_rgb.png b/data/faro/index_0441/img_rgb.png new file mode 100644 index 0000000..f5a9da5 Binary files /dev/null and b/data/faro/index_0441/img_rgb.png differ diff --git a/data/faro/index_0441/img_right.png b/data/faro/index_0441/img_right.png new file mode 100644 index 0000000..03a35c8 Binary files /dev/null and b/data/faro/index_0441/img_right.png differ diff --git a/data/faro/index_0742/img_depth_faro.png b/data/faro/index_0742/img_depth_faro.png new file mode 100644 index 0000000..c1fe53d Binary files /dev/null and b/data/faro/index_0742/img_depth_faro.png differ diff --git a/data/faro/index_0742/img_depth_rs.png b/data/faro/index_0742/img_depth_rs.png new file mode 100644 index 0000000..3cc14f5 Binary files /dev/null and b/data/faro/index_0742/img_depth_rs.png differ diff --git a/data/faro/index_0742/img_left.png b/data/faro/index_0742/img_left.png new file mode 100644 index 
0000000..c4a4812 Binary files /dev/null and b/data/faro/index_0742/img_left.png differ diff --git a/data/faro/index_0742/img_rgb.png b/data/faro/index_0742/img_rgb.png new file mode 100644 index 0000000..5ed54d7 Binary files /dev/null and b/data/faro/index_0742/img_rgb.png differ diff --git a/data/faro/index_0742/img_right.png b/data/faro/index_0742/img_right.png new file mode 100644 index 0000000..deb8d1c Binary files /dev/null and b/data/faro/index_0742/img_right.png differ diff --git a/data/faro/index_1241/img_depth_faro.png b/data/faro/index_1241/img_depth_faro.png new file mode 100644 index 0000000..a0929fa Binary files /dev/null and b/data/faro/index_1241/img_depth_faro.png differ diff --git a/data/faro/index_1241/img_depth_rs.png b/data/faro/index_1241/img_depth_rs.png new file mode 100644 index 0000000..6f8c2a6 Binary files /dev/null and b/data/faro/index_1241/img_depth_rs.png differ diff --git a/data/faro/index_1241/img_left.png b/data/faro/index_1241/img_left.png new file mode 100644 index 0000000..869d2da Binary files /dev/null and b/data/faro/index_1241/img_left.png differ diff --git a/data/faro/index_1241/img_rgb.png b/data/faro/index_1241/img_rgb.png new file mode 100644 index 0000000..9892d74 Binary files /dev/null and b/data/faro/index_1241/img_rgb.png differ diff --git a/data/faro/index_1241/img_right.png b/data/faro/index_1241/img_right.png new file mode 100644 index 0000000..47c4e79 Binary files /dev/null and b/data/faro/index_1241/img_right.png differ diff --git a/data/faro/index_1982/img_depth_faro.png b/data/faro/index_1982/img_depth_faro.png new file mode 100644 index 0000000..003b4fd Binary files /dev/null and b/data/faro/index_1982/img_depth_faro.png differ diff --git a/data/faro/index_1982/img_depth_rs.png b/data/faro/index_1982/img_depth_rs.png new file mode 100644 index 0000000..64527df Binary files /dev/null and b/data/faro/index_1982/img_depth_rs.png differ diff --git a/data/faro/index_1982/img_left.png 
b/data/faro/index_1982/img_left.png new file mode 100644 index 0000000..dae2c9c Binary files /dev/null and b/data/faro/index_1982/img_left.png differ diff --git a/data/faro/index_1982/img_rgb.png b/data/faro/index_1982/img_rgb.png new file mode 100644 index 0000000..af300b8 Binary files /dev/null and b/data/faro/index_1982/img_rgb.png differ diff --git a/data/faro/index_1982/img_right.png b/data/faro/index_1982/img_right.png new file mode 100644 index 0000000..6443dfd Binary files /dev/null and b/data/faro/index_1982/img_right.png differ diff --git a/data/faro/index_2441/img_depth_faro.png b/data/faro/index_2441/img_depth_faro.png new file mode 100644 index 0000000..f656010 Binary files /dev/null and b/data/faro/index_2441/img_depth_faro.png differ diff --git a/data/faro/index_2441/img_depth_rs.png b/data/faro/index_2441/img_depth_rs.png new file mode 100644 index 0000000..2464325 Binary files /dev/null and b/data/faro/index_2441/img_depth_rs.png differ diff --git a/data/faro/index_2441/img_left.png b/data/faro/index_2441/img_left.png new file mode 100644 index 0000000..be71db9 Binary files /dev/null and b/data/faro/index_2441/img_left.png differ diff --git a/data/faro/index_2441/img_rgb.png b/data/faro/index_2441/img_rgb.png new file mode 100644 index 0000000..09c4d45 Binary files /dev/null and b/data/faro/index_2441/img_rgb.png differ diff --git a/data/faro/index_2441/img_right.png b/data/faro/index_2441/img_right.png new file mode 100644 index 0000000..46e3ffe Binary files /dev/null and b/data/faro/index_2441/img_right.png differ diff --git a/data/faro/index_2537/img_depth_faro.png b/data/faro/index_2537/img_depth_faro.png new file mode 100644 index 0000000..e5a8098 Binary files /dev/null and b/data/faro/index_2537/img_depth_faro.png differ diff --git a/data/faro/index_2537/img_depth_rs.png b/data/faro/index_2537/img_depth_rs.png new file mode 100644 index 0000000..af0734a Binary files /dev/null and b/data/faro/index_2537/img_depth_rs.png differ diff --git 
a/data/faro/index_2537/img_left.png b/data/faro/index_2537/img_left.png new file mode 100644 index 0000000..0af09b8 Binary files /dev/null and b/data/faro/index_2537/img_left.png differ diff --git a/data/faro/index_2537/img_rgb.png b/data/faro/index_2537/img_rgb.png new file mode 100644 index 0000000..d9c6041 Binary files /dev/null and b/data/faro/index_2537/img_rgb.png differ diff --git a/data/faro/index_2537/img_right.png b/data/faro/index_2537/img_right.png new file mode 100644 index 0000000..82602e9 Binary files /dev/null and b/data/faro/index_2537/img_right.png differ diff --git a/data/faro/index_3049/img_depth_faro.png b/data/faro/index_3049/img_depth_faro.png new file mode 100644 index 0000000..5d1845f Binary files /dev/null and b/data/faro/index_3049/img_depth_faro.png differ diff --git a/data/faro/index_3049/img_depth_rs.png b/data/faro/index_3049/img_depth_rs.png new file mode 100644 index 0000000..a8a1c82 Binary files /dev/null and b/data/faro/index_3049/img_depth_rs.png differ diff --git a/data/faro/index_3049/img_left.png b/data/faro/index_3049/img_left.png new file mode 100644 index 0000000..82c8303 Binary files /dev/null and b/data/faro/index_3049/img_left.png differ diff --git a/data/faro/index_3049/img_rgb.png b/data/faro/index_3049/img_rgb.png new file mode 100644 index 0000000..6910b12 Binary files /dev/null and b/data/faro/index_3049/img_rgb.png differ diff --git a/data/faro/index_3049/img_right.png b/data/faro/index_3049/img_right.png new file mode 100644 index 0000000..4467644 Binary files /dev/null and b/data/faro/index_3049/img_right.png differ diff --git a/data/faro/index_3375/img_depth_faro.png b/data/faro/index_3375/img_depth_faro.png new file mode 100644 index 0000000..cb3b76c Binary files /dev/null and b/data/faro/index_3375/img_depth_faro.png differ diff --git a/data/faro/index_3375/img_depth_rs.png b/data/faro/index_3375/img_depth_rs.png new file mode 100644 index 0000000..f6ececf Binary files /dev/null and 
b/data/faro/index_3375/img_depth_rs.png differ diff --git a/data/faro/index_3375/img_left.png b/data/faro/index_3375/img_left.png new file mode 100644 index 0000000..f4a3989 Binary files /dev/null and b/data/faro/index_3375/img_left.png differ diff --git a/data/faro/index_3375/img_rgb.png b/data/faro/index_3375/img_rgb.png new file mode 100644 index 0000000..2152e5d Binary files /dev/null and b/data/faro/index_3375/img_rgb.png differ diff --git a/data/faro/index_3375/img_right.png b/data/faro/index_3375/img_right.png new file mode 100644 index 0000000..24d9835 Binary files /dev/null and b/data/faro/index_3375/img_right.png differ diff --git a/data/faro/index_3617/img_depth_faro.png b/data/faro/index_3617/img_depth_faro.png new file mode 100644 index 0000000..3bd6423 Binary files /dev/null and b/data/faro/index_3617/img_depth_faro.png differ diff --git a/data/faro/index_3617/img_depth_rs.png b/data/faro/index_3617/img_depth_rs.png new file mode 100644 index 0000000..84166c8 Binary files /dev/null and b/data/faro/index_3617/img_depth_rs.png differ diff --git a/data/faro/index_3617/img_left.png b/data/faro/index_3617/img_left.png new file mode 100644 index 0000000..ce52d8a Binary files /dev/null and b/data/faro/index_3617/img_left.png differ diff --git a/data/faro/index_3617/img_rgb.png b/data/faro/index_3617/img_rgb.png new file mode 100644 index 0000000..dcf6a2e Binary files /dev/null and b/data/faro/index_3617/img_rgb.png differ diff --git a/data/faro/index_3617/img_right.png b/data/faro/index_3617/img_right.png new file mode 100644 index 0000000..e4e707b Binary files /dev/null and b/data/faro/index_3617/img_right.png differ diff --git a/data/faro/index_3629/img_depth_faro.png b/data/faro/index_3629/img_depth_faro.png new file mode 100644 index 0000000..d324e1e Binary files /dev/null and b/data/faro/index_3629/img_depth_faro.png differ diff --git a/data/faro/index_3629/img_depth_rs.png b/data/faro/index_3629/img_depth_rs.png new file mode 100644 index 
0000000..bc17188 Binary files /dev/null and b/data/faro/index_3629/img_depth_rs.png differ diff --git a/data/faro/index_3629/img_left.png b/data/faro/index_3629/img_left.png new file mode 100644 index 0000000..c3a56bc Binary files /dev/null and b/data/faro/index_3629/img_left.png differ diff --git a/data/faro/index_3629/img_rgb.png b/data/faro/index_3629/img_rgb.png new file mode 100644 index 0000000..8e4fb95 Binary files /dev/null and b/data/faro/index_3629/img_rgb.png differ diff --git a/data/faro/index_3629/img_right.png b/data/faro/index_3629/img_right.png new file mode 100644 index 0000000..d5f5289 Binary files /dev/null and b/data/faro/index_3629/img_right.png differ diff --git a/data/faro/index_3990/img_depth_faro.png b/data/faro/index_3990/img_depth_faro.png new file mode 100644 index 0000000..0f75d0d Binary files /dev/null and b/data/faro/index_3990/img_depth_faro.png differ diff --git a/data/faro/index_3990/img_depth_rs.png b/data/faro/index_3990/img_depth_rs.png new file mode 100644 index 0000000..e27d036 Binary files /dev/null and b/data/faro/index_3990/img_depth_rs.png differ diff --git a/data/faro/index_3990/img_left.png b/data/faro/index_3990/img_left.png new file mode 100644 index 0000000..dd45226 Binary files /dev/null and b/data/faro/index_3990/img_left.png differ diff --git a/data/faro/index_3990/img_rgb.png b/data/faro/index_3990/img_rgb.png new file mode 100644 index 0000000..0b5db51 Binary files /dev/null and b/data/faro/index_3990/img_rgb.png differ diff --git a/data/faro/index_3990/img_right.png b/data/faro/index_3990/img_right.png new file mode 100644 index 0000000..bee1ba9 Binary files /dev/null and b/data/faro/index_3990/img_right.png differ diff --git a/data/faro/index_4426/img_depth_faro.png b/data/faro/index_4426/img_depth_faro.png new file mode 100644 index 0000000..f1cb1a3 Binary files /dev/null and b/data/faro/index_4426/img_depth_faro.png differ diff --git a/data/faro/index_4426/img_depth_rs.png 
b/data/faro/index_4426/img_depth_rs.png new file mode 100644 index 0000000..3ce8f07 Binary files /dev/null and b/data/faro/index_4426/img_depth_rs.png differ diff --git a/data/faro/index_4426/img_left.png b/data/faro/index_4426/img_left.png new file mode 100644 index 0000000..455a0c6 Binary files /dev/null and b/data/faro/index_4426/img_left.png differ diff --git a/data/faro/index_4426/img_rgb.png b/data/faro/index_4426/img_rgb.png new file mode 100644 index 0000000..f365397 Binary files /dev/null and b/data/faro/index_4426/img_rgb.png differ diff --git a/data/faro/index_4426/img_right.png b/data/faro/index_4426/img_right.png new file mode 100644 index 0000000..e603b9d Binary files /dev/null and b/data/faro/index_4426/img_right.png differ diff --git a/data/faro/index_4609/img_depth_faro.png b/data/faro/index_4609/img_depth_faro.png new file mode 100644 index 0000000..3929700 Binary files /dev/null and b/data/faro/index_4609/img_depth_faro.png differ diff --git a/data/faro/index_4609/img_depth_rs.png b/data/faro/index_4609/img_depth_rs.png new file mode 100644 index 0000000..1d2fd54 Binary files /dev/null and b/data/faro/index_4609/img_depth_rs.png differ diff --git a/data/faro/index_4609/img_left.png b/data/faro/index_4609/img_left.png new file mode 100644 index 0000000..1d1a045 Binary files /dev/null and b/data/faro/index_4609/img_left.png differ diff --git a/data/faro/index_4609/img_rgb.png b/data/faro/index_4609/img_rgb.png new file mode 100644 index 0000000..a71aa9a Binary files /dev/null and b/data/faro/index_4609/img_rgb.png differ diff --git a/data/faro/index_4609/img_right.png b/data/faro/index_4609/img_right.png new file mode 100644 index 0000000..81250da Binary files /dev/null and b/data/faro/index_4609/img_right.png differ diff --git a/data/faro/index_4788/img_depth_faro.png b/data/faro/index_4788/img_depth_faro.png new file mode 100644 index 0000000..6f741f8 Binary files /dev/null and b/data/faro/index_4788/img_depth_faro.png differ diff --git 
a/data/faro/index_4788/img_depth_rs.png b/data/faro/index_4788/img_depth_rs.png new file mode 100644 index 0000000..7f50877 Binary files /dev/null and b/data/faro/index_4788/img_depth_rs.png differ diff --git a/data/faro/index_4788/img_left.png b/data/faro/index_4788/img_left.png new file mode 100644 index 0000000..677facd Binary files /dev/null and b/data/faro/index_4788/img_left.png differ diff --git a/data/faro/index_4788/img_rgb.png b/data/faro/index_4788/img_rgb.png new file mode 100644 index 0000000..80e0061 Binary files /dev/null and b/data/faro/index_4788/img_rgb.png differ diff --git a/data/faro/index_4788/img_right.png b/data/faro/index_4788/img_right.png new file mode 100644 index 0000000..8f948db Binary files /dev/null and b/data/faro/index_4788/img_right.png differ diff --git a/data/faro/index_5074/img_depth_faro.png b/data/faro/index_5074/img_depth_faro.png new file mode 100644 index 0000000..d28a077 Binary files /dev/null and b/data/faro/index_5074/img_depth_faro.png differ diff --git a/data/faro/index_5074/img_depth_rs.png b/data/faro/index_5074/img_depth_rs.png new file mode 100644 index 0000000..8b88004 Binary files /dev/null and b/data/faro/index_5074/img_depth_rs.png differ diff --git a/data/faro/index_5074/img_left.png b/data/faro/index_5074/img_left.png new file mode 100644 index 0000000..debdd5d Binary files /dev/null and b/data/faro/index_5074/img_left.png differ diff --git a/data/faro/index_5074/img_rgb.png b/data/faro/index_5074/img_rgb.png new file mode 100644 index 0000000..5a085b4 Binary files /dev/null and b/data/faro/index_5074/img_rgb.png differ diff --git a/data/faro/index_5074/img_right.png b/data/faro/index_5074/img_right.png new file mode 100644 index 0000000..87b4ba9 Binary files /dev/null and b/data/faro/index_5074/img_right.png differ diff --git a/data/faro/index_5646/img_depth_faro.png b/data/faro/index_5646/img_depth_faro.png new file mode 100644 index 0000000..25b3c4a Binary files /dev/null and 
b/data/faro/index_5646/img_depth_faro.png differ diff --git a/data/faro/index_5646/img_depth_rs.png b/data/faro/index_5646/img_depth_rs.png new file mode 100644 index 0000000..9772370 Binary files /dev/null and b/data/faro/index_5646/img_depth_rs.png differ diff --git a/data/faro/index_5646/img_left.png b/data/faro/index_5646/img_left.png new file mode 100644 index 0000000..2eaac0c Binary files /dev/null and b/data/faro/index_5646/img_left.png differ diff --git a/data/faro/index_5646/img_rgb.png b/data/faro/index_5646/img_rgb.png new file mode 100644 index 0000000..8b1dd1e Binary files /dev/null and b/data/faro/index_5646/img_rgb.png differ diff --git a/data/faro/index_5646/img_right.png b/data/faro/index_5646/img_right.png new file mode 100644 index 0000000..0d98e0c Binary files /dev/null and b/data/faro/index_5646/img_right.png differ diff --git a/data/faro/index_5661/img_depth_faro.png b/data/faro/index_5661/img_depth_faro.png new file mode 100644 index 0000000..cb58031 Binary files /dev/null and b/data/faro/index_5661/img_depth_faro.png differ diff --git a/data/faro/index_5661/img_depth_rs.png b/data/faro/index_5661/img_depth_rs.png new file mode 100644 index 0000000..812413c Binary files /dev/null and b/data/faro/index_5661/img_depth_rs.png differ diff --git a/data/faro/index_5661/img_left.png b/data/faro/index_5661/img_left.png new file mode 100644 index 0000000..f978787 Binary files /dev/null and b/data/faro/index_5661/img_left.png differ diff --git a/data/faro/index_5661/img_rgb.png b/data/faro/index_5661/img_rgb.png new file mode 100644 index 0000000..4a4bb82 Binary files /dev/null and b/data/faro/index_5661/img_rgb.png differ diff --git a/data/faro/index_5661/img_right.png b/data/faro/index_5661/img_right.png new file mode 100644 index 0000000..ab6fa1c Binary files /dev/null and b/data/faro/index_5661/img_right.png differ diff --git a/data/faro/index_5808/img_depth_faro.png b/data/faro/index_5808/img_depth_faro.png new file mode 100644 index 
0000000..161d61d Binary files /dev/null and b/data/faro/index_5808/img_depth_faro.png differ diff --git a/data/faro/index_5808/img_depth_rs.png b/data/faro/index_5808/img_depth_rs.png new file mode 100644 index 0000000..f9efd39 Binary files /dev/null and b/data/faro/index_5808/img_depth_rs.png differ diff --git a/data/faro/index_5808/img_left.png b/data/faro/index_5808/img_left.png new file mode 100644 index 0000000..177ba6e Binary files /dev/null and b/data/faro/index_5808/img_left.png differ diff --git a/data/faro/index_5808/img_rgb.png b/data/faro/index_5808/img_rgb.png new file mode 100644 index 0000000..11c4db5 Binary files /dev/null and b/data/faro/index_5808/img_rgb.png differ diff --git a/data/faro/index_5808/img_right.png b/data/faro/index_5808/img_right.png new file mode 100644 index 0000000..2bce5ef Binary files /dev/null and b/data/faro/index_5808/img_right.png differ diff --git a/data/faro/index_5881/img_depth_faro.png b/data/faro/index_5881/img_depth_faro.png new file mode 100644 index 0000000..b5788ff Binary files /dev/null and b/data/faro/index_5881/img_depth_faro.png differ diff --git a/data/faro/index_5881/img_depth_rs.png b/data/faro/index_5881/img_depth_rs.png new file mode 100644 index 0000000..5c9b762 Binary files /dev/null and b/data/faro/index_5881/img_depth_rs.png differ diff --git a/data/faro/index_5881/img_left.png b/data/faro/index_5881/img_left.png new file mode 100644 index 0000000..331cd40 Binary files /dev/null and b/data/faro/index_5881/img_left.png differ diff --git a/data/faro/index_5881/img_rgb.png b/data/faro/index_5881/img_rgb.png new file mode 100644 index 0000000..de68e00 Binary files /dev/null and b/data/faro/index_5881/img_rgb.png differ diff --git a/data/faro/index_5881/img_right.png b/data/faro/index_5881/img_right.png new file mode 100644 index 0000000..fd0d235 Binary files /dev/null and b/data/faro/index_5881/img_right.png differ diff --git a/data/faro/index_5964/img_depth_faro.png 
b/data/faro/index_5964/img_depth_faro.png new file mode 100644 index 0000000..40116a6 Binary files /dev/null and b/data/faro/index_5964/img_depth_faro.png differ diff --git a/data/faro/index_5964/img_depth_rs.png b/data/faro/index_5964/img_depth_rs.png new file mode 100644 index 0000000..fe2d513 Binary files /dev/null and b/data/faro/index_5964/img_depth_rs.png differ diff --git a/data/faro/index_5964/img_left.png b/data/faro/index_5964/img_left.png new file mode 100644 index 0000000..b7fb250 Binary files /dev/null and b/data/faro/index_5964/img_left.png differ diff --git a/data/faro/index_5964/img_rgb.png b/data/faro/index_5964/img_rgb.png new file mode 100644 index 0000000..5620d7c Binary files /dev/null and b/data/faro/index_5964/img_rgb.png differ diff --git a/data/faro/index_5964/img_right.png b/data/faro/index_5964/img_right.png new file mode 100644 index 0000000..61a5c99 Binary files /dev/null and b/data/faro/index_5964/img_right.png differ diff --git a/data/faro/index_6422/img_depth_faro.png b/data/faro/index_6422/img_depth_faro.png new file mode 100644 index 0000000..3ff7b67 Binary files /dev/null and b/data/faro/index_6422/img_depth_faro.png differ diff --git a/data/faro/index_6422/img_depth_rs.png b/data/faro/index_6422/img_depth_rs.png new file mode 100644 index 0000000..2d13beb Binary files /dev/null and b/data/faro/index_6422/img_depth_rs.png differ diff --git a/data/faro/index_6422/img_left.png b/data/faro/index_6422/img_left.png new file mode 100644 index 0000000..e19b559 Binary files /dev/null and b/data/faro/index_6422/img_left.png differ diff --git a/data/faro/index_6422/img_rgb.png b/data/faro/index_6422/img_rgb.png new file mode 100644 index 0000000..cd9ddbb Binary files /dev/null and b/data/faro/index_6422/img_rgb.png differ diff --git a/data/faro/index_6422/img_right.png b/data/faro/index_6422/img_right.png new file mode 100644 index 0000000..f3dc911 Binary files /dev/null and b/data/faro/index_6422/img_right.png differ diff --git 
a/data/faro/index_6461/img_depth_faro.png b/data/faro/index_6461/img_depth_faro.png new file mode 100644 index 0000000..9e444fd Binary files /dev/null and b/data/faro/index_6461/img_depth_faro.png differ diff --git a/data/faro/index_6461/img_depth_rs.png b/data/faro/index_6461/img_depth_rs.png new file mode 100644 index 0000000..3e31eb3 Binary files /dev/null and b/data/faro/index_6461/img_depth_rs.png differ diff --git a/data/faro/index_6461/img_left.png b/data/faro/index_6461/img_left.png new file mode 100644 index 0000000..8ac7e85 Binary files /dev/null and b/data/faro/index_6461/img_left.png differ diff --git a/data/faro/index_6461/img_rgb.png b/data/faro/index_6461/img_rgb.png new file mode 100644 index 0000000..f6f3cef Binary files /dev/null and b/data/faro/index_6461/img_rgb.png differ diff --git a/data/faro/index_6461/img_right.png b/data/faro/index_6461/img_right.png new file mode 100644 index 0000000..e9e3f7e Binary files /dev/null and b/data/faro/index_6461/img_right.png differ diff --git a/data/faro/index_6539/img_depth_faro.png b/data/faro/index_6539/img_depth_faro.png new file mode 100644 index 0000000..fee8747 Binary files /dev/null and b/data/faro/index_6539/img_depth_faro.png differ diff --git a/data/faro/index_6539/img_depth_rs.png b/data/faro/index_6539/img_depth_rs.png new file mode 100644 index 0000000..44594e4 Binary files /dev/null and b/data/faro/index_6539/img_depth_rs.png differ diff --git a/data/faro/index_6539/img_left.png b/data/faro/index_6539/img_left.png new file mode 100644 index 0000000..5127108 Binary files /dev/null and b/data/faro/index_6539/img_left.png differ diff --git a/data/faro/index_6539/img_rgb.png b/data/faro/index_6539/img_rgb.png new file mode 100644 index 0000000..73a3ae9 Binary files /dev/null and b/data/faro/index_6539/img_rgb.png differ diff --git a/data/faro/index_6539/img_right.png b/data/faro/index_6539/img_right.png new file mode 100644 index 0000000..d56e3f2 Binary files /dev/null and 
b/data/faro/index_6539/img_right.png differ diff --git a/demo_data/camera.txt b/demo_data/camera.txt new file mode 100644 index 0000000..d193d89 --- /dev/null +++ b/demo_data/camera.txt @@ -0,0 +1,4 @@ +focal_px=901.0471 +baseline_m=0.054903 +source=FARO DA_RGB_Kitchen_1 frame 100 +resolution=1280x720 diff --git a/demo_data/depth.png b/demo_data/depth.png new file mode 100644 index 0000000..59f8c99 Binary files /dev/null and b/demo_data/depth.png differ diff --git a/demo_data/imageD_d16_000.png b/demo_data/imageD_d16_000.png new file mode 100644 index 0000000..f6bec18 Binary files /dev/null and b/demo_data/imageD_d16_000.png differ diff --git a/demo_data/imageL_d16_000.png b/demo_data/imageL_d16_000.png new file mode 100644 index 0000000..6ea2df4 Binary files /dev/null and b/demo_data/imageL_d16_000.png differ diff --git a/demo_data/imageR_d16_000.png b/demo_data/imageR_d16_000.png new file mode 100644 index 0000000..747fd3b Binary files /dev/null and b/demo_data/imageR_d16_000.png differ diff --git a/demo_data/left_faro.png b/demo_data/left_faro.png new file mode 100644 index 0000000..e8a1d90 Binary files /dev/null and b/demo_data/left_faro.png differ diff --git a/demo_data/right_faro.png b/demo_data/right_faro.png new file mode 100644 index 0000000..cf10ef1 Binary files /dev/null and b/demo_data/right_faro.png differ diff --git a/demo_data_out/depth.png b/demo_data_out/depth.png new file mode 100644 index 0000000..5fbbba8 Binary files /dev/null and b/demo_data_out/depth.png differ diff --git a/demo_data_out/depth_colorized.png b/demo_data_out/depth_colorized.png new file mode 100644 index 0000000..950c0da Binary files /dev/null and b/demo_data_out/depth_colorized.png differ diff --git a/demo_data_out/depth_meter.npy b/demo_data_out/depth_meter.npy new file mode 100644 index 0000000..b377082 Binary files /dev/null and b/demo_data_out/depth_meter.npy differ diff --git a/demo_data_out/disp_0044.png b/demo_data_out/disp_0044.png new file mode 100644 index 
0000000..a1a632e Binary files /dev/null and b/demo_data_out/disp_0044.png differ diff --git a/demo_data_out/disp_0066.png b/demo_data_out/disp_0066.png new file mode 100644 index 0000000..e121744 Binary files /dev/null and b/demo_data_out/disp_0066.png differ diff --git a/demo_data_out/disp_0074.png b/demo_data_out/disp_0074.png new file mode 100644 index 0000000..1c58a6f Binary files /dev/null and b/demo_data_out/disp_0074.png differ diff --git a/docs/docs_niv/superpowers/plans/2026-05-18-image-server.md b/docs/docs_niv/superpowers/plans/2026-05-18-image-server.md new file mode 100644 index 0000000..7e6a95e --- /dev/null +++ b/docs/docs_niv/superpowers/plans/2026-05-18-image-server.md @@ -0,0 +1,469 @@ +# Image RGB-to-Grayscale TCP Server Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Build a threaded TCP server that receives JPEG images, converts RGB to grayscale with OpenCV, and returns the result — plus a client module and integration tests. + +**Architecture:** `socketserver.ThreadingTCPServer` with one handler thread per client. Both directions use the same framing: 4-byte big-endian length prefix followed by JPEG bytes. Protocol helpers live in `image_server.py` and are imported by `image_client.py`. + +**Tech Stack:** Python 3.10+ (the helpers use `bytes | None` annotations), `socketserver` (stdlib), `socket` (stdlib), `struct` (stdlib), `threading` (stdlib), `opencv-python` (`cv2`), `numpy`, `unittest` (stdlib) test cases run with `pytest`. 
+ +--- + +## File Map + +| File | Role | +|------|------| +| `image_server.py` | `send_msg`, `recv_msg`, `_recv_exact`, `ImageHandler`, `main()` | +| `image_client.py` | `ImageClient` class, `main()` CLI entry point | +| `test_server.py` | `TestProtocolHelpers`, `TestImageServer`, `TestImageClient` | + +--- + +### Task 1: Protocol framing helpers + +**Files:** +- Create: `image_server.py` +- Create: `test_server.py` + +- [ ] **Step 1: Write the failing tests** + +Create `test_server.py`: + +```python +import socket +import unittest +from image_server import send_msg, recv_msg + + +class TestProtocolHelpers(unittest.TestCase): + def _pair(self): + return socket.socketpair() + + def test_roundtrip_small(self): + a, b = self._pair() + try: + payload = b'hello world' + send_msg(a, payload) + result = recv_msg(b) + self.assertEqual(result, payload) + finally: + a.close() + b.close() + + def test_roundtrip_binary(self): + a, b = self._pair() + try: + payload = bytes(range(256)) * 100 + send_msg(a, payload) + result = recv_msg(b) + self.assertEqual(result, payload) + finally: + a.close() + b.close() + + def test_recv_returns_none_on_closed_socket(self): + a, b = self._pair() + a.close() + result = recv_msg(b) + self.assertIsNone(result) + b.close() + + +if __name__ == '__main__': + unittest.main() +``` + +- [ ] **Step 2: Run tests to verify they fail** + +```bash +cd /home/adiroha/repos/niv/d1_min +python -m pytest test_server.py::TestProtocolHelpers -v +``` + +Expected: `ModuleNotFoundError: No module named 'image_server'` + +- [ ] **Step 3: Implement the protocol helpers** + +Create `image_server.py`: + +```python +import socket +import socketserver +import struct +import logging +import cv2 +import numpy as np + +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') +logger = logging.getLogger(__name__) + + +def send_msg(sock: socket.socket, data: bytes) -> None: + header = struct.pack('>I', len(data)) + sock.sendall(header + data) + + 
+def recv_msg(sock: socket.socket) -> bytes | None: + header = _recv_exact(sock, 4) + if header is None: + return None + length = struct.unpack('>I', header)[0] + return _recv_exact(sock, length) + + +def _recv_exact(sock: socket.socket, n: int) -> bytes | None: + buf = b'' + while len(buf) < n: + chunk = sock.recv(n - len(buf)) + if not chunk: + return None + buf += chunk + return buf +``` + +- [ ] **Step 4: Run tests to verify they pass** + +```bash +python -m pytest test_server.py::TestProtocolHelpers -v +``` + +Expected: +``` +test_server.py::TestProtocolHelpers::test_roundtrip_small PASSED +test_server.py::TestProtocolHelpers::test_roundtrip_binary PASSED +test_server.py::TestProtocolHelpers::test_recv_returns_none_on_closed_socket PASSED +``` + +- [ ] **Step 5: Commit** + +```bash +git add image_server.py test_server.py +git commit -m "feat: add TCP framing helpers with tests" +``` + +--- + +### Task 2: Server handler and main + +**Files:** +- Modify: `image_server.py` — add `ImageHandler`, `main()` +- Modify: `test_server.py` — add `TestImageServer` with synthetic image test + +- [ ] **Step 1: Add the failing integration test** + +Append to `test_server.py` (before `if __name__ == '__main__':`): + +```python +import threading +import socketserver +import cv2 +import numpy as np +from image_server import send_msg, recv_msg, ImageHandler + + +def _make_server() -> socketserver.ThreadingTCPServer: + socketserver.ThreadingTCPServer.allow_reuse_address = True # must be set before the constructor binds the socket + server = socketserver.ThreadingTCPServer(('127.0.0.1', 0), ImageHandler) + return server + + +class TestImageServer(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.server = _make_server() + cls.port = cls.server.server_address[1] + cls.thread = threading.Thread(target=cls.server.serve_forever, daemon=True) + cls.thread.start() + + @classmethod + def tearDownClass(cls): + cls.server.shutdown() + cls.thread.join(timeout=2) + + def _connect(self) -> socket.socket: + sock = socket.socket(socket.AF_INET, 
socket.SOCK_STREAM) + sock.connect(('127.0.0.1', self.port)) + return sock + + def _send_image(self, sock: socket.socket, img: np.ndarray) -> None: + _, jpeg = cv2.imencode('.jpg', img) + send_msg(sock, jpeg.tobytes()) + + def _recv_gray(self, sock: socket.socket) -> np.ndarray: + data = recv_msg(sock) + self.assertIsNotNone(data, 'Server returned no data') + gray = cv2.imdecode(np.frombuffer(data, dtype=np.uint8), cv2.IMREAD_GRAYSCALE) + self.assertIsNotNone(gray, 'Response could not be decoded as image') + return gray + + def test_synthetic_image_becomes_grayscale(self): + img = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8) + with self._connect() as sock: + self._send_image(sock, img) + gray = self._recv_gray(sock) + self.assertEqual(gray.shape, (100, 100)) +``` + +- [ ] **Step 2: Run the test to verify it fails** + +```bash +python -m pytest test_server.py::TestImageServer::test_synthetic_image_becomes_grayscale -v +``` + +Expected: `ImportError: cannot import name 'ImageHandler' from 'image_server'` + +- [ ] **Step 3: Implement ImageHandler and main()** + +Append to `image_server.py`: + +```python +class ImageHandler(socketserver.BaseRequestHandler): + def handle(self) -> None: + logger.info('Connection from %s', self.client_address) + try: + while True: + data = recv_msg(self.request) + if data is None: + break + img = cv2.imdecode(np.frombuffer(data, dtype=np.uint8), cv2.IMREAD_COLOR) + if img is None: + logger.warning('Could not decode image, closing connection') + break + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + _, jpeg = cv2.imencode('.jpg', gray) + send_msg(self.request, jpeg.tobytes()) + except (ConnectionResetError, BrokenPipeError, OSError) as exc: + logger.warning('Connection error: %s', exc) + finally: + logger.info('Connection closed: %s', self.client_address) + + +def main() -> None: + host, port = '0.0.0.0', 9999 + with socketserver.ThreadingTCPServer((host, port), ImageHandler) as server: + server.allow_reuse_address = True + 
logger.info('Listening on %s:%d', host, port) + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info('Shutting down') + + +if __name__ == '__main__': + main() +``` + +- [ ] **Step 4: Run the test to verify it passes** + +```bash +python -m pytest test_server.py::TestImageServer::test_synthetic_image_becomes_grayscale -v +``` + +Expected: `PASSED` + +- [ ] **Step 5: Commit** + +```bash +git add image_server.py test_server.py +git commit -m "feat: add threaded TCP image server with grayscale conversion" +``` + +--- + +### Task 3: Multiple-image and real-image tests + +**Files:** +- Modify: `test_server.py` — add two more test methods to `TestImageServer` + +- [ ] **Step 1: Add the tests** + +Add these two methods inside the `TestImageServer` class in `test_server.py`: + +```python + def test_multiple_images_same_connection(self): + with self._connect() as sock: + for _ in range(3): + img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8) + self._send_image(sock, img) + gray = self._recv_gray(sock) + self.assertEqual(gray.shape, (64, 64)) + + def test_real_image(self): + img = cv2.imread('sample/left.png') + self.assertIsNotNone(img, 'sample/left.png must be readable') + h, w = img.shape[:2] + with self._connect() as sock: + self._send_image(sock, img) + gray = self._recv_gray(sock) + self.assertEqual(gray.shape, (h, w)) +``` + +- [ ] **Step 2: Run the new tests** + +```bash +python -m pytest test_server.py::TestImageServer -v +``` + +Expected: all three tests in `TestImageServer` pass. 
+ +- [ ] **Step 3: Commit** + +```bash +git add test_server.py +git commit -m "test: add multi-image and real-image integration tests" +``` + +--- + +### Task 4: Client module + +**Files:** +- Create: `image_client.py` +- Modify: `test_server.py` — add `TestImageClient` + +- [ ] **Step 1: Add the failing client test** + +Append to `test_server.py` (before `if __name__ == '__main__':`): + +```python +class TestImageClient(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.server = _make_server() + cls.port = cls.server.server_address[1] + cls.thread = threading.Thread(target=cls.server.serve_forever, daemon=True) + cls.thread.start() + + @classmethod + def tearDownClass(cls): + cls.server.shutdown() + cls.thread.join(timeout=2) + + def test_client_process_returns_grayscale(self): + from image_client import ImageClient + img = np.random.randint(0, 256, (80, 80, 3), dtype=np.uint8) + with ImageClient('127.0.0.1', self.port) as client: + gray = client.process(img) + self.assertEqual(gray.shape, (80, 80)) + + def test_client_process_multiple_images(self): + from image_client import ImageClient + with ImageClient('127.0.0.1', self.port) as client: + for size in [32, 64, 128]: + img = np.random.randint(0, 256, (size, size, 3), dtype=np.uint8) + gray = client.process(img) + self.assertEqual(gray.shape, (size, size)) +``` + +- [ ] **Step 2: Run to verify failure** + +```bash +python -m pytest test_server.py::TestImageClient -v +``` + +Expected: `ModuleNotFoundError: No module named 'image_client'` + +- [ ] **Step 3: Implement image_client.py** + +Create `image_client.py`: + +```python +import socket +import sys +import cv2 +import numpy as np +from image_server import send_msg, recv_msg + + +class ImageClient: + def __init__(self, host: str, port: int) -> None: + self._sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self._sock.connect((host, port)) + + def process(self, img: np.ndarray) -> np.ndarray: + _, jpeg = cv2.imencode('.jpg', img) + 
send_msg(self._sock, jpeg.tobytes()) + data = recv_msg(self._sock) + if data is None: + raise RuntimeError('Server closed the connection unexpectedly') + gray = cv2.imdecode(np.frombuffer(data, dtype=np.uint8), cv2.IMREAD_GRAYSCALE) + if gray is None: + raise RuntimeError('Server response could not be decoded as an image') + return gray + + def close(self) -> None: + self._sock.close() + + def __enter__(self) -> 'ImageClient': + return self + + def __exit__(self, *_) -> None: + self.close() + + +def main() -> None: + if len(sys.argv) != 3: + print(f'Usage: {sys.argv[0]} INPUT_IMAGE OUTPUT_IMAGE') + sys.exit(1) + img = cv2.imread(sys.argv[1]) + if img is None: + print(f'Cannot read image: {sys.argv[1]}') + sys.exit(1) + with ImageClient('127.0.0.1', 9999) as client: + gray = client.process(img) + cv2.imwrite(sys.argv[2], gray) + print(f'Saved: {sys.argv[2]}') + + +if __name__ == '__main__': + main() +``` + +- [ ] **Step 4: Run the client tests** + +```bash +python -m pytest test_server.py::TestImageClient -v +``` + +Expected: +``` +PASSED test_server.py::TestImageClient::test_client_process_returns_grayscale +PASSED test_server.py::TestImageClient::test_client_process_multiple_images +``` + +- [ ] **Step 5: Run the full test suite** + +```bash +python -m pytest test_server.py -v +``` + +Expected: all 8 tests pass (3 protocol, 3 server, 2 client), no failures. 
+ +- [ ] **Step 6: Commit** + +```bash +git add image_client.py test_server.py +git commit -m "feat: add ImageClient with integration tests" +``` + +--- + +## Usage + +**Run the server:** +```bash +python image_server.py +# Listening on 0.0.0.0:9999 +``` + +**Run the client (in another terminal):** +```bash +python image_client.py sample/left.png output_gray.png +# Saved: output_gray.png +``` + +**Run all tests:** +```bash +python -m pytest test_server.py -v +``` diff --git a/docs/docs_niv/superpowers/specs/2026-05-18-image-server-design.md b/docs/docs_niv/superpowers/specs/2026-05-18-image-server-design.md new file mode 100644 index 0000000..dde7220 --- /dev/null +++ b/docs/docs_niv/superpowers/specs/2026-05-18-image-server-design.md @@ -0,0 +1,87 @@ +# Image RGB-to-Grayscale TCP Server — Design Spec + +**Date:** 2026-05-18 +**Status:** Approved + +--- + +## Overview + +A Python TCP server that accepts image data from remote clients, converts each image from RGB to grayscale using OpenCV, and returns the result. Uses the existing `opencv-python` dependency — no new packages required. + +--- + +## Files + +| File | Purpose | +|------|---------| +| `image_server.py` | Server: listens for connections, receives images, converts, sends back | +| `image_client.py` | Client: sends a color image, receives and saves the grayscale result | +| `test_server.py` | Tests: integration tests using a real server on a free port | + +--- + +## Architecture + +`socketserver.ThreadingTCPServer` with a `BaseRequestHandler` subclass. Each client connection runs in its own thread, allowing multiple concurrent clients. Image conversion is handled by `cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)`. 
+ +``` +Client Server (one thread per connection) +------ ---------------------------------- +encode image as JPEG +send [4-byte length][JPEG] --> recv length header + recv N bytes of JPEG + decode JPEG → BGR numpy array + cvtColor BGR → GRAY + encode GRAY as JPEG + <-- send [4-byte length][JPEG] +recv length header +recv N bytes of JPEG +decode → grayscale image +``` + +A single connection can carry multiple request/response pairs before closing. + +--- + +## Wire Protocol + +Every message in both directions: + +``` +[4 bytes, big-endian uint32: payload length][N bytes: JPEG image data] +``` + +- **Length header:** `struct.pack('>I', len(jpeg_bytes))` / `struct.unpack('>I', header)[0]` +- **Image encoding:** JPEG (OpenCV default quality) +- **Direction:** identical framing for both client→server and server→client + +--- + +## Error Handling + +| Scenario | Behavior | +|----------|----------| +| Client disconnects mid-transfer | Handler catches `ConnectionResetError` / partial read, logs, closes socket | +| Corrupt or non-image bytes | `cv2.imdecode` returns `None`; handler logs a warning and closes connection | +| `KeyboardInterrupt` on server | `server.shutdown()` called in `finally` block; exits cleanly | + +--- + +## Testing (`test_server.py`) + +Uses Python `unittest` — no new dependencies. Each test class starts a real `ThreadingTCPServer` on a random free port (port `0`) in a `setUpClass` background thread and shuts it down in `tearDownClass`. 
+ +| Test | What it verifies | +|------|-----------------| +| `test_synthetic_image_becomes_grayscale` | 100×100 synthetic RGB → response decodes as valid grayscale (shape `(100, 100)`) | +| `test_multiple_images_same_connection` | 3 images sent sequentially on one socket, all responses valid | +| `test_real_image` | Sends `sample/left.png` (known to exist); verifies grayscale response | + +--- + +## Non-Goals + +- No authentication or TLS (local / trusted network assumed) +- No image format negotiation (JPEG only) +- No retry logic on the client diff --git a/docs/superpowers/specs/2026-04-30-depth-fusion-design.md b/docs/superpowers/specs/2026-04-30-depth-fusion-design.md new file mode 100644 index 0000000..ad27f71 --- /dev/null +++ b/docs/superpowers/specs/2026-04-30-depth-fusion-design.md @@ -0,0 +1,239 @@ +# RealSense Depth Fusion — Design Spec + +**Date:** 2026-04-30 +**Status:** Approved for implementation +**Goal:** Fuse RealSense RGB-D depth into FastFoundationStereo at three pipeline stages to improve disparity accuracy, with no latency constraint and using fine-tuning of the pretrained model. + +--- + +## 1. Overview + +The modified model takes three inputs: `image1` (left RGB), `image2` (right RGB), and `depth` (RealSense depth map, single-channel metric in meters, hardware-registered to the left RGB camera). Depth is fused at three stages: early (feature level), mid (cost volume logits), and late (GRU initialization). + +**Modified forward signature:** +```python +def forward(self, image1, image2, depth=None, focal=None, baseline=None, iters=12, ...) +``` + +When `depth=None`, all three stages are skipped and the model behaves identically to the original. No existing code paths are changed. 
+ +**Architecture diagram:** +``` +INPUTS + image1 (B,3,H,W) ──┐ + image2 (B,3,H,W) ──┤ + depth (B,1,H,W) ──┼──────────────────────────────────────────────────┐ + focal, baseline │ │ + │ │ +STAGE 1 — EARLY FEATURE FUSION │ + │ │ + image1+2 ──► Feature (EdgeNeXt) depth ──► DepthEncoder [NEW] │ + [x4,x8,x16,x32] [d4,d8,d16,d32] │ + │ │ │ + └──── DepthFusionModule [NEW] ───────┘ │ + (concat + 1×1 conv per scale) │ + → fused_left[0..3] │ + │ +COST VOLUME (unchanged) │ + │ + fused_left[0] + features_right[0] │ + ├── GWC volume │ + └── Concat volume │ + │ │ + corr_stem → corr_feature_att │ + hourglass (3D, FeatureAtt + CostVolumeDisparityAttention) │ + classifier → logits (B, D, H/4, W/4) │ + │ │ +STAGE 2 — DISPARITY PRIOR ON LOGITS │ + │ │ + │ depth ──► disp_prior = focal·baseline/depth ◄──────┘ + │ │ + │ prior_bias = Gaussian(μ=disp_prior, σ=σ_learned) + │ │ (zeroed where depth is invalid) + └──► logits + α·prior_bias [NEW α, σ learnable] + │ + softmax → prob → soft-argmax → stereo_init + │ +STAGE 3 — GRU INITIALIZATION BLEND │ + │ + disp_prior_14 ──┐ │ + validity_mask ──┼──► DepthInitBlend [NEW] ──► blend_w ─┤ + stereo_init ────┘ (3-layer CNN → sigmoid) │ + │ + init_disp = blend_w·disp_prior + (1-blend_w)·stereo_init ◄──────────┘ + +GRU ITERATIVE REFINEMENT (unchanged internals) + for itr in range(iters): + geo_feat = Combined_Geo_Encoding_Volume(disp) + net_list, mask_feat_4, delta_disp = update_block(...) + disp = disp + delta_disp + disp_up = upsample_disp(disp, mask_feat_4, stem_2x) + +OUTPUT: final disparity (B,1,H,W) +``` + +--- + +## 2. Depth Preprocessing + +Applied before `DepthEncoder` and before computing `disp_prior`. Shared logic, computed once in `forward`. 
+ +- **Validity mask:** `M = (depth > 0) & torch.isfinite(depth)` → (B,1,H,W) float, 1=valid, 0=invalid +- **Normalization:** `d_norm = (depth - DEPTH_MEAN) / DEPTH_STD` applied only to valid pixels; invalid pixels set to 0 +- **Encoder input:** `torch.cat([d_norm, M], dim=1)` → (B,2,H,W) +- **`DEPTH_MEAN`, `DEPTH_STD`:** dataset statistics, stored as registered buffers (not trained) + +--- + +## 3. Stage 1 — DepthEncoder + DepthFusionModule + +**File:** `core/extractor.py` + +### DepthEncoder + +Lightweight 4-scale CNN built from existing `BasicConv` primitives. + +``` +Input: (B, 2, H, W) [normalized depth + validity mask] + +stem: BasicConv(2 → 32, k=3, stride=2, BN+ReLU) → (B, 32, H/2, W/2) +stage1: BasicConv(32 → 32, k=3, stride=2, BN+ReLU) → depth_x4 (B, 32, H/4, W/4) +stage2: BasicConv(32 → 64, k=3, stride=2, BN+ReLU) → depth_x8 (B, 64, H/8, W/8) +stage3: BasicConv(64 → 96, k=3, stride=2, BN+ReLU) → depth_x16 (B, 96, H/16, W/16) +stage4: BasicConv(96 → 128,k=3, stride=2, BN+ReLU) → depth_x32 (B, 128, H/32, W/32) + +depth_chans = [32, 64, 96, 128] +``` + +Each stage's output is multiplied by a downsampled validity mask so invalid regions produce near-zero features and do not corrupt fusion. + +### DepthFusionModule + +One 1×1 conv per scale. Input is concat of left feature and depth feature; output has the same shape as the left feature. + +``` +d_out = Feature.d_out = [96+vit_feat_dim, 192, 320, 304] + +fusion_4: Conv2d(d_out[0]+32, d_out[0], 1) +fusion_8: Conv2d(d_out[1]+64, d_out[1], 1) +fusion_16: Conv2d(d_out[2]+96, d_out[2], 1) +fusion_32: Conv2d(d_out[3]+128, d_out[3], 1) +``` + +Applied only to `features_left`. Right image features are not fused — RealSense depth is registered to the left camera only. 
+ +**In `FastFoundationStereo.forward`:** +```python +if depth is not None: + depth_input = self.preprocess_depth(depth) # normalize + validity mask + depth_feats = self.depth_encoder(depth_input) + features_left = self.depth_fusion(features_left, depth_feats) +``` + +--- + +## 4. Stage 2 — Disparity Prior on Logits + +**File:** `core/foundation_stereo.py` + +Convert depth to disparity at 1/4 resolution, build a per-pixel Gaussian prior over the disparity axis, and add it as an additive bias to the cost volume logits before softmax. + +```python +# depth → disparity prior +disp_prior = (focal * baseline) / depth.clamp(min=1e-3) # (B,1,H,W) +disp_prior_14 = F.interpolate(disp_prior / 4, (H//4, W//4), mode='nearest') +validity_14 = F.interpolate(M.float(), (H//4, W//4), mode='nearest') + +# Gaussian bias over disparity axis +d_indices = torch.arange(D, device=disp_prior.device).view(1, D, 1, 1) +prior_bias = -0.5 * ((d_indices - disp_prior_14) / self.depth_sigma.abs()) ** 2 +prior_bias = prior_bias * validity_14 # flat (zero) where depth is invalid + +# Inject into logits +logits = self.classifier(comb_volume).squeeze(1) +if depth is not None: + logits = logits + self.depth_prior_scale * prior_bias +prob = F.softmax(logits, dim=1) +init_disp_stereo = disparity_regression(prob, self.args.max_disp // 4) +``` + +**New parameters:** +- `self.depth_sigma`: `nn.Parameter(torch.tensor(4.0))` — learned std in disparity units at 1/4 scale +- `self.depth_prior_scale`: `nn.Parameter(torch.tensor(0.1))` — initialized small so training starts near original stereo behavior + +--- + +## 5. Stage 3 — GRU Initialization Blend + +**File:** `core/foundation_stereo.py` + +A small 3-layer CNN learns per-pixel blend weights between the stereo soft-argmax estimate and the depth-derived disparity prior. 
+ +```python +class DepthInitBlend(nn.Module): + # Input: concat(stereo_init, disp_prior_14, validity_14) → (B, 3, H/4, W/4) + # Layers: BasicConv(3→16), BasicConv(16→16), Conv2d(16→1) + # Output: blend_w ∈ [0,1] via sigmoid, masked to 0 where validity_14=0 +``` + +**In `FastFoundationStereo.forward`:** +```python +if depth is not None: + blend_w = self.depth_init_blend( + torch.cat([init_disp_stereo, disp_prior_14, validity_14], dim=1) + ) * validity_14 + init_disp = blend_w * disp_prior_14 + (1 - blend_w) * init_disp_stereo +else: + init_disp = init_disp_stereo +``` + +The blend learns *where* to trust depth (nearby objects, textureless regions) vs. stereo (edges, far range), giving the GRU a better warm start. + +--- + +## 6. New Modules Summary + +| Module | File | Est. Params | Role | +|---|---|---|---| +| `DepthEncoder` | `core/extractor.py` | ~200K | Encode raw depth to 4-scale features | +| `DepthFusionModule` | `core/extractor.py` | ~80K | Fuse depth features into left image features | +| `depth_sigma`, `depth_prior_scale` | `core/foundation_stereo.py` | 2 | Learned prior sharpness and weight | +| `DepthInitBlend` | `core/foundation_stereo.py` | ~5K | Per-pixel GRU init blend | + +--- + +## 7. Training Strategy + +**Frozen — do not update:** +- `self.feature.stem` and `self.feature.stages` (EdgeNeXt backbone, ~7M params) +- Cost volume builders (no parameters) + +**New modules — train from scratch at full LR:** +- `DepthEncoder` +- `DepthFusionModule` +- `depth_sigma`, `depth_prior_scale` +- `DepthInitBlend` + +**Existing modules — fine-tune at 0.1× base LR:** +- `self.feature.deconv32_16`, `deconv16_8`, `deconv8_4`, `conv4` +- `self.cost_agg` (hourglass, all layers) +- `self.update_block` (GRU) +- `self.cnet`, `self.context_zqr_convs` +- `self.classifier` + +**Loss:** standard sequence disparity loss (smooth L1 on all GRU iteration outputs, same as original training). No changes needed. 
+ +**Dataset requirement:** Triplets of `(left_RGB, right_RGB, RealSense_depth, GT_disparity)`. GT disparity can come from LiDAR accumulation, structured-light ground truth at close range, or synthetic renders with simulated RealSense noise (holes at edges and reflective surfaces). + +--- + +## 8. Depth Hole Handling Summary + +Invalid pixels (holes, out-of-range readings) are handled consistently at each stage: + +| Stage | Invalid pixel behavior | +|---|---| +| DepthEncoder | Features multiplied by downsampled validity mask → near-zero output | +| DepthFusionModule | 1×1 conv learns to ignore near-zero depth features | +| Stage 2 prior bias | `prior_bias * validity_14` → flat (zero additive bias) for invalid pixels | +| Stage 3 blend | `blend_w * validity_14` → blend_w=0, falls back to stereo init | diff --git a/docs/superpowers/specs/2026-05-04-depth-fusion-v2-design.md b/docs/superpowers/specs/2026-05-04-depth-fusion-v2-design.md new file mode 100644 index 0000000..9131e9c --- /dev/null +++ b/docs/superpowers/specs/2026-05-04-depth-fusion-v2-design.md @@ -0,0 +1,161 @@ +# RealSense Depth Fusion v2 — Design Spec + +**Date:** 2026-05-04 +**Status:** Approved for implementation +**Goal:** Fuse RealSense depth into FastFoundationStereo without DepthEncoder or DepthFusionModule; fully freeze feature extraction; add a new post-GRU full-resolution output blend stage. + +--- + +## 1. Overview + +v1 (`finetune_inbolt_depthrs.py`) performed feature-level fusion via a learned depth encoder and per-scale residual projections. Despite fixing all initialization bugs and unit errors, the 13-sample Inbolt dataset was too small for the new modules to learn from scratch, and the model matched but did not beat the stereo fine-tuned baseline. + +v2 removes the heavy early-fusion modules entirely and instead acts only at the **cost volume** and **disparity output** stages, where the depth prior carries direct geometric meaning with no learned feature extraction required. 
+ +**Forward signature (unchanged externally):** +```python +def forward(self, image1, image2, depth_rs_mm=None, iters=12, test_mode=False, ...) +``` + +When `depth_rs_mm=None`, all depth stages are skipped and the model behaves identically to the wrapped pretrained stereo model. + +--- + +## 2. Architecture Diagram + +``` +INPUTS + left_IR (B,3,H,W) ──────────────────────────────────────────────────────┐ + right_IR (B,3,H,W) ──────────────────────────────────────────────────────┤ + depth_rs_mm (B,1,H,W) ──────────────────────────────────────────────────────┤ + │ +FEATURE EXTRACTION [FULLY FROZEN] │ + left+right → Feature (EdgeNeXt backbone + FPN decoder) │ + → feat_left[x4,x8,x16,x32], feat_right[x4,x8,x16,x32] │ + → stem_2x (for convex upsampling) │ + │ +DEPTH PREPROCESSING [no learned params] │ + validity M = (depth_rs_mm > 0) & isfinite(depth_rs_mm) │ + depth_rs_m = depth_rs_mm / 1000.0 · M │ + disp_prior = where(M, focal·baseline_m / depth_rs_m, 0) ← full-res px │ + disp_prior_14 = F.interpolate(disp_prior / 4, H/4, W/4, 'nearest') │ + validity_14 = F.interpolate(M, H/4, W/4, 'nearest') │ + │ +COST VOLUME [fine-tune 0.1× LR] │ + feat_left[0] + feat_right[0] │ + ├── GWC volume │ + └── Concat volume │ + │ │ + corr_stem → corr_feature_att → hourglass → logits (B,D,H/4,W/4) │ + │ +STAGE 2 — GAUSSIAN LOGIT PRIOR [learnable σ, α — full LR] │ + d_idx = arange(D).view(1,D,1,1) │ + prior_bias = −½·((d_idx − disp_prior_14) / σ.abs().clamp(0.1))² │ + prior_bias = prior_bias · validity_14 │ + logits' = logits + α · prior_bias │ + → prob = softmax(logits') │ + → stereo_init = disparity_regression(prob, max_disp//4) (B,1,H/4,W/4) │ + │ +STAGE 3a — GRU INIT BLEND [DepthInitBlend ~5K — full LR] │ + x = cat(stereo_init, disp_prior_14, validity_14) (B,3,H/4,W/4) │ + blend_w = sigmoid(3-layer CNN(x)) · validity_14 [bias=-5 at init → 0] │ + init_disp = blend_w·disp_prior_14 + (1−blend_w)·stereo_init │ + │ +CONTEXT NETWORK + GRU ITERATIONS [fine-tune 0.1× LR] │ + cnet(feat_left) → 
net_list, inp_list, att │ + for itr in 0..iters-1: │ + geo_feat = Geo_Encoding_Volume(disp, coords) │ + Δdisp = update_block(net_list, inp_list, geo_feat, disp, att) │ + disp += Δdisp │ + disp_up = upsample_disp(disp, mask_feat_4, stem_2x) (B,1,H,W) │ + │ +STAGE 3b — OUTPUT BLEND [DepthOutputBlend ~8K — full LR] ← NEW ←┘ + for each disp_up in disp_preds: + x = cat(disp_up, disp_prior_full, validity) (B,3,H,W) + blend_w = sigmoid(3-layer CNN(x)) · validity [bias=-5 at init → 0] + disp_final = disp_up + blend_w·(disp_prior_full − disp_up) + +OUTPUT: disp_final (B,1,H,W) +``` + +--- + +## 3. New Modules + +### DepthInitBlend (Stage 3a, ~5 K params) +``` +Input: cat(stereo_init, disp_prior_14, validity_14) (B,3,H/4,W/4) +Layers: BasicConv(3→16, k=3, BN+ReLU) + BasicConv(16→16, k=3, BN+ReLU) + Conv2d(16→1, k=1) ← weight=0, bias=−5 at init +Output: blend_w = sigmoid(net(x)) · validity_14 +Return: blend_w·disp_prior_14 + (1−blend_w)·stereo_init +``` + +### DepthOutputBlend (Stage 3b, ~8 K params) — NEW +``` +Input: cat(disp_stereo, disp_prior_full, validity) (B,3,H,W) +Layers: BasicConv(3→32, k=3, BN+ReLU) + BasicConv(32→32, k=3, BN+ReLU) + Conv2d(32→1, k=1) ← weight=0, bias=−5 at init +Output: blend_w = sigmoid(net(x)) · validity +Return: disp_stereo + blend_w·(disp_prior_full − disp_stereo) [residual] +``` + +Both modules start with near-zero blend weight (sigmoid(−5) ≈ 0.007), so at epoch 0 the two blend stages leave the wrapped pretrained stereo model's output nearly (though not exactly) unchanged. Depth fusion is learned gradually. + +Applied in **training mode** to every GRU iteration prediction (8 predictions per step → 8 sequence-loss terms, all passing through Stage 3b). Applied in **test mode** only to the final upsampled disparity. + +--- + +## 4. 
Depth Preprocessing + +All depth-prior computations use a consistent unit system: + +```python +depth_rs_m = depth_rs_mm / 1000.0 · validity # metres, invalid → 0 +disp_prior = where(validity, focal_px·baseline_m / depth_rs_m.clamp(1e-3), 0) + # focal(px) · baseline(m) / depth(m) = pixels +``` + +No separate normalization step or `DEPTH_NORM_M` constant needed (no DepthEncoder). + +--- + +## 5. Parameter Groups + +| Group | Modules | LR | +|---|---|---| +| Frozen (no grad) | `stereo.feature` (all submodules), `stereo.stem_2` | 0 | +| New — full LR | `depth_init_blend`, `depth_output_blend`, `depth_sigma`, `depth_prior_scale` | 2e-5 | +| Fine-tune — 0.1× LR | All remaining trainable stereo params | 2e-6 | + +New learnable params: `depth_sigma` (σ, 4.0), `depth_prior_scale` (α, 0.1), plus ~13 K CNN params. +Fully frozen: entire `Feature` module (backbone + FPN decoder) + `stem_2`. + +--- + +## 6. Zero-Init Guarantee + +At initialization, with the final conv layer of each new blend module set to weight 0 and bias −5: + +- **Stage 2**: `α = 0.1` → small Gaussian prior added to logits. The prior is centered at the depth-derived disparity and has standard deviation σ=4 (in 1/4-scale pixels ≈ 16 full-scale pixels), giving a gentle push toward the depth estimate without overriding the cost volume. +- **Stage 3a**: `blend_w ≈ 0.007` → `init_disp ≈ stereo_init`. GRU warm start is essentially unchanged. +- **Stage 3b**: `blend_w ≈ 0.007` → `disp_final ≈ disp_up`. Output is essentially unchanged. + +Epoch 1 training loss should be < 1.0 (same order as the fine-tuned stereo baseline). + +--- + +## 7. 
Key Differences from v1 + +| | v1 | v2 | +|---|---|---| +| DepthEncoder | ✓ (4-scale CNN) | ✗ removed | +| DepthFusionModule | ✓ (residual proj) | ✗ removed | +| Feature extraction frozen | stem+stages only | entire Feature + stem_2 | +| Stage 2 logit prior | ✓ | ✓ | +| Stage 3a GRU init blend | ✓ | ✓ | +| Stage 3b output blend | ✗ | ✓ NEW | +| New params | ~288 K | ~13 K | +| Fine-tune params | ~12.4 M | ~11.4 M | diff --git a/model_architecture_report.html b/model_architecture_report.html new file mode 100644 index 0000000..3b1aa11 --- /dev/null +++ b/model_architecture_report.html @@ -0,0 +1,917 @@ + + + + + + FastFoundationStereo — Architecture Report + + + + +
+

FastFoundationStereo

+
Neural Network Architecture & Processing Cost Report
+
+ Stereo Depth Estimation + EdgeNeXt Backbone + GRU Iterative Refinement + 3D Cost Aggregation + Transformer Attention +
+
+ +
+ + +
+
📊
+ Model Overview +
+ +
+
+
~23M
+
Total Parameters
+
+
+
480×640
+
Typical Input Resolution
+
+
+
192
+
Max Disparity (pixels)
+
+
+
8–12
+
GRU Refinement Iterations
+
+
+
1/4
+
Internal Resolution Scale
+
+
+
fp16
+
AMP Inference Dtype
+
+
+ +
+
🔁 Pipeline Summary
+ FastFoundationStereo follows a classic stereo matching flow: feature extraction → cost volume → cost aggregation → iterative GRU refinement → upsampling. + The key design choices are a 3D hourglass with per-scale image feature guidance for cost aggregation, and a Selective ConvGRU that blends small and large receptive fields based on a spatial attention map. +
+ + +
+
+ Block-by-Block Pipeline +
+ +
+ + +
+
+
+
+
+
+
+
+
Block 1
+
Feature Extraction
+
+ core/extractor.py · Feature +
+
+ 2 × (B, 3, H, W) + + (B, 128, H/4, W/4) + (B, 192, H/8, W/8) + (B, 320, H/16, W/16) + (B, 304, H/32, W/32) +
+
+
+ 🧠 + EdgeNeXt-small backbone (pretrained) — 4-stage hierarchical encoder producing multi-scale features at strides 4, 8, 16, 32. Channel widths: 48 → 96 → 160 → 304. +
+
+ 🔺 + FPN-style deconvolution — deconv32→16, deconv16→8, deconv8→4 with lateral skip connections. Final Conv1×1 projects to 128ch at stride-4 (includes ViT feature dim offset). +
+
+ + Efficiency trick — left and right images concatenated into one batch (2B, 3, H, W) for a single forward pass through the shared backbone, then split. +
+
+
+
Estimated compute share
+
+
~10–15%
+
+
+
+ + +
+
+
+
+
+
+
+
+
Block 2
+
Cost Volume Construction
+
+ core/submodule.py · build_gwc / build_concat +
+
+ (B, 128, H/4, W/4) × 2 + + (B, 32, D/4, H/4, W/4) +
+
+
+ 📐 + GWC Volume (Group-wise Correlation) — features split into 8 groups, dot-product correlation for every disparity shift d=0..D/4-1. Shape: (B, 8, D/4, H/4, W/4). Optimised via PyTorch1 or Triton kernel. +
+
+ 🧩 + Concat Volume — proj_cmb (Conv1×1: 128→12ch) projects each side; the left/right feature pairs are then concatenated for each shift. Shape: (B, 24, D/4, H/4, W/4). +
+
+ 🔗 + Combined — channel-wise cat → (B, 32, 48, H/4, W/4). At 480×640 this tensor holds ~29.5M values per sample (32 × 48 × 120 × 160) — the single largest allocation. +
+
+
+
Estimated compute share
+
+
~10–15%
+
+
+
+ + +
+
+
+
+
+
+
+
+
Block 3
+
Cost Volume Stem
+
+ core/foundation_stereo.py · corr_stem + corr_feature_att +
+
+ (B, 32, 48, H/4, W/4) + + (B, 28, 48, H/4, W/4) +
+
+
+ 🔧 + corr_stem — Conv3d 1×1: 32→28ch, then BasicConv 3D 3×3×3, then 2 × ResnetBasicBlock3D (28ch). Projects and refines the raw cost volume. +
+
+ 👁 + FeatureAtt — squeeze-excite style: global-avg-pool the 2D image features (128ch), FC → scale factors applied to cost volume channels. Modulates cost volume by image content at stride-4. +
+
+
+
Estimated compute share
+
+
~3%
+
+
+
+ + +
+
+
+
+
+
+
+
+
Block 4 — ⚠ Bottleneck
+
Hourglass Cost Aggregation
+
+ core/foundation_stereo.py · hourglass +
+
+ (B, 28, 48, H/4, W/4) + + (B, 28, 48, H/4, W/4) +
+
+ Input (28ch, D/4, H/4, W/4) + │ + ├─[Encoder]────────────────────────────────────────────── + │ conv1: stride-2 3D + Conv3dNormActReduced → 56ch D/8 H/8 W/8 + │ └─ FeatureAtt_8 (guided by x8 img feats, 192ch) + │ conv2: stride-2 3D + Conv3dNormActReduced → 112ch D/16 H/16 W/16 + │ └─ FeatureAtt_16 (guided by x16 img feats, 320ch) + │ conv3: stride-2 3D + Conv3dNormActReduced → 168ch D/32 H/32 W/32 + │ └─ FeatureAtt_32 (guided by x32 img feats, 304ch) + │ + ├─[Bottleneck Attention]───────────────────────────────── + │ conv_patch: 4×4×4 strided → (28ch, D/16, H/16, W/16) + │ CostVolumeDisparityAttention: + │ 4-head Transformer × 4 layers (in disparity dimension) + │ + sinusoidal positional embeddings + │ interpolate ×4 → (28ch, D/4, H/4, W/4) [residual add] + │ + ├─[Decoder]────────────────────────────────────────────── + │ conv3_up: deconv → 112ch + │ cat with enc skip → 224ch → agg_0 (2×Conv3dNormActReduced) → 112ch + │ └─ FeatureAtt_up_16 + │ conv2_up: deconv → 56ch + │ cat with enc skip → 112ch → agg_1 (2×Conv3dNormActReduced) → 56ch + │ └─ FeatureAtt_up_8 + │ conv1_up: deconv → 28ch + │ + bottleneck residual + │ conv_out: 2×Conv3dNormActReduced → 28ch + │ + Output (28ch, D/4, H/4, W/4)
+
+
Estimated compute share
+
+
~50–60% — dominant bottleneck due to 3D conv over 4D volume at multiple scales
+
+
+
+ + +
+
+
+
+
+
+
+
+
Block 5
+
Classifier + Initial Disparity
+
+ core/foundation_stereo.py · classifier +
+
+ (B, 28, 48, H/4, W/4) + + (B, 1, H/4, W/4) +
+
+
+ 🎯 + Classifier — BasicConv 3D: 28→14ch, ResnetBasicBlock3D, Conv3d 7×7×7: 14→1ch → logits (B, 48, H/4, W/4). +
+
+ 📉 + Soft-argmax — softmax over disparity dim, weighted sum Σ p(d) · d → smooth initial disparity estimate (used as seed for GRU iterations). +
+
+
+
Estimated compute share
+
+
~2%
+
+
+
+ + +
+
+
+
+
+
+
+
+
Block 6
+
Context Network + Attention
+
+ core/extractor.py · ContextNetSharedBackbone +
+
+ (B, 128, H/4, W/4) + + net (128ch) + inp (128ch) +
+
+
+ 🧵 + ContextNet — two parallel Conv2d 3×3 projections on x4: one for hidden state net (tanh), one for context inp (ReLU). Uses the same left features already computed in Block 1. +
+
+ 🔍 + CAM (Channel Attention Enhancement) — squeeze-excite on inp. SAM (Spatial Attention Extractor) — produces spatial attention map used by SelectiveConvGRU. Both are very lightweight. +
+
+
+
Estimated compute share
+
+
~1%
+
+
+
+ + +
+
+
+
+
+
+
+
+
Block 7
+
Geometry Encoding Volume
+
+ core/geometry.py · Combined_Geo_Encoding_Volume +
+
+ features_left/right[0] + cost vol + + 2-level pyramid (sampled per iter) +
+
+
+ 🗺 + Full correlation map — einsum dot-product between all left×right feature positions, computed on L2-normalised features: (B, H/4, W/4, 1, W/4). Built once.
+
+ 🔭 + 2-level pyramid — cost volume and correlation map both avg-pooled in disparity dim × 2 for coarse/fine sampling. At each GRU iter, 1D bilinear sampling at disp ± corr_radius=4 → 9 samples × 2 levels × 2 sources = feature vector. +
+
+
+
Estimated compute share (init + 8 lookups)
+
+
~5–8%
+
+
+
+ + +
+
+
+
+
+
+
+
+
Block 8 — Iterative (×8 default)
+
Selective GRU Update Block
+
+ core/update.py · BasicSelectiveMultiUpdateBlock +
+
+ net (128ch) + inp (128ch) + geo_feat + + Δdisp (1ch) + mask (32ch) +
+
+
+ 🏃 + BasicMotionEncoder — corr branch: Conv2d 1×1 + 3×3 → 256ch. Disp branch: Conv2d 7×7 + 3×3 → 64ch. Fuse: Conv2d 1×1 → 127ch + concat raw disp = 128ch motion features. +
+
+ 🔀 + SelectiveConvGRU — blends two ConvGRU cells: small_gru (1×1 conv, narrow RF) and large_gru (3×3 conv, wider RF), mixed by spatial attention att. Adaptive receptive field per pixel. +
+
+ 📏 + DispHead — Conv2d + ReLU, then 2 × EdgeNextConvEncoder (DWConv 7×7, expansion×4), Conv2d → Δdisp (1ch). Predicts the additive disparity update. +
+
+ 🧮 + Mask head — 2 × Conv2d → 32ch. Produces per-pixel upsampling weights used in Block 9. +
+
+
+
Estimated compute share (×8 iters total)
+
+
~10–15% total (cheap per iter, accumulates over iterations)
+
+
+
+ + +
+
+
+
+
+
+
+
+
Block 9
+
Disparity Upsampling
+
+ core/foundation_stereo.py · upsample_disp +
+
+ (B, 1, H/4, W/4) + + (B, 1, H, W) +
+
+
+ 🔼 + stem_2x — a 2-layer Conv2d branch (stem_2) produces 32ch features at H/2 × W/2 directly from the input image. Gives high-resolution texture cues. +
+
+ + Conv2x deconv (spx_2_gru) — upsamples mask_feat_4 (32ch, H/4) to H/2 and concatenates it with stem_2x (32ch, H/2) → 64ch at H/2. A ConvTranspose2d then produces 9ch at full resolution.
+
+ 🎨 + context_upsample — the 9 channels are per-pixel soft weights for a 3×3 neighbourhood of the disparity map. Weighted sum produces sharp, edge-aware upsampled disparity at full H × W. +
+
+
+
Estimated compute share
+
+
~1%
+
+
+
+ +
+ + +
+
+ Relative Processing Cost (480×640, max_disp=192, iters=8) +
+ +
+
⚠ Note on measurements
+ The training job finetune_faro.py is currently occupying the GPU (~14.9 GiB / 16 GiB), preventing live profiling. + Estimates below are derived from code analysis (op counts, tensor sizes, kernel types) and the partial CUDA kernel profiler output obtained before OOM. +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
# | Block | Type | Calls/forward | Relative cost | Visual
4 | Hourglass Cost Aggregation | 3D Conv enc-dec + Transformer | | 50–60% |
1 | Feature Extraction | 2D Conv (EdgeNeXt + FPN) | 1× (both images) | 10–15% |
2 | Cost Volume Construction | GWC + concat, O(D·H·W) | | 10–15% |
8 | GRU Update Block | 2D Conv + ConvGRU | 8× (one per iter) | 10–15% |
7 | Geo Encoding Volume lookup | Bilinear sampling | 8× (one per iter) | 5–8% |
3 | Cost Volume Stem | 3D Conv + ResNet | | ~3% |
5 | Classifier + Soft-argmax | 3D Conv (small) | | ~2% |
6 | Context Network + CAM/SAM | 2D Conv (tiny) | | ~1% |
9 | Disparity Upsampling | 2D ConvTranspose + softmax | 1× (test) / N× (train) | ~1% |
+
+ + +
+
🚀
+ Optimisation Targets +
+ +
+
Highest-impact targets for speed improvement
+
    +
  • Block 4 — Hourglass (50–60%): Reduce number of hourglass levels, decrease volume_dim (default 28), reduce Transformer depth (num_transformer=4), use half-precision 3D convolutions, or replace 3D convs with 2D factorized convs.
  • +
  • Block 2 — Cost Volume (10–15%): Reduce max_disp or use the Triton-optimised kernel (optimize_build_volume='triton'). Lower cv_group (default 8) reduces GWC channels.
  • +
  • Block 8 — GRU Iterations (10–15%): Reduce valid_iters from 8 to 4–6. Quality vs. speed tradeoff. Hierarchical inference (run_hierachical) can help for high-res inputs.
  • +
  • Block 1 — Feature Extraction (10–15%): Switch to a smaller EdgeNeXt variant or prune the FPN deconv layers. Already shared between left/right via batch trick.
  • +
+
+ +
+ +
+ FastFoundationStereo Architecture Report — generated 2026-03-24  |  Fast-FoundationStereo project +
+ + + diff --git a/niv/Utils.py b/niv/Utils.py new file mode 100644 index 0000000..1f12387 --- /dev/null +++ b/niv/Utils.py @@ -0,0 +1,88 @@ +import os, sys, torch, imageio, logging, importlib, argparse +import cv2 +import numpy as np +import yaml +try: + import open3d as o3d +except: + o3d = None + +AMP_DTYPE = torch.float16 + +def set_logging_format(level=logging.INFO): + importlib.reload(logging) + FORMAT = '%(message)s' + logging.basicConfig(level=level, format=FORMAT, datefmt='%m-%d|%H:%M:%S') + + +def set_seed(random_seed): + import torch,random + np.random.seed(random_seed) + random.seed(random_seed) + torch.manual_seed(random_seed) + torch.cuda.manual_seed_all(random_seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def toOpen3dCloud(points,colors=None,normals=None): + cloud = o3d.geometry.PointCloud() + cloud.points = o3d.utility.Vector3dVector(points.astype(np.float64)) + if colors is not None: + if colors.max()>1: + colors = colors/255.0 + cloud.colors = o3d.utility.Vector3dVector(colors.astype(np.float64)) + if normals is not None: + cloud.normals = o3d.utility.Vector3dVector(normals.astype(np.float64)) + return cloud + + + +def depth2xyzmap(depth:np.ndarray, K, uvs:np.ndarray=None, zmin=0.1): + invalid_mask = (depth thres is invalid + """ + disp = disp.copy() + H,W = disp.shape[:2] + invalid_mask = disp>=invalid_thres + if (invalid_mask==0).sum()==0: + other_output['min_val'] = None + other_output['max_val'] = None + return np.zeros((H,W,3)) + if min_val is None: + min_val = disp[invalid_mask==0].min() + if max_val is None: + max_val = disp[invalid_mask==0].max() + other_output['min_val'] = min_val + other_output['max_val'] = max_val + vis = ((disp-min_val)/(max_val-min_val)).clip(0,1) * 255 + if cmap is None: + vis = cv2.applyColorMap(vis.clip(0, 255).astype(np.uint8), color_map)[...,::-1] + else: + vis = cmap(vis.astype(np.uint8))[...,:3]*255 + if invalid_mask.any(): + vis[invalid_mask] = 0 + return 
def depth_mm_to_prior(depth_mm, focal_px, baseline_m):
    """Turn a depth map in millimetres into a disparity prior and a validity mask.

    The input is used at its own resolution: no cropping or resizing happens
    here, so both outputs have shape (1, 1, H, W) for an (H, W) input.

    Args:
        depth_mm: (H, W) array of depth values in millimetres; values <= 0
            are treated as "no measurement".
        focal_px: focal length in pixels.
        baseline_m: stereo baseline in metres.

    Returns:
        Tuple ``(rs_disp, conf)`` of float32 arrays shaped (1, 1, H, W).
        ``rs_disp`` holds ``focal_px * baseline_m / depth_in_metres`` where
        the depth is valid and 0 elsewhere; ``conf`` is 1.0 where the depth
        is valid and 0.0 elsewhere.
    """
    depth_m = depth_mm / 1000.0
    has_depth = depth_m > 0.0
    # Guard the division so invalid (zero) pixels do not emit warnings; their
    # result is discarded by np.where anyway.
    safe_depth_m = np.maximum(depth_m, 1e-6)
    disparity = np.where(has_depth, focal_px * baseline_m / safe_depth_m, 0.0)
    rs_disp = disparity.astype(np.float32)
    conf = has_depth.astype(np.float32)
    # Prepend batch and channel axes.
    return rs_disp[np.newaxis, np.newaxis], conf[np.newaxis, np.newaxis]
+ Where RS sensor has data (conf > 0), the initial disparity is taken + directly from the sensor. Where there are holes, it falls back to + a coarse cost-volume regression. + + has_sensor = conf_q4 > 0 + disp_init = has_sensor * rs_q4 + (1 - has_sensor) * coarse_disp + + INJECTION POINT 2 — GRU motion encoder (every iteration) + At each of the num_iters GRU steps, the prior is concatenated into + the motion encoder alongside the current disparity and correlation: + + motion = motion_encoder(current_disp, corr, prior) + + Inside MotionEncoder: + p = prior_net(prior) # 2-ch → prior_w-ch + fused = relu(fuse(cat([corr, disp, p]))) + motion = cat([fused, disp], dim=1) + + The prior is therefore not a one-time warm-start — it acts as a + persistent guidance channel at every refinement step. + """ + with torch.no_grad(): + B, _, H, W = left_t.shape + Hq, Wq = H // 4, W // 4 + + # Feature extraction and cost volume (no prior here) + feat_left, feat_right = model._extract(left_t, right_t) + + gwc_volume = build_gwc_volume(feat_left, feat_right, model.max_disp_q4, model.num_groups) + + # Downsample prior to GRU resolution (1/4), scale disparity accordingly + rs_q4 = F.interpolate(rs_disp_t, (Hq, Wq), mode='bilinear', align_corners=False) * 0.25 + conf_q4 = F.interpolate(conf_t, (Hq, Wq), mode='bilinear', align_corners=False) + prior = torch.cat([rs_q4, conf_q4], dim=1) # (B, 2, Hq, Wq) + + # ── INJECTION POINT 1: sensor-gated disparity initialisation ────────── + coarse_disp = model.coarse_head(gwc_volume) + has_sensor = (conf_q4 > 0).float() + disp_init = has_sensor * rs_q4 + (1.0 - has_sensor) * coarse_disp + + sensor_pct = has_sensor.mean().item() * 100 + print(f"\n [INJECTION 1] disparity initialisation") + print(f" RS-seeded pixels : {sensor_pct:.1f}% (conf > 0 → init from sensor)") + print(f" cost-vol pixels : {100-sensor_pct:.1f}% (holes → init from coarse regression)") + print(f" init disp range : [{disp_init.min():.2f}, {disp_init.max():.2f}] px (at 1/4 res)") + + # 
Context features (no prior) + net, inp, att = model.context_net(feat_left) + + # ── INJECTION POINT 2: prior injected into motion encoder every step ── + print(f"\n [INJECTION 2] GRU iterations — prior in motion encoder each step") + print(f" prior shape fed to motion_encoder: {tuple(prior.shape)} (rs_q4 + conf_q4)") + disp = disp_init + disp_preds = [] + for i in range(num_iters): + disp = disp.detach() + corr = model.corr_lookup(gwc_volume, disp) + motion = model.motion_encoder(disp, corr, prior) # ← prior here + net = model.gru(net, torch.cat([motion, inp * att], dim=1)) + delta = model.disp_head(net) + disp = disp + delta + pred = model.upsample(net, disp) + disp_preds.append(pred) + print(f" iter {i+1}/{num_iters}: delta_mean={delta.abs().mean():.4f} px") + + final = disp_preds[-1] + print(f"\n final disparity: {tuple(final.shape)} " + f"range [{final.min():.2f}, {final.max():.2f}] px") + return final + + +# ============================================================================= +# Output saving +# ============================================================================= + +def _save_output(out_dir, disp, focal_px, baseline_m): + + + os.makedirs(out_dir, exist_ok=True) + d = disp[0, 0].cpu().numpy() + valid = d > 0.5 + depth_m = np.zeros_like(d) + depth_m[valid] = focal_px * baseline_m / np.maximum(d[valid], 1e-4) + + depth_mm = (depth_m * 1000).clip(0, 65535).astype(np.uint16) + cv2.imwrite(os.path.join(out_dir, "depth.png"), depth_mm) + + fig, ax = plt.subplots(figsize=(8, 5)) + vmax = np.percentile(depth_m[valid], 95) if valid.any() else 5.0 + im = ax.imshow(depth_m, cmap="plasma", vmin=0, vmax=vmax) + ax.set_title("D1-EdgeNeXt-FFS depth (real RS prior)", fontsize=11) + ax.axis("off") + fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="depth (m)") + fig.tight_layout() + fig.savefig(os.path.join(out_dir, "depth_colorized.png"), dpi=120) + plt.close(fig) + + print(f"\n Saved → {out_dir}/depth.png (uint16 mm)") + print(f" → 
{out_dir}/depth_colorized.png") + + +# ============================================================================= +# Main +# ============================================================================= + +def main(): + p = argparse.ArgumentParser( + description="D1-EdgeNeXt-FFS: RS prior injection point explainer") + p.add_argument("--pt", default = f'{_FFS_ROOT}/weights/weights_niv/stage3_best.pt', help="Path to stage3_best.pt") + p.add_argument("--ffs-ckpt", default = f'{_FFS_ROOT}/weights/weights_niv/model_best_bp2_serialize.pth', help="FFS backbone weights (default: weights/model_best_bp2_serialize.pth)") + p.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + p.add_argument("--left", metavar="FILE", help="Left IR/RGB image (PNG or FARO .mat)") + p.add_argument("--right", metavar="FILE", help="Right IR/RGB image") + p.add_argument("--depth", metavar="FILE", help="Hardware depth: PNG uint16 mm or FARO .mat") + p.add_argument("--focal", type=float, default=420.0, metavar="PX") + p.add_argument("--baseline", type=float, default=0.05, metavar="M") + p.add_argument("--output-dir", default = f'{_FFS_ROOT}/demo_data_out') + args = p.parse_args() + + #device = torch.device(args.device) + model_path = args.ffs_ckpt #or os.path.join(_FFS_ROOT, "weights", "weights_niv","model_best_bp2_serialize.pth") + + # Load model + model = build_d1_model("edgenext", max_disp=192, num_iters=8, ffs_ckpt=model_path) + ck = torch.load(args.pt, map_location="cpu", weights_only=False) + model.load_state_dict(ck.get("model", ck), strict=False) + model.eval().to(device) + total = sum(p.numel() for p in model.parameters()) / 1e6 + frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad) / 1e6 + print(f"Model: {total:.1f}M params ({frozen:.1f}M frozen backbone, {total-frozen:.1f}M trained)") + + # Input data + if args.left and args.right: + left_bgr = _load_image(args.left) + right_bgr = _load_image(args.right) + focal_px, baseline_m = 
args.focal, args.baseline + stream_h, stream_w = left_bgr.shape[:2] + depth_mm = _load_depth_mm(args.depth) if args.depth else np.zeros((stream_h, stream_w), dtype=np.uint16) + print(f"Input: {stream_w}×{stream_h} focal={focal_px:.1f}px baseline={baseline_m*1000:.1f}mm " + f"depth valid={100*(depth_mm>0).mean():.1f}%") + else: + _sample_dir = os.path.join(_FFS_ROOT, "demo_data") + _sample_l = os.path.join(_sample_dir, "imageL_d16_000.png") + _sample_r = os.path.join(_sample_dir, "imageR_d16_000.png") + _sample_d = os.path.join(_sample_dir, "imageD_d16_000.png") + _sample_cam = os.path.join(_sample_dir, "camera.txt") + if os.path.isfile(_sample_l) and os.path.isfile(_sample_r): + left_bgr = _load_image(_sample_l) + right_bgr = _load_image(_sample_r) + depth_mm = _load_depth_mm(_sample_d) if os.path.isfile(_sample_d) else np.zeros(left_bgr.shape[:2], dtype=np.uint16) + focal_px, baseline_m = 420.0, 0.05 + if os.path.isfile(_sample_cam): + for line in open(_sample_cam): + if line.startswith("focal_px="): focal_px = float(line.split("=")[1]) + elif line.startswith("baseline_m="): baseline_m = float(line.split("=")[1]) + stream_h, stream_w = left_bgr.shape[:2] + if args.output_dir is None: + args.output_dir = os.path.join(_FFS_ROOT, "demo_data_out") + print(f"Using bundled sample images ({stream_w}×{stream_h})") + else: + raise SystemExit("Provide --left / --right, or ensure sample/ directory is present.") + + # Depth mm → prior tensors + rs_disp_np, conf_np = depth_mm_to_prior(depth_mm, focal_px, baseline_m) + rs_disp_t = torch.from_numpy(rs_disp_np).to(device) + conf_t = torch.from_numpy(conf_np).to(device) + + left_t = to_tensor(left_bgr) + right_t = to_tensor(right_bgr) + + print("\n" + "=" * 55) + print("RS prior injection points") + #print("=" * 55) + final = annotated_forward(model, left_t, right_t, rs_disp_t, conf_t, num_iters=8) + + if args.output_dir: + _save_output(args.output_dir, final, focal_px, baseline_m) + + +if __name__ == "__main__": + #python3 
class ImageClient:
    """Blocking TCP client for the image server.

    Each call to :meth:`process` JPEG-encodes one frame, sends it with the
    length-prefixed framing of ``send_msg`` and decodes the server's reply
    as a grayscale image. Usable as a context manager so the socket is
    always closed.
    """

    def __init__(self, host: str, port: int) -> None:
        """Connect to ``host:port``.

        Raises:
            OSError: if the connection cannot be established. The socket is
                closed before the error propagates so no descriptor leaks.
        """
        self._sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self._sock.connect((host, port))
        except OSError:
            self._sock.close()
            raise

    def process(self, img: np.ndarray) -> np.ndarray:
        """Send ``img`` to the server and return the decoded grayscale reply.

        Raises:
            RuntimeError: if the frame cannot be JPEG-encoded, the server
                closes the connection, or the reply is not a decodable image.
        """
        ok, jpeg = cv2.imencode('.jpg', img)
        # imencode reports failure through its first return value; sending
        # an undefined buffer would only fail later on the server side.
        if not ok:
            raise RuntimeError('Image could not be encoded as JPEG')
        send_msg(self._sock, jpeg.tobytes())
        data = recv_msg(self._sock)
        if data is None:
            raise RuntimeError('Server closed the connection unexpectedly')
        gray = cv2.imdecode(np.frombuffer(data, dtype=np.uint8), cv2.IMREAD_GRAYSCALE)
        if gray is None:
            raise RuntimeError('Server response could not be decoded as an image')
        return gray

    def close(self) -> None:
        """Close the underlying socket."""
        self._sock.close()

    def __enter__(self) -> 'ImageClient':
        return self

    def __exit__(self, *_) -> None:
        self.close()
logging.getLogger(__name__) + + +def send_msg(sock: socket.socket, data: bytes) -> None: + header = struct.pack('>I', len(data)) + sock.sendall(header + data) + + +def recv_msg(sock: socket.socket) -> bytes | None: + header = _recv_exact(sock, 4) + if header is None: + return None + length = struct.unpack('>I', header)[0] + return _recv_exact(sock, length) + + +def _recv_exact(sock: socket.socket, n: int) -> bytes | None: + buf = b'' + while len(buf) < n: + chunk = sock.recv(n - len(buf)) + if not chunk: + # Note: returning None here cannot distinguish a clean close from a + # truncated frame mid-read. Callers should treat None as + # "connection gone" and terminate the session. + return None + buf += chunk + return buf + + +class ImageHandler(socketserver.BaseRequestHandler): + def handle(self) -> None: + logger.info('Connection from %s', self.client_address) + try: + while True: + data = recv_msg(self.request) + if data is None: + break + img = cv2.imdecode(np.frombuffer(data, dtype=np.uint8), cv2.IMREAD_COLOR) + if img is None: + logger.warning('Could not decode image, closing connection') + break + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + _, jpeg = cv2.imencode('.jpg', gray) + send_msg(self.request, jpeg.tobytes()) + except (ConnectionResetError, BrokenPipeError, OSError) as exc: + logger.warning('Connection error: %s', exc) + finally: + logger.info('Connection closed: %s', self.client_address) + + +def main() -> None: + host, port = '0.0.0.0', 9999 + socketserver.ThreadingTCPServer.allow_reuse_address = True + with socketserver.ThreadingTCPServer((host, port), ImageHandler) as server: + logger.info('Listening on %s:%d', host, port) + try: + server.serve_forever() + except KeyboardInterrupt: + logger.info('Shutting down') + + +if __name__ == '__main__': + main() diff --git a/niv/requirements.txt b/niv/requirements.txt new file mode 100644 index 0000000..9db4ce5 --- /dev/null +++ b/niv/requirements.txt @@ -0,0 +1,7 @@ +torch>=2.0 # install from 
https://pytorch.org with your CUDA version +numpy +timm +opencv-python +pyyaml +matplotlib +scipy # only needed if passing .mat files via --left/--right/--depth diff --git a/niv/run_fast_foundation_niv_with_rs.py b/niv/run_fast_foundation_niv_with_rs.py new file mode 100644 index 0000000..0161a4e --- /dev/null +++ b/niv/run_fast_foundation_niv_with_rs.py @@ -0,0 +1,875 @@ + +''' +python scripts/run_fast_foundation_with_rs.py - github + +Environment: + ffs - GPU laptop + +Installation: + python -m venv ./envs/ffs + source ./envs/ffs/bin/activate + pip install torch==2.6.0 torchvision==0.21.0 xformers --index-url https://download.pytorch.org/whl/cu124 + cd Fast-FoundationStereo + pip install -r requirements.txt + pip install pyrealsense2 + download weights : https://drive.google.com/drive/folders/1HuTt7UIp7gQsMiDvJwVuWmKpvFzIIMap + + python scripts/run_demo.py --model_dir weights/20-30-48/model_best_bp2_serialize.pth --left_file demo_data/left.png --right_file demo_data/right.png --intrinsic_file assets/K.txt --out_dir output/ --remove_invisible 0 --denoise_cloud 1 --scale 1 --get_pc 1 --valid_iters 8 --max_disp 192 --zfar 100 + + +''' + + + + +import numpy as np +from argparse import ArgumentParser + +import os,sys +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +from omegaconf import OmegaConf +from core.utils.utils import InputPadder +import argparse, torch, logging, yaml +import numpy as np +from Utils import ( + AMP_DTYPE, set_logging_format, set_seed, vis_disparity, + depth2xyzmap, toOpen3dCloud, o3d, +) +import cv2 +#from faro_data_manager import DataSource +from scripts.data_manager_faro_laptop import DataSource +from scripts.opencv_realsense_camera import RealSense + + +def process_arguments(): + parser = ArgumentParser() + code_dir = os.path.dirname(os.path.realpath(__file__)) + + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', 
def foundation_stereo_algo_init(args):
    """Initialise the stereo model and return it ready for inference.

    Sets up logging and seeding, disables autograd globally, recreates
    ``args.out_dir`` empty, merges the command-line arguments over the
    ``cfg.yaml`` stored next to the model file, and loads the model onto
    the GPU in eval mode.

    Args:
        args: parsed argument namespace; must provide ``out_dir``,
            ``model_dir``, ``valid_iters`` and ``max_disp``.

    Returns:
        The loaded model, moved to CUDA and switched to eval mode.

    Raises:
        OSError: if the output directory cannot be removed or created, or
            ``cfg.yaml`` cannot be read.
    """
    import shutil

    set_logging_format()
    set_seed(0)
    torch.autograd.set_grad_enabled(False)

    # Recreate the output directory without going through a shell: the old
    # `rm -rf {out_dir} && mkdir -p {out_dir}` string broke on paths with
    # spaces and executed shell metacharacters contained in the path.
    out_dir = args.out_dir
    if os.path.isdir(out_dir) and not os.path.islink(out_dir):
        shutil.rmtree(out_dir)
    elif os.path.lexists(out_dir):
        os.remove(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    with open(f'{os.path.dirname(args.model_dir)}/cfg.yaml', 'r') as ff:
        cfg: dict = yaml.safe_load(ff)
    # Command-line values that are set take precedence over the stored config.
    for key, value in vars(args).items():
        if value is not None:
            cfg[key] = value
    args = OmegaConf.create(cfg)
    logging.info(f"args:\n{args}")

    # NOTE(security): weights_only=False unpickles arbitrary Python objects;
    # only load checkpoints from a trusted source.
    model = torch.load(args.model_dir, map_location='cpu', weights_only=False)
    model.args.valid_iters = args.valid_iters
    model.args.max_disp = args.max_disp

    model.cuda().eval()
    return model
def convert_disparity_to_depth(BF, disparity):
    """Convert a disparity map to a ``uint16`` depth map via ``BF / disparity``.

    Pixels whose disparity is not greater than 0.1 (including NaN) are
    treated as invalid and get depth 0.  Depths that exceed the ``uint16``
    range are saturated at 65535 instead of wrapping around, so a very far
    pixel can no longer masquerade as a near one.

    Args:
        BF: scalar baseline-times-focal-length factor; the unit of the
            returned depth follows the unit of this value.
        disparity: numpy array of disparities.

    Returns:
        ``np.uint16`` array with the same shape as ``disparity``; fractional
        depths are truncated by the integer cast.
    """
    disparity = disparity.astype(np.float32)
    depth = np.zeros_like(disparity)
    disparity_valid = disparity > 0.1
    depth[disparity_valid] = BF / disparity[disparity_valid]
    # Saturate before the cast: float -> uint16 conversion of out-of-range
    # values wraps (or is undefined), producing bogus small depths.
    np.clip(depth, 0, np.iinfo(np.uint16).max, out=depth)
    return depth.astype(np.uint16)
def preprocess(frame, scale_factor=0.5, crop_size=(512, 384)):
    """Split a 3-plane frame into left/right/depth images and center-crop them.

    The frame is converted to ``float32``; plane 0 is returned as the left
    image, plane 1 as the right image and plane 2 as the depth image.  The
    crop window is centered and is clamped to the frame size, so a frame
    smaller than ``crop_size`` is returned uncropped along that axis instead
    of yielding a misplaced sliver (negative slice offsets).

    Args:
        frame: array of shape (H, W, C) with at least three planes.
        scale_factor: accepted for interface compatibility; no resizing is
            currently performed, so the value is ignored.
        crop_size: requested crop as ``(width, height)``.

    Returns:
        Tuple ``(imgL, imgR, imgD)`` of ``float32`` arrays, each of shape
        ``(min(height, H), min(width, W))``.
    """
    frame = frame.astype(np.float32)

    H, W = frame.shape[:2]
    crop_w, crop_h = crop_size
    # Never request more pixels than the frame has.
    crop_w = min(crop_w, W)
    crop_h = min(crop_h, H)

    y0 = (H - crop_h) // 2
    x0 = (W - crop_w) // 2
    window = frame[y0:y0 + crop_h, x0:x0 + crop_w]

    imgL = window[:, :, 0]
    imgR = window[:, :, 1]
    imgD = window[:, :, 2]
    return imgL, imgR, imgD
def show_point_cloud(args, disp, img0_ori):
    """Build a colored point cloud from a disparity map and display it.

    Does nothing when ``args.get_pc`` is falsy.  Otherwise the intrinsics
    file is read (first line: nine numbers reshaped to a 3x3 matrix, second
    line: baseline), the first two rows of the matrix are multiplied by
    ``args.scale``, depth is computed as ``K[0, 0] * baseline / disp`` and
    saved to ``{args.out_dir}/depth_meter.npy``.  Points with ``z <= 0`` or
    ``z > args.zfar`` are dropped, an optional radius-outlier filter is
    applied, and the result is shown in a blocking Open3D window.

    Args:
        args: namespace with ``get_pc``, ``scale``, ``intrinsic_file``,
            ``out_dir``, ``zfar``, ``denoise_cloud``, ``denoise_nb_points``
            and ``denoise_radius``.
        disp: disparity array; zero entries produce infinite depth.
        img0_ori: image whose pixels (reshaped to ``(-1, 3)``) color the points.
    """
    if not args.get_pc:
        return

    with open(args.intrinsic_file, 'r') as f:
        lines = f.readlines()
    K = np.array(list(map(float, lines[0].rstrip().split()))).astype(np.float32).reshape(3, 3)
    baseline = float(lines[1])
    K[:2] *= args.scale

    depth = K[0, 0] * baseline / disp
    np.save(f'{args.out_dir}/depth_meter.npy', depth)

    xyz_map = depth2xyzmap(depth, K)
    pcd = toOpen3dCloud(xyz_map.reshape(-1, 3), img0_ori.reshape(-1, 3))
    z = np.asarray(pcd.points)[:, 2]
    keep_ids = np.flatnonzero((z > 0) & (z <= args.zfar))
    pcd = pcd.select_by_index(keep_ids)

    if args.denoise_cloud:
        logging.info("[Optional step] denoise point cloud...")
        _, inlier_ids = pcd.remove_radius_outlier(nb_points=args.denoise_nb_points,
                                                  radius=args.denoise_radius)
        pcd = pcd.select_by_index(inlier_ids)

    points = np.asarray(pcd.points)
    if len(points) == 0:
        # argmin() on an empty array raises; there is nothing to display anyway.
        logging.warning("Point cloud is empty after filtering; nothing to visualize.")
        return

    logging.info("Visualizing point cloud. Press ESC to exit.")
    vis = o3d.visualization.Visualizer()
    vis.create_window()
    try:
        vis.add_geometry(pcd)
        render_option = vis.get_render_option()
        render_option.point_size = 1.0
        render_option.background_color = np.array([0.5, 0.5, 0.5])
        ctr = vis.get_view_control()
        ctr.set_front([0, 0, -1])
        # Aim the camera at the point with the smallest z value.
        nearest_idx = points[:, 2].argmin()
        ctr.set_lookat(points[nearest_idx])
        ctr.set_up([0, -1, 0])
        vis.run()
    finally:
        # Close the window even if the viewer setup fails.
        vis.destroy_window()
def test_video_error_rs_versus_foundation():
    """Stream camera frames and display the RS-vs-FS depth error.

    For every frame the left/right images go through the stereo model, the
    disparity is converted to depth, and the absolute difference to the
    camera's own depth image is shown next to both depth maps and the input
    pair.  The loop ends when a frame cannot be read or when any display
    window reports the 'q' key.  The camera is released on every exit path,
    including exceptions raised while processing.
    """
    d = RealSense(mode='d16', use_ir=True, frame_size=(1280, 720))
    try:
        camera_bf = d.get_bf()  # factor passed to convert_disparity_to_depth
        args = process_arguments()

        # init
        model = foundation_stereo_algo_init(args)

        ret = False
        while not ret:
            # frame carries left image, right image and depth planes
            retf, frame = d.read()
            if retf is False:
                print('is your camera open/connected?')
                break

            # extract
            img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0)

            t_start = time.time()
            img_disparity = foundation_stereo_algo(args, model, img_left, img_right)
            img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity)
            depth_error = depth_fs_rs_error(img_depth_rs, img_depth_fs)
            print(f'Proces time : {time.time()-t_start} sec')

            # show
            ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS')
            ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS')
            ret3 = show_images_depth(depth_error, None, fig_name='Depth Error', vmax=100)
            ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8),
                                     fig_name='Images L-R')
            ret = ret1 or ret2 or ret3 or ret4
    finally:
        d.release()
+ vis.add_geometry(pcd) + + ret = False + while not ret: + # frame is I1,I2, D data + retf, frame = d.read() + if retf is False: + print('is your camera open/connected?') + break + + # extract + img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0) + + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + print(f'Disp : {img_disparity.min()} - {img_disparity.max()}') + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + print(f'Depth : {img_depth_fs.min()} - {img_depth_fs.max()}') + depth_error = depth_fs_rs_error(img_depth_rs, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # massage + #cmap, min_val, max_val = None, None, None + #img_depth_fs = vis_disparity(img_disparity, min_val=min_val, max_val=max_val, cmap=cmap, color_map=cv2.COLORMAP_TURBO) + #xyz_map, pts = depth2xyzmap(img_depth_fs, K) + pts, us, vs = depth2xyz(img_depth_fs, K, us, vs, zmin=100) + clr = np.zeros_like(pts) + clr[:,0] = clr[:,1] = clr[:,2] = img_left.flatten() + + # show point cloud + #pcd.points.extend(pts) + + #vis.remove_geometry(pcd) + #pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(pts) + #pcd.colors = o3d.utility.Vector3dVector(clr) + downpcd = pcd.voxel_down_sample(voxel_size=0.1) + #vis.add_geometry(pcd) + + # 3) other iterables, e.g + # pcd.points.extend(np.random.rand(n_new, 3).tolist()) + + vis.update_geometry(downpcd) + + keep_running = vis.poll_events() + if not keep_running: break + vis.update_renderer() + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS') + ret3 = show_images_depth(depth_error, None, fig_name='Depth Error') + ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 or ret4 + + d.release() + vis.destroy_window() + +def test_video_stream_rs_versus_foundation_x2(): + 
def merge_fs_rs(depth_rs, depth_fs):
    """Fill horizontal holes in ``depth_rs`` with values from ``depth_fs``.

    A pixel of ``depth_rs`` is valid when its value is greater than 1.  For
    every row, each run of invalid pixels that is bounded by a valid pixel
    ``s`` on the left and a valid pixel ``f`` on the right is filled from
    ``depth_fs`` when ``depth_fs`` itself is continuous across the hole,
    i.e. ``|fs[s] - fs[f]| < 0.1 * fs[s]``.  Holes touching the left or right
    image border are left untouched.

    Fixes over the previous version: every hole is matched with the first
    valid pixel to its right (not always the first one in the row), the
    continuity test is done in floating point so unsigned depths cannot wrap,
    only the invalid pixels are overwritten, and the per-hole debug print is
    gone.

    Args:
        depth_rs: 2-D depth array containing holes (values <= 1).
        depth_fs: 2-D depth array of the same shape used as the fill source.

    Returns:
        A copy of ``depth_rs`` with the accepted holes filled; the inputs are
        not modified.
    """
    depth_rs_out = depth_rs.copy()
    valid_rs = depth_rs > 1
    # Float copy for the continuity test: uint16 subtraction would wrap.
    fs = depth_fs.astype(np.float64)

    # start: pixel k is valid and k + 1 is not (last valid pixel before a hole)
    start_rs = np.zeros_like(valid_rs)
    start_rs[:, :-1] = valid_rs[:, :-1] & ~valid_rs[:, 1:]
    # stop: pixel k - 1 is not valid and k is (first valid pixel after a hole)
    stop_rs = np.zeros_like(valid_rs)
    stop_rs[:, 1:] = ~valid_rs[:, :-1] & valid_rs[:, 1:]

    for r in range(depth_rs.shape[0]):
        start_ind = np.flatnonzero(start_rs[r])
        stop_ind = np.flatnonzero(stop_rs[r])
        if start_ind.size == 0 or stop_ind.size == 0:
            continue

        # Position of the first stop strictly to the right of each start.
        next_stop = np.searchsorted(stop_ind, start_ind, side='right')
        for s, pos in zip(start_ind, next_stop):
            if pos >= stop_ind.size:
                continue  # hole runs to the right border
            f = stop_ind[pos]
            if abs(fs[r, s] - fs[r, f]) < 0.1 * fs[r, s]:
                depth_rs_out[r, s + 1:f] = depth_fs[r, s + 1:f]

    return depth_rs_out
show_images_depth(img_depth_merge, None, fig_name='Depth Merge', vmax = 100) + ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 or ret4 + + d.save_image(img_depth_rs,fname='depth_rs') + d.save_image(img_depth_fs,fname='depth_fs') + d.save_image(img_left,fname='img_left') + d.save_image(img_right,fname='img_right') + # plt.show() + + d.release() + +def test_faro_rs_fs_error(): + "reading data files from FARO and comparing the results" + + #d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720)) + d = DataSource() + img_num = d.init_directory() + camera_bf = d.get_bf() # for depth conversion (should be in meter) + + args = process_arguments() + args.scale = 0.5 + args.model_dir = '/home/administrato/dev/Fast-FoundationStereo/weights/20-30-48/model_finetuned_faro.pth' + args.model_dir = '/home/administrato/dev/Fast-FoundationStereo/weights/20-30-48/model_finetuned_faro.pth' + + # init + model = foundation_stereo_algo_init(args) + + + img_index = np.random.randint(0,img_num,8) + for k in img_index: + + # frame is I1,I2, D data + out_data = d.get_item(k, debug = True) + #out_data = d.load_specific_files(debug = True) + #print(out_data) + + # extract + img_left, img_right, img_depth_rs, img_depth_faro = out_data["img_left"], out_data["img_right"], out_data["img_depth_rs"], out_data["img_depth_faro"] + print(img_left.shape, img_right.shape) + + # process + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + img_error_fs = depth_fs_rs_error(img_depth_rs, img_depth_fs) + img_error_faro = depth_fs_rs_error(img_depth_faro, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS', vmax = 400) + ret3 = 
def d1_stereo_algo(model, left_bgr, right_bgr, depth_mm, focal_px, baseline_m, num_iters=8):
    """Run one D1-EdgeNeXt-FFS inference pass seeded with a sensor depth prior.

    The sensor depth is turned into a disparity/confidence prior by
    ``depth_mm_to_prior``.  Both images are converted to ``1x3xHxW`` float
    tensors (channel order reversed, scaled by 1/255) on the GPU.  At quarter
    resolution the initial disparity takes the prior wherever its
    confidence is positive and the model's coarse head elsewhere; it is then
    refined ``num_iters`` times by the GRU and finally upsampled.

    Args:
        model: network exposing ``_extract``, ``coarse_head``, ``context_net``,
            ``corr_lookup``, ``motion_encoder``, ``gru``, ``disp_head``,
            ``upsample``, ``max_disp_q4`` and ``num_groups``.
        left_bgr, right_bgr: images as ``(H, W)`` or ``(H, W, C)`` arrays.
        depth_mm, focal_px, baseline_m: forwarded unchanged to
            ``depth_mm_to_prior``.
        num_iters: number of refinement iterations.

    Returns:
        Numpy array taken from batch 0, channel 0 of the upsampled disparity.
    """
    def _image_to_batch(image):
        # Grayscale input: replicate the single plane into three channels.
        if image.ndim == 2:
            image = np.tile(image[..., None], (1, 1, 3))
        reversed_channels = image[..., ::-1].astype(np.float32) / 255.0
        chw = np.ascontiguousarray(reversed_channels.transpose(2, 0, 1))
        return torch.from_numpy(chw).unsqueeze(0).cuda()

    prior_disp_np, prior_conf_np = depth_mm_to_prior(depth_mm, focal_px, baseline_m)
    prior_disp = torch.from_numpy(prior_disp_np).cuda()
    prior_conf = torch.from_numpy(prior_conf_np).cuda()

    left = _image_to_batch(left_bgr)
    right = _image_to_batch(right_bgr)

    with torch.no_grad():
        height, width = left.shape[2], left.shape[3]
        quarter_size = (height // 4, width // 4)

        feat_left, feat_right = model._extract(left, right)
        volume = _build_gwc_volume(feat_left, feat_right, model.max_disp_q4, model.num_groups)

        # Disparity values shrink by the same factor as the resolution.
        prior_disp_q4 = _F.interpolate(prior_disp, quarter_size, mode='bilinear',
                                       align_corners=False) * 0.25
        prior_conf_q4 = _F.interpolate(prior_conf, quarter_size, mode='bilinear',
                                       align_corners=False)
        prior = torch.cat([prior_disp_q4, prior_conf_q4], dim=1)

        coarse = model.coarse_head(volume)
        sensor_mask = (prior_conf_q4 > 0).float()
        disp = sensor_mask * prior_disp_q4 + (1.0 - sensor_mask) * coarse

        hidden, context, attention = model.context_net(feat_left)
        gated_context = None
        for _ in range(num_iters):
            disp = disp.detach()
            corr = model.corr_lookup(volume, disp)
            motion = model.motion_encoder(disp, corr, prior)
            gated_context = context * attention
            hidden = model.gru(hidden, torch.cat([motion, gated_context], dim=1))
            disp = disp + model.disp_head(hidden)

        upsampled = model.upsample(hidden, disp)

    return upsampled[0, 0].cpu().numpy()
+ focal_px, + baseline_m, + ) + img_depth_d1 = convert_disparity_to_depth(camera_bf, img_disparity) + print(f'Process time: {time.time() - t_start:.3f}s') + + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_d1, None, fig_name='Depth D1-FFS') + ret3 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 + + d.release() + + +if __name__ == '__main__': + #main() + #test_video_stream_rs_and_foundation() # ok + #test_video_error_rs_versus_foundation() # ok + # test_point_cloud_rt() # ok + #test_point_cloud_rs_versus_foundation() + #test_video_stream_rs_versus_foundation_x2() # ok + #test_video_stream_rs_fs_merge() + #test_faro_rs_fs_error() + test_video_stream_rs_d1() + + + + + + + + + \ No newline at end of file diff --git a/niv/smart_lite/__init__.py b/niv/smart_lite/__init__.py new file mode 100644 index 0000000..d4364e7 --- /dev/null +++ b/niv/smart_lite/__init__.py @@ -0,0 +1 @@ +from .model import SmartLiteStereo diff --git a/niv/smart_lite/ablation/__init__.py b/niv/smart_lite/ablation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/niv/smart_lite/ablation/d1_models.py b/niv/smart_lite/ablation/d1_models.py new file mode 100644 index 0000000..f7a39a5 --- /dev/null +++ b/niv/smart_lite/ablation/d1_models.py @@ -0,0 +1,495 @@ +""" +Direction-1 ablation models: SmartLite-L base, one FFS component swapped in. + +Each class replaces exactly one sub-system with its FFS counterpart so we +can measure the per-component accuracy delta. 
def _load_ffs(ffs_ckpt: str, device='cpu'):
    """Load a FastFoundationStereo checkpoint and return it frozen on *device*.

    Two checkpoint layouts are handled:

    - an object exposing ``state_dict`` (a serialized model) is used as is;
    - anything else is treated as a mapping holding the weights under
      ``'model'`` or ``'state_dict'`` (or being the weights itself).  A fresh
      ``FastFoundationStereo`` is built from the ``cfg.yaml`` next to the
      checkpoint and only entries whose key and shape match the new model
      are loaded.

    Returns:
        ``(ffs, cfg)`` where ``ffs`` is in eval mode on *device* with
        ``requires_grad`` disabled on every parameter, and ``cfg`` is
        ``ffs.cfg`` when that attribute exists, else ``None``.
    """
    loaded = torch.load(ffs_ckpt, map_location='cpu', weights_only=False)

    if not hasattr(loaded, 'state_dict'):
        # Weights-only checkpoint: rebuild the network, then load what fits.
        from omegaconf import OmegaConf
        config_file = os.path.join(os.path.dirname(ffs_ckpt), 'cfg.yaml')
        build_cfg = OmegaConf.load(config_file)
        weights = loaded.get('model', loaded.get('state_dict', loaded))
        from core.foundation_stereo import FastFoundationStereo
        ffs = FastFoundationStereo(build_cfg)
        target = ffs.state_dict()
        compatible = {}
        for key, tensor in weights.items():
            if key in target and tensor.shape == target[key].shape:
                compatible[key] = tensor
        skipped = len(weights) - len(compatible)
        if skipped:
            print(f'[_load_ffs] skipped {skipped} shape-mismatched keys '
                  f'(DINOv2/dim mismatches — harmless, only feature.* extracted)')
        missing, _ = ffs.load_state_dict(compatible, strict=False)
        feature_missing = [key for key in missing if key.startswith('feature.')]
        if feature_missing:
            print(f'[_load_ffs] WARNING: feature keys missing: {feature_missing[:5]}')
    else:
        # Whole model object was pickled; nothing to rebuild.
        ffs = loaded
        print(f'[_load_ffs] loaded serialized FastFoundationStereo from {ffs_ckpt}')

    ffs = ffs.to(device).eval()
    for param in ffs.parameters():
        param.requires_grad = False
    return ffs, getattr(ffs, 'cfg', None)
No training changes needed.""" + + def forward(self, left_rgb, right_rgb, rs_disp, conf, + num_iters=None, test_mode=False): + return super().forward( + left_rgb, right_rgb, + torch.zeros_like(rs_disp), + torch.zeros_like(conf), + num_iters=num_iters, test_mode=test_mode, + ) + + +# --------------------------------------------------------------------------- +# D1_SPX — FFS SPX upsample instead of ConvexUpsample +# --------------------------------------------------------------------------- + +class D1_SPX(SmartLiteStereo): + """SmartLite-L with FFS SPX upsampling. + + New modules (need training): stem_2, spx_2_gru, spx_gru + Removed: upsample (ConvexUpsample) + + SPX uses the original-resolution image (1/2-res stem features) to sharpen + depth-discontinuity boundaries, which ConvexUpsample cannot. + """ + + def __init__(self, max_disp=192, **sl_kwargs): + super().__init__(max_disp=max_disp, **sl_kwargs) + hidden_dim = sl_kwargs.get('hidden_dim', _L_cfg()['hidden_dim']) + del self.upsample # replaced + + # SPX modules matching FFS architecture + self.stem_2 = nn.Sequential( + BasicConv_IN(3, 32, kernel_size=3, stride=2, padding=1), + nn.Conv2d(32, 32, 3, 1, 1, bias=False), + nn.InstanceNorm2d(32), nn.ReLU(), + ) + self.spx_2_gru = Conv2x(hidden_dim, 32, deconv=True, bn=False, concat=True) + self.spx_gru = nn.ConvTranspose2d(2 * 32, 9, kernel_size=4, + stride=2, padding=1) + + def _upsample(self, net, disp_low, stem_2x): + xspx = self.spx_2_gru(net, stem_2x) + spx_pred = F.softmax(self.spx_gru(xspx), dim=1) + return context_upsample(disp_low * 4., spx_pred).unsqueeze(1) + + def forward(self, left_rgb, right_rgb, rs_disp, conf, + num_iters=None, test_mode=False): + iters = num_iters if num_iters is not None else self.num_iters + B, _, H, W = left_rgb.shape + Hq, Wq = H // 4, W // 4 + + stem_2x = self.stem_2(left_rgb) + + feat_left, feat_right = self.backbone(left_rgb, right_rgb) + gwc_volume = build_gwc_volume(feat_left, feat_right, + self.max_disp_q4, self.num_groups) 
class _GRUAttProj(nn.Module):
    """Map a hidden-state tensor to a single-channel attention map in [0, 1].

    The projection is a 3x3 convolution halving the channel count, a ReLU,
    a 1x1 convolution down to one channel and a sigmoid; spatial size is
    preserved.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        mid_dim = hidden_dim // 2
        layers = [
            nn.Conv2d(hidden_dim, mid_dim, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_dim, 1, 1),
            nn.Sigmoid(),
        ]
        # Attribute name and layer order define the state_dict keys; keep them.
        self.proj = nn.Sequential(*layers)

    def forward(self, h):
        attention = self.proj(h)
        return attention
+ """ + + def __init__(self, max_disp=192, **sl_kwargs): + super().__init__(max_disp=max_disp, **sl_kwargs) + cfg = _L_cfg() + hidden_dim = sl_kwargs.get('hidden_dim', cfg['hidden_dim']) + context_dim = sl_kwargs.get('context_dim', cfg['context_dim']) + input_dim = hidden_dim + context_dim # motion + inp*att + del self.gru # replaced + + self.sel_gru = SelectiveConvGRU(hidden_dim=hidden_dim, + input_dim=input_dim) + self.gru_att_proj = _GRUAttProj(hidden_dim) + + def forward(self, left_rgb, right_rgb, rs_disp, conf, + num_iters=None, test_mode=False): + iters = num_iters if num_iters is not None else self.num_iters + B, _, H, W = left_rgb.shape + Hq, Wq = H // 4, W // 4 + + feat_left, feat_right = self.backbone(left_rgb, right_rgb) + gwc_volume = build_gwc_volume(feat_left, feat_right, + self.max_disp_q4, self.num_groups) + + rs_q4 = F.interpolate(rs_disp, (Hq, Wq), mode='bilinear', + align_corners=False) * 0.25 + conf_q4 = F.interpolate(conf, (Hq, Wq), mode='bilinear', + align_corners=False) + prior = torch.cat([rs_q4, conf_q4], dim=1) + + coarse_disp = self.coarse_head(gwc_volume) + has_sensor = (conf_q4 > 0).float() + disp = has_sensor * rs_q4 + (1.0 - has_sensor) * coarse_disp + + net, inp, att = self.context_net(feat_left) + + disp_preds = [] + for i in range(iters): + disp = disp.detach() + corr = self.corr_lookup(gwc_volume, disp) + motion = self.motion_encoder(disp, corr, prior) + x_in = torch.cat([motion, inp * att], dim=1) + gru_att = self.gru_att_proj(net) # [B,1,Hq,Wq] + net = self.sel_gru(gru_att, net, x_in) + disp = disp + self.disp_head(net) + if test_mode and i < iters - 1: + continue + disp_preds.append(self.upsample(net, disp)) + + return disp_preds[-1] if test_mode else (coarse_disp, disp_preds) + + +# --------------------------------------------------------------------------- +# D1_RichVolume — combined GWC+concat volume + 3-D stem + classifier +# (FFS cost-agg init without the hourglass) +# 
--------------------------------------------------------------------------- + +class D1_RichVolume(SmartLiteStereo): + """SmartLite-L with a richer coarse init from GWC+concat+3D-stem. + + New modules (need training): proj_cmb, corr_stem, ffs_classifier + Removed: coarse_head + + Note: hourglass and FeatureAtt are skipped because they require multi-scale + features that SmartLite backbone doesn't produce. This still tests whether + a richer (combined) cost volume gives a better warm start over the tiny 2- + layer 3D-conv head. + """ + + _CONCAT_HALF = 12 # FFS concat_channel // 2 + + def __init__(self, max_disp=192, **sl_kwargs): + super().__init__(max_disp=max_disp, **sl_kwargs) + feature_ch = sl_kwargs.get('feature_ch', _L_cfg()['feature_ch']) + num_groups = sl_kwargs.get('num_groups', _L_cfg()['num_groups']) + del self.coarse_head # replaced + + volume_dim = 28 # FFS default + ch_half = self._CONCAT_HALF + + self.proj_cmb = nn.Conv2d(feature_ch, ch_half, 1, bias=False) + + from core.submodule import BasicConv, ResnetBasicBlock3D + self.corr_stem = nn.Sequential( + nn.Conv3d(ch_half * 2 + num_groups, volume_dim, kernel_size=1), + BasicConv(volume_dim, volume_dim, kernel_size=3, padding=1, is_3d=True), + ResnetBasicBlock3D(volume_dim, volume_dim, kernel_size=3, + stride=1, padding=1), + ResnetBasicBlock3D(volume_dim, volume_dim, kernel_size=3, + stride=1, padding=1), + ) + from core.submodule import ResnetBasicBlock3D as R3D + self.ffs_classifier = nn.Sequential( + BasicConv(volume_dim, volume_dim // 2, kernel_size=3, + padding=1, is_3d=True), + R3D(volume_dim // 2, volume_dim // 2, kernel_size=3, + stride=1, padding=1), + nn.Conv3d(volume_dim // 2, 1, kernel_size=7, padding=3), + ) + + def _coarse_init(self, feat_left, feat_right): + gwc = build_gwc_volume_optimized_pytorch1( + feat_left, feat_right, self.max_disp_q4, + self.num_groups, normalize=True) + lc = self.proj_cmb(feat_left) + rc = self.proj_cmb(feat_right) + cat_vol = 
build_concat_volume_optimized_pytorch1( + lc, rc, maxdisp=self.max_disp_q4) + comb = self.corr_stem(torch.cat([gwc, cat_vol], dim=1)) + logits = self.ffs_classifier(comb).squeeze(1) + return disparity_regression(F.softmax(logits, dim=1), self.max_disp_q4) + + def forward(self, left_rgb, right_rgb, rs_disp, conf, + num_iters=None, test_mode=False): + iters = num_iters if num_iters is not None else self.num_iters + B, _, H, W = left_rgb.shape + Hq, Wq = H // 4, W // 4 + + feat_left, feat_right = self.backbone(left_rgb, right_rgb) + gwc_volume = build_gwc_volume(feat_left, feat_right, + self.max_disp_q4, self.num_groups) + + rs_q4 = F.interpolate(rs_disp, (Hq, Wq), mode='bilinear', + align_corners=False) * 0.25 + conf_q4 = F.interpolate(conf, (Hq, Wq), mode='bilinear', + align_corners=False) + prior = torch.cat([rs_q4, conf_q4], dim=1) + + coarse_disp = self._coarse_init(feat_left, feat_right) + has_sensor = (conf_q4 > 0).float() + disp = has_sensor * rs_q4 + (1.0 - has_sensor) * coarse_disp + + net, inp, att = self.context_net(feat_left) + + disp_preds = [] + for i in range(iters): + disp = disp.detach() + corr = self.corr_lookup(gwc_volume, disp) + motion = self.motion_encoder(disp, corr, prior) + net = self.gru(net, torch.cat([motion, inp * att], dim=1)) + disp = disp + self.disp_head(net) + if test_mode and i < iters - 1: + continue + disp_preds.append(self.upsample(net, disp)) + + return disp_preds[-1] if test_mode else (coarse_disp, disp_preds) + + +# --------------------------------------------------------------------------- +# D1_EdgeNeXt — FFS EdgeNeXt backbone + 1×1 adapter, keep everything else +# --------------------------------------------------------------------------- + +class D1_EdgeNeXt(SmartLiteStereo): + """SmartLite-L with FFS EdgeNeXt Feature extractor replacing DS backbone. 
+ + New modules (need training): feat_adapter (1×1 conv) + Frozen (if ffs_ckpt provided): ffs_feature (pretrained ImageNet weights) + Removed: backbone + + The EdgeNeXt backbone is pretrained on ImageNet and outputs 128-160ch + features at 1/4 resolution. A 1×1 conv adapts this to SmartLite's + expected feature_ch=64, so the downstream GWC / context_net / GRU + continue to operate unchanged. + """ + + def __init__(self, ffs_ckpt: str = None, max_disp=192, **sl_kwargs): + super().__init__(max_disp=max_disp, **sl_kwargs) + feature_ch = sl_kwargs.get('feature_ch', _L_cfg()['feature_ch']) + del self.backbone + + from core.extractor import Feature as FFSFeature + if ffs_ckpt is not None: + ffs_model, ffs_cfg = _load_ffs(ffs_ckpt) + self.ffs_feature = ffs_model.feature + else: + from omegaconf import OmegaConf + # Minimal stub args if no checkpoint provided + ffs_cfg = OmegaConf.create({ + 'vit_size': 'vits', 'mixed_precision': False, + }) + self.ffs_feature = FFSFeature(ffs_cfg) + + ffs_x4_ch = self.ffs_feature.d_out[0] + self.feat_adapter = nn.Conv2d(ffs_x4_ch, feature_ch, 1, bias=False) + + def _extract(self, left_rgb, right_rgb): + imgs = torch.cat([left_rgb * 255.0, right_rgb * 255.0], dim=0) + out = self.ffs_feature(imgs) + B = left_rgb.shape[0] + fl = self.feat_adapter(out[0][:B]) + fr = self.feat_adapter(out[0][B:]) + return fl.contiguous(), fr.contiguous() + + def forward(self, left_rgb, right_rgb, rs_disp, conf, + num_iters=None, test_mode=False): + iters = num_iters if num_iters is not None else self.num_iters + B, _, H, W = left_rgb.shape + Hq, Wq = H // 4, W // 4 + + feat_left, feat_right = self._extract(left_rgb, right_rgb) + gwc_volume = build_gwc_volume(feat_left, feat_right, + self.max_disp_q4, self.num_groups) + + rs_q4 = F.interpolate(rs_disp, (Hq, Wq), mode='bilinear', + align_corners=False) * 0.25 + conf_q4 = F.interpolate(conf, (Hq, Wq), mode='bilinear', + align_corners=False) + prior = torch.cat([rs_q4, conf_q4], dim=1) + + coarse_disp = 
self.coarse_head(gwc_volume) + has_sensor = (conf_q4 > 0).float() + disp = has_sensor * rs_q4 + (1.0 - has_sensor) * coarse_disp + + net, inp, att = self.context_net(feat_left) + + disp_preds = [] + for i in range(iters): + disp = disp.detach() + corr = self.corr_lookup(gwc_volume, disp) + motion = self.motion_encoder(disp, corr, prior) + net = self.gru(net, torch.cat([motion, inp * att], dim=1)) + disp = disp + self.disp_head(net) + if test_mode and i < iters - 1: + continue + disp_preds.append(self.upsample(net, disp)) + + return disp_preds[-1] if test_mode else (coarse_disp, disp_preds) + + +# --------------------------------------------------------------------------- +# Factory +# --------------------------------------------------------------------------- + +_D1_CLASSES = { + 'no_prior': D1_NoRsPrior, + 'spx': D1_SPX, + 'selgru': D1_SelGRU, + 'rich_volume': D1_RichVolume, + 'edgenext': D1_EdgeNeXt, +} + + +def build_d1_model(variant: str, max_disp: int = 192, + ffs_ckpt: str = None, + num_iters: int = None, + feature_ch: int = None) -> SmartLiteStereo: + """Build a Direction-1 hybrid model with L config. + + Args: + variant: one of no_prior | spx | selgru | rich_volume | edgenext + max_disp: maximum disparity (default 192) + ffs_ckpt: path to FFS checkpoint (required for edgenext) + num_iters: GRU iterations override (default: L config = 8). + feature_ch: override feature width (default: L config = 64). + Use 32 for a lighter model. + """ + if variant not in _D1_CLASSES: + raise ValueError(f"Unknown D1 variant '{variant}'. 
" + f"Choose: {list(_D1_CLASSES)}") + cfg = _L_cfg() + kwargs = dict( + feature_ch=feature_ch if feature_ch is not None else cfg['feature_ch'], + backbone_mid1=cfg['backbone_mid1'], + backbone_mid2=cfg['backbone_mid2'], + backbone_extra_blocks=cfg['backbone_extra_blocks'], + hidden_dim=cfg['hidden_dim'], + context_dim=cfg['context_dim'], + context_mid=cfg['context_mid'], + motion_ch=cfg['motion_ch'], + num_groups=cfg['num_groups'], + corr_radius=cfg['corr_radius'], + num_iters=num_iters if num_iters is not None else cfg['num_iters'], + ) + cls = _D1_CLASSES[variant] + if variant == 'edgenext': + return cls(ffs_ckpt=ffs_ckpt, max_disp=max_disp, **kwargs) + return cls(max_disp=max_disp, **kwargs) diff --git a/niv/smart_lite/model.py b/niv/smart_lite/model.py new file mode 100644 index 0000000..266334b --- /dev/null +++ b/niv/smart_lite/model.py @@ -0,0 +1,531 @@ +""" +Smart Lite Stereo — lightweight stereo-matching with sensor-prior fusion. +Three sizes (S / M / L) share the same 4-stage architecture; only channel +widths and GRU iteration counts differ. + +Stages: + 1. Depthwise-separable Siamese feature backbone (→ 1/4 res) + 2. 8-group GWC cost volume + tiny 3-D head → coarse disparity (soft-argmax) + 3. DS-ConvGRU iterative refinement with 1-D bilinear correlation lookup + 4. Learned convex upsampling (3×3 mask head) + +Quick start: + from smart_lite.model import build_model + model = build_model("M") # or "S" / "L" + model = build_model("L", max_disp=192) +""" + +from __future__ import annotations + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# --------------------------------------------------------------------------- +# Size presets +# --------------------------------------------------------------------------- + +MODEL_CONFIGS = { + # ── SmartLite-S ───────────────────────────────────────────────────────── + # Baseline distillation target. Fastest inference; fits comfortably on + # Jetson Orin NX 8 GB. 
+ "S": dict( + feature_ch=32, + backbone_mid1=16, backbone_mid2=24, backbone_extra_blocks=0, + hidden_dim=16, context_dim=24, context_mid=64, motion_ch=64, + num_groups=8, corr_radius=4, num_iters=4, + ), + # ── SmartLite-M ───────────────────────────────────────────────────────── + # Better accuracy, still real-time on a desktop GPU. + "M": dict( + feature_ch=48, + backbone_mid1=24, backbone_mid2=36, backbone_extra_blocks=0, + hidden_dim=32, context_dim=48, context_mid=96, motion_ch=96, + num_groups=8, corr_radius=4, num_iters=6, + ), + # ── SmartLite-L ───────────────────────────────────────────────────────── + # Highest accuracy among the three; one extra backbone refinement block. + # Still far lighter than FoundationStereo. + "L": dict( + feature_ch=64, + backbone_mid1=32, backbone_mid2=48, backbone_extra_blocks=1, + hidden_dim=64, context_dim=64, context_mid=128, motion_ch=128, + num_groups=8, corr_radius=4, num_iters=8, + ), +} + + +def build_model(size: str = "S", max_disp: int = 192) -> "SmartLiteStereo": + """Construct a SmartLiteStereo from a named size preset (S / M / L).""" + if size not in MODEL_CONFIGS: + raise ValueError(f"Unknown model size '{size}'. 
Choose from: {list(MODEL_CONFIGS)}") + cfg = MODEL_CONFIGS[size] + return SmartLiteStereo(max_disp=max_disp, **cfg) + + +# --------------------------------------------------------------------------- +# Building blocks +# --------------------------------------------------------------------------- + +class DepthwiseSeparableConv2d(nn.Module): + """Depthwise-separable convolution: depthwise 3×3 + pointwise 1×1.""" + + def __init__(self, in_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=False): + super().__init__() + self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size, stride, padding, + groups=in_ch, bias=False) + self.pointwise = nn.Conv2d(in_ch, out_ch, 1, bias=bias) + + def forward(self, x): + return self.pointwise(self.depthwise(x)) + + +class DSConvBnRelu(nn.Module): + """Depthwise-separable conv + BatchNorm + ReLU.""" + + def __init__(self, in_ch, out_ch, kernel_size=3, stride=1, padding=1): + super().__init__() + self.conv = DepthwiseSeparableConv2d(in_ch, out_ch, kernel_size, stride, padding) + self.bn = nn.BatchNorm2d(out_ch) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.relu(self.bn(self.conv(x))) + + +# --------------------------------------------------------------------------- +# Stage 1 — Feature Backbone (Siamese, depthwise-separable) +# --------------------------------------------------------------------------- + +class FeatureBackbone(nn.Module): + """ + Vertical-stack Siamese backbone. Accepts *stacked* left+right images + ``[B, 3, 2*H, W]`` and produces per-image features at 1/4 resolution. 
+ + Architecture: + stem (3 → mid1_ch, stride 2) → 1/2 res + stage1 (mid1_ch → mid2_ch, stride 2) → 1/4 res + stage2 (mid2_ch → feature_ch, stride 1) × (2 + extra_blocks) → 1/4 res + """ + + def __init__( + self, + out_ch: int = 32, + mid1_ch: int = 16, + mid2_ch: int = 24, + extra_blocks: int = 0, + ): + super().__init__() + self.stem = DSConvBnRelu(3, mid1_ch, stride=2) + self.stage1 = DSConvBnRelu(mid1_ch, mid2_ch, stride=2) + blocks = [ + DSConvBnRelu(mid2_ch, out_ch, stride=1), + DSConvBnRelu(out_ch, out_ch, stride=1), + ] + for _ in range(extra_blocks): + blocks.append(DSConvBnRelu(out_ch, out_ch, stride=1)) + self.stage2 = nn.Sequential(*blocks) + + def forward(self, left_rgb, right_rgb): + stacked = torch.cat([left_rgb, right_rgb], dim=2) # [B, 3, 2H, W] + x = self.stage2(self.stage1(self.stem(stacked))) # [B, out_ch, H/2, W/4] + feat_left, feat_right = x.chunk(2, dim=2) + return feat_left.contiguous(), feat_right.contiguous() + + +# --------------------------------------------------------------------------- +# Stage 1b — Group-wise Correlation (GWC) Volume +# --------------------------------------------------------------------------- + +def build_gwc_volume(feat_left, feat_right, max_disp, num_groups=8): + """ + Build a group-wise correlation volume. 
+ + Returns: + volume: [B, num_groups, max_disp, H, W] + """ + B, C, H, W = feat_left.shape + assert C % num_groups == 0 + cpg = C // num_groups + + left = F.normalize(feat_left, dim=1).view(B, num_groups, cpg, H, W) + right = F.normalize(feat_right, dim=1).view(B, num_groups, cpg, H, W) + + volume = feat_left.new_zeros(B, num_groups, max_disp, H, W) + for d in range(max_disp): + if d == 0: + volume[:, :, d] = (left * right).sum(dim=2) + else: + volume[:, :, d, :, d:] = (left[:, :, :, :, d:] * right[:, :, :, :, :-d]).sum(dim=2) + return volume + + +# --------------------------------------------------------------------------- +# Stage 2 — Coarse Disparity Regression ("Warm Start") +# --------------------------------------------------------------------------- + +class CoarseDisparityHead(nn.Module): + """ + Tiny 2-layer 3-D conv head → soft-argmax over disparity dimension. + + Input: GWC volume [B, G, D, H, W] + Output: coarse disparity [B, 1, H, W] + """ + + def __init__(self, in_ch=8, mid_ch=8, max_disp=48): + super().__init__() + self.max_disp = max_disp + self.conv1 = nn.Sequential( + nn.Conv3d(in_ch, mid_ch, kernel_size=3, padding=1), + nn.BatchNorm3d(mid_ch), + nn.ReLU(inplace=True), + ) + self.conv2 = nn.Conv3d(mid_ch, 1, kernel_size=3, padding=1) + + def forward(self, volume): + x = self.conv1(volume) + logits = self.conv2(x).squeeze(1) # [B, D, H, W] + prob = F.softmax(logits, dim=1) + disp_vals = torch.arange(0, self.max_disp, dtype=prob.dtype, + device=prob.device).view(1, -1, 1, 1) + return (prob * disp_vals).sum(dim=1, keepdim=True) + + +# --------------------------------------------------------------------------- +# Stage 2b — Context Network +# --------------------------------------------------------------------------- + +class ContextNet(nn.Module): + """ + Processes *left* features → GRU initial state + context + attention. 
+ + Architecture: 2×Conv3×3 + CAM + SAM → proj + Outputs: net0 [B, hidden_dim, H, W], + inp0 [B, context_dim, H, W], + att0 [B, 1, H, W] + """ + + def __init__(self, in_ch=32, hidden_dim=16, context_dim=24, mid_ch=64): + super().__init__() + self.conv1 = nn.Sequential( + nn.Conv2d(in_ch, mid_ch, 3, padding=1), nn.BatchNorm2d(mid_ch), nn.ReLU(True), + ) + self.conv2 = nn.Sequential( + nn.Conv2d(mid_ch, mid_ch, 3, padding=1), nn.BatchNorm2d(mid_ch), nn.ReLU(True), + ) + self.proj = nn.Conv2d(mid_ch, hidden_dim + context_dim + 1, 1) + + self.cam = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(mid_ch, max(1, mid_ch // 4), 1), nn.ReLU(True), + nn.Conv2d(max(1, mid_ch // 4), mid_ch, 1), nn.Sigmoid(), + ) + self.sam = nn.Sequential( + nn.Conv2d(2, 1, 7, padding=3), nn.Sigmoid(), + ) + self.hidden_dim = hidden_dim + self.context_dim = context_dim + + def forward(self, feat_left): + x = self.conv1(feat_left) + x = x * self.cam(x) + x = self.conv2(x) + sa = self.sam(torch.cat([x.mean(1, keepdim=True), x.amax(1, keepdim=True)], dim=1)) + x = x * sa # (1,128,H/4,W/4) + out = self.proj(x) + net0 = torch.tanh(out[:, :self.hidden_dim]) + inp0 = torch.relu(out[:, self.hidden_dim:self.hidden_dim + self.context_dim]) + att0 = torch.sigmoid(out[:, -1:]) + return net0, inp0, att0 + + +# --------------------------------------------------------------------------- +# Stage 3 — 1-D Bilinear Correlation Lookup +# --------------------------------------------------------------------------- + +class CorrLookup1D(nn.Module): + """ + Sample 2*radius+1 correlation values from the GWC volume around + the current disparity estimate via bilinear interpolation. 
+ + Returns: corr_features [B, G*(2*radius+1), H, W] + """ + + def __init__(self, radius=4, num_groups=8): + super().__init__() + self.radius = radius + self.num_groups = num_groups + offsets = torch.arange(-radius, radius + 1, dtype=torch.float32) + self.register_buffer("offsets", offsets) + + def forward(self, volume, disp): + B, G, D, H, W = volume.shape + r = self.radius + + sample_d = disp + self.offsets.view(1, -1, 1, 1) + sample_d_norm = 2.0 * sample_d / max(D - 1, 1) - 1.0 + + vol_flat = volume.view(B * G, 1, D, H * W) + S = H * W + n_samples = 2 * r + 1 + + spatial_norm = torch.linspace(-1, 1, S, device=volume.device) + spatial_norm = spatial_norm.view(1, 1, 1, S).expand(B, n_samples, 1, S) + d_norm = sample_d_norm.view(B, n_samples, H, W).reshape(B, n_samples, 1, S) + + grid = torch.stack( + [spatial_norm.expand(B, n_samples, 1, S).reshape(B, n_samples, S), + d_norm.reshape(B, n_samples, S)], + dim=-1, + ) + grid = grid.unsqueeze(1).expand(B, G, n_samples, S, 2).reshape(B * G, n_samples, S, 2) + + sampled = F.grid_sample(vol_flat, grid, mode='bilinear', + padding_mode='zeros', align_corners=True) + return sampled.view(B, G, n_samples, H, W).view(B, G * n_samples, H, W) + + +# --------------------------------------------------------------------------- +# Stage 3 — Motion Encoder + DS-ConvGRU +# --------------------------------------------------------------------------- + +class MotionEncoder(nn.Module): + """ + Encodes correlation features + current disparity + sensor prior + into a motion feature for the GRU. + + motion_ch controls the internal width of all three paths. 
+ """ + + def __init__(self, corr_ch, prior_ch=2, hidden_dim=16, motion_ch=64): + super().__init__() + disp_ch = max(4, motion_ch // 4) + prior_w = max(4, motion_ch // 4) + fuse_in = motion_ch + disp_ch + prior_w + + self.corr_net = nn.Sequential( + nn.Conv2d(corr_ch, motion_ch, 1), + nn.ReLU(inplace=True), + nn.Conv2d(motion_ch, motion_ch, 3, padding=1), + nn.ReLU(inplace=True), + ) + self.disp_net = nn.Sequential( + nn.Conv2d(1, disp_ch, 7, padding=3), + nn.ReLU(inplace=True), + nn.Conv2d(disp_ch, disp_ch, 3, padding=1), + nn.ReLU(inplace=True), + ) + self.prior_net = nn.Sequential( + nn.Conv2d(prior_ch, prior_w, 3, padding=1), + nn.ReLU(inplace=True), + ) + self.fuse = nn.Conv2d(fuse_in, hidden_dim - 1, 1) + + def forward(self, disp, corr, prior): + c = self.corr_net(corr) + d = self.disp_net(disp) + p = self.prior_net(prior) + fused = F.relu(self.fuse(torch.cat([c, d, p], dim=1))) + return torch.cat([fused, disp], dim=1) + + +class DepthwiseSeparableConvGRU(nn.Module): + """ConvGRU cell using depthwise-separable convolutions.""" + + def __init__(self, hidden_dim, input_dim, kernel_size=3): + super().__init__() + total = hidden_dim + input_dim + pad = kernel_size // 2 + self.convz = DepthwiseSeparableConv2d(total, hidden_dim, kernel_size, padding=pad) + self.convr = DepthwiseSeparableConv2d(total, hidden_dim, kernel_size, padding=pad) + self.convq = DepthwiseSeparableConv2d(total, hidden_dim, kernel_size, padding=pad) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = torch.tanh(self.convq(torch.cat([r * h, x], dim=1))) + return (1 - z) * h + z * q + + +class DispHead(nn.Module): + """Predicts a residual disparity delta from GRU hidden state.""" + + def __init__(self, hidden_dim): + super().__init__() + self.conv = nn.Sequential( + nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim, 1, 3, padding=1), + ) + + def forward(self, h): 
+ return self.conv(h) + + +# --------------------------------------------------------------------------- +# Stage 4 — Learned Convex Upsampling (3×3 mask head) +# --------------------------------------------------------------------------- + +class ConvexUpsample(nn.Module): + """ + Predict 9 softmax weights per coarse pixel for content-adaptive 4× upsampling. + """ + + def __init__(self, hidden_dim, upsample_factor=4): + super().__init__() + self.factor = upsample_factor + self.mask_head = nn.Conv2d(hidden_dim, upsample_factor ** 2 * 9, 3, padding=1) + + def forward(self, h, disp_low): + B, _, H, W = disp_low.shape + f = self.factor + + mask = self.mask_head(h).view(B, 1, 9, f, f, H, W) + mask = F.softmax(mask, dim=2) + + disp_unfold = F.unfold(disp_low * f, 3, padding=1).view(B, 1, 9, 1, 1, H, W) + up = (mask * disp_unfold).sum(dim=2) + return up.permute(0, 1, 4, 2, 5, 3).reshape(B, 1, H * f, W * f) + + +# --------------------------------------------------------------------------- +# Full Model +# --------------------------------------------------------------------------- + +class SmartLiteStereo(nn.Module): + """ + Smart Lite Stereo network. Use ``build_model(size)`` for the standard + S / M / L presets, or construct directly for custom configurations. 
+ + Args: + max_disp: max disparity at full resolution (default 192) + feature_ch: backbone output channels + backbone_mid1: backbone stem output channels + backbone_mid2: backbone stage-1 output channels + backbone_extra_blocks: extra refinement blocks in backbone stage-2 + hidden_dim: GRU hidden-state channels + context_dim: context channels injected each GRU iteration + context_mid: internal width of ContextNet convolutions + motion_ch: internal width of MotionEncoder paths + num_groups: GWC correlation groups + corr_radius: 1-D lookup radius (2r+1 samples per pixel) + num_iters: default GRU iterations + """ + + def __init__( + self, + max_disp: int = 192, + feature_ch: int = 32, + backbone_mid1: int = 16, + backbone_mid2: int = 24, + backbone_extra_blocks: int = 0, + hidden_dim: int = 16, + context_dim: int = 24, + context_mid: int = 64, + motion_ch: int = 64, + num_groups: int = 8, + corr_radius: int = 4, + num_iters: int = 4, + ): + super().__init__() + self.max_disp = max_disp + self.max_disp_q4 = max_disp // 4 + self.hidden_dim = hidden_dim + self.context_dim = context_dim + self.num_groups = num_groups + self.num_iters = num_iters + + self.backbone = FeatureBackbone( + out_ch=feature_ch, + mid1_ch=backbone_mid1, + mid2_ch=backbone_mid2, + extra_blocks=backbone_extra_blocks, + ) + self.coarse_head = CoarseDisparityHead( + in_ch=num_groups, mid_ch=num_groups, max_disp=self.max_disp_q4, + ) + self.context_net = ContextNet( + in_ch=feature_ch, hidden_dim=hidden_dim, + context_dim=context_dim, mid_ch=context_mid, + ) + + corr_ch = num_groups * (2 * corr_radius + 1) + self.motion_encoder = MotionEncoder( + corr_ch=corr_ch, prior_ch=2, + hidden_dim=hidden_dim, motion_ch=motion_ch, + ) + self.gru = DepthwiseSeparableConvGRU( + hidden_dim=hidden_dim, input_dim=hidden_dim + context_dim, + ) + self.disp_head = DispHead(hidden_dim) + self.corr_lookup = CorrLookup1D(radius=corr_radius, num_groups=num_groups) + self.upsample = ConvexUpsample(hidden_dim=hidden_dim, 
upsample_factor=4) + + # ------------------------------------------------------------------ + + def param_count(self) -> int: + return sum(p.numel() for p in self.parameters()) + + # ------------------------------------------------------------------ + + def forward( + self, + left_rgb: torch.Tensor, + right_rgb: torch.Tensor, + rs_disp: torch.Tensor, + conf: torch.Tensor, + num_iters: int | None = None, + test_mode: bool = False, + ): + """ + Args: + left_rgb: [B, 3, H, W] + right_rgb: [B, 3, H, W] + rs_disp: [B, 1, H, W] RealSense raw disparity (0 = invalid) + conf: [B, 1, H, W] confidence (0 = invalid) + num_iters: override self.num_iters at inference + test_mode: True → return only final full-res disparity + Returns: + test_mode=True → disparity [B, 1, H, W] + test_mode=False → (coarse [B,1,H/4,W/4], list of full-res preds) + """ + iters = num_iters if num_iters is not None else self.num_iters + B, _, H, W = left_rgb.shape + Hq, Wq = H // 4, W // 4 + + feat_left, feat_right = self.backbone(left_rgb, right_rgb) + gwc_volume = build_gwc_volume(feat_left, feat_right, + self.max_disp_q4, self.num_groups) + + rs_disp_q4 = F.interpolate(rs_disp, (Hq, Wq), mode='bilinear', + align_corners=False) * 0.25 + conf_q4 = F.interpolate(conf, (Hq, Wq), mode='bilinear', + align_corners=False) + prior = torch.cat([rs_disp_q4, conf_q4], dim=1) + + coarse_disp = self.coarse_head(gwc_volume) + has_sensor = (conf_q4 > 0).float() + disp = has_sensor * rs_disp_q4 + (1.0 - has_sensor) * coarse_disp + + net, inp, att = self.context_net(feat_left) + + disp_preds = [] + for i in range(iters): + disp = disp.detach() + corr = self.corr_lookup(gwc_volume, disp) + motion = self.motion_encoder(disp, corr, prior) + net = self.gru(net, torch.cat([motion, inp * att], dim=1)) + disp = disp + self.disp_head(net) + + if test_mode and i < iters - 1: + continue + + disp_up = self.upsample(net, disp) + disp_preds.append(disp_up) + + if test_mode: + return disp_up + + return coarse_disp, disp_preds diff 
--git a/niv/test_server.py b/niv/test_server.py new file mode 100644 index 0000000..7f40fd8 --- /dev/null +++ b/niv/test_server.py @@ -0,0 +1,195 @@ +import os +import socket +import threading +import unittest +import socketserver +import cv2 +import numpy as np +from niv.image_server import send_msg, recv_msg, ImageHandler + +_TEST_DIR = os.path.dirname(os.path.abspath(__file__)) +_FIXTURE_LEFT = os.path.join(_TEST_DIR, 'sample', 'left.png') + + +class TestProtocolHelpers(unittest.TestCase): + def _pair(self): + return socket.socketpair() + + def test_roundtrip_small(self): + a, b = self._pair() + try: + payload = b'hello world' + send_msg(a, payload) + result = recv_msg(b) + self.assertEqual(result, payload) + finally: + a.close() + b.close() + + def test_roundtrip_binary(self): + a, b = self._pair() + try: + payload = bytes(range(256)) * 100 + send_msg(a, payload) + result = recv_msg(b) + self.assertEqual(result, payload) + finally: + a.close() + b.close() + + def test_recv_returns_none_on_closed_socket(self): + a, b = self._pair() + a.close() + result = recv_msg(b) + self.assertIsNone(result) + b.close() + + +def _make_server() -> socketserver.ThreadingTCPServer: + socketserver.ThreadingTCPServer.allow_reuse_address = True + server = socketserver.ThreadingTCPServer(('127.0.0.1', 0), ImageHandler) + return server + + +class TestImageServer(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.server = _make_server() + cls.port = cls.server.server_address[1] + cls.thread = threading.Thread(target=cls.server.serve_forever, daemon=True) + cls.thread.start() + + @classmethod + def tearDownClass(cls): + cls.server.shutdown() + cls.thread.join(timeout=2) + + def _connect(self) -> socket.socket: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect(('127.0.0.1', self.port)) + return sock + + def _send_image(self, sock: socket.socket, img: np.ndarray) -> None: + _, jpeg = cv2.imencode('.jpg', img) + send_msg(sock, jpeg.tobytes()) + + def 
_recv_gray(self, sock: socket.socket) -> np.ndarray: + data = recv_msg(sock) + self.assertIsNotNone(data, 'Server returned no data') + gray = cv2.imdecode(np.frombuffer(data, dtype=np.uint8), cv2.IMREAD_GRAYSCALE) + self.assertIsNotNone(gray, 'Response could not be decoded as image') + return gray + + def test_synthetic_image_becomes_grayscale(self): + img = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8) + with self._connect() as sock: + self._send_image(sock, img) + gray = self._recv_gray(sock) + self.assertEqual(gray.shape, (100, 100)) + + def test_multiple_images_same_connection(self): + with self._connect() as sock: + for _ in range(3): + img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8) + self._send_image(sock, img) + gray = self._recv_gray(sock) + self.assertEqual(gray.shape, (64, 64)) + + @unittest.skipUnless(os.path.exists(_FIXTURE_LEFT), 'sample/left.png fixture not present') + def test_real_image(self): + img = cv2.imread(_FIXTURE_LEFT) + self.assertIsNotNone(img, 'sample/left.png must be readable') + h, w = img.shape[:2] + with self._connect() as sock: + self._send_image(sock, img) + gray = self._recv_gray(sock) + self.assertEqual(gray.shape, (h, w)) + + +class TestImageClient(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.server = _make_server() + cls.port = cls.server.server_address[1] + cls.thread = threading.Thread(target=cls.server.serve_forever, daemon=True) + cls.thread.start() + + @classmethod + def tearDownClass(cls): + cls.server.shutdown() + cls.thread.join(timeout=2) + + def test_client_process_returns_grayscale(self): + from niv.image_client import ImageClient + img = np.random.randint(0, 256, (80, 80, 3), dtype=np.uint8) + with ImageClient('127.0.0.1', self.port) as client: + gray = client.process(img) + self.assertEqual(gray.shape, (80, 80)) + + def test_client_process_multiple_images(self): + from niv.image_client import ImageClient + with ImageClient('127.0.0.1', self.port) as client: + for size in 
[32, 64, 128]: + img = np.random.randint(0, 256, (size, size, 3), dtype=np.uint8) + gray = client.process(img) + self.assertEqual(gray.shape, (size, size)) + + +class TestImageClientWebcam(unittest.TestCase): + """Interactive webcam test. + + Disabled by default because it requires a physical camera and a display. + Enable by setting the environment variable NIV_WEBCAM_TEST=1. + Press 'q' in the display window to finish the test. + """ + + @classmethod + def setUpClass(cls): + #if os.environ.get('NIV_WEBCAM_TEST') != '1': + # raise unittest.SkipTest('Set NIV_WEBCAM_TEST=1 to run the webcam test') + cls.server = _make_server() + cls.port = cls.server.server_address[1] + cls.thread = threading.Thread(target=cls.server.serve_forever, daemon=True) + cls.thread.start() + + @classmethod + def tearDownClass(cls): + #if os.environ.get('NIV_WEBCAM_TEST') != '1': + # return + cls.server.shutdown() + cls.thread.join(timeout=2) + + def test_webcam_roundtrip_display(self): + from niv.image_client import ImageClient + cam_index = int(os.environ.get('NIV_WEBCAM_INDEX', '0')) + cap = cv2.VideoCapture(cam_index) + if not cap.isOpened(): + self.skipTest(f'Cannot open webcam at index {cam_index}') + + frames_processed = 0 + try: + with ImageClient('127.0.0.1', self.port) as client: + while True: + ret, frame = cap.read() + if not ret: + self.fail('Failed to capture frame from webcam') + gray = client.process(frame) + self.assertEqual(gray.shape, frame.shape[:2]) + cv2.imshow('Webcam (original)', frame) + cv2.imshow('Server response (grayscale)', gray) + frames_processed += 1 + if cv2.waitKey(1) & 0xFF == ord('q'): + break + finally: + cap.release() + cv2.destroyAllWindows() + + self.assertGreater(frames_processed, 0) + + +if __name__ == '__main__': + #unittest.main() + t = TestImageClientWebcam() + t.setUpClass() + t.test_webcam_roundtrip_display() + t.tearDownClass() diff --git a/reports/faro_benchmark/close_range_analysis.png b/reports/faro_benchmark/close_range_analysis.png new 
file mode 100644 index 0000000..547bead Binary files /dev/null and b/reports/faro_benchmark/close_range_analysis.png differ diff --git a/reports/faro_benchmark/coverage_heatmaps.png b/reports/faro_benchmark/coverage_heatmaps.png new file mode 100644 index 0000000..4e84b62 Binary files /dev/null and b/reports/faro_benchmark/coverage_heatmaps.png differ diff --git a/reports/faro_benchmark/depth_comparison.png b/reports/faro_benchmark/depth_comparison.png new file mode 100644 index 0000000..50eeb3d Binary files /dev/null and b/reports/faro_benchmark/depth_comparison.png differ diff --git a/reports/faro_benchmark/distance_error_curve.png b/reports/faro_benchmark/distance_error_curve.png new file mode 100644 index 0000000..a30bfe9 Binary files /dev/null and b/reports/faro_benchmark/distance_error_curve.png differ diff --git a/reports/faro_benchmark/error_histograms.png b/reports/faro_benchmark/error_histograms.png new file mode 100644 index 0000000..3992406 Binary files /dev/null and b/reports/faro_benchmark/error_histograms.png differ diff --git a/reports/faro_benchmark/error_maps.png b/reports/faro_benchmark/error_maps.png new file mode 100644 index 0000000..24647a2 Binary files /dev/null and b/reports/faro_benchmark/error_maps.png differ diff --git a/reports/faro_benchmark/index.html b/reports/faro_benchmark/index.html new file mode 100644 index 0000000..7633a64 --- /dev/null +++ b/reports/faro_benchmark/index.html @@ -0,0 +1,141 @@ + + + + + + Depth Quality Report — rs-enhanced-depth + + + +

Depth Quality Report

+

rs-enhanced-depth — multi-method depth quality analysis

+ +
Source: FARO dataset (/mnt/algonas/Local/Data/Stereo/Faro/FARO_DATA_BASE)
Frames: 702
Resolution: 1280×720
Ground truth: FARO GT
Methods evaluated: FARO GT, Fine-tuned on FARO, Original model
Generated: 2026-03-29 15:13:49
+ +
+

Model Configuration

+ + + + +
Method | max_disp | valid_iters | Engine Resolution | Engine Dir
Original model
Fine-tuned on FARO
+
+ +
+

Depth Comparison

+
+ Depth Comparison +

Side-by-side depth maps from a single representative frame. Invalid pixels are black.

+
+
+
+

Error Maps

+
+ Error Maps +

Per-pixel absolute error |pred − GT| clipped at 0.5 m. Brighter = more error.

+
+
+
+

Coverage Heatmaps

+
+ Coverage Heatmaps +

Fraction of frames each pixel has valid depth, averaged over all benchmark frames.

+
+
+
+

Distance Error Curve

+
+ Distance Error Curve +

Mean Absolute Error (MAE) broken down by distance range.

+
+
+
+

Error Histograms

+
+ Error Histograms +

Distribution of per-pixel absolute errors from the stored visualisation frames.

+
+
+
+

Summary Table

+
+ Summary Table +

Aggregate quality metrics — see legend below the table for column explanations.

+
+
+
+

How to Read the Summary Table

+ + + + + + + + + + + + + + + +
MRE* (%): Overall score (recommended). Mean Relative Error with hole penalty — pixels where the method has no depth but ground truth does count as 100% error. This is the fairest single metric because it penalises both inaccuracy and missing coverage. Lower is better.
MRE (%): Mean Relative Error over valid pixels only (holes ignored). 5% means each measured pixel is ~5% off on average. Lower is better.
MAE (m): Mean Absolute Error in meters, valid pixels only. Lower is better.
δ1 (%): Percentage of valid pixels within 1.25× of ground truth depth. Higher is better. 100% is perfect.
Coverage (%): Percentage of pixels that produced valid depth. Higher is better. MinZ improves this at close range (<0.55m) by filling holes the hardware camera cannot see.
FPS: Processing speed (frames per second). Higher is faster.
GT?: ★ GT marks the ground truth method (NNDepth accurate). Its error columns show "—" because you don't compare ground truth to itself.
+
+
+

Close Range Analysis

+
+ Close Range Analysis +

Coverage and stability for objects closer than 0.55 m. Highlights MinZ benefit.

+
+
+
+

Timing Bars

+
+ Timing Bars +

Processing speed in FPS. Hardware baseline is fixed at ~30 FPS (camera frame rate).

+
+
+
Generated by tests/depth_report/ on 2026-03-29 15:13:49
+ + \ No newline at end of file diff --git a/reports/faro_benchmark/results.json b/reports/faro_benchmark/results.json new file mode 100644 index 0000000..68093c4 --- /dev/null +++ b/reports/faro_benchmark/results.json @@ -0,0 +1,64 @@ +{ + "source": "FARO dataset (/mnt/algonas/Local/Data/Stereo/Faro/FARO_DATA_BASE)", + "n_frames": 702, + "resolution": "1280x720", + "ground_truth": "faro_gt", + "distance_bins": [ + "0\u20131 m", + "1\u20132 m", + "2\u20133 m", + "3\u20135 m" + ], + "model_configs": { + "original": { + "model_path": "/home/adiroha/repos/Fast-FoundationStereo/scripts/../weights/20-30-48/model_best_bp2_serialize.pth" + }, + "finetuned": { + "model_path": "/home/adiroha/repos/Fast-FoundationStereo/scripts/../weights/20-30-48/model_finetuned_faro_kitchen.pth" + } + }, + "methods": { + "faro_gt": { + "label": "FARO GT", + "mae_mean": 0.0, + "mae_std": 0.0, + "rmse_mean": 0.0, + "mre_mean": 0.0, + "delta1_mean": 100.0, + "coverage_mean": 99.32362520773978, + "fps_mean": -1, + "time_ms_mean": 0.0, + "close_range_coverage": 25.356125356125357, + "gpu_load_mean": 0.0, + "is_ground_truth": true + }, + "finetuned": { + "label": "Fine-tuned on FARO", + "mae_mean": 0.009571012443838975, + "mae_std": 0.0034880630610683075, + "rmse_mean": 0.03290741254795778, + "mre_mean": 0.005161342896292904, + "delta1_mean": 99.92280284364074, + "coverage_mean": 99.32361763389324, + "fps_mean": 6.596843296084722, + "time_ms_mean": 151.58765414262726, + "close_range_coverage": 25.356125356125357, + "gpu_load_mean": 0.0, + "is_ground_truth": false + }, + "original": { + "label": "Original model", + "mae_mean": 0.02072368732167997, + "mae_std": 0.007662992898661906, + "rmse_mean": 0.06485350047400025, + "mre_mean": 0.011096837766537512, + "delta1_mean": 99.66651418755849, + "coverage_mean": 99.32362520773978, + "fps_mean": 6.876085113326082, + "time_ms_mean": 145.43159130796195, + "close_range_coverage": 25.356125356125357, + "gpu_load_mean": 0.0, + "is_ground_truth": false + } + 
} +} \ No newline at end of file diff --git a/reports/faro_benchmark/summary_table.png b/reports/faro_benchmark/summary_table.png new file mode 100644 index 0000000..f939d52 Binary files /dev/null and b/reports/faro_benchmark/summary_table.png differ diff --git a/reports/faro_benchmark/timing_bars.png b/reports/faro_benchmark/timing_bars.png new file mode 100644 index 0000000..cd04e94 Binary files /dev/null and b/reports/faro_benchmark/timing_bars.png differ diff --git a/reports/faro_benchmark_office/close_range_analysis.png b/reports/faro_benchmark_office/close_range_analysis.png new file mode 100644 index 0000000..2fd72b3 Binary files /dev/null and b/reports/faro_benchmark_office/close_range_analysis.png differ diff --git a/reports/faro_benchmark_office/coverage_heatmaps.png b/reports/faro_benchmark_office/coverage_heatmaps.png new file mode 100644 index 0000000..24fcb8b Binary files /dev/null and b/reports/faro_benchmark_office/coverage_heatmaps.png differ diff --git a/reports/faro_benchmark_office/depth_comparison.png b/reports/faro_benchmark_office/depth_comparison.png new file mode 100644 index 0000000..492edf7 Binary files /dev/null and b/reports/faro_benchmark_office/depth_comparison.png differ diff --git a/reports/faro_benchmark_office/distance_error_curve.png b/reports/faro_benchmark_office/distance_error_curve.png new file mode 100644 index 0000000..8df6473 Binary files /dev/null and b/reports/faro_benchmark_office/distance_error_curve.png differ diff --git a/reports/faro_benchmark_office/error_histograms.png b/reports/faro_benchmark_office/error_histograms.png new file mode 100644 index 0000000..bde7364 Binary files /dev/null and b/reports/faro_benchmark_office/error_histograms.png differ diff --git a/reports/faro_benchmark_office/error_maps.png b/reports/faro_benchmark_office/error_maps.png new file mode 100644 index 0000000..8735645 Binary files /dev/null and b/reports/faro_benchmark_office/error_maps.png differ diff --git 
a/reports/faro_benchmark_office/index.html b/reports/faro_benchmark_office/index.html new file mode 100644 index 0000000..6c14cb4 --- /dev/null +++ b/reports/faro_benchmark_office/index.html @@ -0,0 +1,141 @@ + + + + + + Depth Quality Report — rs-enhanced-depth + + + +

Depth Quality Report

+

rs-enhanced-depth — multi-method depth quality analysis

+ +
Source: FARO dataset (/mnt/algonas/Local/Data/Stereo/Faro/FARO_DATA_BASE)
Frames: 445
Resolution: 1280×720
Ground truth: FARO GT
Methods evaluated: FARO GT, Fine-tuned on FARO, Original model
Generated: 2026-04-12 14:19:16
+ +
+

Model Configuration

+ + + + +
Method | max_disp | valid_iters | Engine Resolution | Engine Dir
Original model
Fine-tuned on FARO
+
+ +
+

Depth Comparison

+
+ Depth Comparison +

Side-by-side depth maps from a single representative frame. Invalid pixels are black.

+
+
+
+

Error Maps

+
+ Error Maps +

Per-pixel absolute error |pred − GT| clipped at 0.5 m. Brighter = more error.

+
+
+
+

Coverage Heatmaps

+
+ Coverage Heatmaps +

Fraction of frames each pixel has valid depth, averaged over all benchmark frames.

+
+
+
+

Distance Error Curve

+
+ Distance Error Curve +

Mean Absolute Error (MAE) broken down by distance range.

+
+
+
+

Error Histograms

+
+ Error Histograms +

Distribution of per-pixel absolute errors from the stored visualisation frames.

+
+
+
+

Summary Table

+
+ Summary Table +

Aggregate quality metrics — see legend below the table for column explanations.

+
+
+
+

How to Read the Summary Table

+ + + + + + + + + + + + + + + +
MRE* (%): Overall score (recommended). Mean Relative Error with hole penalty — pixels where the method has no depth but ground truth does count as 100% error. This is the fairest single metric because it penalises both inaccuracy and missing coverage. Lower is better.
MRE (%): Mean Relative Error over valid pixels only (holes ignored). 5% means each measured pixel is ~5% off on average. Lower is better.
MAE (m): Mean Absolute Error in meters, valid pixels only. Lower is better.
δ1 (%): Percentage of valid pixels within 1.25× of ground truth depth. Higher is better. 100% is perfect.
Coverage (%): Percentage of pixels that produced valid depth. Higher is better. MinZ improves this at close range (<0.55m) by filling holes the hardware camera cannot see.
FPS: Processing speed (frames per second). Higher is faster.
GT?: ★ GT marks the ground truth method (NNDepth accurate). Its error columns show "—" because you don't compare ground truth to itself.
+
+
+

Close Range Analysis

+
+ Close Range Analysis +

Coverage and stability for objects closer than 0.55 m. Highlights MinZ benefit.

+
+
+
+

Timing Bars

+
+ Timing Bars +

Processing speed in FPS. Hardware baseline is fixed at ~30 FPS (camera frame rate).

+
+
+
Generated by tests/depth_report/ on 2026-04-12 14:19:16
+ + \ No newline at end of file diff --git a/reports/faro_benchmark_office/results.json b/reports/faro_benchmark_office/results.json new file mode 100644 index 0000000..30ba0a2 --- /dev/null +++ b/reports/faro_benchmark_office/results.json @@ -0,0 +1,64 @@ +{ + "source": "FARO dataset (/mnt/algonas/Local/Data/Stereo/Faro/FARO_DATA_BASE)", + "n_frames": 445, + "resolution": "1280x720", + "ground_truth": "faro_gt", + "distance_bins": [ + "0\u20131 m", + "1\u20132 m", + "2\u20133 m", + "3\u20135 m" + ], + "model_configs": { + "original": { + "model_path": "/home/adiroha/repos/Fast-FoundationStereo/scripts/../weights/20-30-48/model_best_bp2_serialize.pth" + }, + "finetuned": { + "model_path": "/home/adiroha/repos/Fast-FoundationStereo/scripts/../weights/20-30-48/model_finetuned_faro_kitchen.pth" + } + }, + "methods": { + "faro_gt": { + "label": "FARO GT", + "mae_mean": 0.0, + "mae_std": 0.0, + "rmse_mean": 0.0, + "mre_mean": 0.0, + "delta1_mean": 100.0, + "coverage_mean": 99.13976791705682, + "fps_mean": -1, + "time_ms_mean": 0.0, + "close_range_coverage": 8.539325842696629, + "gpu_load_mean": 0.0, + "is_ground_truth": true + }, + "finetuned": { + "label": "Fine-tuned on FARO", + "mae_mean": 0.017334664322101, + "mae_std": 0.0018994669092165116, + "rmse_mean": 0.06851478966601779, + "mre_mean": 0.010713689330588565, + "delta1_mean": 99.3566269030714, + "coverage_mean": 99.13976084581773, + "fps_mean": 6.591002457733087, + "time_ms_mean": 151.72198863720962, + "close_range_coverage": 8.539325842696629, + "gpu_load_mean": 0.0, + "is_ground_truth": false + }, + "original": { + "label": "Original model", + "mae_mean": 0.019633635941348718, + "mae_std": 0.0025874206605145706, + "rmse_mean": 0.07497013829899637, + "mre_mean": 0.012076542543226414, + "delta1_mean": 99.20257452337185, + "coverage_mean": 99.13975767595193, + "fps_mean": 6.838218937662999, + "time_ms_mean": 146.2369089255507, + "close_range_coverage": 8.539325842696629, + "gpu_load_mean": 0.0, + 
"is_ground_truth": false + } + } +} \ No newline at end of file diff --git a/reports/faro_benchmark_office/summary_table.png b/reports/faro_benchmark_office/summary_table.png new file mode 100644 index 0000000..7a156d5 Binary files /dev/null and b/reports/faro_benchmark_office/summary_table.png differ diff --git a/reports/faro_benchmark_office/timing_bars.png b/reports/faro_benchmark_office/timing_bars.png new file mode 100644 index 0000000..b0be1d1 Binary files /dev/null and b/reports/faro_benchmark_office/timing_bars.png differ diff --git a/reports/inbolt_benchmark/close_range_analysis.png b/reports/inbolt_benchmark/close_range_analysis.png new file mode 100644 index 0000000..e543bb0 Binary files /dev/null and b/reports/inbolt_benchmark/close_range_analysis.png differ diff --git a/reports/inbolt_benchmark/coverage_heatmaps.png b/reports/inbolt_benchmark/coverage_heatmaps.png new file mode 100644 index 0000000..273f28c Binary files /dev/null and b/reports/inbolt_benchmark/coverage_heatmaps.png differ diff --git a/reports/inbolt_benchmark/depth_comparison.png b/reports/inbolt_benchmark/depth_comparison.png new file mode 100644 index 0000000..6bd5cd9 Binary files /dev/null and b/reports/inbolt_benchmark/depth_comparison.png differ diff --git a/reports/inbolt_benchmark/depth_scale_comparison.png b/reports/inbolt_benchmark/depth_scale_comparison.png new file mode 100644 index 0000000..27c0799 Binary files /dev/null and b/reports/inbolt_benchmark/depth_scale_comparison.png differ diff --git a/reports/inbolt_benchmark/depth_vs_distance.png b/reports/inbolt_benchmark/depth_vs_distance.png new file mode 100644 index 0000000..2f49bde Binary files /dev/null and b/reports/inbolt_benchmark/depth_vs_distance.png differ diff --git a/reports/inbolt_benchmark/distance_error_curve.png b/reports/inbolt_benchmark/distance_error_curve.png new file mode 100644 index 0000000..b800cb1 Binary files /dev/null and b/reports/inbolt_benchmark/distance_error_curve.png differ diff --git 
a/reports/inbolt_benchmark/error_histograms.png b/reports/inbolt_benchmark/error_histograms.png new file mode 100644 index 0000000..5c1b221 Binary files /dev/null and b/reports/inbolt_benchmark/error_histograms.png differ diff --git a/reports/inbolt_benchmark/error_maps.png b/reports/inbolt_benchmark/error_maps.png new file mode 100644 index 0000000..57dfcc9 Binary files /dev/null and b/reports/inbolt_benchmark/error_maps.png differ diff --git a/reports/inbolt_benchmark/index.html b/reports/inbolt_benchmark/index.html new file mode 100644 index 0000000..ac8de6d --- /dev/null +++ b/reports/inbolt_benchmark/index.html @@ -0,0 +1,141 @@ + + + + + + Depth Quality Report — rs-enhanced-depth + + + +

Depth Quality Report

+

rs-enhanced-depth — multi-method depth quality analysis

+ +
Source: INBOLT dataset (/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260322T091926Z-1-001/Data Collection)
Frames: 18
Resolution: 640×480
Ground truth: INBOLT GT
Methods evaluated: INBOLT GT, Fine-tuned on INBOLT, Original model
Generated: 2026-03-25 12:01:18
+ +
+

Model Configuration

+ + + + +
Method | max_disp | valid_iters | Engine Resolution | Engine Dir
Original model
Fine-tuned on INBOLT
+
+ +
+

Depth Comparison

+
+ Depth Comparison +

Side-by-side depth maps from a single representative frame. Invalid pixels are black.

+
+
+
+

Error Maps

+
+ Error Maps +

Per-pixel absolute error |pred − GT| clipped at 0.5 m. Brighter = more error.

+
+
+
+

Coverage Heatmaps

+
+ Coverage Heatmaps +

Fraction of frames each pixel has valid depth, averaged over all benchmark frames.

+
+
+
+

Distance Error Curve

+
+ Distance Error Curve +

Mean Absolute Error (MAE) broken down by distance range.

+
+
+
+

Error Histograms

+
+ Error Histograms +

Distribution of per-pixel absolute errors from the stored visualisation frames.

+
+
+
+

Summary Table

+
+ Summary Table +

Aggregate quality metrics — see legend below the table for column explanations.

+
+
+
+

How to Read the Summary Table

+ + + + + + + + + + + + + + + +
MRE* (%): Overall score (recommended). Mean Relative Error with hole penalty — pixels where the method has no depth but ground truth does count as 100% error. This is the fairest single metric because it penalises both inaccuracy and missing coverage. Lower is better.
MRE (%): Mean Relative Error over valid pixels only (holes ignored). 5% means each measured pixel is ~5% off on average. Lower is better.
MAE (m): Mean Absolute Error in meters, valid pixels only. Lower is better.
δ1 (%): Percentage of valid pixels within 1.25× of ground truth depth. Higher is better. 100% is perfect.
Coverage (%): Percentage of pixels that produced valid depth. Higher is better. MinZ improves this at close range (<0.55m) by filling holes the hardware camera cannot see.
FPS: Processing speed (frames per second). Higher is faster.
GT?: ★ GT marks the ground truth method (NNDepth accurate). Its error columns show "—" because you don't compare ground truth to itself.
+
+
+

Close Range Analysis

+
+ Close Range Analysis +

Coverage and stability for objects closer than 0.55 m. Highlights MinZ benefit.

+
+
+
+

Timing Bars

+
+ Timing Bars +

Processing speed in FPS. Hardware baseline is fixed at ~30 FPS (camera frame rate).

+
+
+
Generated by tests/depth_report/ on 2026-03-25 12:01:18
+ + \ No newline at end of file diff --git a/reports/inbolt_benchmark/results.json b/reports/inbolt_benchmark/results.json new file mode 100644 index 0000000..33e451b --- /dev/null +++ b/reports/inbolt_benchmark/results.json @@ -0,0 +1,64 @@ +{ + "source": "INBOLT dataset (/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260322T091926Z-1-001/Data Collection)", + "n_frames": 18, + "resolution": "640x480", + "ground_truth": "inbolt_gt", + "distance_bins": [ + "0\u20131 m", + "1\u20132 m", + "2\u20133 m", + "3\u20135 m" + ], + "model_configs": { + "original": { + "model_path": "/home/adiroha/repos/Fast-FoundationStereo/scripts/../weights/20-30-48/model_best_bp2_serialize.pth" + }, + "finetuned": { + "model_path": "/home/adiroha/repos/Fast-FoundationStereo/scripts/../weights/20-30-48/model_finetuned_inbolt.pth" + } + }, + "methods": { + "inbolt_gt": { + "label": "INBOLT GT", + "mae_mean": 0.0, + "mae_std": 0.0, + "rmse_mean": 0.0, + "mre_mean": 0.0, + "delta1_mean": 100.0, + "coverage_mean": 82.82678674768518, + "fps_mean": -1, + "time_ms_mean": 0.0, + "close_range_coverage": 11.11111111111111, + "gpu_load_mean": 0.0, + "is_ground_truth": true + }, + "finetuned": { + "label": "Fine-tuned on INBOLT", + "mae_mean": 1.6766890618536208, + "mae_std": 0.5363837577208318, + "rmse_mean": 1.706638468636407, + "mre_mean": 1.5533304479387071, + "delta1_mean": 0.0, + "coverage_mean": 82.82678674768518, + "fps_mean": 5.817052135797856, + "time_ms_mean": 171.9083784458538, + "close_range_coverage": 11.11111111111111, + "gpu_load_mean": 0.0, + "is_ground_truth": false + }, + "original": { + "label": "Original model", + "mae_mean": 2.0390112466282315, + "mae_std": 0.5198313899331768, + "rmse_mean": 2.0891922778553433, + "mre_mean": 1.9439863959948223, + "delta1_mean": 0.0, + "coverage_mean": 82.82678674768518, + "fps_mean": 21.39207336477468, + "time_ms_mean": 46.74628695162634, + "close_range_coverage": 11.11111111111111, + "gpu_load_mean": 0.0, + 
"is_ground_truth": false + } + } +} \ No newline at end of file diff --git a/reports/inbolt_benchmark/summary_table.png b/reports/inbolt_benchmark/summary_table.png new file mode 100644 index 0000000..0b6f4c4 Binary files /dev/null and b/reports/inbolt_benchmark/summary_table.png differ diff --git a/reports/inbolt_benchmark/timing_bars.png b/reports/inbolt_benchmark/timing_bars.png new file mode 100644 index 0000000..b7d5f9e Binary files /dev/null and b/reports/inbolt_benchmark/timing_bars.png differ diff --git a/reports/inbolt_ffs_benchmark/depth_comparison_zivid_rs_ffs.png b/reports/inbolt_ffs_benchmark/depth_comparison_zivid_rs_ffs.png new file mode 100644 index 0000000..f749725 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/depth_comparison_zivid_rs_ffs.png differ diff --git a/reports/inbolt_ffs_benchmark/depth_noise_comparison_ffs.png b/reports/inbolt_ffs_benchmark/depth_noise_comparison_ffs.png new file mode 100644 index 0000000..7933c80 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/depth_noise_comparison_ffs.png differ diff --git a/reports/inbolt_ffs_benchmark/depth_scale_comparison.png b/reports/inbolt_ffs_benchmark/depth_scale_comparison.png new file mode 100644 index 0000000..8deb338 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/depth_scale_comparison.png differ diff --git a/reports/inbolt_ffs_benchmark/depth_scale_comparison_ffs.png b/reports/inbolt_ffs_benchmark/depth_scale_comparison_ffs.png new file mode 100644 index 0000000..8deb338 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/depth_scale_comparison_ffs.png differ diff --git a/reports/inbolt_ffs_benchmark/depth_scale_comparison_inbolt.png b/reports/inbolt_ffs_benchmark/depth_scale_comparison_inbolt.png new file mode 100644 index 0000000..3df6090 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/depth_scale_comparison_inbolt.png differ diff --git a/reports/inbolt_ffs_benchmark/depth_scale_comparison_wrong_bf.png 
b/reports/inbolt_ffs_benchmark/depth_scale_comparison_wrong_bf.png new file mode 100644 index 0000000..0f1fbe6 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/depth_scale_comparison_wrong_bf.png differ diff --git a/reports/inbolt_ffs_benchmark/error_000_inputs.png b/reports/inbolt_ffs_benchmark/error_000_inputs.png new file mode 100644 index 0000000..8f964e9 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/error_000_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/error_001_inputs.png b/reports/inbolt_ffs_benchmark/error_001_inputs.png new file mode 100644 index 0000000..ddfb697 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/error_001_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/error_002_inputs.png b/reports/inbolt_ffs_benchmark/error_002_inputs.png new file mode 100644 index 0000000..23f4e41 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/error_002_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/error_003_inputs.png b/reports/inbolt_ffs_benchmark/error_003_inputs.png new file mode 100644 index 0000000..f205448 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/error_003_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/error_004_inputs.png b/reports/inbolt_ffs_benchmark/error_004_inputs.png new file mode 100644 index 0000000..c5d5278 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/error_004_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/error_005_inputs.png b/reports/inbolt_ffs_benchmark/error_005_inputs.png new file mode 100644 index 0000000..704e62a Binary files /dev/null and b/reports/inbolt_ffs_benchmark/error_005_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/error_006_inputs.png b/reports/inbolt_ffs_benchmark/error_006_inputs.png new file mode 100644 index 0000000..220bb27 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/error_006_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/error_007_inputs.png 
b/reports/inbolt_ffs_benchmark/error_007_inputs.png new file mode 100644 index 0000000..197fdd9 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/error_007_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/error_008_inputs.png b/reports/inbolt_ffs_benchmark/error_008_inputs.png new file mode 100644 index 0000000..7edac6e Binary files /dev/null and b/reports/inbolt_ffs_benchmark/error_008_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/sample_000_inputs.png b/reports/inbolt_ffs_benchmark/sample_000_inputs.png new file mode 100644 index 0000000..4cd3ccd Binary files /dev/null and b/reports/inbolt_ffs_benchmark/sample_000_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/sample_001_inputs.png b/reports/inbolt_ffs_benchmark/sample_001_inputs.png new file mode 100644 index 0000000..5abc63d Binary files /dev/null and b/reports/inbolt_ffs_benchmark/sample_001_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/sample_002_inputs.png b/reports/inbolt_ffs_benchmark/sample_002_inputs.png new file mode 100644 index 0000000..4f4c761 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/sample_002_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/sample_003_inputs.png b/reports/inbolt_ffs_benchmark/sample_003_inputs.png new file mode 100644 index 0000000..e9f3ced Binary files /dev/null and b/reports/inbolt_ffs_benchmark/sample_003_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/sample_004_inputs.png b/reports/inbolt_ffs_benchmark/sample_004_inputs.png new file mode 100644 index 0000000..ecfaae1 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/sample_004_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/sample_005_inputs.png b/reports/inbolt_ffs_benchmark/sample_005_inputs.png new file mode 100644 index 0000000..a0d3082 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/sample_005_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/sample_006_inputs.png 
b/reports/inbolt_ffs_benchmark/sample_006_inputs.png new file mode 100644 index 0000000..b8f30c6 Binary files /dev/null and b/reports/inbolt_ffs_benchmark/sample_006_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/sample_007_inputs.png b/reports/inbolt_ffs_benchmark/sample_007_inputs.png new file mode 100644 index 0000000..3d567de Binary files /dev/null and b/reports/inbolt_ffs_benchmark/sample_007_inputs.png differ diff --git a/reports/inbolt_ffs_benchmark/sample_008_inputs.png b/reports/inbolt_ffs_benchmark/sample_008_inputs.png new file mode 100644 index 0000000..5ce370f Binary files /dev/null and b/reports/inbolt_ffs_benchmark/sample_008_inputs.png differ diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..f775d3d --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1 @@ +# just to create a module \ No newline at end of file diff --git a/scripts/benchmark_faro.py b/scripts/benchmark_faro.py new file mode 100644 index 0000000..bdb6a90 --- /dev/null +++ b/scripts/benchmark_faro.py @@ -0,0 +1,225 @@ +"""Benchmark original vs fine-tuned FastFoundationStereo on the FARO dataset. + +Loads both models, runs inference on all FARO samples, computes depth quality +metrics against FARO scanner ground truth, and produces an HTML report. 
+ +Usage: + cd /home/adiroha/repos/Fast-FoundationStereo + python scripts/benchmark_faro.py [--out_dir reports/faro_benchmark] +""" + +import argparse +import logging +import os +import sys +import time +from pathlib import Path + +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import numpy as np +import torch + +from core.utils.utils import InputPadder +import Utils as U +from scripts.data_manager_faro import DataSource +from metrics import ( + BenchmarkResults, + FrameMetrics, + compute_metrics, + compute_bin_mae, + aggregate, + CLOSE_RANGE_THRESHOLD_M, +) +from report import ReportGenerator + + +# ── constants ──────────────────────────────────────────────────────────────── + +FARO_DIR = r'/mnt/algonas/Local/Data/Stereo/Faro/FARO_DATA_BASE' +#FARO_DIR = r'data/faro' # local path to FARO dataset --- IGNORE --- +ORIGINAL_PATH = f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth' +FINETUNED_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_faro_kitchen_epoch_006_epoch_013.pth' +DEFAULT_OUT = f'{code_dir}/../reports/benchmark_faro_office_013' + +BF = 49470.45 # focal_px * baseline_mm (calibrated from camera) +ITERS = 8 # GRU iterations +N_VIZ = 5 # number of frames saved for visual comparison in report + +METHODS = { + "original": {"label": "Original model", "color": "#2980b9"}, + "finetuned": {"label": "Fine-tuned on FARO", "color": "#e74c3c"}, + "faro_gt": {"label": "FARO GT", "color": "#27ae60"}, +} +GT_NAME = "faro_gt" + + +# ── inference helpers ───────────────────────────────────────────────────────── + +def _preprocess_ir(left: np.ndarray, right: np.ndarray): + """Convert uint16 IR images to CUDA float tensors (3-channel pseudo-RGB).""" + left = np.clip(left.astype(np.float32), 0, 255) + right = np.clip(right.astype(np.float32), 0, 255) + left = np.stack([left, left, left], axis=-1) # H×W×3 + right = np.stack([right, right, right], axis=-1) + left_t = 
def main():
    """Benchmark the original and (optionally) fine-tuned stereo models against FARO GT.

    Steps:
      1. Parse CLI options and create the output directory.
      2. Load the models; the fine-tuned one is skipped with a warning when its
         weight file does not exist.
      3. Iterate over every sample returned by ``DataSource``, run inference,
         and accumulate per-frame metrics.  The FARO depth is divided by 1000
         (mm → m) before it is compared with the predictions.
      4. Aggregate the results and hand them to ``ReportGenerator``.

    Returns early (after logging an error) when the dataset yields no samples.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report')
    parser.add_argument('--faro_dir', default=FARO_DIR, help='Path to FARO dataset root')
    parser.add_argument('--original', default=ORIGINAL_PATH, help='Path to original model weights')
    parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights')
    parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison')
    args = parser.parse_args()

    U.set_logging_format()
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # ── load models ──────────────────────────────────────────────────────────
    # Insertion order matters: it defines the method order used in the report.
    models = {}
    if Path(args.finetuned).exists():
        models["finetuned"] = load_model(args.finetuned)
    else:
        logging.warning(f"Fine-tuned model not found at {args.finetuned} — skipping")

    models["original"] = load_model(args.original)

    active_methods = [GT_NAME] + list(models.keys())

    # ── dataset ───────────────────────────────────────────────────────────────
    source = DataSource()
    n = source.init_directory(input_rectified=args.faro_dir, test_keywords=['OFFICE'], split='train')
    logging.info(f"Found {n} samples in {args.faro_dir}")
    if n == 0:
        logging.error("No samples found — check FARO_DIR path")
        return

    # ── accumulators ──────────────────────────────────────────────────────────
    all_metrics = []
    viz_frames = []
    valid_acc = {}  # per-method valid-pixel counts; allocated on the first frame
    dist_bin_mae = {m: [] for m in active_methods}
    close_range_valid = {m: [] for m in active_methods}
    timing_ms_raw = {m: [] for m in models}  # only the NN models are timed
    H = W = None

    for idx in range(n):
        data = source.get_item(idx)
        left = data['left']
        right = data['right']
        gt_mm = data['depth_faro'].astype(np.float32)

        if H is None:
            # Image size is only known once the first frame has been read.
            H, W = gt_mm.shape[:2]
            for m in active_methods:
                valid_acc[m] = np.zeros((H, W), np.float32)

        gt_m = gt_mm / 1000.0  # mm → m

        # run inference for each model
        frame_depths = {GT_NAME: gt_m}
        for mname, model in models.items():
            t0 = time.monotonic()
            frame_depths[mname] = infer_depth_m(model, left, right)
            timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0)

        # per-frame metrics
        gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M)
        n_close = int(gt_close_mask.sum())

        for mname in active_methods:
            pred = frame_depths[mname]
            valid_acc[mname] += (pred > 0).astype(np.float32)

            if mname == GT_NAME:
                # GT compared with itself: zero error, 100 % inliers; only the
                # coverage (share of non-zero GT pixels) is measured.
                fm = FrameMetrics(GT_NAME, 0.0, 0.0, 0.0, 100.0,
                                  float((pred > 0).mean()) * 100.0, 0.0,
                                  mae_pen=0.0, mre_pen=0.0)
            else:
                fm = compute_metrics(pred, gt_m, timing_ms_raw[mname][-1], mname)
            all_metrics.append(fm)

            dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m))

            # Share of close-range GT pixels for which the method produced a depth.
            close_cov = float((pred[gt_close_mask] > 0).mean()) * 100.0 if n_close > 0 else 0.0
            close_range_valid[mname].append(close_cov)

        if idx < args.n_viz:
            # Copy so later in-place changes cannot alter the stored frames.
            viz_frames.append({k: v.copy() for k, v in frame_depths.items()})

        if (idx + 1) % 200 == 0 or (idx + 1) == n:
            logging.info(f" {idx + 1}/{n} frames processed")

    # normalise coverage maps to [0, 1]
    for m in active_methods:
        valid_acc[m] /= max(n, 1)

    # ── aggregate timing ──────────────────────────────────────────────────────
    mean_timing = {m: float(np.mean(ts)) if ts else 0.0 for m, ts in timing_ms_raw.items()}
    mean_timing[GT_NAME] = 0.0

    # ── build BenchmarkResults ────────────────────────────────────────────────
    # Only describe the models that were actually evaluated; previously the
    # fine-tuned entry was reported even when its weights were missing.
    method_configs = {"original": {"model_path": args.original}}
    if "finetuned" in models:
        method_configs["finetuned"] = {"model_path": args.finetuned}

    results = BenchmarkResults(
        method_names=active_methods,
        method_labels={m: METHODS[m]["label"] for m in active_methods},
        method_colors={m: METHODS[m]["color"] for m in active_methods},
        ground_truth_name=GT_NAME,
        n_frames=n,
        width=W,
        height=H,
        all_metrics=all_metrics,
        viz_frames=viz_frames,
        coverage_maps=valid_acc,
        dist_bin_mae=dist_bin_mae,
        close_range_valid=close_range_valid,
        source=f"FARO dataset ({args.faro_dir})",
        method_configs=method_configs,
    )

    stats = aggregate(results, mean_timing)

    # ── generate report ───────────────────────────────────────────────────────
    reporter = ReportGenerator(results, stats, out_dir)
    reporter.generate()
+ Lower is better. Sensitive to large outlier errors. + +MAE* / MRE* (pen.) Penalised variants of MAE / MRE. Pixels where the model produces + no depth (holes) are penalised: MAE* counts the full GT depth as the + error, MRE* counts a 100% relative error. Rewards methods with high + coverage — a sensor that refuses to predict on anything has 100% MRE*. + +RMSE (mm) Root Mean Square Error — sqrt(mean(|pred − GT|²)). + Penalises large individual errors more than MAE. Lower is better. + +MRE (%) Mean Relative Error — mean(|pred − GT| / GT) × 100. + Scale-independent; allows fair comparison across depth ranges. + Lower is better. + +δ1 (%) Inlier accuracy — % of valid pixels where + max(pred/GT, GT/pred) < 1.25. Higher is better. + A single threshold commonly used in stereo depth benchmarks. + +Coverage (%) % of image pixels where both prediction and GT are non-zero. + Higher is better. Sensors with many holes score lower. + +Close-range cov. Coverage restricted to pixels where GT < 550 mm. Important for + near-field robotics and manipulation tasks. Higher is better. + +Latency (ms) Wall-clock inference time per frame (model only; data loading excluded). + Hardware sensors (RealSense) are treated as fixed 30 FPS devices. + +FPS Frames per second = 1000 / latency_ms. 
+ +Usage: + cd /home/adiroha/repos/Fast-FoundationStereo + python scripts/benchmark_faro_rs.py [--out_dir reports/benchmark_faro_rs] +""" + +import argparse +import logging +import os +import sys +import time +from pathlib import Path +from typing import Dict, List, Tuple + +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import numpy as np +import torch + +from core.utils.utils import InputPadder +import Utils as U +from scripts.data_manager_faro import DataSource +from metrics import ( + BenchmarkResults, + FrameMetrics, + compute_metrics, + aggregate, +) +from report import ReportGenerator + + +# ── constants ───────────────────────────────────────────────────────────────── + +FARO_DIR = r'/mnt/algonas/Local/Data/Stereo/Faro/FARO_DATA_BASE' +ORIGINAL_PATH = f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth' +FINETUNED_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_faro_kitchen_epoch_006_epoch_013.pth' +DEFAULT_OUT = f'{code_dir}/../reports/benchmark_faro_rs_bathroom' +DATASET_KEYWORDS = ['BATHROOM'] # ['OFFICE', 'KITCHEN'] # filter dataset paths containing any of these (case-insensitive) + +BF = 49470.45 # focal_px × baseline_mm (calibrated from RealSense stereo pair) +ITERS = 8 # GRU update iterations +N_VIZ = 5 # frames saved for visual comparison in the report + +# Depth threshold for the "close-range" coverage metric — in mm +CLOSE_RANGE_THRESHOLD_MM = 550.0 + +# Distance bins used for the per-bin MAE curve — all in mm +DIST_BINS_MM: List[Tuple[float, float]] = [ + (0.0, 500.0), + (500.0, 1000.0), + (1000.0, 2000.0), + (2000.0, 3000.0) +] +BIN_LABELS_MM = ["0–500 mm", "500–1000 mm", "1000–2000 mm", "2000–3000 mm"] +BIN_CENTERS_MM = [250.0, 750.0, 1500.0, 2500.0] + +METHODS: Dict[str, Dict[str, str]] = { + "original": {"label": "FFS Original", "color": "#2980b9"}, + "finetuned": {"label": "FFS 
def compute_bin_mae_mm(pred_mm: np.ndarray, gt_mm: np.ndarray, bins=None) -> List[float]:
    """Return the mean absolute error (mm) of *pred_mm* per GT-distance bin.

    Parameters
    ----------
    pred_mm : predicted depth map; values <= 0 are treated as "no depth".
    gt_mm   : ground-truth depth map of the same shape; values <= 0 are ignored.
    bins    : optional sequence of ``(lo, hi)`` pairs, each selecting pixels with
              ``lo <= gt < hi``.  Defaults to the module-level ``DIST_BINS_MM``.

    Returns
    -------
    One float per bin, in bin order.  A bin that contains no pixel with both a
    valid GT and a valid prediction yields ``NaN``.
    """
    if bins is None:
        bins = DIST_BINS_MM

    # Pixels usable in any bin: both GT and prediction carry a depth.
    both_valid = (gt_mm > 0) & (pred_mm > 0)

    result = []
    for lo, hi in bins:
        mask = both_valid & (gt_mm >= lo) & (gt_mm < hi)
        if not mask.any():
            result.append(float("nan"))
        else:
            result.append(float(np.abs(pred_mm[mask] - gt_mm[mask]).mean()))
    return result
class ReportGeneratorMM(ReportGenerator):
    """ReportGenerator subclass with all axis labels and colorbars in mm.

    Overrides the figure builders of the base class so that every depth or
    error value is presented in millimetres.  The same randomly chosen (but
    seeded) subset of visualisation frames is used by every figure that shows
    individual frames.

    NOTE(review): ``self._r``, ``self._stats``, ``self._gt``, ``self._non_gt``,
    ``self._save``, ``self._empty_fig``, ``self._depth_cmap`` and
    ``self._grid_layout`` are assumed to be provided by ``ReportGenerator``.
    """

    # Bin constants injected by the benchmark script
    _bin_labels = BIN_LABELS_MM
    _bin_centers = BIN_CENTERS_MM

    def __init__(self, results, stats, output_dir) -> None:
        super().__init__(results, stats, output_dir)
        # Filled lazily by _get_selected_viz_indices().
        self._selected_viz_indices: List[int] = []

    @staticmethod
    def _frames_phrase(n_frames: int) -> str:
        """Return e.g. ``"4 random frames"`` / ``"1 random frame"`` for figure titles."""
        return f"{n_frames} random frame{'' if n_frames == 1 else 's'}"

    def _get_selected_viz_indices(self, n_pick: int = 4) -> List[int]:
        """Return cached random frame indices used consistently across report sections.

        The first non-empty selection is cached and returned by every later
        call, regardless of the ``n_pick`` passed then.
        """
        if self._selected_viz_indices:
            return self._selected_viz_indices

        n_total = len(self._r.viz_frames)
        if n_total == 0:
            self._selected_viz_indices = []
            return self._selected_viz_indices

        n = min(n_pick, n_total)
        # fixed seed for reproducible reports
        rng = np.random.default_rng(42)
        self._selected_viz_indices = sorted(rng.choice(n_total, size=n, replace=False).tolist())
        return self._selected_viz_indices

    def _fig_depth_comparison(self) -> str:
        """Draw a frames × methods grid of depth maps (colour scale 1–5000 mm)."""
        if not self._r.viz_frames:
            return self._empty_fig("depth_comparison.png", "No viz frames")

        sel = self._get_selected_viz_indices(n_pick=4)
        if not sel:
            return self._empty_fig("depth_comparison.png", "No viz frames")

        vf0 = self._r.viz_frames[sel[0]]
        method_names = [n for n in self._r.method_names if n in vf0]
        nrows = len(sel)
        ncols = len(method_names)
        # squeeze=False always yields a 2-D axes array; np.atleast_2d() would
        # turn a (nrows,) array into (1, nrows) and break axes[r, c] indexing.
        fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows),
                                 squeeze=False)
        cmap = self._depth_cmap()

        for r, frame_idx in enumerate(sel):
            vf = self._r.viz_frames[frame_idx]
            for c, name in enumerate(method_names):
                ax = axes[r, c]
                if name not in vf:
                    ax.axis("off")
                    continue
                im = ax.imshow(vf[name], cmap=cmap, vmin=1.0, vmax=5000.0)
                plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="mm")
                title = self._r.method_labels.get(name, name)
                if c == 0:
                    title = f"Frame {frame_idx + 1} • {title}"
                ax.set_title(title, fontsize=9, wrap=True)
                ax.axis("off")

        fig.suptitle(f"Depth Map Comparison ({self._frames_phrase(nrows)}) — values in mm",
                     fontsize=11, y=1.01)
        fig.tight_layout()
        return self._save(fig, "depth_comparison.png")

    def _fig_error_maps(self) -> str:
        """Draw per-pixel absolute error maps (mm) against the ground truth."""
        if not self._r.viz_frames or not self._non_gt:
            return self._empty_fig("error_maps.png", "No comparison methods")

        sel = self._get_selected_viz_indices(n_pick=4)
        if not sel:
            return self._empty_fig("error_maps.png", "No viz frames")

        vf0 = self._r.viz_frames[sel[0]]
        names = ([self._gt] if self._gt in vf0 else []) + [n for n in self._non_gt if n in vf0]
        if not names:
            return self._empty_fig("error_maps.png", "Ground truth not available in viz frame")

        nrows = len(sel)
        ncols = len(names)
        cmap = plt.get_cmap("hot").copy()
        # Values below vmin (invalid pixels are written as 0) get a dark grey.
        cmap.set_under("#222222")
        fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows),
                                 squeeze=False)

        for r, frame_idx in enumerate(sel):
            vf = self._r.viz_frames[frame_idx]
            gt = vf.get(self._gt)
            if gt is None:
                for c in range(ncols):
                    axes[r, c].axis("off")
                continue

            for c, name in enumerate(names):
                ax = axes[r, c]
                if name not in vf:
                    ax.axis("off")
                    continue
                pred = vf[name]
                valid = (gt > 0) & (pred > 0)
                err = np.where(valid, np.abs(pred - gt), 0.0).astype(np.float32)
                im = ax.imshow(err, cmap=cmap, vmin=1.0, vmax=100.0)
                plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="|error| (mm)")
                mean_err = float(np.abs(pred[valid] - gt[valid]).mean()) if valid.any() else 0.0
                label = self._r.method_labels.get(name, name)
                if c == 0:
                    ax.set_title(f"Frame {frame_idx + 1} • {label}\nMAE={mean_err:.0f} mm", fontsize=9)
                else:
                    ax.set_title(f"{label}\nMAE={mean_err:.0f} mm", fontsize=9)
                ax.axis("off")

        gt_label = self._r.method_labels.get(self._gt, self._gt)
        fig.suptitle(f"Absolute Error vs {gt_label} ({self._frames_phrase(nrows)}, mm)",
                     fontsize=11, y=1.01)
        fig.tight_layout()
        return self._save(fig, "error_maps.png")

    def _fig_distance_error_curve(self) -> str:
        """Plot the mean per-bin MAE (mm) of every non-GT method over distance."""
        if not self._non_gt:
            return self._empty_fig("distance_error_curve.png", "No comparison methods")
        fig, ax = plt.subplots(figsize=(8, 5))
        for name in self._non_gt:
            bin_data = self._r.dist_bin_mae.get(name, [])
            if not bin_data:
                continue
            arr = np.array(bin_data)  # one row per frame, one column per bin
            # A bin without any sample stays NaN so that no marker is drawn
            # for it; plotting 0.0 would look like a perfect measurement.
            mean_per_bin = np.array([
                np.nanmean(arr[:, i]) if np.any(~np.isnan(arr[:, i])) else np.nan
                for i in range(arr.shape[1])
            ])
            color = self._r.method_colors.get(name, "#888")
            label = self._r.method_labels.get(name, name)
            ax.plot(self._bin_centers, mean_per_bin, marker="o", color=color,
                    label=label, linewidth=2, markersize=7)
        ax.set_xticks(self._bin_centers)
        ax.set_xticklabels(self._bin_labels, fontsize=9)
        ax.set_xlabel("Distance range", fontsize=10)
        ax.set_ylabel("Mean Absolute Error (mm)", fontsize=10)
        ax.set_title("Depth Error vs Distance", fontsize=12)
        ax.legend(fontsize=9)
        ax.grid(alpha=0.3)
        fig.tight_layout()
        return self._save(fig, "distance_error_curve.png")

    def _fig_error_histograms(self) -> str:
        """Plot one histogram of per-pixel absolute errors (0–500 mm) per method."""
        if not self._non_gt or not self._r.viz_frames:
            return self._empty_fig("error_histograms.png", "No comparison data")
        names = [n for n in self._non_gt
                 if any(n in vf and self._gt in vf for vf in self._r.viz_frames)]
        if not names:
            return self._empty_fig("error_histograms.png", "No viz data for comparison")
        n = len(names)
        nrows, ncols = self._grid_layout(n)
        fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 4 * nrows))
        axes = np.asarray(axes).flatten().tolist()
        for ax, name in zip(axes, names):
            # Collect errors as arrays; a Python list with one float per pixel
            # would cost far more time and memory.
            chunks = []
            for vf in self._r.viz_frames:
                if name not in vf or self._gt not in vf:
                    continue
                pred, gt = vf[name], vf[self._gt]
                valid = (gt > 0) & (pred > 0)
                if valid.any():
                    chunks.append(np.abs(pred[valid] - gt[valid]).astype(np.float64))
            if not chunks:
                ax.text(0.5, 0.5, "No data", transform=ax.transAxes,
                        ha="center", va="center", color="gray")
                continue
            errors = np.concatenate(chunks)
            color = self._r.method_colors.get(name, "#888")
            label = self._r.method_labels.get(name, name)
            ax.hist(errors, bins=50, range=(0.0, 500.0), color=color,
                    alpha=0.8, edgecolor="none")
            # The mean covers all errors, including those beyond the plotted range.
            mean_e = float(np.mean(errors))
            ax.axvline(mean_e, color="red", linestyle="--", linewidth=1.5,
                       label=f"mean={mean_e:.0f} mm")
            ax.set_xlabel("Absolute error (mm)", fontsize=9)
            ax.set_ylabel("Pixel count", fontsize=9)
            ax.set_title(label, fontsize=9)
            ax.legend(fontsize=8)
        for ax in axes[n:]:
            ax.axis("off")
        fig.suptitle("Per-Pixel Error Distribution (vs GT, viz frames)", fontsize=11)
        fig.tight_layout()
        return self._save(fig, "error_histograms.png")

    def _fig_summary_table(self) -> str:
        """Render the per-method summary statistics as a table image (GT row first)."""
        if not self._stats:
            return self._empty_fig("summary_table.png", "No stats")
        cols = ["Method", "MRE* (%)", "MRE (%)", "MAE (mm)", "δ1 (%)",
                "Coverage (%)", "FPS", "GPU %", "GT?"]
        gt_rows, other_rows = [], []
        for name, s in self._stats.items():
            is_gt = (name == self._gt)
            # Error columns are meaningless for the reference itself → "—".
            row = [
                s.label,
                "—" if is_gt else f"{s.mre_pen_mean * 100:.1f}",
                "—" if is_gt else f"{s.mre_mean * 100:.1f}",
                "—" if is_gt else f"{s.mae_mean:.1f}",
                "—" if is_gt else f"{s.delta1_mean:.1f}",
                f"{s.coverage_mean:.1f}",
                f"{s.fps_mean:.1f}" if s.fps_mean < 999 else "≈30",
                f"{s.gpu_load_mean:.0f}" if s.gpu_load_mean > 0 else "—",
                "★ GT" if is_gt else "",
            ]
            (gt_rows if is_gt else other_rows).append((name, row))
        ordered = gt_rows + other_rows
        cell_text = [r for _, r in ordered]
        n = len(ordered)
        fig, ax = plt.subplots(figsize=(14, 1.0 + 0.55 * n))
        ax.axis("off")
        table = ax.table(cellText=cell_text, colLabels=cols,
                         cellLoc="center", loc="center")
        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1, 1.6)
        for (row, _col), cell in table.get_celld().items():
            # Row 0 is the header; data rows start at 1.
            if row == 0:
                cell.set_facecolor("#2c3e50")
                cell.set_text_props(color="white", fontweight="bold")
            elif row % 2 == 0:
                cell.set_facecolor("#f7f7f7")
            cell.set_edgecolor("#cccccc")
            if row > 0 and cell_text[row - 1][-1] == "★ GT":
                cell.set_facecolor("#d5f5d5")
        ax.set_title("Depth Quality Summary (errors in mm)", fontsize=12,
                     pad=10, fontweight="bold")
        fig.tight_layout()
        return self._save(fig, "summary_table.png")

    def _fig_close_range_analysis(self) -> str:
        """Plot close-range coverage: a per-method bar chart and a per-frame curve."""
        names = list(self._r.method_names)
        if not names:
            return self._empty_fig("close_range_analysis.png", "No methods")
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        labels = [self._r.method_labels.get(n, n) for n in names]
        coverages = [self._stats[n].close_range_coverage if n in self._stats else 0.0
                     for n in names]
        colors = [self._r.method_colors.get(n, "#888") for n in names]
        bars = ax1.bar(labels, coverages, color=colors, alpha=0.85, edgecolor="white")
        ax1.bar_label(bars, labels=[f"{v:.1f}%" for v in coverages], padding=3, fontsize=7)
        ax1.set_ylabel(f"Coverage at < {CLOSE_RANGE_THRESHOLD_MM:.0f} mm (%)", fontsize=10)
        ax1.set_title(f"Close-Range Coverage (< {CLOSE_RANGE_THRESHOLD_MM:.0f} mm)", fontsize=11)
        ax1.tick_params(axis="x", rotation=45, labelsize=7)
        ax1.set_xticklabels(labels, ha="right")
        ax1.set_ylim(0, 115)
        ax1.grid(axis="y", alpha=0.3)
        for name in names:
            vals = self._r.close_range_valid.get(name, [])
            if not vals:
                continue
            color = self._r.method_colors.get(name, "#888")
            label = self._r.method_labels.get(name, name)
            ax2.plot(range(1, len(vals) + 1), vals, color=color,
                     label=label, alpha=0.8, linewidth=1.5)
        ax2.set_xlabel("Frame", fontsize=10)
        ax2.set_ylabel(f"Coverage at < {CLOSE_RANGE_THRESHOLD_MM:.0f} mm (%)", fontsize=10)
        ax2.set_title("Close-Range Coverage per Frame", fontsize=11)
        ax2.legend(fontsize=6, loc="best")
        ax2.grid(alpha=0.3)
        ax2.set_ylim(-5, 115)
        fig.suptitle("Close-Range Depth Analysis", fontsize=13, fontweight="bold")
        fig.tight_layout()
        return self._save(fig, "close_range_analysis.png")
logging.warning(f"Fine-tuned model not found at {args.finetuned} — skipping") + + models["original"] = load_model(args.original) + + # active_methods includes GT, RS hardware, and all NN models + active_methods = [GT_NAME, RS_NAME] + list(models.keys()) + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = source.init_directory( + input_rectified=args.faro_dir, + test_keywords=args.keywords, + split=args.split, + ) + logging.info(f"Found {n} samples in {args.faro_dir}") + if n == 0: + logging.error("No samples found — check FARO_DIR path") + return + + # ── accumulators ────────────────────────────────────────────────────────── + all_metrics = [] + viz_frames = [] + valid_acc = {} # initialised on first frame + dist_bin_mae = {m: [] for m in active_methods} + close_range_valid = {m: [] for m in active_methods} + timing_ms_raw = {m: [] for m in models} # only NN models have inference latency + H = W = None + + for idx in range(n): + data = source.get_item(idx) + left = data['left'] + right = data['right'] + gt_mm = data['depth_faro'].astype(np.float32) # FARO GT — already in mm + rs_mm = data['depth_rs'].astype(np.float32) # RealSense hardware — already in mm + + if H is None: + H, W = gt_mm.shape[:2] + for m in active_methods: + valid_acc[m] = np.zeros((H, W), np.float32) + + # ── run NN inference (returns mm) ───────────────────────────────────── + frame_depths = {GT_NAME: gt_mm, RS_NAME: rs_mm} + for mname, model in models.items(): + t0 = time.monotonic() + frame_depths[mname] = infer_depth_mm(model, left, right) + timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + + # ── per-frame metrics (all values in mm) ────────────────────────────── + gt_close_mask = (gt_mm > 0) & (gt_mm < CLOSE_RANGE_THRESHOLD_MM) + n_close = int(gt_close_mask.sum()) + + for mname in active_methods: + pred = frame_depths[mname] + valid_acc[mname] += (pred > 0).astype(np.float32) + + if mname == GT_NAME: + fm = 
FrameMetrics( + GT_NAME, 0.0, 0.0, 0.0, 100.0, + float((pred > 0).mean()) * 100.0, 0.0, + mae_pen=0.0, mre_pen=0.0, + ) + elif mname == RS_NAME: + # RealSense hardware depth — compare against FARO GT (both in mm) + fm = compute_metrics(pred, gt_mm, elapsed_ms=0.0, method_name=RS_NAME) + else: + fm = compute_metrics(pred, gt_mm, timing_ms_raw[mname][-1], mname) + + all_metrics.append(fm) + dist_bin_mae[mname].append(compute_bin_mae_mm(pred, gt_mm)) + + close_cov = ( + float((pred[gt_close_mask] > 0).mean()) * 100.0 + if n_close > 0 else 0.0 + ) + close_range_valid[mname].append(close_cov) + + if idx < args.n_viz: + viz_frames.append({k: v.copy() for k, v in frame_depths.items()}) + + if (idx + 1) % 200 == 0 or (idx + 1) == n: + logging.info(f" {idx + 1}/{n} frames processed") + + # normalise coverage maps to [0, 1] + for m in active_methods: + valid_acc[m] /= max(n, 1) + + # ── aggregate timing ────────────────────────────────────────────────────── + mean_timing: Dict[str, float] = { + m: float(np.mean(ts)) if ts else 0.0 + for m, ts in timing_ms_raw.items() + } + # FARO GT has no latency; RealSense is hardware 30 FPS + mean_timing[GT_NAME] = 0.0 + mean_timing[RS_NAME] = 1000.0 / 30.0 # ~33 ms per frame at native 30 FPS + + # ── build BenchmarkResults ──────────────────────────────────────────────── + method_configs = { + "original": {"model_path": args.original}, + } + if "finetuned" in models: + method_configs["finetuned"] = {"model_path": args.finetuned} + method_configs[RS_NAME] = {"source": "RealSense D435 hardware depth (30 FPS)"} + method_configs[GT_NAME] = {"source": "FARO Focus 3D scanner — reference GT"} + + results = BenchmarkResults( + method_names=active_methods, + method_labels={m: METHODS[m]["label"] for m in active_methods}, + method_colors={m: METHODS[m]["color"] for m in active_methods}, + ground_truth_name=GT_NAME, + n_frames=n, + width=W, + height=H, + all_metrics=all_metrics, + viz_frames=viz_frames, + coverage_maps=valid_acc, + 
dist_bin_mae=dist_bin_mae, + close_range_valid=close_range_valid, + source=f"FARO dataset • {args.faro_dir} • split={args.split} • filter={args.keywords}", + method_configs=method_configs, + ) + + # aggregate() computes per-method summaries; pass RS as hardware (30 fps) + stats = aggregate(results, mean_timing) + # Fix fps for hardware depth sensor (aggregate() uses 1000/t_ms) + if RS_NAME in stats: + stats[RS_NAME].fps_mean = 30.0 + + # ── generate report ─────────────────────────────────────────────────────── + reporter = ReportGeneratorMM(results, stats, out_dir) + reporter.generate() + + +if __name__ == '__main__': + main() diff --git a/scripts/benchmark_inbolt.py b/scripts/benchmark_inbolt.py new file mode 100644 index 0000000..aade923 --- /dev/null +++ b/scripts/benchmark_inbolt.py @@ -0,0 +1,690 @@ +"""Benchmark original vs fine-tuned FastFoundationStereo on the Inbolt dataset. + +Loads both models, runs inference on all Inbolt samples, computes depth quality +metrics against Zivid scanner ground truth, and produces an HTML report. + +Also generates depth accuracy and noise plots comparing RealSense stereo predictions +(model output) against Zivid ground-truth depth across distance bins. 
+ +Usage: + cd /home/adiroha/repos/Fast-FoundationStereo + python scripts/benchmark_inbolt.py [--out_dir reports/inbolt_benchmark] +""" + +import argparse +import logging +import os +import sys +import time +from pathlib import Path + +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import numpy as np +import torch +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +from core.utils.utils import InputPadder +import Utils as U + +from scripts.data_manager_inbolt import DataSource + +from metrics import ( + BenchmarkResults, + FrameMetrics, + compute_metrics, + compute_bin_mae, + aggregate, + CLOSE_RANGE_THRESHOLD_M, +) +from report import ReportGenerator + + +# ── constants ──────────────────────────────────────────────────────────────── + +#DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260322T091926Z-1-001/Data Collection' +DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260415T084601Z-3-001/Data Collection' # local path to the dataset +MODEL_PATH = f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth' +FINETUNED_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_inbolt-20260415.pth' +MODEL_PATH = f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth' +FINETUNED_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt-20260415.pth' +DEFAULT_OUT = f'{code_dir}/../reports/inbolt_benchmark' +#FARO_DIR = r'data/faro' # local path to FARO dataset --- IGNORE --- +# ORIGINAL_PATH = f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth' +# FINETUNED_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_faro.pth' +# DEFAULT_OUT = f'{code_dir}/../reports/faro_benchmark' + +BF = 50.102706998586 * 385.509887695312 #49470.45 # focal_px * baseline_mm (calibrated from camera) +ITERS = 8 # GRU iterations +N_VIZ = 5 # number of frames saved for 
class DepthBinAccumulator:
    """Accumulates mean and std-dev of depth values per GT-distance bin.

    Bins are defined by Zivid GT depth: a pixel whose GT depth falls in
    [k * bin_size, (k+1) * bin_size) contributes to bin k.  Pixels whose GT
    depth lies at or beyond ``n_bins * bin_size`` are ignored (they used to be
    folded into the last bin, which distorted its statistics).  Works for any
    sensor (model predictions OR Zivid GT values themselves — the latter gives
    an estimate of within-bin spatial variation / measurement noise).
    """

    def __init__(self, bin_size_m: float = DEPTH_BIN_SIZE_M, max_depth_m: float = MAX_DEPTH_M):
        self.bin_size = bin_size_m
        self.n_bins = int(np.ceil(max_depth_m / bin_size_m))
        # Running per-bin sample count, sum and sum of squares.
        self.count = np.zeros(self.n_bins, dtype=np.float64)
        self.sum_ = np.zeros(self.n_bins, dtype=np.float64)
        self.sum_sq = np.zeros(self.n_bins, dtype=np.float64)

    def update(self, values_m: np.ndarray, gt_m: np.ndarray):
        """Add one frame of data.

        Parameters
        ----------
        values_m : (H, W) array of the depth values to accumulate (model or GT).
        gt_m     : (H, W) array of Zivid GT depths that define which bin each pixel falls in.

        Pixels are used only when both arrays hold a finite, positive value and
        the GT depth falls inside the binned range.
        """
        valid = (gt_m > 0) & (values_m > 0) & np.isfinite(gt_m) & np.isfinite(values_m)
        if not valid.any():
            return
        v_vals = values_m[valid].ravel().astype(np.float64)
        v_gt = gt_m[valid].ravel().astype(np.float64)

        bins = np.floor(v_gt / self.bin_size).astype(np.int64)
        # GT > 0 guarantees bins >= 0, so only the upper bound needs checking.
        in_range = bins < self.n_bins
        if not in_range.any():
            return
        bins = bins[in_range]
        v_vals = v_vals[in_range]

        # bincount performs the same scatter-add as np.add.at, but much faster.
        self.count += np.bincount(bins, minlength=self.n_bins)
        self.sum_ += np.bincount(bins, weights=v_vals, minlength=self.n_bins)
        self.sum_sq += np.bincount(bins, weights=v_vals ** 2, minlength=self.n_bins)

    @property
    def bin_centers(self) -> np.ndarray:
        """Centre of every bin, in the same unit as ``bin_size``."""
        return (np.arange(self.n_bins) + 0.5) * self.bin_size

    def mean(self) -> np.ndarray:
        """Per-bin mean of the accumulated values; NaN for empty bins."""
        c = np.maximum(self.count, 1)  # avoid division by zero in empty bins
        return np.where(self.count > 0, self.sum_ / c, np.nan)

    def std(self) -> np.ndarray:
        """Per-bin population std-dev of the accumulated values; NaN for empty bins."""
        c = np.maximum(self.count, 1)
        m = np.where(self.count > 0, self.sum_ / c, np.nan)
        # Var = E[x²] − E[x]²; clamp at 0 to absorb rounding below zero.
        v = np.where(self.count > 0, self.sum_sq / c - m ** 2, np.nan)
        return np.sqrt(np.maximum(v, 0.0))
def _to_1d_float_array(values, name: str) -> np.ndarray:
    """Flatten *values* into a 1-D float64 array and validate it.

    *name* is only used to build the error messages.

    Raises
    ------
    ValueError
        If the result is empty or contains NaN / infinite entries.
    """
    flat = np.asarray(values, dtype=np.float64).ravel()
    if not flat.size:
        raise ValueError(f"{name} must contain at least one value")
    if not np.isfinite(flat).all():
        raise ValueError(f"{name} contains NaN or infinite values")
    return flat
def build_example_depth_scale_regression_series(gt_delta_mm=None, rs_delta_mm=None, zv_delta_mm=None) -> dict:
    """Return example depth-delta series that reproduces the attached figure.

    Each argument may be ``None`` (the default), in which case built-in example
    values are used; a provided value is passed through unchanged.

    The example values approximate the plot shown in the screenshot:
    - RealSense has a noticeable scale bias.
    - Zivid stays close to the ideal slope of 1.

    Parameters
    ----------
    gt_delta_mm : ground-truth deltas in mm, shared by both series.
    rs_delta_mm : RealSense measured deltas in mm.
    zv_delta_mm : Zivid measured deltas in mm.

    Returns
    -------
    dict
        ``{"realsense": {...}, "zivid": {...}}``; each entry holds
        ``gt_delta_mm``, ``measured_delta_mm``, ``color``, ``marker`` and ``label``.
    """
    if gt_delta_mm is None:
        gt_delta_mm = np.array([0, 100, 200, 300, 400, 500, 600, 700], dtype=np.float64)
    if rs_delta_mm is None:
        rs_delta_mm = np.array([0.0, 104.0, 218.0, 323.0, 433.0, 542.0, 664.0, 754.0], dtype=np.float64)
    if zv_delta_mm is None:
        zv_delta_mm = np.array([0.0, 101.0, 201.0, 301.0, 401.0, 502.0, 602.0, 707.0], dtype=np.float64)

    return {
        "realsense": {
            "gt_delta_mm": gt_delta_mm,
            "measured_delta_mm": rs_delta_mm,
            "color": "#e74c3c",
            "marker": "s",
            "label": "realsense",
        },
        "zivid": {
            "gt_delta_mm": gt_delta_mm,
            "measured_delta_mm": zv_delta_mm,
            "color": "#2980b9",
            "marker": "o",
            "label": "zivid",
        },
    }
+ """ + if not series_map: + raise ValueError("series_map must contain at least one series") + + fig, axes = plt.subplots(1, 2, figsize=(14, 6)) + fit_results = [] + max_x = 0.0 + max_y = 0.0 + + for default_name, cfg in series_map.items(): + result = fit_depth_scale_regression( + gt_delta_mm=cfg["gt_delta_mm"], + measured_delta_mm=cfg["measured_delta_mm"], + fit_intercept=fit_intercept, + ) + result["label"] = cfg.get("label", default_name) + result["color"] = cfg.get("color", None) + result["marker"] = cfg.get("marker", "o") + fit_results.append(result) + max_x = max(max_x, float(np.max(result["gt_delta_mm"]))) + max_y = max(max_y, float(np.max(result["measured_delta_mm"]))) + + lim = max(max_x, max_y) + fit_x = np.linspace(0.0, lim, 200) + + for result in fit_results: + label = result["label"] + color = result["color"] + marker = result["marker"] + x = result["gt_delta_mm"] + y = result["measured_delta_mm"] + slope = result["slope"] + intercept = result["intercept"] + rmse = result["rmse_mm"] + + axes[0].scatter(x, y, color=color, marker=marker, s=70, label=f"{label} (raw)", zorder=3) + axes[0].plot( + fit_x, + slope * fit_x + intercept, + color=color, + linewidth=2.0, + label=( + f"{label} fit: slope={slope:.3f}, intercept={intercept:.1f}mm, RMSE={rmse:.1f}mm" + if fit_intercept else + f"{label} fit: slope={slope:.3f}, RMSE={rmse:.1f}mm" + ), + ) + + axes[1].scatter( + x, + result["residuals_mm"], + color=color, + marker=marker, + s=70, + label=f"{label} (RMSE={rmse:.1f}mm)", + zorder=3, + ) + + axes[0].plot( + fit_x, + ideal_slope * fit_x, + linestyle="--", + color="gray", + linewidth=1.5, + label=f"ideal (slope={ideal_slope:.1f})", + ) + axes[0].set_xlabel("Ground Truth Delta (mm)") + axes[0].set_ylabel("Measured Depth Delta (mm)") + axes[0].set_title("Floor Depth Delta: Measured vs Ground Truth") + axes[0].grid(True, alpha=0.3) + axes[0].legend(fontsize=9, loc="upper left") + + axes[1].axhline(0.0, linestyle="--", color="gray", linewidth=1.2) + 
axes[1].set_xlabel("Ground Truth Delta (mm)") + axes[1].set_ylabel("Residual (mm)") + axes[1].set_title("Residuals (Measured − Fit)") + axes[1].grid(True, alpha=0.3) + axes[1].legend(fontsize=9, loc="upper left") + + axes[0].set_xlim(-0.05 * max(lim, 1.0), lim * 1.05) + axes[1].set_xlim(-0.05 * max(lim, 1.0), lim * 1.05) + + residual_values = np.concatenate([r["residuals_mm"] for r in fit_results]) + residual_abs_max = max(1.0, float(np.max(np.abs(residual_values)))) + axes[1].set_ylim(-residual_abs_max * 1.15, residual_abs_max * 1.15) + + fig.suptitle(title, fontsize=18, fontweight="bold") + plt.tight_layout(rect=[0, 0, 1, 0.96]) + plt.savefig(out_path, dpi=150, bbox_inches="tight") + plt.close(fig) + logging.info(f"Saved depth-scale regression plot → {out_path}") + + +# ── inference helpers ───────────────────────────────────────────────────────── + +def _preprocess_ir(left: np.ndarray, right: np.ndarray): + """Convert uint16 IR images to CUDA float tensors (3-channel pseudo-RGB).""" + left = np.clip(left.astype(np.float32), 0, 255) + right = np.clip(right.astype(np.float32), 0, 255) + left = np.stack([left, left, left], axis=-1) # H×W×3 + right = np.stack([right, right, right], axis=-1) + left_t = torch.as_tensor(left).float()[None].permute(0, 3, 1, 2).cuda() + right_t = torch.as_tensor(right).float()[None].permute(0, 3, 1, 2).cuda() + return left_t, right_t + +@torch.no_grad() +def infer_depth_m(model, left: np.ndarray, right: np.ndarray) -> np.ndarray: + """Run stereo inference on an IR pair; return depth map in metres (H×W float32).""" + left_t, right_t = _preprocess_ir(left, right) + padder = InputPadder(left_t.shape, divis_by=32, force_square=False) + left_t, right_t = padder.pad(left_t, right_t) + + with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE): + disp = model.forward(left_t, right_t, iters=ITERS, test_mode=True) + + disp = padder.unpad(disp.float()) + disp_np = disp.cpu().numpy().reshape(left.shape[:2]).clip(0, None) + + depth_m = 
np.zeros_like(disp_np) + valid = disp_np > 0 + depth_m[valid] = (BF / disp_np[valid]) / 1000.0 # disparity → mm → m + return depth_m + +def load_model(path: str): + logging.info(f"Loading model from {path}") + model = torch.load(path, map_location='cpu', weights_only=False) + model.cuda().eval() + return model + +@torch.no_grad() +def infer_depth_nobf_m(model, left: np.ndarray, right: np.ndarray) -> np.ndarray: + """Run stereo inference on an IR pair that returns depth; return depth map in metres (H×W float32).""" + left_t, right_t = _preprocess_ir(left, right) + padder = InputPadder(left_t.shape, divis_by=32, force_square=False) + left_t, right_t = padder.pad(left_t, right_t) + + with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE): + depth = model.forward(left_t, right_t, iters=ITERS, test_mode=True) + + depth = padder.unpad(depth.float()) + depth_np = depth.cpu().numpy().reshape(left.shape[:2]).clip(0, None) + + depth_m = np.zeros_like(depth_np) + valid = depth_np > 0 + depth_m[valid] = depth_np[valid] / 1000.0 # mm → m + return depth_m + + +# ── inbolt graphs ───────────────────────────────────────────────────────────────────── + +def main_inbolt_graphs(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report') + parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root') + parser.add_argument('--original', default=MODEL_PATH, help='Path to original model weights') + parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights') + parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison') + args = parser.parse_args() + + U.set_logging_format() + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = 
source.init_directory(input_rectified=args.data_dir) + logging.info(f"Found {n} samples in {args.data_dir}") + if n == 0: + logging.error("No samples found — check DATA_DIR path") + return + + + #import cv2 as _cv2 # local import to avoid top-level dependency if already imported + gt_depth_diff = np.arange(n)*100 # mm + rs_depth_diff = np.arange(n)*0 # mm + zv_depth_diff = np.arange(n)*0 # zivid mm + rs_ref = None + zv_ref = None + for idx in range(n): + data = source.get_item(idx) + left = data['left'] + right = data['right'] + zv_mm = data['depth_zivid'].astype(np.float32) # Zivid GT in mm + rs_mm = data['depth_rs'].astype(np.float32) # RealSense depth in mm + + # # Resize Zivid depth to match RealSense IR image resolution for pixel-level comparison + # rs_h, rs_w = left.shape[:2] + # if gt_mm.shape != (rs_h, rs_w): + # #gt_mm = _cv2.resize(gt_mm, (rs_w, rs_h), interpolation=_cv2.INTER_NEAREST) + # print(f"Shape mismatch: gt_mm {gt_mm.shape} vs rs {rs_h, rs_w}") + rs_valid = (rs_mm > rs_mm.max()*0.8) + zv_valid = (zv_mm > zv_mm.max()*0.8) + if idx == 0: + rs_ref = np.nanmean(rs_mm[rs_valid]) + zv_ref = np.nanmean(zv_mm[zv_valid]) + else: + rs_depth_diff[idx] = np.nanmean(rs_mm[rs_valid]) - rs_ref + zv_depth_diff[idx] = np.nanmean(zv_mm[zv_valid]) - zv_ref + + + sm = build_example_depth_scale_regression_series(gt_depth_diff, rs_depth_diff, zv_depth_diff) + plot_depth_scale_regression(sm, out_path=Path(DEFAULT_OUT) / "depth_scale_comparison.png", title="Depth Scale Comparison") + + logging.info(f"All outputs written to {out_dir}") + +# ── main ───────────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report') + parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root') + parser.add_argument('--original', default=MODEL_PATH, help='Path to original model weights') + 
parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights') + parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison') + args = parser.parse_args() + + U.set_logging_format() + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # ── load models ────────────────────────────────────────────────────────── + models = {} + if Path(args.finetuned).exists(): + models["finetuned"] = load_model(args.finetuned) + else: + logging.warning(f"Fine-tuned model not found at {args.finetuned} — skipping") + + models["original"] = load_model(args.original) + + active_methods = [GT_NAME] + list(models.keys()) + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f"Found {n} samples in {args.data_dir}") + if n == 0: + logging.error("No samples found — check DATA_DIR path") + return + + # ── accumulators ────────────────────────────────────────────────────────── + all_metrics = [] + viz_frames = [] + valid_acc = {} # will be init on first frame + dist_bin_mae = {m: [] for m in active_methods} + close_range_valid = {m: [] for m in active_methods} + timing_ms_raw = {m: [] for m in models} + H = W = None + + # depth-vs-distance accumulators: + # "zivid_gt" – Zivid depth values binned by Zivid GT (shows intra-bin spatial spread) + # one entry per stereo model – model predictions binned by Zivid GT + depth_acc_keys = ["zivid_gt"] + list(models.keys()) + depth_accs = {k: DepthBinAccumulator() for k in depth_acc_keys} + + #import cv2 as _cv2 # local import to avoid top-level dependency if already imported + + for idx in range(n): + data = source.get_item(idx) + left = data['left'] + right = data['right'] + gt_mm = data['depth_zivid'].astype(np.float32) # Zivid GT in mm + rs_mm = data['depth_rs'].astype(np.float32) # RealSense depth in mm + + # Resize Zivid depth 
to match RealSense IR image resolution for pixel-level comparison + rs_h, rs_w = left.shape[:2] + if gt_mm.shape != (rs_h, rs_w): + #gt_mm = _cv2.resize(gt_mm, (rs_w, rs_h), interpolation=_cv2.INTER_NEAREST) + print(f"Shape mismatch: gt_mm {gt_mm.shape} vs rs {rs_h, rs_w}") + + if H is None: + H, W = rs_h, rs_w + for m in active_methods: + valid_acc[m] = np.zeros((H, W), np.float32) + + gt_m = gt_mm / 1000.0 # mm → m + rs_m = rs_mm / 1000.0 # mm → m + + # run inference for each model + frame_depths = {GT_NAME: gt_m} + for mname, model in models.items(): + t0 = time.monotonic() + frame_depths[mname] = infer_depth_m(model, left, right) + timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + + # per-frame metrics + gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M) + n_close = int(gt_close_mask.sum()) + + for mname in active_methods: + pred = frame_depths[mname] + valid_acc[mname] += (pred > 0).astype(np.float32) + + if mname == GT_NAME: + fm = FrameMetrics(GT_NAME, 0.0, 0.0, 0.0, 100.0, + float((pred > 0).mean()) * 100.0, 0.0, + mae_pen=0.0, mre_pen=0.0) + else: + fm = compute_metrics(pred, gt_m, timing_ms_raw[mname][-1], mname) + + all_metrics.append(fm) + + dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m)) + + close_cov = float((pred[gt_close_mask] > 0).mean()) * 100.0 if n_close > 0 else 0.0 + close_range_valid[mname].append(close_cov) + + # depth-vs-distance: accumulate per-bin stats + depth_accs["zivid_gt"].update(gt_m, gt_m) # GT vs itself → intra-bin spread + for mname in models: + depth_accs[mname].update(frame_depths[mname], gt_m) + + if idx < args.n_viz: + viz_frames.append({k: v.copy() for k, v in frame_depths.items()}) + + if (idx + 1) % 200 == 0 or (idx + 1) == n: + logging.info(f" {idx + 1}/{n} frames processed") + + # normalise coverage maps to [0, 1] + for m in active_methods: + valid_acc[m] /= max(n, 1) + + # ── aggregate timing ────────────────────────────────────────────────────── + mean_timing = {m: float(np.mean(ts)) if 
ts else 0.0 for m, ts in timing_ms_raw.items()} + mean_timing[GT_NAME] = 0.0 + + # ── build BenchmarkResults ──────────────────────────────────────────────── + results = BenchmarkResults( + method_names=active_methods, + method_labels={m: METHODS[m]["label"] for m in active_methods}, + method_colors={m: METHODS[m]["color"] for m in active_methods}, + ground_truth_name=GT_NAME, + n_frames=n, + width=W, + height=H, + all_metrics=all_metrics, + viz_frames=viz_frames, + coverage_maps=valid_acc, + dist_bin_mae=dist_bin_mae, + close_range_valid=close_range_valid, + source=f"INBOLT dataset ({args.data_dir})", + method_configs={ + "original": {"model_path": args.original}, + "finetuned": {"model_path": args.finetuned}, + }, + ) + + stats = aggregate(results, mean_timing) + + # ── generate report ─────────────────────────────────────────────────────── + reporter = ReportGenerator(results, stats, out_dir) + reporter.generate() + + # ── depth-vs-distance comparison plot ──────────────────────────────────── + plot_colors = { + "zivid_gt": METHODS[GT_NAME]["color"], + **{m: METHODS[m]["color"] for m in models if m in METHODS}, + } + plot_labels = { + "zivid_gt": "Zivid GT (spatial spread)", + "original": METHODS["original"]["label"], + "finetuned": METHODS["finetuned"]["label"], + } + # rename keys to human-readable labels for the plot + labeled_accs = {plot_labels.get(k, k): v + for k, v in depth_accs.items() + if depth_accs[k].count.sum() > 0} + labeled_colors = {plot_labels.get(k, k): plot_colors.get(k) + for k in depth_accs if depth_accs[k].count.sum() > 0} + + plot_depth_vs_distance( + accumulators=labeled_accs, + colors=labeled_colors, + out_path=out_dir / "depth_vs_distance.png", + ) + logging.info(f"All outputs written to {out_dir}") + + +if __name__ == '__main__': + # 1. 
works + #sm = build_example_depth_scale_regression_series() + #plot_depth_scale_regression(sm, out_path=Path(DEFAULT_OUT) / "depth_scale_regression_example.png", title="Example Depth Scale Regression") + + # 2. inbolt data + main_inbolt_graphs() + + # 3. full benchmark + report + #main() diff --git a/scripts/benchmark_inbolt_fast_foundation.py b/scripts/benchmark_inbolt_fast_foundation.py new file mode 100644 index 0000000..540425d --- /dev/null +++ b/scripts/benchmark_inbolt_fast_foundation.py @@ -0,0 +1,1213 @@ +"""Benchmark original vs fine-tuned FastFoundationStereo on the Inbolt dataset. + +Loads both models, runs inference on all Inbolt samples, computes depth quality +metrics against Zivid scanner ground truth, and produces an HTML report. + +Also generates depth accuracy and noise plots comparing RealSense stereo predictions +(model output) against Zivid ground-truth depth across distance bins. + +Usage: + cd /home/adiroha/repos/Fast-FoundationStereo + python scripts/benchmark_inbolt.py [--out_dir reports/inbolt_benchmark] +""" + +import argparse +import logging +import os +import sys +import time +from pathlib import Path +import cv2 + +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import numpy as np +import torch +# import matplotlib +# matplotlib.use('Agg') +import matplotlib.pyplot as plt + +from core.utils.utils import InputPadder +import Utils as U + +from scripts.data_manager_inbolt import DataSource + +from metrics import ( + BenchmarkResults, + FrameMetrics, + compute_metrics, + compute_bin_mae, + aggregate, + CLOSE_RANGE_THRESHOLD_M, +) +from report import ReportGenerator + + +# ── constants ──────────────────────────────────────────────────────────────── +#DATA_DIR = r'C:\Work\Data\Depth\Data Collection' # local path to the dataset +DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260322T091926Z-1-001/Data Collection' # local 
path to the dataset +DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260415T084601Z-3-001/Data Collection' # local path to the dataset + +MODEL_PATH = f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth' +FINETUNED_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_inbolt-20260415_epoch_030.pth' +DEFAULT_OUT = f'{code_dir}/../reports/inbolt_ffs_benchmark' +#FARO_DIR = r'data/faro' # local path to FARO dataset --- IGNORE --- +# ORIGINAL_PATH = f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth' +# FINETUNED_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_faro.pth' +# DEFAULT_OUT = f'{code_dir}/../reports/faro_benchmark' + +BF = 49470.45 # focal_px * baseline_mm (calibrated from camera) +BF_RS = 49.8624*385.73 #- true # D435 - focal_px * baseline_mm (calibrated from camera) +ITERS = 8 # GRU iterations +N_VIZ = 5 # number of frames saved for visual comparison in report + +DEPTH_BIN_SIZE_M = 0.1 # width of each distance bin for accuracy/noise plots +MAX_DEPTH_M = 6.0 # maximum depth considered in plots + +METHODS = { + "original": {"label": "Original model", "color": "#2980b9"}, + "finetuned": {"label": "Fine-tuned on INBOLT", "color": "#e74c3c"}, + "inbolt_gt": {"label": "INBOLT GT", "color": "#27ae60"}, +} +GT_NAME = "inbolt_gt" + +CAMERA_MATRIX_RS = np.array([ + [385.5098876953125, 0, 328.31732177734375], + [0, 385.5098876953125, 235.6382141113281], + [0, 0, 1] +]) + +DIST_COEFFS_RS = np.array([ + 0.0, + -0.0, + -0.0, + 0.0, + -0.0 +]) + +CAMERA_MATRIX_ZIVID = np.array([ + [1241.8536376953125, 0, 609.9681922158361], + [0, 1241.6390380859375, 513.828698754003], + [0, 0, 1] +]) + +DIST_COEFFS_ZIVID = np.array([ + 0.045981280505657196, + -0.0316404290497303, + -0.00012756904470734298, + 0.0001183780113933608, + -0.17966397106647491 +]) + +# ── projection helpers ───────────────────────────────────────────────────────── + +def save_to_ply(points: np.ndarray, filename: str): + """Save a 
point cloud to a PLY file for visualization.""" + with open(filename, 'w') as f: + f.write('ply\n') + f.write('format ascii 1.0\n') + f.write(f'element vertex {len(points)}\n') + f.write('property float x\n') + f.write('property float y\n') + f.write('property float z\n') + f.write('end_header\n') + for x, y, z in points: + f.write(f'{x} {y} {z}\n') + +def project_camera_to_3d(depth_img_mm: np.ndarray, cam_matrix: np.ndarray, dist_coeffs: np.ndarray) -> np.ndarray: + """Project 2D pixel coordinates with depth to 3D points in camera space.""" + h, w = depth_img_mm.shape + xs, ys = np.meshgrid(np.arange(w, dtype=np.float32), np.arange(h, dtype=np.float32), indexing='xy') + + # OpenCV expects Nx1x2 contiguous float32/float64 image points in (x, y) order. + distorted_points = np.stack([xs, ys], axis=-1).reshape(-1, 1, 2).astype(np.float32) + undistorted_points = cv2.undistortPoints(distorted_points, cam_matrix.astype(np.float32), dist_coeffs.astype(np.float32) ) + + uv = undistorted_points.reshape(-1, 2) + Z = depth_img_mm.reshape(-1).astype(np.float32) + valid = np.isfinite(Z) & (Z > 0) + if not np.any(valid): + return np.zeros((0, 3), dtype=np.float32) + + uv = uv[valid] + Z = Z[valid] + X = uv[:, 0] * Z + Y = uv[:, 1] * Z + + # save to ply point cloud for visualization + XYZ = np.stack([X, Y, Z], axis=1).astype(np.float32) + + return XYZ + +def project_3d_to_camera(points_3d: np.ndarray, cam_matrix: np.ndarray, dist_coeffs: np.ndarray, frame_size = (480,640)) -> np.ndarray: + """Project 3D points in camera space back to 2D pixel coordinates.""" + if points_3d.shape[1] != 3: + raise ValueError("Input points_3d must have shape (N, 3)") + projected_pts, _ = cv2.projectPoints( + points_3d.reshape(-1, 1, 3), + np.zeros(3, dtype=np.float32), + np.zeros(3, dtype=np.float32), + cam_matrix.astype(np.float32), + dist_coeffs.astype(np.float32), + ) + + uv_rs = projected_pts.reshape(-1, 2) + u_idx = np.rint(uv_rs[:, 0]).astype(np.int32) + v_idx = np.rint(uv_rs[:, 
1]).astype(np.int32) + + h_rs, w_rs = frame_size + in_bounds = (u_idx >= 0) & (u_idx < w_rs) & (v_idx >= 0) & (v_idx < h_rs) + if not np.any(in_bounds): + return np.zeros((h_rs, w_rs), dtype=np.float32) + + u_idx = u_idx[in_bounds] + v_idx = v_idx[in_bounds] + z_vals = points_3d[in_bounds, 2] # Z values of the valid points + + # Rasterize by nearest pixel; if multiple points hit a pixel, keep the closest depth. + lin = v_idx * w_rs + u_idx + depth_buffer = np.full(h_rs * w_rs, np.inf, dtype=np.float32) + np.minimum.at(depth_buffer, lin, z_vals) + depth_projected = depth_buffer.reshape(h_rs, w_rs) + depth_projected[~np.isfinite(depth_projected)] = 0.0 + return depth_projected + +# project from zivid depth patrix to point cloud and back to depth matrix with rs intrinsics and distortion to get "zivid GT as seen by RealSense" for pixel-level comparison +def project_depth_zivid_to_rs(depth_zivid_mm: np.ndarray, depth_rs_mm: np.ndarray, finx = 0) -> np.ndarray: + # create 3D point cloud from zivid depth + XYZ = project_camera_to_3d(depth_zivid_mm, CAMERA_MATRIX_ZIVID, DIST_COEFFS_ZIVID) # (N, 3) array of 3D points in Zivid camera space + # save to ply point cloud for visualization + #save_to_ply(XYZ/1000, f'zivid_original_points_{finx:03d}.ply') # save in meters for visualization + + # project back on imaage RS + depth_zivid_projected_mm = project_3d_to_camera(XYZ, CAMERA_MATRIX_RS, DIST_COEFFS_RS, frame_size = depth_rs_mm.shape) # (H, W) depth map of Zivid points projected into RealSense pixel space + + XYZ_RS = project_camera_to_3d(depth_zivid_projected_mm, CAMERA_MATRIX_RS, DIST_COEFFS_RS) + # save to ply point cloud for visualization + #save_to_ply(XYZ_RS/1000, f'zivid_projected_points_{finx:03d}.ply') # save in meters for visualization + + return depth_zivid_projected_mm + + +# ── depth-vs-distance analysis ──────────────────────────────────────────────── + +class DepthBinAccumulator: + """Accumulates mean and std-dev of depth values per GT-distance bin. 
+ + Bins are defined by Zivid GT depth, so *every* pixel whose GT depth falls in + [k * bin_size, (k+1) * bin_size) contributes to bin k. Works for any sensor + (model predictions OR Zivid GT values themselves — the latter gives an + estimate of within-bin spatial variation / measurement noise). + """ + + def __init__(self, bin_size_m: float = DEPTH_BIN_SIZE_M, max_depth_m: float = MAX_DEPTH_M): + self.bin_size = bin_size_m + self.n_bins = int(np.ceil(max_depth_m / bin_size_m)) + self.count = np.zeros(self.n_bins, dtype=np.float64) + self.sum_ = np.zeros(self.n_bins, dtype=np.float64) + self.sum_sq = np.zeros(self.n_bins, dtype=np.float64) + + def update(self, values_m: np.ndarray, gt_m: np.ndarray): + """Add one frame of data. + + Parameters + ---------- + values_m : (H, W) array of the depth values to accumulate (model or GT). + gt_m : (H, W) array of Zivid GT depths that define which bin each pixel falls in. + """ + valid = (gt_m > 0) & (values_m > 0) + if not valid.any(): + return + v_vals = values_m[valid].ravel().astype(np.float64) + v_gt = gt_m[valid].ravel().astype(np.float64) + + bins = np.floor(v_gt / self.bin_size).astype(np.int32) + bins = np.clip(bins, 0, self.n_bins - 1) + + np.add.at(self.count, bins, 1.0) + np.add.at(self.sum_, bins, v_vals) + np.add.at(self.sum_sq, bins, v_vals ** 2) + + @property + def bin_centers(self) -> np.ndarray: + return (np.arange(self.n_bins) + 0.5) * self.bin_size + + def mean(self) -> np.ndarray: + c = np.maximum(self.count, 1) + return np.where(self.count > 0, self.sum_ / c, np.nan) + + def std(self) -> np.ndarray: + c = np.maximum(self.count, 1) + m = np.where(self.count > 0, self.sum_ / c, np.nan) + v = np.where(self.count > 0, self.sum_sq / c - m ** 2, np.nan) + return np.sqrt(np.maximum(v, 0.0)) + +def plot_depth_vs_distance( + accumulators: dict, # {label: DepthBinAccumulator} + colors: dict, # {label: color_str} + out_path: Path, + min_count: int = 100, # bins with fewer samples are hidden +): + """Produce a 
two-panel figure: + left – actual depth (Zivid GT) vs measured/predicted depth + right – noise (std-dev of measured depth) vs actual depth + """ + fig, axes = plt.subplots(1, 2, figsize=(14, 6)) + + for label, acc in accumulators.items(): + x = acc.bin_centers + mu = acc.mean() + sigma = acc.std() + color = colors.get(label, None) + + valid = acc.count >= min_count + xv, muv, sv = x[valid], mu[valid], sigma[valid] + + ax0 = axes[0] + ax0.plot(xv, muv, label=label, color=color, linewidth=1.5) + ax0.fill_between(xv, muv - sv, muv + sv, alpha=0.15, color=color) + + axes[1].plot(xv, sv * 1000, label=label, color=color, linewidth=1.5) + + # ideal line + lim = MAX_DEPTH_M + axes[0].plot([0, lim], [0, lim], 'k--', linewidth=1, label='ideal (y = x)') + axes[0].set_xlim(0, lim) + axes[0].set_ylim(0, lim) + axes[0].set_xlabel('Actual depth — Zivid GT (m)') + axes[0].set_ylabel('Measured depth (m)') + axes[0].set_title('Depth Accuracy: Actual vs Measured\n(shaded band = ±1 std dev)') + axes[0].legend(fontsize=9) + axes[0].grid(True, alpha=0.4) + + axes[1].set_xlim(0, lim) + axes[1].set_xlabel('Actual depth — Zivid GT (m)') + axes[1].set_ylabel('Noise / Std Dev (mm)') + axes[1].set_title('Depth Noise per Distance Bin') + axes[1].legend(fontsize=9) + axes[1].grid(True, alpha=0.4) + + plt.tight_layout() + plt.savefig(out_path, dpi=150, bbox_inches='tight') + plt.close(fig) + logging.info(f"Saved depth-vs-distance plot → {out_path}") + +def _to_1d_float_array(values, name: str) -> np.ndarray: + """Convert *values* to a finite 1D float array.""" + arr = np.asarray(values, dtype=np.float64).reshape(-1) + if arr.size == 0: + raise ValueError(f"{name} must contain at least one value") + if not np.all(np.isfinite(arr)): + raise ValueError(f"{name} contains NaN or infinite values") + return arr + +def fit_depth_scale_regression( + gt_delta_mm, + measured_delta_mm, + fit_intercept: bool = True, +) -> dict: + """Fit a linear depth-scale regression and compute residual statistics. 
+ + Parameters + ---------- + gt_delta_mm : array-like + Ground-truth floor/depth deltas in millimetres. + measured_delta_mm : array-like + Measured deltas from one sensor/model in millimetres. + fit_intercept : bool, default=False + If False, uses a through-origin fit `y = slope * x`, which matches the + style of the attached plot. If True, fits `y = slope * x + intercept`. + + Returns + ------- + dict + Contains slope, intercept, fitted values, residuals, RMSE, and masks. + """ + x = _to_1d_float_array(gt_delta_mm, "gt_delta_mm") + y = _to_1d_float_array(measured_delta_mm, "measured_delta_mm") + + if x.shape != y.shape: + raise ValueError("gt_delta_mm and measured_delta_mm must have the same shape") + if x.size < 2: + raise ValueError("At least two samples are required for regression") + + valid = np.isfinite(x) & np.isfinite(y) + x = x[valid] + y = y[valid] + + if x.size < 2: + raise ValueError("Need at least two finite samples after filtering") + + if fit_intercept: + slope, intercept = np.polyfit(x, y, deg=1) + else: + denom = float(np.dot(x, x)) + if denom <= 0: + raise ValueError("Cannot fit a through-origin regression when gt deltas are all zero") + slope = float(np.dot(x, y) / denom) + intercept = 0.0 + + fitted = slope * x + intercept + residuals = y - fitted + rmse = float(np.sqrt(np.mean(residuals ** 2))) + + return { + "gt_delta_mm": x, + "measured_delta_mm": y, + "slope": float(slope), + "intercept": float(intercept), + "fitted_mm": fitted, + "residuals_mm": residuals, + "rmse_mm": rmse, + "fit_intercept": fit_intercept, + } + +def fit_plane_and_compute_error(depth_diff_mm: np.ndarray, valid_mask: np.ndarray) -> dict: + """Fit a plane z = a*x + b*y + c on valid pixels and compute residual errors. + + Parameters + ---------- + depth_diff_mm : np.ndarray + 2D depth-difference image in millimetres. + valid_mask : np.ndarray + 2D boolean mask of valid pixels used for fitting/evaluation. 
+ + Returns + ------- + dict + Plane coefficients, residual statistics, and counts. + """ + if depth_diff_mm.ndim != 2: + raise ValueError("depth_diff_mm must be a 2D array") + if valid_mask.shape != depth_diff_mm.shape: + raise ValueError("valid_mask must have the same shape as depth_diff_mm") + + valid = valid_mask.astype(bool) & np.isfinite(depth_diff_mm) + n_valid = int(np.count_nonzero(valid)) + if n_valid < 3: + return { + "coeffs": np.array([np.nan, np.nan, np.nan], dtype=np.float64), + "rmse_mm": np.nan, + "mae_mm": np.nan, + "residuals_mm": np.array([], dtype=np.float64), + "n_valid": n_valid, + } + + ys, xs = np.nonzero(valid) + z = depth_diff_mm[valid].astype(np.float64) + + # Solve least-squares for z = a*x + b*y + c + A = np.stack([xs.astype(np.float64), ys.astype(np.float64), np.ones_like(z)], axis=1) + coeffs, _, _, _ = np.linalg.lstsq(A, z, rcond=None) + + fitted = A @ coeffs + residuals = z - fitted + rmse = float(np.sqrt(np.mean(residuals ** 2))) + mae = float(np.mean(np.abs(residuals))) + + return { + "coeffs": coeffs, + "rmse_mm": rmse, + "mae_mm": mae, + "residuals_mm": residuals, + "n_valid": n_valid, + } + +def build_example_depth_scale_regression_series( + gt_delta_mm, + rs_delta_mm, + zv_delta_mm, + fs_delta_mm=None, + ft_delta_mm=None, + rs_rsme_mm=None, + zv_rsme_mm=None, + fs_rsme_mm=None, + ft_rsme_mm=None +) -> dict: + """Return example depth-delta series that reproduces the attached figure. + + The values approximate the plot shown in the screenshot: + - RealSense has a noticeable scale bias. + - Zivid stays close to the ideal slope of 1. 
+ """ + gt_delta_mm = np.array([0, 100, 200, 300, 400, 500, 600, 700], dtype=np.float64) if gt_delta_mm is None else gt_delta_mm + # rs_delta_mm = np.array([0.0, 104.0, 218.0, 323.0, 433.0, 542.0, 664.0, 754.0], dtype=np.float64) if rs_delta_mm is None else rs_delta_mm + # zv_delta_mm = np.array([0.0, 101.0, 201.0, 301.0, 401.0, 502.0, 602.0, 707.0], dtype=np.float64) if zv_delta_mm is None else zv_delta_mm + # fs_delta_mm = np.array([0.0, 102.0, 204.0, 306.0, 408.0, 510.0, 612.0, 714.0], dtype=np.float64) if fs_delta_mm is None else fs_delta_mm + # ftn_delta_mm = np.array([0.0, 103.0, 207.0, 311.0, 415.0, 519.0, 623.0, 727.0], dtype=np.float64) if ftn_delta_mm is None else ftn_delta_mm + series_map = { + "realsense": { + "gt_delta_mm": gt_delta_mm, + "measured_delta_mm": rs_delta_mm, + "rmse_mm": rs_rsme_mm, + "color": "#e74c3c", + "marker": "s", + "label": "realsense", + }, + "zivid": { + "gt_delta_mm": gt_delta_mm, + "measured_delta_mm": zv_delta_mm, + "rmse_mm": zv_rsme_mm, + "color": "#2980b9", + "marker": "o", + "label": "zivid", + }, + } + if fs_delta_mm is not None: + series_map["ffs"] = { + "gt_delta_mm": gt_delta_mm, + "measured_delta_mm": fs_delta_mm, + "rmse_mm": fs_rsme_mm, + "color": "#27ae60", + "marker": "d", + "label": "ffs", + } + if ft_delta_mm is not None: + series_map["ftn"] = { + "gt_delta_mm": gt_delta_mm, + "measured_delta_mm": ft_delta_mm, + "rmse_mm": ft_rsme_mm, + "color": "#f39c12", + "marker": "^", + "label": "ftn", + } + return series_map + +def plot_depth_scale_regression( + series_map: dict, + out_path: Path, + title: str = "Depth Scale Regression — dataset_depth_bias", + fit_intercept: bool = True, + ideal_slope: float = 1.0, +): + """Create the two-panel regression + residuals figure from paired series. + + Parameters + ---------- + series_map : dict + Mapping of series name to configuration dict. 
Each entry should provide: + - gt_delta_mm + - measured_delta_mm + and may optionally include: + - label + - color + - marker + out_path : Path + Destination PNG path. + title : str + Figure title. + fit_intercept : bool + Whether to fit a free intercept. Defaults to a through-origin fit. + ideal_slope : float + Slope of the ideal reference line shown on the left panel. + """ + if not series_map: + raise ValueError("series_map must contain at least one series") + + fig, axes = plt.subplots(1, 2, figsize=(14, 6)) + fit_results = [] + max_x = 0.0 + max_y = 0.0 + + for default_name, cfg in series_map.items(): + result = fit_depth_scale_regression( + gt_delta_mm=cfg["gt_delta_mm"], + measured_delta_mm=cfg["measured_delta_mm"], + fit_intercept=fit_intercept, + ) + result["label"] = cfg.get("label", default_name) + result["color"] = cfg.get("color", None) + result["marker"] = cfg.get("marker", "o") + result["rmse_mm"] = result["rmse_mm"] if "rmse_mm" not in cfg else cfg["rmse_mm"] + fit_results.append(result) + max_x = max(max_x, float(np.max(result["gt_delta_mm"]))) + max_y = max(max_y, float(np.max(result["measured_delta_mm"]))) + + lim = max_x #max(max_x, max_y) + fit_x = np.linspace(0.0, lim, 200) + + for result in fit_results: + label = result["label"] + color = result["color"] + marker = result["marker"] + x = result["gt_delta_mm"] + y = result["measured_delta_mm"] + slope = result["slope"] + intercept = result["intercept"] + rmse = result["rmse_mm"] + print(rmse) + + axes[0].scatter(x, y-x, color=color, marker=marker, s=70, label=f"{label} (raw)", zorder=3) + # axes[0].plot( + # fit_x, + # slope * fit_x + intercept, + # color=color, + # linewidth=2.0, + # label=( + # f"{label} fit: slope={slope:.3f}, intercept={intercept:.1f}mm" + # if fit_intercept else + # f"{label} fit: slope={slope:.3f}" + # ), + # ) + + axes[1].scatter( + x, + rmse, #result["residuals_mm"], + color=color, + marker=marker, + s=70, + label=f"{label} RMSE mm", + zorder=3, + ) + + + # 
axes[0].plot( + # fit_x, + # ideal_slope * fit_x, + # linestyle="--", + # color="gray", + # linewidth=1.5, + # label=f"ideal (slope={ideal_slope:.1f})", + # ) + axes[0].axhline(0.0, linestyle="--", color="gray", linewidth=1.2) + axes[0].set_xlabel("Ground Truth Delta (mm)") + axes[0].set_ylabel("Measured Depth Delta (mm) - Ground Truth Delta (mm)") + axes[0].set_title("Depth Delta: Measured from Ground Truth") + axes[0].grid(True, alpha=0.3) + axes[0].legend(fontsize=9, loc="upper left") + + axes[1].axhline(0.0, linestyle="--", color="gray", linewidth=1.2) + axes[1].set_xlabel("Ground Truth Delta (mm)") + axes[1].set_ylabel("Residual RMSE (mm)") + axes[1].set_title("Residuals (Measured − Plane Fit)") + axes[1].grid(True, alpha=0.3) + axes[1].legend(fontsize=9, loc="upper left") + + axes[0].set_xlim(-0.05 * max(lim, 1.0), lim * 1.05) + axes[1].set_xlim(-0.05 * max(lim, 1.0), lim * 1.05) + + residual_values = np.concatenate([r["residuals_mm"] for r in fit_results]) + residual_abs_max = max(1.0, float(np.max(np.abs(residual_values)))) + residual_abs_max = 60 + axes[0].set_ylim(-20, 70) + axes[1].set_ylim(-20, 70) + + fig.suptitle(title, fontsize=18, fontweight="bold") + plt.tight_layout(rect=[0, 0, 1, 0.96]) + plt.savefig(out_path, dpi=150, bbox_inches="tight") + plt.close(fig) + logging.info(f"Saved depth-scale regression plot → {out_path}") + + +# ── inference helpers ───────────────────────────────────────────────────────── + +def _preprocess_ir(left: np.ndarray, right: np.ndarray): + """Convert uint16 IR images to CUDA float tensors (3-channel pseudo-RGB).""" + left = np.clip(left.astype(np.float32), 0, 255) + right = np.clip(right.astype(np.float32), 0, 255) + left = np.stack([left, left, left], axis=-1) # H×W×3 + right = np.stack([right, right, right], axis=-1) + left_t = torch.as_tensor(left).float()[None].permute(0, 3, 1, 2).cuda() + right_t = torch.as_tensor(right).float()[None].permute(0, 3, 1, 2).cuda() + return left_t, right_t + +@torch.no_grad() +def 
infer_depth_m(model, left: np.ndarray, right: np.ndarray) -> np.ndarray: + """Run stereo inference on an IR pair; return depth map in metres (H×W float32).""" + left_t, right_t = _preprocess_ir(left, right) + padder = InputPadder(left_t.shape, divis_by=32, force_square=False) + left_t, right_t = padder.pad(left_t, right_t) + + with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE): + disp = model.forward(left_t, right_t, iters=ITERS, test_mode=True) + + disp = padder.unpad(disp.float()) + disp_np = disp.cpu().numpy().reshape(left.shape[:2]).clip(0, None) + + depth_m = np.zeros_like(disp_np) + valid = disp_np > 0 + depth_m[valid] = (BF / disp_np[valid]) / 1000.0 # disparity → mm → m + return depth_m + +@torch.no_grad() +def infer_depth_rs_mm(model, left: np.ndarray, right: np.ndarray) -> np.ndarray: + """Run stereo inference on an IR pair; return depth map in metres (H×W float32).""" + left_t, right_t = _preprocess_ir(left, right) + padder = InputPadder(left_t.shape, divis_by=32, force_square=False) + left_t, right_t = padder.pad(left_t, right_t) + + with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE): + disp = model.forward(left_t, right_t, iters=ITERS, test_mode=True) + + disp = padder.unpad(disp.float()) + disp_np = disp.cpu().numpy().reshape(left.shape[:2]).clip(0, None) + + depth_mm = np.zeros_like(disp_np) + valid = disp_np > 0 + depth_mm[valid] = (BF_RS / disp_np[valid]) # disparity → mm → m + return depth_mm + +def load_model(path: str): + logging.info(f"Loading model from {path}") + model = torch.load(path, map_location='cpu', weights_only=False) + model.cuda().eval() + return model + + +# ── inbolt graphs ───────────────────────────────────────────────────────────────────── + +def main_inbolt_graphs(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report') + parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root') 
+ parser.add_argument('--original', default=MODEL_PATH, help='Path to original model weights') + parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights') + parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison') + args = parser.parse_args() + + U.set_logging_format() + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f"Found {n} samples in {args.data_dir}") + if n == 0: + logging.error("No samples found — check DATA_DIR path") + return + + #import cv2 as _cv2 # local import to avoid top-level dependency if already imported + gt_depth_diff = np.arange(n)*100 # mm + rs_depth_diff = np.arange(n)*0 # mm + zv_depth_diff = np.arange(n)*0 # zivid mm + rs_ref = None + zv_ref = None + for idx in range(n): + data = source.get_item(idx) + left = data['left'] + right = data['right'] + zv_mm = data['depth_zivid'].astype(np.float32) # Zivid GT in mm + rs_mm = data['depth_rs'].astype(np.float32) # RealSense depth in mm + + + # # Resize Zivid depth to match RealSense IR image resolution for pixel-level comparison + # rs_h, rs_w = left.shape[:2] + # if gt_mm.shape != (rs_h, rs_w): + # #gt_mm = _cv2.resize(gt_mm, (rs_w, rs_h), interpolation=_cv2.INTER_NEAREST) + # print(f"Shape mismatch: gt_mm {gt_mm.shape} vs rs {rs_h, rs_w}") + rs_valid = (rs_mm > rs_mm.max()*0.8) + zv_valid = (zv_mm > zv_mm.max()*0.8) + if idx == 0: + rs_ref = np.nanmean(rs_mm[rs_valid]) + zv_ref = np.nanmean(zv_mm[zv_valid]) + else: + rs_depth_diff[idx] = np.nanmean(rs_mm[rs_valid]) - rs_ref + zv_depth_diff[idx] = np.nanmean(zv_mm[zv_valid]) - zv_ref + + + sm = build_example_depth_scale_regression_series(gt_depth_diff, rs_depth_diff, zv_depth_diff) + plot_depth_scale_regression(sm, out_path=Path(DEFAULT_OUT) / 
"depth_scale_comparison.png", title="Depth Scale Comparison") + + logging.info(f"All outputs written to {out_dir}") + +# ── inbolt graphs with Zivid projection ───────────────────────────────────────────────────────────────────── + +def main_inbolt_graphs_with_projection(): + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report') + parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root') + parser.add_argument('--original', default=MODEL_PATH, help='Path to original model weights') + parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights') + parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison') + args = parser.parse_args() + + U.set_logging_format() + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f"Found {n} samples in {args.data_dir}") + if n == 0: + logging.error("No samples found — check DATA_DIR path") + return + + #import cv2 as _cv2 # local import to avoid top-level dependency if already imported + gt_depth_diff = np.arange(n)*100 # mm + rs_depth_diff = np.arange(n)*0 # mm + zv_depth_diff = np.arange(n)*0 # zivid mm + rs_depth_rsme = np.arange(n)*0 # mm + zv_depth_rsme = np.arange(n)*0 # zivid mm + rs_ref = None + zv_ref = None + for idx in range(n): + data = source.get_item(idx) + left = data['left'] + right = data['right'] + zv_mm = data['depth_zivid'].astype(np.float32) # Zivid GT in mm + rs_mm = data['depth_rs'].astype(np.float32) # RealSense depth in mm + + # project zivid on rs + zv_prj_mm = project_depth_zivid_to_rs(zv_mm, rs_mm, finx = idx) + + rs_valid = (10 < rs_mm) + rs_valid = rs_valid & (rs_mm < rs_mm[rs_valid].min()*1.1) + zv_valid = (10 
< zv_prj_mm) + zv_valid = zv_valid & (zv_prj_mm < zv_prj_mm[zv_valid].min()*1.05) & rs_valid + if idx == 0: + rs_ref = rs_mm + zv_ref = zv_prj_mm + else: + + rs_diff_map = rs_mm - rs_ref + zv_diff_map = zv_prj_mm - zv_ref + + # debug visualization of difference maps and valid masks + # plt.figure(figsize=(12, 4)) + # plt.subplot(1, 3, 1),plt.imshow(rs_diff_map, vmin=-10, vmax=1000),plt.title(f"RealSense Depth Diff (mm)"),plt.colorbar() + # plt.subplot(1, 3, 2),plt.imshow(zv_diff_map, vmin=-10, vmax=1000),plt.title(f"Zivid Projected Depth Diff (mm)"),plt.colorbar() + # plt.subplot(1, 3, 3),plt.imshow(zv_valid, cmap='gray'),plt.title(f"Valid Mask (Zivid Projection)"),plt.colorbar() + # plt.suptitle(f"Sample {idx:03d} Depth Difference Maps and Valid Mask", fontsize=16) + # plt.tight_layout() + # plt.show() + + fig, axes = plt.subplots(1, 3, sharey=True, sharex=True, figsize=(8,4)) + axes[0].imshow(rs_diff_map, vmin=-10, vmax=1000),axes[0].set_title(f"RealSense Depth Diff (mm)"), + axes[1].imshow(zv_diff_map, vmin=-10, vmax=1000),axes[1].set_title(f"Zivid Projected Depth Diff (mm)"), + axes[2].imshow(zv_valid, cmap='gray'),axes[2].set_title(f"Valid Mask (Zivid Projection)"), + plt.suptitle(f"Sample {idx:03d} Depth Difference Maps and Valid Mask", fontsize=16) + plt.tight_layout() + plt.show() + + + + # Mean depth deltas over the common valid support. + rs_diff_valid = rs_diff_map[zv_valid] + zv_diff_valid = zv_diff_map[zv_valid] + rs_depth_diff[idx] = float(np.mean(rs_diff_valid)) + zv_depth_diff[idx] = float(np.mean(zv_diff_valid)) + + # Fit a plane to each difference map and use fit residual RMSE as error. 
+ rs_plane_fit = fit_plane_and_compute_error(rs_diff_map, zv_valid) + zv_plane_fit = fit_plane_and_compute_error(zv_diff_map, zv_valid) + rs_depth_rsme[idx] = rs_plane_fit["rmse_mm"] + zv_depth_rsme[idx] = zv_plane_fit["rmse_mm"] + # old code + #rs_depth_rsme[idx] = np.sqrt(np.mean((rs_diff - rs_depth_diff[idx])**2)) + #zv_depth_rsme[idx] = np.sqrt(np.mean((zv_diff - zv_depth_diff[idx])**2)) + + sm = build_example_depth_scale_regression_series(gt_depth_diff, rs_depth_diff, zv_depth_diff, rs_rsme_mm=rs_depth_rsme, zv_rsme_mm=zv_depth_rsme) + plot_depth_scale_regression(sm, out_path=Path(DEFAULT_OUT) / "depth_scale_comparison_inbolt.png", title="Depth Scale Comparison") + + logging.info(f"All outputs written to {out_dir}") + +# ── inbolt and FFS graphs ───────────────────────────────────────────────────────────────────── + +def main_inbolt_ffs_graphs_with_projection(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report') + parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root') + parser.add_argument('--original', default=MODEL_PATH, help='Path to original model weights') + parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights') + parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison') + args = parser.parse_args() + + U.set_logging_format() + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f"Found {n} samples in {args.data_dir}") + if n == 0: + logging.error("No samples found — check DATA_DIR path") + return + + # ── load models ────────────────────────────────────────────────────────── + models = {} + if Path(args.finetuned).exists(): + models["finetuned"] = 
load_model(args.finetuned) + else: + logging.warning(f"Fine-tuned model not found at {args.finetuned} — skipping") + + models["original"] = load_model(args.original) + + + #import cv2 as _cv2 # local import to avoid top-level dependency if already imported + gt_depth_diff = np.arange(n)*1 # when drive no meaning for gt depth diff, just want to see the scale of the error, so set to 1 mm + rs_depth_diff = np.arange(n)*0 # mm + zv_depth_diff = np.arange(n)*0 # zivid mm + ffs_depth_diff = np.arange(n)*0 # ffs mm + ftn_depth_diff = np.arange(n)*0 # ftn mm + rs_depth_rsme = np.arange(n)*0 # mm + zv_depth_rsme = np.arange(n)*0 # zivid mm + ffs_depth_rsme = np.arange(n)*0 # mm + ftn_depth_rsme = np.arange(n)*0 # zivid mm + + for idx in range(n): + data = source.get_item(idx) + left = data['left'] + right = data['right'] + zv_mm = data['depth_zivid'].astype(np.float32) # Zivid GT in mm + rs_mm = data['depth_rs'].astype(np.float32) # RealSense depth in mm + ffs_mm = infer_depth_rs_mm(models["original"], left, right) + ftn_mm = infer_depth_rs_mm(models["finetuned"], left, right) + + # project zivid on rs + zv_prj_mm = project_depth_zivid_to_rs(zv_mm, rs_mm, finx = idx) + + rs_valid = (10 < rs_mm) + zv_valid = (10 < zv_prj_mm) + ffs_valid = (10 < ffs_mm) + ftn_valid = (10 < ftn_mm) + + zv_zv_error = source.compute_depth_error(zv_prj_mm, zv_prj_mm, depth_mask=zv_valid & rs_valid) + rs_zv_error = source.compute_depth_error(rs_mm, zv_prj_mm, depth_mask=zv_valid & rs_valid) + ffs_zv_error = source.compute_depth_error(ffs_mm, zv_prj_mm, depth_mask=zv_valid & ffs_valid) + ftn_zv_error = source.compute_depth_error(ftn_mm, zv_prj_mm, depth_mask=zv_valid & ftn_valid) + + # debug + # img_list = [left, right, rs_mm, zv_prj_mm, ffs_mm, ftn_mm] + # ttl_list = ['left (RS)', 'right (RS)', 'depth RS (mm)', 'depth Zivid (mm)', 'depth FFS (mm)', 'depth FTN (mm)'] + # source.show_subset(img_list, ttl_list, save_path=DEFAULT_OUT , fig_name = f"sample_{idx:03d}_inputs.png") + img_list = 
[zv_zv_error, rs_zv_error, ffs_zv_error, ftn_zv_error] + ttl_list = ['Zivid Error', 'RS Error', 'FFS Error', 'FTN Error'] + source.show_subset(img_list, ttl_list, save_path=DEFAULT_OUT , fig_name = f"error_{idx:03d}_inputs.png") + + zv_depth_diff[idx] = np.mean(zv_zv_error) + rs_depth_diff[idx] = np.mean(rs_zv_error) + ffs_depth_diff[idx] = np.mean(ffs_zv_error) + ftn_depth_diff[idx] = np.mean(ftn_zv_error) + + zv_depth_rsme[idx] = np.sqrt(np.mean(zv_zv_error**2)) + rs_depth_rsme[idx] = np.sqrt(np.mean(rs_zv_error**2)) + ffs_depth_rsme[idx] = np.sqrt(np.mean(ffs_zv_error**2)) + ftn_depth_rsme[idx] = np.sqrt(np.mean(ftn_zv_error**2)) + + + sm = build_example_depth_scale_regression_series(gt_depth_diff, rs_depth_diff, rs_depth_diff, ffs_depth_diff, ftn_depth_diff, rs_rsme_mm=rs_depth_rsme, zv_rsme_mm=zv_depth_rsme, fs_rsme_mm=ffs_depth_rsme, ft_rsme_mm=ftn_depth_rsme) + plot_depth_scale_regression(sm, out_path=Path(DEFAULT_OUT) / "depth_scale_comparison_ffs.png", title="Depth Scale Comparison") + + logging.info(f"All outputs written to {out_dir}") + +def main_inbolt_ffs_graphs_with_projection_biased_dataset(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report') + parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root') + parser.add_argument('--original', default=MODEL_PATH, help='Path to original model weights') + parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights') + parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison') + args = parser.parse_args() + + U.set_logging_format() + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f"Found {n} samples in 
{args.data_dir}") + if n == 0: + logging.error("No samples found — check DATA_DIR path") + return + + # ── load models ────────────────────────────────────────────────────────── + models = {} + if Path(args.finetuned).exists(): + models["finetuned"] = load_model(args.finetuned) + else: + logging.warning(f"Fine-tuned model not found at {args.finetuned} — skipping") + + models["original"] = load_model(args.original) + + + #import cv2 as _cv2 # local import to avoid top-level dependency if already imported + gt_depth_diff = np.arange(n)*100 # when drive no meaning for gt depth diff, just want to see the scale of the error, so set to 1 mm + rs_depth_diff = np.arange(n)*0 # mm + zv_depth_diff = np.arange(n)*0 # zivid mm + ffs_depth_diff = np.arange(n)*0 # ffs mm + ftn_depth_diff = np.arange(n)*0 # ftn mm + rs_depth_rsme = np.arange(n)*0 # mm + zv_depth_rsme = np.arange(n)*0 # zivid mm + ffs_depth_rsme = np.arange(n)*0 # mm + ftn_depth_rsme = np.arange(n)*0 # zivid mm + rs_ref = None + zv_ref = None + fs_ref = None + ft_ref = None + for idx in range(n): + data = source.get_item(idx) + left = data['left'] + right = data['right'] + zv_mm = data['depth_zivid'].astype(np.float32) # Zivid GT in mm + rs_mm = data['depth_rs'].astype(np.float32) # RealSense depth in mm + ffs_mm = infer_depth_rs_mm(models["original"], left, right) + ftn_mm = infer_depth_rs_mm(models["finetuned"], left, right) + + # project zivid on rs + zv_prj_mm = project_depth_zivid_to_rs(zv_mm, rs_mm, finx = idx) + + if idx == 0: + zv_ref = zv_prj_mm + rs_ref = rs_mm + fs_ref = ffs_mm + ft_ref = ftn_mm + + # save ply of the point cloud for visualization + if idx == 1: + XYZ_FS = project_camera_to_3d(ffs_mm, CAMERA_MATRIX_RS, DIST_COEFFS_RS) + # save to ply point cloud for visualization + #save_to_ply(XYZ_FS/1000, f'ffs_projected_points_{idx:03d}.ply') # save in meters for visualization + + + zv_valid = (10 < zv_prj_mm) + zv_valid = (zv_prj_mm < (zv_prj_mm[zv_valid].min()*1.0+50)) & zv_valid + rs_valid = (10 < 
rs_mm) + rs_valid = (rs_mm < (rs_mm[rs_valid].min()*1.0+100)) & rs_valid & zv_valid + fs_valid = (10 < ffs_mm) + fs_valid = (ffs_mm < ffs_mm[fs_valid].min()*1.0+100) & fs_valid & zv_valid + ft_valid = (10 < ftn_mm) + ft_valid = (ftn_mm < ftn_mm[ft_valid].min()*1.0+100) & ft_valid & zv_valid + + zv_zv_error = source.compute_depth_error(zv_prj_mm, zv_ref, depth_mask= zv_valid) + rs_zv_error = source.compute_depth_error(rs_mm, rs_ref, depth_mask= rs_valid) + ffs_zv_error = source.compute_depth_error(ffs_mm, fs_ref, depth_mask= fs_valid) + ftn_zv_error = source.compute_depth_error(ftn_mm, ft_ref, depth_mask= ft_valid) + + # debug + img_list = [left, right, rs_mm, zv_prj_mm, ffs_mm, ftn_mm] + ttl_list = ['left (RS)', 'right (RS)', 'depth RS (mm)', 'depth Zivid (mm)', 'depth FFS (mm)', 'depth FTN (mm)'] + source.show_subset(img_list, ttl_list, save_path=DEFAULT_OUT , fig_name = f"sample_{idx:03d}_inputs") + img_list = [zv_zv_error, rs_zv_error, ffs_zv_error, rs_valid.astype(np.float32)*100] + ttl_list = ['Zivid Error', 'RS Error', 'FFS Error', 'RS Valid Mask'] + source.show_subset(img_list, ttl_list, vmin=0, vmax=100, save_path=DEFAULT_OUT , fig_name = f"error_{idx:03d}_inputs") + + zv_count = np.sum(zv_valid) + rs_count = np.sum(rs_valid) + fs_count = np.sum(fs_valid) + ft_count = np.sum(ft_valid) + + zv_depth_diff[idx] = np.sum(zv_zv_error) / zv_count + rs_depth_diff[idx] = np.sum(rs_zv_error) / rs_count + ffs_depth_diff[idx] = np.sum(ffs_zv_error) / fs_count + ftn_depth_diff[idx] = np.sum(ftn_zv_error) / ft_count + + # zv_depth_rsme[idx] = np.sqrt(np.sum(zv_zv_error**2) / zv_count) + # rs_depth_rsme[idx] = np.sqrt(np.sum(rs_zv_error**2) / rs_count) + # ffs_depth_rsme[idx] = np.sqrt(np.sum(ffs_zv_error**2) / fs_count) + # ftn_depth_rsme[idx] = np.sqrt(np.sum(ftn_zv_error**2) / ft_count) + + # Fit a plane to each difference map and use fit residual RMSE as error. 
+ zv_plane_fit = fit_plane_and_compute_error(zv_zv_error, zv_valid) + rs_plane_fit = fit_plane_and_compute_error(rs_zv_error, rs_valid) + ffs_plane_fit = fit_plane_and_compute_error(ffs_zv_error, fs_valid) + ftn_plane_fit = fit_plane_and_compute_error(ftn_zv_error, ft_valid) + rs_depth_rsme[idx] = rs_plane_fit["rmse_mm"] + zv_depth_rsme[idx] = zv_plane_fit["rmse_mm"] + ffs_depth_rsme[idx] = ffs_plane_fit["rmse_mm"] + ftn_depth_rsme[idx] = ftn_plane_fit["rmse_mm"] + + + sm = build_example_depth_scale_regression_series(gt_depth_diff, rs_depth_diff, zv_depth_diff, ffs_depth_diff, ftn_depth_diff, rs_rsme_mm=rs_depth_rsme, zv_rsme_mm=zv_depth_rsme, fs_rsme_mm=ffs_depth_rsme, ft_rsme_mm=ftn_depth_rsme) + plot_depth_scale_regression(sm, out_path=Path(DEFAULT_OUT) / "depth_comparison_zivid_rs_ffs.png", title="Depth Comparison") + + logging.info(f"All outputs written to {out_dir}") + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report') + parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root') + parser.add_argument('--original', default=MODEL_PATH, help='Path to original model weights') + parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights') + parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison') + args = parser.parse_args() + + U.set_logging_format() + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # ── load models ────────────────────────────────────────────────────────── + models = {} + if Path(args.finetuned).exists(): + models["finetuned"] = load_model(args.finetuned) + else: + logging.warning(f"Fine-tuned model not found at {args.finetuned} — skipping") + + models["original"] = load_model(args.original) + + active_methods = 
[GT_NAME] + list(models.keys()) + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f"Found {n} samples in {args.data_dir}") + if n == 0: + logging.error("No samples found — check DATA_DIR path") + return + + # ── accumulators ────────────────────────────────────────────────────────── + all_metrics = [] + viz_frames = [] + valid_acc = {} # will be init on first frame + dist_bin_mae = {m: [] for m in active_methods} + close_range_valid = {m: [] for m in active_methods} + timing_ms_raw = {m: [] for m in models} + H = W = None + + # depth-vs-distance accumulators: + # "zivid_gt" – Zivid depth values binned by Zivid GT (shows intra-bin spatial spread) + # one entry per stereo model – model predictions binned by Zivid GT + depth_acc_keys = ["zivid_gt"] + list(models.keys()) + depth_accs = {k: DepthBinAccumulator() for k in depth_acc_keys} + + #import cv2 as _cv2 # local import to avoid top-level dependency if already imported + + for idx in range(n): + data = source.get_item(idx) + left = data['left'] + right = data['right'] + gt_mm = data['depth_zivid'].astype(np.float32) # Zivid GT in mm + rs_mm = data['depth_rs'].astype(np.float32) # RealSense depth in mm + + # Resize Zivid depth to match RealSense IR image resolution for pixel-level comparison + rs_h, rs_w = left.shape[:2] + if gt_mm.shape != (rs_h, rs_w): + #gt_mm = _cv2.resize(gt_mm, (rs_w, rs_h), interpolation=_cv2.INTER_NEAREST) + print(f"Shape mismatch: gt_mm {gt_mm.shape} vs rs {rs_h, rs_w}") + + if H is None: + H, W = rs_h, rs_w + for m in active_methods: + valid_acc[m] = np.zeros((H, W), np.float32) + + gt_m = gt_mm / 1000.0 # mm → m + rs_m = rs_mm / 1000.0 # mm → m + + # run inference for each model + frame_depths = {GT_NAME: gt_m} + for mname, model in models.items(): + t0 = time.monotonic() + frame_depths[mname] = infer_depth_rs_mm(model, left, right) + 
timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + + # per-frame metrics + gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M) + n_close = int(gt_close_mask.sum()) + + for mname in active_methods: + pred = frame_depths[mname] + valid_acc[mname] += (pred > 0).astype(np.float32) + + if mname == GT_NAME: + fm = FrameMetrics(GT_NAME, 0.0, 0.0, 0.0, 100.0, + float((pred > 0).mean()) * 100.0, 0.0, + mae_pen=0.0, mre_pen=0.0) + else: + fm = compute_metrics(pred, gt_m, timing_ms_raw[mname][-1], mname) + + all_metrics.append(fm) + + dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m)) + + close_cov = float((pred[gt_close_mask] > 0).mean()) * 100.0 if n_close > 0 else 0.0 + close_range_valid[mname].append(close_cov) + + # depth-vs-distance: accumulate per-bin stats + depth_accs["zivid_gt"].update(gt_m, gt_m) # GT vs itself → intra-bin spread + for mname in models: + depth_accs[mname].update(frame_depths[mname], gt_m) + + if idx < args.n_viz: + viz_frames.append({k: v.copy() for k, v in frame_depths.items()}) + + if (idx + 1) % 200 == 0 or (idx + 1) == n: + logging.info(f" {idx + 1}/{n} frames processed") + + # normalise coverage maps to [0, 1] + for m in active_methods: + valid_acc[m] /= max(n, 1) + + # ── aggregate timing ────────────────────────────────────────────────────── + mean_timing = {m: float(np.mean(ts)) if ts else 0.0 for m, ts in timing_ms_raw.items()} + mean_timing[GT_NAME] = 0.0 + + # ── build BenchmarkResults ──────────────────────────────────────────────── + results = BenchmarkResults( + method_names=active_methods, + method_labels={m: METHODS[m]["label"] for m in active_methods}, + method_colors={m: METHODS[m]["color"] for m in active_methods}, + ground_truth_name=GT_NAME, + n_frames=n, + width=W, + height=H, + all_metrics=all_metrics, + viz_frames=viz_frames, + coverage_maps=valid_acc, + dist_bin_mae=dist_bin_mae, + close_range_valid=close_range_valid, + source=f"INBOLT dataset ({args.data_dir})", + method_configs={ + "original": 
{"model_path": args.original}, + "finetuned": {"model_path": args.finetuned}, + }, + ) + + stats = aggregate(results, mean_timing) + + # ── generate report ─────────────────────────────────────────────────────── + reporter = ReportGenerator(results, stats, out_dir) + reporter.generate() + + # ── depth-vs-distance comparison plot ──────────────────────────────────── + plot_colors = { + "zivid_gt": METHODS[GT_NAME]["color"], + **{m: METHODS[m]["color"] for m in models if m in METHODS}, + } + plot_labels = { + "zivid_gt": "Zivid GT (spatial spread)", + "original": METHODS["original"]["label"], + "finetuned": METHODS["finetuned"]["label"], + } + # rename keys to human-readable labels for the plot + labeled_accs = {plot_labels.get(k, k): v + for k, v in depth_accs.items() + if depth_accs[k].count.sum() > 0} + labeled_colors = {plot_labels.get(k, k): plot_colors.get(k) + for k in depth_accs if depth_accs[k].count.sum() > 0} + + plot_depth_vs_distance( + accumulators=labeled_accs, + colors=labeled_colors, + out_path=out_dir / "depth_vs_distance.png", + ) + logging.info(f"All outputs written to {out_dir}") + + +if __name__ == '__main__': + # 1. works + #sm = build_example_depth_scale_regression_series() + #plot_depth_scale_regression(sm, out_path=Path(DEFAULT_OUT) / "depth_scale_regression_example.png", title="Example Depth Scale Regression") + + # 2. inbolt data + #main_inbolt_graphs() + + # 3. full benchmark + report + #main() + + # 4. inbolt with ffs + #main_inbolt_ffs_graphs_with_projection() + + # 5. inbolt with zivid projection + #main_inbolt_graphs_with_projection() + + # 6. inbolt with zivid projection on biased dataset + main_inbolt_ffs_graphs_with_projection_biased_dataset() diff --git a/scripts/benchmark_inbolt_fs.py b/scripts/benchmark_inbolt_fs.py new file mode 100644 index 0000000..392de0a --- /dev/null +++ b/scripts/benchmark_inbolt_fs.py @@ -0,0 +1,419 @@ +"""Benchmark FastFoundationStereo models + RealSense hardware depth on the Inbolt dataset. 
+ +This benchmark mirrors the structure of ``benchmark_faro_rs.py`` but uses the +Inbolt dataset and the meter-based reporting pipeline already used by +``benchmark_inbolt.py``. + +For fair pixel-wise comparison against the RealSense stereo pair and hardware +RealSense depth map, Zivid ground-truth depth is projected into RealSense image +space via ``DataSource.get_item_projected()``. + +Usage: + cd /home/adiroha/repos/Fast-FoundationStereo + python scripts/benchmark_inbolt_fs.py [--out_dir reports/inbolt_ffs_benchmark] +""" + +import argparse +import logging +import os +import sys +import time +import cv2 +from pathlib import Path +from typing import Dict, Optional + +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import numpy as np + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +import Utils as U +from benchmark_inbolt import DepthBinAccumulator, infer_depth_m, load_model, plot_depth_vs_distance +from scripts.data_manager_inbolt import DataSource, CAMERA_MATRIX_RS, DIST_COEFFS_RS +from metrics import ( + BenchmarkResults, + FrameMetrics, + compute_bin_mae, + compute_metrics, + aggregate, + CLOSE_RANGE_THRESHOLD_M, +) +from report import ReportGenerator + + +# ── custom report generator ────────────────────────────────────────────────── + +class ReportGeneratorInbolt(ReportGenerator): + """Custom report generator that shows 4 frames in depth comparison and error maps.""" + + def __init__(self, results, stats, output_dir) -> None: + super().__init__(results, stats, output_dir) + self._selected_viz_indices = [] + + def _get_selected_viz_indices(self, n_pick: int = 4): + """Return cached random frame indices used consistently across report sections.""" + if self._selected_viz_indices: + return self._selected_viz_indices + + n_total = len(self._r.viz_frames) + if n_total == 0: + self._selected_viz_indices = [] + return self._selected_viz_indices + + n = 
min(n_pick, n_total) + rng = np.random.default_rng(42) + self._selected_viz_indices = sorted(rng.choice(n_total, size=n, replace=False).tolist()) + return self._selected_viz_indices + + def _fig_depth_comparison(self) -> str: + if not self._r.viz_frames: + return self._empty_fig("depth_comparison.png", "No viz frames") + + sel = self._get_selected_viz_indices(n_pick=4) + if not sel: + return self._empty_fig("depth_comparison.png", "No viz frames") + + vf0 = self._r.viz_frames[sel[0]] + method_names = [n for n in self._r.method_names if n in vf0] + nrows = len(sel) + ncols = len(method_names) + fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows)) + axes = np.atleast_2d(axes) + cmap = self._depth_cmap() + + for r, frame_idx in enumerate(sel): + vf = self._r.viz_frames[frame_idx] + for c, name in enumerate(method_names): + ax = axes[r, c] + if name not in vf: + ax.axis("off") + continue + im = ax.imshow(vf[name], cmap=cmap, vmin=0.1, vmax=2.0) + plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="m") + title = self._r.method_labels.get(name, name) + if c == 0: + title = f"Frame {frame_idx + 1} • {title}" + ax.set_title(title, fontsize=9, wrap=True) + ax.axis("off") + + fig.suptitle("Depth Map Comparison (4 random frames) — values in meters", + fontsize=11, y=1.01) + fig.tight_layout() + return self._save(fig, "depth_comparison.png") + + def _fig_error_maps(self) -> str: + if not self._r.viz_frames or not self._non_gt: + return self._empty_fig("error_maps.png", "No comparison methods") + + sel = self._get_selected_viz_indices(n_pick=4) + if not sel: + return self._empty_fig("error_maps.png", "No viz frames") + + vf0 = self._r.viz_frames[sel[0]] + names = ([self._gt] if self._gt in vf0 else []) + [n for n in self._non_gt if n in vf0] + if not names: + return self._empty_fig("error_maps.png", "Ground truth not available in viz frame") + + nrows = len(sel) + ncols = len(names) + cmap = plt.get_cmap("hot").copy() + cmap.set_under("#222222") + fig, 
axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows)) + axes = np.atleast_2d(axes) + + for r, frame_idx in enumerate(sel): + vf = self._r.viz_frames[frame_idx] + gt = vf.get(self._gt) + if gt is None: + for c in range(ncols): + axes[r, c].axis("off") + continue + + for c, name in enumerate(names): + ax = axes[r, c] + if name not in vf: + ax.axis("off") + continue + pred = vf[name] + valid = (gt > 0) & (pred > 0) + err = np.where(valid, np.abs(pred - gt), 0.0).astype(np.float32) + im = ax.imshow(err, cmap=cmap, vmin=0.001, vmax=0.1) + plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="|error| (m)") + mean_err = float(np.abs(pred[valid] - gt[valid]).mean()) if valid.any() else 0.0 + label = self._r.method_labels.get(name, name) + if c == 0: + ax.set_title(f"Frame {frame_idx + 1} • {label}\nMAE={mean_err:.4f} m", fontsize=9) + else: + ax.set_title(f"{label}\nMAE={mean_err:.4f} m", fontsize=9) + ax.axis("off") + + gt_label = self._r.method_labels.get(self._gt, self._gt) + fig.suptitle(f"Absolute Error vs {gt_label} (4 random frames, m)", fontsize=11, y=1.01) + fig.tight_layout() + return self._save(fig, "error_maps.png") + + +# ── constants ──────────────────────────────────────────────────────────────── + +DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260415T084601Z-3-001/Data Collection' +ORIGINAL_PATH = f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth' +# FINETUNED_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_inbolt-20260415_epoch_030.pth' +# MODEL_PATH = f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth' +#FINETUNED_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt-20260415_epoch_111.pth' +#DEFAULT_OUT = f'{code_dir}/../reports/inbolt_ffs_benchmark-model37-111-set-20260414_142239' +FINETUNED_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt_planes_epoch_120.pth' +DEFAULT_OUT = 
def resolve_finetuned_model_path(
    preferred_path: str,
    weights_dir: Optional[Path] = None,
) -> Optional[str]:
    """Return an existing fine-tuned Inbolt checkpoint path, or None if not found.

    Resolution order:
      1. ``preferred_path`` itself, when it exists.
      2. The first match for one of the known checkpoint file names anywhere
         under ``weights_dir``.
      3. Any ``model_finetuned_inbolt*.pth`` under ``weights_dir``, taking the
         one whose *file name* sorts last.

    Args:
        preferred_path: Checkpoint path requested by the caller.
        weights_dir: Directory searched for fallbacks.  Defaults to
            ``<code_dir>/../weights`` (the location the original code used).

    Returns:
        The chosen checkpoint path as a string, or ``None`` when neither the
        preferred path nor any fallback exists.
    """
    preferred = Path(preferred_path)
    if preferred.exists():
        return str(preferred)

    if weights_dir is None:
        weights_dir = Path(code_dir) / '..' / 'weights'
    else:
        weights_dir = Path(weights_dir)

    candidate_names = (
        'model_finetuned_inbolt.pth',
        'model_finetuned_inbolt-20260415_epoch_030.pth',
    )

    # 1) Try known candidate file names anywhere under weights/
    for name in candidate_names:
        found = sorted(weights_dir.glob(f'**/{name}'))
        if found:
            logging.warning(
                f'Preferred fine-tuned model not found at {preferred}. Using fallback {found[0]}'
            )
            return str(found[0])

    # 2) Fallback to any Inbolt fine-tuned checkpoint.  Order by file name
    #    first (full path only as a tie-breaker): sorting whole paths lets the
    #    parent directory name decide the winner instead of the checkpoint name.
    generic = sorted(
        weights_dir.glob('**/model_finetuned_inbolt*.pth'),
        key=lambda p: (p.name, str(p)),
    )
    if generic:
        chosen = generic[-1]
        logging.warning(
            f'Preferred fine-tuned model not found at {preferred}. Using discovered checkpoint {chosen}'
        )
        return str(chosen)

    return None
H = W = None + + depth_acc_keys = ['zivid_gt', RS_NAME] + list(models.keys()) + depth_accs = {k: DepthBinAccumulator() for k in depth_acc_keys} + + for idx in range(n): + data = source.get_item_projected(idx) + left = data['left'] + right = data['right'] + gt_mm = data['depth_zivid'].astype(np.float32) + rs_mm = data['depth_rs'].astype(np.float32) + + if H is None: + H, W = gt_mm.shape[:2] + for m in active_methods: + valid_acc[m] = np.zeros((H, W), np.float32) + + gt_m = gt_mm / 1000.0 + rs_m = rs_mm / 1000.0 + + frame_depths = {GT_NAME: gt_m, RS_NAME: rs_m} + for mname, model in models.items(): + t0 = time.monotonic() + frame_depths[mname] = infer_depth_m(model, left, right) + # save raw data to p.g images 16 bit PNGs for later analysis if needed + #cv2.imwrite(str(out_dir / f'{mname}_{idx:03d}.png'), (frame_depths[mname] * 1000.0).astype(np.uint16)) + timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + + gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M) + n_close = int(gt_close_mask.sum()) + + # # create point clouds for visualization + # if idx % 10 == 0: + # for mname in active_methods: + # pred = frame_depths[mname] + + # XYZ = source.project_camera_to_3d(pred, CAMERA_MATRIX_RS, DIST_COEFFS_RS) # (N, 3) array of 3D points in Zivid camera space + # mname_path = os.path.join(out_dir, f'{mname}_{idx:03d}.ply') + # source.save_to_ply(XYZ/1000, mname_path) # save in meters for visualization + + + for mname in active_methods: + pred = frame_depths[mname] + valid_acc[mname] += (pred > 0).astype(np.float32) + + if mname == GT_NAME: + fm = FrameMetrics( + GT_NAME, + 0.0, + 0.0, + 0.0, + 100.0, + float((pred > 0).mean()) * 100.0, + 0.0, + mae_pen=0.0, + mre_pen=0.0, + ) + elif mname == RS_NAME: + fm = compute_metrics(pred, gt_m, elapsed_ms=0.0, method_name=RS_NAME) + else: + fm = compute_metrics(pred, gt_m, timing_ms_raw[mname][-1], mname) + + all_metrics.append(fm) + dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m)) + + close_cov = ( + 
float((pred[gt_close_mask] > 0).mean()) * 100.0 + if n_close > 0 else 0.0 + ) + close_range_valid[mname].append(close_cov) + + depth_accs['zivid_gt'].update(gt_m, gt_m) + depth_accs[RS_NAME].update(rs_m, gt_m) + for mname in models: + depth_accs[mname].update(frame_depths[mname], gt_m) + + if idx < args.n_viz: + viz_frames.append({k: v.copy() for k, v in frame_depths.items()}) + + if (idx + 1) % 200 == 0 or (idx + 1) == n: + logging.info(f' {idx + 1}/{n} frames processed') + + for m in active_methods: + valid_acc[m] /= max(n, 1) + + mean_timing = { + m: float(np.mean(ts)) if ts else 0.0 + for m, ts in timing_ms_raw.items() + } + mean_timing[GT_NAME] = 0.0 + mean_timing[RS_NAME] = 1000.0 / RS_FPS + + method_configs = { + 'original': {'model_path': args.original}, + RS_NAME: {'source': f'RealSense hardware depth (~{RS_FPS:.0f} FPS)'}, + GT_NAME: {'source': 'Projected Zivid depth map used as Inbolt ground truth'}, + } + if 'finetuned' in models and finetuned_path is not None: + method_configs['finetuned'] = {'model_path': finetuned_path} + + results = BenchmarkResults( + method_names=active_methods, + method_labels={m: METHODS[m]['label'] for m in active_methods}, + method_colors={m: METHODS[m]['color'] for m in active_methods}, + ground_truth_name=GT_NAME, + n_frames=n, + width=W, + height=H, + all_metrics=all_metrics, + viz_frames=viz_frames, + coverage_maps=valid_acc, + dist_bin_mae=dist_bin_mae, + close_range_valid=close_range_valid, + source=f'INBOLT dataset ({args.data_dir})', + method_configs=method_configs, + ) + + stats = aggregate(results, mean_timing) + if RS_NAME in stats: + stats[RS_NAME].fps_mean = RS_FPS + + reporter = ReportGeneratorInbolt(results, stats, out_dir) + reporter.generate() + + plot_colors = { + 'zivid_gt': METHODS[GT_NAME]['color'], + RS_NAME: METHODS[RS_NAME]['color'], + **{m: METHODS[m]['color'] for m in models if m in METHODS}, + } + plot_labels = { + 'zivid_gt': 'Zivid GT (spatial spread)', + RS_NAME: METHODS[RS_NAME]['label'], + 
'original': METHODS['original']['label'], + 'finetuned': METHODS['finetuned']['label'], + } + labeled_accs = { + plot_labels.get(k, k): v + for k, v in depth_accs.items() + if depth_accs[k].count.sum() > 0 + } + labeled_colors = { + plot_labels.get(k, k): plot_colors.get(k) + for k in depth_accs + if depth_accs[k].count.sum() > 0 + } + + plot_depth_vs_distance( + accumulators=labeled_accs, + colors=labeled_colors, + out_path=out_dir / 'depth_vs_distance.png', + ) + logging.info(f'All outputs written to {out_dir}') + + +if __name__ == '__main__': + main() diff --git a/scripts/benchmark_inbolt_fs_depthrs.py b/scripts/benchmark_inbolt_fs_depthrs.py new file mode 100644 index 0000000..df18044 --- /dev/null +++ b/scripts/benchmark_inbolt_fs_depthrs.py @@ -0,0 +1,386 @@ +"""Benchmark FastFoundationStereo + RealSense depth fusion vs baselines on the Inbolt dataset. + +Extends benchmark_inbolt_fs.py by adding depth-fusion models as additional methods. + +Methods compared +---------------- + original : FFS pretrained, no fine-tuning, stereo only + finetuned : FFS fine-tuned on Inbolt (stereo only) + depthrs_finetuned : FFS + RS Depth Fusion v1 (DepthEncoder + DepthFusionModule) + depthrs_v2 : FFS + RS Output Blend v2 (Init Blend + Output Blend, frozen features) + depth_rs : RealSense hardware depth (no stereo) + zivid_gt : Projected Zivid depth (ground truth) + +Usage: + cd /home/adiroha/repos/Fast-FoundationStereo + python scripts/benchmark_inbolt_fs_depthrs.py [--out_dir reports/inbolt_ffs_depthrs] +""" + +import argparse +import logging +import os +import sys +import time +from pathlib import Path +from typing import Dict, Optional + +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import numpy as np +import torch +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +import Utils as U +from core.utils.utils import InputPadder + +# Must be imported before torch.load so the 
depthrs models can be unpickled +from scripts.finetune_inbolt_depthrs import ( # noqa: F401 + FastFoundationStereoDepthRS, + DepthEncoder, + DepthFusionModule, + DepthInitBlend, +) +from scripts.finetune_inbolt_depthrs_2 import ( # noqa: F401 + FastFoundationStereoDepthRS_v2, + DepthInitBlend as DepthInitBlend_v2, + DepthOutputBlend, +) + +from benchmark_inbolt import ( + DepthBinAccumulator, + _preprocess_ir, + infer_depth_m, + load_model, + plot_depth_vs_distance, + BF, + ITERS, +) +from benchmark_inbolt_fs import ReportGeneratorInbolt, resolve_finetuned_model_path +from scripts.data_manager_inbolt import DataSource, CAMERA_MATRIX_RS, DIST_COEFFS_RS +from metrics import ( + BenchmarkResults, + FrameMetrics, + compute_bin_mae, + compute_metrics, + aggregate, + CLOSE_RANGE_THRESHOLD_M, +) +from report import ReportGenerator + + +# ── constants ──────────────────────────────────────────────────────────────── + +DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260415T084601Z-3-001/Data Collection' +ORIGINAL_PATH = f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth' +FINETUNED_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt-20260415_epoch_111.pth' +DEPTHRS_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt_depthrs_epoch_001.pth' +#DEPTHRS_V2_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt_depthrs_v2_epoch_014.pth' +DEFAULT_OUT = f'{code_dir}/../reports/inbolt_ffs_depthrs_benchmark' +N_VIZ = 5 + +METHODS: Dict[str, Dict[str, str]] = { + 'original': {'label': 'FFS Original', 'color': '#2980b9'}, + 'finetuned': {'label': 'FFS Fine-tuned (Inbolt)', 'color': '#e74c3c'}, + 'depthrs_finetuned': {'label': 'FFS + RS Depth Fusion v1 (Inbolt)', 'color': '#8e44ad'}, + #'depthrs_v2': {'label': 'FFS + RS Output Blend v2 (Inbolt)', 'color': '#1abc9c'}, + 'depth_rs': {'label': 'RealSense Hardware Depth', 'color': '#f39c12'}, + 'zivid_gt': {'label': 'Zivid GT (projected to RS)', 
@torch.no_grad()
def infer_depth_m_depthrs(
    model,
    left: np.ndarray,
    right: np.ndarray,
    depth_rs_mm: np.ndarray,
) -> np.ndarray:
    """
    Run depth-fusion inference; return depth map in metres (H×W float32).

    Args:
        model: Depth-fusion model; called as
            ``model.forward(left, right, depth_rs_mm=..., iters=ITERS, test_mode=True)``.
        left: Left IR image; its first two dimensions define the output shape.
        right: Right IR image.
        depth_rs_mm: (H, W) float32, RealSense depth in millimetres.

    Returns:
        Depth in metres with the same H×W as ``left``; pixels with
        non-positive disparity are 0.
    """
    left_t, right_t = _preprocess_ir(left, right)

    # NOTE: a leftover debug statement used to zero `depth_rs_mm` here, which
    # meant the fusion model never saw the sensor depth.  The real RealSense
    # depth is now forwarded; pass an all-zero array explicitly to ablate it.
    depth_rs_t = torch.as_tensor(depth_rs_mm.astype(np.float32))[None, None].cuda()  # (1,1,H,W)

    # Pad all three inputs identically so they stay pixel-aligned.
    padder = InputPadder(left_t.shape, divis_by=32, force_square=False)
    left_t, right_t, depth_rs_t = padder.pad(left_t, right_t, depth_rs_t)

    with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
        disp = model.forward(
            left_t, right_t,
            depth_rs_mm=depth_rs_t,
            iters=ITERS,
            test_mode=True,
        )

    disp = padder.unpad(disp.float())
    disp_np = disp.cpu().numpy().reshape(left.shape[:2]).clip(0, None)

    # Convert only where disparity is positive to avoid dividing by zero.
    depth_m = np.zeros_like(disp_np)
    valid = disp_np > 0
    depth_m[valid] = (BF / disp_np[valid]) / 1000.0  # disparity → mm → m
    return depth_m
resolve_finetuned_model_path(args.finetuned) + if finetuned_path is not None: + models['finetuned'] = load_model(finetuned_path) + else: + logging.warning(f'Stereo fine-tuned model not found at {args.finetuned} — skipping') + + models['original'] = load_model(args.original) + + depthrs_path = resolve_depthrs_model_path(args.depthrs) + if depthrs_path is not None: + depthrs_models['depthrs_finetuned'] = load_depthrs_model(depthrs_path) + else: + logging.warning(f'Depth-fusion v1 model not found at {args.depthrs} — skipping') + + # depthrs_v2_path = resolve_depthrs_v2_model_path(args.depthrs_v2) + # if depthrs_v2_path is not None: + # depthrs_models['depthrs_v2'] = load_depthrs_model(depthrs_v2_path) + # else: + # logging.warning(f'Depth-fusion v2 model not found at {args.depthrs_v2} — skipping') + + all_model_names = list(models.keys()) + list(depthrs_models.keys()) + active_methods = [GT_NAME, RS_NAME] + all_model_names + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f'Found {n} samples in {args.data_dir}') + if n == 0: + logging.error('No samples found — check DATA_DIR path') + return + + # ── accumulators ────────────────────────────────────────────────────────── + all_metrics = [] + viz_frames = [] + valid_acc = {} + dist_bin_mae = {m: [] for m in active_methods} + close_range_valid = {m: [] for m in active_methods} + timing_ms_raw = {m: [] for m in all_model_names} + H = W = None + + depth_acc_keys = [GT_NAME, RS_NAME] + all_model_names + depth_accs = {k: DepthBinAccumulator() for k in depth_acc_keys} + + for idx in range(n): + data = source.get_item_projected(idx) + left = data['left'] + right = data['right'] + gt_mm = data['depth_zivid'].astype(np.float32) + rs_mm = data['depth_rs'].astype(np.float32) + + h, w = gt_mm.shape[:2] + if rs_mm.shape != (h, w): + import cv2 + rs_mm = cv2.resize(rs_mm, (w, h), 
interpolation=cv2.INTER_NEAREST) + + if H is None: + H, W = h, w + for m in active_methods: + valid_acc[m] = np.zeros((H, W), np.float32) + + gt_m = gt_mm / 1000.0 + rs_m = rs_mm / 1000.0 + + frame_depths = {GT_NAME: gt_m, RS_NAME: rs_m} + + # stereo-only models + for mname, model in models.items(): + t0 = time.monotonic() + frame_depths[mname] = infer_depth_m(model, left, right) + timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + + # depth-fusion models + for mname, model in depthrs_models.items(): + t0 = time.monotonic() + frame_depths[mname] = infer_depth_m_depthrs(model, left, right, rs_mm) + timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + + gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M) + n_close = int(gt_close_mask.sum()) + + for mname in active_methods: + pred = frame_depths[mname] + valid_acc[mname] += (pred > 0).astype(np.float32) + + if mname == GT_NAME: + fm = FrameMetrics( + GT_NAME, 0.0, 0.0, 0.0, 100.0, + float((pred > 0).mean()) * 100.0, 0.0, + mae_pen=0.0, mre_pen=0.0, + ) + elif mname == RS_NAME: + fm = compute_metrics(pred, gt_m, elapsed_ms=0.0, method_name=RS_NAME) + else: + fm = compute_metrics(pred, gt_m, timing_ms_raw[mname][-1], mname) + + all_metrics.append(fm) + dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m)) + + close_cov = ( + float((pred[gt_close_mask] > 0).mean()) * 100.0 + if n_close > 0 else 0.0 + ) + close_range_valid[mname].append(close_cov) + + depth_accs[GT_NAME].update(gt_m, gt_m) + depth_accs[RS_NAME].update(rs_m, gt_m) + for mname in all_model_names: + depth_accs[mname].update(frame_depths[mname], gt_m) + + if idx < args.n_viz: + viz_frames.append({k: v.copy() for k, v in frame_depths.items()}) + + if (idx + 1) % 20 == 0 or (idx + 1) == n: + logging.info(f' {idx + 1}/{n} frames processed') + + for m in active_methods: + valid_acc[m] /= max(n, 1) + + mean_timing = { + m: float(np.mean(ts)) if ts else 0.0 + for m, ts in timing_ms_raw.items() + } + mean_timing[GT_NAME] = 0.0 + 
mean_timing[RS_NAME] = 1000.0 / RS_FPS + + method_configs = { + 'original': {'model_path': args.original}, + RS_NAME: {'source': f'RealSense hardware depth (~{RS_FPS:.0f} FPS)'}, + GT_NAME: {'source': 'Projected Zivid depth map (ground truth)'}, + } + if 'finetuned' in models and finetuned_path: + method_configs['finetuned'] = {'model_path': finetuned_path} + if 'depthrs_finetuned' in depthrs_models and depthrs_path: + method_configs['depthrs_finetuned'] = {'model_path': depthrs_path} + # if 'depthrs_v2' in depthrs_models and depthrs_v2_path: + # method_configs['depthrs_v2'] = {'model_path': depthrs_v2_path} + + results = BenchmarkResults( + method_names=active_methods, + method_labels={m: METHODS[m]['label'] for m in active_methods}, + method_colors={m: METHODS[m]['color'] for m in active_methods}, + ground_truth_name=GT_NAME, + n_frames=n, + width=W, + height=H, + all_metrics=all_metrics, + viz_frames=viz_frames, + coverage_maps=valid_acc, + dist_bin_mae=dist_bin_mae, + close_range_valid=close_range_valid, + source=f'INBOLT dataset ({args.data_dir})', + method_configs=method_configs, + ) + + stats = aggregate(results, mean_timing) + if RS_NAME in stats: + stats[RS_NAME].fps_mean = RS_FPS + + reporter = ReportGeneratorInbolt(results, stats, out_dir) + reporter.generate() + + # depth-vs-distance accuracy plot + plot_labels = {k: METHODS[k]['label'] for k in METHODS} + plot_colors = {k: METHODS[k]['color'] for k in METHODS} + + labeled_accs = { + plot_labels.get(k, k): v + for k, v in depth_accs.items() + if depth_accs[k].count.sum() > 0 + } + labeled_colors = { + plot_labels.get(k, k): plot_colors.get(k) + for k in depth_accs + if depth_accs[k].count.sum() > 0 + } + + plot_depth_vs_distance( + accumulators=labeled_accs, + colors=labeled_colors, + out_path=out_dir / 'depth_vs_distance.png', + ) + logging.info(f'All outputs written to {out_dir}') + + +if __name__ == '__main__': + main() diff --git a/scripts/benchmark_inbolt_fs_plane_bf.py 
class ReportGeneratorInbolt(ReportGenerator):
    """Custom report generator that shows up to 4 frames in depth comparison and error maps."""

    def __init__(self, results, stats, output_dir) -> None:
        super().__init__(results, stats, output_dir)
        # Cache of the frame indices shared by every figure of one report.
        self._selected_viz_indices = []

    def _get_selected_viz_indices(self, n_pick: int = 4):
        """Return cached random frame indices used consistently across report sections.

        The first non-empty selection is cached and returned by later calls,
        regardless of the ``n_pick`` they pass.  The generator is seeded with
        a fixed value, so the selection is reproducible.
        """
        if self._selected_viz_indices:
            return self._selected_viz_indices

        n_total = len(self._r.viz_frames)
        if n_total == 0:
            self._selected_viz_indices = []
            return self._selected_viz_indices

        n = min(n_pick, n_total)
        rng = np.random.default_rng(42)
        self._selected_viz_indices = sorted(rng.choice(n_total, size=n, replace=False).tolist())
        return self._selected_viz_indices

    def _fig_depth_comparison(self) -> str:
        """Render one depth map per (selected frame, method) and save the grid."""
        out_name = "depth_comparison.png"
        if not self._r.viz_frames:
            return self._empty_fig(out_name, "No viz frames")

        sel = self._get_selected_viz_indices(n_pick=4)
        if not sel:
            return self._empty_fig(out_name, "No viz frames")

        # The column layout is fixed by the first selected frame.
        vf0 = self._r.viz_frames[sel[0]]
        method_names = [n for n in self._r.method_names if n in vf0]
        if not method_names:
            # plt.subplots() rejects a zero-column grid; mirror the guard
            # that _fig_error_maps already has.
            return self._empty_fig(out_name, "No methods available in viz frame")

        nrows = len(sel)
        ncols = len(method_names)
        fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows))
        # Normalise to 2-D so axes[r, c] works for single-row/column grids.
        axes = np.atleast_2d(axes)
        cmap = self._depth_cmap()

        for r, frame_idx in enumerate(sel):
            vf = self._r.viz_frames[frame_idx]
            for c, name in enumerate(method_names):
                ax = axes[r, c]
                if name not in vf:
                    ax.axis("off")
                    continue
                im = ax.imshow(vf[name], cmap=cmap, vmin=0.1, vmax=2.0)
                plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="m")
                title = self._r.method_labels.get(name, name)
                if c == 0:
                    # Only the first column carries the (1-based) frame number.
                    title = f"Frame {frame_idx + 1} • {title}"
                ax.set_title(title, fontsize=9, wrap=True)
                ax.axis("off")

        # Report the number of rows actually drawn (may be fewer than 4).
        fig.suptitle(
            f"Depth Map Comparison ({len(sel)} random frames) — values in meters",
            fontsize=11, y=1.01,
        )
        fig.tight_layout()
        return self._save(fig, out_name)

    def _fig_error_maps(self) -> str:
        """Render per-pixel absolute error against the ground-truth map and save the grid.

        Columns are the ground truth (when present in the first selected
        frame) followed by the non-GT methods present there.  Pixels where
        either map is non-positive are set to 0, which falls below ``vmin``
        and is drawn in the colormap's "under" colour.
        """
        out_name = "error_maps.png"
        if not self._r.viz_frames or not self._non_gt:
            return self._empty_fig(out_name, "No comparison methods")

        sel = self._get_selected_viz_indices(n_pick=4)
        if not sel:
            return self._empty_fig(out_name, "No viz frames")

        vf0 = self._r.viz_frames[sel[0]]
        names = ([self._gt] if self._gt in vf0 else []) + [n for n in self._non_gt if n in vf0]
        if not names:
            return self._empty_fig(out_name, "Ground truth not available in viz frame")

        nrows = len(sel)
        ncols = len(names)
        cmap = plt.get_cmap("hot").copy()
        cmap.set_under("#222222")
        fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows))
        axes = np.atleast_2d(axes)

        for r, frame_idx in enumerate(sel):
            vf = self._r.viz_frames[frame_idx]
            gt = vf.get(self._gt)
            if gt is None:
                # No reference for this frame: blank the whole row.
                for c in range(ncols):
                    axes[r, c].axis("off")
                continue

            for c, name in enumerate(names):
                ax = axes[r, c]
                if name not in vf:
                    ax.axis("off")
                    continue
                pred = vf[name]
                valid = (gt > 0) & (pred > 0)
                # Compute |pred - gt| once; reuse it for the image and the MAE.
                abs_err = np.abs(pred - gt)
                err = np.where(valid, abs_err, 0.0).astype(np.float32)
                im = ax.imshow(err, cmap=cmap, vmin=0.001, vmax=0.1)
                plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="|error| (m)")
                mean_err = float(abs_err[valid].mean()) if valid.any() else 0.0
                label = self._r.method_labels.get(name, name)
                if c == 0:
                    ax.set_title(f"Frame {frame_idx + 1} • {label}\nMAE={mean_err:.4f} m", fontsize=9)
                else:
                    ax.set_title(f"{label}\nMAE={mean_err:.4f} m", fontsize=9)
                ax.axis("off")

        gt_label = self._r.method_labels.get(self._gt, self._gt)
        fig.suptitle(
            f"Absolute Error vs {gt_label} ({len(sel)} random frames, m)",
            fontsize=11, y=1.01,
        )
        fig.tight_layout()
        return self._save(fig, out_name)
def resolve_finetuned_model_path(
    preferred_path: str,
    weights_dir: Optional[Path] = None,
) -> Optional[str]:
    """Return an existing fine-tuned Inbolt checkpoint path, or None if not found.

    Resolution order:
      1. ``preferred_path`` itself, when it exists.
      2. The first match for one of the known checkpoint file names anywhere
         under ``weights_dir``.
      3. Any ``model_finetuned_inbolt*.pth`` under ``weights_dir``, taking the
         one whose *file name* sorts last.

    Args:
        preferred_path: Checkpoint path requested by the caller.
        weights_dir: Directory searched for fallbacks.  Defaults to
            ``<code_dir>/../weights`` (the location the original code used).

    Returns:
        The chosen checkpoint path as a string, or ``None`` when neither the
        preferred path nor any fallback exists.
    """
    preferred = Path(preferred_path)
    if preferred.exists():
        return str(preferred)

    if weights_dir is None:
        weights_dir = Path(code_dir) / '..' / 'weights'
    else:
        weights_dir = Path(weights_dir)

    candidate_names = (
        'model_finetuned_inbolt.pth',
        'model_finetuned_inbolt-20260415_epoch_030.pth',
    )

    # 1) Try known candidate file names anywhere under weights/
    for name in candidate_names:
        found = sorted(weights_dir.glob(f'**/{name}'))
        if found:
            logging.warning(
                f'Preferred fine-tuned model not found at {preferred}. Using fallback {found[0]}'
            )
            return str(found[0])

    # 2) Fallback to any Inbolt fine-tuned checkpoint.  Order by file name
    #    first (full path only as a tie-breaker): sorting whole paths lets the
    #    parent directory name decide the winner instead of the checkpoint name.
    generic = sorted(
        weights_dir.glob('**/model_finetuned_inbolt*.pth'),
        key=lambda p: (p.name, str(p)),
    )
    if generic:
        chosen = generic[-1]
        logging.warning(
            f'Preferred fine-tuned model not found at {preferred}. Using discovered checkpoint {chosen}'
        )
        return str(chosen)

    return None
out_dir.mkdir(parents=True, exist_ok=True) + + # ── load stereo models ─────────────────────────────────────────────────── + models = {} + finetuned_path = resolve_finetuned_model_path(args.finetuned) + if finetuned_path is not None: + models['finetuned'] = load_model(finetuned_path) + else: + logging.warning( + f'Fine-tuned model not found (preferred: {args.finetuned}) and no fallback checkpoint found — skipping' + ) + + models['original'] = load_model(args.original) + + active_methods = [GT_NAME, RS_NAME] + list(models.keys()) + + # ── dataset ────────────────────────────────────────────────────────────── + source = DataSource(train_mode = False) + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f'Found {n} samples in {args.data_dir}') + if n == 0: + logging.error('No samples found — check DATA_DIR path') + return + + # ── accumulators ───────────────────────────────────────────────────────── + all_metrics = [] + viz_frames = [] + valid_acc = {} + dist_bin_mae = {m: [] for m in active_methods} + close_range_valid = {m: [] for m in active_methods} + timing_ms_raw = {m: [] for m in models} + H = W = None + + depth_acc_keys = ['zivid_gt', RS_NAME] + list(models.keys()) + depth_accs = {k: DepthBinAccumulator() for k in depth_acc_keys} + + for idx in range(n): + data = source.get_item_projected(idx) + left = data['left'] + right = data['right'] + gt_mm = data['depth_zivid'].astype(np.float32) + rs_mm = data['depth_rs'].astype(np.float32) + + if H is None: + H, W = gt_mm.shape[:2] + for m in active_methods: + valid_acc[m] = np.zeros((H, W), np.float32) + + gt_m = gt_mm / 1000.0 + rs_m = rs_mm / 1000.0 + + # valid only for flat regions + valid = (gt_m > 0) + valid = find_flat_regions(gt_mm, valid) + gt_m[valid == False] = 0.0 + + frame_depths = {GT_NAME: gt_m, RS_NAME: rs_m} + for mname, model in models.items(): + t0 = time.monotonic() + if mname == 'finetuned': + frame_depths[mname] = infer_depth_nobf_m(model, left, right) + else: + 
frame_depths[mname] = infer_depth_m(model, left, right) + # save raw data to p.g images 16 bit PNGs for later analysis if needed + #cv2.imwrite(str(out_dir / f'{mname}_{idx:03d}.png'), (frame_depths[mname] * 1000.0).astype(np.uint16)) + timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + + gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M) + n_close = int(gt_close_mask.sum()) + + # # create point clouds for visualization + # if idx % 10 == 0: + # for mname in active_methods: + # pred = frame_depths[mname] + + # XYZ = source.project_camera_to_3d(pred, CAMERA_MATRIX_RS, DIST_COEFFS_RS) # (N, 3) array of 3D points in Zivid camera space + # mname_path = os.path.join(out_dir, f'{mname}_{idx:03d}.ply') + # source.save_to_ply(XYZ/1000, mname_path) # save in meters for visualization + + + for mname in active_methods: + pred = frame_depths[mname] + + valid_acc[mname] += (pred > 0).astype(np.float32) + + if mname == GT_NAME: + fm = FrameMetrics( + GT_NAME, + 0.0, + 0.0, + 0.0, + 100.0, + float((pred > 0).mean()) * 100.0, + 0.0, + mae_pen=0.0, + mre_pen=0.0, + ) + elif mname == RS_NAME: + fm = compute_metrics(pred, gt_m, elapsed_ms=0.0, method_name=RS_NAME) + else: + fm = compute_metrics(pred, gt_m, timing_ms_raw[mname][-1], mname) + + all_metrics.append(fm) + dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m)) + + close_cov = ( + float((pred[gt_close_mask] > 0).mean()) * 100.0 + if n_close > 0 else 0.0 + ) + close_range_valid[mname].append(close_cov) + + depth_accs['zivid_gt'].update(gt_m, gt_m) + depth_accs[RS_NAME].update(rs_m, gt_m) + for mname in models: + depth_accs[mname].update(frame_depths[mname], gt_m) + + if idx < args.n_viz: + viz_frames.append({k: v.copy() for k, v in frame_depths.items()}) + + if (idx + 1) % 200 == 0 or (idx + 1) == n: + logging.info(f' {idx + 1}/{n} frames processed') + + for m in active_methods: + valid_acc[m] /= max(n, 1) + + mean_timing = { + m: float(np.mean(ts)) if ts else 0.0 + for m, ts in timing_ms_raw.items() 
+ } + mean_timing[GT_NAME] = 0.0 + mean_timing[RS_NAME] = 1000.0 / RS_FPS + + method_configs = { + 'original': {'model_path': args.original}, + RS_NAME: {'source': f'RealSense hardware depth (~{RS_FPS:.0f} FPS)'}, + GT_NAME: {'source': 'Projected Zivid depth map used as Inbolt ground truth'}, + } + if 'finetuned' in models and finetuned_path is not None: + method_configs['finetuned'] = {'model_path': finetuned_path} + + results = BenchmarkResults( + method_names=active_methods, + method_labels={m: METHODS[m]['label'] for m in active_methods}, + method_colors={m: METHODS[m]['color'] for m in active_methods}, + ground_truth_name=GT_NAME, + n_frames=n, + width=W, + height=H, + all_metrics=all_metrics, + viz_frames=viz_frames, + coverage_maps=valid_acc, + dist_bin_mae=dist_bin_mae, + close_range_valid=close_range_valid, + source=f'INBOLT dataset ({args.data_dir})', + method_configs=method_configs, + ) + + stats = aggregate(results, mean_timing) + if RS_NAME in stats: + stats[RS_NAME].fps_mean = RS_FPS + + reporter = ReportGeneratorInbolt(results, stats, out_dir) + reporter.generate() + + plot_colors = { + 'zivid_gt': METHODS[GT_NAME]['color'], + RS_NAME: METHODS[RS_NAME]['color'], + **{m: METHODS[m]['color'] for m in models if m in METHODS}, + } + plot_labels = { + 'zivid_gt': 'Zivid GT (spatial spread)', + RS_NAME: METHODS[RS_NAME]['label'], + 'original': METHODS['original']['label'], + 'finetuned': METHODS['finetuned']['label'], + } + labeled_accs = { + plot_labels.get(k, k): v + for k, v in depth_accs.items() + if depth_accs[k].count.sum() > 0 + } + labeled_colors = { + plot_labels.get(k, k): plot_colors.get(k) + for k in depth_accs + if depth_accs[k].count.sum() > 0 + } + + plot_depth_vs_distance( + accumulators=labeled_accs, + colors=labeled_colors, + out_path=out_dir / 'depth_vs_distance.png', + ) + logging.info(f'All outputs written to {out_dir}') + + +if __name__ == '__main__': + main() diff --git a/scripts/benchmark_inbolt_fs_planes.py 
b/scripts/benchmark_inbolt_fs_planes.py new file mode 100644 index 0000000..06d5646 --- /dev/null +++ b/scripts/benchmark_inbolt_fs_planes.py @@ -0,0 +1,426 @@ +"""Benchmark FastFoundationStereo models + RealSense hardware depth on the Inbolt dataset. + +This benchmark mirrors the structure of ``benchmark_faro_rs.py`` but uses the +Inbolt dataset and the meter-based reporting pipeline already used by +``benchmark_inbolt.py``. + +For fair pixel-wise comparison against the RealSense stereo pair and hardware +RealSense depth map, Zivid ground-truth depth is projected into RealSense image +space via ``DataSource.get_item_projected()``. + +Usage: + cd /home/adiroha/repos/Fast-FoundationStereo + python scripts/benchmark_inbolt_fs.py [--out_dir reports/inbolt_ffs_benchmark] +""" + +import argparse +import logging +import os +import sys +import time +import cv2 +from pathlib import Path +from typing import Dict, Optional + +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import numpy as np + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +import Utils as U +from benchmark_inbolt import DepthBinAccumulator, infer_depth_m, load_model, plot_depth_vs_distance +from scripts.data_manager_inbolt import DataSource, CAMERA_MATRIX_RS, DIST_COEFFS_RS +from metrics import ( + BenchmarkResults, + FrameMetrics, + compute_bin_mae, + compute_metrics, + aggregate, + CLOSE_RANGE_THRESHOLD_M, +) +from report import ReportGenerator +from finetune_inbolt_planes import find_flat_regions + + +# ── custom report generator ────────────────────────────────────────────────── + +class ReportGeneratorInbolt(ReportGenerator): + """Custom report generator that shows 4 frames in depth comparison and error maps.""" + + def __init__(self, results, stats, output_dir) -> None: + super().__init__(results, stats, output_dir) + self._selected_viz_indices = [] + + def _get_selected_viz_indices(self, n_pick: int 
= 4): + """Return cached random frame indices used consistently across report sections.""" + if self._selected_viz_indices: + return self._selected_viz_indices + + n_total = len(self._r.viz_frames) + if n_total == 0: + self._selected_viz_indices = [] + return self._selected_viz_indices + + n = min(n_pick, n_total) + rng = np.random.default_rng(42) + self._selected_viz_indices = sorted(rng.choice(n_total, size=n, replace=False).tolist()) + return self._selected_viz_indices + + def _fig_depth_comparison(self) -> str: + if not self._r.viz_frames: + return self._empty_fig("depth_comparison.png", "No viz frames") + + sel = self._get_selected_viz_indices(n_pick=4) + if not sel: + return self._empty_fig("depth_comparison.png", "No viz frames") + + vf0 = self._r.viz_frames[sel[0]] + method_names = [n for n in self._r.method_names if n in vf0] + nrows = len(sel) + ncols = len(method_names) + fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows)) + axes = np.atleast_2d(axes) + cmap = self._depth_cmap() + + for r, frame_idx in enumerate(sel): + vf = self._r.viz_frames[frame_idx] + for c, name in enumerate(method_names): + ax = axes[r, c] + if name not in vf: + ax.axis("off") + continue + im = ax.imshow(vf[name], cmap=cmap, vmin=0.1, vmax=2.0) + plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="m") + title = self._r.method_labels.get(name, name) + if c == 0: + title = f"Frame {frame_idx + 1} • {title}" + ax.set_title(title, fontsize=9, wrap=True) + ax.axis("off") + + fig.suptitle("Depth Map Comparison (4 random frames) — values in meters", + fontsize=11, y=1.01) + fig.tight_layout() + return self._save(fig, "depth_comparison.png") + + def _fig_error_maps(self) -> str: + if not self._r.viz_frames or not self._non_gt: + return self._empty_fig("error_maps.png", "No comparison methods") + + sel = self._get_selected_viz_indices(n_pick=4) + if not sel: + return self._empty_fig("error_maps.png", "No viz frames") + + vf0 = self._r.viz_frames[sel[0]] + names = 
([self._gt] if self._gt in vf0 else []) + [n for n in self._non_gt if n in vf0] + if not names: + return self._empty_fig("error_maps.png", "Ground truth not available in viz frame") + + nrows = len(sel) + ncols = len(names) + cmap = plt.get_cmap("hot").copy() + cmap.set_under("#222222") + fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows)) + axes = np.atleast_2d(axes) + + for r, frame_idx in enumerate(sel): + vf = self._r.viz_frames[frame_idx] + gt = vf.get(self._gt) + if gt is None: + for c in range(ncols): + axes[r, c].axis("off") + continue + + for c, name in enumerate(names): + ax = axes[r, c] + if name not in vf: + ax.axis("off") + continue + pred = vf[name] + valid = (gt > 0) & (pred > 0) + err = np.where(valid, np.abs(pred - gt), 0.0).astype(np.float32) + im = ax.imshow(err, cmap=cmap, vmin=0.001, vmax=0.1) + plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="|error| (m)") + mean_err = float(np.abs(pred[valid] - gt[valid]).mean()) if valid.any() else 0.0 + label = self._r.method_labels.get(name, name) + if c == 0: + ax.set_title(f"Frame {frame_idx + 1} • {label}\nMAE={mean_err:.4f} m", fontsize=9) + else: + ax.set_title(f"{label}\nMAE={mean_err:.4f} m", fontsize=9) + ax.axis("off") + + gt_label = self._r.method_labels.get(self._gt, self._gt) + fig.suptitle(f"Absolute Error vs {gt_label} (4 random frames, m)", fontsize=11, y=1.01) + fig.tight_layout() + return self._save(fig, "error_maps.png") + + +# ── constants ──────────────────────────────────────────────────────────────── + +DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260415T084601Z-3-001/Data Collection' +ORIGINAL_PATH = f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth' +# FINETUNED_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_inbolt-20260415_epoch_030.pth' +# MODEL_PATH = f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth' +#FINETUNED_PATH = 
f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt-20260415_epoch_111.pth' +#DEFAULT_OUT = f'{code_dir}/../reports/inbolt_ffs_benchmark-model37-111-set-20260414_142239' +FINETUNED_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt_planes_25_epoch_012.pth' +DEFAULT_OUT = f'{code_dir}/../reports/inbolt_ffs_benchmark-planes_25' +N_VIZ = 5 + +METHODS: Dict[str, Dict[str, str]] = { + 'original': {'label': 'FFS Original', 'color': '#2980b9'}, + 'finetuned': {'label': 'FFS Fine-tuned (INBOLT)', 'color': '#e74c3c'}, + 'depth_rs': {'label': 'RealSense Hardware Depth', 'color': '#f39c12'}, + 'zivid_gt': {'label': 'Zivid GT (projected to RS)', 'color': '#27ae60'}, +} +GT_NAME = 'zivid_gt' +RS_NAME = 'depth_rs' +RS_FPS = 30.0 + + +def resolve_finetuned_model_path(preferred_path: str) -> Optional[str]: + """Return an existing fine-tuned Inbolt checkpoint path, or None if not found.""" + preferred = Path(preferred_path) + if preferred.exists(): + return str(preferred) + + weights_dir = Path(code_dir) / '..' / 'weights' + candidate_names = [ + 'model_finetuned_inbolt.pth', + 'model_finetuned_inbolt-20260415_epoch_030.pth', + ] + + # 1) Try known candidate file names anywhere under weights/ + for name in candidate_names: + found = sorted(weights_dir.glob(f'**/{name}')) + if found: + logging.warning( + f'Preferred fine-tuned model not found at {preferred}. Using fallback {found[0]}' + ) + return str(found[0]) + + # 2) Fallback to any Inbolt fine-tuned checkpoint, prefer lexicographically latest + generic = sorted(weights_dir.glob('**/model_finetuned_inbolt*.pth')) + if generic: + chosen = generic[-1] + logging.warning( + f'Preferred fine-tuned model not found at {preferred}. 
Using discovered checkpoint {chosen}' + ) + return str(chosen) + + return None + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report') + parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root') + parser.add_argument('--original', default=ORIGINAL_PATH, help='Path to original model weights') + parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights') + parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison') + args = parser.parse_args() + + U.set_logging_format() + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # ── load stereo models ─────────────────────────────────────────────────── + models = {} + finetuned_path = resolve_finetuned_model_path(args.finetuned) + if finetuned_path is not None: + models['finetuned'] = load_model(finetuned_path) + else: + logging.warning( + f'Fine-tuned model not found (preferred: {args.finetuned}) and no fallback checkpoint found — skipping' + ) + + models['original'] = load_model(args.original) + + active_methods = [GT_NAME, RS_NAME] + list(models.keys()) + + # ── dataset ────────────────────────────────────────────────────────────── + source = DataSource(train_mode = False) + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f'Found {n} samples in {args.data_dir}') + if n == 0: + logging.error('No samples found — check DATA_DIR path') + return + + # ── accumulators ───────────────────────────────────────────────────────── + all_metrics = [] + viz_frames = [] + valid_acc = {} + dist_bin_mae = {m: [] for m in active_methods} + close_range_valid = {m: [] for m in active_methods} + timing_ms_raw = {m: [] for m in models} + 
H = W = None + + depth_acc_keys = ['zivid_gt', RS_NAME] + list(models.keys()) + depth_accs = {k: DepthBinAccumulator() for k in depth_acc_keys} + + for idx in range(n): + data = source.get_item_projected(idx) + left = data['left'] + right = data['right'] + gt_mm = data['depth_zivid'].astype(np.float32) + rs_mm = data['depth_rs'].astype(np.float32) + + if H is None: + H, W = gt_mm.shape[:2] + for m in active_methods: + valid_acc[m] = np.zeros((H, W), np.float32) + + gt_m = gt_mm / 1000.0 + rs_m = rs_mm / 1000.0 + + # valid only for flat regions + valid = (gt_m > 0) + valid = find_flat_regions(gt_mm, valid) + gt_m[valid == False] = 0.0 + + frame_depths = {GT_NAME: gt_m, RS_NAME: rs_m} + for mname, model in models.items(): + t0 = time.monotonic() + frame_depths[mname] = infer_depth_m(model, left, right) + # save raw data to p.g images 16 bit PNGs for later analysis if needed + #cv2.imwrite(str(out_dir / f'{mname}_{idx:03d}.png'), (frame_depths[mname] * 1000.0).astype(np.uint16)) + timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + + gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M) + n_close = int(gt_close_mask.sum()) + + # # create point clouds for visualization + # if idx % 10 == 0: + # for mname in active_methods: + # pred = frame_depths[mname] + + # XYZ = source.project_camera_to_3d(pred, CAMERA_MATRIX_RS, DIST_COEFFS_RS) # (N, 3) array of 3D points in Zivid camera space + # mname_path = os.path.join(out_dir, f'{mname}_{idx:03d}.ply') + # source.save_to_ply(XYZ/1000, mname_path) # save in meters for visualization + + + for mname in active_methods: + pred = frame_depths[mname] + + valid_acc[mname] += (pred > 0).astype(np.float32) + + if mname == GT_NAME: + fm = FrameMetrics( + GT_NAME, + 0.0, + 0.0, + 0.0, + 100.0, + float((pred > 0).mean()) * 100.0, + 0.0, + mae_pen=0.0, + mre_pen=0.0, + ) + elif mname == RS_NAME: + fm = compute_metrics(pred, gt_m, elapsed_ms=0.0, method_name=RS_NAME) + else: + fm = compute_metrics(pred, gt_m, 
timing_ms_raw[mname][-1], mname) + + all_metrics.append(fm) + dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m)) + + close_cov = ( + float((pred[gt_close_mask] > 0).mean()) * 100.0 + if n_close > 0 else 0.0 + ) + close_range_valid[mname].append(close_cov) + + depth_accs['zivid_gt'].update(gt_m, gt_m) + depth_accs[RS_NAME].update(rs_m, gt_m) + for mname in models: + depth_accs[mname].update(frame_depths[mname], gt_m) + + if idx < args.n_viz: + viz_frames.append({k: v.copy() for k, v in frame_depths.items()}) + + if (idx + 1) % 200 == 0 or (idx + 1) == n: + logging.info(f' {idx + 1}/{n} frames processed') + + for m in active_methods: + valid_acc[m] /= max(n, 1) + + mean_timing = { + m: float(np.mean(ts)) if ts else 0.0 + for m, ts in timing_ms_raw.items() + } + mean_timing[GT_NAME] = 0.0 + mean_timing[RS_NAME] = 1000.0 / RS_FPS + + method_configs = { + 'original': {'model_path': args.original}, + RS_NAME: {'source': f'RealSense hardware depth (~{RS_FPS:.0f} FPS)'}, + GT_NAME: {'source': 'Projected Zivid depth map used as Inbolt ground truth'}, + } + if 'finetuned' in models and finetuned_path is not None: + method_configs['finetuned'] = {'model_path': finetuned_path} + + results = BenchmarkResults( + method_names=active_methods, + method_labels={m: METHODS[m]['label'] for m in active_methods}, + method_colors={m: METHODS[m]['color'] for m in active_methods}, + ground_truth_name=GT_NAME, + n_frames=n, + width=W, + height=H, + all_metrics=all_metrics, + viz_frames=viz_frames, + coverage_maps=valid_acc, + dist_bin_mae=dist_bin_mae, + close_range_valid=close_range_valid, + source=f'INBOLT dataset ({args.data_dir})', + method_configs=method_configs, + ) + + stats = aggregate(results, mean_timing) + if RS_NAME in stats: + stats[RS_NAME].fps_mean = RS_FPS + + reporter = ReportGeneratorInbolt(results, stats, out_dir) + reporter.generate() + + plot_colors = { + 'zivid_gt': METHODS[GT_NAME]['color'], + RS_NAME: METHODS[RS_NAME]['color'], + **{m: METHODS[m]['color'] for m 
in models if m in METHODS}, + } + plot_labels = { + 'zivid_gt': 'Zivid GT (spatial spread)', + RS_NAME: METHODS[RS_NAME]['label'], + 'original': METHODS['original']['label'], + 'finetuned': METHODS['finetuned']['label'], + } + labeled_accs = { + plot_labels.get(k, k): v + for k, v in depth_accs.items() + if depth_accs[k].count.sum() > 0 + } + labeled_colors = { + plot_labels.get(k, k): plot_colors.get(k) + for k in depth_accs + if depth_accs[k].count.sum() > 0 + } + + plot_depth_vs_distance( + accumulators=labeled_accs, + colors=labeled_colors, + out_path=out_dir / 'depth_vs_distance.png', + ) + logging.info(f'All outputs written to {out_dir}') + + +if __name__ == '__main__': + main() diff --git a/scripts/benchmark_inbolt_fs_with_confidence.py b/scripts/benchmark_inbolt_fs_with_confidence.py new file mode 100644 index 0000000..eeae811 --- /dev/null +++ b/scripts/benchmark_inbolt_fs_with_confidence.py @@ -0,0 +1,477 @@ +"""Benchmark FastFoundationStereo models on the Inbolt dataset, including the +confidence-head variant produced by finetune_inbolt_with_confidence.py. + +Methods compared: + original – pretrained FFS (no fine-tuning) + finetuned – FFS fine-tuned on INBOLT (standard loss) + confidence – FFS fine-tuned on INBOLT with confidence head + depth_rs – RealSense hardware depth (baseline) + zivid_gt – Zivid projected ground truth + +Confidence maps (one per frame) are stored alongside depth maps and rendered in +an extra report figure. + +Usage: + cd /home/adiroha/repos/Fast-FoundationStereo + python scripts/benchmark_inbolt_fs_with_confidence.py [--out_dir reports/...] 
+""" + +import argparse +import logging +import os +import sys +import time +from pathlib import Path +from typing import Dict, Optional + +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import numpy as np +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import torch + +import Utils as U +from core.utils.utils import InputPadder + +# Import confidence model classes into __main__ scope so torch.load can +# find them when unpickling a model saved during training (where they were +# also in __main__). +from scripts.finetune_inbolt_with_confidence import FastFoundationStereoWithConfidence, ConfidenceHead # noqa: F401 + +from benchmark_inbolt import DepthBinAccumulator, plot_depth_vs_distance +from benchmark_inbolt_fs import ReportGeneratorInbolt, resolve_finetuned_model_path +from scripts.data_manager_inbolt import DataSource, CAMERA_MATRIX_RS, DIST_COEFFS_RS +from metrics import ( + BenchmarkResults, + FrameMetrics, + compute_bin_mae, + compute_metrics, + aggregate, + CLOSE_RANGE_THRESHOLD_M, +) + + +# ── constants ──────────────────────────────────────────────────────────────── + +DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260415T084601Z-3-001/Data Collection' +ORIGINAL_PATH = f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth' +FINETUNED_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt-20260415_epoch_111.pth' +CONFIDENCE_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt_with_confidence-20260507_epoch_026.pth' +DEFAULT_OUT = f'{code_dir}/../reports/inbolt_ffs_confidence_benchmark' +N_VIZ = 5 +ITERS = 8 +BF = 50.102706998586 * 385.509887695312 # focal_px * baseline_mm +RS_FPS = 30.0 + +CONF_VIZ_KEY = '_conf_map' # suffix appended to 'confidence' in viz_frames + +METHODS: Dict[str, Dict] = { + 'original': {'label': 'FFS Original', 'color': '#2980b9'}, + 'finetuned': {'label': 
'FFS Fine-tuned (INBOLT)', 'color': '#e74c3c'}, + 'confidence': {'label': 'FFS + Confidence Head', 'color': '#8e44ad'}, + 'depth_rs': {'label': 'RealSense Hardware Depth', 'color': '#f39c12'}, + 'zivid_gt': {'label': 'Zivid GT (projected to RS)', 'color': '#27ae60'}, +} +GT_NAME = 'zivid_gt' +RS_NAME = 'depth_rs' + + +# ── inference helpers ───────────────────────────────────────────────────────── + +def _preprocess_ir(left: np.ndarray, right: np.ndarray): + """Convert IR uint8 pair to float RGB tensors on CUDA.""" + def _to_t(img): + img = np.clip(img.astype(np.float32), 0, 255) + img = np.stack([img, img, img], axis=-1) + return torch.as_tensor(img).float()[None].permute(0, 3, 1, 2).cuda() + return _to_t(left), _to_t(right) + + +@torch.no_grad() +def infer_depth_m(model, left: np.ndarray, right: np.ndarray) -> np.ndarray: + """Standard inference for models whose forward returns a disparity tensor.""" + left_t, right_t = _preprocess_ir(left, right) + padder = InputPadder(left_t.shape, divis_by=32, force_square=False) + left_t, right_t = padder.pad(left_t, right_t) + with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE): + disp = model.forward(left_t, right_t, iters=ITERS, test_mode=True) + disp = padder.unpad(disp.float()) + disp_np = disp.cpu().numpy().reshape(left.shape[:2]).clip(0, None) + depth_m = np.zeros_like(disp_np) + valid = disp_np > 0 + depth_m[valid] = (BF / disp_np[valid]) / 1000.0 + return depth_m + + +@torch.no_grad() +def infer_depth_and_conf_m( + model: FastFoundationStereoWithConfidence, + left: np.ndarray, + right: np.ndarray, +) -> tuple[np.ndarray, np.ndarray]: + """Inference for the confidence model; returns (depth_m, conf [0–1]).""" + left_t, right_t = _preprocess_ir(left, right) + padder = InputPadder(left_t.shape, divis_by=32, force_square=False) + left_t, right_t = padder.pad(left_t, right_t) + with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE): + disp, conf = model.forward(left_t, right_t, iters=ITERS, 
test_mode=True) + disp = padder.unpad(disp.float()) + conf = padder.unpad(conf.float()) + disp_np = disp.cpu().numpy().reshape(left.shape[:2]).clip(0, None) + conf_np = conf.cpu().numpy().reshape(left.shape[:2]).clip(0.0, 1.0) + depth_m = np.zeros_like(disp_np) + valid = disp_np > 0 + depth_m[valid] = (BF / disp_np[valid]) / 1000.0 + return depth_m, conf_np + + +# ── model loading ───────────────────────────────────────────────────────────── + +def load_model(path: str): + logging.info(f"Loading model from {path}") + model = torch.load(path, map_location='cpu', weights_only=False) + model.cuda().eval() + return model + + +def resolve_confidence_model_path(preferred: str) -> Optional[str]: + """Return an existing confidence-model checkpoint path, or None.""" + p = Path(preferred) + if p.exists(): + return str(p) + weights_dir = Path(code_dir) / '..' / 'weights' + candidates = sorted(weights_dir.glob('**/model_finetuned_inbolt_with_confidence*.pth')) + if candidates: + chosen = candidates[-1] + logging.warning(f"Preferred confidence model not found at {preferred}. 
Using {chosen}") + return str(chosen) + return None + + +# ── custom report generator ─────────────────────────────────────────────────── + +class ReportGeneratorWithConfidence(ReportGeneratorInbolt): + """Extends the INBOLT report with a confidence map visualisation panel.""" + + def generate(self) -> None: + fig_paths = [ + self._fig_depth_comparison(), + self._fig_error_maps(), + self._fig_confidence_maps(), + self._fig_coverage_heatmaps(), + self._fig_distance_error_curve(), + self._fig_error_histograms(), + self._fig_summary_table(), + self._fig_close_range_analysis(), + self._fig_timing_bars(), + ] + self._write_json() + self._write_html([p for p in fig_paths if p]) + print(f"\nReport written to: {self._out / 'index.html'}") + + def _fig_confidence_maps(self) -> str: + """Render per-frame confidence maps for the confidence model.""" + conf_key = f'confidence{CONF_VIZ_KEY}' + frames_with_conf = [vf for vf in self._r.viz_frames if conf_key in vf] + if not frames_with_conf: + return self._empty_fig("confidence_maps.png", "No confidence maps recorded") + + sel = self._get_selected_viz_indices(n_pick=4) + sel = [i for i in sel if conf_key in self._r.viz_frames[i]] + if not sel: + return self._empty_fig("confidence_maps.png", "No confidence maps in selected frames") + + ncols = 3 # left image (depth), confidence, zivid_gt (for reference) + nrows = len(sel) + fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows)) + axes = np.atleast_2d(axes) + + depth_cmap = self._depth_cmap() + conf_cmap = plt.get_cmap('RdYlGn') # red = low conf, green = high conf + + col_titles = [ + METHODS['confidence']['label'], + 'Confidence (0 = invalid, 1 = valid)', + METHODS[GT_NAME]['label'], + ] + + for row_idx, frame_idx in enumerate(sel): + vf = self._r.viz_frames[frame_idx] + depth_conf = vf.get('confidence') + conf_map = vf.get(conf_key) + depth_gt = vf.get(GT_NAME) + + # Column 0: confidence model depth + ax = axes[row_idx, 0] + if depth_conf is not None: + im = 
ax.imshow(depth_conf, cmap=depth_cmap, vmin=0.1, vmax=2.0) + plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label='m') + else: + ax.axis('off') + if row_idx == 0: + ax.set_title(col_titles[0], fontsize=8) + ax.set_ylabel(f'Frame {frame_idx + 1}', fontsize=8) + ax.set_xticks([]) + ax.set_yticks([]) + + # Column 1: confidence map + ax = axes[row_idx, 1] + if conf_map is not None: + im = ax.imshow(conf_map, cmap=conf_cmap, vmin=0.0, vmax=1.0) + plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04) + mean_conf = float(conf_map.mean()) + ax.set_title(f'{col_titles[1]}\nmean={mean_conf:.3f}', fontsize=8) + else: + ax.axis('off') + if row_idx == 0: + ax.set_title(col_titles[1], fontsize=8) + ax.set_xticks([]) + ax.set_yticks([]) + + # Column 2: Zivid GT + ax = axes[row_idx, 2] + if depth_gt is not None: + im = ax.imshow(depth_gt, cmap=depth_cmap, vmin=0.1, vmax=2.0) + plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label='m') + else: + ax.axis('off') + if row_idx == 0: + ax.set_title(col_titles[2], fontsize=8) + ax.set_xticks([]) + ax.set_yticks([]) + + fig.suptitle( + 'Confidence Head Output — Depth, Confidence Map, and Zivid GT', + fontsize=11, y=1.01, + ) + fig.tight_layout() + return self._save(fig, 'confidence_maps.png') + + +# ── main ────────────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report') + parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root') + parser.add_argument('--original', default=ORIGINAL_PATH, help='Path to original model weights') + parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to standard fine-tuned model weights') + parser.add_argument('--confidence', default=CONFIDENCE_PATH, help='Path to confidence-head model weights') + parser.add_argument('--n_viz', type=int, 
default=N_VIZ, help='Frames saved for visual comparison') + args = parser.parse_args() + + U.set_logging_format() + out_dir = Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + # ── load models ─────────────────────────────────────────────────────────── + regular_models: Dict[str, object] = {} + conf_models: Dict[str, object] = {} + + regular_models['original'] = load_model(args.original) + + ft_path = resolve_finetuned_model_path(args.finetuned) + if ft_path: + regular_models['finetuned'] = load_model(ft_path) + else: + logging.warning(f'Standard fine-tuned model not found at {args.finetuned} — skipping') + ft_path = args.finetuned + + conf_path = resolve_confidence_model_path(args.confidence) + if conf_path: + conf_models['confidence'] = load_model(conf_path) + else: + logging.warning(f'Confidence model not found at {args.confidence} — skipping') + conf_path = args.confidence + + all_nn_models = {**regular_models, **conf_models} + active_methods = [GT_NAME, RS_NAME] + list(all_nn_models.keys()) + + # ── dataset ─────────────────────────────────────────────────────────────── + source = DataSource() + n = source.init_directory(input_rectified=args.data_dir) + logging.info(f'Found {n} samples in {args.data_dir}') + if n == 0: + logging.error('No samples found — check DATA_DIR path') + return + + # ── accumulators ────────────────────────────────────────────────────────── + all_metrics: list[FrameMetrics] = [] + viz_frames: list[dict] = [] + valid_acc: Dict[str, np.ndarray] = {} + dist_bin_mae: Dict[str, list] = {m: [] for m in active_methods} + close_range_valid: Dict[str, list] = {m: [] for m in active_methods} + timing_ms_raw: Dict[str, list] = {m: [] for m in all_nn_models} + H = W = None + + depth_acc_keys = [GT_NAME, RS_NAME] + list(all_nn_models.keys()) + depth_accs = {k: DepthBinAccumulator() for k in depth_acc_keys} + + for idx in range(n): + data = source.get_item_projected(idx) + left = data['left'] + right = data['right'] + gt_mm = 
data['depth_zivid'].astype(np.float32) + rs_mm = data['depth_rs'].astype(np.float32) + + if H is None: + H, W = gt_mm.shape[:2] + for m in active_methods: + valid_acc[m] = np.zeros((H, W), np.float32) + + gt_m = gt_mm / 1000.0 + rs_m = rs_mm / 1000.0 + + frame_depths: Dict[str, np.ndarray] = {GT_NAME: gt_m, RS_NAME: rs_m} + frame_confs: Dict[str, np.ndarray] = {} + + for mname, model in regular_models.items(): + t0 = time.monotonic() + frame_depths[mname] = infer_depth_m(model, left, right) + timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + + for mname, model in conf_models.items(): + t0 = time.monotonic() + depth_m, conf_np = infer_depth_and_conf_m(model, left, right) + timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0) + frame_depths[mname] = depth_m + frame_confs[mname] = conf_np + + gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M) + n_close = int(gt_close_mask.sum()) + + for mname in active_methods: + pred = frame_depths[mname] + valid_acc[mname] += (pred > 0).astype(np.float32) + + if mname == GT_NAME: + fm = FrameMetrics( + GT_NAME, 0.0, 0.0, 0.0, 100.0, + float((pred > 0).mean()) * 100.0, 0.0, + mae_pen=0.0, mre_pen=0.0, + ) + elif mname == RS_NAME: + fm = compute_metrics(pred, gt_m, elapsed_ms=0.0, method_name=RS_NAME) + else: + fm = compute_metrics(pred, gt_m, timing_ms_raw[mname][-1], mname) + + all_metrics.append(fm) + dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m)) + + close_cov = ( + float((pred[gt_close_mask] > 0).mean()) * 100.0 + if n_close > 0 else 0.0 + ) + close_range_valid[mname].append(close_cov) + + depth_accs[GT_NAME].update(gt_m, gt_m) + depth_accs[RS_NAME].update(rs_m, gt_m) + for mname in all_nn_models: + depth_accs[mname].update(frame_depths[mname], gt_m) + + if idx < args.n_viz: + vf = {k: v.copy() for k, v in frame_depths.items()} + # Store confidence maps under a separate key so they don't pollute depth metrics + for mname, conf_np in frame_confs.items(): + vf[f'{mname}{CONF_VIZ_KEY}'] = 
conf_np.copy() + viz_frames.append(vf) + + if (idx + 1) % 200 == 0 or (idx + 1) == n: + logging.info(f' {idx + 1}/{n} frames processed') + + for m in active_methods: + valid_acc[m] /= max(n, 1) + + mean_timing = {m: float(np.mean(ts)) if ts else 0.0 for m, ts in timing_ms_raw.items()} + mean_timing[GT_NAME] = 0.0 + mean_timing[RS_NAME] = 1000.0 / RS_FPS + + method_configs = { + 'original': {'model_path': args.original}, + RS_NAME: {'source': f'RealSense hardware depth (~{RS_FPS:.0f} FPS)'}, + GT_NAME: {'source': 'Projected Zivid depth map used as INBOLT ground truth'}, + } + if 'finetuned' in regular_models and ft_path: + method_configs['finetuned'] = {'model_path': ft_path} + if 'confidence' in conf_models and conf_path: + method_configs['confidence'] = {'model_path': conf_path} + + results = BenchmarkResults( + method_names = active_methods, + method_labels = {m: METHODS[m]['label'] for m in active_methods}, + method_colors = {m: METHODS[m]['color'] for m in active_methods}, + ground_truth_name = GT_NAME, + n_frames = n, + width = W, + height = H, + all_metrics = all_metrics, + viz_frames = viz_frames, + coverage_maps = valid_acc, + dist_bin_mae = dist_bin_mae, + close_range_valid = close_range_valid, + source = f'INBOLT dataset ({args.data_dir})', + method_configs = method_configs, + ) + + stats = aggregate(results, mean_timing) + if RS_NAME in stats: + stats[RS_NAME].fps_mean = RS_FPS + + reporter = ReportGeneratorWithConfidence(results, stats, out_dir) + reporter.generate() + + # ── depth vs distance plot ───────────────────────────────────────────────── + plot_colors = { + GT_NAME: METHODS[GT_NAME]['color'], + RS_NAME: METHODS[RS_NAME]['color'], + **{m: METHODS[m]['color'] for m in all_nn_models if m in METHODS}, + } + plot_labels = { + GT_NAME: 'Zivid GT (spatial spread)', + RS_NAME: METHODS[RS_NAME]['label'], + 'original': METHODS['original']['label'], + 'finetuned': METHODS['finetuned']['label'], + 'confidence': METHODS['confidence']['label'], + } + 
labeled_accs = { + plot_labels.get(k, k): v + for k, v in depth_accs.items() + if depth_accs[k].count.sum() > 0 + } + labeled_colors = { + plot_labels.get(k, k): plot_colors.get(k) + for k in depth_accs + if depth_accs[k].count.sum() > 0 + } + plot_depth_vs_distance( + accumulators=labeled_accs, + colors=labeled_colors, + out_path=out_dir / 'depth_vs_distance.png', + ) + + # ── console summary ─────────────────────────────────────────────────────── + logging.info('\n── Depth summary (test set) ─────────────────────────────────────────') + for mname in active_methods: + if mname == GT_NAME: + continue + s = stats.get(mname) + if s is None: + continue + logging.info( + f" {s.label:<35} MAE={s.mae_mean*1000:.1f} mm " + f"MRE*={s.mre_pen_mean*100:.2f}% coverage={s.coverage_mean:.1f}% " + f"FPS={s.fps_mean:.1f}" + ) + + logging.info(f'\nAll outputs written to {out_dir}') + + +if __name__ == '__main__': + main() diff --git a/scripts/benchmark_inbolt_trt.py b/scripts/benchmark_inbolt_trt.py new file mode 100644 index 0000000..f30f6ab --- /dev/null +++ b/scripts/benchmark_inbolt_trt.py @@ -0,0 +1,553 @@ +"""Benchmark original + fine-tuned FFS (PyTorch) and fine-tuned FFS-TRT (TensorRT FP16) on Inbolt. + +Extends ``benchmark_inbolt_fs.py`` by adding a TensorRT FP16 model alongside the +PyTorch models and RealSense hardware depth, so you can compare accuracy and speed +between the full-precision PyTorch path and the compiled TRT engine. + +Pass --rebuild_trt to export ONNX and compile TRT engines from the fine-tuned model +at the start of the run (requires ~10 min on first compile). The engines are written +to --trt_dir and reused on subsequent runs without --rebuild_trt. 
+ +Usage: + cd /home/adiroha/repos/Fast-FoundationStereo + # First run: build engines then benchmark + python scripts/benchmark_inbolt_trt.py --rebuild_trt [--trt_dir output/onnx_trt_ft] + # Subsequent runs: reuse existing engines + python scripts/benchmark_inbolt_trt.py [--trt_dir output/onnx_trt_ft] [--out_dir reports/inbolt_trt_benchmark] +""" + +import argparse +import logging +import os +import sys +import time +import warnings +import cv2 +from pathlib import Path +from typing import Dict, Optional + +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import numpy as np +import torch +import yaml +from omegaconf import OmegaConf + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +import Utils as U +from benchmark_inbolt import DepthBinAccumulator, plot_depth_vs_distance, BF, ITERS +from benchmark_inbolt import infer_depth_m, load_model +from scripts.data_manager_inbolt import DataSource, CAMERA_MATRIX_RS, DIST_COEFFS_RS +from metrics import ( + BenchmarkResults, + FrameMetrics, + compute_bin_mae, + compute_metrics, + aggregate, + CLOSE_RANGE_THRESHOLD_M, +) +from report import ReportGenerator +from core.foundation_stereo import TrtRunner, TrtFeatureRunner, TrtPostRunner, build_gwc_volume_triton +from make_trt_engine import build_engine + + +# ── constants ──────────────────────────────────────────────────────────────── + +DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260415T084601Z-3-001/Data Collection' +ORIGINAL_PATH = f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth' +FINETUNED_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt-20260415_epoch_111.pth' +DEFAULT_TRT_DIR = f'{code_dir}/../output/onnx_trt' +DEFAULT_OUT = f'{code_dir}/../reports/inbolt_trt_benchmark' +N_VIZ = 5 + +RS_FPS = 30.0 + +METHODS: Dict[str, Dict[str, str]] = { + 'original': {'label': 'FFS Original', 'color': 
def _export_onnx_quietly(module, example_inputs, onnx_path, input_names, output_names):
    """Export *module* to *onnx_path* with exporter warnings suppressed.

    Uses the export options shared by both runner exports in
    ``rebuild_trt_engines``: opset 17, constant folding and ``dynamo=True``.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        torch.onnx.export(
            module,
            example_inputs,
            onnx_path,
            opset_version=17,
            input_names=input_names,
            output_names=output_names,
            do_constant_folding=True,
            dynamo=True,
        )


def rebuild_trt_engines(model_path: str, trt_dir: str, height: int, width: int) -> None:
    """Export the checkpoint at *model_path* to ONNX and compile two TRT engines.

    Files written to *trt_dir*:
      feature_runner.onnx / feature_runner.engine
      post_runner.onnx    / post_runner.engine
      onnx.yaml           (dump of ``model.args``)

    Args:
        model_path: Checkpoint passed to ``torch.load``.
        trt_dir: Output directory; created if it does not exist.
        height: Height of the dummy export input; must be a multiple of 32.
        width: Width of the dummy export input; must be a multiple of 32.

    Raises:
        AssertionError: If *height* or *width* is not divisible by 32. The
            check runs before anything is created or loaded.
    """
    assert not (height % 32 or width % 32), \
        f"height and width must be divisible by 32, got {height}x{width}"

    feature_output_names = ['features_left_04', 'features_left_08', 'features_left_16',
                            'features_left_32', 'features_right_04', 'stem_2x']
    # The post runner consumes every feature output plus the cost volume.
    post_input_names = feature_output_names + ['gwc_volume']

    os.makedirs(trt_dir, exist_ok=True)
    logging.info(f'[rebuild] Loading model from {model_path}')
    # NOTE(review): weights_only=False unpickles arbitrary objects; only point
    # this at checkpoints you trust.
    model = torch.load(model_path, map_location='cpu', weights_only=False)
    model.cuda().eval()

    feature_runner = TrtFeatureRunner(model).cuda().eval()
    post_runner = TrtPostRunner(model).cuda().eval()

    input_shape = (1, 3, height, width)
    dummy_left = torch.randn(*input_shape, device='cuda').float() * 255
    dummy_right = torch.randn(*input_shape, device='cuda').float() * 255

    # ── feature runner → ONNX ────────────────────────────────────────────────
    feature_onnx = os.path.join(trt_dir, 'feature_runner.onnx')
    logging.info(f'[rebuild] Exporting feature_runner → {feature_onnx}')
    _export_onnx_quietly(
        feature_runner,
        (dummy_left, dummy_right),
        feature_onnx,
        ['left', 'right'],
        feature_output_names,
    )

    # ── post runner → ONNX ───────────────────────────────────────────────────
    # Run the feature stage once to obtain example inputs for the post stage.
    with torch.no_grad():
        feats = feature_runner(dummy_left, dummy_right)
        f04, f08, f16, f32, fr04, stem_2x = feats
        cv_group = getattr(model, 'cv_group', 8)
        gwc_volume = build_gwc_volume_triton(
            f04.half(), fr04.half(), model.args.max_disp // 4, cv_group
        )

    post_onnx = os.path.join(trt_dir, 'post_runner.onnx')
    logging.info(f'[rebuild] Exporting post_runner → {post_onnx}')
    post_example_inputs = tuple(
        t.float() for t in (f04, f08, f16, f32, fr04, stem_2x, gwc_volume)
    )
    _export_onnx_quietly(
        post_runner,
        post_example_inputs,
        post_onnx,
        post_input_names,
        ['disp'],
    )

    # ── save model config ────────────────────────────────────────────────────
    yaml_path = os.path.join(trt_dir, 'onnx.yaml')
    with open(yaml_path, 'w') as f:
        yaml.safe_dump(OmegaConf.to_container(model.args), f)
    logging.info(f'[rebuild] Saved onnx.yaml → {yaml_path}')

    # ── compile TRT engines ──────────────────────────────────────────────────
    del model, feature_runner, post_runner  # free GPU memory before TRT build
    torch.cuda.empty_cache()

    for name, onnx_path in (('feature_runner', feature_onnx), ('post_runner', post_onnx)):
        engine_path = os.path.join(trt_dir, f'{name}.engine')
        logging.info(f'[rebuild] Compiling {name}.engine (this may take several minutes) …')
        build_engine(onnx_path, engine_path, fp16=True, workspace_gb=4)

    logging.info(f'[rebuild] TRT engines ready in {trt_dir}')
class ReportGeneratorInbolt(ReportGenerator):
    """Report generator that draws up to four viz frames per figure.

    Both figures defined here use the same frame subset (chosen once with a
    fixed seed) and lay it out as one row per frame, one column per method.
    """

    def __init__(self, results, stats, output_dir) -> None:
        super().__init__(results, stats, output_dir)
        # Filled lazily by _get_selected_viz_indices() and reused afterwards.
        self._selected_viz_indices = []

    def _get_selected_viz_indices(self, n_pick: int = 4):
        """Return the sorted viz-frame indices shared by all figures.

        The first non-empty selection is cached, so later calls return it even
        if *n_pick* differs. Sampling uses ``default_rng(42)`` without
        replacement, so the choice is reproducible between runs.
        """
        if self._selected_viz_indices:
            return self._selected_viz_indices
        n_total = len(self._r.viz_frames)
        if n_total == 0:
            self._selected_viz_indices = []
            return self._selected_viz_indices
        n = min(n_pick, n_total)
        rng = np.random.default_rng(42)
        self._selected_viz_indices = sorted(rng.choice(n_total, size=n, replace=False).tolist())
        return self._selected_viz_indices

    def _fig_depth_comparison(self) -> str:
        """Render the depth-map grid and return whatever ``self._save`` returns.

        Falls back to ``self._empty_fig`` when there are no viz frames or when
        none of the configured methods is present in the first selected frame.
        """
        if not self._r.viz_frames:
            return self._empty_fig('depth_comparison.png', 'No viz frames')
        sel = self._get_selected_viz_indices(n_pick=4)
        if not sel:
            return self._empty_fig('depth_comparison.png', 'No viz frames')
        vf0 = self._r.viz_frames[sel[0]]
        method_names = [n for n in self._r.method_names if n in vf0]
        if not method_names:
            # plt.subplots() rejects zero columns; mirror the guard in _fig_error_maps.
            return self._empty_fig('depth_comparison.png', 'No methods in viz frame')
        nrows, ncols = len(sel), len(method_names)
        # squeeze=False always yields a (nrows, ncols) array. np.atleast_2d on
        # the squeezed result gives (1, nrows) for a single column, which makes
        # axes[r, c] fail for r >= 1.
        fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows),
                                 squeeze=False)
        cmap = self._depth_cmap()
        for r, frame_idx in enumerate(sel):
            vf = self._r.viz_frames[frame_idx]
            for c, name in enumerate(method_names):
                ax = axes[r, c]
                if name not in vf:
                    ax.axis('off')
                    continue
                im = ax.imshow(vf[name], cmap=cmap, vmin=0.1, vmax=2.0)
                plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label='m')
                title = self._r.method_labels.get(name, name)
                if c == 0:
                    # Only the first column carries the (1-based) frame number.
                    title = f'Frame {frame_idx + 1} • {title}'
                ax.set_title(title, fontsize=9, wrap=True)
                ax.axis('off')
        # Report the real row count; fewer than four frames may be available.
        fig.suptitle(f'Depth Map Comparison ({nrows} random frames) — values in metres',
                     fontsize=11, y=1.01)
        fig.tight_layout()
        return self._save(fig, 'depth_comparison.png')

    def _fig_error_maps(self) -> str:
        """Render per-pixel absolute error against the ground-truth entry.

        Error is computed only where both ground truth and prediction are
        positive; all other pixels are set to 0, which falls below ``vmin`` and
        is drawn in the colormap's "under" colour.
        """
        if not self._r.viz_frames or not self._non_gt:
            return self._empty_fig('error_maps.png', 'No comparison methods')
        sel = self._get_selected_viz_indices(n_pick=4)
        if not sel:
            return self._empty_fig('error_maps.png', 'No viz frames')
        vf0 = self._r.viz_frames[sel[0]]
        names = ([self._gt] if self._gt in vf0 else []) + [n for n in self._non_gt if n in vf0]
        if not names:
            return self._empty_fig('error_maps.png', 'Ground truth not in viz frame')
        nrows, ncols = len(sel), len(names)
        cmap = plt.get_cmap('hot').copy()
        cmap.set_under('#222222')
        # squeeze=False keeps axes 2-D even for a single row or column.
        fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows),
                                 squeeze=False)
        for r, frame_idx in enumerate(sel):
            vf = self._r.viz_frames[frame_idx]
            gt = vf.get(self._gt)
            if gt is None:
                # Without ground truth there is nothing to compare: blank the row.
                for c in range(ncols):
                    axes[r, c].axis('off')
                continue
            for c, name in enumerate(names):
                ax = axes[r, c]
                if name not in vf:
                    ax.axis('off')
                    continue
                pred = vf[name]
                valid = (gt > 0) & (pred > 0)
                err = np.where(valid, np.abs(pred - gt), 0.0).astype(np.float32)
                im = ax.imshow(err, cmap=cmap, vmin=0.001, vmax=0.1)
                plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label='|error| (m)')
                mean_err = float(np.abs(pred[valid] - gt[valid]).mean()) if valid.any() else 0.0
                label = self._r.method_labels.get(name, name)
                title = f'Frame {frame_idx + 1} • {label}\nMAE={mean_err:.4f} m' if c == 0 \
                    else f'{label}\nMAE={mean_err:.4f} m'
                ax.set_title(title, fontsize=9)
                ax.axis('off')
        gt_label = self._r.method_labels.get(self._gt, self._gt)
        fig.suptitle(f'Absolute Error vs {gt_label} ({nrows} random frames, m)',
                     fontsize=11, y=1.01)
        fig.tight_layout()
        return self._save(fig, 'error_maps.png')
def _write_depth_png_mm(path, depth_m) -> None:
    """Write *depth_m* (metres) to *path* as a 16-bit PNG in millimetres.

    Values are clipped to the uint16 range before the cast, so depths beyond
    65.535 m saturate instead of wrapping around.
    """
    depth_mm = np.clip(depth_m * 1000.0, 0, np.iinfo(np.uint16).max)
    cv2.imwrite(str(path), depth_mm.astype(np.uint16))


def main():
    """Benchmark the PyTorch and TensorRT stereo models and write the report.

    Steps: parse CLI arguments, optionally rebuild the TRT engines, load the
    models, run every frame through each method while collecting metrics and
    timings, then generate the report and the depth-vs-distance plot in
    ``--out_dir``. A depth PNG per frame and model is also written there.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory')
    parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root')
    parser.add_argument('--original', default=ORIGINAL_PATH, help='Original model weights')
    parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Fine-tuned model weights')
    parser.add_argument('--trt_dir', default=DEFAULT_TRT_DIR,
                        help='Directory with feature_runner.engine, post_runner.engine, onnx.yaml')
    parser.add_argument('--trt_height', type=int, default=448, help='TRT engine input height')
    parser.add_argument('--trt_width', type=int, default=640, help='TRT engine input width')
    parser.add_argument('--rebuild_trt', action='store_true',
                        help='Re-export ONNX and recompile TRT engines from --finetuned before benchmarking')
    parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison')
    args = parser.parse_args()

    U.set_logging_format()
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Resolve the fine-tuned checkpoint once; both the optional engine rebuild
    # and the PyTorch model loading use the same result.
    finetuned_path = resolve_finetuned_model_path(args.finetuned)

    # ── optionally rebuild TRT engines ────────────────────────────────────────
    if args.rebuild_trt:
        build_src = finetuned_path or args.original
        if finetuned_path is None:
            logging.warning(
                '[rebuild_trt] No fine-tuned checkpoint found — '
                'TRT engines will be built from the original model'
            )
        logging.info(f'[rebuild_trt] Building TRT engines from {build_src} → {args.trt_dir}')
        rebuild_trt_engines(build_src, args.trt_dir, args.trt_height, args.trt_width)

    # ── load PyTorch models ───────────────────────────────────────────────────
    pt_models = {}
    if finetuned_path is not None:
        pt_models['finetuned'] = load_model(finetuned_path)
    else:
        logging.warning(f'Fine-tuned model not found (preferred: {args.finetuned}) — skipping')
    pt_models['original'] = load_model(args.original)

    # ── load TRT model ────────────────────────────────────────────────────────
    trt_model = load_trt_model(args.trt_dir)
    trt_h, trt_w = args.trt_height, args.trt_width

    # ── assemble active methods ───────────────────────────────────────────────
    active_methods = [GT_NAME, RS_NAME] + list(pt_models.keys())
    if trt_model is not None:
        active_methods.append('finetuned_trt')

    logging.info(f'Active methods: {active_methods}')

    # ── dataset ───────────────────────────────────────────────────────────────
    source = DataSource()
    n = source.init_directory(input_rectified=args.data_dir)
    logging.info(f'Found {n} samples in {args.data_dir}')
    if n == 0:
        logging.error('No samples found — check DATA_DIR path')
        return

    # ── accumulators ──────────────────────────────────────────────────────────
    all_metrics = []
    viz_frames = []
    valid_acc = {}
    dist_bin_mae = {m: [] for m in active_methods}
    close_range_valid = {m: [] for m in active_methods}
    timing_ms_raw = {m: [] for m in active_methods if m not in (GT_NAME, RS_NAME)}
    H = W = None

    depth_acc_keys = [GT_NAME, RS_NAME] + [m for m in active_methods if m not in (GT_NAME, RS_NAME)]
    depth_accs = {k: DepthBinAccumulator() for k in depth_acc_keys}

    for idx in range(n):
        data = source.get_item_projected(idx)
        left = data['left']
        right = data['right']
        gt_mm = data['depth_zivid'].astype(np.float32)
        rs_mm = data['depth_rs'].astype(np.float32)

        if H is None:
            # The coverage accumulators are sized from the first frame.
            H, W = gt_mm.shape[:2]
            for m in active_methods:
                valid_acc[m] = np.zeros((H, W), np.float32)

        gt_m = gt_mm / 1000.0
        rs_m = rs_mm / 1000.0

        frame_depths = {GT_NAME: gt_m, RS_NAME: rs_m}

        # PyTorch models
        for mname, model in pt_models.items():
            t0 = time.monotonic()
            frame_depths[mname] = infer_depth_m(model, left, right)
            # Stop the clock before writing to disk so PNG encoding and I/O do
            # not inflate the reported inference time.
            timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0)
            _write_depth_png_mm(out_dir / f'{mname}_{idx:03d}.png', frame_depths[mname])

        # TRT model
        if trt_model is not None:
            t0 = time.monotonic()
            frame_depths['finetuned_trt'] = infer_depth_m_trt(trt_model, trt_h, trt_w, left, right)
            timing_ms_raw['finetuned_trt'].append((time.monotonic() - t0) * 1000.0)
            _write_depth_png_mm(out_dir / f'finetuned_trt_{idx:03d}.png',
                                frame_depths['finetuned_trt'])

        gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M)
        n_close = int(gt_close_mask.sum())

        for mname in active_methods:
            pred = frame_depths[mname]
            valid_acc[mname] += (pred > 0).astype(np.float32)

            if mname == GT_NAME:
                # Ground truth compared with itself: errors are zero by construction.
                fm = FrameMetrics(GT_NAME, 0.0, 0.0, 0.0, 100.0,
                                  float((pred > 0).mean()) * 100.0, 0.0,
                                  mae_pen=0.0, mre_pen=0.0)
            elif mname == RS_NAME:
                fm = compute_metrics(pred, gt_m, elapsed_ms=0.0, method_name=RS_NAME)
            else:
                fm = compute_metrics(pred, gt_m, timing_ms_raw[mname][-1], mname)

            all_metrics.append(fm)
            dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m))
            close_cov = (float((pred[gt_close_mask] > 0).mean()) * 100.0
                         if n_close > 0 else 0.0)
            close_range_valid[mname].append(close_cov)

        depth_accs[GT_NAME].update(gt_m, gt_m)
        depth_accs[RS_NAME].update(rs_m, gt_m)
        for mname in active_methods:
            if mname not in (GT_NAME, RS_NAME):
                depth_accs[mname].update(frame_depths[mname], gt_m)

        if idx < args.n_viz:
            viz_frames.append({k: v.copy() for k, v in frame_depths.items()})

        if (idx + 1) % 200 == 0 or (idx + 1) == n:
            logging.info(f'  {idx + 1}/{n} frames processed')

    # Turn per-pixel valid counts into a fraction of frames.
    for m in active_methods:
        valid_acc[m] /= max(n, 1)

    mean_timing = {m: float(np.mean(ts)) if ts else 0.0
                   for m, ts in timing_ms_raw.items()}
    mean_timing[GT_NAME] = 0.0
    mean_timing[RS_NAME] = 1000.0 / RS_FPS

    method_configs = {
        'original': {'model_path': args.original},
        RS_NAME: {'source': f'RealSense hardware depth (~{RS_FPS:.0f} FPS)'},
        GT_NAME: {'source': 'Projected Zivid depth map used as Inbolt ground truth'},
    }
    if 'finetuned' in pt_models and finetuned_path is not None:
        method_configs['finetuned'] = {'model_path': finetuned_path}
    if trt_model is not None:
        method_configs['finetuned_trt'] = {
            'engine_dir': args.trt_dir,
            'input_size': f'{trt_h}x{trt_w}',
            'precision': 'FP16',
        }

    results = BenchmarkResults(
        method_names=active_methods,
        method_labels={m: METHODS[m]['label'] for m in active_methods},
        method_colors={m: METHODS[m]['color'] for m in active_methods},
        ground_truth_name=GT_NAME,
        n_frames=n,
        width=W,
        height=H,
        all_metrics=all_metrics,
        viz_frames=viz_frames,
        coverage_maps=valid_acc,
        dist_bin_mae=dist_bin_mae,
        close_range_valid=close_range_valid,
        source=f'INBOLT dataset ({args.data_dir})',
        method_configs=method_configs,
    )

    stats = aggregate(results, mean_timing)
    if RS_NAME in stats:
        # Hardware depth is not timed here; report the nominal stream rate.
        stats[RS_NAME].fps_mean = RS_FPS

    reporter = ReportGeneratorInbolt(results, stats, out_dir)
    reporter.generate()

    # ── depth-vs-distance plot ────────────────────────────────────────────────
    plot_colors = {m: METHODS[m]['color'] for m in active_methods if m in METHODS}
    plot_labels = {
        GT_NAME: 'Zivid GT (spatial spread)',
        RS_NAME: METHODS[RS_NAME]['label'],
        'original': METHODS['original']['label'],
        'finetuned': METHODS['finetuned']['label'],
        'finetuned_trt': METHODS['finetuned_trt']['label'],
    }
    # Only plot accumulators that actually received samples.
    labeled_accs = {
        plot_labels.get(k, k): v
        for k, v in depth_accs.items()
        if depth_accs[k].count.sum() > 0
    }
    labeled_colors = {
        plot_labels.get(k, k): plot_colors.get(k)
        for k in depth_accs
        if depth_accs[k].count.sum() > 0
    }
    plot_depth_vs_distance(
        accumulators=labeled_accs,
        colors=labeled_colors,
        out_path=out_dir / 'depth_vs_distance.png',
    )
    logging.info(f'All outputs written to {out_dir}')
cached random frame indices used consistently across report sections.""" + if self._selected_viz_indices: + return self._selected_viz_indices + + n_total = len(self._r.viz_frames) + if n_total == 0: + self._selected_viz_indices = [] + return self._selected_viz_indices + + n = min(n_pick, n_total) + rng = np.random.default_rng(42) + self._selected_viz_indices = sorted(rng.choice(n_total, size=n, replace=False).tolist()) + return self._selected_viz_indices + + def _fig_depth_comparison(self) -> str: + if not self._r.viz_frames: + return self._empty_fig("depth_comparison.png", "No viz frames") + + sel = self._get_selected_viz_indices(n_pick=4) + if not sel: + return self._empty_fig("depth_comparison.png", "No viz frames") + + vf0 = self._r.viz_frames[sel[0]] + method_names = [n for n in self._r.method_names if n in vf0] + nrows = len(sel) + ncols = len(method_names) + fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows)) + axes = np.atleast_2d(axes) + cmap = self._depth_cmap() + + for r, frame_idx in enumerate(sel): + vf = self._r.viz_frames[frame_idx] + for c, name in enumerate(method_names): + ax = axes[r, c] + if name not in vf: + ax.axis("off") + continue + im = ax.imshow(vf[name], cmap=cmap, vmin=0.1, vmax=2.0) + plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="m") + title = self._r.method_labels.get(name, name) + if c == 0: + title = f"Frame {frame_idx + 1} • {title}" + ax.set_title(title, fontsize=9, wrap=True) + ax.axis("off") + + fig.suptitle("Depth Map Comparison (4 random frames) — values in meters", + fontsize=11, y=1.01) + fig.tight_layout() + return self._save(fig, "depth_comparison.png") + + def _fig_error_maps(self) -> str: + if not self._r.viz_frames or not self._non_gt: + return self._empty_fig("error_maps.png", "No comparison methods") + + sel = self._get_selected_viz_indices(n_pick=4) + if not sel: + return self._empty_fig("error_maps.png", "No viz frames") + + vf0 = self._r.viz_frames[sel[0]] + names = ([self._gt] if 
self._gt in vf0 else []) + [n for n in self._non_gt if n in vf0] + if not names: + return self._empty_fig("error_maps.png", "Ground truth not available in viz frame") + + nrows = len(sel) + ncols = len(names) + cmap = plt.get_cmap("hot").copy() + cmap.set_under("#222222") + fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3.8 * nrows)) + axes = np.atleast_2d(axes) + + for r, frame_idx in enumerate(sel): + vf = self._r.viz_frames[frame_idx] + gt = vf.get(self._gt) + if gt is None: + for c in range(ncols): + axes[r, c].axis("off") + continue + + for c, name in enumerate(names): + ax = axes[r, c] + if name not in vf: + ax.axis("off") + continue + pred = vf[name] + valid = (gt > 0) & (pred > 0) + err = np.where(valid, np.abs(pred - gt), 0.0).astype(np.float32) + im = ax.imshow(err, cmap=cmap, vmin=0.001, vmax=0.1) + plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="|error| (m)") + mean_err = float(np.abs(pred[valid] - gt[valid]).mean()) if valid.any() else 0.0 + label = self._r.method_labels.get(name, name) + if c == 0: + ax.set_title(f"Frame {frame_idx + 1} • {label}\nMAE={mean_err:.4f} m", fontsize=9) + else: + ax.set_title(f"{label}\nMAE={mean_err:.4f} m", fontsize=9) + ax.axis("off") + + gt_label = self._r.method_labels.get(self._gt, self._gt) + fig.suptitle(f"Absolute Error vs {gt_label} (4 random frames, m)", fontsize=11, y=1.01) + fig.tight_layout() + return self._save(fig, "error_maps.png") + + +# ── constants ──────────────────────────────────────────────────────────────── + +#DATA_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260415T084601Z-3-001/Data Collection' +DATA_DIR = f'{code_dir}/../data/pattern_cube' +#ORIGINAL_PATH = f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth' +# FINETUNED_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_inbolt-20260415_epoch_030.pth' +ORIGINAL_PATH = f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth' +FINETUNED_PATH = 
f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt-20260415_epoch_030.pth' +DEFAULT_OUT = f'{code_dir}/../reports/office_ffs_benchmark-model37' +N_VIZ = 5 + +METHODS: Dict[str, Dict[str, str]] = { + 'original': {'label': 'FFS Original', 'color': '#2980b9'}, + 'finetuned': {'label': 'FFS Fine-tuned (INBOLT)', 'color': '#e74c3c'}, + 'depth_rs': {'label': 'RealSense Hardware Depth', 'color': '#f39c12'}, +} +GT_NAME = 'depth_rs' +RS_NAME = 'depth_rs' +RS_FPS = 30.0 + + +def resolve_finetuned_model_path(preferred_path: str) -> Optional[str]: + """Return an existing fine-tuned Inbolt checkpoint path, or None if not found.""" + preferred = Path(preferred_path) + if preferred.exists(): + return str(preferred) + + weights_dir = Path(code_dir) / '..' / 'weights' + candidate_names = [ + 'model_finetuned_inbolt.pth', + 'model_finetuned_inbolt-20260415_epoch_030.pth', + ] + + # 1) Try known candidate file names anywhere under weights/ + for name in candidate_names: + found = sorted(weights_dir.glob(f'**/{name}')) + if found: + logging.warning( + f'Preferred fine-tuned model not found at {preferred}. Using fallback {found[0]}' + ) + return str(found[0]) + + # 2) Fallback to any Inbolt fine-tuned checkpoint, prefer lexicographically latest + generic = sorted(weights_dir.glob('**/model_finetuned_inbolt*.pth')) + if generic: + chosen = generic[-1] + logging.warning( + f'Preferred fine-tuned model not found at {preferred}. 
def main():
    """Benchmark the FFS checkpoints against RealSense depth and write a report.

    Steps, in order:
      1. Parse CLI options (output dir, dataset root, checkpoint paths, n_viz).
      2. Load the fine-tuned checkpoint (if one can be resolved) and the
         original checkpoint.
      3. For every dataset sample: run each model, export one ``.ply`` per
         method, and accumulate per-frame metrics, coverage maps, distance-bin
         MAE and close-range coverage.
      4. Aggregate, generate the report and the depth-vs-distance plot in
         ``--out_dir``.

    Returns early (after logging an error) when the dataset is empty.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--out_dir', default=DEFAULT_OUT, help='Output directory for the report')
    parser.add_argument('--data_dir', default=DATA_DIR, help='Path to dataset root')
    parser.add_argument('--original', default=ORIGINAL_PATH, help='Path to original model weights')
    parser.add_argument('--finetuned', default=FINETUNED_PATH, help='Path to fine-tuned model weights')
    parser.add_argument('--n_viz', type=int, default=N_VIZ, help='Frames saved for visual comparison')
    args = parser.parse_args()

    U.set_logging_format()
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # ── load stereo models ───────────────────────────────────────────────────
    models = {}
    finetuned_path = resolve_finetuned_model_path(args.finetuned)
    if finetuned_path is not None:
        models['finetuned'] = load_model(finetuned_path)
    else:
        logging.warning(
            f'Fine-tuned model not found (preferred: {args.finetuned}) and no fallback checkpoint found — skipping'
        )

    models['original'] = load_model(args.original)

    # GT_NAME and RS_NAME may be the same key. De-duplicate (order preserved)
    # so that a shared name is not processed twice per frame, which would
    # double its coverage map, metrics and bin-MAE entries.
    active_methods = list(dict.fromkeys([GT_NAME, RS_NAME, *models]))

    # ── dataset ──────────────────────────────────────────────────────────────
    source = DataSource()
    n = source.init_directory(input_rectified=args.data_dir)
    logging.info(f'Found {n} samples in {args.data_dir}')
    if n == 0:
        logging.error('No samples found — check DATA_DIR path')
        return

    # ── accumulators ─────────────────────────────────────────────────────────
    all_metrics = []
    viz_frames = []
    valid_acc = {}
    dist_bin_mae = {m: [] for m in active_methods}
    close_range_valid = {m: [] for m in active_methods}
    timing_ms_raw = {m: [] for m in models}
    H = W = None  # frame size, taken from the first sample

    depth_acc_keys = [RS_NAME] + list(models.keys())
    depth_accs = {k: DepthBinAccumulator() for k in depth_acc_keys}

    for idx in range(n):
        data = source.get_item(idx)
        left = data['left']
        right = data['right']
        # Both the reference and the RealSense series read the same key here.
        gt_mm = data['depth_rs'].astype(np.float32)
        rs_mm = data['depth_rs'].astype(np.float32)

        if H is None:
            H, W = gt_mm.shape[:2]
            for m in active_methods:
                valid_acc[m] = np.zeros((H, W), np.float32)

        # millimetres -> metres
        gt_m = gt_mm / 1000.0
        rs_m = rs_mm / 1000.0

        frame_depths = {GT_NAME: gt_m, RS_NAME: rs_m}
        for mname, model in models.items():
            t0 = time.monotonic()
            frame_depths[mname] = infer_depth_m(model, left, right)
            timing_ms_raw[mname].append((time.monotonic() - t0) * 1000.0)

        gt_close_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M)
        n_close = int(gt_close_mask.sum())

        # create point clouds for visualization
        for mname in active_methods:
            pred = frame_depths[mname]

            XYZ = source.project_camera_to_3d(pred, CAMERA_MATRIX_RS, DIST_COEFFS_RS)
            mname_path = os.path.join(out_dir, f'{mname}_{idx}.ply')
            # NOTE(review): pred is already in metres (depth_mm / 1000 above);
            # confirm project_camera_to_3d's output units justify this /1000.
            source.save_to_ply(XYZ/1000, mname_path)

        for mname in active_methods:
            pred = frame_depths[mname]
            valid_acc[mname] += (pred > 0).astype(np.float32)

            if mname == GT_NAME:
                # The reference is not scored against itself: zero errors,
                # only its own valid-pixel percentage is recorded.
                fm = FrameMetrics(
                    GT_NAME,
                    0.0,
                    0.0,
                    0.0,
                    100.0,
                    float((pred > 0).mean()) * 100.0,
                    0.0,
                    mae_pen=0.0,
                    mre_pen=0.0,
                )
            elif mname == RS_NAME:
                fm = compute_metrics(pred, gt_m, elapsed_ms=0.0, method_name=RS_NAME)
            else:
                fm = compute_metrics(pred, gt_m, timing_ms_raw[mname][-1], mname)

            all_metrics.append(fm)
            dist_bin_mae[mname].append(compute_bin_mae(pred, gt_m))

            close_cov = (
                float((pred[gt_close_mask] > 0).mean()) * 100.0
                if n_close > 0 else 0.0
            )
            close_range_valid[mname].append(close_cov)

        depth_accs[RS_NAME].update(rs_m, gt_m)
        for mname in models:
            depth_accs[mname].update(frame_depths[mname], gt_m)

        if idx < args.n_viz:
            viz_frames.append({k: v.copy() for k, v in frame_depths.items()})

        if (idx + 1) % 200 == 0 or (idx + 1) == n:
            logging.info(f' {idx + 1}/{n} frames processed')

    # Turn per-pixel valid counts into a fraction of frames.
    for m in active_methods:
        valid_acc[m] /= max(n, 1)

    mean_timing = {
        m: float(np.mean(ts)) if ts else 0.0
        for m, ts in timing_ms_raw.items()
    }
    mean_timing[GT_NAME] = 0.0
    mean_timing[RS_NAME] = 1000.0 / RS_FPS

    method_configs = {
        'original': {'model_path': args.original},
        RS_NAME: {'source': f'RealSense hardware depth (~{RS_FPS:.0f} FPS)'},
    }
    if 'finetuned' in models and finetuned_path is not None:
        method_configs['finetuned'] = {'model_path': finetuned_path}

    results = BenchmarkResults(
        method_names=active_methods,
        method_labels={m: METHODS[m]['label'] for m in active_methods},
        method_colors={m: METHODS[m]['color'] for m in active_methods},
        ground_truth_name=GT_NAME,
        n_frames=n,
        width=W,
        height=H,
        all_metrics=all_metrics,
        viz_frames=viz_frames,
        coverage_maps=valid_acc,
        dist_bin_mae=dist_bin_mae,
        close_range_valid=close_range_valid,
        source=f'INBOLT dataset ({args.data_dir})',
        method_configs=method_configs,
    )

    stats = aggregate(results, mean_timing)
    if RS_NAME in stats:
        stats[RS_NAME].fps_mean = RS_FPS

    reporter = ReportGeneratorInbolt(results, stats, out_dir)
    reporter.generate()

    plot_colors = {
        RS_NAME: METHODS[RS_NAME]['color'],
        **{m: METHODS[m]['color'] for m in models if m in METHODS},
    }
    plot_labels = {
        RS_NAME: METHODS[RS_NAME]['label'],
        'original': METHODS['original']['label'],
        'finetuned': METHODS['finetuned']['label'],
    }
    # Only plot accumulators that actually received samples.
    labeled_accs = {
        plot_labels.get(k, k): v
        for k, v in depth_accs.items()
        if depth_accs[k].count.sum() > 0
    }
    labeled_colors = {
        plot_labels.get(k, k): plot_colors.get(k)
        for k in depth_accs
        if depth_accs[k].count.sum() > 0
    }

    plot_depth_vs_distance(
        accumulators=labeled_accs,
        colors=labeled_colors,
        out_path=out_dir / 'depth_vs_distance.png',
    )
    logging.info(f'All outputs written to {out_dir}')
+ + Usage:: + + runner = BenchmarkRunner(capture, methods, ground_truth_name="nn_accurate") + results = runner.run(n_frames=30, warmup=5, n_viz=3) + timing = runner.measure_timing(n_frames=5) + """ + + def __init__( + self, + capture, + methods: List[DepthMethod], + ground_truth_name: str, + ) -> None: + self._capture = capture + self._methods = methods + self._available = [m for m in methods if m.available] + self._gt_name = ground_truth_name + # Mark ground-truth flag on each method + for m in self._available: + m.is_ground_truth = m.name == ground_truth_name + + # ------------------------------------------------------------------ public + + def run( + self, + n_frames: int = 30, + warmup: int = 5, + n_viz: int = 3, + ) -> BenchmarkResults: + W = getattr(self._capture, "_width", None) or getattr(self._capture, "_w", 640) + H = getattr(self._capture, "_height", None) or getattr(self._capture, "_h", 480) + n_viz = min(n_viz, n_frames) + + self._print_plan() + self._warmup(warmup) + + all_metrics: List[FrameMetrics] = [] + viz_frames: List[Dict] = [] + valid_acc = {m.name: np.zeros((H, W), np.float32) for m in self._available} + dist_bin_mae: Dict[str, List[List[float]]] = {m.name: [] for m in self._available} + close_range_valid: Dict[str, List[float]] = {m.name: [] for m in self._available} + + print(f"\nRunning benchmark ({n_frames} frames)...") + for frame_idx in range(n_frames): + frame_depths = self._process_frame() + + gt_m = frame_depths.get(self._gt_name, np.zeros((H, W), np.float32)) + self._accumulate( + frame_idx, frame_depths, gt_m, H, W, + all_metrics, valid_acc, dist_bin_mae, close_range_valid, + ) + if frame_idx < n_viz: + viz_frames.append({k: v.copy() for k, v in frame_depths.items()}) + if (frame_idx + 1) % 10 == 0 or (frame_idx + 1) == n_frames: + print(f" Frame {frame_idx + 1}/{n_frames}") + + # Normalise coverage maps to [0, 1] + for name in valid_acc: + valid_acc[name] /= max(n_frames, 1) + + return BenchmarkResults( + method_names=[m.name for 
def measure_gpu_load(
    self, duration_sec: float = 60.0, cooldown_sec: float = 60.0,
) -> Dict[str, float]:
    """Measure GPU load per method in isolation (one method at a time).

    For each available method (except the first), sleeps *cooldown_sec*
    before starting, then runs the method in a loop for *duration_sec* while
    a background thread samples ``_read_gpu_load()`` roughly every 50 ms.

    Args:
        duration_sec: how long each method is exercised.
        cooldown_sec: idle time inserted between consecutive methods.

    Returns:
        Mapping of method name to the mean of the collected samples, or 0.0
        when no valid sample was read (negative readings are discarded).
    """
    import threading

    def _sample_until(stop: threading.Event, sink: List[float]) -> None:
        # The event and list are passed in explicitly, so each sampler keeps
        # its own pair even after the loop below rebinds the local names.
        while not stop.is_set():
            val = _read_gpu_load()
            if val >= 0:
                sink.append(val)
            stop.wait(0.05)

    gpu_loads: Dict[str, float] = {}

    for i, m in enumerate(self._available):
        # Cooldown — let GPU idle between methods
        if cooldown_sec > 0 and i > 0:
            print(f" Cooldown {cooldown_sec:.0f}s...")
            time.sleep(cooldown_sec)

        # Baseline reading before inference starts
        baseline = _read_gpu_load()

        samples: List[float] = []
        stop_event = threading.Event()
        t = threading.Thread(
            target=_sample_until, args=(stop_event, samples), daemon=True,
        )
        t.start()

        n = 0
        try:
            # Run method in isolation for duration_sec
            t_end = time.monotonic() + duration_sec
            while time.monotonic() < t_end:
                fs = self._capture.get_frames()
                try:
                    m.process_fn(fs)
                except Exception:
                    # Best effort: a failing method still counts as a frame.
                    pass
                n += 1
        finally:
            # Always stop the sampler, even if get_frames() raised, so the
            # thread does not keep polling in the background.
            stop_event.set()
            t.join(timeout=1.0)

        gpu_loads[m.name] = float(np.mean(samples)) if samples else 0.0
        print(f" {m.label}: GPU {gpu_loads[m.name]:.0f}% "
              f"(baseline {baseline:.0f}%, {n} frames in {duration_sec:.0f}s)")

    return gpu_loads
def _accumulate(
    self,
    frame_idx: int,
    frame_depths: Dict[str, np.ndarray],
    gt_m: np.ndarray,
    H: int, W: int,
    all_metrics: List[FrameMetrics],
    valid_acc: Dict,
    dist_bin_mae: Dict,
    close_range_valid: Dict,
) -> None:
    """Fold one frame's depth maps into the running accumulators (in place).

    For every available method this appends a ``FrameMetrics`` entry to
    *all_metrics*, adds the valid-pixel mask to *valid_acc*, appends the
    per-bin MAE to *dist_bin_mae*, and appends the close-range coverage
    percentage to *close_range_valid*. ``frame_idx``, ``H`` and ``W`` are
    accepted but not used here.
    """
    # Reference pixels that are valid and nearer than the close-range limit.
    near_mask = (gt_m > 0) & (gt_m < CLOSE_RANGE_THRESHOLD_M)
    near_count = int(near_mask.sum())

    for method in self._available:
        key = method.name
        depth = frame_depths[key]
        has_depth = depth > 0
        valid_acc[key] += has_depth.astype(np.float32)

        if not method.is_ground_truth:
            frame_metrics = compute_metrics(depth, gt_m, 0.0, key)
        else:
            # The reference is not scored against itself: zero errors, and
            # only its own valid-pixel percentage is recorded.
            frame_metrics = FrameMetrics(
                key, 0.0, 0.0, 0.0, 100.0,
                float(has_depth.mean()) * 100.0, 0.0,
                mae_pen=0.0, mre_pen=0.0,
            )
        all_metrics.append(frame_metrics)

        dist_bin_mae[key].append(compute_bin_mae(depth, gt_m))

        coverage = (
            float(has_depth[near_mask].mean()) * 100.0 if near_count > 0 else 0.0
        )
        close_range_valid[key].append(coverage)
def init_directory(self, input_rectified = '', gray_scale_input = False, sub_indexes = None,
                   test_keywords = None, split = 'all'):
    """Load directory and optionally split into train/test by path keywords.

    Searches *input_rectified* recursively for ``L_images/L_Img_*.mat`` files.
    A path shorter than 3 characters falls back to the built-in default root.

    Args:
        input_rectified: dataset root directory.
        gray_scale_input: stored on the instance as ``gray_scale_input``.
        sub_indexes: optional list of indices applied to the selected split
                     to keep only a subset.
        test_keywords: list of strings (case-insensitive); paths containing any
                       of these keywords are assigned to the test split.
                       E.g. ['KITCHEN', 'BATHROOM']. Matching is done on the
                       whole path, including the root.
        split: 'all'   – return all images (default, backward-compatible)
               'train' – return only images whose path matches no keyword
               'test'  – return only images whose path matches at least one keyword
               Any other value behaves like 'all'.

    Returns:
        Number of images selected into ``self.imgs``.
    """
    if len(input_rectified) < 3:
        input_rectified = r'/mnt/algonas/Local'

    all_imgs = glob.glob(os.path.join(input_rectified, "**/L_images/L_Img_**.mat"), recursive=True)
    self.gray_scale_input = gray_scale_input

    if test_keywords:
        keywords_upper = [kw.upper() for kw in test_keywords]

        def is_test(path):
            p = path.upper()
            return any(kw in p for kw in keywords_upper)

        # Keyword matches go to TEST and everything else to TRAIN. The two
        # lists were previously swapped, contradicting the documented contract.
        self.train_imgs = [p for p in all_imgs if not is_test(p)]
        self.test_imgs = [p for p in all_imgs if is_test(p)]
    else:
        self.train_imgs = all_imgs
        self.test_imgs = []

    if split == 'train':
        self.imgs = self.train_imgs
    elif split == 'test':
        self.imgs = self.test_imgs
    else:  # 'all'
        self.imgs = all_imgs

    if sub_indexes is not None:
        self.imgs = [self.imgs[idx] for idx in sub_indexes]

    log.info(f"Split='{split}': {len(self.imgs)} images "
             f"(train={len(self.train_imgs)}, test={len(self.test_imgs)})")
    return len(self.imgs)
re.search('R_Img_(\d+)', filename_r_wo_ext) + file_idx = int(m.group(1)) + + # read img + left_img = sio.loadmat(left_path)['Il'] + right_img = sio.loadmat(right_path)['Ir'] + rgb_img = sio.loadmat(rgb_path)['I_RGB'] + depth_faro_img = sio.loadmat(depth_faro_path)['depth'] + depth_rs_img = sio.loadmat(depth_rs_path)['Z_im'] + #left_img = cv2.cvtColor(left_img, cv2.COLOR_GRAY2BGR) + #right_img = cv2.cvtColor(right_img, cv2.COLOR_GRAY2BGR) + + if left_img is None or right_img is None or rgb_img is None: + return output_str + + # if self.gray_scale_input: + # left_img = cv2.cvtColor(left_img.astype("uint8"), cv2.COLOR_BGR2GRAY)[None, :, :] + # right_img = cv2.cvtColor(right_img.astype("uint8"), cv2.COLOR_BGR2GRAY)[None, :, :] + + # test_name = f"{base_folder}" + # prefix = f"{test_name}/{os.path.basename(left_path)}" + # file_sources = { + # "left_path": left_path, + # "prefix": os.path.basename(prefix), + # "right_path": right_path, + # "left_disp_path": left_disp_filename + # } + + + left_img, right_img, rgb_img, depth_rs_img, depth_faro_img = left_img, right_img, rgb_img, depth_rs_img.astype(np.float32), depth_faro_img.astype(np.float32) + + output_str["left"] = left_img + output_str["right"] = right_img + output_str["depth_faro"] = depth_faro_img + output_str["depth_rs"] = depth_rs_img + output_str["rgb"] = rgb_img + + if debug: + depth_error = self.compute_depth_error(depth_rs_img, depth_faro_img) + img_list = [left_img, right_img, rgb_img, depth_rs_img, depth_faro_img, depth_error] + ttl_list = ['left','right','rgb','depth rs','depth faro','depth error'] + self.show_subset(img_list, ttl_list) + + return output_str + + def compute_depth_error(self, depth_rs_img, depth_faro_img, depth_mask = None) : + "compute depth error" + depth_rs_img, depth_faro_img = depth_rs_img.astype(np.float32), depth_faro_img.astype(np.float32) + depth_error = np.zeros_like(depth_rs_img) + depth_mask = np.ones_like(depth_rs_img,dtype=bool) if depth_mask is None else depth_mask + + 
def save_data_to_folder(self, output_str, output_directory):
    """Write the five images of one sample to ``output_directory`` as PNG files.

    Every image is cast to uint16 and written uncompressed under a fixed file
    name (``img_left.png``, ``img_right.png``, ``img_depth_faro.png``,
    ``img_depth_rs.png``, ``img_rgb.png``).

    Args:
        output_str: Sample dict with keys ``left``, ``right``, ``depth_faro``,
            ``depth_rs`` and ``rgb`` (as produced by ``get_item``).
        output_directory: Target folder; created when it does not exist.

    Returns:
        True only when every image was written. Previously only the status of
        the last write was reported, so earlier failures went unnoticed.
    """
    os.makedirs(output_directory, exist_ok=True)

    # (file name, sample key) in the order the files are written.
    targets = (
        ("img_left.png", "left"),
        ("img_right.png", "right"),
        ("img_depth_faro.png", "depth_faro"),
        ("img_depth_rs.png", "depth_rs"),
        ("img_rgb.png", "rgb"),
    )

    success = True
    for file_name, key in targets:
        image = np.asarray(output_str[key])
        if image.size == 0:
            # An empty entry marks a sample that failed to load: nothing to write.
            success = False
            continue
        written = cv2.imwrite(os.path.join(output_directory, file_name),
                              image.astype(np.uint16),
                              [cv2.IMWRITE_PNG_COMPRESSION, 0])
        success = bool(written) and success

    return success
def init_directory(self, gray_scale_input = False, sub_indexes = None):
    """Collect the ``index*`` sample folders found under ``./data/faro``.

    Args:
        gray_scale_input: Stored on the instance as ``self.gray_scale_input``.
        sub_indexes: Optional iterable of positions; when given, only those
            entries of the sorted folder list are kept, in the given order.

    Returns:
        Number of sample folders kept in ``self.dirs`` (0 when the data
        directory does not exist).
    """
    input_rectified = r'./data/faro'
    self.input_dir = input_rectified
    self.gray_scale_input = gray_scale_input

    try:
        # os.listdir returns entries in arbitrary order; sort so that a given
        # index always refers to the same sample.
        entries = sorted(os.listdir(input_rectified))
    except FileNotFoundError:
        print(f'Directory not found: {input_rectified}')
        self.dirs = []
        return 0

    # Keep sample folders only; a stray file named "index..." is not a sample.
    self.dirs = [name for name in entries
                 if name.startswith('index')
                 and os.path.isdir(os.path.join(input_rectified, name))]

    if sub_indexes is not None:
        self.dirs = [self.dirs[idx] for idx in sub_indexes]

    print(f'Total directories {len(self.dirs)}')
    return len(self.dirs)
def get_item(self, index: int, debug: bool = False):
    """Read the five PNG images of one sample folder.

    Args:
        index: Position of the sample folder in ``self.dirs``; negative values
            count from the end as with normal list indexing.
        debug: When true, show the images (and the RS-vs-FARO depth error)
            through ``self.show_subset``.

    Returns:
        dict with keys ``img_left``, ``img_right``, ``img_depth_faro``,
        ``img_depth_rs`` and ``img_rgb``. All values are empty lists when the
        index is out of range or the left/right/rgb image cannot be read.
    """
    output_str = {"img_left": [], "img_right": [], "img_depth_faro": [], "img_depth_rs": [], "img_rgb": [] }

    # Validate before touching self.dirs. The old check used ">" (off by one)
    # and printed base_folder before it was assigned (UnboundLocalError).
    dir_num = len(self.dirs)
    if not -dir_num <= index < dir_num:
        print(f'bad index {index}: {dir_num} directories available')
        return output_str

    base_folder = os.path.join(self.input_dir, self.dirs[index])
    left_img = cv2.imread(os.path.join(base_folder, "img_left.png"), cv2.IMREAD_UNCHANGED)
    right_img = cv2.imread(os.path.join(base_folder, "img_right.png"), cv2.IMREAD_UNCHANGED)
    depth_faro_img = cv2.imread(os.path.join(base_folder, "img_depth_faro.png"), cv2.IMREAD_UNCHANGED)
    depth_rs_img = cv2.imread(os.path.join(base_folder, "img_depth_rs.png"), cv2.IMREAD_UNCHANGED)
    rgb_img = cv2.imread(os.path.join(base_folder, "img_rgb.png"), cv2.IMREAD_UNCHANGED)

    # cv2.imread signals an unreadable file by returning None.
    if left_img is None or right_img is None or rgb_img is None:
        print(f'bad directory {base_folder}')
        return output_str
    if len(left_img) < 1 or len(right_img) < 1:
        print(f'bad directory {base_folder}')
        return output_str

    output_str["img_left"] = left_img
    output_str["img_right"] = right_img
    output_str["img_depth_faro"] = depth_faro_img
    output_str["img_depth_rs"] = depth_rs_img
    output_str["img_rgb"] = rgb_img

    if debug:
        if depth_rs_img is None or depth_faro_img is None:
            # Depth files are not part of the validity check above; without
            # them the error map cannot be computed.
            print(f'missing depth image in {base_folder}')
        else:
            depth_error = self.compute_depth_error(depth_rs_img, depth_faro_img)
            img_list = [left_img, right_img, rgb_img, depth_rs_img, depth_faro_img, depth_error]
            ttl_list = ['left','right','rgb','depth rs','depth faro','depth error']
            self.show_subset(img_list, ttl_list)

    return output_str
def show_subset(self, img_list, ttl_list, vmin=None, vmax=None, save_path='', fig_name=''):
    """Show a list of images in one figure, laid out on a grid.

    Args:
        img_list: Images to display; nothing is drawn for an empty list.
        ttl_list: One title per image.
        vmin, vmax: Colour limits passed to every ``imshow`` call.
        save_path: Existing folder to save the figure into; saving is skipped
            when the path does not exist.
        fig_name: File name (without extension) used when saving.
    """
    img_num = len(img_list)
    if img_num == 0:
        return

    # Same row count as before; the column count is now rounded up so the grid
    # always has room for every image (5, 7 or 8 images used to overflow it).
    row_num = img_num // 4 + 1
    col_num = -(-img_num // row_num)

    # squeeze=False keeps `axes` two-dimensional even for a 1x1 or 1xN grid.
    fig, axes = plt.subplots(row_num, col_num, sharey=True, sharex=True, squeeze=False)
    do_save = os.path.exists(save_path)

    for k in range(img_num):
        ri, ci = divmod(k, col_num)
        axes[ri, ci].imshow(img_list[k], vmin=vmin, vmax=vmax)
        axes[ri, ci].set_title(ttl_list[k])

    # Hide the cells that received no image.
    for k in range(img_num, row_num * col_num):
        ri, ci = divmod(k, col_num)
        axes[ri, ci].axis('off')

    if do_save:
        fig.savefig(os.path.join(save_path, fig_name + ".png"))

    plt.show()
# Zivid lens distortion coefficients, passed to OpenCV together with
# CAMERA_MATRIX_ZIVID (OpenCV expects the order k1, k2, p1, p2, k3).
# The previous literal kept the YAML list dashes ("- 0.045...", "- -0.036..."),
# which Python parsed as unary minus and so flipped the sign of every value.
# NOTE(review): confirm the values against the original calibration file.
DIST_COEFFS_ZIVID = np.array([
    0.04514386132359505,
    -0.03609563037753105,
    -6.156915333122015e-05,
    0.00015102965699043125,
    -0.17297066748142242,
])
zivid) sample pairs and populate self.imgs.""" + if len(input_rectified) < 3: + input_rectified = ( + r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/' + r'Inbolt_datasets/Data Collection-20260322T091926Z-1-001/Data Collection' + ) + + self.gray_scale_input = gray_scale_input + self.imgs = [] + + #IGNORED_SESSIONS = {'dataset_y16_freedrive', 'dataset_y8_freedrive'} + #IGNORED_SESSIONS = {'dataset_y16_freedrive','dataset_depth_bias'} + IGNORED_SESSIONS = {'20260414_142239'} # include all sessions by default; manually exclude any bad ones here + + # Each immediate sub-directory is a session + try: + if self.train_mode: + sessions = sorted([ + os.path.join(input_rectified, d) + for d in os.listdir(input_rectified) + if os.path.isdir(os.path.join(input_rectified, d)) + and d not in IGNORED_SESSIONS + ]) + else: + sessions = sorted([ + os.path.join(input_rectified, d) + for d in os.listdir(input_rectified) + if os.path.isdir(os.path.join(input_rectified, d)) + and d in IGNORED_SESSIONS + ]) + + except FileNotFoundError: + log.error(f"Directory not found: {input_rectified}") + return 0 + + for session in sessions: + rs_root = os.path.join(session, 'realsense') + zivid_root = os.path.join(session, 'zivid') + + if not os.path.isdir(rs_root) or not os.path.isdir(zivid_root): + continue # session has no stereo+GT pair + + # Find all left images; match by index folder name + left_paths = sorted(glob.glob(os.path.join(rs_root, '*', 'mono0.png'))) + for left_path in left_paths: + idx = os.path.basename(os.path.dirname(left_path)) + right_path = os.path.join(rs_root, idx, 'mono1.png') + depth_rs_path = os.path.join(rs_root, idx, 'depthmap_mm.png') + depth_zivid_path = os.path.join(zivid_root, idx, 'depthmap_mm.png') + rgb_path = os.path.join(zivid_root, idx, 'color.png') + + if not os.path.isfile(depth_rs_path) or not os.path.isfile(depth_zivid_path): + continue # skip incomplete samples + + rs_metadata_path = os.path.join(rs_root, idx, 'metadata.yaml') + zv_metadata_path = 
def get_item(self, index: int, debug: bool = False):
    """Load one sample: IR stereo pair, both depth maps, optional RGB and metadata.

    Args:
        index: Position of the sample in ``self.imgs``.
        debug: When true, display the loaded images via ``self.show_subset``.

    Returns:
        dict with keys ``left``, ``right``, ``depth_zivid``, ``depth_rs``
        (both depth maps as float32), ``rgb`` (empty uint8 array when absent
        or unreadable), ``metadata_rs`` and ``metadata_zv`` (parsed YAML or
        None). When any of the four mandatory images cannot be read, the
        image entries are empty lists and the metadata entries are None.
    """
    output_str = {"left": [], "right": [], "depth_zivid": [], "depth_rs": [], "rgb": [], "metadata_rs": None, "metadata_zv": None}

    entry = self.imgs[index]

    # The four mandatory images are read as stored (no bit-depth conversion).
    mandatory = [cv2.imread(entry[key], cv2.IMREAD_UNCHANGED)
                 for key in ('left', 'right', 'depth_rs', 'depth_zivid')]
    if any(image is None for image in mandatory):
        log.warning(f"Failed to load sample {index}: {entry}")
        return output_str
    left_img, right_img, depth_rs_img, depth_zivid_img = mandatory

    # RGB is optional: fall back to an empty array when missing or unreadable.
    rgb_img = None
    if entry['rgb'] is not None:
        rgb_img = cv2.imread(entry['rgb'], cv2.IMREAD_COLOR)
    if rgb_img is None:
        rgb_img = np.array([], dtype=np.uint8)

    depth_rs = depth_rs_img.astype(np.float32)
    depth_zivid = depth_zivid_img.astype(np.float32)

    # Per-camera metadata files are optional as well.
    metadata = {}
    for key in ('metadata_rs', 'metadata_zv'):
        meta_path = entry.get(key)
        if meta_path is None:
            metadata[key] = None
            continue
        with open(meta_path, 'r') as f:
            metadata[key] = yaml.safe_load(f)

    output_str.update({
        "left": left_img,
        "right": right_img,
        "depth_zivid": depth_zivid,
        "depth_rs": depth_rs,
        "rgb": rgb_img,
        "metadata_rs": metadata['metadata_rs'],
        "metadata_zv": metadata['metadata_zv'],
    })

    if debug:
        img_list = [left_img, right_img, depth_rs, depth_zivid]
        ttl_list = ['left (RS)', 'right (RS)', 'depth RS (mm)', 'depth Zivid (mm)']
        if rgb_img.size > 0:
            img_list.append(rgb_img)
            ttl_list.append('rgb (Zivid)')
        self.show_subset(img_list, ttl_list)

    return output_str
def compute_depth_error(self, depth_pred, depth_gt, depth_mask=None):
    """Return the signed per-pixel difference ``depth_pred - depth_gt``.

    A pixel contributes only where both depth maps are strictly positive and,
    when ``depth_mask`` is given, where the mask is true. All other pixels
    are 0 in the result.

    Args:
        depth_pred: Predicted depth map.
        depth_gt: Ground-truth depth map.
        depth_mask: Optional mask restricting the evaluated pixels.

    Returns:
        float32 array holding the signed error.
    """
    pred = depth_pred.astype(np.float32)
    gt = depth_gt.astype(np.float32)

    # Both measurements must be present for a pixel to count.
    usable = np.logical_and(pred > 0, gt > 0)
    if depth_mask is not None:
        usable = np.logical_and(usable, depth_mask)

    return np.where(usable, pred - gt, np.zeros_like(pred))
def project_3d_to_camera(self, points_3d: np.ndarray, cam_matrix: np.ndarray, dist_coeffs: np.ndarray, frame_size = (480,640)) -> np.ndarray:
    """Rasterise camera-space 3D points into a depth image.

    Points are projected with ``cv2.projectPoints`` (zero rotation and
    translation), rounded to the nearest pixel and written into a depth
    buffer; when several points land on one pixel the smallest Z wins.

    Args:
        points_3d: Array of shape (N, 3) holding X, Y, Z per point.
        cam_matrix: 3x3 intrinsic matrix of the target camera.
        dist_coeffs: Distortion coefficients of the target camera.
        frame_size: (height, width) of the output image.

    Returns:
        float32 array of shape ``frame_size``; pixels hit by no point are 0.

    Raises:
        ValueError: ``points_3d`` is not of shape (N, 3).
    """
    points_3d = np.asarray(points_3d)
    if points_3d.ndim != 2 or points_3d.shape[1] != 3:
        raise ValueError("Input points_3d must have shape (N, 3)")

    h_rs, w_rs = frame_size

    # Discard points that cannot produce a valid depth sample. Points at or
    # behind the camera plane (Z <= 0) would otherwise still be projected and
    # win the "closest depth" test below; non-finite values give undefined
    # pixel indices.
    usable = np.isfinite(points_3d).all(axis=1) & (points_3d[:, 2] > 0)
    points_3d = points_3d[usable].astype(np.float32)
    if points_3d.shape[0] == 0:
        # Nothing to project; avoid handing OpenCV an empty array.
        return np.zeros((h_rs, w_rs), dtype=np.float32)

    projected_pts, _ = cv2.projectPoints(
        points_3d.reshape(-1, 1, 3),
        np.zeros(3, dtype=np.float32),
        np.zeros(3, dtype=np.float32),
        cam_matrix.astype(np.float32),
        dist_coeffs.astype(np.float32),
    )

    uv_rs = projected_pts.reshape(-1, 2)
    u_idx = np.rint(uv_rs[:, 0]).astype(np.int32)
    v_idx = np.rint(uv_rs[:, 1]).astype(np.int32)

    in_bounds = (u_idx >= 0) & (u_idx < w_rs) & (v_idx >= 0) & (v_idx < h_rs)
    if not np.any(in_bounds):
        return np.zeros((h_rs, w_rs), dtype=np.float32)

    u_idx = u_idx[in_bounds]
    v_idx = v_idx[in_bounds]
    z_vals = points_3d[in_bounds, 2]

    # Rasterise by nearest pixel; if multiple points hit a pixel, keep the closest depth.
    lin = v_idx * w_rs + u_idx
    depth_buffer = np.full(h_rs * w_rs, np.inf, dtype=np.float32)
    np.minimum.at(depth_buffer, lin, z_vals)
    depth_projected = depth_buffer.reshape(h_rs, w_rs)
    depth_projected[~np.isfinite(depth_projected)] = 0.0  # untouched pixels -> 0
    return depth_projected
def RunTest():
    """Run the manually selected check directly, without the unittest runner."""
    # Other checks available on the test class: test_get_item, test_show_images.
    checks = TestDataSource()
    checks.test_get_item_projected()
def init_directory(self, input_rectified='', gray_scale_input=False, sub_indexes=None):
    """Find every packed ``image_d16_*.png`` under the root and store the paths.

    Args:
        input_rectified: Root folder to scan; strings shorter than three
            characters select the built-in default location.
        gray_scale_input: Stored on the instance as ``self.gray_scale_input``.
        sub_indexes: Optional iterable of positions; when given, only those
            entries of the sorted path list are kept, in the given order.

    Returns:
        Number of paths stored in ``self.imgs`` (0 when the root is missing).
    """
    if len(input_rectified) < 3:
        input_rectified = r'C:\Work\Data\DepthRS\data\pattern_cube'

    self.gray_scale_input = gray_scale_input

    root_exists = os.path.isdir(input_rectified)
    if not root_exists:
        log.error(f"Directory not found: {input_rectified}")
        self.imgs = []
        return 0

    # "**" with recursive=True matches the root itself and any nesting depth.
    pattern = os.path.join(input_rectified, '**', 'image_d16_*.png')
    matches = glob.glob(pattern, recursive=True)
    matches.sort()
    self.imgs = matches

    if sub_indexes is not None:
        self.imgs = [self.imgs[i] for i in sub_indexes]

    sample_count = len(self.imgs)
    log.info(f"DataSource: found {sample_count} samples in {input_rectified}")
    return sample_count
def get_item(self, index: int, debug: bool = False):
    """Load one packed d16 sample and split it into left/right/depth maps.

    Args:
        index: Position of the sample in ``self.imgs``.
        debug: When true, display the three decoded channels via ``show_subset``.

    Returns:
        A dict with the keys ``left``, ``right``, ``depth_rs``, ``depth_zivid``
        and ``rgb``. ``left`` and ``right`` are channels 0 and 1 of the packed
        image, ``depth_rs`` is channel 2 converted to float32, ``depth_zivid``
        is a copy of ``depth_rs`` and ``rgb`` is an empty array. If the file
        cannot be read or does not have at least three channels, the image
        entries are left as empty lists.

    Raises:
        IndexError: If ``index`` is outside ``self.imgs``.
    """
    # 'depth_zivid' and 'rgb' are always present so that consumers written for
    # the Zivid datasets (save_data_to_folder, get_item_projected callers) do
    # not hit a KeyError on this RealSense-only dataset.
    output_str = {
        "left": [],
        "right": [],
        "depth_rs": [],
        "depth_zivid": [],
        "rgb": np.array([], dtype=np.uint8),
    }

    packed_path = self.imgs[index]
    packed_img = cv2.imread(packed_path, cv2.IMREAD_UNCHANGED)

    if packed_img is None:
        log.warning(f"Failed to load sample {index}: {packed_path}")
        return output_str

    if packed_img.ndim != 3 or packed_img.shape[2] < 3:
        log.warning(f"Expected 3-channel packed image, got shape={packed_img.shape} at: {packed_path}")
        return output_str

    depth_img = packed_img[:, :, 2].astype(np.float32)

    output_str["left"] = packed_img[:, :, 0]
    output_str["right"] = packed_img[:, :, 1]
    output_str["depth_rs"] = depth_img
    # Separate copy: in-place edits of one depth map must not alter the other.
    output_str["depth_zivid"] = depth_img.copy()

    if debug:
        self.show_subset(
            [output_str["left"], output_str["right"], output_str["depth_rs"]],
            ['left (packed ch0)', 'right (packed ch1)', 'depth RS (packed ch2, mm)']
        )

    return output_str
def compute_depth_error(self, depth_pred, depth_gt, depth_mask=None):
    """Return the per-pixel signed error ``pred - gt``, zero where not comparable.

    Args:
        depth_pred: Predicted depth array.
        depth_gt: Reference depth array.
        depth_mask: Optional mask restricting the pixels to compare; all
            pixels are eligible when omitted.

    Returns:
        A float32 array shaped like ``depth_pred``. A pixel carries the
        difference only if both depths are positive and the mask allows it.
    """
    pred = depth_pred.astype(np.float32)
    gt = depth_gt.astype(np.float32)

    if depth_mask is None:
        mask = np.ones_like(pred, dtype=bool)
    else:
        mask = depth_mask

    # Comparable pixels: positive reference, allowed by the mask, positive prediction.
    usable = np.logical_and(pred > 0, np.logical_and(gt > 0, mask))

    error = np.zeros_like(pred)
    error[usable] = pred[usable] - gt[usable]
    return error
def RunTest():
    """Run the image-display check directly, without the unittest runner."""
    # test_get_item and test_get_item_projected can be invoked on the same instance by hand.
    case = TestDataSource()
    case.test_show_images()
+ +The FARO dataset provides: + - img_left.png / img_right.png : uint16 IR stereo images + - img_depth_faro.png : ground-truth depth in mm (FARO scanner) + +Strategy: + - Freeze the ViT-L backbone (model.feature) to prevent overfitting on 24 samples. + - Train everything else with RAFT-style sequence loss over GRU iterations. + - IR uint16 images are clipped to [0,255] and replicated to 3 channels. + - Depth is converted to disparity: disp = BF / depth_mm (BF = 49470.45). + +Usage: + cd /home/administrato/dev/Fast-FoundationStereo + python scripts/finetune_faro.py +""" + +import os, sys, logging +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import torch +import torch.nn.functional as F +import numpy as np +import cv2 +from torch.utils.data import Dataset, DataLoader +from core.utils.utils import InputPadder +import Utils as U +from scripts.data_manager_faro import DataSource + + +# ── constants ──────────────────────────────────────────────────────────────── + +FARO_DIR = r'/mnt/algonas/Local/Data/Stereo/Faro/FARO_DATA_BASE' # local path to the dataset +MODEL_PATH = f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth' +OUT_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_faro_office.pth' +#MODEL_PATH = f'{code_dir}/../weights/20-30-48/model_finetuned_faro_kitchen.pth' + +BF = 49470.45 # focal_px * baseline_mm (calibrated from camera) +EPOCHS = 15 +LR = 2e-5 +ITERS = 8 # GRU iterations (same as inference) +GAMMA = 0.9 # sequence loss weight decay + + +# ── dataset ────────────────────────────────────────────────────────────────── + +class FaroDataset(Dataset): + def __init__(self, root): + self.source = DataSource() + n = self.source.init_directory(input_rectified=root, test_keywords=['OFFICE'], split='train') + logging.info(f"DataSource found {n} samples in {root}") + + def __len__(self): + return len(self.source.imgs) + + def __getitem__(self, idx): + data = 
def main():
    """Fine-tune the stereo model on FARO, checkpointing each epoch that improves the loss.

    Loads the serialized model object from ``MODEL_PATH``, freezes
    ``model.feature`` and trains the remaining parameters with AdamW and AMP
    gradient scaling for ``EPOCHS`` epochs. Whenever an epoch's mean training
    loss beats the best seen so far, the unwrapped model is written next to
    ``OUT_PATH`` with an ``_epoch_NNN`` suffix.
    """
    U.set_logging_format()
    U.set_seed(0)

    # load full model object (weights + architecture)
    logging.info(f"Loading model from {MODEL_PATH}")
    model = torch.load(MODEL_PATH, map_location='cpu', weights_only=False)

    # freeze the backbone so the small dataset cannot overfit it
    for param in model.feature.parameters():
        param.requires_grad = False
    logging.info("ViT backbone frozen.")

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    logging.info(f"Trainable: {trainable:,} / {total:,} parameters")

    device_ids = [0]
    model = torch.nn.DataParallel(model, device_ids=device_ids)
    model.cuda().train()
    # Report the devices actually configured instead of a hard-coded list.
    logging.info(f"Using DataParallel on GPU(s) {device_ids}.")

    optimizer = torch.optim.AdamW(
        [p for p in model.module.parameters() if p.requires_grad], lr=LR, weight_decay=1e-4
    )
    scaler = torch.amp.GradScaler('cuda')

    dataset = FaroDataset(FARO_DIR)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=4)

    best_loss = float('inf')
    best_path = None  # file of the most recent "best" checkpoint, if any

    for epoch in range(0, EPOCHS):
        epoch_loss = 0.0

        for left, right, disp_gt, valid, idx in dataloader:
            left, right = left.cuda(), right.cuda()
            disp_gt, valid = disp_gt.cuda(), valid.cuda()

            # pad so H and W are divisible by 32
            padder = InputPadder(left.shape, divis_by=32, force_square=False)
            left_p, right_p = padder.pad(left, right)

            optimizer.zero_grad()

            with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
                _init_disp, disp_preds = model.forward(
                    left_p, right_p, iters=ITERS, test_mode=False
                )
                disp_preds = [padder.unpad(p) for p in disp_preds]
                loss = sequence_loss(disp_preds, disp_gt, valid)

            # Unscale before clipping so max_norm applies to the true gradients.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.module.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()
            if idx[0] % 10 == 0:
                logging.info(f"Epoch {epoch+1:3d}/{EPOCHS}  sample {idx[0]:3d}  loss={loss.item():.4f}")

        avg = epoch_loss / len(dataloader)
        logging.info(f"Epoch {epoch+1:3d}/{EPOCHS}  loss={avg:.4f}")

        if avg < best_loss:
            best_loss = avg
            best_path = OUT_PATH.replace('.pth', f'_epoch_{epoch+1:03d}.pth')
            torch.save(model.module, best_path)
            logging.info(f"  → saved best model (loss={best_loss:.4f})")

    logging.info(f"Training complete. Best loss: {best_loss:.4f}")
    # Name the file that was really written; nothing is ever saved to OUT_PATH itself.
    if best_path is not None:
        logging.info(f"Model saved to {best_path}")
    else:
        logging.warning("No checkpoint was saved: the loss never improved.")
f'{code_dir}/../weights/20-30-48/model_finetuned_inbolt-20260415.pth' +MODEL_PATH = f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth' +OUT_PATH = f'{code_dir}/../weights/23-36-37/model_finetuned_inbolt-20260415.pth' + + +# BF = 49.8624*385.73 # D435 - focal_px * baseline_mm (calibrated from camera) # D435 - focal_px * baseline_mm (calibrated from camera) +BF = 50.102706998586 * 385.509887695312 # new data +EPOCHS = 120 +LR = 2e-5 +ITERS = 8 # GRU iterations (same as inference) +GAMMA = 0.9 # sequence loss weight decay +TRAIN_RATIO = 0.75 +SPLIT_SEED = 0 + + +# ── dataset ────────────────────────────────────────────────────────────────── + +class InboltDataset(Dataset): + def __init__(self, root): + self.source = DataSource() + n = self.source.init_directory(input_rectified=root) + logging.info(f"DataSource found {n} samples in {root}") + + def __len__(self): + return len(self.source.imgs) + + def __getitem__(self, idx): + data = self.source.get_item_projected(idx) + left = data['left'] + right = data['right'] + depth = data['depth_zivid'] # float32, mm (Zivid resolution) + + # Resize Zivid depth to match RealSense stereo image resolution + h, w = left.shape[:2] + if depth.shape != (h, w): + depth = cv2.resize(depth, (w, h), interpolation=cv2.INTER_NEAREST) + + # IR uint8 → float [0, 255], replicate to 3-channel pseudo-RGB + left = np.clip(left.astype(np.float32), 0, 255) + right = np.clip(right.astype(np.float32), 0, 255) + left = np.stack([left, left, left], axis=-1) # H x W x 3 + right = np.stack([right, right, right], axis=-1) + + # depth (mm) → disparity (pixels): disp = focal * baseline / depth + disp = np.zeros_like(depth, dtype=np.float32) + valid = depth > 0 + disp[valid] = BF / depth[valid] + + left_t = torch.from_numpy(left).permute(2, 0, 1).float() # (3, H, W) + right_t = torch.from_numpy(right).permute(2, 0, 1).float() # (3, H, W) + disp_t = torch.from_numpy(disp).unsqueeze(0).float() # (1, H, W) + valid_t = 
def sequence_loss(disp_preds, disp_gt, valid, gamma=GAMMA):
    """RAFT-style weighted sum of smooth-L1 losses over GRU iterations.

    Args:
        disp_preds: Sequence of predicted disparity tensors, one per iteration.
        disp_gt: Ground-truth disparity tensor.
        valid: Boolean tensor marking the pixels that have ground truth.
        gamma: Decay factor; prediction ``i`` of ``n`` is weighted by
            ``gamma ** (n - 1 - i)``, so the last prediction has weight 1.

    Returns:
        The weighted loss (``0.0`` when ``disp_preds`` is empty). A prediction
        with no valid pixel contributes zero instead of NaN.
    """
    n = len(disp_preds)
    loss = 0.0
    for i, pred in enumerate(disp_preds):
        w = gamma ** (n - 1 - i)
        gt = disp_gt
        v = valid
        if pred.shape[-2:] != gt.shape[-2:]:
            gt = F.interpolate(gt, size=pred.shape[-2:], mode='nearest')
            v = F.interpolate(valid.float(), size=pred.shape[-2:], mode='nearest').bool()

        if v.any():
            term = F.smooth_l1_loss(pred[v], gt[v])
        else:
            # The mean over an empty selection is NaN, which would poison the
            # epoch average and block best-checkpoint selection. Use a zero
            # that stays attached to the graph so backward() still works.
            term = pred.sum() * 0.0
        loss = loss + w * term
    return loss
param.requires_grad = False + logging.info("ViT backbone frozen.") + + trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + total = sum(p.numel() for p in model.parameters()) + logging.info(f"Trainable: {trainable:,} / {total:,} parameters") + + model = model.cuda().train() + logging.info("Model on single GPU.") + + optimizer = torch.optim.AdamW( + [p for p in model.parameters() if p.requires_grad], lr=LR, weight_decay=1e-4 + ) + scaler = torch.amp.GradScaler('cuda') + + dataset = InboltDataset(INBOLT_DIR) + n_total = len(dataset) + + if n_total < 2: + raise RuntimeError(f"Need at least 2 samples for a 75/25 train/test split, got {n_total}.") + + n_train = int(round(TRAIN_RATIO * n_total)) + n_train = min(max(1, n_train), n_total - 1) + n_test = n_total - n_train + + split_generator = torch.Generator().manual_seed(SPLIT_SEED) + train_set, test_set = random_split(dataset, [n_train, n_test], generator=split_generator) + + train_loader = DataLoader(train_set, batch_size=1, shuffle=True, num_workers=0) + test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0) + + logging.info( + f"Random split with seed={SPLIT_SEED}: total={n_total}, train={len(train_set)} ({100.0*len(train_set)/n_total:.1f}%), " + f"test={len(test_set)} ({100.0*len(test_set)/n_total:.1f}%)" + ) + + best_loss = float('inf') + + for epoch in range(EPOCHS): + epoch_loss = 0.0 + + for left, right, disp_gt, valid in train_loader: + left, right = left.cuda(), right.cuda() + disp_gt, valid = disp_gt.cuda(), valid.cuda() + + # pad so H and W are divisible by 32 + padder = InputPadder(left.shape, divis_by=32, force_square=False) + left_p, right_p = padder.pad(left, right) + + optimizer.zero_grad(set_to_none=True) + + with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE): + _init_disp, disp_preds = model.forward( + left_p, right_p, iters=ITERS, test_mode=False + ) + disp_preds = [padder.unpad(p) for p in disp_preds] + loss = sequence_loss(disp_preds, 
disp_gt, valid) + + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) + scaler.step(optimizer) + scaler.update() + + epoch_loss += loss.item() + + train_loss = epoch_loss / len(train_loader) + train_eval_error = evaluate_split_loss(model, train_loader) + test_eval_error = evaluate_split_loss(model, test_loader) + + logging.info( + f"Epoch {epoch+1:3d}/{EPOCHS} train_loss={train_loss:.4f} " + f"train_eval_error={train_eval_error:.4f} test_eval_error={test_eval_error:.4f}" + ) + + if test_eval_error < best_loss: + best_loss = test_eval_error + torch.save(model, OUT_PATH.replace('.pth', f'_epoch_{epoch+1:03d}.pth')) + logging.info(f" → saved best model (test_eval_error={best_loss:.4f})") + + final_train_error = evaluate_split_loss(model, train_loader) + final_test_error = evaluate_split_loss(model, test_loader) + logging.info(f"Final train error: {final_train_error:.4f}") + logging.info(f"Final test error: {final_test_error:.4f}") + logging.info(f"Training complete. Best test error: {best_loss:.4f}") + logging.info(f"Model saved to {OUT_PATH}") + + +if __name__ == '__main__': + main() diff --git a/scripts/finetune_inbolt_depthrs.py b/scripts/finetune_inbolt_depthrs.py new file mode 100644 index 0000000..22f06d0 --- /dev/null +++ b/scripts/finetune_inbolt_depthrs.py @@ -0,0 +1,553 @@ +""" +Fine-tune FastFoundationStereo + RealSense depth fusion on the Inbolt dataset. 
+ +Extends finetune_inbolt.py with three-stage depth fusion as specified in +docs/superpowers/specs/2026-04-30-depth-fusion-design.md: + + Stage 1 - DepthEncoder features fused into left image features (early) + Stage 2 - Disparity prior Gaussian bias on cost-volume logits (mid) + Stage 3 - Learned blend of stereo soft-argmax and depth prior for GRU init (late) + +The Inbolt dataset provides: + - realsense/{idx}/mono0.png : left IR image (uint8, 480x640) + - realsense/{idx}/mono1.png : right IR image (uint8, 480x640) + - realsense/{idx}/depthmap_mm.png : RealSense depth in mm ← fusion input + - zivid/{idx}/depthmap_mm.png : GT depth in mm (Zivid scanner) + +Freezing strategy: + Frozen : EdgeNeXt backbone (stereo.feature.stem, stereo.feature.stages) + Full LR : DepthEncoder, DepthFusionModule, DepthInitBlend, depth_sigma, depth_prior_scale + 0.1x LR : all other trainable stereo parameters + +Usage: + cd /path/to/Fast-FoundationStereo + python scripts/finetune_inbolt_depthrs.py +""" + +import os, sys, logging +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +import cv2 +from torch.utils.data import Dataset, DataLoader, random_split +from core.utils.utils import InputPadder +from core.submodule import ( + BasicConv, + disparity_regression, + build_gwc_volume_optimized_pytorch1, + build_concat_volume_optimized_pytorch1, +) +from core.geometry import Combined_Geo_Encoding_Volume +from core.foundation_stereo import normalize_image +import Utils as U +from scripts.data_manager_inbolt import DataSource + + +# ── constants ──────────────────────────────────────────────────────────────── + +INBOLT_DIR = r'/mnt/algonas/Local/Data/new_depth_stereo_datasets/Inbolt_datasets/Data Collection-20260415T084601Z-3-001/Data Collection' +MODEL_PATH = 
class DepthEncoder(nn.Module):
    """Strided-convolution encoder for a 2-channel (normalised depth, validity) map.

    Produces four feature maps, one per stage after the stem; their channel
    counts are listed in ``depth_chans``.
    """
    depth_chans = [32, 64, 96, 128]

    def __init__(self):
        super().__init__()
        # Every layer is a 3x3, stride-2 convolution block with BN and ReLU enabled.
        conv_args = dict(kernel_size=3, stride=2, padding=1, bn=True, relu=True)
        self.stem = BasicConv(2, 32, **conv_args)
        self.stage1 = BasicConv(32, 32, **conv_args)
        self.stage2 = BasicConv(32, 64, **conv_args)
        self.stage3 = BasicConv(64, 96, **conv_args)
        self.stage4 = BasicConv(96, 128, **conv_args)

    def forward(self, x):
        """Encode ``x`` (channel 0: normalised depth, channel 1: validity mask).

        Returns:
            A list ``[d4, d8, d16, d32]`` of stage outputs, each multiplied by
            the validity mask max-pooled down to that stage's resolution.
        """
        # Validity at 1/2 resolution; it only seeds the pooling chain below,
        # the stem output itself is not masked.
        mask = F.max_pool2d(x[:, 1:2], kernel_size=2, stride=2)
        feat = self.stem(x)

        pyramid = []
        for stage in (self.stage1, self.stage2, self.stage3, self.stage4):
            # Halve the mask once more so it lines up with this stage's output.
            mask = F.max_pool2d(mask, kernel_size=2, stride=2)
            feat = stage(feat) * mask
            pyramid.append(feat)

        return pyramid
class DepthInitBlend(nn.Module):
    """Small CNN that mixes the stereo initial disparity with the depth prior.

    The last layer starts with zero weights and a bias of -5, so the initial
    blend weight is ``sigmoid(-5)`` everywhere.
    """

    def __init__(self):
        super().__init__()
        layers = [
            BasicConv(3, 16, kernel_size=3, padding=1, bn=True, relu=True),
            BasicConv(16, 16, kernel_size=3, padding=1, bn=True, relu=True),
            nn.Conv2d(16, 1, kernel_size=1),
        ]
        gate_head = layers[-1]
        nn.init.zeros_(gate_head.weight)
        nn.init.constant_(gate_head.bias, -5.0)
        self.net = nn.Sequential(*layers)

    def forward(self, stereo_init, disp_prior, validity):
        """Blend the two disparity maps per pixel.

        All three inputs are single-channel maps of the same size; they are
        stacked on the channel axis to predict the blend weight. The weight is
        zeroed where ``validity`` is 0, so the stereo estimate is kept there.
        """
        stacked = torch.cat((stereo_init, disp_prior, validity), dim=1)
        gate = torch.sigmoid(self.net(stacked)) * validity
        return gate * disp_prior + (1.0 - gate) * stereo_init
def forward(
    self,
    image1,
    image2,
    depth_rs_mm=None,
    focal=FOCAL_PX,
    baseline_mm=BASELINE_MM,
    iters=12,
    test_mode=False,
    low_memory=False,
    optimize_build_volume='pytorch1',
):
    """Run stereo matching, optionally guided by a RealSense depth map.

    Args:
        image1: Left image batch, unpacked as ``(B, C, H, W)``.
        image2: Right image batch; concatenated with ``image1`` on the batch axis.
        depth_rs_mm: Optional depth map in millimetres. When ``None`` the three
            depth-fusion stages are skipped and only the wrapped stereo modules run.
        focal: Focal length in pixels used to turn depth into a disparity prior.
        baseline_mm: Stereo baseline in millimetres used for the same conversion.
        iters: Number of GRU refinement iterations.
        test_mode: When true, only the last iteration is upsampled and returned.
        low_memory: Forwarded to the geometry lookup; also enabled when
            ``self.args`` has a truthy ``low_memory`` entry.
        optimize_build_volume: Not read anywhere in this method.

    Returns:
        ``disp_up`` of the final iteration when ``test_mode`` is true, otherwise
        the tuple ``(init_disp, disp_preds)`` with one upsampled prediction per
        iteration.
    """
    # NOTE(review): with test_mode=True and iters=0 the loop body never runs and
    # `disp_up` is unbound at the return below — confirm callers always pass iters >= 1.
    s = self.stereo   # shorthand
    B, C, H, W = image1.shape
    low_memory = low_memory or self.args.get('low_memory', False)

    image1 = normalize_image(image1)
    image2 = normalize_image(image2)

    with torch.amp.autocast('cuda', enabled=self.args.mixed_precision, dtype=U.AMP_DTYPE):

        # ── feature extraction ────────────────────────────────────
        # Both images go through the backbone in one batch; the first B
        # entries of each output belong to the left image.
        out = s.feature(torch.cat([image1, image2], dim=0))
        features_left = [o[:B] for o in out]
        features_right = [o[B:] for o in out]
        stem_2x = s.stem_2(image1)

        # ── Stage 1: depth feature fusion ─────────────────────────
        # Remain None when no depth is supplied; Stage 3 is skipped in that case.
        disp_prior_14 = None
        validity_14 = None

        if depth_rs_mm is not None:
            depth_rs_m, validity, enc_input = self._preprocess_depth(depth_rs_mm)
            depth_feats = self.depth_encoder(enc_input)
            features_left = self.depth_fusion(features_left, depth_feats)

        # ── cost volume (unchanged) ───────────────────────────────
        gwc_volume = build_gwc_volume_optimized_pytorch1(
            features_left[0], features_right[0],
            self.args.max_disp // 4, s.cv_group,
            normalize=self.args.normalize,
        )
        left_tmp = s.proj_cmb(features_left[0])
        right_tmp = s.proj_cmb(features_right[0])
        concat_volume = build_concat_volume_optimized_pytorch1(left_tmp, right_tmp, maxdisp=self.args.max_disp // 4)
        del left_tmp, right_tmp

        comb_volume = torch.cat([gwc_volume, concat_volume], dim=1)
        del concat_volume, gwc_volume

        comb_volume = s.corr_stem(comb_volume)
        comb_volume = s.corr_feature_att(comb_volume, features_left[0])
        comb_volume = s.cost_agg(comb_volume, features_left)

        # ── Stage 2: disparity prior bias on logits ───────────────
        logits = s.classifier(comb_volume).squeeze(1)   # (B, D, H/4, W/4)

        if depth_rs_mm is not None:
            D = logits.shape[1]
            # Use torch.where so invalid pixels get disp_prior=0.
            # depth_rs_m is in metres; baseline_mm is in mm → divide by 1000
            # for consistent units: focal(px) * baseline_m(m) / depth_m(m) = px.
            disp_prior = torch.where(
                validity > 0.5,
                (focal * baseline_mm / 1000.0) / depth_rs_m.clamp(min=1e-3),
                torch.zeros_like(depth_rs_m),
            )
            # Disparity values are divided by 4 to match the quarter-resolution grid.
            # NOTE(review): assumes the logits' spatial size equals (H // 4, W // 4),
            # i.e. padded inputs — confirm against the caller.
            disp_prior_14 = F.interpolate(disp_prior / 4.0, size=(H // 4, W // 4), mode='nearest')
            validity_14 = F.interpolate(validity, size=(H // 4, W // 4), mode='nearest')

            # Gaussian-shaped log-bias centred on the prior disparity at each pixel.
            d_idx = torch.arange(D, device=logits.device, dtype=logits.dtype).view(1, D, 1, 1)
            sigma = self.depth_sigma.abs().clamp(min=0.1)
            prior_bias = -0.5 * ((d_idx - disp_prior_14) / sigma) ** 2
            prior_bias = prior_bias * validity_14   # flat where depth invalid

            logits = logits + self.depth_prior_scale * prior_bias

        prob = F.softmax(logits, dim=1)
        stereo_init = disparity_regression(prob, self.args.max_disp // 4)   # (B,1,H/4,W/4)

        # ── Stage 3: GRU init blend ───────────────────────────────
        if depth_rs_mm is not None:
            init_disp = self.depth_init_blend(stereo_init, disp_prior_14, validity_14)
        else:
            init_disp = stereo_init

        # ── context network (unchanged) ───────────────────────────
        cnet_list = s.cnet(features_left[0], features_left[1], features_left[2])
        cnet_list = list(cnet_list)
        net_list = [torch.tanh(x[0]) for x in cnet_list]
        inp_list = [torch.relu(x[1]) for x in cnet_list]
        inp_list = [s.cam(x) * x for x in inp_list]
        att = [s.sam(x) for x in inp_list]

    # ── geometry encoding volume ──────────────────────────────────
    geo_fn = Combined_Geo_Encoding_Volume(
        features_left[0].to(s.dtype),
        features_right[0].to(s.dtype),
        comb_volume.to(s.dtype),
        num_levels=self.args.corr_levels,
    )
    b, c, h, w = features_left[0].shape
    # Per-pixel x coordinate, repeated over batch and rows.
    coords = torch.arange(w, dtype=torch.float, device=init_disp.device).reshape(1, 1, w, 1).repeat(b, h, 1, 1)
    disp = init_disp.to(s.dtype)
    disp_preds = []

    # These tensors are not used below; drop the references before the GRU loop.
    del comb_volume, features_left, features_right, cnet_list

    # ── GRU iterations (unchanged) ────────────────────────────────
    for itr in range(iters):
        # Detach so each iteration's gradient does not flow into earlier ones.
        disp = disp.detach()
        geo_feat = geo_fn(disp, coords, dx=s.dx, low_memory=low_memory)

        with torch.amp.autocast('cuda', enabled=self.args.mixed_precision, dtype=U.AMP_DTYPE):
            net_list, mask_feat_4, delta_disp = s.update_block(
                net_list, inp_list, geo_feat.to(s.dtype), disp, att
            )

        disp = disp + delta_disp.to(s.dtype)
        # In test mode only the final iteration is upsampled.
        if test_mode and itr < iters - 1:
            continue

        disp_up = self.upsample_disp(disp.to(s.dtype), mask_feat_4.to(s.dtype), stem_2x.to(s.dtype))
        disp_preds.append(disp_up)

    if test_mode:
        return disp_up

    return init_disp, disp_preds
def sequence_loss(disp_preds, disp_gt, valid, gamma=GAMMA):
    """Return the RAFT-style weighted sum of smooth-L1 losses over GRU iterations.

    Args:
        disp_preds: Sequence of predicted disparity tensors, ordered from the
            first GRU iteration to the last.
        disp_gt: Ground-truth disparity tensor. It is resampled with
            nearest-neighbour interpolation when a prediction has a different
            spatial size.
        valid: Boolean mask with the same layout as ``disp_gt``; only pixels
            where it is True are supervised.
        gamma: Decay factor. Prediction ``i`` of ``n`` is weighted by
            ``gamma ** (n - 1 - i)``, so the final prediction has weight 1.

    Returns:
        Always a 0-dim tensor. When no prediction has a single valid pixel the
        result is a zero that is still connected to the predictions, so the
        caller's ``loss.item()`` and ``scaler.scale(loss).backward()`` keep
        working instead of failing on a Python float.
    """
    n = len(disp_preds)
    loss = 0.0
    for i, pred in enumerate(disp_preds):
        w = gamma ** (n - 1 - i)
        gt = disp_gt
        v = valid
        if pred.shape[-2:] != gt.shape[-2:]:
            gt = F.interpolate(gt, size=pred.shape[-2:], mode='nearest')
            v = F.interpolate(valid.float(), size=pred.shape[-2:], mode='nearest').bool()
        if not v.any():
            # Mean-reduced smooth-L1 over an empty selection is NaN; skip it.
            continue
        loss = loss + w * F.smooth_l1_loss(pred[v], gt[v])

    if isinstance(loss, torch.Tensor):
        return loss
    if disp_preds:
        # Nothing was supervised: return a zero that keeps the autograd graph.
        return disp_preds[-1].sum() * 0.0
    return torch.zeros((), device=disp_gt.device, dtype=disp_gt.dtype)


def evaluate_split_loss(model, dataloader):
    """Evaluate the average sequence loss over a dataloader (no gradient updates).

    Args:
        model: Module whose ``forward(left, right, depth_rs_mm=..., iters=...,
            test_mode=False)`` returns ``(init_disp, disp_preds)``.
        dataloader: Yields ``(left, right, disp_gt, valid, depth_rs)`` batches.

    Returns:
        Mean per-batch loss as a float, or NaN when the dataloader is empty.

    The model is switched to eval mode for the pass and put back into the mode
    it was in before the call, even if evaluation raises. Batches are moved to
    the GPU with ``.cuda()``.
    """
    if len(dataloader) == 0:
        return float('nan')

    was_training = model.training
    model.eval()
    total_loss = 0.0

    try:
        with torch.no_grad():
            for left, right, disp_gt, valid, depth_rs in dataloader:
                left, right = left.cuda(), right.cuda()
                disp_gt, valid = disp_gt.cuda(), valid.cuda()
                depth_rs = depth_rs.cuda()

                # The network needs H and W divisible by 32; depth is padded
                # together with the images so it stays pixel-aligned.
                padder = InputPadder(left.shape, divis_by=32, force_square=False)
                left_p, right_p, depth_rs_p = padder.pad(left, right, depth_rs)

                with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
                    _init_disp, disp_preds = model.forward(
                        left_p, right_p, depth_rs_mm=depth_rs_p, iters=ITERS, test_mode=False
                    )
                    disp_preds = [padder.unpad(p) for p in disp_preds]
                    loss = sequence_loss(disp_preds, disp_gt, valid)

                total_loss += loss.item()
    finally:
        # Restore the caller's mode instead of unconditionally forcing train().
        model.train(was_training)

    return total_loss / len(dataloader)
def main():
    """Fine-tune the depth-fusion stereo wrapper on the Inbolt dataset.

    Steps, in order:
      1. Load the pickled base stereo model from ``MODEL_PATH`` and wrap it in
         ``FastFoundationStereoDepthRS``.
      2. Disable gradients for ``stereo.feature.stem`` and
         ``stereo.feature.stages``.
      3. Build an AdamW optimizer with two groups: the new depth modules at
         ``LR`` and every other trainable parameter at ``0.1 * LR``.
      4. Split the dataset into train/test with a seeded ``random_split``.
      5. Train for ``EPOCHS`` epochs with AMP, gradient clipping and the
         sequence loss; after each epoch evaluate both splits and save a
         checkpoint whenever the test loss improves.
      6. Log final errors and save the final model to ``OUT_PATH``.

    Requires a CUDA device (tensors and the model are moved with ``.cuda()``).
    """
    U.set_logging_format()
    U.set_seed(0)

    # ── load and wrap pretrained stereo model ─────────────────────────
    logging.info(f"Loading base stereo model from {MODEL_PATH}")
    # weights_only=False: the checkpoint is a full pickled model object, not a
    # state dict, so it must only be loaded from a trusted path.
    stereo_model = torch.load(MODEL_PATH, map_location='cuda', weights_only=False)
    model = FastFoundationStereoDepthRS(stereo_model).cuda()
    logging.info("Model wrapped with depth fusion modules.")

    # ── freeze EdgeNeXt backbone ──────────────────────────────────────
    # NOTE(review): requires_grad=False stops gradient updates only. If the
    # backbone contains BatchNorm layers they would still update running
    # statistics while model.train() is active — confirm.
    for param in model.stereo.feature.stem.parameters():
        param.requires_grad = False
    for param in model.stereo.feature.stages.parameters():
        param.requires_grad = False
    logging.info("EdgeNeXt backbone (stem + stages) frozen.")

    # ── two-group optimizer: new modules at full LR, rest at 0.1× LR ─
    new_params = (
        list(model.depth_encoder.parameters()) +
        list(model.depth_fusion.parameters()) +
        list(model.depth_init_blend.parameters()) +
        [model.depth_sigma, model.depth_prior_scale]
    )
    # Identity set so the second group excludes exactly the tensors above.
    new_param_ids = {id(p) for p in new_params}
    finetune_params = [
        p for p in model.parameters()
        if p.requires_grad and id(p) not in new_param_ids
    ]

    trainable_new = sum(p.numel() for p in new_params)
    trainable_finetune = sum(p.numel() for p in finetune_params)
    total = sum(p.numel() for p in model.parameters())
    # "frozen" is derived: everything that is in neither optimizer group.
    logging.info(
        f"Parameters — new (full LR): {trainable_new:,} "
        f"fine-tune (0.1× LR): {trainable_finetune:,} "
        f"frozen: {total - trainable_new - trainable_finetune:,} "
        f"total: {total:,}"
    )

    optimizer = torch.optim.AdamW(
        [
            {'params': new_params, 'lr': LR},
            {'params': finetune_params, 'lr': LR * 0.1},
        ],
        weight_decay=1e-4,
    )
    scaler = torch.amp.GradScaler('cuda')

    # ── dataset and splits ────────────────────────────────────────────
    dataset = InboltDepthDataset(INBOLT_DIR)
    n_total = len(dataset)
    if n_total < 2:
        raise RuntimeError(f"Need at least 2 samples for train/test split, got {n_total}.")

    # Clamp so both splits always contain at least one sample.
    n_train = min(max(1, int(round(TRAIN_RATIO * n_total))), n_total - 1)
    n_test = n_total - n_train

    # Dedicated generator: the split depends only on SPLIT_SEED.
    split_gen = torch.Generator().manual_seed(SPLIT_SEED)
    train_set, test_set = random_split(dataset, [n_train, n_test], generator=split_gen)

    train_loader = DataLoader(train_set, batch_size=1, shuffle=True, num_workers=0)
    test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0)

    logging.info(
        f"Split (seed={SPLIT_SEED}): total={n_total}, "
        f"train={len(train_set)} ({100.0*len(train_set)/n_total:.1f}%), "
        f"test={len(test_set)} ({100.0*len(test_set)/n_total:.1f}%)"
    )

    model.train()
    best_loss = float('inf')

    # ── training loop ─────────────────────────────────────────────────
    for epoch in range(EPOCHS):
        epoch_loss = 0.0

        for left, right, disp_gt, valid, depth_rs in train_loader:
            left, right = left.cuda(), right.cuda()
            disp_gt, valid = disp_gt.cuda(), valid.cuda()
            depth_rs = depth_rs.cuda()

            # Pad images and depth together so H and W are divisible by 32.
            padder = InputPadder(left.shape, divis_by=32, force_square=False)
            left_p, right_p, depth_rs_p = padder.pad(left, right, depth_rs)

            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
                _init_disp, disp_preds = model.forward(
                    left_p, right_p, depth_rs_mm=depth_rs_p, iters=ITERS, test_mode=False
                )
                # Undo the padding so predictions line up with the unpadded GT.
                disp_preds = [padder.unpad(p) for p in disp_preds]
                loss = sequence_loss(disp_preds, disp_gt, valid)

            scaler.scale(loss).backward()
            # Unscale before clipping so max_norm applies to the true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()

        # train_loss is the running average seen during optimisation;
        # the *_eval_error values are re-measured in eval mode after the epoch.
        train_loss = epoch_loss / len(train_loader)
        train_eval_error = evaluate_split_loss(model, train_loader)
        test_eval_error = evaluate_split_loss(model, test_loader)

        logging.info(
            f"Epoch {epoch+1:3d}/{EPOCHS} "
            f"train_loss={train_loss:.4f} "
            f"train_eval={train_eval_error:.4f} "
            f"test_eval={test_eval_error:.4f} "
            f"depth_sigma={model.depth_sigma.item():.3f} "
            f"depth_prior_scale={model.depth_prior_scale.item():.4f}"
        )

        # Checkpoint selection uses the test split; each improvement writes a
        # new per-epoch file (the whole model object is pickled).
        if test_eval_error < best_loss:
            best_loss = test_eval_error
            save_path = OUT_PATH.replace('.pth', f'_epoch_{epoch+1:03d}.pth')
            torch.save(model, save_path)
            logging.info(f" → saved best model (test_eval={best_loss:.4f}) to {save_path}")

    final_train = evaluate_split_loss(model, train_loader)
    final_test = evaluate_split_loss(model, test_loader)
    logging.info(f"Final train error: {final_train:.4f}")
    logging.info(f"Final test error: {final_test:.4f}")
    logging.info(f"Best test error: {best_loss:.4f}")
    # The final save is the last-epoch model, not necessarily the best one.
    torch.save(model, OUT_PATH)
    logging.info(f"Final model saved to {OUT_PATH}")
+ +Architecture (no DepthEncoder / no DepthFusionModule — feature extraction fully frozen): + + INPUTS + left_IR (B,3,H,W) + right_IR (B,3,H,W) + depth_rs_mm (B,1,H,W) + + FEATURE EXTRACTION [FROZEN — stereo.feature + stereo.stem_2] + EdgeNeXt backbone + FPN decoder + → feat_left[x4,x8,x16,x32], feat_right[x4,x8,x16,x32], stem_2x + + DEPTH PREPROCESSING [no learned params] + validity M = (depth_rs_mm > 0) & isfinite + depth_rs_m = depth_rs_mm / 1000.0 · M + disp_prior = focal·baseline_m / depth_rs_m (full-res pixels) + disp_prior_14 = disp_prior / 4 (1/4-scale pixels) + validity_14 = F.interpolate(M, H/4, W/4) + + COST VOLUME [fine-tune 0.1× LR] + feat_left[0]+feat_right[0] → GWC+Concat → hourglass → logits (B,D,H/4,W/4) + + STAGE 2 — GAUSSIAN LOGIT PRIOR [learnable σ, α — full LR] + prior_bias = −½·((d_idx − disp_prior_14) / σ)² · validity_14 + logits' = logits + α·prior_bias + → stereo_init = softmax → disparity_regression (B,1,H/4,W/4) + + STAGE 3a — GRU INIT BLEND [DepthInitBlend ~5K — full LR] + concat(stereo_init, disp_prior_14, validity_14) + → sigmoid (3-layer CNN) → blend_w · validity_14 + → init_disp = blend_w·disp_prior_14 + (1−blend_w)·stereo_init + + CONTEXT NETWORK + GRU ITERATIONS [fine-tune 0.1× LR] + init_disp → 8× GRU update → disp_up (B,1,H,W) + + STAGE 3b — OUTPUT BLEND [DepthOutputBlend ~8K — full LR] ← NEW + concat(disp_up, disp_prior_full, validity) + → sigmoid (3-layer CNN) → blend_w_out · validity + → disp_final = disp_up + blend_w_out·(disp_prior_full − disp_up) + + OUTPUT: disp_final (B,1,H,W) + +Freezing strategy: + Frozen : stereo.feature (all submodules), stereo.stem_2 + Full LR : DepthInitBlend, DepthOutputBlend, depth_sigma, depth_prior_scale + 0.1× LR : all other trainable stereo params + +Usage: + cd /path/to/Fast-FoundationStereo + python scripts/finetune_inbolt_depthrs_2.py +""" + +import os, sys, logging +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +sys.path.append(code_dir) + 
class DepthInitBlend(nn.Module):
    """Mix the stereo soft-argmax estimate with the depth prior (1/4 scale).

    The result seeds the GRU refinement. A small CNN predicts a per-pixel
    gate in [0, 1]; the gate is forced to 0 wherever the depth is invalid.
    """

    def __init__(self):
        super().__init__()
        layers = [
            BasicConv(3, 16, kernel_size=3, padding=1, bn=True, relu=True),
            BasicConv(16, 16, kernel_size=3, padding=1, bn=True, relu=True),
            nn.Conv2d(16, 1, kernel_size=1),
        ]
        # Zero weights and a strongly negative bias give sigmoid(-5) ≈ 0, so
        # the module starts out returning the stereo estimate unchanged.
        head = layers[-1]
        nn.init.zeros_(head.weight)
        nn.init.constant_(head.bias, -5.0)
        self.net = nn.Sequential(*layers)

    def forward(self, stereo_init, disp_prior, validity):
        """Return the gated blend; every input is (B, 1, H/4, W/4)."""
        stacked = torch.cat((stereo_init, disp_prior, validity), dim=1)
        gate = self.net(stacked).sigmoid() * validity
        return gate * disp_prior + (1.0 - gate) * stereo_init
class DepthOutputBlend(nn.Module):
    """Correct the full-resolution GRU output towards the depth prior.

    The correction is a gated residual: a small CNN predicts a per-pixel gate
    in [0, 1], masked to 0 where the depth is invalid. The gate starts near
    zero, so the module is initially a pass-through.
    """

    def __init__(self):
        super().__init__()
        layers = [
            BasicConv(3, 32, kernel_size=3, padding=1, bn=True, relu=True),
            BasicConv(32, 32, kernel_size=3, padding=1, bn=True, relu=True),
            nn.Conv2d(32, 1, kernel_size=1),
        ]
        # Zero weights plus bias -5 → sigmoid ≈ 0 → no correction at init.
        head = layers[-1]
        nn.init.zeros_(head.weight)
        nn.init.constant_(head.bias, -5.0)
        self.net = nn.Sequential(*layers)

    def forward(self, disp_stereo, disp_prior_full, validity):
        """Return the corrected disparity; every input is (B, 1, H, W)."""
        stacked = torch.cat((disp_stereo, disp_prior_full, validity), dim=1)
        gate = self.net(stacked).sigmoid() * validity
        residual = disp_prior_full - disp_stereo
        return disp_stereo + gate * residual
+ + New modules (no DepthEncoder / no DepthFusionModule): + depth_sigma, depth_prior_scale — Stage 2 Gaussian prior on logits + DepthInitBlend — Stage 3a GRU warm-start blend + DepthOutputBlend — Stage 3b full-resolution output correction + """ + + def __init__(self, stereo_model): + super().__init__() + self.stereo = stereo_model + self.args = stereo_model.args + + self.depth_init_blend = DepthInitBlend() + self.depth_output_blend = DepthOutputBlend() + + self.depth_sigma = nn.Parameter(torch.tensor(4.0)) + self.depth_prior_scale = nn.Parameter(torch.tensor(0.1)) + + # ------------------------------------------------------------------ + def _preprocess_depth(self, depth_rs_mm): + """ + Returns: + depth_rs_m (B,1,H,W) metres, zeros where invalid + validity (B,1,H,W) float 0/1 + disp_prior (B,1,H,W) full-res disparity in pixels, zeros where invalid + """ + validity = ((depth_rs_mm > 0) & torch.isfinite(depth_rs_mm)).float() + depth_rs_m = (depth_rs_mm / 1000.0) * validity + disp_prior = torch.where( + validity > 0.5, + (FOCAL_PX * BASELINE_MM / 1000.0) / depth_rs_m.clamp(min=1e-3), + torch.zeros_like(depth_rs_m), + ) + return depth_rs_m, validity, disp_prior + + # ------------------------------------------------------------------ + def upsample_disp(self, disp, mask_feat_4, stem_2x): + return self.stereo.upsample_disp(disp, mask_feat_4, stem_2x) + + # ------------------------------------------------------------------ + def forward( + self, + image1, + image2, + depth_rs_mm=None, + iters=12, + test_mode=False, + low_memory=False, + ): + s = self.stereo + B, C, H, W = image1.shape + low_memory = low_memory or self.args.get('low_memory', False) + + image1 = normalize_image(image1) + image2 = normalize_image(image2) + + with torch.amp.autocast('cuda', enabled=self.args.mixed_precision, dtype=U.AMP_DTYPE): + + # ── feature extraction (frozen) ─────────────────────────── + out = s.feature(torch.cat([image1, image2], dim=0)) + features_left = [o[:B] for o in out] + 
features_right = [o[B:] for o in out] + stem_2x = s.stem_2(image1) + + # ── depth preprocessing ─────────────────────────────────── + disp_prior_14 = None + validity_14 = None + disp_prior_full = None + validity_full = None + + if depth_rs_mm is not None: + depth_rs_m, validity_full, disp_prior_full = self._preprocess_depth(depth_rs_mm) + disp_prior_14 = F.interpolate(disp_prior_full / 4.0, size=(H // 4, W // 4), mode='nearest') + validity_14 = F.interpolate(validity_full, size=(H // 4, W // 4), mode='nearest') + + # ── cost volume ─────────────────────────────────────────── + gwc_volume = build_gwc_volume_optimized_pytorch1( + features_left[0], features_right[0], + self.args.max_disp // 4, s.cv_group, + normalize=self.args.normalize, + ) + left_tmp = s.proj_cmb(features_left[0]) + right_tmp = s.proj_cmb(features_right[0]) + concat_volume = build_concat_volume_optimized_pytorch1( + left_tmp, right_tmp, maxdisp=self.args.max_disp // 4 + ) + del left_tmp, right_tmp + + comb_volume = torch.cat([gwc_volume, concat_volume], dim=1) + del concat_volume, gwc_volume + + comb_volume = s.corr_stem(comb_volume) + comb_volume = s.corr_feature_att(comb_volume, features_left[0]) + comb_volume = s.cost_agg(comb_volume, features_left) + + # ── Stage 2: Gaussian prior on logits ───────────────────── + logits = s.classifier(comb_volume).squeeze(1) # (B, D, H/4, W/4) + + if depth_rs_mm is not None: + D = logits.shape[1] + d_idx = torch.arange(D, device=logits.device, dtype=logits.dtype).view(1, D, 1, 1) + sigma = self.depth_sigma.abs().clamp(min=0.1) + prior_bias = -0.5 * ((d_idx - disp_prior_14) / sigma) ** 2 + prior_bias = prior_bias * validity_14 + logits = logits + self.depth_prior_scale * prior_bias + + prob = F.softmax(logits, dim=1) + stereo_init = disparity_regression(prob, self.args.max_disp // 4) # (B,1,H/4,W/4) + + # ── Stage 3a: GRU init blend ────────────────────────────── + if depth_rs_mm is not None: + init_disp = self.depth_init_blend(stereo_init, disp_prior_14, 
validity_14) + else: + init_disp = stereo_init + + # ── context network ─────────────────────────────────────── + cnet_list = s.cnet(features_left[0], features_left[1], features_left[2]) + cnet_list = list(cnet_list) + net_list = [torch.tanh(x[0]) for x in cnet_list] + inp_list = [torch.relu(x[1]) for x in cnet_list] + inp_list = [s.cam(x) * x for x in inp_list] + att = [s.sam(x) for x in inp_list] + + # ── geometry encoding volume ────────────────────────────────── + geo_fn = Combined_Geo_Encoding_Volume( + features_left[0].to(s.dtype), + features_right[0].to(s.dtype), + comb_volume.to(s.dtype), + num_levels=self.args.corr_levels, + ) + b, c, h, w = features_left[0].shape + coords = torch.arange(w, dtype=torch.float, device=init_disp.device).reshape(1, 1, w, 1).repeat(b, h, 1, 1) + disp = init_disp.to(s.dtype) + disp_preds = [] + + del comb_volume, features_left, features_right, cnet_list + + # ── GRU iterations ──────────────────────────────────────────── + for itr in range(iters): + disp = disp.detach() + geo_feat = geo_fn(disp, coords, dx=s.dx, low_memory=low_memory) + + with torch.amp.autocast('cuda', enabled=self.args.mixed_precision, dtype=U.AMP_DTYPE): + net_list, mask_feat_4, delta_disp = s.update_block( + net_list, inp_list, geo_feat.to(s.dtype), disp, att + ) + + disp = disp + delta_disp.to(s.dtype) + if test_mode and itr < iters - 1: + continue + + disp_up = self.upsample_disp(disp.to(s.dtype), mask_feat_4.to(s.dtype), stem_2x.to(s.dtype)) + disp_preds.append(disp_up) + + # ── Stage 3b: output blend (full resolution) ────────────────── + if depth_rs_mm is not None: + disp_preds = [ + self.depth_output_blend(p, disp_prior_full, validity_full) + for p in disp_preds + ] + + if test_mode: + return disp_preds[-1] + + return init_disp, disp_preds + + +# ── dataset ─────────────────────────────────────────────────────────────────── + +class InboltDepthDataset(Dataset): + """Returns (left, right, disp_gt, valid, depth_rs_mm) for each sample.""" + + def 
def sequence_loss(disp_preds, disp_gt, valid, gamma=GAMMA):
    """Return the RAFT-style weighted smooth-L1 sum over GRU iterations.

    Args:
        disp_preds: Sequence of predicted disparity tensors, first iteration
            to last.
        disp_gt: Ground-truth disparity; resampled (nearest) when a prediction
            has a different spatial size.
        valid: Boolean mask selecting the supervised pixels.
        gamma: Decay factor; prediction ``i`` of ``n`` gets weight
            ``gamma ** (n - 1 - i)``.

    Returns:
        Always a 0-dim tensor. If no prediction has a valid pixel the result is
        a zero still attached to the predictions, so ``.item()`` and
        ``backward()`` in the callers do not fail on a Python float.
    """
    n = len(disp_preds)
    loss = 0.0
    for i, pred in enumerate(disp_preds):
        w = gamma ** (n - 1 - i)
        gt = disp_gt
        v = valid
        if pred.shape[-2:] != gt.shape[-2:]:
            gt = F.interpolate(gt, size=pred.shape[-2:], mode='nearest')
            v = F.interpolate(valid.float(), size=pred.shape[-2:], mode='nearest').bool()
        if not v.any():
            # Mean-reduced smooth-L1 over an empty selection is NaN; skip it.
            continue
        loss = loss + w * F.smooth_l1_loss(pred[v], gt[v])

    if isinstance(loss, torch.Tensor):
        return loss
    if disp_preds:
        # Nothing was supervised: return a zero that keeps the autograd graph.
        return disp_preds[-1].sum() * 0.0
    return torch.zeros((), device=disp_gt.device, dtype=disp_gt.dtype)


def evaluate_split_loss(model, dataloader):
    """Return the mean sequence loss of *model* over *dataloader* without training.

    Args:
        model: Module whose ``forward(left, right, depth_rs_mm=..., iters=...,
            test_mode=False)`` returns ``(init_disp, disp_preds)``.
        dataloader: Yields ``(left, right, disp_gt, valid, depth_rs)`` batches.

    Returns:
        Mean per-batch loss as a float, or NaN when the dataloader is empty.

    The model runs in eval mode under ``torch.no_grad()`` and is put back into
    whatever mode it was in before the call, even if evaluation raises.
    """
    if len(dataloader) == 0:
        return float('nan')

    was_training = model.training
    model.eval()
    total = 0.0
    try:
        with torch.no_grad():
            for left, right, disp_gt, valid, depth_rs in dataloader:
                left, right = left.cuda(), right.cuda()
                disp_gt, valid = disp_gt.cuda(), valid.cuda()
                depth_rs = depth_rs.cuda()

                # Pad images and depth together so they stay pixel-aligned
                # and H, W become divisible by 32.
                padder = InputPadder(left.shape, divis_by=32, force_square=False)
                left_p, right_p, dr_p = padder.pad(left, right, depth_rs)

                with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
                    _init, preds = model.forward(left_p, right_p, depth_rs_mm=dr_p,
                                                 iters=ITERS, test_mode=False)
                    preds = [padder.unpad(p) for p in preds]
                    total += sequence_loss(preds, disp_gt, valid).item()
    finally:
        # Restore the caller's mode instead of unconditionally forcing train().
        model.train(was_training)

    return total / len(dataloader)
f"fine-tune (0.1× LR): {n_ft:,} " + f"frozen: {n_total - n_new - n_ft:,} " + f"total: {n_total:,}" + ) + + optimizer = torch.optim.AdamW( + [ + {'params': new_params, 'lr': LR}, + {'params': finetune_params, 'lr': LR * 0.1}, + ], + weight_decay=1e-4, + ) + scaler = torch.amp.GradScaler('cuda') + + # ── dataset ─────────────────────────────────────────────────────── + dataset = InboltDepthDataset(INBOLT_DIR) + n_total_data = len(dataset) + if n_total_data < 2: + raise RuntimeError(f"Need at least 2 samples, got {n_total_data}.") + + n_train = min(max(1, int(round(TRAIN_RATIO * n_total_data))), n_total_data - 1) + n_test = n_total_data - n_train + + split_gen = torch.Generator().manual_seed(SPLIT_SEED) + train_set, test_set = random_split(dataset, [n_train, n_test], generator=split_gen) + + train_loader = DataLoader(train_set, batch_size=1, shuffle=True, num_workers=0) + test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0) + + logging.info( + f"Split (seed={SPLIT_SEED}): total={n_total_data}, " + f"train={len(train_set)} ({100.0*len(train_set)/n_total_data:.1f}%), " + f"test={len(test_set)} ({100.0*len(test_set)/n_total_data:.1f}%)" + ) + + model.train() + best_loss = float('inf') + + # ── training loop ───────────────────────────────────────────────── + for epoch in range(EPOCHS): + epoch_loss = 0.0 + + for left, right, disp_gt, valid, depth_rs in train_loader: + left, right = left.cuda(), right.cuda() + disp_gt, valid = disp_gt.cuda(), valid.cuda() + depth_rs = depth_rs.cuda() + + padder = InputPadder(left.shape, divis_by=32, force_square=False) + left_p, right_p, dr_p = padder.pad(left, right, depth_rs) + + optimizer.zero_grad(set_to_none=True) + + with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE): + _init, preds = model.forward( + left_p, right_p, depth_rs_mm=dr_p, iters=ITERS, test_mode=False + ) + preds = [padder.unpad(p) for p in preds] + loss = sequence_loss(preds, disp_gt, valid) + + scaler.scale(loss).backward() + 
scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) + scaler.step(optimizer) + scaler.update() + + epoch_loss += loss.item() + + train_loss = epoch_loss / len(train_loader) + train_eval = evaluate_split_loss(model, train_loader) + test_eval = evaluate_split_loss(model, test_loader) + + logging.info( + f"Epoch {epoch+1:3d}/{EPOCHS} " + f"train_loss={train_loss:.4f} " + f"train_eval={train_eval:.4f} " + f"test_eval={test_eval:.4f} " + f"depth_sigma={model.depth_sigma.item():.3f} " + f"depth_prior_scale={model.depth_prior_scale.item():.4f}" + ) + + if test_eval < best_loss: + best_loss = test_eval + save_path = OUT_PATH.replace('.pth', f'_epoch_{epoch+1:03d}.pth') + torch.save(model, save_path) + logging.info(f" → saved best model (test_eval={best_loss:.4f}) to {save_path}") + + final_train = evaluate_split_loss(model, train_loader) + final_test = evaluate_split_loss(model, test_loader) + logging.info(f"Final train error: {final_train:.4f}") + logging.info(f"Final test error: {final_test:.4f}") + logging.info(f"Best test error: {best_loss:.4f}") + torch.save(model, OUT_PATH) + logging.info(f"Final model saved to {OUT_PATH}") + + +if __name__ == '__main__': + main() diff --git a/scripts/finetune_inbolt_planes.py b/scripts/finetune_inbolt_planes.py new file mode 100644 index 0000000..968199d --- /dev/null +++ b/scripts/finetune_inbolt_planes.py @@ -0,0 +1,307 @@ +""" +Fine-tune FastFoundationStereo on the Inbolt dataset. + +The Inbolt dataset provides: + - realsense/{idx}/mono0.png : left IR image (uint8, 480x640) + - realsense/{idx}/mono1.png : right IR image (uint8, 480x640) + - zivid/{idx}/depthmap_mm.png: ground-truth depth in mm (Zivid scanner, 1024x1224) + +Strategy: + - Freeze the ViT-L backbone (model.feature) to prevent overfitting on small datasets. + - Train everything else with RAFT-style sequence loss over GRU iterations. + - IR uint8 images are replicated to 3 channels. 
def measure_variability(img, levele_num=2):
    """Return the local value range (max - min) of *img* as a float32 map.

    The image is reduced ``levele_num`` times with ``cv2.pyrDown``; a 7x7
    minimum (erode) and maximum (dilate) filter are applied at that scale, and
    their absolute difference is brought back to the input resolution.

    Args:
        img: Single-channel image as a NumPy array, e.g. a depth or disparity
            map.
        levele_num: Number of pyramid levels to go down before filtering. The
            name is kept as is because callers pass it by keyword.

    Returns:
        float32 array with the same height and width as *img*; each pixel
        holds the max-min range of its neighbourhood.

    The computation stays in float32. The earlier ``np.uint8`` cast did not
    preserve values above 255 (depth in millimetres routinely exceeds that),
    which produced spurious ranges wherever the data crossed a multiple of 256.
    """
    work = np.asarray(img, dtype=np.float32)
    height, width = work.shape[:2]

    for _ in range(levele_num):
        work = cv2.pyrDown(work)

    # Structuring element for the 7x7 min/max filters.
    kernel = np.ones((7, 7), np.uint8)
    local_min = cv2.erode(work, kernel)
    local_max = cv2.dilate(work, kernel)
    max_diff = cv2.absdiff(local_max, local_min)

    for _ in range(levele_num):
        max_diff = cv2.pyrUp(max_diff)

    # Down/up-sampling does not reproduce odd sizes exactly; snap back to the
    # input size. cv2.resize takes (width, height).
    max_diff = cv2.resize(max_diff, (width, height))

    return max_diff.astype(np.float32)


def find_flat_regions(disp_gt, valid, max_variability=50.0):
    """Restrict *valid* to pixels whose neighbourhood has a small value range.

    No plane fitting is performed: a pixel is kept when it is already valid
    and the local max-min range reported by ``measure_variability`` is below
    ``max_variability``.

    Args:
        disp_gt: (H, W) NumPy map to analyse. Despite the name, any
            single-channel map works; the threshold is in the units of this
            map.
        valid: (H, W) boolean NumPy mask.
        max_variability: Exclusive upper bound on the local range.

    Returns:
        Boolean mask of the same shape as *valid*.
    """
    variability = measure_variability(disp_gt, levele_num=2)
    return valid & (variability < max_variability)
def sequence_loss(disp_preds, disp_gt, valid, gamma=GAMMA):
    """Return the RAFT-style weighted sum of smooth-L1 losses over GRU iterations.

    Args:
        disp_preds: Sequence of predicted disparity tensors, first iteration
            to last.
        disp_gt: Ground-truth disparity; resampled (nearest) when a prediction
            has a different spatial size.
        valid: Boolean mask selecting the supervised pixels.
        gamma: Decay factor; prediction ``i`` of ``n`` gets weight
            ``gamma ** (n - 1 - i)``, so the last prediction has weight 1.

    Returns:
        Always a 0-dim tensor. Predictions whose mask selects no pixel are
        skipped, because a mean-reduced smooth-L1 over an empty selection is
        NaN and would poison the whole loss. If nothing was supervised, the
        result is a zero still attached to the predictions, so ``.item()``
        and ``backward()`` in the callers keep working.
    """
    n = len(disp_preds)
    loss = 0.0
    for i, pred in enumerate(disp_preds):
        w = gamma ** (n - 1 - i)
        gt = disp_gt
        v = valid
        if pred.shape[-2:] != gt.shape[-2:]:
            gt = F.interpolate(gt, size=pred.shape[-2:], mode='nearest')
            v = F.interpolate(valid.float(), size=pred.shape[-2:], mode='nearest').bool()
        if not v.any():
            # The flat-region filter can leave a sample without valid pixels.
            continue
        loss = loss + w * F.smooth_l1_loss(pred[v], gt[v])

    if isinstance(loss, torch.Tensor):
        return loss
    if disp_preds:
        # Nothing was supervised: return a zero that keeps the autograd graph.
        return disp_preds[-1].sum() * 0.0
    return torch.zeros((), device=disp_gt.device, dtype=disp_gt.dtype)
def main():
    """Fine-tune the stereo model on the Inbolt dataset and checkpoint the best epoch.

    Steps: load the serialized model, freeze ``model.feature``, split the
    dataset into train/test with a seeded generator, train for ``EPOCHS``
    epochs with AMP and gradient clipping, and save a checkpoint each time the
    test loss improves. Checkpoints are written next to ``OUT_PATH`` with an
    ``_epoch_NNN`` suffix; ``OUT_PATH`` itself is never written.

    Raises:
        RuntimeError: If the dataset holds fewer than two samples.
    """
    U.set_logging_format()
    U.set_seed(0)

    # load full model object (weights + architecture)
    # NOTE(review): weights_only=False unpickles arbitrary objects — only load
    # checkpoints from a trusted source.
    logging.info(f"Loading model from {MODEL_PATH}")
    model = torch.load(MODEL_PATH, map_location='cuda', weights_only=False)

    # freeze the ViT-L backbone — with only 24 samples it would overfit
    for param in model.feature.parameters():
        param.requires_grad = False
    logging.info("ViT backbone frozen.")

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    logging.info(f"Trainable: {trainable:,} / {total:,} parameters")

    model = model.cuda().train()
    logging.info("Model on single GPU.")

    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad], lr=LR, weight_decay=1e-4
    )
    scaler = torch.amp.GradScaler('cuda')

    dataset = InboltDataset(INBOLT_DIR, train_mode=True)
    n_total = len(dataset)

    if n_total < 2:
        raise RuntimeError(f"Need at least 2 samples for a 75/25 train/test split, got {n_total}.")

    # Keep at least one sample on each side of the split.
    n_train = int(round(TRAIN_RATIO * n_total))
    n_train = min(max(1, n_train), n_total - 1)
    n_test = n_total - n_train

    split_generator = torch.Generator().manual_seed(SPLIT_SEED)
    train_set, test_set = random_split(dataset, [n_train, n_test], generator=split_generator)

    train_loader = DataLoader(train_set, batch_size=1, shuffle=True, num_workers=0)
    test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0)

    logging.info(
        f"Random split with seed={SPLIT_SEED}: total={n_total}, train={len(train_set)} ({100.0*len(train_set)/n_total:.1f}%), "
        f"test={len(test_set)} ({100.0*len(test_set)/n_total:.1f}%)"
    )

    best_loss = float('inf')
    best_ckpt = None  # path of the most recently saved (best) checkpoint

    for epoch in range(EPOCHS):
        epoch_loss = 0.0

        for left, right, disp_gt, valid in train_loader:
            left, right = left.cuda(), right.cuda()
            disp_gt, valid = disp_gt.cuda(), valid.cuda()

            # pad so H and W are divisible by 32
            padder = InputPadder(left.shape, divis_by=32, force_square=False)
            left_p, right_p = padder.pad(left, right)

            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
                _init_disp, disp_preds = model.forward(
                    left_p, right_p, iters=ITERS, test_mode=False
                )
                disp_preds = [padder.unpad(p) for p in disp_preds]
                loss = sequence_loss(disp_preds, disp_gt, valid)

            # Unscale before clipping so max_norm applies to the true gradients.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()

        train_loss = epoch_loss / len(train_loader)
        train_eval_error = evaluate_split_loss(model, train_loader)
        test_eval_error = evaluate_split_loss(model, test_loader)

        logging.info(
            f"Epoch {epoch+1:3d}/{EPOCHS} train_loss={train_loss:.4f} "
            f"train_eval_error={train_eval_error:.4f} test_eval_error={test_eval_error:.4f}"
        )

        if test_eval_error < best_loss:
            best_loss = test_eval_error
            best_ckpt = OUT_PATH.replace('.pth', f'_epoch_{epoch+1:03d}.pth')
            torch.save(model, best_ckpt)
            logging.info(f" → saved best model (test_eval_error={best_loss:.4f})")

    final_train_error = evaluate_split_loss(model, train_loader)
    final_test_error = evaluate_split_loss(model, test_loader)
    logging.info(f"Final train error: {final_train_error:.4f}")
    logging.info(f"Final test error: {final_test_error:.4f}")
    logging.info(f"Training complete. Best test error: {best_loss:.4f}")
    # Report the file that was actually written; nothing is saved at OUT_PATH
    # itself, and no file exists at all if the test loss never improved
    # (e.g. it was NaN every epoch).
    if best_ckpt is not None:
        logging.info(f"Best model saved to {best_ckpt}")
    else:
        logging.warning("No checkpoint was saved: test error never improved.")
def measure_variability(img, levele_num=2, kernel_size=7):
    """Estimate the local value range (max - min) around every pixel.

    The image is downsampled ``levele_num`` times with ``cv2.pyrDown``, the
    min and max inside a ``kernel_size`` x ``kernel_size`` window are taken by
    erosion and dilation, and their absolute difference is upsampled back to
    the input resolution.

    The computation runs in float32. The previous implementation cast the
    downsampled image to ``uint8``, which cannot represent inputs above 255
    (e.g. depth in millimetres) and therefore corrupted the measured range.

    Args:
        img: 2-D NumPy array (H, W), e.g. a depth or disparity map.
        levele_num: Number of pyramid levels to go down before filtering
            (keyword name kept for backward compatibility).
        kernel_size: Side length of the square min/max window, in pixels of
            the downsampled image.

    Returns:
        float32 array of shape (H, W) with the local max-min range, in the
        same units as ``img``.
    """
    height, width = img.shape[:2]

    # Work in float32 so large values (depth in mm) are neither wrapped nor quantized.
    work = np.asarray(img, dtype=np.float32)
    for _ in range(levele_num):
        work = cv2.pyrDown(work)

    # Structuring element of ones: erosion gives the window min, dilation the max.
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    min_values = cv2.erode(work, kernel)
    max_values = cv2.dilate(work, kernel)

    max_diff = cv2.absdiff(max_values, min_values)

    for _ in range(levele_num):
        max_diff = cv2.pyrUp(max_diff)

    # pyrDown/pyrUp round sizes, so snap back to the exact input size (cv2 wants (W, H)).
    max_diff = cv2.resize(max_diff, (width, height))

    return max_diff.astype(np.float32)
def sequence_loss(depth_preds, depth_gt, valid, gamma=GAMMA):
    """RAFT-style weighted sum of smooth-L1 losses over GRU iterations.

    Later predictions get larger weights: prediction ``i`` of ``n`` is
    weighted by ``gamma ** (n - 1 - i)``.

    Args:
        depth_preds: Sequence of predicted depth tensors, one per iteration.
        depth_gt: Ground-truth depth tensor.
        valid: Boolean mask selecting the pixels that contribute to the loss.
        gamma: Exponential decay factor for earlier iterations.

    Returns:
        The weighted loss. A prediction whose mask selects no pixel contributes
        a zero that is still attached to the autograd graph, so the result
        stays finite and ``backward()`` remains callable.
    """
    n = len(depth_preds)
    loss = 0.0
    for i, pred in enumerate(depth_preds):
        w = gamma ** (n - 1 - i)
        gt = depth_gt
        v = valid
        if pred.shape[-2:] != gt.shape[-2:]:
            # Bring ground truth and mask to the prediction's resolution.
            gt = F.interpolate(gt, size=pred.shape[-2:], mode='nearest')
            v = F.interpolate(valid.float(), size=pred.shape[-2:], mode='nearest').bool()
        if v.any():
            loss = loss + w * F.smooth_l1_loss(pred[v], gt[v])
        else:
            # The flat-region filter can leave no valid pixel; the mean over an
            # empty selection is NaN, so add a graph-connected zero instead.
            loss = loss + pred.sum() * 0.0
    return loss
class DepthHead(nn.Module):
    """Convert predicted disparity to depth via ``depth = BF / max(disparity, eps)``.

    ``weight`` is a single learnable scalar intended as an additive correction
    to BF (focal * baseline). It is registered so the parameter set and
    state-dict layout stay stable, but ``forward`` currently does NOT apply
    it: depth is computed from the fixed ``bf`` buffer only.

    Args:
        bf: Focal length (px) times baseline, stored as a non-trainable buffer.
        eps: Lower clamp applied to the disparity before the division. The
            default of 1.0 reproduces the value the head always used before
            this argument was honoured, and caps the depth at ``bf``.
    """

    def __init__(self, bf: float, eps: float = 1.0):
        super().__init__()
        self.register_buffer('bf', torch.tensor(float(bf)))
        self.weight = nn.Parameter(torch.zeros(1))
        # Honour the constructor argument (it used to be ignored).
        self.eps = eps

    def forward(self, disparity: torch.Tensor) -> torch.Tensor:
        """Return ``bf / disparity`` with the disparity clamped to at least ``eps``."""
        # The learnable BF correction (self.weight) is intentionally not applied here.
        return self.bf / disparity.clamp(min=self.eps)
def main():
    """Fine-tune the depth-wrapped stereo model on Inbolt and checkpoint the best epoch.

    Steps: load the serialized base model, wrap it in
    ``FastFoundationStereoWithDepth``, freeze ``model.feature``, split the
    dataset into train/test with a seeded generator, train for ``EPOCHS``
    epochs on the depth sequence loss with AMP and gradient clipping, and save
    a checkpoint each time the test loss improves. Checkpoints are written
    next to ``OUT_PATH`` with an ``_epoch_NNN`` suffix; ``OUT_PATH`` itself is
    never written.

    Raises:
        RuntimeError: If the dataset holds fewer than two samples.
    """
    U.set_logging_format()
    U.set_seed(0)

    # load full model object (weights + architecture)
    # NOTE(review): weights_only=False unpickles arbitrary objects — only load
    # checkpoints from a trusted source.
    logging.info(f"Loading model from {MODEL_PATH}")
    base_model = torch.load(MODEL_PATH, map_location='cuda', weights_only=False)

    model = FastFoundationStereoWithDepth(base_model)

    # freeze the ViT-L backbone — with only 24 samples it would overfit
    for param in model.feature.parameters():
        param.requires_grad = False
    logging.info("ViT backbone frozen.")

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    logging.info(f"Trainable: {trainable:,} / {total:,} parameters")

    model = model.cuda().train()
    logging.info("Model on single GPU.")

    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad], lr=LR, weight_decay=1e-4
    )
    scaler = torch.amp.GradScaler('cuda')

    dataset = InboltDataset(INBOLT_DIR, train_mode=True)
    n_total = len(dataset)

    if n_total < 2:
        raise RuntimeError(f"Need at least 2 samples for a 75/25 train/test split, got {n_total}.")

    # Keep at least one sample on each side of the split.
    n_train = int(round(TRAIN_RATIO * n_total))
    n_train = min(max(1, n_train), n_total - 1)
    n_test = n_total - n_train

    split_generator = torch.Generator().manual_seed(SPLIT_SEED)
    train_set, test_set = random_split(dataset, [n_train, n_test], generator=split_generator)

    train_loader = DataLoader(train_set, batch_size=1, shuffle=True, num_workers=0)
    test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0)

    logging.info(
        f"Random split with seed={SPLIT_SEED}: total={n_total}, train={len(train_set)} ({100.0*len(train_set)/n_total:.1f}%), "
        f"test={len(test_set)} ({100.0*len(test_set)/n_total:.1f}%)"
    )

    best_loss = float('inf')
    best_ckpt = None  # path of the most recently saved (best) checkpoint

    for epoch in range(EPOCHS):
        epoch_loss = 0.0

        for left, right, depth_gt, valid in train_loader:
            left, right = left.cuda(), right.cuda()
            depth_gt, valid = depth_gt.cuda(), valid.cuda()

            # pad so H and W are divisible by 32
            padder = InputPadder(left.shape, divis_by=32, force_square=False)
            left_p, right_p = padder.pad(left, right)

            optimizer.zero_grad(set_to_none=True)

            with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
                _init_disp, disp_preds, depth_preds = model.forward(
                    left_p, right_p, iters=ITERS, test_mode=False
                )
                depth_preds = [padder.unpad(p) for p in depth_preds]
                loss = sequence_loss(depth_preds, depth_gt, valid)

            # Unscale before clipping so max_norm applies to the true gradients.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            epoch_loss += loss.item()

        train_loss = epoch_loss / len(train_loader)
        train_eval_error = evaluate_split_loss(model, train_loader)
        test_eval_error = evaluate_split_loss(model, test_loader)

        logging.info(
            f"Epoch {epoch+1:3d}/{EPOCHS} train_loss={train_loss:.4f} "
            f"train_eval_error={train_eval_error:.4f} test_eval_error={test_eval_error:.4f}"
        )

        if test_eval_error < best_loss:
            best_loss = test_eval_error
            best_ckpt = OUT_PATH.replace('.pth', f'_epoch_{epoch+1:03d}.pth')
            torch.save(model, best_ckpt)
            logging.info(f" → saved best model (test_eval_error={best_loss:.4f})")

    final_train_error = evaluate_split_loss(model, train_loader)
    final_test_error = evaluate_split_loss(model, test_loader)
    logging.info(f"Final train error: {final_train_error:.4f}")
    logging.info(f"Final test error: {final_test_error:.4f}")
    logging.info(f"Training complete. Best test error: {best_loss:.4f}")
    # Report the file that was actually written; nothing is saved at OUT_PATH
    # itself, and no file exists at all if the test loss never improved
    # (e.g. it was NaN every epoch).
    if best_ckpt is not None:
        logging.info(f"Best model saved to {best_ckpt}")
    else:
        logging.warning("No checkpoint was saved: test error never improved.")
class ConfidenceHead(nn.Module):
    """Small convolutional head producing a per-pixel confidence in [0, 1].

    The stack is conv 3x3 -> BN -> ReLU -> transposed conv (stride 2, doubles
    the spatial size) -> BN -> ReLU -> conv 1x1 -> sigmoid, so an input of
    shape (B, in_channels, h, w) yields an output of shape (B, 1, 2h, 2w).
    """

    def __init__(self, in_channels: int = 16):
        super().__init__()
        # Layers are created in forward order and wrapped in one Sequential so
        # the sub-module indices (net.0 ... net.7) stay unchanged.
        stages = [
            nn.Conv2d(in_channels, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            # ×2 upsample
            nn.ConvTranspose2d(32, 16, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 1, kernel_size=1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the feature map through the head and return the confidence map."""
        confidence = self.net(x)
        return confidence
class InboltDataset(Dataset):
    """Torch dataset yielding (left, right, disparity, valid) tensors per sample.

    Samples come from a ``DataSource`` initialised on ``root``. Images are
    converted to float32 and replicated to three channels; the ``depth_zivid``
    map is resized to the left-image size when needed and converted to
    disparity as ``BF / depth`` wherever depth is positive.
    """

    def __init__(self, root: str):
        self.source = DataSource()
        sample_count = self.source.init_directory(input_rectified=root)
        logging.info(f"DataSource found {sample_count} samples in {root}")

    def __len__(self):
        return len(self.source.imgs)

    def __getitem__(self, idx):
        sample = self.source.get_item_projected(idx)
        left_img = sample['left']
        right_img = sample['right']
        depth_map = sample['depth_zivid']   # float32, mm (Zivid resolution)

        # Bring the depth map to the stereo image size (nearest keeps raw values).
        target_h, target_w = left_img.shape[:2]
        if depth_map.shape != (target_h, target_w):
            depth_map = cv2.resize(
                depth_map, (target_w, target_h), interpolation=cv2.INTER_NEAREST
            )

        # Float conversion, clamp to [0, 255], then copy into 3 trailing channels.
        left_f = np.clip(left_img.astype(np.float32), 0, 255)
        right_f = np.clip(right_img.astype(np.float32), 0, 255)
        left_rgb = np.stack((left_f,) * 3, axis=-1)
        right_rgb = np.stack((right_f,) * 3, axis=-1)

        # Disparity only where depth is positive; everything else stays 0.
        valid_mask = depth_map > 0
        disparity = np.zeros_like(depth_map, dtype=np.float32)
        disparity[valid_mask] = BF / depth_map[valid_mask]

        left_t = torch.from_numpy(left_rgb).permute(2, 0, 1).float()
        right_t = torch.from_numpy(right_rgb).permute(2, 0, 1).float()
        disp_t = torch.from_numpy(disparity).unsqueeze(0).float()
        valid_t = torch.from_numpy(valid_mask).unsqueeze(0)

        return left_t, right_t, disp_t, valid_t
def evaluate_split_loss(model, dataloader):
    """Average combined (disparity + confidence) loss over a dataloader.

    Runs the model in eval mode under ``torch.no_grad()`` and restores the
    model's previous train/eval mode on exit, even if evaluation raises.
    Earlier versions always left the model in train mode.

    Args:
        model: Model whose ``forward(..., test_mode=False)`` returns
            ``(init_disp, disp_preds, conf)``.
        dataloader: Yields ``(left, right, disp_gt, valid)`` batches.

    Returns:
        Mean loss per batch, or NaN when the dataloader is empty.
    """
    if len(dataloader) == 0:
        return float('nan')

    was_training = model.training
    model.eval()
    total_loss = 0.0

    try:
        with torch.no_grad():
            for left, right, disp_gt, valid in dataloader:
                left, right = left.cuda(), right.cuda()
                disp_gt, valid = disp_gt.cuda(), valid.cuda()

                padder = InputPadder(left.shape, divis_by=32, force_square=False)
                left_p, right_p = padder.pad(left, right)

                with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
                    _init_disp, disp_preds, conf = model.forward(
                        left_p, right_p, iters=ITERS, test_mode=False
                    )
                    disp_preds = [padder.unpad(p) for p in disp_preds]
                    # NOTE(review): conf is passed at padded resolution while
                    # valid is unpadded; this only matters when the input size
                    # is not already divisible by 32 — confirm intended.
                    loss, _, _ = sequence_loss_with_confidence(disp_preds, conf, disp_gt, valid)

                total_loss += loss.item()
    finally:
        # Put the model back into whatever mode the caller had it in.
        model.train(was_training)

    return total_loss / len(dataloader)
def _confidence_metrics(model, dataloader):
    """Score the confidence map as a binary classifier of the validity mask.

    A pixel is predicted valid when its confidence exceeds 0.5. Confusion
    counts are accumulated over every batch of ``dataloader``.

    Returns:
        ``(accuracy, precision, recall)`` as percentages; each denominator is
        floored at 1 so an empty class yields 0 instead of a division error.
    """
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0

    with torch.no_grad():
        for left, right, _disp_gt, valid in dataloader:
            left = left.cuda()
            right = right.cuda()
            valid = valid.cuda()

            padder = InputPadder(left.shape, divis_by=32, force_square=False)
            left_p, right_p = padder.pad(left, right)

            with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE):
                _disp_up, conf = model.forward(left_p, right_p, iters=ITERS, test_mode=True)

            # Drop the padding so the prediction lines up with the GT mask.
            conf = padder.unpad(conf)
            predicted = (conf > 0.5).squeeze().cpu()
            actual = valid.squeeze().cpu().bool()
            not_predicted = ~predicted
            not_actual = ~actual

            true_pos += int((predicted & actual).sum())
            true_neg += int((not_predicted & not_actual).sum())
            false_pos += int((predicted & not_actual).sum())
            false_neg += int((not_predicted & actual).sum())

    n_all = true_pos + true_neg + false_pos + false_neg
    accuracy = 100.0 * (true_pos + true_neg) / max(n_all, 1)
    precision = 100.0 * true_pos / max(true_pos + false_pos, 1)
    recall = 100.0 * true_pos / max(true_pos + false_neg, 1)
    return accuracy, precision, recall
shuffle=True, num_workers=0) + test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0) + + logging.info( + f"Split seed={SPLIT_SEED}: total={n_total}, " + f"train={len(train_set)} ({100.0*len(train_set)/n_total:.1f}%), " + f"test={len(test_set)} ({100.0*len(test_set)/n_total:.1f}%)" + ) + + best_loss = float('inf') + best_ckpt = None + + # ── training loop ───────────────────────────────────────────────────────── + for epoch in range(EPOCHS): + epoch_loss = epoch_disp_loss = epoch_conf_loss = 0.0 + + for left, right, disp_gt, valid in train_loader: + left, right = left.cuda(), right.cuda() + disp_gt, valid = disp_gt.cuda(), valid.cuda() + + padder = InputPadder(left.shape, divis_by=32, force_square=False) + left_p, right_p = padder.pad(left, right) + + optimizer.zero_grad(set_to_none=True) + + with torch.amp.autocast('cuda', enabled=True, dtype=U.AMP_DTYPE): + _init_disp, disp_preds, conf = model.forward( + left_p, right_p, iters=ITERS, test_mode=False + ) + disp_preds = [padder.unpad(p) for p in disp_preds] + loss, d_loss, c_loss = sequence_loss_with_confidence( + disp_preds, conf, disp_gt, valid + ) + + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) + scaler.step(optimizer) + scaler.update() + + epoch_loss += loss.item() + epoch_disp_loss += d_loss.item() + epoch_conf_loss += c_loss.item() + + n_batches = len(train_loader) + train_loss = epoch_loss / n_batches + train_eval = evaluate_split_loss(model, train_loader) + test_eval = evaluate_split_loss(model, test_loader) + + logging.info( + f"Epoch {epoch+1:3d}/{EPOCHS} " + f"train_loss={train_loss:.4f} " + f"(disp={epoch_disp_loss/n_batches:.4f} conf={epoch_conf_loss/n_batches:.4f}) " + f"train_eval={train_eval:.4f} test_eval={test_eval:.4f}" + ) + + if test_eval < best_loss: + best_loss = test_eval + best_ckpt = OUT_PATH.replace('.pth', f'_epoch_{epoch+1:03d}.pth') + torch.save(model, best_ckpt) + logging.info(f" 
→ saved best model (test_eval={best_loss:.4f})") + + logging.info(f"Training complete. Best test_eval={best_loss:.4f}") + + # ── depth performance comparison ────────────────────────────────────────── + logging.info("\n── Depth performance comparison (test split) ──────────────────────────") + + model.eval() + + # confidence model (best checkpoint or final model) + eval_model = model + if best_ckpt is not None: + logging.info(f"Loading best checkpoint for evaluation: {best_ckpt}") + eval_model = torch.load(best_ckpt, map_location='cuda', weights_only=False) + eval_model.eval() + + conf_mae, conf_cov = _depth_mae_m(eval_model, test_loader, with_confidence=True) + conf_acc, conf_prec, conf_rec = _confidence_metrics(eval_model, test_loader) + logging.info( + f"[Confidence model] depth MAE={conf_mae:.2f} mm coverage={conf_cov:.1f}% " + f"conf_acc={conf_acc:.1f}% conf_prec={conf_prec:.1f}% conf_rec={conf_rec:.1f}%" + ) + + # original model for comparison + logging.info(f"Loading original model for comparison: {MODEL_PATH}") + orig_model = torch.load(MODEL_PATH, map_location='cuda', weights_only=False) + orig_model.eval() + orig_mae, orig_cov = _depth_mae_m(orig_model, test_loader, with_confidence=False) + logging.info( + f"[Original model] depth MAE={orig_mae:.2f} mm coverage={orig_cov:.1f}%" + ) + + mae_delta = conf_mae - orig_mae + sign = "+" if mae_delta >= 0 else "" + logging.info( + f"\nDepth MAE delta (confidence − original): {sign}{mae_delta:.2f} mm " + f"(negative = confidence model is better)" + ) + logging.info(f"Model saved to {OUT_PATH} (best: {best_ckpt})") + + +if __name__ == '__main__': + main() diff --git a/scripts/make_onnx.py b/scripts/make_onnx.py index aaa9915..3361024 100755 --- a/scripts/make_onnx.py +++ b/scripts/make_onnx.py @@ -63,7 +63,8 @@ def forward(self, left, right): opset_version=17, input_names = ['left', 'right'], output_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 
'stem_2x'], - do_constant_folding=True + do_constant_folding=True, + dynamo=True, ) features_left_04, features_left_08, features_left_16, features_left_32, features_right_04, stem_2x = feature_runner(left_img, right_img) @@ -77,7 +78,8 @@ def forward(self, left, right): opset_version=17, input_names = ['features_left_04', 'features_left_08', 'features_left_16', 'features_left_32', 'features_right_04', 'stem_2x', 'gwc_volume'], output_names = ['disp'], - do_constant_folding=True + do_constant_folding=True, + dynamo=True, ) with open(f'{args.save_path}/onnx.yaml', 'w') as f: diff --git a/scripts/make_trt_engine.py b/scripts/make_trt_engine.py new file mode 100644 index 0000000..568d8d7 --- /dev/null +++ b/scripts/make_trt_engine.py @@ -0,0 +1,52 @@ +"""Convert ONNX models to TensorRT engines using the Python TRT API.""" +import argparse +import os +import tensorrt as trt + +TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) + + +def build_engine(onnx_path: str, engine_path: str, fp16: bool = True, workspace_gb: int = 4): + builder = trt.Builder(TRT_LOGGER) + network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) + config = builder.create_builder_config() + config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_gb << 30) + + parser = trt.OnnxParser(network, TRT_LOGGER) + if not parser.parse_from_file(onnx_path): + for i in range(parser.num_errors): + print(f"ONNX parse error {i}: {parser.get_error(i)}") + raise RuntimeError(f"Failed to parse ONNX: {onnx_path}") + + if fp16 and builder.platform_has_fast_fp16: + config.set_flag(trt.BuilderFlag.FP16) + print("FP16 enabled") + + print(f"Building TRT engine from {onnx_path} …") + serialized = builder.build_serialized_network(network, config) + if serialized is None: + raise RuntimeError("build_serialized_network returned None") + + with open(engine_path, "wb") as f: + f.write(serialized) + print(f"Engine saved -> {engine_path}") + + +if __name__ == "__main__": + parser = 
argparse.ArgumentParser() + parser.add_argument("--onnx_dir", type=str, + default="output/onnx_trt", + help="Directory containing feature_runner.onnx and post_runner.onnx") + parser.add_argument("--engine_dir", type=str, default=None, + help="Output directory for .engine files (defaults to onnx_dir)") + parser.add_argument("--fp16", action="store_true", default=True) + parser.add_argument("--workspace_gb", type=int, default=4) + args = parser.parse_args() + + engine_dir = args.engine_dir or args.onnx_dir + os.makedirs(engine_dir, exist_ok=True) + + for name in ("feature_runner", "post_runner"): + onnx_path = os.path.join(args.onnx_dir, f"{name}.onnx") + engine_path = os.path.join(engine_dir, f"{name}.engine") + build_engine(onnx_path, engine_path, fp16=args.fp16, workspace_gb=args.workspace_gb) diff --git a/scripts/metrics.py b/scripts/metrics.py new file mode 100644 index 0000000..ac92fe6 --- /dev/null +++ b/scripts/metrics.py @@ -0,0 +1,193 @@ +"""Depth quality metric dataclasses and computation helpers.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple + +import numpy as np + + +# Distance bins used throughout the report +DIST_BINS: List[Tuple[float, float]] = [ + (0.05, 0.25), + (0.25, 0.5), + (0.5, 0.75), + (0.75, 1.0), + (1.0, 1.5), + (1.5, 2.0), +] +BIN_LABELS = ["0.05–0.25 m", "0.25–0.5 m", "0.5–0.75 m", "0.75–1 m", "1–1.5 m", "1.5–2 m"] +BIN_CENTERS = [0.125, 0.375, 0.625, 0.875, 1.25, 1.75] +CLOSE_RANGE_THRESHOLD_M = 0.05 + + +@dataclass +class FrameMetrics: + """Quality metrics for a single (method, frame) pair vs ground truth.""" + method_name: str + mae: float # mean absolute error (m) — valid-only + rmse: float # root mean squared error (m) — valid-only + mre: float # mean relative error (dimensionless) — valid-only + delta1: float # % pixels within 1.25× of GT — valid-only + coverage: float # % valid pixels (pred > 0 AND gt > 0) + time_ms: float # wall-clock processing 
time + # Penalised metrics: missing pixels (pred=0 where GT>0) count as 100% error + mae_pen: float = 0.0 + mre_pen: float = 0.0 + + +@dataclass +class AggregateStats: + """Per-method aggregate statistics over all benchmark frames.""" + method_name: str + label: str + color: str + mae_mean: float + mae_std: float + rmse_mean: float + mre_mean: float + delta1_mean: float + coverage_mean: float + fps_mean: float + time_ms_mean: float + time_ms_std: float + close_range_coverage: float # % valid where GT < CLOSE_RANGE_THRESHOLD_M + mae_pen_mean: float = 0.0 # penalised MAE (holes count as full error) + mre_pen_mean: float = 0.0 # penalised MRE (holes count as 100% error) + gpu_load_mean: float = 0.0 # GPU utilisation % during inference + + +@dataclass +class BenchmarkResults: + """All data collected during a benchmark run.""" + method_names: List[str] + method_labels: Dict[str, str] + method_colors: Dict[str, str] + ground_truth_name: str + n_frames: int + width: int + height: int + all_metrics: List[FrameMetrics] + viz_frames: List[Dict[str, np.ndarray]] # [{method_name: float32_m_array}, ...] + coverage_maps: Dict[str, np.ndarray] # method_name -> (H,W) float [0,1] + dist_bin_mae: Dict[str, List[List[float]]] # method_name -> [frame][bin] + close_range_valid: Dict[str, List[float]] # method_name -> per-frame % + source: str + method_configs: Dict[str, Dict[str, str]] = field(default_factory=dict) # method_name -> config dict + + +def compute_metrics( + pred_m: np.ndarray, + gt_m: np.ndarray, + elapsed_ms: float, + method_name: str, +) -> FrameMetrics: + """Compute per-frame quality metrics between prediction and ground truth. + + Two sets of error metrics are computed: + - **valid-only** (mae, mre): only pixels where both pred and GT > 0. + - **penalised** (mae_pen, mre_pen): over all GT > 0 pixels. Where + pred == 0 (hole), the error equals the GT depth itself (i.e. 100% + relative error). 
This makes methods with poor coverage pay a price + in the error scores, not just in coverage %. + """ + gt_mask = gt_m > 0.0 + valid = gt_mask & (pred_m > 0.0) + n_gt = int(gt_mask.sum()) + n_valid = int(valid.sum()) + n_total = gt_m.size + + if n_valid == 0: + return FrameMetrics(method_name, 0.0, 0.0, 0.0, 0.0, 0.0, elapsed_ms) + + # --- valid-only metrics (unchanged) --- + p, g = pred_m[valid], gt_m[valid] + diff = np.abs(p - g) + mae = float(diff.mean()) + rmse = float(np.sqrt((diff ** 2).mean())) + mre = float((diff / (g + 1e-6)).mean()) + ratio = np.maximum(p / (g + 1e-6), g / (p + 1e-6)) + delta1 = float((ratio < 1.25).mean()) * 100.0 + coverage = float(n_valid / n_total) * 100.0 + + # --- penalised metrics (missing pixels = full GT depth as error) --- + if n_gt > 0: + missing = gt_mask & (pred_m <= 0.0) + # For missing pixels: absolute error = gt depth, relative error = 1.0 + pen_abs = np.zeros_like(gt_m) + pen_abs[valid] = diff + pen_abs[missing] = gt_m[missing] + mae_pen = float(pen_abs[gt_mask].mean()) + + pen_rel = np.zeros_like(gt_m) + pen_rel[valid] = diff / (gt_m[valid] + 1e-6) + pen_rel[missing] = 1.0 # 100% relative error for holes + mre_pen = float(pen_rel[gt_mask].mean()) + else: + mae_pen = mae + mre_pen = mre + + return FrameMetrics(method_name, mae, rmse, mre, delta1, coverage, elapsed_ms, + mae_pen=mae_pen, mre_pen=mre_pen) + + +def compute_bin_mae( + pred_m: np.ndarray, + gt_m: np.ndarray, +) -> List[float]: + """MAE per distance bin; returns NaN for bins with no valid GT pixels.""" + result = [] + for lo, hi in DIST_BINS: + mask = (gt_m >= lo) & (gt_m < hi) & (gt_m > 0) & (pred_m > 0) + if mask.sum() == 0: + result.append(float("nan")) + else: + result.append(float(np.abs(pred_m[mask] - gt_m[mask]).mean())) + return result + + +def aggregate( + results: BenchmarkResults, + timing_ms: Dict[str, float], + gpu_load: Optional[Dict[str, float]] = None, +) -> Dict[str, AggregateStats]: + """Compute per-method aggregate statistics from raw 
benchmark results.""" + stats: Dict[str, AggregateStats] = {} + gt = results.ground_truth_name + + for name in results.method_names: + label = results.method_labels.get(name, name) + color = results.method_colors.get(name, "#888888") + fm_list = [m for m in results.all_metrics if m.method_name == name] + if not fm_list: + continue + + t_ms = timing_ms.get(name, 0.0) + if name == "hardware": + fps = 30.0 + t_ms = 1000.0 / 30.0 + else: + fps = 1000.0 / t_ms if t_ms > 0 else float("inf") + + cr_vals = results.close_range_valid.get(name, [0.0]) + + stats[name] = AggregateStats( + method_name=name, + label=label, + color=color, + mae_mean=float(np.mean([m.mae for m in fm_list])), + mae_std=float(np.std([m.mae for m in fm_list])), + rmse_mean=float(np.mean([m.rmse for m in fm_list])), + mre_mean=float(np.mean([m.mre for m in fm_list])), + delta1_mean=float(np.mean([m.delta1 for m in fm_list])), + coverage_mean=float(np.mean([m.coverage for m in fm_list])), + fps_mean=fps, + time_ms_mean=t_ms, + time_ms_std=0.0, + close_range_coverage=float(np.mean(cr_vals)) if cr_vals else 0.0, + mae_pen_mean=float(np.mean([m.mae_pen for m in fm_list])), + mre_pen_mean=float(np.mean([m.mre_pen for m in fm_list])), + gpu_load_mean=(gpu_load or {}).get(name, 0.0), + ) + return stats \ No newline at end of file diff --git a/scripts/object_chessboard.py b/scripts/object_chessboard.py new file mode 100644 index 0000000..3fac5bf --- /dev/null +++ b/scripts/object_chessboard.py @@ -0,0 +1,763 @@ +''' + +Chessboard Object Manager. +Supported detection of the chess board object in the image and video. 
It is used for testing and demonstration of the pose estimation.: + + + +Usage : + python object_chessboard.py + + +Environemt : + .\\envs\\pyqt5g + +Install : + + +''' + + +import numpy as np +import cv2 +import matplotlib.pyplot as plt + +import sys, os +current_dir = os.path.dirname(__file__) +parent_dir = os.path.abspath(os.path.join(current_dir, '..')) +sys.path.append(parent_dir) + + # importing common Use modules +#from src.logger import log +import logging as log + + +# -------------------------------- +#%% ObjectManager - manages the entire experiment +class ObjectChessboard: + + def __init__(self, config = None): + + # params + self.config = config + self.name = 'chessboard' + self.frame = [] + self.resolution = (1280,720) + self.square_size = 21.0 # size in mm of the pattern square + self.debug_on = True + self.pattern_size = (9,6) + self._rt_plot = None + + + self.init() + log.info("Chess Object is Created") + + + def init(self): + # can be also string from file + + # should not be done - connect to external + if self.config is None: + log.info("Configuration is not connected") + else: + self.pattern_size = self.config.get('chessboard_size', self.pattern_size) + + ret = True + return ret + + def set_square_size(self,sqSize = 21.8): + # set chessboard size + if sqSize < 0 or sqSize > 100: + log.info('Square size should be in range 0.1:100 mm') + return + + self.square_size = sqSize # 21mm, + + log.info('Square size is %4.2f mm' % self.square_size) + + def set_pattern_size(self, pattern_size = (9,6)): + # set chessboard pattern size + if pattern_size[0] < 2 or pattern_size[1] < 2: + log.info('Pattern size should be at least 2x2') + return False + + self.pattern_size = pattern_size # (9,6) for 9x6 chessboard + + log.info('Pattern size is %dx%d' % self.pattern_size) + return True + + def get_object_points(self): + # prepare object points, like (0,0,0), (1,0,0), (2,0,0) ....,(6,5,0) + a = self.pattern_size[0] + b = self.pattern_size[1] + s = self.square_size 
# 21.8 # 21mm, but i want the units to be in meters + + objCorners = np.zeros((b*a,3), np.float32) + objCorners[:,:2] = np.mgrid[0:a,0:b].T.reshape(-1,2)*s + return objCorners + + def get_grid_points(self, grid_size=0.1): + # prepare grid points with step size of 1 mm, + # like (0,0,0), (1,0,0), (2,0,0) ....,(6,5,0) + scale_factor = self.square_size / grid_size + a = int(self.pattern_size[0]*scale_factor) + b = int(self.pattern_size[1]*scale_factor) + s = grid_size # 21.8 # 21mm, but i want the units to be in meters + + grid_corners = np.zeros((b*a,3), np.float32) + grid_corners[:,:2] = np.mgrid[0:a,0:b].T.reshape(-1,2)*s + return grid_corners + + def get_image_points(self, img): + "detect corners in the image and return their coordinates" + + # reduce size + if len(img.shape) > 2: + gray = cv2.cvtColor(img.astype(np.uint8),cv2.COLOR_BGR2GRAY) + else: + gray = img.astype(np.uint8) + + # find the chess board (calibration pattern) corners + flags = cv2.CALIB_CB_NORMALIZE_IMAGE | cv2.CALIB_CB_EXHAUSTIVE | cv2.CALIB_CB_ACCURACY + ret, imgCorners = cv2.findChessboardCornersSB(gray, self.pattern_size, flags=flags) + if ret: + # Refine the corners of the detected corners + criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001) + imgCorners2 = cv2.cornerSubPix(gray,imgCorners,(11,11),(-1,-1),criteria) + else: + imgCorners2= [] + #log.info('Chessboard corners not found in the image') + return imgCorners2 + + def detect(self, img): + """Detect chessboard corners and return a unified detection dictionary. + + Args: + img: BGR image. 
+ + Returns: + dict: Detection result with keys: + - success (bool) + - image_points (np.ndarray or list) + - object_points (np.ndarray) + - quality (float) + - reason (str, when failed) + """ + if img is None: + return { + 'success': False, + 'reason': 'input image is None', + 'image_points': [], + 'object_points': self.get_object_points(), + 'quality': 0.0, + } + + try: + img_points = self.get_image_points(img) + except Exception as e: + return { + 'success': False, + 'reason': f'chessboard detection error: {e}', + 'image_points': [], + 'object_points': self.get_object_points(), + 'quality': 0.0, + } + + has_points = len(img_points) > 0 + detected_count = int(len(img_points)) if has_points else 0 + expected_count = int(self.pattern_size[0] * self.pattern_size[1]) + quality = float(detected_count / max(expected_count, 1)) + + result = { + 'success': has_points, + 'image_points': img_points, + 'object_points': self.get_object_points(), + 'quality': quality, + } + if not has_points: + result['reason'] = 'chessboard corners not found' + + return result + + def estimate_camera_pose(self, img, camera_matrix, dist_coeffs=None): + """Detect chessboard and estimate camera pose with solvePnP. + + Args: + img: BGR image. + camera_matrix: Intrinsic matrix (3x3). + dist_coeffs: Distortion coefficients (optional). + + Returns: + dict: { + success (bool), reason (str), image_points, object_points, + rvec, tvec, rotation_matrix, camera_position + } + camera_position is in chessboard coordinates, same unit as square_size. 
+ """ + detection = self.detect(img) + if not detection.get('success', False): + return { + 'success': False, + 'reason': detection.get('reason', 'chessboard detection failed'), + 'image_points': detection.get('image_points', []), + 'object_points': detection.get('object_points', self.get_object_points()), + } + + obj_points = detection['object_points'].astype(np.float32) + img_points = np.asarray(detection['image_points'], dtype=np.float32) + + if img_points.ndim == 3 and img_points.shape[1] == 1: + img_points = img_points.reshape(-1, 2) + + cam_mtx = np.asarray(camera_matrix, dtype=np.float32) + dist = None if dist_coeffs is None else np.asarray(dist_coeffs, dtype=np.float32) + + ok, rvec, tvec = cv2.solvePnP(obj_points, img_points, cam_mtx, dist, flags=cv2.SOLVEPNP_ITERATIVE) + if not ok: + return { + 'success': False, + 'reason': 'solvePnP failed', + 'image_points': img_points, + 'object_points': obj_points, + } + + rot_mtx, _ = cv2.Rodrigues(rvec) + # Camera center in object/chessboard coordinates: C = -R^T * t + camera_position = -rot_mtx.T @ tvec + + return { + 'success': True, + 'image_points': img_points, + 'object_points': obj_points, + 'rvec': rvec, + 'tvec': tvec, + 'rotation_matrix': rot_mtx, + 'camera_position': camera_position.reshape(3), + } + + def estimate_board_pose_in_camera(self, img, camera_matrix, dist_coeffs=None): + """Detect chessboard and estimate the board pose in the camera coordinate system. + + solvePnP returns rvec/tvec that transform points from object (board) space into + camera space, so tvec is already the board origin expressed in the camera frame, + and rot_mtx columns are the board X/Y/Z axes expressed in the camera frame. + + Args: + img: grayscale or BGR image. + camera_matrix: Intrinsic matrix (3x3). + dist_coeffs: Distortion coefficients (optional). 
+ + Returns: + dict on success: + success : bool + image_points : (N, 2) detected corners in the image + object_points : (N, 3) 3-D corners in board frame + rvec : (3, 1) rotation vector (board → camera) + tvec : (3, 1) translation vector (board → camera) + rotation_matrix : (3, 3) rotation matrix R (board → camera) + board_position : (3,) position of the board origin in camera frame (= tvec) + board_center : (3,) position of the board geometric centre in camera frame + board_x_axis : (3,) board X axis expressed in camera frame + board_y_axis : (3,) board Y axis expressed in camera frame + board_z_axis : (3,) board normal expressed in camera frame + dict with success=False and reason on failure. + """ + detection = self.detect(img) + if not detection.get('success', False): + return { + 'success': False, + 'reason': detection.get('reason', 'chessboard detection failed'), + 'image_points': detection.get('image_points', []), + 'object_points': detection.get('object_points', self.get_object_points()), + } + + obj_points = detection['object_points'].astype(np.float32) + img_points = np.asarray(detection['image_points'], dtype=np.float32) + if img_points.ndim == 3 and img_points.shape[1] == 1: + img_points = img_points.reshape(-1, 2) + + cam_mtx = np.asarray(camera_matrix, dtype=np.float32) + dist = None if dist_coeffs is None else np.asarray(dist_coeffs, dtype=np.float32) + + ok, rvec, tvec = cv2.solvePnP(obj_points, img_points, cam_mtx, dist, flags=cv2.SOLVEPNP_ITERATIVE) + if not ok: + return { + 'success': False, + 'reason': 'solvePnP failed', + 'image_points': img_points, + 'object_points': obj_points, + } + + rot_mtx, _ = cv2.Rodrigues(rvec) + + # Board origin and axes in camera frame. 
+ board_position = tvec.reshape(3) # origin corner [0,0] in camera frame + board_x_axis = rot_mtx[:, 0] # board +X in camera frame + board_y_axis = rot_mtx[:, 1] # board +Y in camera frame + board_z_axis = rot_mtx[:, 2] # board normal in camera frame + + # Geometric centre: average of all object points transformed into camera frame. + obj_center_board = obj_points.mean(axis=0).reshape(3, 1).astype(np.float32) + board_center = (rot_mtx @ obj_center_board + tvec).reshape(3) + + return { + 'success': True, + 'image_points': img_points, + 'object_points': obj_points, + 'rvec': rvec, + 'tvec': tvec, + 'rotation_matrix': rot_mtx, + 'board_position': board_position, + 'board_center': board_center, + 'board_x_axis': board_x_axis, + 'board_y_axis': board_y_axis, + 'board_z_axis': board_z_axis, + } + + def get_grid_in_camera_coordinates(self, rvec, tvec, camera_matrix, dist_coeffs): + """Project 3D grid points onto the camera image using the estimated pose. + + Args: + grid_points: (N, 3) array of 3D points in board coordinates. + rvec: (3, 1) rotation vector from solvePnP (board → camera). + tvec: (3, 1) translation vector from solvePnP (board → camera). + camera_matrix: Intrinsic matrix (3x3). + dist_coeffs: Distortion coefficients (optional). + + Returns: + projected_points: (N, 2) array of 2D points in image coordinates. 
+ """ + grid_points = self.get_grid_points() + cam_mtx = np.asarray(camera_matrix, dtype=np.float32) + dist = np.asarray(dist_coeffs, dtype=np.float32) + rot_mtx, _ = cv2.Rodrigues(rvec) + + # transform grid points from board frame to camera frame: P_cam = R * P_board + t + #grid_transformed = (rot_mtx @ grid_points + tvec).reshape(-1,3) + grid_transformed = (grid_points @ rot_mtx.T + tvec.T) + #Z = grid_transformed[:,2] # depth of each point in camera frame + + # points projected to the camera image plane (with distortion): p_img = project(P_cam) + projected_points, _ = cv2.projectPoints(grid_points.astype(np.float32), rvec, tvec, cam_mtx, dist) + return grid_transformed, projected_points.reshape(-1, 2) + + def render_board_and_camera(self, object_points, camera_position, rvec=None, axis_length=None, show=True): + """Render chessboard points and estimated camera position in 3D (matplotlib).""" + a, b = self.pattern_size + s = float(self.square_size) + if axis_length is None: + axis_length = max(2.0 * s, 1.0) + + pts = np.asarray(object_points, dtype=np.float32).reshape(-1, 3) + cam = np.asarray(camera_position, dtype=np.float32).reshape(3) + + fig = plt.figure(figsize=(9, 7)) + ax = fig.add_subplot(111, projection='3d') + + # Draw board grid as wireframe in object coordinates. + grid = pts.reshape(b, a, 3) + ax.plot_wireframe(grid[:, :, 0], grid[:, :, 1], grid[:, :, 2], color='tab:blue', linewidth=1.0) + ax.scatter(pts[:, 0], pts[:, 1], pts[:, 2], c='tab:cyan', s=15, label='Chessboard corners') + + # Camera center. + ax.scatter([cam[0]], [cam[1]], [cam[2]], c='tab:red', s=70, marker='^', label='Camera center') + ax.text(cam[0], cam[1], cam[2], ' camera', color='tab:red') + + # Optional camera orientation axes (in object frame). 
+ if rvec is not None: + rot_mtx, _ = cv2.Rodrigues(np.asarray(rvec, dtype=np.float32)) + # camera frame unit axes expressed in object frame = columns of R^T + cam_axes = rot_mtx.T + colors = ['r', 'g', 'b'] + labels = ['Xc', 'Yc', 'Zc'] + for i in range(3): + end = cam + cam_axes[:, i] * axis_length + ax.plot([cam[0], end[0]], [cam[1], end[1]], [cam[2], end[2]], color=colors[i], linewidth=2) + ax.text(end[0], end[1], end[2], labels[i], color=colors[i]) + + # Keep axes visually balanced. + x_vals = np.concatenate([pts[:, 0], np.array([cam[0]], dtype=np.float32)]) + y_vals = np.concatenate([pts[:, 1], np.array([cam[1]], dtype=np.float32)]) + z_vals = np.concatenate([pts[:, 2], np.array([cam[2]], dtype=np.float32)]) + max_range = max(np.ptp(x_vals), np.ptp(y_vals), np.ptp(z_vals), axis_length) * 0.6 + center = np.array([np.mean(x_vals), np.mean(y_vals), np.mean(z_vals)], dtype=np.float32) + ax.set_xlim(center[0] - max_range, center[0] + max_range) + ax.set_ylim(center[1] - max_range, center[1] + max_range) + ax.set_zlim(center[2] - max_range, center[2] + max_range) + + ax.set_xlabel(f'X [{self.square_size} mm units]') + ax.set_ylabel(f'Y [{self.square_size} mm units]') + ax.set_zlabel(f'Z [{self.square_size} mm units]') + ax.set_title('Chessboard and estimated camera pose') + ax.legend(loc='best') + plt.tight_layout() + + if show: + plt.show(block=False) + + return fig, ax + + def render_board_and_camera_real_time(self, object_points, camera_position, rvec=None, + axis_length=None, show=True, pause_sec=0.001): + """Realtime version of render_board_and_camera. + + Reuses the same matplotlib figure/artists and updates their data in-place, + so repeated calls can refresh visualization without creating a new plot. 
+ """ + a, b = self.pattern_size + s = float(self.square_size) + if axis_length is None: + axis_length = max(2.0 * s, 1.0) + + pts = np.asarray(object_points, dtype=np.float32).reshape(-1, 3) + cam = np.asarray(camera_position, dtype=np.float32).reshape(3) + grid = pts.reshape(b, a, 3) + + # Create plot once, then update artists only. + need_init = ( + self._rt_plot is None + or self._rt_plot.get('fig', None) is None + or not plt.fignum_exists(self._rt_plot['fig'].number) + ) + + if need_init: + plt.ion() + fig = plt.figure(figsize=(9, 7)) + ax = fig.add_subplot(111, projection='3d') + + # Board as persistent line artists (rows + cols), easier to update than wireframe collection. + grid_lines = [] + for ri in range(b): + line, = ax.plot(grid[ri, :, 0], grid[ri, :, 1], grid[ri, :, 2], color='tab:blue', linewidth=1.0) + grid_lines.append(line) + for ci in range(a): + line, = ax.plot(grid[:, ci, 0], grid[:, ci, 1], grid[:, ci, 2], color='tab:blue', linewidth=1.0) + grid_lines.append(line) + + board_scatter = ax.scatter(pts[:, 0], pts[:, 1], pts[:, 2], c='tab:cyan', s=15, label='Chessboard corners') + cam_scatter = ax.scatter([cam[0]], [cam[1]], [cam[2]], c='tab:red', s=70, marker='^', label='Camera center') + cam_text = ax.text(cam[0], cam[1], cam[2], ' camera', color='tab:red') + + cam_axes_lines = [] + colors = ['r', 'g', 'b'] + labels = ['Xc', 'Yc', 'Zc'] + for i in range(3): + line, = ax.plot([cam[0], cam[0]], [cam[1], cam[1]], [cam[2], cam[2]], color=colors[i], linewidth=2) + txt = ax.text(cam[0], cam[1], cam[2], labels[i], color=colors[i]) + cam_axes_lines.append((line, txt)) + + ax.set_xlabel(f'X [{self.square_size} mm units]') + ax.set_ylabel(f'Y [{self.square_size} mm units]') + ax.set_zlabel(f'Z [{self.square_size} mm units]') + ax.set_title('Chessboard and estimated camera pose (real-time)') + ax.legend(loc='best') + plt.tight_layout() + + self._rt_plot = { + 'fig': fig, + 'ax': ax, + 'grid_lines': grid_lines, + 'board_scatter': board_scatter, + 
'cam_scatter': cam_scatter, + 'cam_text': cam_text, + 'cam_axes_lines': cam_axes_lines, + } + else: + fig = self._rt_plot['fig'] + ax = self._rt_plot['ax'] + + # --- Update board lines --- + line_idx = 0 + for ri in range(b): + line = self._rt_plot['grid_lines'][line_idx] + line.set_data_3d(grid[ri, :, 0], grid[ri, :, 1], grid[ri, :, 2]) + line_idx += 1 + for ci in range(a): + line = self._rt_plot['grid_lines'][line_idx] + line.set_data_3d(grid[:, ci, 0], grid[:, ci, 1], grid[:, ci, 2]) + line_idx += 1 + + # --- Update scatters --- + self._rt_plot['board_scatter']._offsets3d = (pts[:, 0], pts[:, 1], pts[:, 2]) + self._rt_plot['cam_scatter']._offsets3d = (np.array([cam[0]]), np.array([cam[1]]), np.array([cam[2]])) + + # --- Update camera label text --- + old_text = self._rt_plot.get('cam_text', None) + if old_text is not None: + old_text.remove() + self._rt_plot['cam_text'] = ax.text(cam[0], cam[1], cam[2], ' camera', color='tab:red') + + # --- Update camera orientation axes --- + if rvec is not None: + rot_mtx, _ = cv2.Rodrigues(np.asarray(rvec, dtype=np.float32)) + cam_axes = rot_mtx.T + labels = ['Xc', 'Yc', 'Zc'] + for i in range(3): + line, txt = self._rt_plot['cam_axes_lines'][i] + end = cam + cam_axes[:, i] * axis_length + line.set_data_3d([cam[0], end[0]], [cam[1], end[1]], [cam[2], end[2]]) + txt.remove() + color = ['r', 'g', 'b'][i] + self._rt_plot['cam_axes_lines'][i] = (line, ax.text(end[0], end[1], end[2], labels[i], color=color)) + + # Keep axes balanced around board + camera. 
+ x_vals = np.concatenate([pts[:, 0], np.array([cam[0]], dtype=np.float32)]) + y_vals = np.concatenate([pts[:, 1], np.array([cam[1]], dtype=np.float32)]) + z_vals = np.concatenate([pts[:, 2], np.array([cam[2]], dtype=np.float32)]) + max_range = max(np.ptp(x_vals), np.ptp(y_vals), np.ptp(z_vals), axis_length) * 0.6 + center = np.array([np.mean(x_vals), np.mean(y_vals), np.mean(z_vals)], dtype=np.float32) + ax.set_xlim(center[0] - max_range, center[0] + max_range) + ax.set_ylim(center[1] - max_range, center[1] + max_range) + ax.set_zlim(center[2] - max_range, center[2] + max_range) + + if show: + fig.canvas.draw_idle() + fig.canvas.flush_events() + plt.pause(pause_sec) + + return fig, ax + + def detect_estimate_and_render(self, img, camera_matrix, dist_coeffs=None, axis_length=None, show=True): + """Detect chessboard, estimate camera 3D pose, and render board + camera. + + Returns: + dict pose result from estimate_camera_pose(), plus optional 'figure' and 'axes'. + """ + pose = self.estimate_camera_pose(img, camera_matrix, dist_coeffs) + if not pose.get('success', False): + log.info(f"Pose estimation failed: {pose.get('reason', 'unknown reason')}") + return pose + + fig, ax = self.render_board_and_camera( + object_points=pose['object_points'], + camera_position=pose['camera_position'], + rvec=pose['rvec'], + axis_length=axis_length, + show=show, + ) + pose['figure'] = fig + pose['axes'] = ax + return pose + + def draw_corners(self, img, corners): + # draw corners on the image + if len(corners) == 0: + return img + + img_drawn = cv2.drawChessboardCorners(img, self.pattern_size, corners, True) + return img_drawn + + def show_corners(self, img, corners): + # show results + if not self.debug_on: + return False + + img = self.draw_corners(img, corners) + + cv2.imshow('Image with Corners',img) + + #press q if you want to end the loop + ret = cv2.waitKey(0) & 0xFF == ord('q') + return ret + + +# ---------------------- +#%% Tests +class TestObjectChessboard(): + + def 
__init__(self): + "init test" + self.s = ObjectChessboard() + log.info('TestObjectChessboard tests started') + + def assertTrue(self, isOk = True): + "assert true" + if not isOk: + raise AssertionError("Test failed") + + def assertFalse(self, isOk = False): + "assert false" + if isOk: + raise AssertionError("Test failed") + + + def test_object_detect_single_image(self): + """ + Function that loads images, does measurement and shows final result + """ + file_path = r"data\calib_robot_0006.jpg" + img = cv2.imread(file_path) + self.assertTrue(img is not None) + + img_points = self.s.get_image_points(img) + self.assertTrue(len(img_points) > 0) + + isOk = self.s.show_corners(img, img_points) + self.assertTrue(isOk) + + def test_render_board_and_camera(self): + """Test chessboard pose estimation and 3D rendering on a calibration image.""" + file_path = r"C:\Work\Code\robot_vision\pose6d\data\camera_calibration\calib_robot_0001.jpg" + img = cv2.imread(file_path) + self.assertTrue(img is not None) + + h, w = img.shape[:2] + # Approximate intrinsics for test robustness; replace with calibrated values when available. 
+ fx = 600.0 + fy = 600.0 + cx = w / 2.0 + cy = h / 2.0 + camera_matrix = np.array([ + [fx, 0.0, cx], + [0.0, fy, cy], + [0.0, 0.0, 1.0], + ], dtype=np.float32) + dist_coeffs = np.zeros((5, 1), dtype=np.float32) + + pose = self.s.estimate_camera_pose(img, camera_matrix, dist_coeffs) + self.assertTrue(pose.get('success', False)) + + fig, ax = self.s.render_board_and_camera( + object_points=pose['object_points'], + camera_position=pose['camera_position'], + rvec=pose.get('rvec', None), + show=True, + ) + + self.assertTrue(fig is not None) + self.assertTrue(ax is not None) + plt.close(fig) + + def test_object_detect_video(self): + """ + Function that does processing using video file + """ + object_path = r"D:\RobotAI\Customers\Plasel\Objects\plasel_gray-01" + file_path = r"D:\RobotAI\Customers\Plasel\Objects\plasel_gray-01\videos\object_0002.mp4" + + isOk = self.s.pose6d.ObjectSelectSingle(object_path) + self.assertTrue(isOk) + + isOk = self.s.pose6d.TestRunFile(file_path) + self.assertTrue(isOk) + + def test_rs_camera_connection(self): + """ + Function that connects to RS camera and shows live stream, + chessboard detection and real-time 3D pose rendering. 
+ """ + import importlib.util + #from opencv_realsense_camera import RealSense + + cam_module_path = r"C:\Work\Code\Fast-FoundationStereo\scripts\opencv_realsense_camera.py" + self.assertTrue(os.path.isfile(cam_module_path)) + + spec = importlib.util.spec_from_file_location("opencv_realsense_camera", cam_module_path) + rs_mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(rs_mod) + RealSense = rs_mod.RealSense + + cap = RealSense(frame_size=(1280, 720), use_ir=False, mode = 'd16') + self.assertTrue(cap is not None) + + # intr = cap.config.resolve(rs_mod.rs.pipeline_wrapper(cap.pipeline)) \ + # .get_stream(rs_mod.rs.infrared, 1).as_video_stream_profile().get_intrinsics() + # intr = cap.intr #get_camera_intrinsics(1) + # camera_matrix = np.array([ + # [intr.fx, 0.0, intr.ppx], + # [0.0, intr.fy, intr.ppy], + # [0.0, 0.0, 1.0], + # ], dtype=np.float32) + + h, w = 720, 1280 + fx = 600.0 + fy = 600.0 + cx = w / 2.0 + cy = h / 2.0 + camera_matrix = np.array([ + [fx, 0.0, cx], + [0.0, fy, cy], + [0.0, 0.0, 1.0], + ], dtype=np.float32) + #dist_coeffs = np.array(intr.coeffs, dtype=np.float32).reshape(-1, 1) + dist_coeffs = np.zeros((5, 1), dtype=np.float32) + + # Use a finite loop for test-style behavior; press 'q' in OpenCV window to exit early. + try: + for _ in range(300): + ret, _ = cap.read() + self.assertTrue(ret) + + # Use left IR image for chessboard detection. + ir_left = cap.img_l + pose = self.s.estimate_camera_pose(ir_left, camera_matrix, dist_coeffs) + if pose.get('success', False): + self.s.render_board_and_camera_real_time( + object_points=pose['object_points'], + camera_position=pose['camera_position'], + rvec=pose.get('rvec', None), + show=True, + pause_sec=0.001, + ) + + # Keep OpenCV feed visible and allow keyboard control ('q' to break). 
+ should_exit = cap.show_image(cap.img_l) + if should_exit: + break + finally: + cap.close() + cv2.destroyAllWindows() + + def test_get_grid_in_camera_coordinates(self): + """Test projecting a 3D grid onto the camera image using the estimated pose.""" + file_path = r"C:\Work\Code\robot_vision\pose6d\data\camera_calibration\calib_robot_0001.jpg" + img = cv2.imread(file_path) + self.assertTrue(img is not None) + + h, w = img.shape[:2] + fx = 600.0 + fy = 600.0 + cx = w / 2.0 + cy = h / 2.0 + camera_matrix = np.array([ + [fx, 0.0, cx], + [0.0, fy, cy], + [0.0, 0.0, 1.0], + ], dtype=np.float32) + dist_coeffs = np.zeros((5, 1), dtype=np.float32) + + pose = self.s.estimate_camera_pose(img, camera_matrix, dist_coeffs) + self.assertTrue(pose.get('success', False)) + + XYZ, projected_points = self.s.get_grid_in_camera_coordinates( + rvec=pose['rvec'], + tvec=pose['tvec'], + camera_matrix=camera_matrix, + dist_coeffs=dist_coeffs, + ) + + # Draw projected grid points on the image. + for pt in projected_points: + cv2.circle(img, (int(pt[0]), int(pt[1])), radius=1, color=(0, 255, 0), thickness=-1) + + cv2.imshow('Projected Grid', img) + cv2.waitKey(0) + cv2.destroyAllWindows() + +# ---------------------------------------------------- +#%% Run Test +def RunTest(): + "Run all tests in the MainApp class" + tst = TestObjectChessboard() + + # tst.test_object_detect_single_image() # interactive (waits for key press) + #tst.test_render_board_and_camera() + tst.test_rs_camera_connection() + #tst.test_get_grid_in_camera_coordinates() + + + +#%% Run ALL +if __name__ == '__main__': + #print(__doc__) + RunTest() diff --git a/scripts/opencv_realsense_camera.py b/scripts/opencv_realsense_camera.py new file mode 100644 index 0000000..e1e1e9d --- /dev/null +++ b/scripts/opencv_realsense_camera.py @@ -0,0 +1,1023 @@ + +''' +OpenCV like wrapper for Real Sense Camera + +================== + +Allows to read, display store video and images of RGB - Depth combinations in different formats. 
+Can extract left and right IR images. +Aligns RGB and Depth data. +Can save data as mp4 or single images. +Can control laser power, exposure and other parameters. +Mouse point clicks will show depth value at the point + +Usage: + python opencv_realsense_camera.py + will run the camera and open the image window with live stream. + Use keys outlines in test() function to switch different modes + + Press 'd' to show different display optins + Press 's' to save the current image + Press 't' to save the left and right images in separate files + Press 'r' to start recording and one more time 'r' to stop video recording + Press 'e' to change exposure (1-9) + Press 'g' to change gain (1-9) + Press 'q' to exit + + +Environment : + C:\\Users\\udubin\\Documents\\Envs\\barcode + +Install : + pip install pyrealsense2-2.56.0.7981-cp310-cp310-win_amd64.whl + +''' +import os +#import profile +#from flask import ctx +import pyrealsense2 as rs +import numpy as np +import cv2 as cv +import time + + +#%% Draw + +def draw_str(dst, target, s): + x, y = target + dst = cv.putText(dst, s, (x+1, y+1), cv.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 0), thickness = 2, lineType=cv.LINE_AA) + dst = cv.putText(dst, s, (x, y), cv.FONT_HERSHEY_PLAIN, 1.0, (255, 255, 255), lineType=cv.LINE_AA) + return dst + +#%% Helper +DS5_product_ids = ["0AD1", "0AD2", "0AD3", "0AD4", "0AD5", "0AF6", "0AFE", "0AFF", "0B00", "0B01", "0B03", "0B07", "0B3A", "0B5C", "0B5B"] + +def find_device_that_supports_advanced_mode() : + ctx = rs.context() + ds5_dev = rs.device() + devices = ctx.query_devices() + for dev in devices: + if dev.supports(rs.camera_info.product_id) and str(dev.get_info(rs.camera_info.product_id)) in DS5_product_ids: + if dev.supports(rs.camera_info.name): + print("Found device that supports advanced mode:", dev.get_info(rs.camera_info.name)) + return dev + raise Exception("No D400 product line device that supports advanced mode was found") + + +#%% Main +class RealSense(object): + def __init__(self, mode = 
'rgb', frame_size = None, use_ir = None, use_advanced = None): + + self.frame_size = (1280, 720) if frame_size is None else frame_size #frame_size #(1280, 720)#(640,480) + self.display_mode = 'rgb' if mode is None else mode + self.use_ir = True if use_ir is None else use_ir + self.use_projector = False + self.use_advanced = False if use_advanced is None else use_advanced # advanced mode is enabled + self.control_mode = 'no controls' + self.DISPLAY_MODES = ['rgb','irl','irr','rgd','ddd','d16','gdd','scl','sc2','iid','ii2','iig','iir','gd','ggd','dep'] + + self.fig_title = 'RS Camera (d,e,g,f,p,o,g,m,s,t,r,k: q - to exit)' + self.mouse_x = 0 + self.mouse_y = 0 + self.mouse_value = -1 + + + # noise measurement + self.img_int_mean = None + self.img_int_std = None + self.use_measure = False + self.rect = None + self.depth_scale = 0.001 # scale depth to meter + + # Configure depth and color streams + self.pipeline = rs.pipeline() + self.config = rs.config() + + # Get device product line for setting a supporting resolution + device_name = self.get_device_name() + self.set_frame_size(device_name) + + # set visual preset + #self.set_visual_preset('RS2_RS400_VISUAL_PRESET_CUSTOM') + #self.set_visual_preset('RS2_RS400_VISUAL_PRESET_HIGH_ACCURACY') + + # scale factor + self.depth_scale = self.get_depth_scale_factor() + + # start streaming + self.set_start_streaming() + + # set advanced mode - disparity in pixels + self.set_advanced_mode() + self.advance_mode = None + + # # turn emitter on-off + self.has_projector = device_name.find('D455') > 0 or device_name.find('D555') > 0 or device_name.find('D435') > 0 or device_name.find('D585') > 0 + self.switch_projector() + + # if the safety camera - to modify exposure, first change safety mode to service + self.set_safety_mode(device_name) + # if device_name.find('D585') > 0: + # ctx = rs.context() + # dev = ctx.query_devices().front() + # safety_sensor = dev.first_safety_sensor() + # safety_sensor.set_option(rs.option.safety_mode, 
rs.safety_mode.service) + #self.intr = self.get_camera_intrinsics(3) # depth intrinsics + + + + # Depth controls to defaults + #self.set_exposure() + #self.set_gain() + #self.set_laser_power() + + # Create an align object + # rs.align allows us to perform alignment of depth frames to others frames + # The "align_to" is the stream type to which we plan to align depth frames. + # align_to = rs.stream.color + # self.align = rs.align(align_to) + align_to = rs.stream.depth + self.align = rs.align(align_to) + + # output support + self.output_range = [0,255] # extract range to map 16 bit to 8 + + # record video + self.vout = None + self.record_on = False # toggle recording + self.count = 0 + + # save images + self.img_d = self.img_c = self.img_l = self.img_r = [] + + def render(self, dst): + pass + + def get_device_name(self): + "find device name" + device_name = '' + pipeline_wrapper = rs.pipeline_wrapper(self.pipeline) + try: # 545 + pipeline_profile = self.config.resolve(pipeline_wrapper) + device = pipeline_profile.get_device() + device_product_line = str(device.get_info(rs.camera_info.product_line)) + device_name = device.get_info(rs.camera_info.name) + print('Device name : ', device_name) + print('Device product line : ', device_product_line) + except Exception as e: + print('Real Sense new version - possibly will require a new driver version') + print(e) + + return device_name + + def get_depth_scale_factor(self): + "get depth scale factor" + + pipeline_wrapper = rs.pipeline_wrapper(self.pipeline) + try: + pipeline_profile = self.config.resolve(pipeline_wrapper) + depth_sensor = pipeline_profile.get_device().first_depth_sensor() + depth_scale = depth_sensor.get_depth_scale() + + print(f'Device depth scale factor : {depth_scale}') + except Exception as e: + print('Real Sense new version - possibly will require a new driver version') + print(e) + depth_scale = 0.001 + + return depth_scale + + def set_safety_mode(self, device_name): + if device_name.find('D585') > 0: 
+ ctx = rs.context() + dev = ctx.query_devices().front() + safety_sensor = dev.first_safety_sensor() + safety_sensor.set_option(rs.option.safety_mode, rs.safety_mode.service) + + def set_visual_preset(self, preset_name = 'RS2_RS400_VISUAL_PRESET_HIGH_ACCURACY'): + "set visual preset" + VISUAL_PRESET = {'RS2_RS400_VISUAL_PRESET_CUSTOM': 0, + 'RS2_RS400_VISUAL_PRESET_DEFAULT': 1, + 'RS2_RS400_VISUAL_PRESET_HAND': 2, + 'RS2_RS400_VISUAL_PRESET_HIGH_ACCURACY': 3, + 'RS2_RS400_VISUAL_PRESET_HIGH_DENSITY': 4, + 'RS2_RS400_VISUAL_PRESET_MEDIUM_DENSITY': 5, + 'RS2_RS400_VISUAL_PRESET_REMOVE_IR_PATTERN': 6, + 'RS2_RS400_VISUAL_PRESET_COUNT': 7 + } + ctx = rs.context() + dev = ctx.query_devices().front() + depth_sensor = dev.first_depth_sensor() + options = depth_sensor.get_supported_options() + #print(options) + vp = depth_sensor.get_option(rs.option.visual_preset) + #print(vp) + depth_sensor.set_option(rs.option.visual_preset, VISUAL_PRESET[preset_name]) + vp = depth_sensor.get_option(rs.option.visual_preset) + print(vp) + + def set_frame_size(self, device_name): + "device dependent data" + if device_name.find('D585') > 0 or device_name.find('D555') > 0: + print(f'Configured for {device_name}') + self.frame_size = (1280, 720) + + print(f'Frame size : {self.frame_size[0]} x {self.frame_size[1]}') + + def set_start_streaming(self): + "start stremaing" + self.config.enable_stream(rs.stream.depth, self.frame_size[0], self.frame_size[1], rs.format.z16, 30) + self.config.enable_stream(rs.stream.color, self.frame_size[0], self.frame_size[1], rs.format.bgr8, 30) + + if self.use_ir: + self.config.enable_stream(rs.stream.infrared, 1) + self.config.enable_stream(rs.stream.infrared, 2) + print('IR is enabled') + else: + print('IR is disabled') + + + # Start streaming + profile = self.pipeline.start(self.config) + + # Getting the depth sensor's depth scale (see rs-align example for explanation) + self.depth_sensor = profile.get_device().first_depth_sensor() + #depth_scale = 
self.depth_sensor.get_depth_scale() + #print("Depth Scale is: " , depth_scale) + # + def set_disparity_mode(self): + "enable disparity mode output" + + try: + dev = find_device_that_supports_advanced_mode() + advnc_mode = rs.rs400_advanced_mode(dev) + print("Advanced mode is", "enabled" if advnc_mode.is_enabled() else "disabled") + + # Loop until we successfully enable advanced mode + while not advnc_mode.is_enabled(): + print("Trying to enable advanced mode...") + advnc_mode.toggle_advanced_mode(True) + # At this point the device will disconnect and re-connect. + print("Sleeping for 5 seconds...") + time.sleep(5) + # The 'dev' object will become invalid and we need to initialize it again + dev = find_device_that_supports_advanced_mode() + advnc_mode = rs.rs400_advanced_mode(dev) + print("Advanced mode is", "enabled" if advnc_mode.is_enabled() else "disabled") + + # Get each control's current value + print("Depth Control: \n", advnc_mode.get_depth_control()) + print("RSM: \n", advnc_mode.get_rsm()) + print("RAU Support Vector Control: \n", advnc_mode.get_rau_support_vector_control()) + print("Color Control: \n", advnc_mode.get_color_control()) + print("RAU Thresholds Control: \n", advnc_mode.get_rau_thresholds_control()) + print("SLO Color Thresholds Control: \n", advnc_mode.get_slo_color_thresholds_control()) + print("SLO Penalty Control: \n", advnc_mode.get_slo_penalty_control()) + print("HDAD: \n", advnc_mode.get_hdad()) + print("Color Correction: \n", advnc_mode.get_color_correction()) + print("Depth Table: \n", advnc_mode.get_depth_table()) + print("Auto Exposure Control: \n", advnc_mode.get_ae_control()) + print("Census: \n", advnc_mode.get_census()) + + except Exception as e: + print(e) + return + + #UD - enable disparity mode output + depth_table = advnc_mode.get_depth_table() + depth_table.disparityMode = 1 # 0-depth,1-disparity + advnc_mode.set_depth_table(depth_table) + print("Depth Table: \n", advnc_mode.get_depth_table()) # confirm the settings + + def 
set_advanced_mode(self): + "enable camera advanced mode" + if not self.use_advanced: + return + + try: + dev = find_device_that_supports_advanced_mode() + advnc_mode = rs.rs400_advanced_mode(dev) + print("Advanced mode is", "enabled" if advnc_mode.is_enabled() else "disabled") + + # Loop until we successfully enable advanced mode + while not advnc_mode.is_enabled(): + print("Trying to enable advanced mode...") + advnc_mode.toggle_advanced_mode(True) + # At this point the device will disconnect and re-connect. + print("Sleeping for 5 seconds...") + time.sleep(5) + # The 'dev' object will become invalid and we need to initialize it again + dev = find_device_that_supports_advanced_mode() + advnc_mode = rs.rs400_advanced_mode(dev) + print("Advanced mode is", "enabled" if advnc_mode.is_enabled() else "disabled") + + # Get each control's current value + print("Depth Control: \n", advnc_mode.get_depth_control()) + print("RSM: \n", advnc_mode.get_rsm()) + print("RAU Support Vector Control: \n", advnc_mode.get_rau_support_vector_control()) + print("Color Control: \n", advnc_mode.get_color_control()) + print("RAU Thresholds Control: \n", advnc_mode.get_rau_thresholds_control()) + print("SLO Color Thresholds Control: \n", advnc_mode.get_slo_color_thresholds_control()) + print("SLO Penalty Control: \n", advnc_mode.get_slo_penalty_control()) + print("HDAD: \n", advnc_mode.get_hdad()) + print("Color Correction: \n", advnc_mode.get_color_correction()) + print("Depth Table: \n", advnc_mode.get_depth_table()) + print("Auto Exposure Control: \n", advnc_mode.get_ae_control()) + print("Census: \n", advnc_mode.get_census()) + + except Exception as e: + print(e) + return + + #UD - enable disparity mode output + depth_table = advnc_mode.get_depth_table() + depth_table.disparityMode = 1 # 0-depth,1-disparity + advnc_mode.set_depth_table(depth_table) + print("Depth Table: \n", advnc_mode.get_depth_table()) # confirm the settings + + + # #UD - Simulator settings + hdad = 
advnc_mode.get_hdad() + hdad.ignoreSAD = 1 + advnc_mode.set_hdad(hdad) + print("HDAD: \n", advnc_mode.get_hdad()) + + color_cntrl = advnc_mode.get_color_control() + color_cntrl.disableSADColor = 1 + color_cntrl.disableRAUColor = 1 + advnc_mode.set_color_control(color_cntrl) + print("Color Correction: \n", advnc_mode.get_color_control()) + + # no difference + # rau_cntrl = advnc_mode.get_rau_support_vector_control() + # rau_cntrl.minWEsum = 1 + # rau_cntrl.minNSsum = 1 + # advnc_mode.set_rau_support_vector_control(rau_cntrl) + # print("RAU Support Vector Control: \n", advnc_mode.get_color_control()) + + rsm = advnc_mode.get_rsm() + rsm.rsmBypass = 1 + advnc_mode.set_rsm(rsm) + print("RSM: \n", advnc_mode.get_rsm()) + + depth_cntrl = advnc_mode.get_depth_control() + depth_cntrl.scoreThreshA = 0 + depth_cntrl.deepSeaSecondPeakThreshold = 50 + advnc_mode.set_depth_control(depth_cntrl) + print("Depth Control: \n", advnc_mode.get_depth_control()) + + slo_cntrl = advnc_mode.get_slo_penalty_control() + slo_cntrl.sloK1Penalty = 400 + slo_cntrl.sloK2Penalty = 511 + advnc_mode.set_slo_penalty_control(slo_cntrl) + print("SLO Penalty Control: \n", advnc_mode.get_slo_penalty_control()) + + #self.depth_sensor = dev + #return dev + + def load_preset_from_file(self, filename = 'depth_preset.json'): + "load preset from a file " + #json_config_string = filename + if not os.path.isfile(filename): + print(f'File {filename} not found') + return + + with open(filename) as f: + json_config_string = f.read().strip() + + # Loading json file onto device + dev = find_device_that_supports_advanced_mode() + advanced_mode = rs.rs400_advanced_mode(dev) + print("Advanced mode is", "enabled" if advanced_mode.is_enabled() else "disabled") + try: + advanced_mode.load_json(json_config_string) + print('Preset loaded from file : ', filename) + except Exception as e: + print(e) + return + + + + + def set_exposure(self, exposure_value = None, exposure_percent = None): + "set exposure to the correct 
values. use exposure_percent - to set exposure as a percentage" + + if not self.depth_sensor.supports(rs.option.exposure): + print('Exposure has no support') + return + + range = self.depth_sensor.get_option_range(rs.option.exposure) + if exposure_percent is None: + # old way - set directly + exposure_value = exposure_value if exposure_value is not None else range.default + else: + value_range = 2**(np.linspace(np.log2(range.min), np.log2(range.max/10) , 10)) + exposure_index = int(exposure_percent) + exposure_value = value_range[exposure_index] + + exposure_value = exposure_value if exposure_value > range.min else range.min + exposure_value = exposure_value if exposure_value < range.max else range.max + + self.depth_sensor.set_option(rs.option.exposure, int(exposure_value)) + print(f'Exposure is : {exposure_value}') + + def set_gain(self,gain_value = None): + "set gain to the correct values" + if self.depth_sensor.supports(rs.option.gain): + range = self.depth_sensor.get_option_range(rs.option.gain) + gain_value = gain_value if gain_value is not None else range.default + gain_value = gain_value if gain_value > range.min else range.min + gain_value = gain_value if gain_value < range.max else range.max + + self.depth_sensor.set_option(rs.option.gain, int(gain_value)) + print(f'Gain is : {gain_value}') + else: + print('Gain has no support') + + def set_output_range(self, range_value = 0): + "maps 16 bit to 8" + range_value = range_value * 255 + self.output_range[0] = range_value + self.output_range[1] = range_value + 255 + print(f'Output range is set to min {self.output_range[0]} and max {self.output_range[1]}') + + def get_baseline(self): + "returns camera baseline" + B = self.depth_sensor.get_option(rs.option.stereo_baseline) + print(f'Baseline is : {B} mm') + return B + + def get_focal_length(self): + "intrinsic parameters and returns focal length" + pipeline_wrapper = rs.pipeline_wrapper(self.pipeline) + pipeline_profile = self.config.resolve(pipeline_wrapper) + 
intr = pipeline_profile.get_stream(rs.stream.depth).as_video_stream_profile().get_intrinsics() + print(f'Intrinsics Fx is : {intr.fx} ') + return intr.fx + + def get_camera_intrinsics(self, stream_type = 1): + "intrinsic parameters of the camera" + if stream_type == 1: + stream_type = rs.stream.depth + elif stream_type == 2: + stream_type = rs.stream.color + elif stream_type == 3: + stream_type = rs.stream.infrared + + pipeline_wrapper = rs.pipeline_wrapper(self.pipeline) + pipeline_profile = self.config.resolve(pipeline_wrapper) + intr = pipeline_profile.get_stream(stream_type).as_video_stream_profile().get_intrinsics() + #print(f'Intrinsics Fx is : {intr.fx} ') + print(intr) + return intr + + def get_bf(self): + "read baseline and focal length for inverse depth compute" + b = self.get_baseline() + f = self.get_focal_length() + print(f'Total BF is : {b*f} ') + return b*f + + def get_camera_params(self, value_in = 0): + "whoch camera params toi show" + if value_in == 0: + self.get_bf() + elif value_in == 1: + self.get_camera_intrinsics() + + def set_laser_power(self, laser_power_value = None): + "set laser power to the correct values" + + if self.depth_sensor.supports(rs.option.laser_power): + range = self.depth_sensor.get_option_range(rs.option.laser_power) + laser_power_value = laser_power_value if laser_power_value is not None else range.default + laser_power_value = laser_power_value if laser_power_value > range.min else range.min + laser_power_value = laser_power_value if laser_power_value < range.max else range.max + + self.depth_sensor.set_option(rs.option.laser_power, int(laser_power_value)) + print(f'Laser power is : {laser_power_value}') + else: + print('Laser power has no support') + + def switch_projector(self, projector_on = True): + "switch projector on-off" + if not self.has_projector: + print('Camera is without projector') + else: + #if self.use_projector is False: + #self.depth_sensor.set_option(rs.option.emitter_always_on, projector_on) + 
self.depth_sensor.set_option(rs.option.emitter_enabled, projector_on) + + time.sleep(0.1) # wait for camera on - off + print('Camera projector : %s' %str(self.use_projector)) + + def switch_disparity(self): + "switch disparity on" + + if self.advance_mode is None: + dev = find_device_that_supports_advanced_mode() + advnc_mode = rs.rs400_advanced_mode(dev) + print("Advanced mode is", "enabled" if advnc_mode.is_enabled() else "disabled") + + # Loop until we successfully enable advanced mode + while not advnc_mode.is_enabled(): + print("Trying to enable advanced mode...") + advnc_mode.toggle_advanced_mode(True) + # At this point the device will disconnect and re-connect. + print("Sleeping for 5 seconds...") + time.sleep(5) + # The 'dev' object will become invalid and we need to initialize it again + dev = find_device_that_supports_advanced_mode() + advnc_mode = rs.rs400_advanced_mode(dev) + print("Advanced mode is", "enabled" if advnc_mode.is_enabled() else "disabled") + + self.advance_mode = advnc_mode + + depth_table = self.advance_mode.get_depth_table() + depth_table.disparityMode = 1 - depth_table.disparityMode # 0-depth,1-disparity - switch + self.advance_mode.set_depth_table(depth_table) + print("Depth Table: \n", self.advance_mode.get_depth_table()) # confirm the settings + + def set_display_mode(self, mode = 'rgb'): + "changes display mode by umber or by string" + + if isinstance(mode,int): # integer + mode = mode % len(self.DISPLAY_MODES) + mode = self.DISPLAY_MODES[mode] + + if not(mode in self.DISPLAY_MODES): + print(f'Not supported mode = {mode}') + + self.display_mode = mode + print(f'Current mode {mode}') + + def set_controls(self, value_in = 0): + "implements differnt controls according to the selected control mode. 
Input is an integer from 0-9" + if self.control_mode == 'display': + self.set_display_mode(value_in) + + elif self.control_mode == 'exposure': + self.set_exposure( None, exposure_percent=value_in) + + elif self.control_mode == 'gain': + self.set_gain(value_in*10) + + elif self.control_mode == 'projector': + self.use_projector = value_in == 1 + self.switch_projector(value_in == 1) + + elif self.control_mode == 'disparity': + self.switch_disparity() + + elif self.control_mode == 'range': + self.set_output_range(value_in) + + elif self.control_mode == 'focal': + self.get_camera_params(value_in) + else: + pass + + def convert_depth_to_disparity(self, img_depth): + "from GIL" + focal_len = 175.910019 + baseline = 94.773 + #replacementDepth = focal_len * baseline / (RectScaledInfra1.x - (maxLoc.x + RectScaledInfra2.x)); + img_disparity = img_depth.copy() + valid = img_depth > 0 + img_disparity[valid]= focal_len*baseline/img_depth[valid]*32 + return img_disparity + + def measure_noise(self, img): + "makes integration over ROI" + x0, y0, x1, y1 = self.rect + if len(img.shape) < 3: + img_roi = img[y0:y1,x0:x1].astype(np.float32) + else: # protect from rgb display + img_roi = img[y0:y1,x0:x1,0].astype(np.float32) + + if self.img_int_mean is None: + self.img_int_mean = img_roi + self.img_int_std = np.zeros_like(img_roi) + elif self.img_int_mean.shape[1] != img_roi.shape[1]: # image display is changed + self.img_int_mean = None + return 0 + + valid_bool = img_roi > 0 + #valid_num = valid_bool.sum() + #nr,nc = img_roi.shape[:2] + + self.img_int_mean += 0.1*(img_roi - self.img_int_mean) + self.img_int_std += 0.1*(np.abs(img_roi - self.img_int_mean) - self.img_int_std) + + err_std_valid = self.img_int_std.copy() + #err_std_valid[~valid_bool] = 100 + err_std = err_std_valid[valid_bool].mean() + + return err_std + + # Callback function + def track_mouse(self, event, x, y, flags, param): + if event == cv.EVENT_MOUSEMOVE: + self.mouse_x, self.mouse_y = x, y + self.mouse_value = 
self.img_d[y, x] + + def create_output_image(self, depth_image, color_image, irl_image, irr_image): + "defines the output image" + + if self.display_mode == 'rgb': + image_out = color_image + elif self.display_mode == 'irl': + image_out = irl_image + elif self.display_mode == 'irr': + image_out = irr_image + elif self.display_mode == 'ddd': + # Apply colormap on depth image (image must be converted to 8-bit per pixel first) + depth_scaled = cv.convertScaleAbs(depth_image, alpha=0.03) + depth_colormap = cv.applyColorMap(depth_scaled, cv.COLORMAP_JET) + image_out = depth_scaled + elif self.display_mode == 'rgd': + depth_scaled = cv.convertScaleAbs(depth_image, alpha=0.03) + image_out = np.concatenate((color_image[:,:,:2], depth_scaled[:,:,np.newaxis] ), axis = 2) + elif self.display_mode == 'gd': + gray_image = cv.cvtColor(color_image, cv.COLOR_RGB2GRAY) + depth_scaled = cv.convertScaleAbs(depth_image, alpha=0.03) + image_out = np.concatenate((gray_image, depth_scaled ), axis = 1) + elif self.display_mode == 'ggd': + gray_image = cv.cvtColor(color_image, cv.COLOR_RGB2GRAY) + depth_scaled = cv.convertScaleAbs(depth_image, alpha=0.03) + image_out = np.stack((gray_image, gray_image, depth_scaled ), axis = 2) + elif self.display_mode == 'gdd': + gray_image = cv.cvtColor(color_image, cv.COLOR_RGB2GRAY) + depth_scaled = cv.convertScaleAbs(depth_image, alpha=0.03) + image_out = np.stack((gray_image, depth_scaled, depth_scaled ), axis = 2) + elif self.display_mode == 'scl': + depth_scaled = cv.convertScaleAbs(depth_image, alpha=0.05) + image_out = cv.applyColorMap(depth_scaled, cv.COLORMAP_JET) + elif self.display_mode == 'sc2': + depth_scaled = cv.convertScaleAbs(depth_image, alpha=0.1) + image_out = cv.applyColorMap(depth_scaled, cv.COLORMAP_JET) + elif self.display_mode == 'ii2': + image_out = np.concatenate((irl_image, irr_image), axis = 1) + elif self.display_mode == 'iid': + #print(f'Depth {depth_image.min()} - {depth_image.max()}') + depth_scaled = 
cv.convertScaleAbs(depth_image, alpha=0.1) + image_out = np.stack((irl_image, irr_image, depth_scaled), axis = 2) + #image_out = np.concatenate((irl_image, depth_scaled), axis = 1) + elif self.display_mode == 'd16': + image_out = np.stack((irl_image.astype(np.uint16), irr_image.astype(np.uint16), depth_image), axis = 2) + elif self.display_mode == 'iig': + image_out = np.stack((irl_image, irr_image, color_image[:,:,1]), axis = 2) + elif self.display_mode == 'iir': + image_out = np.stack((irl_image, irr_image, color_image[:,:,0]), axis = 2) + #image_out = np.concatenate((irl_image, color_image[:,:,0]), axis = 1) + elif self.display_mode == 'dep': + image_out = depth_image.astype(np.float32) + image_out = image_out - self.output_range[0] + image_out[image_out < 0] = 0 + image_out[image_out > 255] = 255 + + + #image_out = depth_image / 32 * 4 # 10 for scaling + #image_out = self.convert_depth_to_disparity(depth_image) + return image_out + + def read_aligned(self, dst=None): + "with frame alignments and color space transformations" + #self.use_projector = not self.use_projector # testing + w, h = self.frame_size + + # Wait for a coherent pair of frames: depth and color + frames = self.pipeline.wait_for_frames() + # Align the depth frame to color frame + aligned_frames = self.align.process(frames) + + # Get aligned frames + depth_frame = aligned_frames.get_depth_frame() # aligned_depth_frame is a 640x480 depth image + color_frame = aligned_frames.get_color_frame() + if not depth_frame or not color_frame: + return False, None + + # Convert images to numpy arrays + depth_image = np.asanyarray(depth_frame.get_data()) + color_image = np.asanyarray(color_frame.get_data()) + #color_image = cv.cvtColor(depth_image, cv.COLOR_GRAY2RGB) + #depth_image = cv.cvtColor(color_image, cv.COLOR_RGB2GRAY) + + # Apply colormap on depth image (image must be converted to 8-bit per pixel first) + depth_scaled = cv.convertScaleAbs(depth_image, alpha=0.03) + depth_colormap = 
cv.applyColorMap(depth_scaled, cv.COLORMAP_JET) + + depth_colormap_dim = depth_colormap.shape + color_colormap_dim = color_image.shape + + #If depth and color resolutions are different, resize color image to match depth image for display + if depth_colormap_dim != color_colormap_dim: + raise ValueError('depth and image size missmatch') + #color_image = cv.resize(color_image, dsize=(depth_colormap_dim[1], depth_colormap_dim[0]), interpolation=cv.INTER_AREA) + #images = np.hstack((resized_color_image, depth_colormap)) + # else: + # images = np.hstack((color_image, depth_colormap)) + + if self.use_ir: + ir_left = aligned_frames.get_infrared_frame(1) + irl_image = np.asanyarray(ir_left.get_data()) + ir_right = aligned_frames.get_infrared_frame(2) + irr_image = np.asanyarray(ir_right.get_data()) + else: + #print('Enable IR use at the start. use_ir = True') + irl_image = color_image[:,:,0] + irr_image = color_image[:,:,1] + image_out = color_image + + self.img_d, self.img_c, self.img_l, self.img_r = depth_image, color_image, irl_image, irr_image + image_out = self.create_output_image(depth_image, color_image, irl_image, irr_image) + return True, image_out + + def read(self, dst=None): + "color and depth are not aligned" + w, h = self.frame_size + + # Wait for a coherent pair of frames: depth and color + frames = self.pipeline.wait_for_frames() + depth_frame = frames.get_depth_frame() + color_frame = frames.get_color_frame() + if not depth_frame or not color_frame: + return False, None + + # Convert images to numpy arrays + depth_image = np.asanyarray(depth_frame.get_data()) + color_image = np.asanyarray(color_frame.get_data()) + #color_image = cv.cvtColor(depth_image, cv.COLOR_GRAY2RGB) + #depth_image = cv.cvtColor(color_image, cv.COLOR_RGB2GRAY) + + # convert depth to mm + depth_image = (depth_image * self.depth_scale * 1000).astype(np.uint16) + + # Apply colormap on depth image (image must be converted to 8-bit per pixel first) + depth_scaled = 
cv.convertScaleAbs(depth_image, alpha=0.03) + depth_colormap = cv.applyColorMap(depth_scaled, cv.COLORMAP_JET) + + depth_colormap_dim = depth_colormap.shape + color_colormap_dim = color_image.shape + + if depth_colormap_dim != color_colormap_dim: + raise ValueError('depth and image size missmatch') + #color_image = cv.resize(color_image, dsize=(depth_colormap_dim[1], depth_colormap_dim[0]), interpolation=cv.INTER_AREA) + #images = np.hstack((resized_color_image, depth_colormap)) + # else: + # images = np.hstack((color_image, depth_colormap)) + + if self.use_ir: + ir_left = frames.get_infrared_frame(1) + irl_image = np.asanyarray(ir_left.get_data()) + ir_right = frames.get_infrared_frame(2) + irr_image = np.asanyarray(ir_right.get_data()) + else: + #print('Enable IR use at the start. use_ir = True') + irl_image = color_image[:,:,0] + irr_image = color_image[:,:,1] + image_out = color_image + + # save if needed + self.img_d, self.img_c, self.img_l, self.img_r = depth_image, color_image, irl_image, irr_image + image_out = self.create_output_image(depth_image, color_image, irl_image, irr_image) + return True, image_out + + + + def isOpened(self): + "OpenCV compatability" + return True + + def save_image(self, frame, fname = ''): + fn = '/home/administrato/videos/image_%s_%03d.png' % (fname, self.count) + frame = frame.astype(np.uint16) #cv.cvtColor(frame, cv.CV_16U) + cv.imwrite(fn, frame, [cv.IMWRITE_PNG_COMPRESSION, 0]) + print(fn, 'saved') + self.count += 1 + + def save_two_images(self, frame): + "saves two differnet files" + if len(frame.shape) < 3: + print('Image should have 3 chnnels. 
Try differnet display options') + return + + fl = '.\\imageL_%s_%03d.png' % (self.display_mode, self.count) + cv.imwrite(fl, frame[:,:,0]) + fr = '.\\imageR_%s_%03d.png' % (self.display_mode, self.count) + cv.imwrite(fr, frame[:,:,1]) + print('Saving %s and %s' %(fl,fr)) + self.count += 1 + + def record_video(self, frame): + # record video to a file is switched on + if (self.vout is None) and (self.record_on is True): + fourcc = cv.VideoWriter_fourcc(*'mp4v') + k = 0 + fname = '.\\video_%s_%03d.mp4' % (self.display_mode,k) + while os.path.exists(fname): + k +=1 + fname = '.\\video_%s_%03d.mp4' % (self.display_mode,k) + + self.vout = cv.VideoWriter(fname, fourcc, 20.0, self.frame_size) + print('Writing video to file %s' %fname) + self.count = 0 + + # write frame + if (self.vout is not None) and (self.record_on is True): + "" + if len(frame.shape) < 3: + frame = frame[:self.frame_size[1],:self.frame_size[0]] + frame = cv.cvtColor(frame, cv.COLOR_GRAY2RGB) + + self.vout.write(frame) + self.count += 1 + if self.count % 100 == 0: + print('Writing frame %s' %str(self.count)) + + # record on is switched off + if (self.vout is not None) and (self.record_on is False): + self.vout.release() + self.vout = None + print('Video file created') + + def record_release(self): + "finish record" + if self.vout is not None: + self.vout.release() + self.vout = None + print('Video file created') + + def show_mouse_info(self, frame): + # Prepare the display text + if self.mouse_value < 0: + return frame + coord_text = f"X: {self.mouse_x}, Y: {self.mouse_y}" + color_text = f"V: {self.mouse_value}" + # Draw a background rectangle for better text visibility + cv.rectangle(frame, (self.mouse_x + 10, self.mouse_y - 45), (self.mouse_x + 180, self.mouse_y + 5), (0, 0, 0), -1) + # Draw the text on the frame + cv.putText(frame, coord_text, (self.mouse_x + 15, self.mouse_y - 25), + cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + cv.putText(frame, color_text, (self.mouse_x + 15, self.mouse_y - 
def show_controls(self, frame):
    """Overlay the hint line for the active ``self.control_mode`` on *frame*.

    Modes without an entry in the table (e.g. ``'no controls'``) leave
    the frame unchanged.
    """
    grey = (200, 200, 200)
    hints = {
        'display': ('Display (0-RGB, 1,2,3...9-I1+I2)', (200, 200, 12)),
        'exposure': ('Exposure (1,2,3...9) ', grey),
        'gain': ('Gain (1,2,3...9) ', grey),
        'projector': ('Projector (0,1) ', grey),
        'disparity': ('Disparity Out (0,1) ', grey),
        'range': ('Bit Range Out (0,1,2...9) ', grey),
        'focal': ('Camera Params (0-BL+F,1-Cam Mtrx+Dist, 2..) ', grey),
    }
    entry = hints.get(self.control_mode)
    if entry is None:
        return frame
    text, colour = entry
    return cv.putText(frame, text, (10, 30), cv.FONT_HERSHEY_SIMPLEX, 0.9, colour, 2)
def show_measurements(self, frame):
    """Annotate *frame* with the noise measurement of the ROI.

    When ``self.use_measure`` is off, ``self.img_int_mean`` is reset and
    the frame is returned unchanged.  Otherwise a default ROI around the
    image centre is created on first use, ``self.measure_noise`` is run
    and its result is drawn above the ROI rectangle.
    """
    if not self.use_measure:
        self.img_int_mean = None  # reset so the next enable starts fresh
        return frame

    if self.rect is None:
        cy, cx = frame.shape[0] >> 1, frame.shape[1] >> 1
        half = cy >> 2  # the same half-size is used on both axes (square ROI)
        self.rect = [cx - half, cy - half, cx + half, cy + half]

    err_std = self.measure_noise(frame)
    print(f'Frame min {frame.min()} and max {frame.max()}')

    x0, y0, x1, y1 = self.rect
    # dark outline on bright content, light outline otherwise
    clr = (0, 0, 0) if frame[y0:y1, x0:x1].mean() > 128 else (240, 240, 240)
    frame = cv.rectangle(frame, (x0, y0), (x1, y1), clr, 2)
    frame = draw_str(frame, (x0, y0 - 10), str(err_std))
    return frame


def show_image(self, frame):
    """Display *frame* in the OpenCV window and process one key press.

    Returns:
        True when the user asked to quit ('q' or ESC), else False.
    """
    frame_show = np.uint8(frame.copy())
    cv.imshow(self.fig_title, frame_show)
    return self._dispatch_key(cv.waitKeyEx(1), frame)


def _dispatch_key(self, key_code, frame):
    """Apply the action bound to *key_code*; return True to request exit.

    *key_code* is the raw value returned by ``cv.waitKeyEx``.  Extended
    codes are compared before the value is reduced to its low byte;
    the previous code masked first, which made the arrow-key branch
    unreachable and reported the Up arrow as "Unrecognized key 0".
    """
    # Windows extended codes - Left: 2424832 Up: 2490368 Right: 2555904 Down: 2621440
    if key_code == 2490368:
        return False

    ch = key_code & 0xff
    if ch == ord('q') or ch == 27:
        return True

    # single keys that toggle a control mode on and off
    mode_keys = {
        ord('d'): 'display',
        ord('e'): 'exposure',
        ord('g'): 'gain',
        ord('p'): 'projector',
        ord('o'): 'range',
        ord('x'): 'disparity',
        ord('f'): 'focal',
    }

    if 48 <= ch <= 57:  # digits 0-9
        self.set_controls(ch - 48)
    elif 65 <= ch <= 74:  # SHIFT + a..j (upper case 'A'..'J') map to values 10..19
        self.set_controls(ch - 55)
    elif ch in mode_keys:
        mode = mode_keys[ch]
        self.control_mode = 'no controls' if self.control_mode == mode else mode
    elif ch == ord('m'):
        self.use_measure = not self.use_measure
        print(f'Noise measurement is {self.use_measure}')
    elif ch == ord('s'):
        self.save_image(frame)
    elif ch == ord('t'):
        self.save_two_images(frame)
    elif ch == ord('a'):  # enable advanced mode
        self.use_advanced = not self.use_advanced
        self.set_advanced_mode()
    elif ch == ord('r'):
        self.record_on = not self.record_on
        print('Video record %s' % str(self.record_on))
    elif ch == ord('k'):
        if self.mouse_value < 0:  # enable mouse tracking
            cv.setMouseCallback(self.fig_title, self.track_mouse)
        else:
            cv.setMouseCallback(self.fig_title, lambda *args: None)
            self.mouse_value = -1  # disable mouse tracking
    elif ch != 255:  # 255 is the low byte of -1, i.e. "no key pressed"
        print(f'Unrecognized key {ch} - check your language setttings on the keyboard, must be English.')

    return False
def close(self):
    """Finish any active recording, then stop the streaming pipeline."""
    self.record_release()
    self.pipeline.stop()
    print('closed')


def release(self):
    """Alias of ``close`` (OpenCV ``VideoCapture`` compatibility)."""
    self.close()


def test_intrinsics(self):
    """Query ``get_camera_intrinsics`` for stream ids 1, 2 and 3.

    Returns:
        Tuple with the three results, in that order.
        NOTE(review): presumably left IR, right IR and depth - confirm
        against ``get_camera_intrinsics``.
    """
    return tuple(self.get_camera_intrinsics(stream_id) for stream_id in (1, 2, 3))


def test(self):
    """Interactive loop: read, annotate, show and optionally record frames.

    The loop ends when a read fails or when ``show_image`` reports a quit
    request.  The camera and the OpenCV windows are released on every
    exit path; previously a failed read left the pipeline running and an
    open recording unreleased.
    """
    try:
        while True:
            ok, frame = self.read()
            if ok is False:
                print('Failed to read image')
                break

            frame = self.show_mouse_info(frame)
            frame = self.show_measurements(frame)
            frame = self.show_controls(frame)
            if self.show_image(frame):
                break

            # check if record is required
            self.record_video(frame)
    finally:
        self.close()
        cv.destroyAllWindows()


if __name__ == '__main__':
    cap = RealSense(frame_size=(1280,720))
    #cap = RealSense(frame_size=(640,360))
    #cap.test()
    cap.test_intrinsics()
class ReportGenerator:
    """Write the depth quality report (PNG figures, JSON, HTML) to a folder.

    ``generate`` produces, inside *output_dir*: ``depth_comparison.png``,
    ``error_maps.png``, ``coverage_heatmaps.png``,
    ``distance_error_curve.png``, ``error_histograms.png``,
    ``summary_table.png``, ``close_range_analysis.png``,
    ``timing_bars.png``, then ``results.json`` and ``index.html``.
    """

    def __init__(
        self,
        results: BenchmarkResults,
        stats: Dict[str, AggregateStats],
        output_dir: Path,
    ) -> None:
        """Store the inputs and create *output_dir* (parents included)."""
        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        self._r = results
        self._stats = stats
        self._out = out_dir
        self._gt = results.ground_truth_name
        # every method except the ground-truth one, original order kept
        self._non_gt = [
            method for method in results.method_names if method != self._gt
        ]

    def generate(self) -> None:
        """Generate all figures, JSON, and HTML index."""
        builders = (
            self._fig_depth_comparison,
            self._fig_error_maps,
            self._fig_coverage_heatmaps,
            self._fig_distance_error_curve,
            self._fig_error_histograms,
            self._fig_summary_table,
            self._fig_close_range_analysis,
            self._fig_timing_bars,
        )
        produced = []
        for build in builders:
            produced.append(build())
        self._write_json()
        # falsy entries (no file name) are left out of the HTML index
        self._write_html([path for path in produced if path])
        print(f"\nReport written to: {self._out / 'index.html'}")
def _save(self, fig, name: str) -> str:
    """Save *fig* as ``<output_dir>/<name>``, close it and return *name*."""
    fig.savefig(self._out / name, dpi=150, bbox_inches="tight")
    plt.close(fig)
    return name


def _empty_fig(self, filename: str, msg: str) -> str:
    """Save a placeholder figure showing only *msg*; return *filename*."""
    fig, ax = plt.subplots(figsize=(6, 3), dpi=100)
    ax.axis("off")
    ax.text(0.5, 0.5, msg, transform=ax.transAxes,
            ha="center", va="center", fontsize=12, color="gray")
    return self._save(fig, filename)


def _depth_cmap(self):
    """Return a copy of the "plasma" colormap with under-range values black."""
    cmap = plt.get_cmap("plasma").copy()
    cmap.set_under("black")
    return cmap


def _grid_layout(self, n, max_cols=4):
    """Return (nrows, ncols) for a grid that fits n items with at most max_cols columns.

    The result is never smaller than ``(1, 1)``: ``n == 0`` used to raise
    ZeroDivisionError, and a zero-sized grid is not a valid subplot
    layout anyway.
    """
    ncols = max(1, min(n, max_cols))
    nrows = max(1, (n + ncols - 1) // ncols)
    return nrows, ncols


def _fig_depth_comparison(self) -> str:
    """Plot the depth maps of the first viz frame side by side.

    Only methods present in that frame are drawn; values below ``1e-4``
    use the colormap's under-range colour (black).  Returns the saved
    file name.
    """
    if not self._r.viz_frames:
        return self._empty_fig("depth_comparison.png", "No viz frames")
    vf = self._r.viz_frames[0]
    names = [n for n in self._r.method_names if n in vf]
    if not names:
        # nothing to draw: avoid building an empty subplot grid
        return self._empty_fig("depth_comparison.png", "No methods in viz frame")
    n = len(names)
    nrows, ncols = self._grid_layout(n)
    fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows))
    # subplots returns a scalar, 1-D or 2-D container depending on the grid
    axes = np.asarray(axes).flatten().tolist()
    cmap = self._depth_cmap()
    for ax, name in zip(axes[:n], names):
        im = ax.imshow(vf[name], cmap=cmap, vmin=1e-4, vmax=5.0)
        plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="meters")
        ax.set_title(self._r.method_labels.get(name, name), fontsize=9, wrap=True)
        ax.axis("off")
    for ax in axes[n:]:  # hide unused grid cells
        ax.axis("off")
    fig.suptitle("Depth Map Comparison (single frame)", fontsize=11, y=1.02)
    fig.tight_layout()
    return self._save(fig, "depth_comparison.png")
def _fig_error_maps(self) -> str:
    """Plot per-pixel absolute error against ground truth for the first viz frame.

    One panel per method found in the frame, ground truth first.  Pixels
    where either map is not positive are drawn with error 0, which falls
    below ``vmin`` and takes the colormap's under-range colour.  The
    colour scale tops out at 0.5.  Returns the saved file name.
    """
    if not self._r.viz_frames or not self._non_gt:
        return self._empty_fig("error_maps.png", "No comparison methods")
    vf = self._r.viz_frames[0]
    gt = vf.get(self._gt)
    # Include GT itself first (MAE=0 sanity check), then all other methods
    names = ([self._gt] if self._gt in vf else []) + [n for n in self._non_gt if n in vf]
    if gt is None or not names:
        return self._empty_fig("error_maps.png", "Ground truth not available in viz frame")
    n = len(names)
    nrows, ncols = self._grid_layout(n)
    cmap = plt.get_cmap("hot").copy()
    cmap.set_under("#222222")
    fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 4 * nrows))
    axes = np.asarray(axes).flatten().tolist()
    for ax, name in zip(axes[:n], names):
        pred = vf[name]
        # only pixels that are positive in both maps are compared
        valid = (gt > 0) & (pred > 0)
        err = np.where(valid, np.abs(pred - gt), 0.0).astype(np.float32)
        im = ax.imshow(err, cmap=cmap, vmin=1e-4, vmax=0.5)
        plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="|error| (m)")
        mean_err = float(np.abs(pred[valid] - gt[valid]).mean()) if valid.any() else 0.0
        label = self._r.method_labels.get(name, name)
        ax.set_title(f"{label}\nMAE={mean_err:.3f}m", fontsize=9)
        ax.axis("off")
    for ax in axes[n:]:  # hide unused grid cells
        ax.axis("off")
    gt_label = self._r.method_labels.get(self._gt, self._gt)
    fig.suptitle(f"Absolute Error vs {gt_label}", fontsize=11, y=1.02)
    fig.tight_layout()
    return self._save(fig, "error_maps.png")


def _fig_coverage_heatmaps(self) -> str:
    """Plot one heatmap per entry of ``coverage_maps`` on a fixed 0..1 scale.

    Each panel title carries the map's mean value as a percentage.
    Returns the saved file name.
    """
    names = list(self._r.coverage_maps.keys())
    n = len(names)
    if n == 0:
        return self._empty_fig("coverage_heatmaps.png", "No coverage data")
    nrows, ncols = self._grid_layout(n)
    fig, axes = plt.subplots(nrows, ncols, figsize=(3.5 * ncols, 3.5 * nrows))
    axes = np.asarray(axes).flatten().tolist()
    for ax, name in zip(axes[:n], names):
        cov = self._r.coverage_maps[name]
        im = ax.imshow(cov, cmap="viridis", vmin=0, vmax=1)
        plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="Valid fraction")
        mean_cov = float(cov.mean()) * 100.0
        label = self._r.method_labels.get(name, name)
        ax.set_title(f"{label}\n{mean_cov:.1f}% mean", fontsize=9)
        ax.axis("off")
    for ax in axes[n:]:  # hide unused grid cells
        ax.axis("off")
    fig.suptitle("Valid Pixel Coverage Heatmap (all frames avg)", fontsize=11, y=1.02)
    fig.tight_layout()
    return self._save(fig, "coverage_heatmaps.png")
def _fig_distance_error_curve(self) -> str:
    """Plot the per-bin mean of ``dist_bin_mae`` for every non-GT method.

    Each method's entry is turned into a 2-D array and averaged column
    by column with NaNs ignored.  Returns the saved file name.
    """
    if not self._non_gt:
        return self._empty_fig("distance_error_curve.png", "No comparison methods")
    fig, ax = plt.subplots(figsize=(8, 5))
    for name in self._non_gt:
        bin_data = self._r.dist_bin_mae.get(name, [])
        if not bin_data:
            continue
        arr = np.array(bin_data)
        # NOTE(review): a bin whose values are all NaN is plotted as 0.0,
        # which reads as "no error" rather than "no data" - confirm intent.
        mean_per_bin = np.array([
            np.nanmean(arr[:, i]) if np.any(~np.isnan(arr[:, i])) else 0.0
            for i in range(arr.shape[1])
        ])
        color = self._r.method_colors.get(name, "#888")
        label = self._r.method_labels.get(name, name)
        ax.plot(BIN_CENTERS, mean_per_bin, marker="o", color=color,
                label=label, linewidth=2, markersize=7)
    ax.set_xticks(BIN_CENTERS)
    ax.set_xticklabels(BIN_LABELS, fontsize=9)
    ax.set_xlabel("Distance range", fontsize=10)
    ax.set_ylabel("Mean Absolute Error (m)", fontsize=10)
    ax.set_title("Depth Error vs Distance", fontsize=12)
    ax.legend(fontsize=9)
    ax.grid(alpha=0.3)
    fig.tight_layout()
    return self._save(fig, "distance_error_curve.png")


def _fig_error_histograms(self) -> str:
    """Plot a histogram of absolute errors per method, pooled over viz frames.

    A method is included when at least one viz frame holds both it and
    the ground truth.  Only pixels positive in both maps contribute.
    Bins cover 0.0..1.0; the dashed line marks the mean of all pooled
    errors.  Returns the saved file name.
    """
    if not self._non_gt or not self._r.viz_frames:
        return self._empty_fig("error_histograms.png", "No comparison data")
    names = [n for n in self._non_gt
             if any(n in vf and self._gt in vf for vf in self._r.viz_frames)]
    if not names:
        return self._empty_fig("error_histograms.png", "No viz data for comparison")
    n = len(names)
    nrows, ncols = self._grid_layout(n)
    fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 4 * nrows))
    axes = np.asarray(axes).flatten().tolist()
    for ax, name in zip(axes, names):
        errors = []
        for vf in self._r.viz_frames:
            if name not in vf or self._gt not in vf:
                continue
            pred, gt = vf[name], vf[self._gt]
            valid = (gt > 0) & (pred > 0)
            if valid.any():
                errors.extend(np.abs(pred[valid] - gt[valid]).tolist())
        if not errors:
            ax.text(0.5, 0.5, "No data", transform=ax.transAxes,
                    ha="center", va="center", color="gray")
            continue
        color = self._r.method_colors.get(name, "#888")
        label = self._r.method_labels.get(name, name)
        ax.hist(errors, bins=50, range=(0.0, 1.0), color=color,
                alpha=0.8, edgecolor="none")
        mean_e = float(np.mean(errors))
        ax.axvline(mean_e, color="red", linestyle="--", linewidth=1.5,
                   label=f"mean={mean_e:.3f}m")
        ax.set_xlabel("Absolute error (m)", fontsize=9)
        ax.set_ylabel("Pixel count", fontsize=9)
        ax.set_title(label, fontsize=9)
        ax.legend(fontsize=8)
    for ax in axes[n:]:  # hide unused grid cells
        ax.axis("off")
    fig.suptitle("Per-Pixel Error Distribution (vs GT, viz frames)", fontsize=11)
    fig.tight_layout()
    return self._save(fig, "error_histograms.png")
def _fig_summary_table(self) -> str:
    """Render the aggregate statistics as a table image.

    The ground-truth row is listed first and shows a dash in the error
    columns.  Returns the saved file name.
    """
    if not self._stats:
        return self._empty_fig("summary_table.png", "No stats")
    cols = ["Method", "MRE* (%)", "MRE (%)", "MAE (m)", "δ1 (%)", "Coverage (%)", "FPS", "GPU %", "GT?"]
    gt_rows, other_rows = [], []
    for name, s in self._stats.items():
        is_gt = (name == self._gt)
        row = [
            s.label,
            "—" if is_gt else f"{s.mre_pen_mean * 100:.1f}",
            "—" if is_gt else f"{s.mre_mean * 100:.1f}",
            "—" if is_gt else f"{s.mae_mean:.4f}",
            "—" if is_gt else f"{s.delta1_mean:.1f}",
            f"{s.coverage_mean:.1f}",
            # an FPS of 999 or more is displayed as the fixed text "≈30"
            f"{s.fps_mean:.1f}" if s.fps_mean < 999 else "≈30",
            f"{s.gpu_load_mean:.0f}" if s.gpu_load_mean > 0 else "—",
            "★ GT" if is_gt else "",
        ]
        (gt_rows if is_gt else other_rows).append((name, row))
    ordered = gt_rows + other_rows
    cell_text = [r for _, r in ordered]
    n = len(ordered)
    fig, ax = plt.subplots(figsize=(13, 1.0 + 0.55 * n))
    ax.axis("off")
    table = ax.table(cellText=cell_text, colLabels=cols,
                     cellLoc="center", loc="center")
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 1.6)
    for (row, col), cell in table.get_celld().items():
        if row == 0:  # header row
            cell.set_facecolor("#2c3e50")
            cell.set_text_props(color="white", fontweight="bold")
        elif row % 2 == 0:  # zebra striping
            cell.set_facecolor("#f7f7f7")
        cell.set_edgecolor("#cccccc")
        # table row k (k > 0) corresponds to cell_text[k - 1]; the GT
        # highlight is applied last so it wins over the striping
        if row > 0 and cell_text[row - 1][-1] == "★ GT":
            cell.set_facecolor("#d5f5d5")
    ax.set_title("Depth Quality Summary", fontsize=12, pad=10, fontweight="bold")
    fig.tight_layout()
    return self._save(fig, "summary_table.png")
def _fig_close_range_analysis(self) -> str:
    """Plot close-range coverage: a bar per method and a per-frame curve.

    Left panel: ``close_range_coverage`` from the stats (0.0 for methods
    without stats).  Right panel: the ``close_range_valid`` series of
    each method against a 1-based frame index.  Returns the saved file
    name.
    """
    names = list(self._r.method_names)
    if not names:
        return self._empty_fig("close_range_analysis.png", "No methods")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    labels = [self._r.method_labels.get(n, n) for n in names]
    coverages = [self._stats[n].close_range_coverage if n in self._stats else 0.0
                 for n in names]
    colors = [self._r.method_colors.get(n, "#888") for n in names]
    bars = ax1.bar(labels, coverages, color=colors, alpha=0.85, edgecolor="white")
    ax1.bar_label(bars, labels=[f"{v:.1f}%" for v in coverages], padding=3, fontsize=7)
    ax1.set_ylabel(f"Coverage at < {CLOSE_RANGE_THRESHOLD_M}m (%)", fontsize=10)
    ax1.set_title(f"Close-Range Coverage (< {CLOSE_RANGE_THRESHOLD_M} m)", fontsize=11)
    ax1.tick_params(axis="x", rotation=45, labelsize=7)
    ax1.set_xticklabels(labels, ha="right")
    ax1.set_ylim(0, 115)  # headroom above 100 for the bar labels
    ax1.grid(axis="y", alpha=0.3)

    for name in names:
        vals = self._r.close_range_valid.get(name, [])
        if not vals:
            continue
        color = self._r.method_colors.get(name, "#888")
        label = self._r.method_labels.get(name, name)
        ax2.plot(range(1, len(vals) + 1), vals, color=color,
                 label=label, alpha=0.8, linewidth=1.5)
    ax2.set_xlabel("Frame", fontsize=10)
    ax2.set_ylabel(f"Coverage at < {CLOSE_RANGE_THRESHOLD_M}m (%)", fontsize=10)
    ax2.set_title("Close-Range Coverage per Frame", fontsize=11)
    ax2.legend(fontsize=6, loc="best")
    ax2.grid(alpha=0.3)
    ax2.set_ylim(-5, 115)

    fig.suptitle("Close-Range Depth Analysis", fontsize=13, fontweight="bold")
    fig.tight_layout()
    return self._save(fig, "close_range_analysis.png")
def _fig_timing_bars(self) -> str:
    """Plot a horizontal bar per method with its mean FPS, capped at 200.

    A dashed reference line is drawn at 30 FPS.  Returns the saved file
    name.
    """
    if not self._stats:
        return self._empty_fig("timing_bars.png", "No timing data")
    names = list(self._stats.keys())
    labels = [self._stats[n].label for n in names]
    # cap so one very large value does not flatten the other bars
    fps_vals = [min(self._stats[n].fps_mean, 200.0) for n in names]
    colors = [self._stats[n].color for n in names]
    fig, ax = plt.subplots(figsize=(8, 1.0 + 0.6 * len(names)))
    bars = ax.barh(labels, fps_vals, color=colors, alpha=0.85, edgecolor="white")
    ax.bar_label(bars, labels=[f"{f:.1f}" for f in fps_vals], padding=4, fontsize=9)
    ax.axvline(30, color="gray", linestyle="--", alpha=0.5, label="30 FPS target")
    ax.set_xlabel("Throughput (FPS)", fontsize=10)
    ax.set_title("Processing Speed by Method", fontsize=11)
    ax.legend(fontsize=8)
    ax.grid(axis="x", alpha=0.3)
    fig.tight_layout()
    return self._save(fig, "timing_bars.png")

# ------------------------------------------------------------------ JSON / HTML

def _write_json(self) -> None:
    """Write run metadata and per-method aggregate stats to ``results.json``."""
    data = {
        "source": self._r.source,
        "n_frames": self._r.n_frames,
        "resolution": f"{self._r.width}x{self._r.height}",
        "ground_truth": self._r.ground_truth_name,
        "distance_bins": BIN_LABELS,
        "model_configs": self._r.method_configs,
        "methods": {
            name: {
                "label": s.label,
                "mae_mean": s.mae_mean,
                "mae_std": s.mae_std,
                "rmse_mean": s.rmse_mean,
                "mre_mean": s.mre_mean,
                "delta1_mean": s.delta1_mean,
                "coverage_mean": s.coverage_mean,
                # an FPS of 1e6 or more is stored as -1
                "fps_mean": s.fps_mean if s.fps_mean < 1e6 else -1,
                "time_ms_mean": s.time_ms_mean,
                "close_range_coverage": s.close_range_coverage,
                "gpu_load_mean": s.gpu_load_mean,
                "is_ground_truth": (name == self._gt),
            }
            for name, s in self._stats.items()
        },
    }
    (self._out / "results.json").write_text(json.dumps(data, indent=2))
(self._out / "results.json").write_text(json.dumps(data, indent=2)) + + def _write_html(self, fig_paths: List[str]) -> None: + import datetime + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + method_list = ", ".join( + self._r.method_labels.get(n, n) for n in self._r.method_names + ) + meta_rows = ( + f"Source{self._r.source}" + f"Frames{self._r.n_frames}" + f"Resolution{self._r.width}×{self._r.height}" + f"Ground truth" + f"{self._r.method_labels.get(self._gt, self._gt)}" + f"Methods evaluated{method_list}" + f"Generated{ts}" + ) + # Model configuration details for NN methods + config_html = "" + if self._r.method_configs: + config_html = '\n
\n

Model Configuration

\n ' + config_html += '\n ' + for name, cfg in self._r.method_configs.items(): + label = self._r.method_labels.get(name, name) + md = cfg.get("max_disp", "—") + vi = cfg.get("valid_iters", "—") + res = cfg.get("engine_resolution", "—") + edir = cfg.get("engine_dir", "—") + config_html += f'\n ' + config_html += '\n
Methodmax_dispvalid_itersEngine ResolutionEngine Dir
{label}{md}{vi}{res}{edir}
\n
' + captions = { + "depth_comparison.png": + "Side-by-side depth maps from a single representative frame. Invalid pixels are black.", + "error_maps.png": + "Per-pixel absolute error |pred − GT| clipped at 0.1 m. Brighter = more error.", + "coverage_heatmaps.png": + "Fraction of frames each pixel has valid depth, averaged over all benchmark frames.", + "distance_error_curve.png": + "Mean Absolute Error (MAE) broken down by distance range.", + "error_histograms.png": + "Distribution of per-pixel absolute errors from the stored visualisation frames.", + "summary_table.png": + "Aggregate quality metrics — see legend below the table for column explanations.", + "close_range_analysis.png": + f"Coverage and stability for objects closer than {CLOSE_RANGE_THRESHOLD_M} m. Highlights MinZ benefit.", + "timing_bars.png": + "Processing speed in FPS. Hardware baseline is fixed at ~30 FPS (camera frame rate).", + } + metric_legend = ( + '\n
' + "\n

How to Read the Summary Table

" + "\n " + "\n " + "\n " + "\n " + "\n " + "\n " + "\n " + '\n ' + "\n " + "\n " + "\n " + "\n " + "\n " + '\n ' + '\n ' + "\n
MRE* (%%)Overall score (recommended). Mean Relative Error with hole penalty — " + "pixels where the method has no depth but ground truth does count as 100%% error. " + "This is the fairest single metric because it penalises both inaccuracy and missing coverage. Lower is better.
MRE (%%)Mean Relative Error over valid pixels only (holes ignored). " + "5%% means each measured pixel is ~5%% off on average. Lower is better.
MAE (m)Mean Absolute Error in meters, valid pixels only. Lower is better.
δ1 (%%)Percentage of valid pixels within 1.25× of ground truth depth. Higher is better. 100%% is perfect.
Coverage (%%)Percentage of pixels that produced valid depth. Higher is better. " + "MinZ improves this at close range (<0.55m) by filling holes the hardware camera cannot see.
FPSProcessing speed (frames per second). Higher is faster.
GT?★ GT marks the ground truth method (NNDepth accurate). Its error columns show "—" because you don\'t compare ground truth to itself.
" + "\n
" + ) + sections = "" + for path in fig_paths: + caption = captions.get(path, "") + title = path.replace(".png", "").replace("_", " ").title() + sections += ( + f'\n
' + f"\n

{title}

" + f'\n
' + f'\n {title}' + f'\n

{caption}

' + f"\n
" + f"\n
" + ) + if path == "summary_table.png": + sections += metric_legend + html = f""" + + + + + Depth Quality Report — rs-enhanced-depth + + + +

Depth Quality Report

+

rs-enhanced-depth — multi-method depth quality analysis

+ +
{meta_rows}
+{config_html} +{sections} +
Generated by tests/depth_report/ on {ts}
+ +""" + (self._out / "index.html").write_text(html) \ No newline at end of file diff --git a/scripts/run_box_detector_fs.py b/scripts/run_box_detector_fs.py new file mode 100644 index 0000000..6165c75 --- /dev/null +++ b/scripts/run_box_detector_fs.py @@ -0,0 +1,2710 @@ +#!/usr/bin/env python + +''' +Tester for multi planar plain detector with foundation stereo +================== + +Using depth image to compute depth planes locally for specific ROI. + + +Usage: + +Environemt : + ..\\docker + +Install : + + + +''' + +import sys +import numpy as np +import cv2 as cv +import random +import unittest +#from scipy.spatial.transform import Rotation as Rot +import matplotlib.pyplot as plt +import logging +log = logging.getLogger("robot") +log.setLevel(logging.DEBUG) +from opencv_realsense_camera import RealSense, draw_str +from run_fast_foundation_with_rs import convert_disparity_to_depth, foundation_stereo_algo_init, foundation_stereo_algo, process_arguments + +#!/usr/bin/env python +# for rotations +from scipy.spatial.transform import Rotation as R + + + + + +#%% Main +class PlaneDetector: + def __init__(self, detect_type = 'p', image_size = (1280,720)): + + self.detect_type = detect_type # plane + + self.frame_size = image_size + self.img = None + self.cam_matrix = np.array([[1000,0,self.frame_size[0]/2],[0,1000,self.frame_size[1]/2],[0,0,1]], dtype = np.float32) + self.cam_distort = np.array([0,0,0,0,0],dtype = np.float32) + + self.img3d = None # contains x,y and depth plains + self.img_xyz = None # comntains X,Y,Z information after depth image to XYZ transform + self.img_mask = None # which pixels belongs to the plain + self.rect = None # roi + self.img_roi = None # roi image + self.img_roi_normal = None # normals at roi image + + # detector type + self.matrix_inv = None # holds inverse params of the + self.rect_z = None # flat z for ROI + self.rect_dir = None # direct u,v,1 for ROI + self.rect_xyz = None # direct u,v,1 multiplied by z ROI + self.full_dir = None # 
def init_image(self, img=None):
    """Bind *img* as the working image and reset the plane mask.

    Sets ``self.frame_size`` to ``(width, height)`` and ``self.img_mask``
    to a ``uint8`` zero array of the image size.

    Returns:
        True on success, False when no image is given (the declared
        default used to crash with AttributeError).
    """
    if img is None:
        return False
    self.img = img
    h, w = img.shape[:2]
    self.frame_size = (w, h)
    self.img_mask = np.zeros((h, w), dtype=np.uint8)
    return True


def _roi_rect(self, roi_type):
    """Return the preset ROI ``[x_left, y_top, x_right, y_bottom]`` for *roi_type*.

    Presets are either absolute pixel rectangles or offsets from the
    image centre derived from ``self.frame_size``.  Unknown types map to
    the whole image.
    """
    w, h = self.frame_size[0], self.frame_size[1]
    w2, h2 = w >> 1, h >> 1
    presets = {
        # small absolute / centred squares
        1: [w2 - 3, h2 - 3, w2 + 3, h2 + 3],
        2: [300, 220, 340, 260],
        3: [280, 200, 360, 280],
        4: [220, 140, 420, 340],
        5: [200, 120, 440, 360],
        # centred windows of growing size
        11: [w2 - 16, h2 - 16, w2 + 16, h2 + 16],
        12: [w2 - 32, h2 - 32, w2 + 32, h2 + 32],
        13: [w2 - 64, h2 - 64, w2 + 64, h2 + 64],
        14: [w2 - 64, h2 - 48, w2 + 64, h2 + 48],
        # lower band: centre, left, right
        21: [w2 - 64, h2 + 128, w2 + 64, h2 + 196],
        22: [w2 - 400, h2 + 128, w2 - 272, h2 + 196],
        23: [w2 + 272, h2 + 128, w2 + 400, h2 + 196],
        # upper band
        31: [w2 + 64, h2 - 128, w2 + 128, h2 - 64],
        32: [w2 - 400, h2 - 196, w2 - 272, h2 - 128],
        33: [w2 + 302, h2 - 196, w2 + 430, h2 - 128],
        # band just below the centre
        41: [w2 - 128, h2 + 64, w2 + 128, h2 + 196],
        42: [w2 - 200, h2 + 32, w2 - 72, h2 + 96],
        43: [w2 + 72, h2 + 32, w2 + 200, h2 + 96],
        # presets tuned for specific recordings (data set 432)
        52: [w2 - 256, h2 - 256, w2 - 128, h2 - 192],
        53: [w2 - 128, h2 + 16, w2 + 128, h2 + 300],
        # data set 422, on and around the cube
        60: [645, 395, 665, 415],
        61: [640, 390, 670, 420],
        62: [690, 390, 720, 420],
        71: [630, h2 + 72, 690, h2 + 128],
        72: [430, h2 - 30, 890, h2 + 228],
        73: [630, h2 + 140, 690, h2 + 190],
        74: [700, h2 + 64, 800, h2 + 164],
        75: [700, h2 + 20, 800, h2 + 96],
    }
    return presets.get(roi_type, [0, 0, w, h])


def init_roi(self, roi_type=1):
    """Select the preset ROI *roi_type*, store it in ``self.rect`` and return it."""
    roi = self._roi_rect(roi_type)
    self.rect = roi
    log.info(f'Using ROI : {roi}')
    return roi


def preprocess(self, img=None):
    """Prepare *img* for plane fitting and return it as ``float32``.

    Lazily initialises the mask, the ROI (preset 4) and the plane-fit
    direction tables when they are still unset.  For a multi-channel
    image only channel index 2 is used; no ROI crop is applied here.

    Returns:
        The ``float32`` image, or False when *img* is None.
    """
    if img is None:
        log.info('No image provided')
        return False

    if self.img_mask is None:
        self.init_image(img)

    if self.rect is None:
        self.init_roi(4)

    # init params of the inverse
    if self.full_dir is None:
        self.fit_plane_init()

    if len(img.shape) > 2:
        return img[:, :, 2].astype(np.float32)
    return img.astype(np.float32)


def init_img3d(self, img=None):
    """Build the per-pixel (u, v, z) table for *img* and cache it in ``self.img3d``.

    ``u`` and ``v`` are the pixel coordinates passed through
    ``cv.undistortPoints`` with the camera matrix and distortion
    coefficients; ``z`` is the image cast to ``float32``.  Also resets
    ``self.img_mask``.
    """
    img = self.img if img is None else img
    h, w = img.shape[:2]
    x, y = np.meshgrid(np.arange(w), np.arange(h))

    # undistortPoints expects an (N, 1, 2) float32 array of pixel positions
    xy = np.hstack((x.reshape(-1, 1), y.reshape(-1, 1)))
    xy = np.expand_dims(xy, axis=1).astype(np.float32)
    xy_undistorted = cv.undistortPoints(xy, self.cam_matrix, self.cam_distort)

    u = xy_undistorted[:, 0, 0].reshape((h, w))
    v = xy_undistorted[:, 0, 1].reshape((h, w))
    z3d = img.astype(np.float32)

    self.img3d = np.stack((u, v, z3d), axis=2)
    # NOTE(review): float mask here, uint8 in init_image - confirm which
    # dtype the consumers of img_mask expect.
    self.img_mask = np.zeros((h, w))
    return self.img3d


def compute_img3d(self, img=None):
    """Convert the depth image *img* to per-pixel XYZ using the cached table.

    For pixels with depth above 15 the cached ``u``/``v`` are multiplied
    by the depth; other pixels keep the unscaled ``u``/``v``.  The result
    is stored in ``self.img_xyz``.

    Returns:
        The ``(h, w, 3)`` array, or None when the image size differs
        from the cached table.
    """
    img = self.img if img is None else img
    xyz = self.img3d
    if xyz is None:
        xyz = self.init_img3d(img)

    if img.shape[:2] != xyz.shape[:2]:
        print('Image dimension change')
        return

    img_xyz = xyz.copy()
    z3d = img.astype(np.float32)
    x3d = xyz[:, :, 0].copy()
    y3d = xyz[:, :, 1].copy()

    # filter bad z values
    valid = z3d > 15
    x3d[valid] = x3d[valid] * z3d[valid]
    y3d[valid] = y3d[valid] * z3d[valid]

    img_xyz[:, :, 0] = x3d
    img_xyz[:, :, 1] = y3d
    img_xyz[:, :, 2] = z3d

    self.img_xyz = img_xyz
    return img_xyz
def convert_plane_params(self, plane_equation):
    """Convert a plane normal into a rotation vector (axis * angle).

    The result describes the rotation that carries the +Z axis onto the
    normalised plane normal.

    Args:
        plane_equation: Array-like whose first three entries are the plane
            normal (A, B, C).  Any further entries are ignored.

    Returns:
        numpy.ndarray | None: ``None`` when the normal has zero length.
        Otherwise the rotation vector: shape ``(3, 1)`` in the general case,
        and shape ``(3,)`` when the normal is (anti-)parallel to Z (zero
        rotation, or a half turn about Y).  The two shapes are kept as they
        were so existing callers see the same layout.
    """
    normal = np.asarray(plane_equation, dtype=np.float64).ravel()[:3]
    normal_norm = np.linalg.norm(normal)
    if normal_norm == 0:
        log.error("Error: Zero norm for plane normal vector.")
        return None
    normal = normal / normal_norm

    z_axis = np.array([0.0, 0.0, 1.0])
    rotation_axis = np.cross(z_axis, normal)
    axis_norm = np.linalg.norm(rotation_axis)   # equals sin(angle)

    # Normal parallel to Z: the cross product vanishes and gives no axis.
    if axis_norm < 1e-6:
        if normal[2] > 0:
            return np.zeros(3)                  # identity rotation
        return np.array([0, np.pi, 0])          # 180 degrees about Y

    # atan2(sin, cos) is well conditioned over the whole [0, pi] range,
    # unlike arccos of a dot product that may drift slightly outside [-1, 1].
    rotation_angle = np.arctan2(axis_norm, np.dot(z_axis, normal))

    # The axis must be a unit vector before it is scaled by the angle;
    # scaling the raw cross product would encode angle * sin(angle).
    rvec = (rotation_axis / axis_norm) * rotation_angle
    return rvec.reshape(3, 1)
def convert_roi_to_points(self, img, point_num=30, step_size=1, roi_rect=None):
    """Convert the valid depth pixels of a ROI into an N x 3 array of XYZ points.

    Each valid pixel's depth multiplies the matching row of ``self.full_dir``
    (one ``[u, v, 1]`` row per frame pixel, row-major with a row stride of
    ``self.frame_size[0]``).  NOTE(review): this assumes ``img`` has the same
    size as the frame used to build ``self.full_dir`` - confirm at call sites.

    Args:
        img: 2-D depth image.
        point_num: Target number of points; valid pixels are decimated with a
            regular stride so that roughly this many remain.
        step_size: Minimum decimation stride (values below 1 are treated as 1).
        roi_rect: ``[x0, y0, x1, y1]``; defaults to ``self.rect``.  The
            rectangle is clipped to the image so that the pixel slice and the
            index grid always describe the same pixels.

    Returns:
        numpy.ndarray: ``(N, 3)`` points, or an empty ``(0, 3)`` array when
        fewer than five pixels in the ROI have a depth greater than 1.

    Side effects:
        Sets ``self.rect_3d`` always, and on success ``self.rect_z``,
        ``self.rect_dir``, ``self.rect_xyz``, ``self.roi_index`` and
        ``self.img_roi``.
    """
    # lazily build the per-pixel direction table
    if self.full_dir is None:
        self.fit_plane_init()

    roi_rect = self.rect if roi_rect is None else roi_rect
    x0, y0, x1, y1 = roi_rect

    # Clip to the image.  Without this a ROI that leaves the image makes
    # img[y0:y1, x0:x1] smaller than the index grid (or wraps around for
    # negative coordinates), pairing depths with the wrong pixel rays.
    img_h, img_w = img.shape[:2]
    x0, x1 = int(np.clip(x0, 0, img_w)), int(np.clip(x1, 0, img_w))
    y0, y1 = int(np.clip(y0, 0, img_h)), int(np.clip(y1, 0, img_h))
    x1, y1 = max(x0, x1), max(y0, y1)

    # rectangle used for display: half extents of the ROI around the origin
    half_h, half_w = (y1 - y0) >> 1, (x1 - x0) >> 1
    self.rect_3d = [[-half_w, -half_h, 0], [half_w, -half_h, 0],
                    [half_w, half_h, 0], [-half_w, half_h, 0],
                    [-half_w, -half_h, 0]]

    # flat frame index of every ROI pixel, in the same order as the flattened slice
    x, y = np.meshgrid(np.arange(x0, x1), np.arange(y0, y1))
    flat_indices = (y * self.frame_size[0] + x).ravel().astype(np.int32)

    roi_pixels = img[y0:y1, x0:x1]
    img_roi = roi_pixels.flatten()
    ii = np.where(img_roi > 1)[0]           # valid pixels in the roi

    valid_point_num = len(ii)
    if valid_point_num < 5:
        return np.zeros((0, 3))

    # decimate to about point_num points; never use a stride below 1
    step = max(1, int(step_size), int(valid_point_num / point_num))
    ii = ii[::step]

    z = img_roi[ii].reshape((-1, 1))
    jj = flat_indices[ii].flatten()
    uv1_matrix = self.full_dir[jj, :]
    xyz_matrix = uv1_matrix[:, :3] * z

    self.rect_z = z
    self.rect_dir = uv1_matrix
    self.rect_xyz = xyz_matrix
    self.roi_index = jj
    self.img_roi = roi_pixels

    return xyz_matrix
point_num - is the target point number" + # x1,y1 = self.img_xyz.shape[:2] + # roi_area = x1*y1 + + # # reduce size of the grid for speed + # if step_size < 1 and roi_area > 100: + # step_size = np.maximum(1,int(np.sqrt(roi_area)/10)) + + + # #roi3d = self.img_xyz[y0:y1:step_size,x0:x1:step_size,:] + # roi3d = self.img_xyz[::step_size,::step_size,:] + # x,y,z = roi3d[:,:,0].reshape((-1,1)), roi3d[:,:,1].reshape((-1,1)), roi3d[:,:,2].reshape((-1,1)) + # xyz_matrix = np.hstack((x,y,z)) + # + + # init params of the inverse + if self.rect_dir is None: + self.fit_plane_init_old() + + # extract roi + + n,m = img_roi.shape[:2] + img_roi = img_roi.reshape((-1,1)) + valid_bool = img_roi > 0 + valid_bool = valid_bool.flatten() + #log.info(f'Timing : 1') + + # all non valid + ii = np.where(valid_bool)[0] + valid_point_num = len(ii) + if valid_point_num < 5: + return None + step_size = np.maximum(step_size, np.int32(valid_point_num/point_num)) + ii = ii[::step_size] + + # plane params - using only valid + z = img_roi[ii] + xyz_matrix = self.rect_dir[ii,:] + xyz_matrix[:,:3] = xyz_matrix[:,:3]*z # keep 1 intact + + # update corners of the rect in 3d + #self.rect_3d = self.rect_dir[self.corner_ind,:]*img_roi[self.corner_ind] + # rect to show + x0, y0, x1, y1 = self.rect + h,w = y1-y0, x1-x0 + self.rect_3d = [[-w,-h,0],[w,-h,0],[w,h,0],[-w,h,0],[-w,-h,0]] + # substract mean + #xyz_center = xyz_matrix[:,:3].mean(axis=0) + #xyz_matrix = xyz_matrix - xyz_center + #log.info(f'Timing : 2') + + # mtrx_dir = np.hstack((self.rect_dir[valid_bool,0]*z,self.rect_dir[valid_bool,1]*z,z*0+1)) + # mtrx_inv = np.linalg.pinv(mtrx_dir) + # #mtrx_inv = self.matrix_inv[:,valid_bool] + # plane_params = np.dot(mtrx_inv,z) + + # decimate to make it run faster reduce size of the grid for speed. 
def fit_plane_svd(self, img_roi):
    """Fit a plane to the ROI points by SVD and report depth mean and fit error.

    The points come from ``self.convert_roi_to_points`` (about 600 of them).
    They are centred, and the plane normal is the right singular vector that
    belongs to the smallest singular value.  The normal is flipped, if
    needed, so that its z component is not negative.

    Args:
        img_roi: Depth image handed on to ``convert_roi_to_points``.

    Returns:
        tuple: ``(img_mean, img_std)`` - the z coordinate of the point
        centroid and the standard deviation of the point-to-plane distances.
        ``(0, 0)`` when fewer than three points are available.

    Side effects:
        Stores the unit normal in ``self.plane_params`` and the centroid in
        ``self.plane_center``.
    """
    # roi converted to points with step size on the grid
    xyz_matrix = self.convert_roi_to_points(img_roi, point_num=600, step_size=1)

    # a plane needs at least three points
    if xyz_matrix.shape[0] < 3:
        log.warning('Not enough points in the ROI')
        return 0, 0

    # subtract the mean
    xyz_center = xyz_matrix[:, :3].mean(axis=0)
    centered = xyz_matrix - xyz_center

    # Reduced SVD: only Vh is needed, so the N x N matrix U is not built.
    # Singular values are returned in descending order, hence the last row
    # of Vh is the direction of least variance - the plane normal.
    _, _, vh = np.linalg.svd(centered, full_matrices=False)
    plane_params = vh[-1, :]

    # keep orientation; an explicit flip (instead of multiplying by sign())
    # leaves a normal with a zero z component intact
    if plane_params[2] < 0:
        plane_params = -plane_params

    # signed distances of the points to the fitted plane
    err = np.dot(centered, plane_params)

    img_mean = xyz_center[2]
    img_std = err.std()
    self.plane_params = plane_params[:3].flatten()
    self.plane_center = xyz_center.flatten()

    return img_mean, img_std
if self.matrix_inv is None: + # self.fit_plane_init() + + # # plane params - using only valid + # z = img_roi[valid_bool] + # xyz_matrix = self.rect_dir[valid_bool,:] + # xyz_matrix[:,:3] = xyz_matrix[:,:3]*z # keep 1 intact + + # update corners of the rect in 3d + #self.rect_3d = self.rect_dir[self.corner_ind,:]*img_roi[self.corner_ind] + + # roi converted to points with step size on the grid + #xyz_matrix = self.convert_roi_to_points(img_roi, point_num = 1e4, step_size = 1) + xyz_matrix = self.convert_roi_to_points_old(img_roi, point_num = 1e4, step_size = 1) + + + # some problem with points + if xyz_matrix.shape[0] < 2: + log.warning('Not enough points in the ROI') + return 0, 0 + + # substract mean + xyz_center = xyz_matrix[:,:3].mean(axis=0) + xyz_matrix = xyz_matrix - xyz_center + #log.info(f'Timing : 2') + + # mtrx_dir = np.hstack((self.rect_dir[valid_bool,0]*z,self.rect_dir[valid_bool,1]*z,z*0+1)) + # mtrx_inv = np.linalg.pinv(mtrx_dir) + # #mtrx_inv = self.matrix_inv[:,valid_bool] + # plane_params = np.dot(mtrx_inv,z) + + # decimate to make it run faster reduce size of the grid for speed. 
1000 pix - 30x30 - step 1, 10000 pix - step=3 + #roi_area = n*m + #step_size = int(np.sqrt(roi_area)/7) if roi_area > 1000 else 1 + + # using svd to make the fit + U, S, Vh = np.linalg.svd(xyz_matrix, full_matrices=True) + ii = np.argmin(S) + vnorm = Vh[ii,:] + #log.info(f'Timing : 3') + + # keep orientation + plane_params = vnorm*np.sign(vnorm[2]) + + # estimate error + err = np.dot(xyz_matrix,plane_params) + #z_est = z + err + xyz_center[2] + + img_mean = xyz_center[2] #z_est.mean() + img_std = err.std() + self.plane_params = plane_params[:3].flatten() + self.plane_center = xyz_center.flatten() + + #log.info(f'Plane : {self.plane_params}, error {img_std:.3f}, step {step_size}') + + return img_mean, img_std + + def fit_plane_svd_weighted(self, img_roi): + "estimates mean and std of the plane fit - fit is weighted assuming Sigma(z) = a*z" + # roi converted to points with step size on the grid + xyz_matrix = self.convert_roi_to_points(img_roi, point_num = 500, step_size = 1) + + # some problem with points + if xyz_matrix.shape[0] < 2: + log.warning('Not enough points in the ROI') + return 0, 0 + + # # substract mean + # xyz_center = xyz_matrix[:,:3].mean(axis=0) + # xyz_matrix = xyz_matrix - xyz_center + + # plane params - using only valid + z = self.rect_z + uv1_matrix = self.rect_dir # (x - x0)/f, (y - y0)/f, 1 + + # minimization function min |ua/f + vb/f + c + dz|^2/|z|^2 + f = self.cam_matrix[0,0] + # uvf_matrix = np.dot(uv1_matrix , np.diag([1/f,1/f,1])) + + mtrx_inv = np.linalg.pinv(uv1_matrix) + b = f/z + plane_params = np.dot(mtrx_inv,b) + plane_params = plane_params/np.linalg.norm(plane_params) + + # center + xyz_matrix = self.rect_xyz #uv1_matrix[:,:3]*z # keep 1 intact + xyz_center = xyz_matrix.mean(axis=0) + + # estimate error + err = np.dot(uv1_matrix,plane_params) + z_est = z - err + + img_mean = z_est.mean() + img_std = err.std() + self.plane_params = plane_params[:3].flatten() + self.plane_center = xyz_center.flatten() + + log.info(f'Plane : 
def fit_plane_ransac(self, img_roi):
    """Estimate the ROI plane normal with RANSAC.

    Three random points are drawn per iteration, the plane through them is
    built, and the plane with the most points closer than ``thresh`` wins.
    The centroid and the reported error are computed from *all* ROI points
    (not only the inliers), as before.

    Args:
        img_roi: Depth image handed on to ``self.convert_roi_to_points``.

    Returns:
        tuple: ``(img_mean, img_std)`` - z of the point centroid and the
        standard deviation of the point-to-plane distances.  ``(0, 0)`` when
        there are fewer than three points or no usable sample was drawn.

    Side effects:
        Stores the unit normal in ``self.plane_params`` and the centroid in
        ``self.plane_center``.
    """
    # roi converted to points with step size on the grid
    xyz_matrix = self.convert_roi_to_points(img_roi, point_num=250, step_size=1)

    # The converter signals "no data" with an empty array, so test the
    # point count as well as None.
    if xyz_matrix is None or xyz_matrix.shape[0] < 3:
        log.error('No points in the ROI')
        return 0, 0

    thresh = 1.05           # inlier distance, in the units of the depth data
    max_iteration = 100

    n_points = xyz_matrix.shape[0]
    best_normal = None
    best_count = 0

    for _ in range(max_iteration):
        # plane through three random points
        id_samples = random.sample(range(n_points), 3)
        pt_samples = xyz_matrix[id_samples, :]
        vec_a = pt_samples[1, :] - pt_samples[0, :]
        vec_b = pt_samples[2, :] - pt_samples[0, :]
        normal = np.cross(vec_a, vec_b)
        normal_norm = np.linalg.norm(normal)

        # protect from closely spaced or collinear points
        if normal_norm < 10e-6:
            continue
        normal = normal / normal_norm

        # Make the z direction non-negative.  An explicit flip is used
        # because multiplying by sign(z) turns a normal with z == 0 into a
        # zero vector, for which every point would count as an inlier.
        if normal[2] < 0:
            normal = -normal

        # plane: normal . p + k = 0, with k taken from one sample point
        k = -np.dot(normal, pt_samples[1, :])
        dist_pt = np.dot(xyz_matrix, normal) + k

        inlier_count = int(np.count_nonzero(np.abs(dist_pt) <= thresh))
        if inlier_count > best_count:
            best_count = inlier_count
            best_normal = normal

    if best_normal is None:
        log.error('RANSAC did not find a valid plane')
        return 0, 0

    # transform to pose output
    tvec = xyz_matrix.mean(axis=0)
    pts_centered = xyz_matrix - tvec

    # checking error
    err = np.dot(pts_centered, best_normal)
    err_std = err.std()
    log.info('Fit error ransac: %s' % str(err_std))

    img_mean = tvec[2]
    img_std = err_std
    self.plane_params = best_normal.flatten()
    self.plane_center = tvec.flatten()

    return img_mean, img_std
Calculate Derivatives using Sobel Operator (Gradient) + # The kernel size 'ksize=1' is often preferred for depth maps as it corresponds + # to a 3x1 or 1x3 kernel, providing a close approximation of the derivative. + ksize = 1 + + # Calculate dz/du (gradient in X/horizontal direction) + # dx=1, dy=0 + grad_x = cv.Sobel(depth_map, cv.CV_32F, 1, 0, ksize=ksize, borderType=cv.BORDER_DEFAULT) + + # Calculate dz/dv (gradient in Y/vertical direction) + # dx=0, dy=1 + grad_y = cv.Sobel(depth_map, cv.CV_32F, 0, 1, ksize=ksize, borderType=cv.BORDER_DEFAULT) + + # 3. Construct the Normal Vector Components + # The normal vector is proportional to n = (-dz/du, -dz/dv, 1) + + # Reshape the gradients to (H, W, 1) for stacking + grad_x = grad_x[:, :, np.newaxis] + grad_y = grad_y[:, :, np.newaxis] + + # Create the 'z' component of the direction vector, which is always 1 + # np.ones_like creates an array with the same shape and type as the gradient arrays + z_component = np.ones_like(grad_x) + + # Stack the components to create the direction vector (H, W, 3) + # The X and Y gradients are negated: -dz/du and -dz/dv + direction_vectors = np.concatenate((-grad_x, -grad_y, z_component), axis=2) + + # 4. Normalize the Direction Vectors + # Calculate the magnitude (Euclidean norm) of each (nx, ny, nz) vector + # axis=2 computes the norm across the 3 channels + magnitude = np.linalg.norm(direction_vectors, axis=2, keepdims=True) + + # Use np.divide and np.where to prevent division by zero for magnitude=0 + # Set normals to (0, 0, 0) or another placeholder where magnitude is zero (flat or invalid depth) + normals = np.divide(direction_vectors, magnitude, out=np.zeros_like(direction_vectors), where=magnitude != 0) + + return normals + + def estimate_normals_using_box_filters(self,img_roi): + """ + Estimates the surface normal vector for each pixel in a depth map + using the image gradient (Sobel operator). 
def fit_plane_using_gradients(self, img_full, roi_rect=None):
    """Estimate the ROI plane normal from per-pixel depth gradients.

    The ROI is extracted with ``self.convert_roi_to_points``; per-pixel
    normals of the ROI depth patch come from
    ``self.estimate_normals_from_depth_map`` and are averaged into a single
    unit normal.  The fit error is the spread of the ROI points along that
    normal.

    Args:
        img_full: Full depth image.
        roi_rect: Optional ``[x0, y0, x1, y1]``; passed through to
            ``convert_roi_to_points``.

    Returns:
        tuple: ``(img_mean, img_std)`` - z of the ROI point centroid and the
        standard deviation of the point-to-plane distances.  ``(0, 0)`` when
        there is no ROI patch or fewer than two points.

    Side effects:
        Sets ``self.plane_params`` (unit normal), ``self.plane_center`` and
        ``self.img_roi_normal`` (the per-pixel normal image, kept for debug
        and display).
    """
    # roi converted to points with step size on the grid
    xyz_roi = self.convert_roi_to_points(img_full, point_num=500, step_size=1, roi_rect=roi_rect)
    if self.img_roi is None:
        log.error('No ROI in image')
        return 0, 0

    # Bail out before doing any gradient work when the ROI has no usable points.
    if xyz_roi.shape[0] < 2:
        log.warning('Not enough points in the ROI')
        return 0, 0

    img_roi = self.img_roi.astype(np.float32)
    img_normal = self.estimate_normals_from_depth_map(img_roi)

    # Average normal.  The mean of unit vectors is shorter than one unless
    # all of them agree, so it is renormalised; otherwise the distances
    # below (and any later use of plane_params) would be scaled down.
    plane_params = img_normal.mean(axis=(0, 1))
    mean_norm = np.linalg.norm(plane_params)
    if mean_norm > 0:
        plane_params = plane_params / mean_norm

    # estimate error
    xyz_center = xyz_roi[:, :3].mean(axis=0)
    xyz_matrix = xyz_roi - xyz_center
    err = np.dot(xyz_matrix, plane_params)

    img_mean = xyz_center[2]
    img_std = err.std()
    self.plane_params = plane_params[:3].flatten()
    self.plane_center = xyz_center.flatten()

    self.img_roi_normal = img_normal    # save for debug and display

    log.info(f'Plane : {self.plane_params}, error {img_std:.3f}')

    return img_mean, img_std
2: + img_full = img_full[:,:,2].astype(np.float32) + + # start from the original ROI + if self.img_mask is None: + isOk = self.init_image(img_full) + + #img_mean, img_std = self.fit_plane_ransac(img_full) + + # make sure that mask is not empty - initial rectangle + x0, y0, x1, y1 = self.rect + self.img_mask[y0:y1,x0:x1] = 1 + + # grow the mask + y,x = np.where(self.img_mask > 0.7) + y_min, y_max = y.min(), y.max() + x_min, x_max = x.min(), x.max() + y_min, y_max = np.maximum(0,y_min-1), np.minimum(self.img_mask.shape[0],y_max+2) + x_min, x_max = np.maximum(0,x_min-1), np.minimum(self.img_mask.shape[1],x_max+2) + + # extract ROI + roi_rect = [x_min, y_min, x_max, y_max] + #img_roi = img_full[y_min:y_max,x_min:x_max].astype(np.float32) + xyz_matrix = self.convert_roi_to_points(img_full, point_num = 5000, step_size = 1, roi_rect = roi_rect) + + # check against the plane : do not substract plane.center from all the points + vecC = self.plane_params[:3] + dist_offset = np.dot(self.plane_center, vecC) + dist_pt = np.dot(xyz_matrix, vecC) - dist_offset + + # Select indexes where distance is biggers than the threshold + thresh = 3.5 + err = np.abs(dist_pt) + i2 = np.where( err <= thresh)[0] + + # transfer xi,yi coordinates to the original image index + ii = self.roi_index[i2] # convert to 2D index + + # update mask according to the valid pixels + self.img_mask = 0.95*self.img_mask + self.img_mask.flat[ii] = self.img_mask.flat[ii] + 0.5*(1 - self.img_mask.flat[ii]) + + + # position in 2d array + # unravel_index(a.argmax(), a.shape) + + # output + img_std = err.std() + img_mean = xyz_matrix[i2].mean(axis=0)[2] + + + return img_mean, img_std + + def fit_and_split_roi_recursively(self, roi, level = 0): + # splits ROI on 4 regions and recursevly call + x0,y0,x1,y1 = roi + #roi3d = self.img_xyz[y0:y1,x0:x1,:] + log.info('Processing level %d, region x = %d, y = %d' %(level,x0,y0)) + # check the current fit + roi_params_f = self.fit_plane(roi) + roi_params_ret = [roi_params_f] + 
if roi_params_f['error'] < self.MIN_STD_ERROR: + log.info('Fit is good enough x = %d, y = %d' %(x0,y0)) + return roi_params_ret + + # too small exit + xs, ys = int((x1 + x0)/2), int((y1 + y0)/2) + if (xs - x0) < self.MIN_SPLIT_SIZE or (ys - y0) < self.MIN_SPLIT_SIZE: + log.info('Min size is reached x = %d, y = %d' %(x0,y0)) + return roi_params_ret + + # 4 ROIs - accept the split if error of one of them is lower from the total + roi_params_list = [] + roi_split = [[x0,y0,xs,ys],[x0,ys,xs,y1],[xs,y0,x1,ys],[xs,ys,x1,y1]] + for roi_s in roi_split: + roi_params_prev = self.fit_and_split_roi_recursively(roi_s, level + 1) + # save locally + #roi_params_list.append(roi_params_prev) + roi_params_list = roi_params_list + roi_params_prev + + # extract each of the below and check the error + makeTheSplit = False + for roi_params_s in roi_params_list: + #roi_params_s = roi_params_prev[-1] + # accept the split if twice lower (if noise of 4 split should be 2) + if roi_params_s['error'] < roi_params_f['error']/2: + makeTheSplit = True + break + + # decide what to return + if makeTheSplit: + roi_params_ret = roi_params_list + log.info('Split at level %d, region x = %d, y = %d' %(level,x0,y0)) + else: + log.info('No split level %d, region x = %d, y = %d' %(level,x0,y0)) + + return roi_params_ret + + def fit_plane_svd_weighted_4d(self, img_roi): + "estimates weighted plane fit using weight inversely proportional to the depth" + + # roi converted to points with step size on the grid + xyz_matrix = self.convert_roi_to_points(img_roi, point_num = 350, step_size = 1) + + # some problem with points + if xyz_matrix.shape[0] < 2: + log.warning('Not enough points in the ROI') + return 0, 0 + + # form matrix [u,v,f,zf] result of sigma(z) = alpha*z + if self.roi_index is None: + log.error('No ROI index found') + return 0,0 + + # form matrix for svd + f = self.cam_matrix[0,0] + # uv1z_matrix = np.hstack((self.full_dir[self.roi_index,:],xyz_matrix[:,2].reshape((-1,1)))) + # uv1z_matrix[:,2:3] 
def fit_box(self):
    """Find the largest contour of the mask and its convex hull.

    ``self.img_mask`` is binarised first: the grow steps in this class turn
    the mask into a float image, while ``cv.findContours`` needs an 8-bit
    single-channel input.  The 0.7 level is the same membership level that
    ``fit_plane_ransac_and_grow`` applies to the mask; for an integer 0/1
    mask it simply selects the non-zero pixels.

    Returns:
        bool: ``True`` when a contour was found (``self.contour`` and
        ``self.hull`` are set), ``False`` when there is no mask or the mask
        is empty (both attributes stay ``None``).
    """
    self.contour = None
    self.hull = None

    if self.img_mask is None:
        return False

    mask = (np.asarray(self.img_mask) > 0.7).astype(np.uint8)

    # findContours returns (contours, hierarchy) in OpenCV 4 and
    # (image, contours, hierarchy) in OpenCV 3; [-2] is the contour list in both.
    contours = cv.findContours(mask, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        return False

    # the contour with the most vertices (first one on ties)
    cnt = max(contours, key=len)

    self.contour = cnt
    self.hull = cv.convexHull(cnt)
    return True
def growingStep(self, depths:np.ndarray, step:int, direction:str, mean:float, std:float,
                left:int, top:int, right:int, bottom:int, numConfInt:int=3, testRatio:float=0.95):
    """
    Test whether growing the segment by ``step`` pixels in ``direction`` is acceptable.

    The segment covers rows ``bottom..top`` and columns ``left..right`` (all
    inclusive), i.e. ``bottom`` is the smallest row index and ``top`` the largest.
    The candidate strip is the ``step``-wide band adjacent to the segment on the
    requested side. Growing is accepted when the fraction of strip pixels that are
    non-zero and closer to ``mean`` than ``numConfInt * std`` exceeds ``testRatio``.

    :param depths: uncropped depth image (2-D); zero marks an invalid pixel
    :param step: step size to test
    :param direction: growing direction. should be either: 'left', 'right', 'top' or 'bottom'
    :param mean: the current mean depth of the segment
    :param std: the current standard deviation of the segment's depths
    :param left: the current left bound of the segment
    :param top: the current top bound of the segment
    :param right: the current right bound of the segment
    :param bottom: the current bottom bound of the segment
    :param numConfInt: number of confidence intervals threshold
    :param testRatio: ratio of elements in growing area to be passed the confidence interval threshold
    :return: true if growing is accepted, otherwise false
    :raises ValueError: on an unknown direction, negative step, numConfInt < 1
                        or testRatio outside [0, 1]
    """
    if direction not in ('left', 'right', 'bottom', 'top'):
        raise ValueError('direction must be one of "left", "right", "bottom", "top"')

    if step < 0:
        raise ValueError('step must be non-negative')

    if numConfInt < 1:
        raise ValueError('numConfInt must be at least 1')

    if testRatio > 1 or testRatio < 0:
        raise ValueError('testRatio must be between 0 and 1')

    # a zero step has no strip to test
    if step == 0:
        return False

    # half-open [start, stop) row / column ranges of the candidate strip;
    # every strip spans the full inclusive extent of the segment on the other axis
    if direction == 'left':
        rows, cols = (bottom, top + 1), (left - step, left)
    elif direction == 'right':
        rows, cols = (bottom, top + 1), (right + 1, right + step + 1)
    elif direction == 'top':
        rows, cols = (top + 1, top + step + 1), (left, right + 1)
    else:  # 'bottom'
        rows, cols = (bottom - step, bottom), (left, right + 1)

    # a strip leaving the image cannot be accepted (a negative start would
    # silently wrap around in numpy slicing)
    height, width = depths.shape[:2]
    if rows[0] < 0 or cols[0] < 0 or rows[1] > height or cols[1] > width:
        return False

    strip = depths[rows[0]:rows[1], cols[0]:cols[1]]
    if strip.size == 0:
        return False

    valid = strip[strip != 0]  # zero depth is treated as "no measurement"
    deviation = np.abs(valid - mean)
    if std > 0:
        passed = np.count_nonzero(deviation < numConfInt * std)
    else:
        # perfectly flat segment: only values equal to the mean can belong to it
        passed = np.count_nonzero(deviation == 0)

    # invalid (zero) pixels count against the strip: the ratio is over all strip pixels
    return bool(passed / strip.size > testRatio)

def findMaxROI(self, depths:np.ndarray, initialRoi) -> tuple[int, int, int, int]:
    """
    Find maximum ROI in depth image using region growing from an initial ROI.

    Each side keeps its own step: the step doubles after an accepted growth,
    falls back to 1 after a rejected one, and becomes 0 (side finished) when a
    single-pixel growth is rejected or the image border is reached. The mean and
    standard deviation of the non-zero depths are recomputed once per sweep.

    :param depths: entire depth image (2-D)
    :param initialRoi: initial ROI to grow from as list/tuple in the order: left, top, right, bottom
                       (inclusive bounds, ``bottom`` <= ``top`` in row indices)
    :return: final ROI found as a tuple in the order: left, top, right, bottom
    """
    height, width = depths.shape
    left, top, right, bottom = initialRoi
    steps = {'left': 1, 'right': 1, 'bottom': 1, 'top': 1}  # initializing growing steps

    while any(step > 0 for step in steps.values()):
        region = depths[bottom:top + 1, left:right + 1]
        valid = region[region != 0]
        if valid.size == 0:
            # no valid depth inside the ROI: no statistics to grow from
            break
        mean, std = valid.mean(), valid.std()

        for direction in ('left', 'right', 'bottom', 'top'):
            step = steps[direction]
            if step <= 0:
                continue

            # number of pixels left between this bound and the image border;
            # bounds are inclusive, so the last valid index is width-1 / height-1
            room = {
                'left': left,
                'right': width - 1 - right,
                'bottom': bottom,
                'top': height - 1 - top,
            }[direction]

            accepted = step <= room and self.growingStep(
                depths, step, direction, mean, std, left, top, right, bottom)

            if not accepted:
                # retry with the smallest step, or give up on this side
                steps[direction] = 1 if step > 1 else 0
                continue

            if direction == 'left':
                left -= step
            elif direction == 'right':
                right += step
            elif direction == 'bottom':
                bottom -= step
            else:
                top += step
            steps[direction] = step * 2  # increasing growing step for next iteration

    return left, top, right, bottom
def process_frame(self, img):
    """Process one frame end to end and return the pose of the fitted plane.

    Runs, in order: ``preprocess``, ``init_img3d``, ``compute_img3d``,
    ``fit_plane_with_outliers`` and ``convert_plane_params_to_pose``.
    """
    roi_image = self.preprocess(img)

    # these calls are kept for whatever state they update; their return
    # values are not used by this method
    self.init_img3d(roi_image)
    self.compute_img3d(roi_image)
    _roi_mean, _roi_std = self.fit_plane_with_outliers(roi_image)

    return self.convert_plane_params_to_pose()
def draw_polygon(img, rvec, tvec, cam_mtrx, cam_dist, points3d):
    """Project a 3-D polygon into the image and draw its closed outline.

    :param img: image to draw on
    :param rvec: rotation vector passed to ``cv.projectPoints``
    :param tvec: translation vector passed to ``cv.projectPoints``
    :param cam_mtrx: camera matrix
    :param cam_dist: camera distortion coefficients
    :param points3d: polygon vertices, anything reshapeable to (N, 3); unit is mm
    :return: the image with the polygon drawn
    """
    points = np.float32(points3d).reshape(-1, 3)

    # same float32 normalisation of the pose as draw_axis
    rvec = np.asarray(rvec, dtype=np.float32)
    tvec = np.asarray(tvec, dtype=np.float32)
    projected, _ = cv.projectPoints(points, rvec, tvec, cam_mtrx, cam_dist)

    # projectPoints returns (N, 1, 2); reshape instead of squeeze so a single
    # vertex stays (1, 2) rather than collapsing to (2,)
    polygon_points = projected.reshape(-1, 2).astype(np.int32)
    img = cv.polylines(img, [polygon_points], True, (0, 200, 200), 1)

    # To fill the polygon instead: cv.fillPoly(img, [polygon_points], color)
    return img
class DataGen:
    """Generates synthetic depth images (or loads image files) and test ROIs.

    ``frame_size`` is kept as ``(width, height)`` throughout.
    """

    # img_type -> (file name, load as grayscale)
    _IMAGE_FILES = {
        11: (r"C:\Users\udubin\Documents\Code\opencv-4x\samples\data\left04.jpg", False),  # chess board
        12: ('image_scl_001.png', True),
        13: (r"wrappers\python\applications\planes\data\image_ddd_000.png", True),
        21: (r"C:\Data\Depth\Plane\image_scl_000.png", True),
    }

    def __init__(self, img_size = (640,480)):
        self.frame_size = img_size  # (width, height)
        self.img = None
        self.rect = None # roi

    def add_noise(self, img_gray, noise_percentage = 0.01):
        """Add impulse noise of +/-10 to a random subset of pixels.

        :param img_gray: input image (returned untouched when the percentage is below 0.001)
        :param noise_percentage: fraction of the pixel count used as number of noise samples;
                                 indices are drawn with replacement, so fewer distinct pixels
                                 may be changed
        :return: the input itself (no noise) or a noised copy
        """
        if noise_percentage < 0.001:
            return img_gray

        # number of noise samples derived from the image size
        img_size = img_gray.size
        noise_size = int(noise_percentage*img_size)

        # random flat indices and a random +/-10 offset for each of them
        random_indices = np.random.choice(img_size, noise_size)
        noise = np.random.choice([-10, 10], noise_size)

        # work on a copy so the caller's image is not modified
        img_noised = img_gray.copy()
        img_noised.flat[random_indices] += noise

        log.info('adding image noise')
        return img_noised

    def _synthetic_image(self, img_type, w, h):
        """Return the synthetic (h, w) image for ``img_type`` or None for an unknown type."""
        if img_type == 1: # /
            return np.tile(np.linspace(100, 300, w), (h,1))
        if img_type == 2: # /|/
            return np.tile(np.linspace(100, 200, int(w/2)), (h,2))
        if img_type == 3: # |_|
            return np.tile(np.linspace(100, 200, h).reshape((-1,1)), (1,w))
        if img_type == 4: # /\
            return np.tile(np.hstack((np.linspace(300, 500, w>>1),np.linspace(500, 300, w>>1))), (h,1))
        if img_type == 10: # flat
            return np.ones((h,w))*500
        if img_type not in (5, 6, 7, 8):
            return None

        x,y = np.meshgrid(np.arange(w),np.arange(h))
        if img_type == 5: # dome
            return (np.abs(x - w/2) + np.abs(y - h/2))/10 + 200 # less slope
        if img_type == 6: # sphere
            return np.sqrt((x - w/2)**2 + (y - h/2)**2)/10 + 200 # less slope
        if img_type == 7: # stair
            return (np.sign(x - w/2) + np.sign(y - h/2))*5 + 200 # less slope

        # img_type == 8 : corner
        img = np.ones((h,w))*250
        img[np.logical_and((x - w/2) < 0, (y - h/2) < 0)] = 230 # quarter
        return img

    def init_image(self, img_type = 1):
        """Create (types 1-8, 10) or load (types 11, 12, 13, 21) a test image.

        An unknown ``img_type`` leaves a previously created image in place.

        :param img_type: selector of the image to create/load
        :return: the image, also stored in ``self.img``
        :raises ValueError: when no image is available (unknown type on first
                            use, or the image file could not be read)
        """
        w,h = self.frame_size
        if img_type in self._IMAGE_FILES:
            fname, grayscale = self._IMAGE_FILES[img_type]
            self.img = cv.imread(fname, cv.IMREAD_GRAYSCALE) if grayscale else cv.imread(fname)
        else:
            synthetic = self._synthetic_image(img_type, w, h)
            if synthetic is not None:
                self.img = synthetic

        # imread returns None for an unreadable file; fail with a clear message
        # instead of an AttributeError on None further down
        if self.img is None:
            raise ValueError(f'init_image: no image available for img_type={img_type}')

        self.img = self.add_noise(self.img, 0)

        # keep the (width, height) convention; shape[:2] is (rows, cols) and
        # storing it directly transposed the image on the next call
        self.frame_size = (self.img.shape[1], self.img.shape[0])
        return self.img

    def init_roi(self, test_type = 1):
        """Return the ROI of a test case as ``[xlu, ylu, xrb, yrb]``.

        Unknown test types give the full frame.
        """
        rois = {
            1: [310,230,330,250],
            2: [300,220,340,260],
            3: [280,200,360,280],
            4: [220,140,420,340],
            5: [200,120,440,360],  # was a second, unreachable "test_type == 4" branch
        }
        full_frame = [0,0,self.frame_size[0],self.frame_size[1]]
        return rois.get(test_type, full_frame)

    def test_image(self):
        "test single image depth"
        img = self.init_image(1)
        roi = self.init_roi(1)
def show_image_with_axis(self, img, poses = None):
    """Draw one coordinate axis per pose on the image and show it.

    Blocks in ``cv.waitKey()`` until a key is pressed.

    :param img: image to show; a single-channel image is colour mapped first
    :param poses: sequence of poses; elements 0-2 of each are used as translation
                  and elements 3-5 as 'xyz' Euler angles in degrees.
                  None is treated as an empty list.
    """
    # None instead of a shared mutable [] default
    poses = [] if poses is None else poses
    if len(poses) < 1:
        log.error('No poses found')

    # deal with black and white
    img_show = np.uint8(img)
    if len(img.shape) < 3:
        img_show = cv.applyColorMap(img_show, cv.COLORMAP_JET)

    for pose in poses:
        euler_angles = pose[3:6]  # orientation in degrees
        rvec = Rot.from_euler('xyz', euler_angles, degrees=True).as_rotvec()
        tvec = np.array(pose[:3])  # center of the patch
        img_show = draw_axis(img_show, rvec, tvec, self.cam_matrix, self.cam_distort, len = 10)

    cv.imshow('Image & Axis', img_show)
    log.info('show done')
    cv.waitKey()
def show_points_3d_with_normal(self, img3d, pose = None):
    """Scatter-plot 3-D points and, optionally, a normal vector in red.

    :param img3d: points as an (N, k) matrix or an (H, W, k) image with k >= 3;
                  the first three components are plotted as X, Y, Z
    :param pose: optional vector; elements 0-2 are the start point of the
                 normal and elements 3-5 its direction (drawn 10 units long
                 for a unit direction)
    """
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    # flatten an (H, W, k) image to a point list; an (N, k) matrix is unchanged
    points = np.asarray(img3d)
    points = points.reshape(-1, points.shape[-1])
    ax.scatter(points[:, 0], points[:, 1], points[:, 2], marker='.')

    if pose is not None:
        pose = np.asarray(pose).flatten()
        origin = pose[:3]
        vnorm = pose[3:6]*10
        ax.plot([origin[0], origin[0]+vnorm[0]],
                [origin[1], origin[1]+vnorm[1]],
                [origin[2], origin[2]+vnorm[2]], 'r', label='Normal')

    ax.set_xlabel('X [mm]')
    ax.set_ylabel('Y [mm]')
    ax.set_zlabel('Z [mm]')
    ax.set_aspect('equal', 'box')
    plt.show()
def show_3d_point_cloud(self):
    """Display the ROI point cloud (``self.rect_xyz``) and refresh it on later calls.

    The first call creates the figure, axes and scatter artist and caches them
    in ``self.show_dict``; later calls only move the points of the cached artist.
    Does nothing while ``self.rect`` or ``self.rect_xyz`` is not set.
    """
    if self.rect is None or self.rect_xyz is None:
        return

    X = self.rect_xyz[:,:3]
    if 'line' not in self.show_dict:
        # one figure per ROI, numbered from the ROI corner
        fig_num = int(self.rect[0]+self.rect[1])
        fig = plt.figure(fig_num)
        plt.clf()

        # fig.gca(projection=...) is not available in current Matplotlib;
        # the figure was just cleared, so adding the 3-D axes is equivalent
        ax = fig.add_subplot(projection='3d')
        fig.tight_layout()
        fig.suptitle(f'ROI : {self.rect[0]},{self.rect[1]}')
        ax.set_proj_type('ortho')
        points = ax.scatter(X[:,0], X[:,1], X[:,2])

        # transparent panes
        ax.xaxis.set_pane_color((0.0, 0.0, 0.0, 0.0))
        ax.yaxis.set_pane_color((0.0, 0.0, 0.0, 0.0))
        ax.zaxis.set_pane_color((0.0, 0.0, 0.0, 0.0))

        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.set_zlabel('z')
        ax.invert_yaxis()
        ax.view_init(elev=-70, azim=-90)

        # limits are fixed from the first point cloud only
        ax.set_xlim(X[:,0].min()*0.8, X[:,0].max()*1.2)
        ax.set_ylim(X[:,1].min()*0.8, X[:,1].max()*1.2)
        ax.set_zlim(X[:,2].min()*0.9, X[:,2].max()*1.2)
        plt.ion()
        plt.show(block = False)
        self.show_dict = {'fig':fig, 'ax':ax, 'line':points}
    else:
        # NOTE(review): _offsets3d is a private Matplotlib attribute - confirm
        # it still exists when upgrading Matplotlib
        self.show_dict['line']._offsets3d = (X[:,0], X[:,1], X[:,2])

    self.show_dict['fig'].canvas.draw_idle()
    self.show_dict['fig'].canvas.flush_events()
    return
def show_mask(self, img):
    """Paint the pixels selected by ``self.img_mask`` with ``self.color_mask``.

    :param img: image to draw on; a single-channel image is colour mapped first
    :return: uint8 image with every pixel whose mask value exceeds 0.75
             recoloured; returned without overlay when there is no mask or
             the mask size differs from the image size
    """
    # deal with black and white
    img_show = np.uint8(img)
    if len(img.shape) < 3:
        img_show = cv.applyColorMap(img_show, cv.COLORMAP_JET)

    # no mask yet - nothing to overlay
    if self.img_mask is None:
        return img_show

    if self.img_mask.shape[:2] != img_show.shape[:2]:
        log.error('mask and image size are not equal')
        return img_show

    img_show[self.img_mask > 0.75] = self.color_mask
    return img_show
shape: (n_points, 1, 2)) + polygon = polygon.reshape((-1, 1, 2)) + + # Draw the polygon on the image + img = cv.polylines(img, [polygon], isClosed=True, color=(0, 255, 0), thickness=2) + + return img + + def show_box(self, img): + "shows rectangular on top of the image" + # Define polygon vertices (e.g., a triangle) + if self.contour is None: + return img + + # # check the tracker distance from the floor - dont show the floor + # plane_z = self.plane_center[2] + # if self.floor_z + 20 > plane_z: + # return img + + # make mask not persistent + #self.img_mask[:,:] = 0 + + + # Draw the original contour (optional, in green) + #cv.drawContours(img, [self.contour], 0, (0, 255, 0), 2) + + #epsilon = 0.1*cv.arcLength(self.contour,True) + #box_approx = cv.approxPolyDP(self.contour,epsilon,True) + + # Draw the convex hull (in red) + cv.drawContours(img, [self.hull], 0, (0, 0, 255), 1) + + # 5. Min/Max Axis (using minAreaRect) + rect = cv.minAreaRect(self.contour) # (center(x,y), (width, height), angle - (0:-90)) + box = cv.boxPoints(rect) # Get 4 corners + box = np.array(box).astype(int) + cv.drawContours(img, [box], 0, (0, 255, 255), 2) # Draw oriented box + #print(rect) + + # 1. Euler angles (pitch, roll, yaw) in radians + euler_angles = [0, 0, np.deg2rad(rect[2])] + + # 2. Convert Euler to Rotation Matrix (e.g., ZYX order) + rotation = R.from_euler('xyz', euler_angles, degrees=False) + rot_matrix = rotation.as_matrix() + + # 3. 
def show_image_colored_by_normals(self, img = None):
    """Colour the ROI of the image by the normals stored in ``self.img_roi_normal``.

    Normals are mapped from [-1, 1] to [0, 255] and blended (80 % normals,
    20 % image) into the ROI ``self.rect``. Pixels whose depth is below 1 are
    painted black. For a 3-channel image the depth is channel 2; a
    single-channel image is itself taken as the depth.

    :param img: input image, 2-D or 3-channel
    :return: the coloured uint8 image; ``img`` unchanged when the image, the
             ROI or the normals are missing
    """
    if img is None:
        log.error('No image found')
        return img
    if self.rect is None: # roi
        log.error('No ROI found')
        return img
    if self.img_roi_normal is None:
        log.error('No normals found')
        return img

    # deal with black and white: replicate the single channel to 3 channels
    img_show = np.uint8(img).copy()
    is_gray = len(img.shape) < 3
    if is_gray:
        img_show = np.tile(img_show[:,:,np.newaxis], (1,1,3))

    # scale normals to fit RGB color space
    normals = (self.img_roi_normal + 1)*127.5
    x0,y0,x1,y1 = self.rect
    img_show[y0:y1,x0:x1, :] = cv.addWeighted(img_show[y0:y1,x0:x1, :], 0.2, np.uint8(normals), 0.8, 0)

    # make non valid pixels black; indexing channel 2 of a 2-D image raised
    # IndexError although 2-D input is handled above
    depth = img if is_gray else img[:,:,2]
    img_show[depth < 1] = 0

    return img_show
cloud structure init" + d = DataGen() + img = d.init_image(1) + p = PlaneDetectorDisplay() + isOk = p.init_image(img) + img3d = p.init_img3d() + self.assertFalse(img3d is None) + + def test_compute_img3d(self): + "XYZ point cloud structure init and compute" + d = DataGen() + img = d.init_image(1) + p = PlaneDetectorDisplay() + img3d = p.init_img3d(img) + imgXYZ = p.compute_img3d(img) + self.assertFalse(imgXYZ is None) + + def test_show_img3d(self): + "XYZ point cloud structure init and compute" + d = DataGen() + img = d.init_image(1) + p = PlaneDetectorDisplay() + img3d = p.init_img3d(img) + imgXYZ = p.compute_img3d(img) + roi = p.init_roi(1) + x0,y0,x1,y1 = roi + roiXYZ = imgXYZ[y0:y1,x0:x1,:] + p.show_points_3d_with_normal(roiXYZ) + self.assertFalse(imgXYZ is None) + + def test_convert_roi_to_points(self): + "computes 3d points of the ROI" + im_size = (640,480) + d = DataGen(img_size=im_size) + img = d.init_image(1) + p = PlaneDetectorDisplay(image_size=im_size) + roi = p.init_roi(1) + rect3d = p.convert_roi_to_points(img, roi_rect=roi) + self.assertTrue(rect3d.shape[0] > 4) + + def test_fit_plane_svd(self): + "computes normal to the ROI" + d = DataGen() + img = d.init_image(5) + p = PlaneDetectorDisplay() + roi = p.init_roi(4) + img_roi = p.preprocess(img) + roim,rois = p.fit_plane_svd(img_roi) + pose = p.convert_plane_params_to_pose() + p.show_image_with_axis(img, pose) + p.show_points_3d_with_normal(p.matrix_xyz, pose) + self.assertTrue(pose[0][2] > 0.01) + + def test_fit_plane_depth_image(self): + "computes normal to the ROI" + d = DataGen() + img = d.init_image(13) + p = PlaneDetectorDisplay() + roi = p.init_roi(4) + img_roi = p.preprocess(img) + roim,rois = p.fit_plane_svd(img_roi) + pose = p.convert_plane_params_to_pose() + p.show_image_with_axis(img, pose) + p.show_points_3d_with_normal(p.rect_xyz, pose) + self.assertTrue(pose[0][2] > 0.01) + + def test_fit_plane_with_outliers(self): + "computes normal to the ROI" + d = DataGen() + img = d.init_image(13) 
+ p = PlaneDetectorDisplay() + roi = p.init_roi(4) + img_roi = p.preprocess(img) + roim,rois = p.fit_plane_with_outliers(img_roi) + pose = p.convert_plane_params_to_pose() + p.show_image_with_axis(img, pose) + p.show_points_3d_with_normal(p.rect_xyz, pose) + self.assertTrue(pose[0][2] > 0.01) + + def test_fit_plane_ransac(self): + "computes with ransac" + d = DataGen() + img = d.init_image(6) + p = PlaneDetectorDisplay() + roi = p.init_roi(4) + img_roi = p.preprocess(img) + roim,rois = p.fit_plane_ransac(img_roi) + pose = p.convert_plane_params_to_pose() + p.show_image_with_axis(img, pose) + p.show_points_3d_with_normal(p.rect_xyz, pose) + self.assertTrue(pose[0][2] > 0.01) + + def test_fit_plane_using_gradients(self): + "computes normal of the ROI using gradients and then cross product" + im_size = (640,480) + d = DataGen(img_size=im_size) + img = d.init_image(1) # corner + p = PlaneDetectorDisplay(image_size=im_size) + roi = p.init_roi(13) # image center + roim,rois = p.fit_plane_using_gradients(img, roi) + pose = p.convert_plane_params_to_pose() + p.show_image_colored_by_normals(img) + p.show_image_with_axis(img, pose) + p.show_points_3d_with_normal(p.rect_xyz, pose) + self.assertTrue(pose[0][2] > 0.01) + + def test_split_roi(self): + "computes ROIS and splits if needed" + p = PlaneDetector() + p.MIN_STD_ERROR = 0.1 + img = p.init_image(13) + roi = p.init_roi(4) + img3d = p.init_img3d(img) + imgXYZ = p.compute_img3d(img) + roi_list= p.fit_and_split_roi_recursively(roi) + p.show_rois_3d_with_normals(roi_list, roi) + p.show_image_with_rois(p.img, roi_list) + + for roi_s in roi_list: + self.assertFalse(roi_s['error'] > 0.01) + + def test_plane_fit(self): + "plane fit data" + d = DataSourceMovie() + srcid = 422 # 421,422, 423-ok + ret = d.init_video(srcid) + p = PlaneDetectorDisplay('P') #estimator_type=self.estim_type, estimator_id=estim_ind) + roi = p.init_roi(21) # center image + while ret: + ret,img = d.get_data() + if not ret: break + retp = p.find_planes(img) 
+ vis = p.show_image_data(img, display_mode=3) + vis = p.show_scene(vis) + ret = p.show_data(vis) & ret + d.finish() + self.assertFalse(ret) + + def test_plane_fit_weighted(self): + "plane fit data" + d = DataSourceMovie() + srcid = 422 # 421,422, 423-ok + ret = d.init_video(srcid) + p = PlaneDetectorDisplay('W') #estimator_type=self.estim_type, estimator_id=estim_ind) + roi = p.init_roi(21) # center image + while ret: + ret,img = d.get_data() + if not ret: break + retp = p.find_planes(img) + vis = p.show_image_data(img, display_mode=1) + vis = p.show_scene(vis) + ret = p.show_data(vis) & ret + d.finish() + self.assertFalse(ret) + + def test_plane_fit_outliers(self): + "plane fit data" + d = DataSourceMovie() + srcid = 421 # 421,422, 423-ok + ret = d.init_video(srcid) + p = PlaneDetectorDisplay('O') #estimator_type=self.estim_type, estimator_id=estim_ind) + roi = p.init_roi(41) # center image + while ret: + ret,img = d.get_data() + if not ret: break + retp = p.find_planes(img) + vis = p.show_image_data(img, display_mode=3) + vis = p.show_scene(vis) + ret = p.show_data(vis) & ret + d.finish() + self.assertFalse(ret) + + def test_plane_fit_ransac(self): + "plane fit data" + d = DataSourceMovie() + srcid = 422 # 421,422, 423-ok + ret = d.init_video(srcid) + p = PlaneDetectorDisplay('R') #estimator_type=self.estim_type, estimator_id=estim_ind) + roi = p.init_roi(21) # center image + while ret: + ret,img = d.get_data() + if not ret: break + retp = p.find_planes(img) + vis = p.show_image_data(img, display_mode=3) + vis = p.show_scene(vis) + ret = p.show_data(vis) & ret + d.finish() + self.assertFalse(ret) + + def test_plane_fit_weighted_4d(self): + "plane fit data" + d = DataSourceMovie() + srcid = 422 # 421,422, 423-ok + ret = d.init_video(srcid) + p = PlaneDetectorDisplay('T') #estimator_type=self.estim_type, estimator_id=estim_ind) + roi = p.init_roi(53) # center image + while ret: + ret,img = d.get_data() + if not ret: break + retp = p.find_planes(img) + vis = 
def test_grid_plane_fit(self):
    """Fit planes on a 10x10 grid of ROIs over a recorded video.

    One 'P' detector is created per grid cell; every frame is processed by
    all detectors and their overlays are accumulated on a single canvas.
    """
    source = DataSourceMovie()
    running = source.init_video(422)  # recordings 421, 422 and 423 are marked ok

    # Grid geometry for a hard-coded 1280x720 frame: cell centres sit on a
    # (cols+1) x (rows+1) lattice and each ROI covers 40% of the pitch on
    # either side of its centre.
    cols, rows = 10, 10
    width, height = 1280, 720
    step_x = int(width / (cols + 1))
    step_y = int(height / (rows + 1))
    half_w = int(step_x * 0.4)
    half_h = int(step_y * 0.4)

    detectors = []
    for cx in range(step_x, step_x * (cols + 1), step_x):
        for cy in range(step_y, step_y * (rows + 1), step_y):
            detector = PlaneDetectorDisplay('P')
            detector.rect = (cx - half_w, cy - half_h, cx + half_w, cy + half_h)
            detectors.append(detector)

    while running:
        running, frame = source.get_data()
        if not running:
            break
        canvas = detectors[0].show_image_data(frame, display_mode=3)
        for detector in detectors:
            detector.find_planes(frame)
            canvas = detector.show_scene(canvas)
        # The original displays through the last detector of the loop.
        running = detectors[-1].show_data(canvas)

    source.finish()
    self.assertFalse(running)
def RunTest():
    """Run one hand-picked ``TestPlaneDetector`` test directly, without a runner.

    Only ``test_multi_plane_fit_with_project_on_image`` is executed. The other
    test methods that were toggled here by hand are:
        test_image_show, test_init_img3d, test_compute_img3d, test_show_img3d,
        test_convert_roi_to_points, test_fit_plane_svd,
        test_fit_plane_depth_image, test_fit_plane_with_outliers,
        test_fit_plane_ransac, test_fit_plane_using_gradients, test_split_roi,
        test_plane_fit, test_plane_fit_weighted, test_plane_fit_outliers,
        test_plane_fit_with_show_3d, test_plane_fit_ransac,
        test_plane_fit_weighted_4d, test_multi_plane_fit,
        test_plane_fit_using_gradients, test_plane_fit_colored_by_normals,
        test_plane_fit_with_grow, test_grid_plane_fit,
        test_plane_fit_with_project_on_image
    """
    # The test case is instantiated directly so a single method can be called
    # as a plain function; unittest.main() / TextTestRunner are not used.
    TestPlaneDetector().test_multi_plane_fit_with_project_on_image()
def run(self):
    """Main loop: grab a frame, compute stereo depth, fit planes, handle keys.

    Keys: space toggles pause; a/r/p/o/g/f/w/s/n/t/b select the detect type
    used for newly added trackers; 0-3 select the displayed image;
    m adds a grid of trackers; c removes the most recent tracker;
    q or ESC quits.
    """
    # key code -> (detect type, label used in the log line)
    detect_keys = {
        ord('a'): ('A', 'Detect type'),
        ord('r'): ('R', 'Detect Ransac'),
        ord('p'): ('P', 'Detect svd'),
        ord('o'): ('O', 'Detect with outliers'),
        ord('g'): ('G', 'Detect gradients'),
        ord('f'): ('F', 'Detect fit and grow'),
        ord('w'): ('W', 'Detect weighted svd'),
        ord('s'): ('S', 'Detect entire image'),
        ord('n'): ('N', 'Detect entire image using normals'),
        ord('t'): ('T', 'Detect weighted svd 4d'),
        ord('b'): ('B', 'Detect boxes'),
    }
    # key code -> image shown by show_scene
    show_keys = {
        ord('0'): 'rs',
        ord('1'): 'left',
        ord('2'): 'right',
        ord('3'): 'fs',
    }
    quit_keys = (27, ord('q'))

    while True:
        # Read a new frame unless paused or the user is dragging a ROI;
        # always read when no frame has been captured yet.
        if self.frame is None or not (self.paused or self.rect_sel.dragging):
            ret, frame = self.cap.read()
            if not ret:
                break
            self.frame = frame.copy()

        # Channels of the captured frame: left, right, RealSense depth.
        img_left, img_right, _depth_rs = (self.frame[:, :, c] for c in range(3))

        disparity = foundation_stereo_algo(self.args, self.model, img_left, img_right)
        depth_fs = convert_disparity_to_depth(self.camera_bf, disparity)

        self.process_image(depth_fs)

        cv.imshow(self.win_name, self.show_scene(frame, depth_fs))
        key = cv.waitKey(1)

        if key in quit_keys:
            break
        if key == ord(' '):
            self.paused = not self.paused
        elif key in detect_keys:
            self.detect_type, label = detect_keys[key]
            log.info(f'{label} : {self.detect_type}')
        elif key in show_keys:
            self.show_type = show_keys[key]
            log.info(f'Show type : {self.show_type}')
        elif key == ord('m'):
            self.generate_grid_trackers()
        elif key == ord('c'):
            if self.trackers:
                self.trackers.pop()
os.path.dirname(os.path.realpath(__file__)) parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', default=f'{code_dir}/../weights/23-36-37/model_best_bp2_serialize.pth', type=str) + # parser.add_argument('--left_file', default=f'{code_dir}/../data/mono/img0074.jpg', type=str) + # parser.add_argument('--right_file', default=f'{code_dir}/../data/mono/img0076.jpg', type=str) parser.add_argument('--left_file', default=f'{code_dir}/../demo_data/left.png', type=str) parser.add_argument('--right_file', default=f'{code_dir}/../demo_data/right.png', type=str) parser.add_argument('--intrinsic_file', default=f'{code_dir}/../demo_data/K.txt', type=str, help='camera intrinsic matrix and baseline file') - parser.add_argument('--out_dir', default='/home/bowen/debug/stereo_output', type=str) + parser.add_argument('--out_dir', default=f'{code_dir}/../demo_data_out', type=str) parser.add_argument('--remove_invisible', default=1, type=int) parser.add_argument('--denoise_cloud', default=0, type=int) parser.add_argument('--denoise_nb_points', type=int, default=30, help='number of points to consider for radius outlier removal') @@ -44,7 +47,7 @@ set_seed(0) torch.autograd.set_grad_enabled(False) - os.system(f'rm -rf {args.out_dir} && mkdir -p {args.out_dir}') + #os.system(f'rm -rf {args.out_dir} && mkdir -p {args.out_dir}') with open(f'{os.path.dirname(args.model_dir)}/cfg.yaml', 'r') as ff: cfg:dict = yaml.safe_load(ff) @@ -76,8 +79,8 @@ img0_ori = img0.copy() img1_ori = img1.copy() logging.info(f"img0: {img0.shape}") - imageio.imwrite(f'{args.out_dir}/left.png', img0) - imageio.imwrite(f'{args.out_dir}/right.png', img1) + # imageio.imwrite(f'{args.out_dir}/left.png', img0) + # imageio.imwrite(f'{args.out_dir}/right.png', img1) img0 = torch.as_tensor(img0).cuda().float()[None].permute(0,3,1,2) img1 = torch.as_tensor(img1).cuda().float()[None].permute(0,3,1,2) @@ -99,7 +102,7 @@ max_val = None vis = vis_disparity(disp, min_val=min_val, max_val=max_val, cmap=cmap, 
def process_arguments(argv=None):
    """Build the command-line parser for the stereo demo and parse arguments.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) keeps
            the previous behaviour of parsing ``sys.argv[1:]``; passing a list
            (e.g. ``[]``) lets callers embed the function without touching
            the process command line.

    Returns:
        argparse.Namespace with the parsed options.
    """
    code_dir = os.path.dirname(os.path.realpath(__file__))

    # A single parser is enough; the original created one and immediately
    # replaced it with a second.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', default=f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth', type=str)
    parser.add_argument('--left_file', default=f'{code_dir}/../demo_data/left.png', type=str)
    parser.add_argument('--right_file', default=f'{code_dir}/../demo_data/right.png', type=str)
    parser.add_argument('--intrinsic_file', default=f'{code_dir}/../demo_data/K.txt', type=str, help='camera intrinsic matrix and baseline file')
    # NOTE(review): this default is a developer-specific absolute path, and
    # foundation_stereo_algo_init runs `rm -rf` on it; consider a
    # repository-relative default.
    parser.add_argument('--out_dir', default='/home/bowen/debug/stereo_output', type=str)
    parser.add_argument('--remove_invisible', default=1, type=int)
    parser.add_argument('--denoise_cloud', default=0, type=int)
    parser.add_argument('--denoise_nb_points', type=int, default=30, help='number of points to consider for radius outlier removal')
    parser.add_argument('--denoise_radius', type=float, default=0.03, help='radius to use for outlier removal')
    parser.add_argument('--scale', default=1, type=float)
    parser.add_argument('--hiera', default=0, type=int)
    parser.add_argument('--get_pc', type=int, default=1, help='save point cloud output')
    parser.add_argument('--valid_iters', type=int, default=8, help='number of flow-field updates during forward pass')
    parser.add_argument('--max_disp', type=int, default=192, help='maximum disparity')
    parser.add_argument('--zfar', type=float, default=100, help="max depth to include in point cloud")

    return parser.parse_args(argv)
args.__dict__[k] is not None: + cfg[k] = args.__dict__[k] + args = OmegaConf.create(cfg) + logging.info(f"args:\n{args}") + model = torch.load(args.model_dir, map_location='cpu', weights_only=False) + model.args.valid_iters = args.valid_iters + model.args.max_disp = args.max_disp + + model.cuda().eval() + return model + +def foundation_stereo_algo(args, model, np_left, np_right): + "stereo algo" + scale = args.scale + + img0 = np_left #imageio.imread(args.left_file) + img1 = np_right #imageio.imread(args.right_file) + if len(img0.shape)==2: + img0 = np.tile(img0[...,None], (1,1,3)) + img1 = np.tile(img1[...,None], (1,1,3)) + + img0 = img0[...,:3] + img1 = img1[...,:3] + Ho,Wo = img0.shape[:2] + + img0 = cv2.resize(img0, fx=scale, fy=scale, dsize=None) + img1 = cv2.resize(img1, dsize=(img0.shape[1], img0.shape[0])) + + H,W = img0.shape[:2] + #img0_ori = img0.copy() + #img1_ori = img1.copy() + #logging.info(f"img0: {img0.shape}") + #imageio.imwrite(f'{args.out_dir}/left.png', img0) + #imageio.imwrite(f'{args.out_dir}/right.png', img1) + + img0 = torch.as_tensor(img0).cuda().float()[None].permute(0,3,1,2) + img1 = torch.as_tensor(img1).cuda().float()[None].permute(0,3,1,2) + padder = InputPadder(img0.shape, divis_by=32, force_square=False) + img0, img1 = padder.pad(img0, img1) + + #logging.info(f"Start forward, 1st time run can be slow due to compilation") + with torch.amp.autocast('cuda', enabled=True, dtype=AMP_DTYPE): + if not args.hiera: + disp = model.forward(img0, img1, iters=args.valid_iters, test_mode=True, optimize_build_volume='pytorch1') + else: + disp = model.run_hierachical(img0, img1, iters=args.valid_iters, test_mode=True, small_ratio=0.5) + + #logging.info("forward done") + + scale_factor = 1/scale + disp = padder.unpad(disp.float()) + #disp = disp.data.cpu().numpy().reshape(H,W).clip(0, None) + disp = disp.data.cpu().numpy().reshape(H,W)*scale_factor + disp = disp.clip(0, None) + + # recover original size + #orig_shape = np_left.shape[::-1] + 
def convert_disparity_to_depth(BF, disparity):
    """Convert a disparity map to an unsigned 16-bit depth map.

    Depth is computed as ``BF / disparity`` for every pixel whose disparity
    exceeds 0.1; all other pixels get depth 0.

    Args:
        BF: Scalar numerator of the conversion (presumably baseline times
            focal length; its unit fixes the unit of the result; confirm
            against the camera's ``get_bf``).
        disparity: Array-like disparity map.

    Returns:
        ``np.uint16`` array of the same shape. Fractions are truncated, and
        values beyond the uint16 range saturate at 65535 instead of wrapping
        around to small, wrong depths.
    """
    disparity = np.asarray(disparity, dtype=np.float32)
    depth = np.zeros_like(disparity)

    # Tiny or non-positive disparities would give huge or meaningless depth.
    valid = disparity > 0.1
    depth[valid] = BF / disparity[valid]

    # Saturate before the integer cast: casting an out-of-range float to
    # uint16 wraps (or is undefined), turning far points into near ones.
    np.clip(depth, 0, np.iinfo(np.uint16).max, out=depth)
    return depth.astype(np.uint16)
def show_images_depth(imgD = None, imgC = None, fig_num = 1, fig_name = 'Depth Image', vmax = 1500):
    """Display one or two images in an OpenCV window.

    When both images are given and have the same shape they are shown side by
    side. uint16 / float32 images are scaled by 0.1 and colour-mapped with
    TURBO. Images wider than 2400 pixels are halved with ``pyrDown``.

    Args:
        imgD: First image (typically depth) or None.
        imgC: Second image or None.
        fig_num: Unused; kept for interface compatibility.
        fig_name: Window title prefix.
        vmax: Unused; kept for interface compatibility.

    Returns:
        True when the user pressed 'q' in the window, otherwise False.
        False is also returned, without opening a window, when both images
        are None.
    """
    if imgD is None and imgC is None:
        print('No images found')
        return False

    if imgD is None:        # no data acquired
        img_show = imgC
    elif imgC is None:      # no data is processed
        img_show = imgD
    elif imgD.shape == imgC.shape:
        img_show = np.concatenate((imgD, imgC), axis=1)
    else:
        # Previously this case left img_show unassigned and crashed with
        # UnboundLocalError; fall back to the first image instead.
        print(f'Image shapes differ {imgD.shape} vs {imgC.shape} - showing the first image only')
        img_show = imgD

    # 16-bit / float data: compress the range and colour-map for display.
    if img_show.dtype == 'uint16' or img_show.dtype == 'float32':
        img_show = cv2.convertScaleAbs(img_show, alpha=0.1)
        img_show = cv2.applyColorMap(img_show, cv2.COLORMAP_TURBO)

    # Very wide (side-by-side) images are halved to fit the screen.
    if img_show.shape[1] > 2400:
        img_show = cv2.pyrDown(img_show)

    cv2.imshow(f'{fig_name} (q-exit)', img_show)
    ch = cv2.waitKey(5)
    return ch == ord('q')
def test_video_stream_rs_and_foundation():
    """Stream RealSense frames through the stereo model and show the depths.

    For every frame the function displays the RealSense depth, the depth
    computed from the stereo disparity, a merge of the two, and the left and
    right images. The loop stops when 'q' is pressed in any window or when
    the camera stops delivering frames.

    NOTE(review): the ``RealSense`` import is commented out in the visible
    part of this file; confirm the name is in scope before running.
    """
    d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720))
    try:
        d.switch_projector(True)
        camera_bf = d.get_bf()  # for depth conversion (should be in meter)
        args = process_arguments()
        model = foundation_stereo_algo_init(args)

        ret = False
        while not ret:
            # frame carries left image, right image and depth as channels
            retf, frame = d.read()
            if retf is False:
                print('is your camera open/connected?')
                break

            img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0)

            t_start = time.time()
            img_disparity = foundation_stereo_algo(args, model, img_left, img_right)
            print(f'Disp : {img_disparity.min()} - {img_disparity.max()}')
            img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity)
            print(f'Depth : {img_depth_fs.min()} - {img_depth_fs.max()}')
            depth_merged = depth_opencv_rs_merge(img_depth_rs, img_depth_fs)
            print(f'Proces time : {time.time()-t_start} sec')

            # All four windows are refreshed every iteration; any 'q' stops.
            ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS')
            ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS')
            ret3 = show_images_depth(depth_merged, None, fig_name='Depth Merged')
            ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R')
            ret = ret1 or ret2 or ret3 or ret4
    finally:
        # Release the camera even when model loading or inference raises.
        d.release()
def test_point_cloud_rs_versus_foundation():
    """Stream RealSense frames, compute stereo depth and show it as a point cloud.

    Each frame is converted to disparity and depth, back-projected with
    ``depth2xyz`` using a hard-coded intrinsic matrix, and rendered in a
    non-blocking Open3D window. The depth images and their absolute
    difference are also shown in OpenCV windows. The loop ends when the
    Open3D window is closed, 'q' is pressed, or the camera stops delivering.

    NOTE(review): the ``RealSense`` import is commented out in the visible
    part of this file; confirm the name is in scope before running.
    """
    d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720))
    vis = None
    try:
        camera_bf = d.get_bf()  # for depth conversion (should be in meter)
        args = process_arguments()
        # Hard-coded intrinsics handed to depth2xyz; presumably for the
        # 1280x720 stream; confirm against the camera calibration.
        K = np.array([[637,0,640],[0,637,360],[0,0,1]])

        model = foundation_stereo_algo_init(args)

        vis = o3d.visualization.Visualizer()
        vis.create_window(height=720, width=1280)

        # The cloud must be registered before the non-blocking loop; it is
        # seeded with a few random points.
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(np.random.rand(10, 3)*1000)
        vis.add_geometry(pcd)
        us, vs = None, None

        ret = False
        while not ret:
            retf, frame = d.read()
            if retf is False:
                print('is your camera open/connected?')
                break

            img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0)

            t_start = time.time()
            img_disparity = foundation_stereo_algo(args, model, img_left, img_right)
            print(f'Disp : {img_disparity.min()} - {img_disparity.max()}')
            img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity)
            print(f'Depth : {img_depth_fs.min()} - {img_depth_fs.max()}')
            depth_error = depth_fs_rs_error(img_depth_rs, img_depth_fs)
            print(f'Proces time : {time.time()-t_start} sec')

            pts, us, vs = depth2xyz(img_depth_fs, K, us, vs, zmin=100)

            # Update the registered cloud in place. The original passed a
            # freshly created down-sampled cloud to update_geometry; that
            # object was never added to the visualizer, so the view was not
            # refreshed with the new points.
            pcd.points = o3d.utility.Vector3dVector(pts)
            downpcd = pcd.voxel_down_sample(voxel_size=0.1)
            pcd.points = downpcd.points
            vis.update_geometry(pcd)

            keep_running = vis.poll_events()
            if not keep_running:
                break
            vis.update_renderer()

            ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS')
            ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS')
            ret3 = show_images_depth(depth_error, None, fig_name='Depth Error')
            ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R')
            ret = ret1 or ret2 or ret3 or ret4
    finally:
        # Always give the camera and the window back, even on errors.
        d.release()
        if vis is not None:
            vis.destroy_window()
min_val=min_val, max_val=max_val, cmap=cmap, color_map=cv2.COLORMAP_TURBO) + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS') + ret3 = show_images_depth(depth_error, None, fig_name='Depth Error', vmax = 100) + ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 or ret4 + + # d.save_image(img_depth_rs,fname='depth_rs') + # d.save_image(img_depth_fs,fname='depth_fs') + # d.save_image(img_left,fname='img_left') + # d.save_image(img_right,fname='img_right') + # plt.show() + + d.release() + +def merge_fs_rs(depth_rs, depth_fs): + "trying to merge information and deal with non valid regions" + nr, nc = depth_rs.shape + depth_rs_out = depth_rs.copy() + valid_rs = depth_rs > 1 + start_rs = np.zeros_like(valid_rs) + start_rs[:,:-1] = np.logical_and(valid_rs[:,:-1] , ~valid_rs[:,1:]) # if pixel k is valid and k + 1 is not + stop_rs = np.zeros_like(valid_rs) + stop_rs[:,1:] = np.logical_and(~valid_rs[:,:-1] , valid_rs[:,1:]) # if pixel k-1 is not valid and k is valid + + for r in range(nr): + start_ind = np.where(start_rs[r,:])[0] + stop_ind = np.where(stop_rs[r,:])[0] + if len(start_ind) < 1 or len(stop_ind) < 1: + continue + + for s in start_ind: + ii = np.where(s < stop_ind)[0] + if len(ii) < 1: continue + f = stop_ind[0] + if np.abs(depth_fs[r,s] - depth_fs[r,f]) < 0.1*depth_fs[r,s]: + depth_rs_out[r,s:f] = depth_fs[r,s:f] + print('.') + + return depth_rs_out + +def test_video_stream_rs_fs_merge(): + "streaming rs to foundation - checks speed" + d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720)) + #d.load_preset_from_file('vpi/preset_rsm.json') + #d.set_exposure(100000) # 10ms + #d.switch_projector(False) + #self.cap.set_laser_power(100) # max power + camera_bf = d.get_bf() # for depth conversion (should be in meter) + args = process_arguments() + args.scale = 0.5 + + # init + model 
= foundation_stereo_algo_init(args) + + ret = False + while not ret: + # frame is I1,I2, D data + retf, frame = d.read() + if retf is False: + print('is your camera open/connected?') + break + + # extract + img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0) + + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + img_depth_merge = merge_fs_rs(img_depth_rs, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # massage + #cmap, min_val, max_val = None, None, None + #img_depth_fs = vis_disparity(img_disparity, min_val=min_val, max_val=max_val, cmap=cmap, color_map=cv2.COLORMAP_TURBO) + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS') + ret3 = show_images_depth(img_depth_merge, None, fig_name='Depth Merge', vmax = 100) + ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 or ret4 + + d.save_image(img_depth_rs,fname='depth_rs') + d.save_image(img_depth_fs,fname='depth_fs') + d.save_image(img_left,fname='img_left') + d.save_image(img_right,fname='img_right') + # plt.show() + + d.release() + +def test_faro_rs_fs_error(): + "reading data files from FARO and comparing the results" + + #d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720)) + d = DataSource() + img_num = d.init_directory() + camera_bf = d.get_bf() # for depth conversion (should be in meter) + + args = process_arguments() + args.scale = 0.5 + #args.model_dir = '/home/administrato/dev/Fast-FoundationStereo/weights/20-30-48/model_finetuned_faro.pth' + #args.model_dir = '../weights/20-30-48/model_finetuned_faro.pth' + + # init + model = foundation_stereo_algo_init(args) + + + img_index = np.random.randint(0,img_num,8) + for k in img_index: + + # frame is I1,I2, D data + 
out_data = d.get_item(k, debug = True) + #out_data = d.load_specific_files(debug = True) + #print(out_data) + + # extract + img_left, img_right, img_depth_rs, img_depth_faro = out_data["img_left"], out_data["img_right"], out_data["img_depth_rs"], out_data["img_depth_faro"] + print(img_left.shape, img_right.shape) + + # process + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + img_error_fs = depth_fs_rs_error(img_depth_rs, img_depth_fs) + img_error_faro = depth_fs_rs_error(img_depth_faro, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS', vmax = 400) + ret3 = show_images_depth(img_depth_faro, None, fig_name='Depth Faro') + ret4 = show_images_depth(img_error_fs.astype(np.uint8), None, fig_name='Error RS-FS', vmax = 100) + ret5 = show_images_depth(img_error_faro.astype(np.uint8), None, fig_name='Error Faro-FS', vmax = 100) + ret6 = show_images_depth(img_left.astype(np.uint8), None, fig_name='Images L') + ret7 = show_images_depth(img_right.astype(np.uint8), None, fig_name='Images R') + ret = ret1 or ret2 or ret3 or ret4 or ret5 or ret6 or ret7 + if ret: break + + # d.save_image(img_depth_rs,fname='depth_rs') + #d.save_image(img_depth_fs,fname='depth_fs') + # d.save_image(img_left,fname='img_left') + # d.save_image(img_right,fname='img_right') + # plt.show() + + #d.ckose() + + +if __name__ == '__main__': + #main() + #test_video_stream_rs_and_foundation() # ok + #test_video_error_rs_versus_foundation() # ok + # test_point_cloud_rt() # ok + #test_point_cloud_rs_versus_foundation() + #test_video_stream_rs_versus_foundation_x2() # ok + #test_video_stream_rs_fs_merge() + test_faro_rs_fs_error() + + + + + + + + + \ No newline at end of file diff --git a/scripts/run_fast_foundation_with_rs.py 
b/scripts/run_fast_foundation_with_rs.py new file mode 100644 index 0000000..10bce17 --- /dev/null +++ b/scripts/run_fast_foundation_with_rs.py @@ -0,0 +1,763 @@ + +''' +python scripts/run_fast_foundation_with_rs.py - github + +Environment: + ffs - GPU laptop + +Installation: + python -m venv ./envs/ffs + source ./envs/ffs/bin/activate + pip install torch==2.6.0 torchvision==0.21.0 xformers --index-url https://download.pytorch.org/whl/cu124 + cd Fast-FoundationStereo + pip install -r requirements.txt + pip install pyrealsense2 + download weights : https://drive.google.com/drive/folders/1HuTt7UIp7gQsMiDvJwVuWmKpvFzIIMap + + python scripts/run_demo.py --model_dir weights/20-30-48/model_best_bp2_serialize.pth --left_file demo_data/left.png --right_file demo_data/right.png --intrinsic_file assets/K.txt --out_dir output/ --remove_invisible 0 --denoise_cloud 1 --scale 1 --get_pc 1 --valid_iters 8 --max_disp 192 --zfar 100 + + +''' + + + + +import numpy as np +from argparse import ArgumentParser + +import os,sys +code_dir = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(f'{code_dir}/../') +from omegaconf import OmegaConf +from core.utils.utils import InputPadder +import argparse, torch, logging, yaml +import numpy as np +from Utils import ( + AMP_DTYPE, set_logging_format, set_seed, vis_disparity, + depth2xyzmap, toOpen3dCloud, o3d, +) +import cv2 +#from faro_data_manager import DataSource +from scripts.data_manager_faro_laptop import DataSource + + +def process_arguments(): + parser = ArgumentParser() + code_dir = os.path.dirname(os.path.realpath(__file__)) + + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', default=f'{code_dir}/../weights/20-30-48/model_best_bp2_serialize.pth', type=str) + parser.add_argument('--left_file', default=f'{code_dir}/../demo_data/left.png', type=str) + parser.add_argument('--right_file', default=f'{code_dir}/../demo_data/right.png', type=str) + parser.add_argument('--intrinsic_file', 
default=f'{code_dir}/../demo_data/K.txt', type=str, help='camera intrinsic matrix and baseline file') + parser.add_argument('--out_dir', default='/home/bowen/debug/stereo_output', type=str) + parser.add_argument('--remove_invisible', default=1, type=int) + parser.add_argument('--denoise_cloud', default=0, type=int) + parser.add_argument('--denoise_nb_points', type=int, default=30, help='number of points to consider for radius outlier removal') + parser.add_argument('--denoise_radius', type=float, default=0.03, help='radius to use for outlier removal') + parser.add_argument('--scale', default=1, type=float) + parser.add_argument('--hiera', default=0, type=int) + parser.add_argument('--get_pc', type=int, default=1, help='save point cloud output') + parser.add_argument('--valid_iters', type=int, default=8, help='number of flow-field updates during forward pass') + parser.add_argument('--max_disp', type=int, default=192, help='maximum disparity') + parser.add_argument('--zfar', type=float, default=100, help="max depth to include in point cloud") + + return parser.parse_args() + +# Original ------------------------------------------- + +# Original Split ------------------------------------------- +from opencv_realsense_camera import RealSense +import time + +def foundation_stereo_algo_init(args): + "initialize the algorithm" + + set_logging_format() + set_seed(0) + torch.autograd.set_grad_enabled(False) + + os.system(f'rm -rf {args.out_dir} && mkdir -p {args.out_dir}') + + with open(f'{os.path.dirname(args.model_dir)}/cfg.yaml', 'r') as ff: + cfg:dict = yaml.safe_load(ff) + for k in args.__dict__: + if args.__dict__[k] is not None: + cfg[k] = args.__dict__[k] + args = OmegaConf.create(cfg) + logging.info(f"args:\n{args}") + model = torch.load(args.model_dir, map_location='cpu', weights_only=False) + model.args.valid_iters = args.valid_iters + model.args.max_disp = args.max_disp + + model.cuda().eval() + return model + +def foundation_stereo_algo(args, model, np_left, 
np_right): + "stereo algo" + scale = args.scale + + img0 = np_left #imageio.imread(args.left_file) + img1 = np_right #imageio.imread(args.right_file) + if len(img0.shape)==2: + img0 = np.tile(img0[...,None], (1,1,3)) + img1 = np.tile(img1[...,None], (1,1,3)) + + img0 = img0[...,:3] + img1 = img1[...,:3] + Ho,Wo = img0.shape[:2] + + img0 = cv2.resize(img0, fx=scale, fy=scale, dsize=None) + img1 = cv2.resize(img1, dsize=(img0.shape[1], img0.shape[0])) + + H,W = img0.shape[:2] + #img0_ori = img0.copy() + #img1_ori = img1.copy() + #logging.info(f"img0: {img0.shape}") + #imageio.imwrite(f'{args.out_dir}/left.png', img0) + #imageio.imwrite(f'{args.out_dir}/right.png', img1) + + img0 = torch.as_tensor(img0).cuda().float()[None].permute(0,3,1,2) + img1 = torch.as_tensor(img1).cuda().float()[None].permute(0,3,1,2) + padder = InputPadder(img0.shape, divis_by=32, force_square=False) + img0, img1 = padder.pad(img0, img1) + + #logging.info(f"Start forward, 1st time run can be slow due to compilation") + with torch.amp.autocast('cuda', enabled=True, dtype=AMP_DTYPE): + if not args.hiera: + disp = model.forward(img0, img1, iters=args.valid_iters, test_mode=True, optimize_build_volume='pytorch1') + else: + disp = model.run_hierachical(img0, img1, iters=args.valid_iters, test_mode=True, small_ratio=0.5) + + #logging.info("forward done") + + scale_factor = 1/scale + disp = padder.unpad(disp.float()) + #disp = disp.data.cpu().numpy().reshape(H,W).clip(0, None) + disp = disp.data.cpu().numpy().reshape(H,W)*scale_factor + disp = disp.clip(0, None) + + # recover original size + #orig_shape = np_left.shape[::-1] + orig_shape = (np_left.shape[1],np_left.shape[0]) + #print(orig_shape, disp.shape) + #scale_factor = orig_shape[0]/process_shape[0] + disp_out = cv2.resize(disp, orig_shape, interpolation=cv2.INTER_NEAREST) # A: changed + + # cmap = None + # min_val = None + # max_val = None + # vis = vis_disparity(disp, min_val=min_val, max_val=max_val, cmap=cmap, color_map=cv2.COLORMAP_TURBO) 
+ # vis = np.concatenate([img0_ori, img1_ori, vis], axis=1) + # imageio.imwrite(f'{args.out_dir}/disp_vis.png', vis) + # s = 1280/vis.shape[1] + # resized_vis = cv2.resize(vis, (int(vis.shape[1]*s), int(vis.shape[0]*s))) + # cv2.imshow('disp', resized_vis[:,:,::-1]) + # cv2.waitKey(0) + + # if args.remove_invisible: + # yy,xx = np.meshgrid(np.arange(disp.shape[0]), np.arange(disp.shape[1]), indexing='ij') + # us_right = xx-disp + # invalid = us_right<0 + # disp[invalid] = np.inf + + # debug and show + #show_point_cloud(args, disp, img0_ori) + #disp = disp * 1000 # must be in mm + return disp_out + +def convert_disparity_to_depth(BF, disparity): + "from GIL" + + disparity = disparity.astype(np.float32) + depth = np.zeros_like(disparity) + disparity_valid = disparity > 0.1 + depth[disparity_valid] = BF / disparity[disparity_valid] + #depth[disparity_valid] += 0.5 # LUT in the simulator + return depth.astype(np.uint16) + +def depth_opencv_rs_merge(depth_rs, depth_cv): + "computing disparity by merging depth from real sense and opencv" + depth_merged = np.copy(depth_rs) + # lesss than 400 mm use opencv depth (520 min Z in HD D455) + mask = (depth_cv < 550) & (depth_cv > 80) + depth_merged[mask] = depth_cv[mask] + + # A: changed + # # if the real sense depth is invalid use opencv depth + # mask = (depth_rs < 1) & (depth_cv > 0) + # depth_merged[mask] = depth_cv[mask] + + return depth_merged + +def depth_fs_rs_error(depth_rs, depth_fs): + "computing depth error between real sense and fs" + depth_rs, depth_fs = depth_rs.astype(np.float32), depth_fs.astype(np.float32) + depth_error = np.abs(depth_rs - depth_fs) + + # lesss than 400 mm use opencv depth (520 min Z in HD D455) + mask = depth_rs < 2 + depth_error[mask] = 0 + + return depth_error + +def preprocess(frame, scale_factor = 0.5): + "convert and downscale" + frame = frame.astype(np.float32) + + # assign + imgL = frame[:,:,0] + imgR = frame[:,:,1] + imgD = frame[:,:,2] + + # if scale_factor < 0.9: + + # imgL = 
cv2.resize(imgL, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR) + # imgR = cv2.resize(imgR, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR) + # imgD = cv2.resize(imgD, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_NEAREST) + + return imgL, imgR, imgD + +def show_images_depth(imgD = None, imgC = None, fig_num = 1, fig_name = 'Depth Image', vmax = 1500): + "draw results of depth estimation" + + + if (imgD is None) and (imgC is None): + print('No images found') + return False + + elif imgD is None: # no data acquired + img_show = imgC + + elif imgC is None: # no data is processed + img_show = imgD + #img_show = cv.applyColorMap(self.imgD, cv.COLORMAP_TURBO) + + elif np.all(imgD.shape == imgC.shape): + img_show = np.concatenate((imgD, imgC ), axis = 1) + + # deal with 16 uint + if img_show.dtype == 'uint16' or img_show.dtype == 'float32': + img_show = cv2.convertScaleAbs(img_show, alpha=0.1) + img_show = cv2.applyColorMap(img_show, cv2.COLORMAP_TURBO) # + pass + else: + #self.imgD = np.repeat(self.imgD[:,:,np.newaxis], 3, axis = 2) + #img_show = np.concatenate((self.imgD, self.imgC ), axis = 1) + #img_show = cv.applyColorMap(img_show.astype(np.uint8), cv.COLORMAP_TURBO) + #img_show = self.imgC #np.concatenate((self.imgD, self.imgC ), axis = 1) + pass + + # deal with black and white + if img_show.shape[1] > 2400: + img_show = cv2.pyrDown(img_show) + + cv2.imshow(f'{fig_name} (q-exit)', img_show) + ch = cv2.waitKey(5) + ret = ch == ord('q') + + # plt.figure() + # plt.imshow(imgD, vmin = 100, vmax=vmax) + # plt.title(fig_name) + # plt.show(block=False) + + return ret + +def show_point_cloud(args, disp, img0_ori): + "from fs" + if not args.get_pc: + return + + scale = args.scale + with open(args.intrinsic_file, 'r') as f: + lines = f.readlines() + K = np.array(list(map(float, lines[0].rstrip().split()))).astype(np.float32).reshape(3,3) + baseline = float(lines[1]) + K[:2] *= scale + depth = 
K[0,0]*baseline/disp + np.save(f'{args.out_dir}/depth_meter.npy', depth) + xyz_map = depth2xyzmap(depth, K) + pcd = toOpen3dCloud(xyz_map.reshape(-1,3), img0_ori.reshape(-1,3)) + keep_mask = (np.asarray(pcd.points)[:,2]>0) & (np.asarray(pcd.points)[:,2]<=args.zfar) + keep_ids = np.arange(len(np.asarray(pcd.points)))[keep_mask] + pcd = pcd.select_by_index(keep_ids) + #o3d.io.write_point_cloud(f'{args.out_dir}/cloud.ply', pcd) + #logging.info(f"PCL saved to {args.out_dir}") + + if args.denoise_cloud: + logging.info("[Optional step] denoise point cloud...") + cl, ind = pcd.remove_radius_outlier(nb_points=args.denoise_nb_points, radius=args.denoise_radius) + inlier_cloud = pcd.select_by_index(ind) + #o3d.io.write_point_cloud(f'{args.out_dir}/cloud_denoise.ply', inlier_cloud) + pcd = inlier_cloud + + logging.info("Visualizing point cloud. Press ESC to exit.") + vis = o3d.visualization.Visualizer() + vis.create_window() + vis.add_geometry(pcd) + vis.get_render_option().point_size = 1.0 + vis.get_render_option().background_color = np.array([0.5, 0.5, 0.5]) + ctr = vis.get_view_control() + ctr.set_front([0, 0, -1]) + id = np.asarray(pcd.points)[:,2].argmin() + ctr.set_lookat(np.asarray(pcd.points)[id]) + ctr.set_up([0, -1, 0]) + vis.run() + vis.destroy_window() + +def depth2xyz(depth:np.ndarray, K, us=None, vs=None, zmin=0.1): + #invalid_mask = (depth dt: + # Options (uncomment each to try them out): + # 1) extend with ndarrays. + pcd.points.extend(np.random.rand(n_new, 3)) + + # 2) extend with Vector3dVector instances. 
+ # pcd.points.extend( + # o3d.utility.Vector3dVector(np.random.rand(n_new, 3))) + + # 3) other iterables, e.g + # pcd.points.extend(np.random.rand(n_new, 3).tolist()) + + vis.update_geometry(pcd) + previous_t = time.time() + + keep_running = vis.poll_events() + vis.update_renderer() + + vis.destroy_window() + +def test_video_stream_rs_and_foundation(): + "streaming rs to foundation" + d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720)) + #d.load_preset_from_file('vpi/preset_rsm.json') + #self.cap.set_exposure(1000) # 10ms + d.switch_projector(True) + #self.cap.set_laser_power(100) # max power + camera_bf = d.get_bf() # for depth conversion (should be in meter) + args = process_arguments() + + # init + model = foundation_stereo_algo_init(args) + + ret = False + while not ret: + # frame is I1,I2, D data + retf, frame = d.read() + if retf is False: + print('is your camera open/connected?') + break + + # extract + img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0) + + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + print(f'Disp : {img_disparity.min()} - {img_disparity.max()}') + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + print(f'Depth : {img_depth_fs.min()} - {img_depth_fs.max()}') + depth_merged = depth_opencv_rs_merge(img_depth_rs, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # massage + #cmap, min_val, max_val = None, None, None + #img_depth_fs = vis_disparity(img_disparity, min_val=min_val, max_val=max_val, cmap=cmap, color_map=cv2.COLORMAP_TURBO) + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS') + ret3 = show_images_depth(depth_merged, None, fig_name='Depth Merged') + ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 or ret4 + + d.release() + +def 
test_video_error_rs_versus_foundation(): + "streaming rs to foundation - checks error between them" + d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720)) + #d.load_preset_from_file('vpi/preset_rsm.json') + #d.set_exposure(100000) # 10ms + #d.switch_projector(False) + #self.cap.set_laser_power(100) # max power + camera_bf = d.get_bf() # for depth conversion (should be in meter) + args = process_arguments() + + # init + model = foundation_stereo_algo_init(args) + + ret = False + while not ret: + # frame is I1,I2, D data + retf, frame = d.read() + if retf is False: + print('is your camera open/connected?') + break + + # extract + img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0) + + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + #print(f'Disp : {img_disparity.min()} - {img_disparity.max()}') + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + #print(f'Depth : {img_depth_fs.min()} - {img_depth_fs.max()}') + depth_error = depth_fs_rs_error(img_depth_rs, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # massage + #cmap, min_val, max_val = None, None, None + #img_depth_fs = vis_disparity(img_disparity, min_val=min_val, max_val=max_val, cmap=cmap, color_map=cv2.COLORMAP_TURBO) + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS') + ret3 = show_images_depth(depth_error, None, fig_name='Depth Error', vmax = 100) + ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 or ret4 + + # d.save_image(img_depth_rs,fname='depth_rs') + # d.save_image(img_depth_fs,fname='depth_fs') + # d.save_image(img_left,fname='img_left') + # d.save_image(img_right,fname='img_right') + + # plt.show() + + d.release() + +def test_point_cloud_rs_versus_foundation(): + "streaming rs to foundation - checks 
error between them" + d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720)) + #d.load_preset_from_file('vpi/preset_rsm.json') + #self.cap.set_exposure(1000) # 10ms + #d.switch_projector(True) + #self.cap.set_laser_power(100) # max power + camera_bf = d.get_bf() # for depth conversion (should be in meter) + #camera_k = d.get_camera_intrinsics() + args = process_arguments() + K = np.array([[637,0,640],[0,637,360],[0,0,1]]) + + # init + model = foundation_stereo_algo_init(args) + + # create visualizer and window. + vis = o3d.visualization.Visualizer() + vis.create_window(height=720, width=1280) + + # initialize pointcloud instance. + pcd = o3d.geometry.PointCloud() + # *optionally* add initial points + points = np.random.rand(10, 3)*1000 + pcd.points = o3d.utility.Vector3dVector(points) + us, vs = None, None + + # include it in the visualizer before non-blocking visualization. + vis.add_geometry(pcd) + + ret = False + while not ret: + # frame is I1,I2, D data + retf, frame = d.read() + if retf is False: + print('is your camera open/connected?') + break + + # extract + img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0) + + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + print(f'Disp : {img_disparity.min()} - {img_disparity.max()}') + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + print(f'Depth : {img_depth_fs.min()} - {img_depth_fs.max()}') + depth_error = depth_fs_rs_error(img_depth_rs, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # massage + #cmap, min_val, max_val = None, None, None + #img_depth_fs = vis_disparity(img_disparity, min_val=min_val, max_val=max_val, cmap=cmap, color_map=cv2.COLORMAP_TURBO) + #xyz_map, pts = depth2xyzmap(img_depth_fs, K) + pts, us, vs = depth2xyz(img_depth_fs, K, us, vs, zmin=100) + clr = np.zeros_like(pts) + clr[:,0] = clr[:,1] = clr[:,2] = img_left.flatten() + + # show point cloud + 
#pcd.points.extend(pts) + + #vis.remove_geometry(pcd) + #pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(pts) + #pcd.colors = o3d.utility.Vector3dVector(clr) + downpcd = pcd.voxel_down_sample(voxel_size=0.1) + #vis.add_geometry(pcd) + + # 3) other iterables, e.g + # pcd.points.extend(np.random.rand(n_new, 3).tolist()) + + vis.update_geometry(downpcd) + + keep_running = vis.poll_events() + if not keep_running: break + vis.update_renderer() + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS') + ret3 = show_images_depth(depth_error, None, fig_name='Depth Error') + ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 or ret4 + + d.release() + vis.destroy_window() + +def test_video_stream_rs_versus_foundation_x2(): + "streaming rs to foundation - checks speed" + d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720)) + #d.load_preset_from_file('vpi/preset_rsm.json') + #d.set_exposure(100000) # 10ms + #d.switch_projector(False) + #self.cap.set_laser_power(100) # max power + camera_bf = d.get_bf() # for depth conversion (should be in meter) + args = process_arguments() + args.scale = 0.5 + + # init + model = foundation_stereo_algo_init(args) + + ret = False + while not ret: + # frame is I1,I2, D data + retf, frame = d.read() + if retf is False: + print('is your camera open/connected?') + break + + # extract + img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0) + + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + depth_error = depth_fs_rs_error(img_depth_rs, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # massage + #cmap, min_val, max_val = None, None, None + #img_depth_fs = vis_disparity(img_disparity, 
min_val=min_val, max_val=max_val, cmap=cmap, color_map=cv2.COLORMAP_TURBO) + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS') + ret3 = show_images_depth(depth_error, None, fig_name='Depth Error', vmax = 100) + ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 or ret4 + + # d.save_image(img_depth_rs,fname='depth_rs') + # d.save_image(img_depth_fs,fname='depth_fs') + # d.save_image(img_left,fname='img_left') + # d.save_image(img_right,fname='img_right') + # plt.show() + + d.release() + +def merge_fs_rs(depth_rs, depth_fs): + "trying to merge information and deal with non valid regions" + nr, nc = depth_rs.shape + depth_rs_out = depth_rs.copy() + valid_rs = depth_rs > 1 + start_rs = np.zeros_like(valid_rs) + start_rs[:,:-1] = np.logical_and(valid_rs[:,:-1] , ~valid_rs[:,1:]) # if pixel k is valid and k + 1 is not + stop_rs = np.zeros_like(valid_rs) + stop_rs[:,1:] = np.logical_and(~valid_rs[:,:-1] , valid_rs[:,1:]) # if pixel k-1 is not valid and k is valid + + for r in range(nr): + start_ind = np.where(start_rs[r,:])[0] + stop_ind = np.where(stop_rs[r,:])[0] + if len(start_ind) < 1 or len(stop_ind) < 1: + continue + + for s in start_ind: + ii = np.where(s < stop_ind)[0] + if len(ii) < 1: continue + f = stop_ind[0] + if np.abs(depth_fs[r,s] - depth_fs[r,f]) < 0.1*depth_fs[r,s]: + depth_rs_out[r,s:f] = depth_fs[r,s:f] + print('.') + + return depth_rs_out + +def test_video_stream_rs_fs_merge(): + "streaming rs to foundation - checks speed" + d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720)) + #d.load_preset_from_file('vpi/preset_rsm.json') + #d.set_exposure(100000) # 10ms + #d.switch_projector(False) + #self.cap.set_laser_power(100) # max power + camera_bf = d.get_bf() # for depth conversion (should be in meter) + args = process_arguments() + args.scale = 0.5 + + # init + model 
= foundation_stereo_algo_init(args) + + ret = False + while not ret: + # frame is I1,I2, D data + retf, frame = d.read() + if retf is False: + print('is your camera open/connected?') + break + + # extract + img_left, img_right, img_depth_rs = preprocess(frame, scale_factor=1.0) + + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + img_depth_merge = merge_fs_rs(img_depth_rs, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # massage + #cmap, min_val, max_val = None, None, None + #img_depth_fs = vis_disparity(img_disparity, min_val=min_val, max_val=max_val, cmap=cmap, color_map=cv2.COLORMAP_TURBO) + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS') + ret3 = show_images_depth(img_depth_merge, None, fig_name='Depth Merge', vmax = 100) + ret4 = show_images_depth(img_left.astype(np.uint8), img_right.astype(np.uint8), fig_name='Images L-R') + ret = ret1 or ret2 or ret3 or ret4 + + # d.save_image(img_depth_rs,fname='depth_rs') + # d.save_image(img_depth_fs,fname='depth_fs') + # d.save_image(img_left,fname='img_left') + # d.save_image(img_right,fname='img_right') + # plt.show() + + d.release() + +def test_faro_rs_fs_error(): + "reading data files from FARO and comparing the results" + + #d = RealSense(mode = 'd16', use_ir = True, frame_size = (1280,720)) + d = DataSource() + img_num = d.init_directory() + camera_bf = d.get_bf() # for depth conversion (should be in meter) + + args = process_arguments() + args.scale = 0.5 + args.model_dir = '/home/administrato/dev/Fast-FoundationStereo/weights/20-30-48/model_finetuned_faro.pth' + args.model_dir = '/home/administrato/dev/Fast-FoundationStereo/weights/20-30-48/model_finetuned_faro.pth' + + # init + model = foundation_stereo_algo_init(args) + + + img_index = np.random.randint(0,img_num,8) + for k 
in img_index: + + # frame is I1,I2, D data + out_data = d.get_item(k, debug = True) + #out_data = d.load_specific_files(debug = True) + #print(out_data) + + # extract + img_left, img_right, img_depth_rs, img_depth_faro = out_data["img_left"], out_data["img_right"], out_data["img_depth_rs"], out_data["img_depth_faro"] + print(img_left.shape, img_right.shape) + + # process + t_start = time.time() + img_disparity = foundation_stereo_algo(args, model, img_left, img_right) + img_depth_fs = convert_disparity_to_depth(camera_bf, img_disparity) + img_error_fs = depth_fs_rs_error(img_depth_rs, img_depth_fs) + img_error_faro = depth_fs_rs_error(img_depth_faro, img_depth_fs) + print(f'Proces time : {time.time()-t_start} sec') + + # show + ret1 = show_images_depth(img_depth_rs, None, fig_name='Depth RS') + ret2 = show_images_depth(img_depth_fs, None, fig_name='Depth FS', vmax = 400) + ret3 = show_images_depth(img_depth_faro, None, fig_name='Depth Faro') + ret4 = show_images_depth(img_error_fs.astype(np.uint8), None, fig_name='Error RS-FS', vmax = 100) + ret5 = show_images_depth(img_error_faro.astype(np.uint8), None, fig_name='Error Faro-FS', vmax = 100) + ret6 = show_images_depth(img_left.astype(np.uint8), None, fig_name='Images L') + ret7 = show_images_depth(img_right.astype(np.uint8), None, fig_name='Images R') + ret = ret1 or ret2 or ret3 or ret4 or ret5 or ret6 or ret7 + if ret: break + + # d.save_image(img_depth_rs,fname='depth_rs') + #d.save_image(img_depth_fs,fname='depth_fs') + # d.save_image(img_left,fname='img_left') + # d.save_image(img_right,fname='img_right') + # plt.show() + + #d.ckose() + + +if __name__ == '__main__': + #main() + #test_video_stream_rs_and_foundation() # ok + #test_video_error_rs_versus_foundation() # ok + # test_point_cloud_rt() # ok + #test_point_cloud_rs_versus_foundation() + #test_video_stream_rs_versus_foundation_x2() # ok + test_video_stream_rs_fs_merge() + #test_faro_rs_fs_error() + + + + + + + + + \ No newline at end of file diff --git 
a/scripts/run_plane_detector_fs.py b/scripts/run_plane_detector_fs.py new file mode 100644 index 0000000..839d63d --- /dev/null +++ b/scripts/run_plane_detector_fs.py @@ -0,0 +1,2651 @@ +#!/usr/bin/env python + +''' +Tester for multi planar plain detector with foundation stereo +================== + +Using depth image to compute depth planes locally for specific ROI. + + +Usage: + +Environemt : + ..\\docker + +Install : + + + +''' + +import sys +import numpy as np +import cv2 as cv +import random +import unittest +#from scipy.spatial.transform import Rotation as Rot +import matplotlib.pyplot as plt +import logging +log = logging.getLogger("robot") +log.setLevel(logging.DEBUG) +from opencv_realsense_camera import RealSense, draw_str +from run_fast_foundation_with_rs import convert_disparity_to_depth, foundation_stereo_algo_init, foundation_stereo_algo, process_arguments + +#!/usr/bin/env python + + + + + + +#%% Main +class PlaneDetector: + def __init__(self, detect_type = 'p', image_size = (1280,720)): + + self.detect_type = detect_type # plane + + self.frame_size = image_size + self.img = None + self.cam_matrix = np.array([[1000,0,self.frame_size[0]/2],[0,1000,self.frame_size[1]/2],[0,0,1]], dtype = np.float32) + self.cam_distort = np.array([0,0,0,0,0],dtype = np.float32) + + self.img3d = None # contains x,y and depth plains + self.img_xyz = None # comntains X,Y,Z information after depth image to XYZ transform + self.img_mask = None # which pixels belongs to the plain + self.rect = None # roi + self.img_roi = None # roi image + self.img_roi_normal = None # normals at roi image + + # detector type + self.matrix_inv = None # holds inverse params of the + self.rect_z = None # flat z for ROI + self.rect_dir = None # direct u,v,1 for ROI + self.rect_xyz = None # direct u,v,1 multiplied by z ROI + self.full_dir = None # direct u,v,1 for entire image + self.full_xyz = None # direct u,v,1 multiplied by z entire image + self.roi_index = None # index of the points og an ROI 
in the original image + self.plane_params = None # rvec not normalized + self.plane_center = None # tvec + self.plane_confidence = 0 # reliability of the detcetion + + #self.corner_ind = [0, 10, 40, 50] # corner of the rectnagle for the projection + self.rect_3d = None # roi but projected on 3D + + # params + self.MIN_SPLIT_SIZE = 32 + self.MIN_STD_ERROR = 0.01 + + # color for the mask + self.color_mask = np.random.randint(0,255,3) # random color + + # help variable + self.ang_vec = np.zeros((3,1)) # help variable + + def init_image(self, img = None): + "load image" + + self.img = img + h,w = img.shape[:2] + self.frame_size = (w,h) + self.img_mask = np.zeros((h,w)) + return True + + def init_roi(self, roi_type = 1): + "load the test case" + w,h = self.frame_size[0],self.frame_size[1] + w2,h2 = w>>1,h>>1 + roi = [0,0,w,h] + if roi_type == 1: + roi = [w2-3,h2-3,w2+3,h2+3] # xlu, ylu, xrb, yrb + elif roi_type == 2: + roi = [300,220,340,260] # xlu, ylu, xrb, yrb + elif roi_type == 3: + roi = [280,200,360,280] # xlu, ylu, xrb, yrb + elif roi_type == 4: + roi = [220,140,420,340] # xlu, ylu, xrb, yrb + elif roi_type == 5: + roi = [200,120,440,360] # xlu, ylu, xrb, yrb + elif roi_type == 11: + roi = [w2-16,h2-16,w2+16,h2+16] # xlu, ylu, xrb, yrb + elif roi_type == 12: + roi = [w2-32,h2-32,w2+32,h2+32] # xlu, ylu, xrb, yrb + elif roi_type == 13: + roi = [w2-64,h2-64,w2+64,h2+64] # xlu, ylu, xrb, yrb + elif roi_type == 14: + roi = [w2-64,h2-48,w2+64,h2+48] # xlu, ylu, xrb, yrb + elif roi_type == 21: # lower center image position + roi = [w2-64,h2+128,w2+64,h2+196] # xlu, ylu, xrb, yrb + elif roi_type == 22: # lower left image position + roi = [w2-400,h2+128,w2-272,h2+196] # xlu, ylu, xrb, yrb + elif roi_type == 23: # lower right image position + roi = [w2+272,h2+128,w2+400,h2+196] # xlu, ylu, xrb, yrb + elif roi_type == 31: # upper center image position + #roi = [w2-64,h2-196,w2+64,h2-128] # xlu, ylu, xrb, yrb + roi = [w2+64,h2-128,w2+128,h2-64] # xlu, ylu, xrb, yrb + elif 
roi_type == 32: # upper left image position + roi = [w2-400,h2-196,w2-272,h2-128] # xlu, ylu, xrb, yrb + elif roi_type == 33: # upper right image position + roi = [w2+302,h2-196,w2+430,h2-128] # xlu, ylu, xrb, yrb + elif roi_type == 41: # center center image position + #roi = [w2-64,h2+32,w2+64,h2+96] # xlu, ylu, xrb, yrb + roi = [w2-128,h2+64,w2+128,h2+196] # xlu, ylu, xrb, yrb + elif roi_type == 42: # center left image position + roi = [w2-200,h2+32,w2-72,h2+96] # xlu, ylu, xrb, yrb + elif roi_type == 43: # center right image position + roi = [w2+72,h2+32,w2+200,h2+96] # xlu, ylu, xrb, yrb + elif roi_type == 43: # center right image position + roi = [w2+72,h2+32,w2+200,h2+96] # xlu, ylu, xrb, yrb + elif roi_type == 52: # upper left image position for data 432 + roi = [w2-256,h2-256,w2-128,h2-192] # xlu, ylu, xrb, yrb + elif roi_type == 53: # upper center image position + roi = [w2-128,h2+16,w2+128,h2+300] # xlu, ylu, xrb, yrb + + elif roi_type == 60: # 422 on cube smaller + roi = [645,395,665,415] # xlu, ylu, xrb, yrb + elif roi_type == 61: # 422 on cube + roi = [640,390,670,420] # xlu, ylu, xrb, yrb + elif roi_type == 62: # 422 right cube side + roi = [690,390,720,420] # xlu, ylu, xrb, yrb + elif roi_type == 71: # set 422 - cube + roi = [630,h2+72,690,h2+128] # xlu, ylu, xrb, yrb + elif roi_type == 72: # set 422 - cube + roi = [430,h2-30,890,h2+228] # xlu, ylu, xrb, yrb + elif roi_type == 73: # set 422 - down side of the cube + roi = [630,h2+140,690,h2+190] # xlu, ylu, xrb, yrb + elif roi_type == 74: # set 422 - right side of the cube + roi = [700,h2+64,800,h2+164] # xlu, ylu, xrb, yrb + elif roi_type == 75: # set 422 - up side of the cube + roi = [700,h2+20,800,h2+96] # xlu, ylu, xrb, yrb + self.rect = roi + #self.rect_3d = [[-w,-h,0],[w,-h,0],[w,h,0],[-w,h,0],[-w,-h,0]] + log.info(f'Using ROI : {roi}') + return roi + + def preprocess(self, img = None): + "image preprocessing - extracts roi and converts from uint8 to float using log function" + if img is None: 
+ log.info('No image provided') + return False + + if self.img_mask is None: + ret = self.init_image(img) + + if self.rect is None: # use entire image + roi = self.init_roi(4) + + # init params of the inverse + if self.full_dir is None: + self.fit_plane_init() + + #x0, y0, x1, y1 = self.rect + if len(img.shape) > 2: + #img_roi = img[y0:y1,x0:x1,2].astype(np.float32) + img_roi = img[:,:,2].astype(np.float32) + else: + #img_roi = img[y0:y1,x0:x1].astype(np.float32) + img_roi = img.astype(np.float32) + return img_roi + + def init_img3d(self, img = None): + "initializes xyz coordinates for each point" + img = self.img if img is None else img + h,w = img.shape[:2] + x = np.arange(w) + y = np.arange(h) + x,y = np.meshgrid(x,y) + fx = self.cam_matrix[0,0] + fy = self.cam_matrix[1,1] + + xy = np.hstack((x.reshape(-1,1),y.reshape(-1,1))) + xy = np.expand_dims(xy, axis=1).astype(np.float32) + xy_undistorted = cv.undistortPoints(xy, self.cam_matrix, self.cam_distort) + + u = xy_undistorted[:,0,0].reshape((h,w)) + v = xy_undistorted[:,0,1].reshape((h,w)) + z3d = img.astype(np.float32) + x3d = z3d.copy() + y3d = z3d.copy() + + #ii = np.logical_and(z3d> 1e-6 , np.isfinite(z3d)) + ii = z3d > 5 + x3d[ii] = u[ii]*z3d[ii] #/fx + y3d[ii] = v[ii]*z3d[ii] #/fy + z3d[ii] = z3d[ii] + + #self.img3d = np.stack((u/fx,v/fy,z3d), axis = 2) + self.img3d = np.stack((u,v,z3d), axis = 2) + self.img_mask = np.zeros((h,w)) + return self.img3d + + def compute_img3d(self, img = None): + "compute xyz coordinates for each point using prvious init" + img = self.img if img is None else img + xyz = self.img3d + if xyz is None: + xyz = self.init_img3d(img) + + if np.any(img.shape[:2] != xyz.shape[:2]): + print('Image dimension change') + return + + imgXYZ = self.img3d.copy() + + z3d = img.astype(np.float32) + x3d = self.img3d[:,:,0].copy() # u/f + y3d = self.img3d[:,:,1].copy() # v/f + + # filter bad z values + #ii = np.logical_and(z3d > 1e-6 , np.isfinite(z3d)) + ii = z3d > 15 + x3d[ii] = x3d[ii]*z3d[ii] 
+ y3d[ii] = y3d[ii]*z3d[ii] + z3d[ii] = z3d[ii] + + # x,y,z coordinates in 3D + imgXYZ[:,:,0] = x3d + imgXYZ[:,:,1] = y3d + imgXYZ[:,:,2] = z3d + + self.img_xyz = imgXYZ + return imgXYZ + + def check_error(self, xyz1_mtrx, vnorm): + "checking the error norm" + err = np.dot(xyz1_mtrx, vnorm) + err_std = err.std() + return err_std + + def get_plane_params(self): + "for external interface support" + rvec_left = self.plane_params + tvec_left = self.plane_center + conf_left = self.plane_confidence + return tvec_left, rvec_left, conf_left + + def convert_plane_params(self, plane_equation): + "convert plane params to rvec" + # 4. Convert plane parameters to rvec and tvec + # - The plane normal vector is (A, B, C). + # - We can use the normal vector to get the rotation. + # - A point on the plane can be used for the translation vector. + + # Normalize the plane normal vector + normal = plane_equation #np.array([plane_equation[0], plane_equation[1], plane_equation[2]]) + normal_norm = np.linalg.norm(normal) + if normal_norm == 0: + log.error("Error: Zero norm for plane normal vector.") + return None + normal = normal / normal_norm + + # Use the normalized normal vector to get the rotation matrix + # This is a common method, but there are other ways to do this. + z_axis = np.array([0, 0, 1]) + rotation_axis = np.cross(z_axis, normal) + rotation_angle = np.arccos(np.dot(z_axis, normal)) + + # Handle the case where the rotation axis is zero (normal is parallel to z-axis) + if np.linalg.norm(rotation_axis) < 1e-6: + if normal[2] > 0: + rvec = np.zeros(3) # Rotation is identity + else: + rvec = np.array([0, np.pi, 0]) # Rotation by 180 degrees around X or Y. 
+ else: + rvec, _ = cv.Rodrigues(rotation_axis * rotation_angle) + rvec, _ = cv.Rodrigues(rvec) + + return rvec + + def convert_plane_params_to_pose(self, plane_params = None, plane_center = None): + "converting params of the plane to the pose vector" + + plane_params = self.plane_params if plane_params is None else plane_params[:3].flatten() + plane_center = self.plane_center if plane_center is None else plane_center[:3].flatten() + + tvec = plane_center.reshape((1,-1)) + rvec = plane_params.reshape((1,-1)) #reshape((-1,1)) + rvec = rvec/np.linalg.norm(rvec.flatten()) + + pose_norm = np.hstack((tvec, rvec)) + #log.info('roi to pose') + return pose_norm #.flatten() + + def fit_plane_init(self): + "prepares data for real time fit a*x+b*y+c = z" + if self.cam_matrix is None: + self.cam_matrix = np.array([[650,0,self.frame_size[0]/2],[0,650,self.frame_size[1]/2],[0,0,1]], dtype = np.float32) + self.cam_distort = np.array([0,0,0,0,0],dtype = np.float32) + log.info('Camera matrix is initialized to default.') + + x0,y0,x1,y1 = 0,0,self.frame_size[0],self.frame_size[1] #self.rect + h,w = y1-y0, x1-x0 + x_grid = np.arange(x0, x1, 1) + y_grid = np.arange(y0, y1, 1) + x, y = np.meshgrid(x_grid, y_grid) + + # remember corner indexes for reprojection [0 .... h*(w-1)) + # . . 
+ # h ......h*w-1] + #self.corner_ind = [0, h, h*w-1, h*(w-1), 0] + #h2,w2 = h>>1, w>>1 + #self.rect_3d = [[-w,-h,0],[w,-h,0],[w,h,0],[-w,h,0],[-w,-h,0]] + + # camera coordinates + xy = np.hstack((x.reshape(-1,1),y.reshape(-1,1))) + xy = np.expand_dims(xy, axis=1).astype(np.float32) + xy_undistorted = cv.undistortPoints(xy, self.cam_matrix, self.cam_distort) + + u = xy_undistorted[:,0,0].reshape((h,w)).reshape(-1,1) + v = xy_undistorted[:,0,1].reshape((h,w)).reshape(-1,1) + + # check + #u, v = u*self.cam_matrix[0,0], v*self.cam_matrix[1,1] + + self.full_dir = np.hstack((u,v,u*0+1)) + #self.matrix_inv = np.linalg.pinv(self.rect_dir) + + def fit_plane_init_old(self): + "prepares data for real time fit a*x+b*y+c = z" + self.cam_matrix = np.array([[650,0,self.frame_size[0]/2],[0,650,self.frame_size[1]/2],[0,0,1]], dtype = np.float32) + self.cam_distort = np.array([0,0,0,0,0],dtype = np.float32) + + x0,y0,x1,y1 = self.rect + h,w = y1-y0, x1-x0 + x_grid = np.arange(x0, x1, 1) + y_grid = np.arange(y0, y1, 1) + x, y = np.meshgrid(x_grid, y_grid) + + # remember corner indexes for reprojection [0 .... h*(w-1)) + # . . + # h ......h*w-1] + #self.corner_ind = [0, h, h*w-1, h*(w-1), 0] + h2,w2 = h>>1, w>>1 + self.rect_3d = [[-w,-h,0],[w,-h,0],[w,h,0],[-w,h,0],[-w,-h,0]] + + # camera coordinates + xy = np.hstack((x.reshape(-1,1),y.reshape(-1,1))) + xy = np.expand_dims(xy, axis=1).astype(np.float32) + xy_undistorted = cv.undistortPoints(xy, self.cam_matrix, self.cam_distort) + + u = xy_undistorted[:,0,0].reshape((h,w)).reshape(-1,1) + v = xy_undistorted[:,0,1].reshape((h,w)).reshape(-1,1) + + # check + #u, v = u*self.cam_matrix[0,0], v*self.cam_matrix[1,1] + + self.rect_dir = np.hstack((u,v,u*0+1)) + #self.matrix_inv = np.linalg.pinv(self.rect_dir) + + def convert_roi_to_points(self, img, point_num = 30, step_size = 1, roi_rect = None): + "converting roi to pts in XYZ - Nx3 array. 
point_num - is the target point number" + + # init params of the inverse + if self.full_dir is None: # do not use mtrix_dir - initialized before + self.fit_plane_init() + + # deal iwth different rect options + roi_rect = self.rect if roi_rect is None else roi_rect + x0, y0, x1, y1 = roi_rect + + # make rectangle + h,w = (y1-y0)>>1, (x1-x0)>>1 + self.rect_3d = [[-w,-h,0],[w,-h,0],[w,h,0],[-w,h,0],[-w,-h,0]] + + # extract roi - must be compatible with image dimensions + # n,m = img.shape[:2] + # img_roi_mask = np.zeros((n,m), dtype = np.bool_) + # img_roi_mask[y0:y1,x0:x1] = True + # valid_bool = img_roi_mask > 0 & img > 0 + + # check if roi is valid. +1 to grow in positive x and y since arange does not include x1,y1 + x_grid = np.arange(x0, x1, 1) + y_grid = np.arange(y0, y1, 1) + x, y = np.meshgrid(x_grid, y_grid) + #flat_indices = np.ravel_multi_index((y, x), img.shape[:2]).reshape((-1,1)) + flat_indices = y * self.frame_size[0] + x + flat_indices = flat_indices.ravel().astype(np.int32) + # valid under mask + #valid_bool = img.flat[flat_indices] > 0 + #ii = flat_indices[valid_bool] + img_roi = img[y0:y1,x0:x1].flatten() #.astype(np.float32).reshape((-1,1)) + valid_bool = img_roi > 1 # valid pixels in the roi + ii = np.where(valid_bool)[0] + + valid_point_num = len(ii) + if valid_point_num < 5: + return np.zeros((0,3)) + + step_size = np.maximum(step_size, np.int32(valid_point_num/point_num)) + ii = ii[::step_size] + + # plane params - using only valid + z = img_roi[ii].reshape((-1,1)) + jj = flat_indices[ii].flatten() + uv1_matrix = self.full_dir[jj,:] + xyz_matrix = uv1_matrix[:,:3]*z # keep 1 intact + + #self.plane_center = xyz_center.flatten() + self.rect_z = z + self.rect_dir = uv1_matrix + self.rect_xyz = xyz_matrix + self.roi_index = jj + self.img_roi = img[y0:y1,x0:x1] + + return xyz_matrix + + + def convert_roi_to_points_old(self, img_roi, point_num = 30, step_size = 1): + "converting roi to pts in XYZ - Nx3 array. 
point_num - is the target point number" + # x1,y1 = self.img_xyz.shape[:2] + # roi_area = x1*y1 + + # # reduce size of the grid for speed + # if step_size < 1 and roi_area > 100: + # step_size = np.maximum(1,int(np.sqrt(roi_area)/10)) + + + # #roi3d = self.img_xyz[y0:y1:step_size,x0:x1:step_size,:] + # roi3d = self.img_xyz[::step_size,::step_size,:] + # x,y,z = roi3d[:,:,0].reshape((-1,1)), roi3d[:,:,1].reshape((-1,1)), roi3d[:,:,2].reshape((-1,1)) + # xyz_matrix = np.hstack((x,y,z)) + # + + # init params of the inverse + if self.rect_dir is None: + self.fit_plane_init_old() + + # extract roi + + n,m = img_roi.shape[:2] + img_roi = img_roi.reshape((-1,1)) + valid_bool = img_roi > 0 + valid_bool = valid_bool.flatten() + #log.info(f'Timing : 1') + + # all non valid + ii = np.where(valid_bool)[0] + valid_point_num = len(ii) + if valid_point_num < 5: + return None + step_size = np.maximum(step_size, np.int32(valid_point_num/point_num)) + ii = ii[::step_size] + + # plane params - using only valid + z = img_roi[ii] + xyz_matrix = self.rect_dir[ii,:] + xyz_matrix[:,:3] = xyz_matrix[:,:3]*z # keep 1 intact + + # update corners of the rect in 3d + #self.rect_3d = self.rect_dir[self.corner_ind,:]*img_roi[self.corner_ind] + # rect to show + x0, y0, x1, y1 = self.rect + h,w = y1-y0, x1-x0 + self.rect_3d = [[-w,-h,0],[w,-h,0],[w,h,0],[-w,h,0],[-w,-h,0]] + # substract mean + #xyz_center = xyz_matrix[:,:3].mean(axis=0) + #xyz_matrix = xyz_matrix - xyz_center + #log.info(f'Timing : 2') + + # mtrx_dir = np.hstack((self.rect_dir[valid_bool,0]*z,self.rect_dir[valid_bool,1]*z,z*0+1)) + # mtrx_inv = np.linalg.pinv(mtrx_dir) + # #mtrx_inv = self.matrix_inv[:,valid_bool] + # plane_params = np.dot(mtrx_inv,z) + + # decimate to make it run faster reduce size of the grid for speed. 
1000 pix - 30x30 - step 1, 10000 pix - step=3 + #roi_area = n*m + #step_size = int(np.sqrt(roi_area)/7) if roi_area > 1000 else 1 + + #self.plane_center = xyz_center.flatten() + self.rect_xyz = xyz_matrix + + return xyz_matrix + + def fit_plane_svd(self, img_roi): + "estimates mean and std of the plane fit" + + # roi converted to points with step size on the grid + xyz_matrix = self.convert_roi_to_points(img_roi, point_num = 600, step_size = 1) + + # some problem with points + if xyz_matrix.shape[0] < 2: + log.warning('Not enough points in the ROI') + return 0, 0 + + # substract mean + xyz_center = xyz_matrix[:,:3].mean(axis=0) + xyz_matrix = xyz_matrix - xyz_center + #log.info(f'Timing : 2') + + # mtrx_dir = np.hstack((self.rect_dir[valid_bool,0]*z,self.rect_dir[valid_bool,1]*z,z*0+1)) + # mtrx_inv = np.linalg.pinv(mtrx_dir) + # #mtrx_inv = self.matrix_inv[:,valid_bool] + # plane_params = np.dot(mtrx_inv,z) + + # decimate to make it run faster reduce size of the grid for speed. 1000 pix - 30x30 - step 1, 10000 pix - step=3 + #roi_area = n*m + #step_size = int(np.sqrt(roi_area)/7) if roi_area > 1000 else 1 + + # using svd to make the fit + U, S, Vh = np.linalg.svd(xyz_matrix, full_matrices=True) + ii = np.argmin(S) + vnorm = Vh[ii,:] + #log.info(f'Timing : 3') + + # keep orientation + plane_params = vnorm*np.sign(vnorm[2]) + + # estimate error + err = np.dot(xyz_matrix,plane_params) + #z_est = z + err + xyz_center[2] + + img_mean = xyz_center[2] #z_est.mean() + img_std = err.std() + self.plane_params = plane_params[:3].flatten() + self.plane_center = xyz_center.flatten() + + #log.info(f'Plane : {self.plane_params}, error {img_std:.3f}, step {step_size}') + + return img_mean, img_std + + def fit_plane_svd_old(self, img_roi): + "estimates mean and std of the plane fit" + # n,m = img_roi.shape[:2] + # img_roi = img_roi.reshape((-1,1)) + # valid_bool = img_roi > 0 + # valid_bool = valid_bool.flatten() + # #log.info(f'Timing : 1') + + # # init params of the inverse + # 
if self.matrix_inv is None: + # self.fit_plane_init() + + # # plane params - using only valid + # z = img_roi[valid_bool] + # xyz_matrix = self.rect_dir[valid_bool,:] + # xyz_matrix[:,:3] = xyz_matrix[:,:3]*z # keep 1 intact + + # update corners of the rect in 3d + #self.rect_3d = self.rect_dir[self.corner_ind,:]*img_roi[self.corner_ind] + + # roi converted to points with step size on the grid + #xyz_matrix = self.convert_roi_to_points(img_roi, point_num = 1e4, step_size = 1) + xyz_matrix = self.convert_roi_to_points_old(img_roi, point_num = 1e4, step_size = 1) + + + # some problem with points + if xyz_matrix.shape[0] < 2: + log.warning('Not enough points in the ROI') + return 0, 0 + + # substract mean + xyz_center = xyz_matrix[:,:3].mean(axis=0) + xyz_matrix = xyz_matrix - xyz_center + #log.info(f'Timing : 2') + + # mtrx_dir = np.hstack((self.rect_dir[valid_bool,0]*z,self.rect_dir[valid_bool,1]*z,z*0+1)) + # mtrx_inv = np.linalg.pinv(mtrx_dir) + # #mtrx_inv = self.matrix_inv[:,valid_bool] + # plane_params = np.dot(mtrx_inv,z) + + # decimate to make it run faster reduce size of the grid for speed. 
1000 pix - 30x30 - step 1, 10000 pix - step=3 + #roi_area = n*m + #step_size = int(np.sqrt(roi_area)/7) if roi_area > 1000 else 1 + + # using svd to make the fit + U, S, Vh = np.linalg.svd(xyz_matrix, full_matrices=True) + ii = np.argmin(S) + vnorm = Vh[ii,:] + #log.info(f'Timing : 3') + + # keep orientation + plane_params = vnorm*np.sign(vnorm[2]) + + # estimate error + err = np.dot(xyz_matrix,plane_params) + #z_est = z + err + xyz_center[2] + + img_mean = xyz_center[2] #z_est.mean() + img_std = err.std() + self.plane_params = plane_params[:3].flatten() + self.plane_center = xyz_center.flatten() + + #log.info(f'Plane : {self.plane_params}, error {img_std:.3f}, step {step_size}') + + return img_mean, img_std + + def fit_plane_svd_weighted(self, img_roi): + "estimates mean and std of the plane fit - fit is weighted assuming Sigma(z) = a*z" + # roi converted to points with step size on the grid + xyz_matrix = self.convert_roi_to_points(img_roi, point_num = 500, step_size = 1) + + # some problem with points + if xyz_matrix.shape[0] < 2: + log.warning('Not enough points in the ROI') + return 0, 0 + + # # substract mean + # xyz_center = xyz_matrix[:,:3].mean(axis=0) + # xyz_matrix = xyz_matrix - xyz_center + + # plane params - using only valid + z = self.rect_z + uv1_matrix = self.rect_dir # (x - x0)/f, (y - y0)/f, 1 + + # minimization function min |ua/f + vb/f + c + dz|^2/|z|^2 + f = self.cam_matrix[0,0] + # uvf_matrix = np.dot(uv1_matrix , np.diag([1/f,1/f,1])) + + mtrx_inv = np.linalg.pinv(uv1_matrix) + b = f/z + plane_params = np.dot(mtrx_inv,b) + plane_params = plane_params/np.linalg.norm(plane_params) + + # center + xyz_matrix = self.rect_xyz #uv1_matrix[:,:3]*z # keep 1 intact + xyz_center = xyz_matrix.mean(axis=0) + + # estimate error + err = np.dot(uv1_matrix,plane_params) + z_est = z - err + + img_mean = z_est.mean() + img_std = err.std() + self.plane_params = plane_params[:3].flatten() + self.plane_center = xyz_center.flatten() + + log.info(f'Plane : 
{self.plane_params}, error {img_std:.3f}') + + return img_mean, img_std + + def fit_plane_with_outliers(self, img_roi): + "computes normal for the specifric roi and evaluates error. Do it twice to reject outliers" + # roi converted to points with step size on the grid + xyz_matrix = self.convert_roi_to_points(img_roi, point_num = 1500, step_size = 1) + + # some problem with points + if xyz_matrix.shape[0] < 2: + log.warning('Not enough points in the ROI') + return 0, 0 + + # substract mean + xyz_center_1 = xyz_matrix[:,:3].mean(axis=0) + xyz_1 = xyz_matrix - xyz_center_1 + + # using svd to make the fit to a sub group + U, S, Vh = np.linalg.svd(xyz_1, full_matrices=True) + ii = np.argmin(S) + vnorm = Vh[ii,:] + #vnorm = vnorm*np.sign(vnorm[2]) # keep orientation + + # keep orientation + plane_params = vnorm*np.sign(vnorm[2]) + + # estimate error + err = np.dot(xyz_1,plane_params) + err_std = err.std() + log.info('Fit error iteration 1: %s' %str(err_std)) + + # filter only the matching points + inlier_ind = np.abs(err) < 3*err_std + + # substract mean when only inliers are kept + xyz_center_2 = xyz_matrix[inlier_ind,:3].mean(axis=0)# + xyz_2 = xyz_matrix[inlier_ind,:] - xyz_center_2 + + # perform svd one more time + U, S, Vh = np.linalg.svd(xyz_2, full_matrices=True) + ii = np.argmin(S) + vnorm = Vh[ii,:] + + # keep orientation + plane_params = vnorm*np.sign(vnorm[2]) + + # checking error + err = np.dot(xyz_2, plane_params) + err_std = err.std() + log.info('Fit error iteration 2: %s' %str(err_std)) + + # # We can convert this flat index to row and column indices + # row_index, col_index = np.unravel_index(inlier_ind, self.img_mask.shape) + # self.img_mask[row_index, col_index] = 1 + + img_mean = xyz_center_2[2] #z_est.mean() + img_std = err_std + self.plane_params = plane_params[:3].flatten() + self.plane_center = xyz_center_2.flatten() + + #log.info(f'Plane : {self.plane_params}, error {img_std:.3f}, step {step_size}') + + return img_mean, img_std + + def 
fit_plane_ransac(self, img_roi): + + """ + Find the best equation for a plane. + + :param pts: 3D point cloud as a `np.array (N,3)`. + :param thresh: Threshold distance from the plane which is considered inlier. + :param maxIteration: Number of maximum iteration which RANSAC will loop over. + :returns: + - `self.equation`: Parameters of the plane using Ax+By+Cy+D `np.array (1, 4)` + - `self.inliers`: points from the dataset considered inliers + + """ + #log.info('Fit ransac: ...') + # roi converted to points with step size on the grid + #xyz_matrix = self.convert_roi_to_points_old(img_roi, point_num = 250, step_size = 1) + xyz_matrix = self.convert_roi_to_points(img_roi, point_num = 250, step_size = 1) + if xyz_matrix is None: + log.error('No points in the ROI') + return 0, 0 + + thresh = 1.05 + maxIteration = 100 + + + n_points = xyz_matrix.shape[0] + best_eq = [] + best_inliers = [] + + for it in range(maxIteration): + + # Samples 3 random points + if n_points < 3: break + id_samples = random.sample(range(0, n_points), 3) + pt_samples = xyz_matrix[id_samples,:] + + # We have to find the plane equation described by those 3 points + # We find first 2 vectors that are part of this plane + # A = pt2 - pt1 + # B = pt3 - pt1 + + vecA = pt_samples[1, :] - pt_samples[0, :] + vecB = pt_samples[2, :] - pt_samples[0, :] + + # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane + vecC = np.cross(vecA, vecB) + vecC_norm = np.linalg.norm(vecC) + + # protect from the close spaced points + if vecC_norm < 10e-6: + continue + + # make sure that Z direction is positive + vecC = vecC * np.sign(vecC[2]) + + # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[0]*z = -k + # We have to use a point to find k + vecC = vecC / vecC_norm + #k = -np.sum(np.multiply(vecC, pt_samples[1, :])) + k = -np.dot(vecC, pt_samples[1, :]) + plane_eq = [vecC[0], vecC[1], vecC[2], k] + + # Distance from a point to a plane + # 
def estimate_normals_from_depth_map(self, depth_map):
    """Compute a unit surface normal for every pixel of a depth image.

    The depth image is converted to float32, smoothed with a 5x5 Gaussian and
    differentiated with the Sobel operator. Each pixel's normal is the
    normalised direction ``(-dz/du, -dz/dv, 1)``.

    Args:
        depth_map (np.ndarray): single-channel depth image.

    Returns:
        np.ndarray: float32 array of shape (H, W, 3) holding (nx, ny, nz) per pixel.
    """
    # work on a float32 copy only when the input is not float32 already
    depth_f32 = depth_map if depth_map.dtype == np.float32 else depth_map.astype(np.float32)
    smoothed = cv.GaussianBlur(depth_f32, (5, 5), 0)

    # ksize=1 selects the plain 3x1 / 1x3 derivative kernel (no extra smoothing)
    sobel_args = dict(ksize=1, borderType=cv.BORDER_DEFAULT)
    dz_du = cv.Sobel(smoothed, cv.CV_32F, 1, 0, **sobel_args)  # horizontal gradient
    dz_dv = cv.Sobel(smoothed, cv.CV_32F, 0, 1, **sobel_args)  # vertical gradient

    # un-normalised direction per pixel, stacked along a new last axis -> (H, W, 3)
    directions = np.dstack((-dz_du, -dz_dv, np.ones_like(dz_du)))

    # divide by the Euclidean length; zero-length vectors stay (0, 0, 0)
    lengths = np.linalg.norm(directions, axis=2, keepdims=True)
    return np.divide(directions, lengths, out=np.zeros_like(directions), where=lengths != 0)
def fit_plane_using_gradients(self, img_full, roi_rect=None):
    """Estimate the ROI plane normal by averaging per-pixel gradient normals.

    Args:
        img_full: image forwarded to ``self.convert_roi_to_points``.
        roi_rect: optional ROI rectangle forwarded to ``self.convert_roi_to_points``.

    Returns:
        tuple: ``(img_mean, img_std)`` - z coordinate of the ROI centroid and the
        standard deviation of the point residuals along the estimated normal.
        ``(0, 0)`` is returned when there is no ROI or fewer than two points.

    Side effects:
        Sets ``self.plane_params``, ``self.plane_center`` and
        ``self.img_roi_normal`` (per-pixel normals, kept for debug/display).
    """
    # roi converted to points with step size on the grid
    # (presumably this call also refreshes self.img_roi - verify in convert_roi_to_points)
    xyz_roi = self.convert_roi_to_points(img_full, point_num=500, step_size=1, roi_rect=roi_rect)
    if self.img_roi is None:
        log.error('No ROI in image')
        return 0, 0

    # validate the point set before spending time on the normal estimation
    if xyz_roi is None or xyz_roi.shape[0] < 2:
        log.warning('Not enough points in the ROI')
        return 0, 0

    img_roi = self.img_roi.astype(np.float32)
    img_normal = self.estimate_normals_from_depth_map(img_roi)

    # Plane normal = mean of the per-pixel normals. The mean of unit vectors is
    # shorter than 1 whenever they disagree, so it is renormalised; otherwise the
    # residuals below would be scaled down with it.
    plane_params = img_normal.mean(axis=(0, 1))
    length = np.linalg.norm(plane_params)
    if np.isfinite(length) and length > 0:
        plane_params = plane_params / length

    # residual of every point along the normal, relative to the centroid
    xyz_center = xyz_roi[:, :3].mean(axis=0)
    err = np.dot(xyz_roi - xyz_center, plane_params)

    img_mean = xyz_center[2]
    img_std = err.std()
    self.plane_params = plane_params[:3].flatten()
    self.plane_center = xyz_center.flatten()
    self.img_roi_normal = img_normal  # save for debug and display

    log.info(f'Plane : {self.plane_params}, error {img_std:.3f}')

    return img_mean, img_std
def fit_and_split_roi_recursively(self, roi, level = 0):
    """Fit a plane to ``roi`` and recursively split it into four quadrants.

    Recursion stops when the fit error is below ``self.MIN_STD_ERROR`` or a
    quadrant would be smaller than ``self.MIN_SPLIT_SIZE``. The split is kept
    only if at least one descendant fit has less than half the parent's error.

    Args:
        roi: rectangle as ``[x0, y0, x1, y1]``.
        level: current recursion depth (used for logging only).

    Returns:
        list: the ``self.fit_plane`` results that describe this region - either
        the single parent fit or the fits collected from the four quadrants.
    """
    x0, y0, x1, y1 = roi
    log.info('Processing level %d, region x = %d, y = %d' %(level,x0,y0))

    # fit of the whole region
    parent_fit = self.fit_plane(roi)
    if parent_fit['error'] < self.MIN_STD_ERROR:
        log.info('Fit is good enough x = %d, y = %d' %(x0,y0))
        return [parent_fit]

    # split point; stop when either half would be too small
    xs = int((x1 + x0)/2)
    ys = int((y1 + y0)/2)
    if min(xs - x0, ys - y0) < self.MIN_SPLIT_SIZE:
        log.info('Min size is reached x = %d, y = %d' %(x0,y0))
        return [parent_fit]

    # gather the fits of the four quadrants (each may itself have been split)
    quadrants = ([x0,y0,xs,ys], [x0,ys,xs,y1], [xs,y0,x1,ys], [xs,ys,x1,y1])
    child_fits = []
    for quad in quadrants:
        child_fits.extend(self.fit_and_split_roi_recursively(quad, level + 1))

    # accept the split if any descendant is at least twice as good as the parent
    half_parent_error = parent_fit['error']/2
    if any(fit['error'] < half_parent_error for fit in child_fits):
        log.info('Split at level %d, region x = %d, y = %d' %(level,x0,y0))
        return child_fits

    log.info('No split level %d, region x = %d, y = %d' %(level,x0,y0))
    return [parent_fit]
*= f # keep 1 intact + + # plane params - using only valid + z = self.rect_z + uv1_matrix = self.rect_dir + uv1z_matrix = np.hstack((uv1_matrix,1/z)) + uv1z_matrix[:,2:3] *= f # keep 1 intact + + # using svd to make the fit + U, S, Vh = np.linalg.svd(uv1z_matrix, full_matrices=True) + ii = np.argmin(S) + vnorm = Vh[ii,:] + + # keep orientation + plane_params = vnorm*np.sign(vnorm[2]) + + # estimate error + err = np.dot(xyz_matrix,plane_params[:3]) + + # patch mean + xyz_center = xyz_matrix[:,:3].mean(axis=0) + img_mean = xyz_center[2] #z_est.mean() + img_std = err.std() + self.plane_params = plane_params[:3].flatten() + self.plane_center = xyz_center.flatten() + + log.info(f'Plane : {self.plane_params}, error {img_std:.3f}') + + return img_mean, img_std + + def fit_plane_and_project_the_image(self, img_full): + + """ + Find the best equation for a plane of the predefined ROI and then projecvt the entire image on the plane + """ + h,w = img_full.shape[:2] + if len(img_full.shape) > 2: + img_full = img_full[:,:,2].astype(np.float32) + + # start from the original ROI + if self.img_mask is None: + isOk = self.init_image(img_full) + + # extract ROI + roi_rect = [50, 50, w-50, h-50] + xyz_matrix = self.convert_roi_to_points(img_full, point_num = 50000, step_size = 1, roi_rect = roi_rect) + + # check against the plane : do not substract plane.center from all the points + vecC = self.plane_params[:3] + dist_offset = np.dot(self.plane_center, vecC) + dist_pt = np.dot(xyz_matrix, vecC) - dist_offset + + # Select indexes where distance is biggers than the threshold + thresh = 1.5 + err = np.abs(dist_pt) + i2 = np.where(err <= thresh)[0] + + # transfer xi,yi coordinates to the original image index + ii = self.roi_index[i2] # convert to 2D index + + # update mask according to the valid pixels + self.img_mask.flat[ii] = 1 + # make sure that mask is not empty - initial rectangle + # x0, y0, x1, y1 = self.rect + # self.img_mask[y0:y1,x0:x1] = 1 + + # position in 2d array + # 
def growingStep(self, depths:np.ndarray, step:int, direction:str, mean:float, std:float,
                left:int, top:int, right:int, bottom:int, numConfInt:int=3, testRatio:float=0.95):
    """
    Test whether growing the segment by ``step`` pixels in ``direction`` is acceptable.

    The strip of pixels that would be added is accepted when the fraction of its
    pixels that are non-zero and lie within ``numConfInt`` standard deviations of
    ``mean`` exceeds ``testRatio``. Zero pixels count against the strip.

    Bounds are inclusive. In this code ``bottom`` is the smaller row index and
    ``top`` the larger one (the segment is ``depths[bottom:top + 1, left:right + 1]``).

    :param depths: uncropped depth image
    :param step: step size to test; 0 is allowed and always denies growing
    :param direction: growing direction. should be either: 'left', 'right', 'top' or 'bottom'
    :param mean: the current mean depth of the segment
    :param std: the current standard deviation of the segment's depths
    :param left: the current left bound of the segment
    :param top: the current top bound of the segment
    :param right: the current right bound of the segment
    :param bottom: the current bottom bound of the segment
    :param numConfInt: number of confidence intervals threshold
    :param testRatio: ratio of elements in growing area to be passed the confidence interval threshold
    :return: true if growing is accepted, otherwise false
    :raises ValueError: on an unknown direction, negative step, numConfInt < 1
        or testRatio outside [0, 1]
    """
    if direction not in ('left', 'right', 'bottom', 'top'):
        raise ValueError('direction must be one of "left", "right", "bottom", "top"')

    if step < 0:
        raise ValueError('step must be non-negative')

    if numConfInt < 1:
        raise ValueError('numConfInt must be at least 1')

    if testRatio > 1 or testRatio < 0:
        raise ValueError('testRatio must be between 0 and 1')

    # nothing to add - deny instead of failing on a missing strip
    if step == 0:
        return False

    # inclusive extent of the current segment, shared by all four strips
    rows = slice(bottom, top + 1)
    cols = slice(left, right + 1)

    # max(..., 0) keeps a start index from going negative, which numpy would
    # interpret as counting from the far end of the image
    if direction == 'left':
        strip = depths[rows, max(left - step, 0):left]
    elif direction == 'right':
        strip = depths[rows, right + 1:right + step + 1]
    elif direction == 'top':
        strip = depths[top + 1:top + step + 1, cols]
    else:  # 'bottom'
        strip = depths[max(bottom - step, 0):bottom, cols]

    # strip lies completely outside the image
    if strip.size == 0:
        return False

    # only non-zero pixels are tested; comparing against numConfInt * std
    # (instead of dividing by std) avoids a division by zero for std == 0
    valid = strip[strip != 0]
    within = np.count_nonzero(np.abs(valid - mean) < numConfInt * std)

    return within / strip.size > testRatio
depths:np.ndarray, initialRoi) -> tuple[int, int, int, int]: + """ + Find maximum ROI in depth image using region growing from an initial ROI + :param depths: entire depth image + :param initialRoi: initial ROI to grow from as list/tuple in the order: left, top, right, bottom + :return: final ROI found as a tuple in the order: left, top, right, bottom + """ + height, width = depths.shape + left, top, right, bottom = initialRoi + stepLeft = stepRight = stepTop = stepBottom = 1 # initializing growing steps + + while stepLeft + stepRight + stepTop + stepBottom > 0: + currentData = depths[bottom:top + 1, left:right + 1] + currentRows, currentCols = np.nonzero(currentData) # getting the indexes of all nonzero elements + currentMean = currentData[currentRows, currentCols].mean() + currentStd = currentData[currentRows, currentCols].std() + + if stepLeft > 0: # attempting to grow to the left + if left - stepLeft < 0: # growing left with current step exceeded image dimensions + stepLeft = 1 if stepLeft > 1 else 0 + elif left == 0: # growing reached and of image, no more growing available + stepLeft = 0 + else: + res = self.growingStep(depths, stepLeft, 'left', currentMean, currentStd, left, top, right, bottom) + if res: + left -= stepLeft + stepLeft *= 2 # increasing growing step for next iteration + else: + stepLeft = 1 if stepLeft > 1 else 0 + + if stepRight > 0: # attempting to grow to the right + if right + stepRight > width: # growing right with current step exceeded image dimensions + stepRight = 1 if stepRight > 1 else 0 + elif right == width - 1: # growing reached and of image, no more growing available + stepRight = 0 + else: + res = self.growingStep(depths, stepRight, 'right', currentMean, currentStd, left, top, right, bottom) + if res: + right += stepRight + stepRight *= 2 # increasing growing step for next iteration + else: + stepRight = 1 if stepRight > 1 else 0 + + if stepBottom > 0: # attempting to grow down + if bottom - stepBottom < 0: # growing down with 
def find_planes(self, img):
    """Run the plane fit selected by ``self.detect_type`` and store the result.

    The detector type is matched case-insensitively:

    * single-stage fits on the preprocessed ROI:
      'P' svd, 'W' weighted svd, 'O' svd with outlier rejection,
      'T' depth-weighted 4d svd, 'R' ransac, 'G' gradients
    * two-stage fits - ``fit_plane_svd`` on the preprocessed ROI first, then a
      second stage on the full image:
      'F' ransac and grow, 'S' project the image on the plane,
      'N' project the image using gradients

    Any other type leaves mean and std at 0.

    Side effects:
        Sets ``self.img_mean`` and ``self.img_std`` (final measurements per frame).

    Returns:
        bool: always True.
    """
    # method names are resolved lazily so only the selected fitter is looked up
    roi_fitters = {
        'P': 'fit_plane_svd',
        'W': 'fit_plane_svd_weighted',
        'O': 'fit_plane_with_outliers',
        'T': 'fit_plane_svd_weighted_4d',      # weighted by depth
        'R': 'fit_plane_ransac',
        'G': 'fit_plane_using_gradients',      # gradients
    }
    full_image_fitters = {
        'F': 'fit_plane_ransac_and_grow',
        'S': 'fit_plane_and_project_the_image',
        'N': 'fit_plane_and_project_the_image_using_gradients',
    }

    kind = self.detect_type.upper()
    img_mean, img_std = 0, 0

    if kind in roi_fitters:
        img_roi = self.preprocess(img)
        img_mean, img_std = getattr(self, roi_fitters[kind])(img_roi)

    elif kind in full_image_fitters:
        img_roi = self.preprocess(img)
        # initial ROI plane; its return value is superseded by the second stage
        img_mean, img_std = self.fit_plane_svd(img_roi)
        img_mean, img_std = getattr(self, full_image_fitters[kind])(img)

    self.img_mean = img_mean  # final measurements per frame
    self.img_std = img_std
    return True
except Exception as e: + print(e) + print(rvec, tvec, img.shape) + return img + +def draw_polygon(img, rvec, tvec, cam_mtrx, cam_dist, points3d): + # unit is mm + points = np.float32(points3d).reshape(-1, 3) + polygon_points, _ = cv.projectPoints(points, rvec, tvec, cam_mtrx, cam_dist) + polygon_points = polygon_points.squeeze().astype(np.int32) + img = cv.polylines(img, [polygon_points], True, (0, 200, 200), 1) + + # To fill the polygon, use thickness=-1 + # cv2.fillPoly(img, [pts], color) + + return img + +def draw_cube(img, corners, imgpts): + imgpts = np.int32(imgpts).reshape(-1,2) + # draw ground floor in green + img = cv.drawContours(img, [imgpts[:4]],-1,(0,255,0),-3) + # draw pillars in blue color + for i,j in zip(range(4),range(4,8)): + img = cv.line(img, tuple(imgpts[i]), tuple(imgpts[j]),(255),3) + + # draw top layer in red color + img = cv.drawContours(img, [imgpts[4:]],-1,(0,0,255),3) + return img + +#%% ROI selector from OpenCV +class RectSelector: + def __init__(self, win, callback): + self.win = win + self.callback = callback + cv.setMouseCallback(win, self.onmouse) + self.drag_start = None + self.drag_rect = None + def onmouse(self, event, x, y, flags, param): + x, y = np.int16([x, y]) # BUG + if event == cv.EVENT_LBUTTONDOWN: + self.drag_start = (x, y) + return + if self.drag_start: + if flags & cv.EVENT_FLAG_LBUTTON: + xo, yo = self.drag_start + x0, y0 = np.minimum([xo, yo], [x, y]) + x1, y1 = np.maximum([xo, yo], [x, y]) + self.drag_rect = None + if x1-x0 > 0 and y1-y0 > 0: + self.drag_rect = (x0, y0, x1, y1) + else: + rect = self.drag_rect + self.drag_start = None + self.drag_rect = None + if rect: + self.callback(rect) + def draw(self, vis): + if not self.drag_rect: + return False + x0, y0, x1, y1 = self.drag_rect + cv.rectangle(vis, (x0, y0), (x1, y1), (0, 255, 0), 2) + return True + @property + def dragging(self): + return self.drag_rect is not None + +#%% Data Generator +class DataGen: + def __init__(self, img_size = (640,480)): + + 
self.frame_size = img_size + self.img = None + self.rect = None # roi + + + def add_noise(self, img_gray, noise_percentage = 0.01): + "salt and pepper noise" + if noise_percentage < 0.001: + return img_gray + + + # Get the image size (number of pixels in the image). + img_size = img_gray.size + + # Set the percentage of pixels that should contain noise + #noise_percentage = 0.1 # Setting to 10% + + # Determine the size of the noise based on the noise precentage + noise_size = int(noise_percentage*img_size) + + # Randomly select indices for adding noise. + random_indices = np.random.choice(img_size, noise_size) + + # Create a copy of the original image that serves as a template for the noised image. + img_noised = img_gray.copy() + + # Create a noise list with random placements of min and max values of the image pixels. + #noise = np.random.choice([img_gray.min(), img_gray.max()], noise_size) + noise = np.random.choice([-10, 10], noise_size) + + # Replace the values of the templated noised image at random indices with the noise, to obtain the final noised image. 
+ img_noised.flat[random_indices] += noise + + log.info('adding image noise') + return img_noised + + def init_image(self, img_type = 1): + # create some images for test + w,h = self.frame_size + if img_type == 1: # / + + self.img = np.tile(np.linspace(100, 300, w), (h,1)) + + elif img_type == 2: # /|/ + + self.img = np.tile(np.linspace(100, 200, int(w/2)), (h,2)) + + elif img_type == 3: # |_| + + self.img = np.tile(np.linspace(100, 200, h).reshape((-1,1)), (1,w)) + + elif img_type == 4: # /\ + + self.img = np.tile(np.hstack((np.linspace(300, 500, w>>1),np.linspace(500, 300, w>>1))), (h,1)) + + elif img_type == 5: # dome + + x,y = np.meshgrid(np.arange(w),np.arange(h)) + self.img = (np.abs(x - w/2) + np.abs(y - h/2))/10 + 200 # less slope + + elif img_type == 6: # sphere + + x,y = np.meshgrid(np.arange(w),np.arange(h)) + self.img = np.sqrt((x - w/2)**2 + (y - h/2)**2)/10 + 200 # less slope + + elif img_type == 7: # stair + + x,y = np.meshgrid(np.arange(w),np.arange(h)) + self.img = (np.sign(x - w/2) + np.sign(y - h/2))*5 + 200 # less slope + + + elif img_type == 8: # corner + + x,y = np.meshgrid(np.arange(w),np.arange(h)) + self.img = np.ones((h,w))*250 + img_bool = np.logical_and((x - w/2) < 0, (y - h/2) < 0) + self.img[img_bool] = 230 # quarter + + elif img_type == 10: # flat + + self.img = np.ones((h,w))*500 + + elif img_type == 11: + "chess board" + fname = r"C:\Users\udubin\Documents\Code\opencv-4x\samples\data\left04.jpg" + self.img = cv.imread(fname) + + elif img_type == 12: + self.img = cv.imread('image_scl_001.png', cv.IMREAD_GRAYSCALE) + #self.img = cv.resize(self.img , dsize = self.frame_size) + + elif img_type == 13: + self.img = cv.imread(r"wrappers\python\applications\planes\data\image_ddd_000.png", cv.IMREAD_GRAYSCALE) + #self.img = cv.resize(self.img , dsize = self.frame_size) + + elif img_type == 21: + self.img = cv.imread(r"C:\Data\Depth\Plane\image_scl_000.png", cv.IMREAD_GRAYSCALE) + #self.img = cv.resize(self.img , dsize = self.frame_size) + + 
def init_roi(self, test_type = 1):
    """Return the ROI rectangle of a predefined test case.

    Args:
        test_type: test case id. 1..5 select squares of growing size
            (20, 40, 80, 200, 240 px) centred on pixel (320, 240).
            Any other value selects the default rectangle built from
            ``self.frame_size``.

    Returns:
        A new list ``[xlu, ylu, xrb, yrb]`` (left-upper and right-bottom corners).
    """
    # NOTE(review): the default assumes frame_size is (width, height) - confirm,
    # since init_image stores img.shape[:2] into frame_size.
    roi = [0,0,self.frame_size[0],self.frame_size[1]]

    preset_rois = {
        1: [310,230,330,250], # xlu, ylu, xrb, yrb
        2: [300,220,340,260],
        3: [280,200,360,280],
        4: [220,140,420,340],
        # this entry used to sit behind a second `test_type == 4` branch
        # and could never be reached
        5: [200,120,440,360],
    }
    return preset_rois.get(test_type, roi)
def show_image_with_axis(self, img, poses = None):
    """Draw one coordinate-axis triad per pose on the image and display it.

    Args:
        img: image to draw on; a 2D input is color-mapped (JET) first.
        poses: sequence of pose vectors. Elements 0..2 are used as the
            translation and elements 3..5 as 'xyz' Euler angles in degrees;
            any further elements are ignored. ``None`` means no poses.

    Blocks in ``cv.waitKey()`` until a key is pressed. Returns nothing.
    """
    # None instead of a mutable default list shared between calls
    if poses is None:
        poses = []
    if len(poses) < 1:
        # the image is still shown, only without axes
        log.error('No poses found')

    # deal with black and white
    img_show = np.uint8(img)
    if len(img.shape) < 3:
        img_show = cv.applyColorMap(img_show, cv.COLORMAP_JET)

    for pose in poses:
        euler_angles = pose[3:6] # orientation in degrees
        rvec = Rot.from_euler('xyz', euler_angles, degrees=True).as_rotvec()
        tvec = np.array(pose[:3]) # center of the patch
        img_show = draw_axis(img_show, rvec, tvec, self.cam_matrix, self.cam_distort, len = 10)

    cv.imshow('Image & Axis', img_show)
    log.info('show done')
    cv.waitKey()
def show_image_with_rois(self, img, roi_params_ret = None):
    """Project the fitted ROIs on the image as axis triads and display it.

    Args:
        img: image to draw on; a 2D input is color-mapped (JET) first.
        roi_params_ret: sequence of ROI parameter records, each one accepted
            by ``self.convert_roi_params_to_pose``. ``None`` means no ROIs.

    Blocks in ``cv.waitKey()`` until a key is pressed. Returns nothing.
    """
    # None instead of a mutable default list shared between calls
    if roi_params_ret is None:
        roi_params_ret = []
    if len(roi_params_ret) < 1:
        # reported through the logger (as show_image_with_axis does) instead of print;
        # the image is still shown, only without axes
        log.error('No poses found')

    # deal with black and white
    img_show = np.uint8(img)
    if len(img.shape) < 3:
        img_show = cv.applyColorMap(img_show, cv.COLORMAP_JET)

    for roi_p in roi_params_ret:
        pose = self.convert_roi_params_to_pose(roi_p)

        avec = pose[3:6] # orientation in degrees
        levl = pose[6]   # level, used as the drawn axis length
        R = Rot.from_euler('zyx', avec, degrees = True).as_matrix()
        rvec, _ = cv.Rodrigues(R)
        # center of the patch, shaped like rvec
        tvec = np.array(pose[:3], dtype = np.float32).reshape(rvec.shape)
        img_show = draw_axis(img_show, rvec, tvec, self.cam_matrix, self.cam_distort, len = levl)

    cv.imshow('Image & Axis', img_show)
    log.info('show done')
    cv.waitKey()
def show_3d_point_cloud(self):
    """Show the ROI point cloud as a 3D scatter plot, refreshed on every call.

    Does nothing until both ``self.rect`` and ``self.rect_xyz`` are set.
    The first call builds the figure and caches its handles in
    ``self.show_dict``; later calls only move the scatter points.
    """
    if self.rect is None or self.rect_xyz is None:
        return
    X = self.rect_xyz[:,:3] # first three columns are used as x, y, z

    if 'line' not in self.show_dict:

        def padded_limits(values, lo_pad, hi_pad):
            # Widen [min, max] outwards by a fraction of each bound's magnitude.
            # Equals (min*(1-lo_pad), max*(1+hi_pad)) for positive data, but
            # also widens (instead of clipping) when a bound is negative.
            vmin, vmax = values.min(), values.max()
            return vmin - lo_pad*abs(vmin), vmax + hi_pad*abs(vmax)

        fig_num = int(self.rect[0]+self.rect[1])
        fig = plt.figure(fig_num)
        plt.clf()
        # the figure was just cleared, so add_subplot creates the only axes;
        # this replaces `fig.gca(projection='3d')` guarded by a bare `except:`
        ax = fig.add_subplot(projection = '3d')
        fig.tight_layout()
        fig.suptitle(f'ROI : {self.rect[0]},{self.rect[1]}')
        ax.set_proj_type('ortho')
        lineGray = ax.scatter(X[:,0], X[:,1], X[:,2])

        # fully transparent panes
        ax.xaxis.set_pane_color((0.0, 0.0, 0.0, 0.0))
        ax.yaxis.set_pane_color((0.0, 0.0, 0.0, 0.0))
        ax.zaxis.set_pane_color((0.0, 0.0, 0.0, 0.0))

        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.set_zlabel('z')
        # NOTE(review): set_ylim below passes increasing limits, which presumably
        # cancels this inversion - confirm the intended orientation.
        ax.invert_yaxis()
        ax.view_init(elev=-70, azim=-90)

        ax.set_xlim(*padded_limits(X[:,0], 0.2, 0.2))
        ax.set_ylim(*padded_limits(X[:,1], 0.2, 0.2))
        ax.set_zlim(*padded_limits(X[:,2], 0.1, 0.2))
        plt.ion()
        plt.show(block = False)
        self.show_dict = {'fig':fig, 'ax':ax, 'line':lineGray}
    else:
        # move the existing scatter points instead of re-plotting
        self.show_dict['line']._offsets3d = (X[:,0], X[:,1], X[:,2])

    self.show_dict['fig'].canvas.draw_idle()
    self.show_dict['fig'].canvas.flush_events()
    return
def show_mask(self, img):
    """Paint the pixels selected by ``self.img_mask`` with ``self.color_mask``.

    Args:
        img: image to draw on; a 2D input is color-mapped (JET) first.

    Returns:
        The uint8 image with every pixel whose mask value exceeds 0.75
        recolored. The converted image is returned unpainted when no mask
        is available or when mask and image sizes differ.
    """
    # deal with black and white
    img_show = np.uint8(img)
    if len(img.shape) < 3:
        img_show = cv.applyColorMap(img_show, cv.COLORMAP_JET)

    # no mask yet: nothing to paint (same guard style as the other show_* methods)
    if self.img_mask is None:
        return img_show

    if self.img_mask.shape[:2] != img_show.shape[:2]:
        log.error('mask and image size are not equal')
        return img_show

    img_show[self.img_mask > 0.75] = self.color_mask
    return img_show
def show_scene(self, vis):
    "overlay mask, projected ROI with axis, and text on the image"
    # Stages run in this order, each one receiving the previous stage's image.
    # Disabled alternatives: show_rect_and_text, show_axis,
    # show_image_colored_by_normals.
    overlay_stages = (
        self.show_mask,
        self.show_rect_and_axis_projected,
        self.show_text,
    )
    for draw_stage in overlay_stages:
        vis = draw_stage(vis)
    return vis
def test_fit_plane_using_gradients(self):
    "fits the ROI normal from image gradients (cross product) and shows the result"
    size_wh = (640,480)
    generator = DataGen(img_size=size_wh)
    synthetic_img = generator.init_image(1)
    detector = PlaneDetectorDisplay(image_size=size_wh)
    roi_rect = detector.init_roi(13)
    # both fit results are unused here; the pose is read back from the detector
    _, _ = detector.fit_plane_using_gradients(synthetic_img, roi_rect)
    pose = detector.convert_plane_params_to_pose()

    # visual inspection: normals coloring, axis overlay, 3D points with normal
    detector.show_image_colored_by_normals(synthetic_img)
    detector.show_image_with_axis(synthetic_img, pose)
    detector.show_points_3d_with_normal(detector.rect_xyz, pose)

    third_component = pose[0][2]
    self.assertTrue(third_component > 0.01)
def test_plane_fit(self):
    "plane fit with the 'P' detector on recorded source 422, shown frame by frame"
    d = DataSourceMovie()
    srcid = 422 # 421,422, 423-ok
    ret = d.init_video(srcid)
    try:
        p = PlaneDetectorDisplay('P')
        p.init_roi(21) # center image (per the original note); return value unused
        while ret:
            ret,img = d.get_data()
            if not ret:
                break
            p.find_planes(img)
            vis = p.show_image_data(img, display_mode=3)
            vis = p.show_scene(vis)
            # stop when show_data reports that the user pressed 'q'
            ret = p.show_data(vis) & ret
    finally:
        # RunTest calls the test methods directly, so unittest cleanups never
        # run; finish the data source here even if a frame raises
        d.finish()
    self.assertFalse(ret)
def test_plane_fit_weighted_4d(self):
    "plane fit with the 'T' detector on recorded source 422, shown frame by frame"
    d = DataSourceMovie()
    srcid = 422 # 421,422, 423-ok
    ret = d.init_video(srcid)
    try:
        p = PlaneDetectorDisplay('T')
        p.init_roi(53) # center image (per the original note); return value unused
        while ret:
            ret,img = d.get_data()
            if not ret:
                break
            p.find_planes(img)
            vis = p.show_image_data(img, display_mode=3)
            vis = p.show_scene(vis)
            # stop when show_data reports that the user pressed 'q'
            ret = p.show_data(vis) & ret
    finally:
        # RunTest calls the test methods directly, so unittest cleanups never
        # run; finish the data source here even if a frame raises
        d.finish()
    self.assertFalse(ret)
def test_grid_plane_fit(self):
    "plane fit data multiple ROIs in grid"
    d = DataSourceMovie()
    srcid = 422 # 421,422, 423-ok
    ret = d.init_video(srcid)
    try:
        # generate grid of trackers: nx*ny ROIs centred on the inner grid nodes
        # of a 1280x720 frame, each spanning 0.4 of the node spacing to each side
        nx, ny = 10, 10
        w,h = 1280, 720
        dx, dy = int(w/(nx+1)), int(h/(ny+1))
        wx,wy = int(dx*0.4), int(dy*0.4)
        pm = []
        for ix in range(nx):
            for iy in range(ny):
                x0, y0 = (ix+1)*dx, (iy+1)*dy
                p = PlaneDetectorDisplay('P')
                p.rect = (x0-wx,y0-wy,x0+wx,y0+wy)
                pm.append(p)

        while ret:
            ret,img = d.get_data()
            if not ret:
                break
            # all detectors draw on the same visualization image
            vis = pm[0].show_image_data(img, display_mode=3)
            for p in pm:
                p.find_planes(img)
                vis = p.show_scene(vis)

            # the window is shown through the last detector, as before
            ret = pm[-1].show_data(vis)
    finally:
        # RunTest calls the test methods directly, so unittest cleanups never
        # run; finish the data source here even if a frame raises
        d.finish()
    self.assertFalse(ret)
def test_multi_plane_fit_with_project_on_image(self):
    "plane fit data multiple ROIs with image coloring per ROI mask"
    d = DataSourceMovie()
    srcid = 422 # 421,422, 423-ok
    ret = d.init_video(srcid)
    try:
        roi_types = [21,31,52,33]
        pm = []
        for rt in roi_types:
            p = PlaneDetectorDisplay('S')
            p.init_roi(rt) # return value unused
            pm.append(p)

        while ret:
            ret,img = d.get_data()
            if not ret:
                break
            # all detectors paint their mask on the same visualization image
            vis = pm[0].show_image_data(img, display_mode=3)
            for p in pm:
                p.find_planes(img)
                vis = p.show_mask(vis)

            # the window is shown through the last detector, as before
            ret = pm[-1].show_data(vis)
    finally:
        # RunTest calls the test methods directly, so unittest cleanups never
        # run; finish the data source here even if a frame raises
        d.finish()
    self.assertFalse(ret)
#suite.test_image_show() # ok + #suite.test_init_img3d() # ok + #suite.test_compute_img3d() # ok + #suite.test_show_img3d() # ok + #suite.test_convert_roi_to_points() # ok + + + #suite.test_fit_plane_svd() # ok + #suite.test_fit_plane_depth_image() # + #suite.test_fit_plane_with_outliers() + #suite.test_fit_plane_ransac() + #suite.test_fit_plane_using_gradients() # ok + #suite.test_split_roi() + + #suite.test_plane_fit() # ok + #suite.test_plane_fit_weighted() # ok + #suite.test_plane_fit_outliers() # ok + #suite.test_plane_fit_with_show_3d() # ok + + #suite.test_plane_fit_ransac() # ok + #suite.test_plane_fit_weighted_4d() # ok + #suite.test_multi_plane_fit() # ok + #suite.test_plane_fit_using_gradients() # ok + #suite.test_plane_fit_colored_by_normals() # ok + #suite.test_plane_fit_with_grow() # + #suite.test_grid_plane_fit() # ok + #suite.test_plane_fit_with_project_on_image() # ok + suite.test_multi_plane_fit_with_project_on_image() # ok + + + + # runner = unittest.TextTestRunner() + # runner.run(suite) + +# ---------------------- +#%% App +class PlaneApp: + def __init__(self): + self.cap = RealSense() # + self.cap.set_display_mode('d16') + #self.cap.set_exposure(1000) + self.frame = None + self.rect = None + self.paused = False + self.trackers = [] + + self.camera_bf = self.cap.get_bf() # for depth conversion (should be in meter) + self.args = process_arguments() + self.args.scale = 0.5 + + # init + self.model = foundation_stereo_algo_init(self.args) + + self.show_dict = {} # hist show + + self.detect_type = 'G' + self.show_type = 'depth' # left, depth + self.win_name = 'Plane Detector (q-quit, c-clear, a,r,p,o,g,f,t,s,n,w 0-depth,1-left,2-right)' + + cv.namedWindow(self.win_name ) + self.rect_sel = RectSelector(self.win_name , self.on_rect) + self.run() + + def on_rect(self, rect): + "remember ROI defined by user" + #self.define_roi(self.frame, rect) + tracker = PlaneDetectorDisplay() #estimator_type=self.estim_type, estimator_id=estim_ind) + tracker.rect = 
rect + tracker.detect_type = self.detect_type + self.trackers.append(tracker) + log.info(f'Adding plane estimator at : {rect}') + + def generate_grid_trackers(self): + "generate grid of trackers" + nx, ny = 10, 10 + w,h = self.cap.frame_size + dx, dy = int(w/nx), int(h/ny) + for ix in range(nx): + for iy in range(ny): + x0, y0 = ix*dx, iy*dy + x1, y1 = min((ix+1)*dx, w-1), min((iy+1)*dy, h-1) + rect = (x0,y0,x1,y1) + self.on_rect(rect) + + def process_image(self, img_depth): + "makes measurements" + for tracker in self.trackers: + tracker.find_planes(img_depth) + + def show_scene(self, frame, img_depth_fs): + "draw ROI and Info" + if self.show_type == 'left': + vis = frame[:,:,0].astype(np.uint8) + elif self.show_type == 'right': + vis = frame[:,:,1].astype(np.uint8) + elif self.show_type == 'rs': + vis = cv.convertScaleAbs(frame[:,:,2], alpha=0.1).astype(np.uint8) + else: + vis = cv.convertScaleAbs(img_depth_fs, alpha=0.1).astype(np.uint8) + + vis = cv.cvtColor(vis, cv.COLOR_GRAY2BGR) + self.rect_sel.draw(vis) + + for tracker in self.trackers: + vis = tracker.show_scene(vis) + + return vis + + def show_histogram(self, img): + "show roi histgram" + if self.rect is None: + #print('define ROI') + return 0 + + x0, y0, x1, y1 = self.rect + img_roi = img[y0:y1,x0:x1].astype(np.float32) + # Compute histogram + hist, bins = np.histogram(img_roi.flatten(), bins=1024, range=[0, 2**15]) + + if not 'fig' in self.show_dict : #len(self.show_dict) < 1: + fig, ax = plt.subplots() + fig.set_size_inches([24, 16]) + ax.set_title('Histogram (Depth)') + ax.set_xlabel('Bin') + ax.set_ylabel('Frequency') + lineGray, = ax.plot(bins[:-1], hist, c='k', lw=3) + ax.set_xlim(bins[0], bins[-1]) + ax.set_ylim(0, max(hist)+10) + plt.ion() + #plt.show() + + self.show_dict = {'fig':fig, 'ax':ax, 'line':lineGray} + else: + self.show_dict['line'].set_ydata(hist) + + self.show_dict['fig'].canvas.draw() + return + + def run(self): + while True: + playing = not self.paused and not 
self.rect_sel.dragging + if playing or self.frame is None: + ret, frame = self.cap.read() + if not ret: + break + self.frame = frame.copy() + + # depth + img_left, img_right, img_depth_rs = self.frame[:,:,0], self.frame[:,:,1], self.frame[:,:,2] + + img_disparity = foundation_stereo_algo(self.args, self.model, img_left, img_right) + img_depth_fs = convert_disparity_to_depth(self.camera_bf, img_disparity) + + + #self.statistics(frame) + self.process_image(img_depth_fs) + + vis = self.show_scene(frame, img_depth_fs) + cv.imshow(self.win_name , vis) + ch = cv.waitKey(1) + if ch == ord(' '): + self.paused = not self.paused + elif ch == ord('a'): + self.detect_type = 'A' + log.info(f'Detect type : {self.detect_type}') + elif ch == ord('r'): + self.detect_type = 'R' + log.info(f'Detect Ransac : {self.detect_type}') + elif ch == ord('p'): + self.detect_type = 'P' + log.info(f'Detect svd : {self.detect_type}') + elif ch == ord('o'): + self.detect_type = 'O' + log.info(f'Detect with outliers : {self.detect_type}') + elif ch == ord('g'): + self.detect_type = 'G' + log.info(f'Detect gradients : {self.detect_type}') + elif ch == ord('f'): + self.detect_type = 'F' + log.info(f'Detect fit and grow : {self.detect_type}') + elif ch == ord('w'): + self.detect_type = 'W' + log.info(f'Detect weighted svd : {self.detect_type}') + elif ch == ord('s'): + self.detect_type = 'S' + log.info(f'Detect entire image : {self.detect_type}') + elif ch == ord('n'): + self.detect_type = 'N' + log.info(f'Detect entire image using normals : {self.detect_type}') + elif ch == ord('t'): + self.detect_type = 'T' + log.info(f'Detect weighted svd 4d : {self.detect_type}') + elif ch == ord('0'): + self.show_type = 'rs' + log.info(f'Show type : {self.show_type}') + elif ch == ord('1'): + self.show_type = 'left' + log.info(f'Show type : {self.show_type}') + elif ch == ord('2'): + self.show_type = 'right' + log.info(f'Show type : {self.show_type}') + elif ch == ord('3'): + self.show_type = 'fs' + 
log.info(f'Show type : {self.show_type}') + elif ch == ord('m'): + self.generate_grid_trackers() + elif ch == ord('c'): + if len(self.trackers) > 0: + t = self.trackers.pop() + elif ch == 27 or ch == ord('q'): + break + + +if __name__ == '__main__': + #print(__doc__) + + #RunTest() + PlaneApp() + + + diff --git a/scripts/synthetic_data_manager.py b/scripts/synthetic_data_manager.py new file mode 100644 index 0000000..5f274c2 --- /dev/null +++ b/scripts/synthetic_data_manager.py @@ -0,0 +1,366 @@ +''' + +Dataset management for Synthetic stereo dataset. + +Loads RealSense IR stereo pairs (left and right images) +and the depth map, and computes the ground truth from the chessboard pattern in the left image. +The PNG files have 3 channels: the first channel is left, the second is right, the third is depth. + +The left channel contains images of the chessboard pattern captured by the left RealSense camera, +which is used for computing the synthetic depth ground truth. +The right channel contains the corresponding images from the right RealSense camera. +The depth channel contains the depth maps obtained from the RealSense sensor, +which can be used for comparison against the synthetic depth computed from the chessboard pattern. + +Expected directory layout (one or more session folders under root): + / + / + 405/ + / + image_d16_.png # left, right and depth image (uint16) + image_d16_.png # left, right and depth image (uint16) + / + image_d16_.png # left, right and depth image (uint16) + + +Only samples that have BOTH a realsense pair AND a matching zivid depth are +included. Sessions that lack a zivid subfolder (e.g. freedrive-only captures) +are silently skipped. 
+ +Output dict keys (same as faro_data_manager for compatibility): + left : numpy array (H, W) uint8/uint16 IR + right : numpy array (H, W) uint8/uint16 IR + depth_syn : numpy array (H, W) float32, mm ← Synthetic depth from chessboard pattern (empty / zeros if absent) + depth_rs : numpy array (H, W) float32, mm (empty / zeros if absent) + +''' + +import numpy as np +import cv2 +import matplotlib.pyplot as plt +import os +import glob +import unittest +import logging as log +import yaml + +from object_chessboard import ObjectChessboard + +# -------------------------------- +# 405 / 1280x720 +CAMERA_MATRIX_RS = np.array([ + [644.471, 0, 649.253], + [0, 644.471, 365.398], + [0, 0, 1] +]) +DIST_COEFFS_RS = np.array([ 0.0, -0.0, -0.0, 0.0, -0.0]) + + + +# -------------------------------- +#%% Data source +class DataSource: + + def __init__(self): + self.gray_scale_input = False + self.depth_estimator = ObjectChessboard() # for synthetic GT depth estimation from chessboard pattern + self.imgs = [] # list of dicts: {packed_png} + log.info('Source is defined') + + def init_directory(self, input_rectified='', gray_scale_input=False, sub_indexes=None): + """Scan root for packed synthetic PNG files and populate self.imgs. + + Expected layout (one or more sessions): + //405//image_d16_.png + """ + if len(input_rectified) < 3: + input_rectified = r'C:\Work\Data\DepthRS\ffs' + + self.gray_scale_input = gray_scale_input + self.imgs = [] + + if not os.path.isdir(input_rectified): + log.error(f"Directory not found: {input_rectified}") + return 0 + + patterns = [ + os.path.join(input_rectified, '**', '405', '*', 'image_d16_*.png'), # recursive, supports /405/... and //405/... 
# legacy/session path + ] + + packed_paths = [] + for pattern in patterns: + packed_paths.extend(glob.glob(pattern, recursive=True)) + packed_paths = sorted(set(packed_paths)) + + for packed_path in packed_paths: + self.imgs.append({'packed_png': packed_path}) + + if sub_indexes is not None: + self.imgs = [self.imgs[i] for i in sub_indexes] + + log.info(f"DataSource: found {len(self.imgs)} samples in {input_rectified}") + return len(self.imgs) + + def get_item(self, index: int, debug: bool = False): + """Return one sample as a dict with left, right, depth_syn, depth_rs.""" + output_str = { + "left": [], + "right": [], + "depth_syn": [], + "depth_rs": [], + "rgb": np.array([], dtype=np.uint8) + } + + entry = self.imgs[index] + + packed_img = cv2.imread(entry['packed_png'], cv2.IMREAD_UNCHANGED) + if packed_img is None: + log.warning(f"Failed to load sample {index}: {entry['packed_png']}") + return output_str + + if packed_img.ndim != 3 or packed_img.shape[2] < 3: + log.warning(f"Invalid packed PNG format (expected 3 channels): {entry['packed_png']}") + return output_str + + left_img = packed_img[:, :, 0] + right_img = packed_img[:, :, 1] + depth_rs = packed_img[:, :, 2].astype(np.float32) + + # Synthetic GT can be computed from chessboard if available; + depth_syn = self.get_synthetic_depth(left_img) + + output_str["left"] = left_img + output_str["right"] = right_img + output_str["depth_syn"] = depth_syn + output_str["depth_rs"] = depth_rs + + if debug: + img_list = [left_img, right_img, depth_rs, depth_syn] + ttl_list = ['left', 'right', 'depth RS (mm)', 'depth SYN (mm)'] + self.show_subset(img_list, ttl_list) + + return output_str + + def get_synthetic_depth(self, left_img): + """Compute synthetic depth from chessboard pattern in the left image.""" + result = self.depth_estimator.estimate_camera_pose(left_img, camera_matrix = CAMERA_MATRIX_RS, dist_coeffs = DIST_COEFFS_RS) + if result["success"]: + XYZ, projected_points = 
self.depth_estimator.get_grid_in_camera_coordinates( + rvec=result['rvec'], + tvec=result['tvec'], + camera_matrix=CAMERA_MATRIX_RS, + dist_coeffs=DIST_COEFFS_RS + ) + depth_syn = self.project_3d_to_camera(XYZ, CAMERA_MATRIX_RS, DIST_COEFFS_RS, frame_size = left_img.shape) # Project back to image space to get depth map + return depth_syn + else: + log.warning("Failed to estimate camera pose for synthetic depth computation.") + return np.zeros_like(left_img, dtype=np.float32) + + def get_item_projected(self, index: int, debug: bool = False): + """Compatibility wrapper for synthetic data; returns the same as get_item.""" + return self.get_item(index=index, debug=debug) + + def compute_depth_error(self, depth_pred, depth_gt, depth_mask=None): + """Compute absolute depth error between prediction and GT.""" + depth_pred = depth_pred.astype(np.float32) + depth_gt = depth_gt.astype(np.float32) + depth_error = np.zeros_like(depth_pred) + mask = np.ones_like(depth_pred, dtype=bool) if depth_mask is None else depth_mask + valid = np.logical_and(depth_gt > 0, mask) + valid = np.logical_and(depth_pred > 0, valid) + depth_error[valid] = (depth_pred[valid] - depth_gt[valid]) + return depth_error + + def show_subset(self, img_list, ttl_list, vmin=None, vmax=None, save_path='', fig_name=''): + """Display a list of images in a grid.""" + img_num = len(img_list) + col_num = min(img_num, 3) + row_num = (img_num + col_num - 1) // col_num + fig, axes = plt.subplots(row_num, col_num, sharey=True, sharex=True) + axes = np.array(axes).reshape(row_num, col_num) + for k in range(img_num): + ri, ci = k // col_num, k % col_num + axes[ri, ci].imshow(img_list[k], vmin=vmin, vmax=vmax) + axes[ri, ci].set_title(ttl_list[k]) + for k in range(img_num, row_num * col_num): + axes[k // col_num, k % col_num].axis('off') + if save_path and os.path.exists(save_path): + fig.savefig(os.path.join(save_path, fig_name + ".png")) + plt.show(block=False) + + def save_data_to_folder(self, output_str, 
output_directory): + """Save sample dict to PNG files on disk.""" + os.makedirs(output_directory, exist_ok=True) + + paths = { + "img_left.png": output_str["left"], + "img_right.png": output_str["right"], + "img_depth_syn.png": output_str["depth_syn"].astype(np.uint16), + "img_depth_rs.png": output_str["depth_rs"].astype(np.uint16), + } + success = True + for fname, img in paths.items(): + out = cv2.imwrite(os.path.join(output_directory, fname), img, + [cv2.IMWRITE_PNG_COMPRESSION, 0]) + success = success and out + + if output_str["rgb"] is not None and np.asarray(output_str["rgb"]).size > 0: + cv2.imwrite(os.path.join(output_directory, "img_rgb.png"), + output_str["rgb"], [cv2.IMWRITE_PNG_COMPRESSION, 0]) + + return success + + + def save_to_ply(self, points: np.ndarray, filename: str): + """Save a point cloud to a PLY file for visualization.""" + with open(filename, 'w') as f: + f.write('ply\n') + f.write('format ascii 1.0\n') + f.write(f'element vertex {len(points)}\n') + f.write('property float x\n') + f.write('property float y\n') + f.write('property float z\n') + f.write('end_header\n') + for x, y, z in points: + f.write(f'{x} {y} {z}\n') + + def project_camera_to_3d(self, depth_img_mm: np.ndarray, cam_matrix: np.ndarray, dist_coeffs: np.ndarray) -> np.ndarray: + """Project 2D pixel coordinates with depth to 3D points in camera space.""" + h, w = depth_img_mm.shape + xs, ys = np.meshgrid(np.arange(w, dtype=np.float32), np.arange(h, dtype=np.float32), indexing='xy') + + # OpenCV expects Nx1x2 contiguous float32/float64 image points in (x, y) order. 
+ distorted_points = np.stack([xs, ys], axis=-1).reshape(-1, 1, 2).astype(np.float32) + undistorted_points = cv2.undistortPoints(distorted_points, cam_matrix.astype(np.float32), dist_coeffs.astype(np.float32) ) + + uv = undistorted_points.reshape(-1, 2) + Z = depth_img_mm.reshape(-1).astype(np.float32) + valid = np.isfinite(Z) & (Z > 0) + if not np.any(valid): + return np.zeros((0, 3), dtype=np.float32) + + uv = uv[valid] + Z = Z[valid] + X = uv[:, 0] * Z + Y = uv[:, 1] * Z + + # save to ply point cloud for visualization + XYZ = np.stack([X, Y, Z], axis=1).astype(np.float32) + + return XYZ + + def project_3d_to_camera(self, points_3d: np.ndarray, cam_matrix: np.ndarray, dist_coeffs: np.ndarray, frame_size = (480,640)) -> np.ndarray: + """Project 3D points in camera space back to 2D pixel coordinates.""" + if points_3d.shape[1] != 3: + raise ValueError("Input points_3d must have shape (N, 3)") + + projected_pts, _ = cv2.projectPoints( + points_3d.reshape(-1, 1, 3), + np.zeros(3, dtype=np.float32), + np.zeros(3, dtype=np.float32), + cam_matrix.astype(np.float32), + dist_coeffs.astype(np.float32), + ) + + uv_rs = projected_pts.reshape(-1, 2) + u_idx = np.rint(uv_rs[:, 0]).astype(np.int32) + v_idx = np.rint(uv_rs[:, 1]).astype(np.int32) + + h_rs, w_rs = frame_size + in_bounds = (u_idx >= 0) & (u_idx < w_rs) & (v_idx >= 0) & (v_idx < h_rs) + if not np.any(in_bounds): + return np.zeros((h_rs, w_rs), dtype=np.float32) + + u_idx = u_idx[in_bounds] + v_idx = v_idx[in_bounds] + z_vals = points_3d[in_bounds, 2] # Z values of the valid points + + # Rasterize by nearest pixel; if multiple points hit a pixel, keep the closest depth. 
+ lin = v_idx * w_rs + u_idx + depth_buffer = np.full(h_rs * w_rs, np.inf, dtype=np.float32) + np.minimum.at(depth_buffer, lin, z_vals) + depth_projected = depth_buffer.reshape(h_rs, w_rs) + depth_projected[~np.isfinite(depth_projected)] = 0.0 + return depth_projected + + # project from zivid depth patrix to point cloud and back to depth matrix with rs intrinsics and distortion to get "zivid GT as seen by RealSense" for pixel-level comparison + def project_depth_zivid_to_rs(self,depth_zivid_mm: np.ndarray, depth_rs_mm: np.ndarray, finx = 0) -> np.ndarray: + # create 3D point cloud from zivid depth + XYZ = self.project_camera_to_3d(depth_zivid_mm, CAMERA_MATRIX_RS, DIST_COEFFS_RS) # (N, 3) array of 3D points in Zivid camera space + # save to ply point cloud for visualization + #save_to_ply(XYZ/1000, f'zivid_original_points_{finx:03d}.ply') # save in meters for visualization + + # project back on imaage RS + depth_zivid_projected_mm = self.project_3d_to_camera(XYZ, CAMERA_MATRIX_RS, DIST_COEFFS_RS, frame_size = depth_rs_mm.shape) # (H, W) depth map of Zivid points projected into RealSense pixel space + + XYZ_RS = self.project_camera_to_3d(depth_zivid_projected_mm, CAMERA_MATRIX_RS, DIST_COEFFS_RS) + # save to ply point cloud for visualization + #save_to_ply(XYZ_RS/1000, f'zivid_projected_points_{finx:03d}.ply') # save in meters for visualization + + return depth_zivid_projected_mm + + def show_projection(self, rs_map, zv_map, zv_valid, idx): + fig, axes = plt.subplots(1, 3, sharey=True, sharex=True, figsize=(8,4)) + axes[0].imshow(rs_map, vmin=-10, vmax=1000),axes[0].set_title(f"RealSense Depth Diff (mm)"), + axes[1].imshow(zv_map, vmin=-10, vmax=1000),axes[1].set_title(f"Zivid Projected Depth Diff (mm)"), + axes[2].imshow(zv_valid, cmap='gray'),axes[2].set_title(f"Valid Mask (Zivid Projection)"), + plt.suptitle(f"Sample {idx:03d} Depth Difference Maps and Valid Mask", fontsize=16) + plt.tight_layout() + plt.show() + + +# -------------------------------- +#%% Tests 
+class TestDataSource(unittest.TestCase): + + def test_init_directory(self): + p = DataSource() + img_num = p.init_directory() + self.assertTrue(img_num > 0) + + def test_get_item(self): + p = DataSource() + img_num = p.init_directory() + self.assertTrue(img_num > 0) + out = p.get_item(0, debug=True) + self.assertTrue(len(out["left"]) > 0) + + def test_show_images(self): + p = DataSource() + img_num = p.init_directory() + if img_num == 0: + log.warning("No images found.") + return + for k in np.random.randint(0, img_num, size=min(4, img_num)): + out = p.get_item(int(k), debug=True) + self.assertTrue(len(out["left"]) > 0) + p.show_subset([out["left"], out["right"], out["depth_rs"]], + ['left (RS)', 'right (RS)', 'depth RS (mm)']) + + plt.show() + + def test_get_item_projected(self): + p = DataSource() + img_num = p.init_directory() + self.assertTrue(img_num > 0) + out = p.get_item_projected(80, debug=False) + err = p.compute_depth_error(out["depth_rs"], out["depth_syn"]) + self.assertTrue(len(out["left"]) > 0) + p.show_subset([out["left"], out["right"], out["depth_rs"], out["depth_syn"], err], + ['left (RS)', 'right (RS)', 'depth RS (mm)', 'depth SYN (mm)', 'error (mm)'], vmax=None) + plt.show() + + +# -------------------------------- +#%% Run Test +def RunTest(): + tst = TestDataSource() + #tst.test_init_directory() + #tst.test_get_item() + #tst.test_show_images() + tst.test_get_item_projected() + + +if __name__ == '__main__': + RunTest() diff --git a/uris_model_chat.txt b/uris_model_chat.txt new file mode 100644 index 0000000..2806c62 --- /dev/null +++ b/uris_model_chat.txt @@ -0,0 +1,1059 @@ + ▐▛███▜▌ Claude Code v2.1.126 +▝▜█████▛▘ Sonnet 4.6 · Claude Team + ▘▘ ▝▝ ~/repos/Fast-FoundationStereo + +❯ in the environment tao_stereo install onnx and trt compilation tool and run + example of the network conversion + +● Let me first explore the project structure to understand the context before + proceeding. 
+ +● Bash(ls /home/adiroha/repos/Fast-FoundationStereo/ && conda env list + 2>/dev/null || true) + ⎿  assets + core + data + … +26 lines (ctrl+o to expand) + + Read 1 file (ctrl+o to expand) + +● Bash(find /home/adiroha/repos/Fast-FoundationStereo -name "*.py" | xargs grep + -l -i "onnx\|tensorrt\|trt" 2>/dev/null | head -20) + ⎿  /home/adiroha/repos/Fast-FoundationStereo/core/foundation_stereo.py + /home/adiroha/repos/Fast-FoundationStereo/scripts/make_onnx.py + /home/adiroha/repos/Fast-FoundationStereo/scripts/run_demo_tensorrt.py + +● Bash(conda run -n tao_stereo pip list 2>/dev/null | grep -i + "onnx\|tensorrt\|torch\|trt" | head -20) + ⎿  open_clip_torch 3.3.0 + pytorch-lightning 2.6.1 + torch 2.10.0+cu126 + … +3 lines (ctrl+o to expand) + + Read 2 files (ctrl+o to expand) + +● Let me check CUDA version, requirements, and what's needed for TRT conversion. + + Read 1 file (ctrl+o to expand) + +● Bash(nvcc --version 2>/dev/null; nvidia-smi 2>/dev/null | head -10) + ⎿  nvcc: NVIDIA (R) Cuda compiler driver + Copyright (c) 2005-2023 NVIDIA Corporation + Built on Fri_Jan__6_16:45:21_PST_2023 + … +16 lines (ctrl+o to expand) + +● Bash(which trtexec 2>/dev/null; dpkg -l | grep -i tensorrt 2>/dev/null | head + -10; ls /usr/lib/python3/dist-packages/ 2>/dev/null | grep -i trt) + ⎿  No matches found + + Searched for 2 patterns, read 1 file (ctrl+o to expand) + +● Now I have the full picture. Let me check the weights and then install the + required packages. + + Listed 3 directories (ctrl+o to expand) + +● Now let me install the ONNX and TensorRT packages in the tao_stereo + environment. + +● Bash(conda run -n tao_stereo pip install onnx onnxruntime-gpu tensorrt-cu12 + tensorrt-lean-cu12 tensorrt-dispatch-cu12 2>&1 | tail -20) + ⎿  Running in the background (↓ to manage) + +● Installation running in background. Let me set up the output directory and + review the ONNX script arguments. 
+ +● Bash(mkdir -p /home/adiroha/repos/Fast-FoundationStereo/output/onnx_trt) + ⎿  Done + +● Background command "Install ONNX and TensorRT packages in tao_stereo env" completed (exit code 0) + + Read 1 file (ctrl+o to expand) + +● All packages installed. Now let me find trtexec and run the ONNX export. + +● Bash(find /home/adiroha/miniconda3/envs/tao_stereo -name "trtexec" + 2>/dev/null; conda run -n tao_stereo python -c "import tensorrt as trt; + print('TRT version:', trt.…) + ⎿  TRT version: 10.16.1.11 + Listed 1 directory (ctrl+o to expand) + +● trtexec isn't included in the pip package — I'll use the TensorRT Python API + for ONNX→engine conversion. First, let me run the ONNX export. + +● Bash(conda run -n tao_stereo python -c "import cv2; + img=cv2.imread('assets/left.png'); print('Image size:', img.shape if img + is not None else 'not found')") + ⎿  [ WARN:0@0.009] global loadsave.cpp:278 findDecoder imread_('assets/le + ft.png'): can't open/read file: check file path/integrity + + Image size: not found + + Listed 1 directory (ctrl+o to expand) + +● No stereo test images in assets — I'll use the default 448x640 dimensions. Now + let me run the ONNX export. + +● Bash(conda run -n tao_stereo python scripts/make_onnx.py \ + --model_dir weights/20-26-39/model_best_bp2_serialize.pth \…) + ⎿  Running in the background (↓ to manage) + +● While the ONNX export runs, let me write the TRT engine conversion script. 
+ +● Write(scripts/make_trt_engine.py) + ⎿  Wrote 53 lines to scripts/make_trt_engine.py + 1 """Convert ONNX models to TensorRT engines using the Python TRT API.""" + 2 import argparse + 3 import os + 4 import tensorrt as trt + 5 + 7 TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) + 8 + 9 def build_engine(onnx_path: str, engine_path: str, fp16: bool = + True, workspace_gb: int = 4): + 10 builder = trt.Builder(TRT_LOGGER) + … +51 lines (ctrl+o to expand) + +● Background command "Export model to ONNX format" failed with exit code 1 + + Read 1 file (ctrl+o to expand) + +● Bash(conda run -n tao_stereo pip install onnxscript 2>&1 | tail -5) + ⎿  Downloading onnx_ir-0.2.1-py3-none-any.whl (166 kB) + Installing collected packages: onnx_ir, onnxscript + + Successfully installed onnx_ir-0.2.1 onnxscript-0.7.0 + +● Bash(conda run -n tao_stereo python scripts/make_onnx.py \ + --model_dir weights/20-26-39/model_best_bp2_serialize.pth \…) + ⎿  Running in the background (↓ to manage) + +● ONNX export running again with onnxscript now installed. Let me wait for it. + + Read 1 file (ctrl+o to expand) + +● Waiting for the ONNX export to produce output... + + Listed 1 directory (ctrl+o to expand) + +● feature_runner.onnx is done (38MB data file). The post_runner.onnx export is + still in progress. Let me keep monitoring. + + Read 1 file, listed 1 directory (ctrl+o to expand) + +● Background command "Re-run ONNX export" failed with exit code 1 + +● The new PyTorch 2.10 ONNX exporter doesn't handle adaptive_max_pool2d. I need + to force the legacy TorchScript-based exporter with dynamo=False. 
+ + Read 1 file (ctrl+o to expand) + +● Update(scripts/make_onnx.py) + ⎿  Added 4 lines, removed 2 lines + 63 opset_version=17, + 64 input_names = ['left', 'right'], + 65 output_names = ['features_left_04', 'features_left_08', + 'features_left_16', 'features_left_32', 'features_right_04', ' + stem_2x'], + 66 - do_constant_folding=True + 66 + do_constant_folding=True, + 67 + dynamo=False, + 68 ) + 69 + 70 features_left_04, features_left_08, features_left_16, featu + res_left_32, features_right_04, stem_2x = feature_runner(left_i + mg, right_img) + ... + 78 opset_version=17, + 79 input_names = ['features_left_04', 'features_left_08', + 'features_left_16', 'features_left_32', 'features_right_04', 's + tem_2x', 'gwc_volume'], + 80 output_names = ['disp'], + 80 - do_constant_folding=True + 81 + do_constant_folding=True, + 82 + dynamo=False, + 83 ) + 84 + 85 with open(f'{args.save_path}/onnx.yaml', 'w') as f: + +● Bash(rm -f /home/adiroha/repos/Fast-FoundationStereo/output/onnx_trt/*.onnx + /home/adiroha/repos/Fast-FoundationStereo/output/onnx_trt/*.data && conda + run -n tao_ster…) + ⎿  Running in the background (↓ to manage) +● Bash(sleep 60 && cat /tmp/claude-1000/-home-adiroha-repos-Fast-FoundationStere + o/9e5eae82-b63e-4097-921a-bd15bd674371/tasks/bqvmt6zhn.output 2>/dev/null + | tail -20 &&…) + ⎿  Error: Blocked: sleep 60 followed by: cat + /tmp/claude-1000/-home-adiroha-repos-Fast-FoundationStereo/9e5eae82-b63e-40 + 97-921a-bd15bd674371/tasks/bqvmt6zhn.output tail -20 ls -lh + /home/adiroha/repos/Fast-FoundationStereo/output/onnx_trt/. To wait for a + condition, use Monitor with an until-loop (e.g. `until ; do sleep 2; + done`). To wait for a command you started, use run_in_background: true. Do + not chain shorter sleeps to work around this block. + +● Background command "Re-run ONNX export with legacy exporter" completed (exit code 0) + + Listed 1 directory (ctrl+o to expand) + +● Both ONNX files exported successfully (37MB + 18MB). 
Now converting to TRT + engines. + +● Bash(conda run -n tao_stereo python scripts/make_trt_engine.py --onnx_dir + output/onnx_trt 2>&1) + ⎿  Running in the background (↓ to manage) + +● TRT engine compilation can take several minutes. Let me monitor progress. + + Read 1 file (ctrl+o to expand) + +● TRT compilation in progress. I'll report back when the engines are ready — + this typically takes 3–10 minutes for the first compile on a new model. + + --- + What's been done so far: + + ┌──────────────────────────────────────────────────────────────┬──────────┐ + │ Step │ Status │ + ├──────────────────────────────────────────────────────────────┼──────────┤ + │ Install onnx, onnxruntime-gpu, tensorrt-cu12 10.16.1 │ ✅ │ + ├──────────────────────────────────────────────────────────────┼──────────┤ + │ Install onnxscript (missing dep for PyTorch 2.10 exporter) │ ✅ │ + ├──────────────────────────────────────────────────────────────┼──────────┤ + │ Fix make_onnx.py to use legacy TorchScript exporter │ ✅ │ + │ (dynamo=False) to avoid adaptive_max_pool2d dispatch error │ │ + ├──────────────────────────────────────────────────────────────┼──────────┤ + │ Export feature_runner.onnx (37 MB) │ ✅ │ + ├──────────────────────────────────────────────────────────────┼──────────┤ + │ Export post_runner.onnx (18 MB) │ ✅ │ + ├──────────────────────────────────────────────────────────────┼──────────┤ + │ Write scripts/make_trt_engine.py (Python TRT API builder) │ ✅ │ + ├──────────────────────────────────────────────────────────────┼──────────┤ + │ Build TRT engines (feature_runner.engine, │ ⏳ │ + │ post_runner.engine) │ running │ + └──────────────────────────────────────────────────────────────┴──────────┘ + +✻ Baked for 10m 48s + +✻ Claude resuming /loop wakeup (Apr 26 3:02pm) + + Read 1 file, listed 1 directory (ctrl+o to expand) + +● feature_runner.engine (21 MB) is compiled. post_runner.engine still in + progress — let me wait for it. + +● feature_runner.engine done (21 MB). 
post_runner.engine still compiling — I'll + report back when it finishes. + +✻ Claude resuming /loop wakeup (Apr 26 3:03pm) + + Listed 1 directory (ctrl+o to expand) + +● Both ONNX files exist and TRT conversion is already running (job byephtlsb). + feature_runner.engine is done (21 MB), post_runner.engine still compiling. + Output file is empty so far — TRT logging goes to stderr after the build + completes. + +● Still waiting on post_runner.engine. I'll update you when it's done. + +✻ Claude resuming /loop wakeup (Apr 26 3:05pm) + +✻ Claude resuming /loop wakeup (Apr 26 3:05pm) + + Listed 1 directory (ctrl+o to expand) + +● post_runner.engine still not written — compilation ongoing. Scheduling another + check. +● Still compiling — post_runner is the larger/more complex subgraph. Will check + again in ~4 minutes. + +✻ Claude resuming /loop wakeup (Apr 26 3:06pm) + + Listed 1 directory (ctrl+o to expand) + +● Bash(ps aux | grep -i "make_trt\|trt_engine\|tensorrt" | grep -v grep) + ⎿  adiroha 2166525 0.0 0.0 10072 3740 ? Ss 14:48 0:00 /bi + n/bash -c source /home/adiroha/.claude/shell-snapshots/snapshot-bash-1 + 777204063714-7d3qxc.sh 2>/dev/null || true && shopt -u extglob 2>/dev/ + … +13 lines (ctrl+o to expand) + +● Process is alive and running hot (PID 2191990, ~80% CPU, 7.6 GB RAM). The TRT + builder is crunching through post_runner. Output is still buffered — nothing + printed until build_serialized_network() returns. This is normal for complex + models. + +● Still compiling post_runner — process is healthy, just CPU-bound on kernel + optimization. Checking again in ~4 min. + +✻ Sautéed for 38s + +● Background command "Convert ONNX models to TRT engines" completed (exit code +0) + + Read 1 file, listed 1 directory (ctrl+o to expand) + +● Both engines compiled and saved successfully. 
Here's the full summary: + + ONNX export — done (using legacy TorchScript exporter with dynamo=False) + - feature_runner.onnx — 37 MB, opset 17, PyTorch 2.10.0 + - post_runner.onnx — 18 MB + + TRT engine compilation — done in ~6 minutes (15:00 → 15:06), FP16 enabled on + RTX 4080 + - feature_runner.engine — 21 MB + - post_runner.engine — 14 MB + + All four files are in output/onnx_trt/. The engines are ready to use with + scripts/run_demo_tensorrt.py — you just need a stereo pair (--left_file, + --right_file) and camera intrinsics (--intrinsic_file). + +✻ Claude resuming /loop wakeup (Apr 26 3:09pm) + +● Both engines already completed — I reported this in my previous message. To + confirm: + + Listed 1 directory (ctrl+o to expand) + +● Everything is complete and healthy: + + ┌───────────────────────┬───────┬──────────────────────────────┐ + │ File │ Size │ Status │ + ├───────────────────────┼───────┼──────────────────────────────┤ + │ feature_runner.onnx │ 37 MB │ ✅ exported │ + ├───────────────────────┼───────┼──────────────────────────────┤ + │ post_runner.onnx │ 18 MB │ ✅ exported │ + ├───────────────────────┼───────┼──────────────────────────────┤ + │ feature_runner.engine │ 21 MB │ ✅ compiled (FP16, RTX 4080) │ + ├───────────────────────┼───────┼──────────────────────────────┤ + │ post_runner.engine │ 14 MB │ ✅ compiled (FP16, RTX 4080) │ + ├───────────────────────┼───────┼──────────────────────────────┤ + │ onnx.yaml │ 178 B │ ✅ model config │ + └───────────────────────┴───────┴──────────────────────────────┘ + + The engines are ready. To run inference, provide a stereo pair: + + conda run -n tao_stereo python scripts/run_demo_tensorrt.py \ + --onnx_dir output/onnx_trt \ + --left_file \ + --right_file \ + --intrinsic_file \ + --out_dir output/trt_demo + +✻ Claude resuming /loop wakeup (Apr 26 3:11pm) + +● Already done — both engines compiled successfully. No need to recheck. 
+ + Final status: + + ┌───────────────────────┬───────┬────────────────────────┐ + │ File │ Size │ Details │ + ├───────────────────────┼───────┼────────────────────────┤ + │ feature_runner.engine │ 21 MB │ FP16, compiled 15:01 │ + ├───────────────────────┼───────┼────────────────────────┤ + │ post_runner.engine │ 14 MB │ FP16, compiled 15:06 │ + ├───────────────────────┼───────┼────────────────────────┤ + │ feature_runner.onnx │ 37 MB │ opset 17, PyTorch 2.10 │ + ├───────────────────────┼───────┼────────────────────────┤ + │ post_runner.onnx │ 18 MB │ opset 17, PyTorch 2.10 │ + └───────────────────────┴───────┴────────────────────────┘ + + What was installed in tao_stereo: + - onnx 1.21.0 + onnxscript 0.7.0 + - onnxruntime-gpu 1.23.2 + - tensorrt-cu12 10.16.1 + lean/dispatch variants + + Fix applied to make_onnx.py: added dynamo=False to both torch.onnx.export + calls — the new PyTorch 2.10 dynamo exporter lacks a decomposition for + aten.adaptive_max_pool2d, the legacy TorchScript exporter handles it fine. + + New file added: scripts/make_trt_engine.py — Python TRT API builder (replaces + trtexec which isn't included in pip-installed TensorRT). + +※ recap: Installed ONNX and TensorRT 10.16 in tao_stereo and converted the FoundationStereo model to two TRT engines (FP16, RTX 4080) saved in output/onnx_trt. Next: run inference with scripts/run_demo_tensorrt.py using a real stereo pair. (disable recaps in /config) + +❯ create new file benchmark_inbolt_trt.py that will run the orifinal tests plus + trt model created from fast foundation finetuned model + +● Let me read the existing benchmark files and TRT runner to understand what to + combine. + + Read 3 files, listed 1 directory (ctrl+o to expand) + +● Now let me check the ONNX yaml and the data manager to understand image sizes. + + Read 1 file (ctrl+o to expand) + +● Now I have everything I need. Let me write the benchmark file. 
+ +● Write(scripts/benchmark_inbolt_trt.py) + ⎿  Wrote 457 lines to scripts/benchmark_inbolt_trt.py + FS-TRT (TensorRT FP16) on Inbolt.d FFS (PyTorch) and fine-tuned F + 3 Extends ``benchmark_inbolt_fs.py`` by adding a TensorRT FP16 mode + l alongside the + ccuracy and speedd RealSense hardware depth, so you can compare a + 5 between the full-precision PyTorch path and the compiled TRT engi + 6 ne. + l weights:gines must be built beforehand from the fine-tuned mode + 8 1. python scripts/make_onnx.py --model_dir --sa + 9 2. python scripts/make_trt_engine.py --onnx_dir output/onnx_trt + _ft + … +535 lines (ctrl+o to expand) + +● Now let me do a quick import check to catch any obvious errors: + +● Bash(conda run -n tao_stereo python -c " + import sys, os…) ⎿  AST parse OK + benchmark_inbolt imports OK + TrtRunner import OK + +● All imports resolve cleanly. Here's what the new file adds on top of + benchmark_inbolt_fs.py: + + New method: finetuned_trt (purple) + - Loaded via load_trt_model(trt_dir) — reads feature_runner.engine + + post_runner.engine + onnx.yaml; gracefully skips if any file is missing + - infer_depth_m_trt resizes the IR images to the fixed TRT engine resolution + (448×640), runs TrtRunner.forward(), then resizes disparity back and divides + by fx to restore original-pixel-unit disparity before the BF depth conversion + + Key CLI args: + python scripts/benchmark_inbolt_trt.py \ + --trt_dir output/onnx_trt \ # default — point here or at a dir built from + finetuned weights + --trt_height 448 --trt_width 640 \ + --out_dir reports/inbolt_trt_benchmark + + Note: the current engines in output/onnx_trt/ were built from the original + model weights. 
To compare the finetuned TRT model, rebuild the engines from + the finetuned .pth: + python scripts/make_onnx.py \ + --model_dir weights/23-36-37/model_finetuned_inbolt-20260415_epoch_111.pth \ + --save_path output/onnx_trt_ft + python scripts/make_trt_engine.py --onnx_dir output/onnx_trt_ft + Then pass --trt_dir output/onnx_trt_ft. + +✻ Cooked for 3m 40s + +※ recap: Benchmarking FastFoundationStereo on the Inbolt dataset with TensorRT acceleration. ONNX export and TRT engine compilation are complete; the next step is rebuilding the engines from the finetuned model weights and running benchmark_inbolt_trt.py. (disable recaps in /config) + +❯ add rebuild command to the benchmark_inbolt_trt.py to run at the beginning + + Read 2 files (ctrl+o to expand) + +● Now I have everything. I'll add the rebuild function inline (no subprocess), + importing TrtFeatureRunner, TrtPostRunner, build_gwc_volume_triton, and + build_engine directly. + + Read 1 file (ctrl+o to expand) + +● Update(scripts/benchmark_inbolt_trt.py) + ⎿  Added 8 lines, removed 7 lines + 4 PyTorch models and RealSense hardware depth, so you can compare + accuracy and speed + 5 between the full-precision PyTorch path and the compiled TRT en + gine. + 6 + 7 -The TRT engines must be built beforehand from the fine-tuned mo + -del weights: + 8 - 1. python scripts/make_onnx.py --model_dir -- + -save_path output/onnx_trt_ft + 9 - 2. python scripts/make_trt_engine.py --onnx_dir output/onnx_t + -rt_ft + 7 +Pass --rebuild_trt to export ONNX and compile TRT engines from + +the fine-tuned model + 8 +at the start of the run (requires ~10 min on first compile). T + +he engines are written + 9 +to --trt_dir and reused on subsequent runs without --rebuild_tr + +t. + 10 + 11 -Then point --trt_dir at the directory containing feature_runner + -.engine and + 12 -post_runner.engine (and onnx.yaml). 
+ 13 - + 11 Usage: + 12 cd /home/adiroha/repos/Fast-FoundationStereo + 16 - python scripts/benchmark_inbolt_trt.py [--trt_dir output/onnx + -_trt] [--out_dir reports/inbolt_trt_benchmark] + 13 + # First run: build engines then benchmark + 14 + python scripts/benchmark_inbolt_trt.py --rebuild_trt [--trt_d + +ir output/onnx_trt_ft] + 15 + # Subsequent runs: reuse existing engines + 16 + python scripts/benchmark_inbolt_trt.py [--trt_dir output/onnx + +_trt_ft] [--out_dir reports/inbolt_trt_benchmark] + 17 """ + 18 + 19 import argparse + ... + 21 import os + 22 import sys + 23 import time + 24 +import warnings + 25 import cv2 + 26 from pathlib import Path + 27 from typing import Dict, Optional + +● Update(scripts/benchmark_inbolt_trt.py) + ⎿  Added 2 lines, removed 1 line + 52 CLOSE_RANGE_THRESHOLD_M, + 53 ) + 54 from report import ReportGenerator + 55 -from core.foundation_stereo import TrtRunner + 55 +from core.foundation_stereo import TrtRunner, TrtFeatureRunner, + + TrtPostRunner, build_gwc_volume_triton + 56 +from make_trt_engine import build_engine + 57 + 58 + 59 # ── constants ──────────────────────────────────────────────── + ──────────────── + +● Now add the rebuild_trt_engines function right after the constants block: + +● Update(scripts/benchmark_inbolt_trt.py) + ⎿  Added 86 lines + 78 RS_NAME = 'depth_rs' + 79 + 80 + 81 +# ── TRT build ─────────────────────────────────────────────── + +───────────────── + 82 + + 83 +def rebuild_trt_engines(model_path: str, trt_dir: str, height: + + int, width: int) -> None: + 84 + """Export the model at *model_path* to ONNX, then compile + +both TRT engines. 
+ 85 + + 86 + Writes to *trt_dir*: + 87 + feature_runner.onnx / feature_runner.engine + 88 + post_runner.onnx / post_runner.engine + 89 + onnx.yaml + 90 + """ + 91 + assert height % 32 == 0 and width % 32 == 0, \ + 92 + f"height and width must be divisible by 32, got {heigh + +t}x{width}" + 93 + + 94 + os.makedirs(trt_dir, exist_ok=True) + 95 + logging.info(f'[rebuild] Loading model from {model_path}') + 96 + model = torch.load(model_path, map_location='cpu', weights + +_only=False) + 97 + model.cuda().eval() + 98 + + 99 + feature_runner = TrtFeatureRunner(model).cuda().eval() + 100 + post_runner = TrtPostRunner(model).cuda().eval() + 101 + + 102 + dummy_left = torch.randn(1, 3, height, width, device='cud + +a').float() * 255 + 103 + dummy_right = torch.randn(1, 3, height, width, device='cud + +a').float() * 255 + 104 + + 105 + # ── feature runner → ONNX ─────────────────────────────── + +───────────────── + 106 + feature_onnx = os.path.join(trt_dir, 'feature_runner.onnx' + +) + 107 + logging.info(f'[rebuild] Exporting feature_runner → {featu + +re_onnx}') + 108 + with warnings.catch_warnings(): + 109 + warnings.simplefilter('ignore') + 110 + torch.onnx.export( + 111 + feature_runner, + 112 + (dummy_left, dummy_right), + 113 + feature_onnx, + 114 + opset_version=17, + 115 + input_names=['left', 'right'], + 116 + output_names=['features_left_04', 'features_left_0 + +8', 'features_left_16', + 117 + 'features_left_32', 'features_right_ + +04', 'stem_2x'], + 118 + do_constant_folding=True, + 119 + dynamo=False, + 120 + ) + 121 + + 122 + # ── post runner → ONNX ────────────────────────────────── + +───────────────── + 123 + with torch.no_grad(): + 124 + feats = feature_runner(dummy_left, dummy_right) + 125 + f04, f08, f16, f32, fr04, stem_2x = feats + 126 + cv_group = getattr(model, 'cv_group', 8) + 127 + gwc_volume = build_gwc_volume_triton( + 128 + f04.half(), fr04.half(), model.args.max_disp // 4, + + cv_group + 129 + ) + 130 + + 131 + post_onnx = 
os.path.join(trt_dir, 'post_runner.onnx') + 132 + logging.info(f'[rebuild] Exporting post_runner → {post_onn + +x}') + 133 + with warnings.catch_warnings(): + 134 + warnings.simplefilter('ignore') + 135 + torch.onnx.export( + 136 + post_runner, + 137 + (f04.float(), f08.float(), f16.float(), f32.float( + +), + 138 + fr04.float(), stem_2x.float(), gwc_volume.float() + +), + 139 + post_onnx, + 140 + opset_version=17, + 141 + input_names=['features_left_04', 'features_left_08 + +', 'features_left_16', + 142 + 'features_left_32', 'features_right_0 + +4', 'stem_2x', 'gwc_volume'], + 143 + output_names=['disp'], + 144 + do_constant_folding=True, + 145 + dynamo=False, + 146 + ) + 147 + + 148 + # ── save model config ─────────────────────────────────── + +────────────────── + 149 + yaml_path = os.path.join(trt_dir, 'onnx.yaml') + 150 + with open(yaml_path, 'w') as f: + 151 + yaml.safe_dump(OmegaConf.to_container(model.args), f) + 152 + logging.info(f'[rebuild] Saved onnx.yaml → {yaml_path}') + 153 + + 154 + # ── compile TRT engines ───────────────────────────────── + +────────────────── + 155 + del model, feature_runner, post_runner # free GPU memory + +before TRT build + 156 + torch.cuda.empty_cache() + 157 + + 158 + for name in ('feature_runner', 'post_runner'): + 159 + onnx_path = os.path.join(trt_dir, f'{name}.onnx') + 160 + engine_path = os.path.join(trt_dir, f'{name}.engine') + 161 + logging.info(f'[rebuild] Compiling {name}.engine (this + + may take several minutes) …') + 162 + build_engine(onnx_path, engine_path, fp16=True, worksp + +ace_gb=4) + 163 + + 164 + logging.info(f'[rebuild] TRT engines ready in {trt_dir}') + 165 + + 166 + + 167 # ── TRT helpers ───────────────────────────────────────────── + ────────────────── + 168 + 169 def load_trt_model(trt_dir: str) -> Optional[TrtRunner]: + +● Now wire it into main() — add the flag and call it before loading the TRT + model: + +● Update(scripts/benchmark_inbolt_trt.py) + ⎿  Added 2 lines + 354 help='Directory with 
feature_runner.en + gine, post_runner.engine, onnx.yaml') + 355 parser.add_argument('--trt_height', type=int, default=448, + help='TRT engine input height') + 356 parser.add_argument('--trt_width', type=int, default=640, + help='TRT engine input width') + 357 + parser.add_argument('--rebuild_trt', action='store_true', + 358 + help='Re-export ONNX and recompile TRT + + engines from --finetuned before benchmarking') + 359 parser.add_argument('--n_viz', type=int, default=N_VIZ, he + lp='Frames saved for visual comparison') + 360 args = parser.parse_args() + 361 + +● Update(scripts/benchmark_inbolt_trt.py) + ⎿  Added 6 lines + 363 out_dir = Path(args.out_dir) + 364 out_dir.mkdir(parents=True, exist_ok=True) + 365 + 366 + # ── optionally rebuild TRT engines ────────────────────── + +────────────────── + 367 + if args.rebuild_trt: + 368 + build_src = resolve_finetuned_model_path(args.finetune + +d) or args.original + 369 + logging.info(f'[rebuild_trt] Building TRT engines from + + {build_src} → {args.trt_dir}') + 370 + rebuild_trt_engines(build_src, args.trt_dir, args.trt_ + +height, args.trt_width) + 371 + + 372 # ── load PyTorch models ───────────────────────────────── + ────────────────── + 373 pt_models = {} + 374 finetuned_path = resolve_finetuned_model_path(args.finetun + ed) + +● Now verify the final file parses and all symbols exist: + +● Bash(conda run -n tao_stereo python -c " + import sys, os, ast, pathlib…) + ⎿  AST parse OK + build_engine import OK + TrtFeatureRunner/TrtPostRunner/build_gwc_volume_triton imports OK + all symbols OK + +● Everything checks out. Here's what was added: + + rebuild_trt_engines(model_path, trt_dir, height, width) — runs in-process (no + subprocess): + 1. Loads the model weights onto GPU + 2. Wraps in TrtFeatureRunner / TrtPostRunner and exports both to ONNX with the + legacy TorchScript exporter (dynamo=False, warnings suppressed) + 3. Saves onnx.yaml from the model's args + 4. 
Frees GPU memory, then calls build_engine() for each ONNX file to compile + the TRT engines + + Wired into main() — runs before anything else if --rebuild_trt is passed. + Falls back to the original model if no finetuned checkpoint is found: + + # build engines from finetuned model, then benchmark — one command + python scripts/benchmark_inbolt_trt.py \ + --rebuild_trt \ + --finetuned weights/23-36-37/model_finetuned_inbolt-20260415_epoch_111.pth \ + --trt_dir output/onnx_trt_ft \ + --out_dir reports/inbolt_trt_benchmark + + # subsequent runs reuse existing engines (no rebuild) + python scripts/benchmark_inbolt_trt.py \ + --trt_dir output/onnx_trt_ft \ + --out_dir reports/inbolt_trt_benchmark + +✻ Cooked for 1m 56s + +※ recap: Built ONNX export and TRT engine compilation for FastFoundationStereo in the tao_stereo env, then created benchmark_inbolt_trt.py to compare PyTorch and TRT models on Inbolt. Run with --rebuild_trt to compile engines from the finetuned model and benchmark in one command. (disable recaps in /config) + +● Background command "Search for trtexec binary and tensorrt directories" +completed (exit code 0) + +● That's a stale notification from the initial trtexec search at the start of + the session — no action needed. All TRT work is already complete. + +❯ /branch + ⎿ Branched conversation. You are now in the branch. Use /resume 9e5eae82-b63e-4097-921a-bd15bd674371 to return to the original. + +❯ /rename niv_fork_for_uris_net + ⎿ Session renamed to: niv_fork_for_uris_net + +❯ im trying to understand the concepts you used when creating this new network + lets go step by step what you added to the ffs so i will understand that + +● Good question to start with — let me be precise about what we actually did, + because we didn't change the FFS network architecture at all. The model + weights and math are identical. What we built is a deployment pipeline that + takes the existing model and makes it run faster on GPU. 
+ + Here's the outline — tell me which step to dive into first: + + --- + Step 1 — The original model (FFS) + The full PyTorch model runs as one monolithic forward pass. It has two + conceptually distinct stages: + - A feature extractor (ViT backbone → multi-scale feature maps) + - A disparity estimator (cost volume + GRU refinement loop → upsampled + disparity) + + Step 2 — Splitting into TrtFeatureRunner + TrtPostRunner + We split those two stages into separate modules. This is required because the + middle step (building the GWC correlation volume with Triton) isn't + expressible in ONNX — it has to stay in PyTorch and run between the two + exported parts. + + Step 3 — ONNX export + Each part is traced with dummy inputs and serialized to the ONNX format — a + vendor-neutral graph representation of the computation. + + Step 4 — TensorRT compilation + TensorRT takes each ONNX graph and compiles it into a GPU-native .engine file: + it fuses layers, picks the fastest CUDA kernels for this specific GPU (RTX + 4080), and converts weights to FP16. + + Step 5 — Inference at benchmark time + The engines expect a fixed input size (448×640), so we resize the camera + images, run both engines with the Triton volume step in between, then scale + the disparity back to original resolution for depth conversion. + + --- + Which step do you want to unpack first? + +✻ Cooked for 23s + +❯ i think you drifted away from what i mean, you just enhanced the ffs model + with rs_prior data and im trying to understand the concepts you used there +● Let me look at those files — they were in the repo before this session. + + Read 1 file (ctrl+o to expand) + +● Got it, now I see exactly what you mean. The finetune_inbolt_depthrs.py adds + three fusion points where RealSense depth is injected into the FFS pipeline. + Let me walk through them one by one. + + --- + The core idea + + FFS normally only sees two IR images (left + right). 
This enhancement says: we + also have a noisy-but-free depth reading from the RS hardware sensor — let's + teach the network to use it as a hint at three places in the pipeline. + + --- + Stage 1 — Early: depth features fused into image features + + DepthEncoder takes the RS depth map (normalized to [0,1]) plus a validity mask + (1=valid pixel, 0=missing) and runs it through a small 4-scale CNN, producing + depth feature maps at 1/4, 1/8, 1/16, 1/32 resolution. + + DepthFusionModule then adds those depth features residually into the + left-image feature maps that the backbone already produced — one 1×1 + convolution per scale, zero-initialized so at the start of training it + contributes nothing and the model starts from the pretrained behaviour. + + Stage 2 — Mid: Gaussian prior bias on the cost volume + + After the cost volume is built and aggregated, the model computes a softmax + over disparity bins to get an initial disparity estimate. This stage adds a + Gaussian "bump" centered at the disparity predicted from the RS depth + measurement onto the logits before that softmax — making the network's initial + guess gravitate toward the RS reading where it's valid. Two learned scalars + control this: depth_sigma (how sharp/wide the Gaussian is) and + depth_prior_scale (overall weight of the bias). + + Stage 3 — Late: learned blend for GRU initialization + + DepthInitBlend takes the stereo soft-argmax result, the RS-derived disparity + prior, and the validity mask, and learns a pixel-wise blend weight. Where RS + depth is valid and the blend weight is high, the GRU starts from the + RS-derived disparity rather than the pure stereo estimate. The final output + nn.Conv2d is bias-initialized to −5 so sigmoid(−5) ≈ 0.007 — meaning it starts + almost fully trusting stereo and only slowly learns to trust RS depth. + + --- + Which stage do you want to go deeper on? 
+ +✻ Brewed for 29s + +❯ each and every one of them, i want to understand if they are all necessary, + and if they might create more noise than signal. lets deep dive into each one + +● Let me go through each stage honestly — including where I think the design is + sound and where it could backfire. + + --- + Stage 1 — Early: DepthEncoder + DepthFusionModule + + What it does + + RS depth + validity mask → small 4-scale CNN → feature maps at 1/4…1/32 + resolution → added residually (via 1×1 conv, zero-init) into the left image + feature maps from the backbone. + + Why the zero-init matters + + At training start, the 1×1 projections output exactly zero, so the model + starts from the pretrained stereo behaviour. The network has to earn the depth + contribution via gradient descent. This is a sound training stability trick. + + The validity masking per scale + + v4 = F.max_pool2d(validity, 2, 2) # H/4 + d4 = self.stage1(x2) * v4 + Using max-pool on the validity mask means a coarse-scale cell is valid if any + fine-scale pixel in it was valid. That's intentionally conservative — better + to use a slightly noisy valid region than to zero out real signal. + + Where it can create noise over signal + + The depth features are added to the left features only. Those left features + then go into the cost volume — which is a cross-correlation between left and + right. The right features are untouched. So you're introducing an asymmetry: + the left side now "knows" something about depth that the right side doesn't. + If the RS depth is accurate, this helps the left features tell the cost volume + "look near this disparity". If RS depth is wrong (reflective surface, + occluded region, edge bleed), it corrupts the feature matching for that region + and the cost volume gets a misleading signal that's hard to override + downstream. + + Verdict: This is the riskiest of the three. The benefit is real but the + corruption of feature-matching asymmetry is a genuine concern. 
Worth ablating + — Stage 2 alone may give most of the gain at lower risk. + + --- + Stage 2 — Mid: Gaussian prior bias on cost volume logits + + What it does + + After the cost volume produces logits over disparity bins (shape B × D × H/4 × + W/4), this adds a Gaussian bump centered at the disparity predicted from RS + depth: + + d_idx = arange(D) # disparity bin index + prior_bias = -0.5 * ((d_idx - disp_prior_14) / sigma)² + logits = logits + depth_prior_scale * prior_bias + + Why this is the most principled stage + + It's a proper Bayesian-style prior injection. The logits are log-probabilities + (before softmax), so adding a Gaussian in log-space is exactly equivalent to + multiplying the stereo probability distribution by a Gaussian prior — which is + the textbook way to combine two uncertain measurements. The stereo evidence + can still dominate if it's strong; the prior only matters when stereo evidence + is weak (textureless regions, bad lighting). + + The two learned scalars + + - depth_sigma (init=4.0 disparity units at 1/4 scale): controls how tight the + prior is. Too small → a slightly wrong RS reading creates a sharp misleading + spike. Too large → the prior is so diffuse it does nothing. + - depth_prior_scale (init=0.1): overall weight. Starts near-zero, network + learns how much to trust RS globally. + + Where it can create noise + + RS depth suffers from mixed-pixel / edge bleed — at depth discontinuities the + sensor averages foreground and background, giving a reading that belongs to + neither. In those pixels the Gaussian bump is centered at a phantom disparity, + and if depth_sigma has learned to be tight, it actively suppresses the + correct bin. The validity mask doesn't help here — the pixel is valid, just + wrong. + + Verdict: The cleanest and lowest-risk stage. This one is almost certainly + worth keeping. 
+ + --- + Stage 3 — Late: DepthInitBlend (GRU initialisation) + + What it does + + Takes three inputs at 1/4 resolution: the stereo soft-argmax stereo_init, the + RS-derived disparity disp_prior_14, and the validity mask. A small 3-layer CNN + predicts a per-pixel blend weight w = sigmoid(...), and outputs: + + blend_w * disp_prior + (1 - blend_w) * stereo_init + + The last conv is bias-init to −5 → sigmoid(−5) ≈ 0.007 at the start, so the + GRU initialises almost entirely from the stereo soft-argmax. + + The claimed benefit + + The GRU is iterative (8 iterations here). A better initialisation means it + needs fewer iterations to converge, and reduces the risk of getting stuck at + the wrong disparity. RS depth, even noisy, can cut across textureless regions + where stereo's soft-argmax is genuinely confused. + + The redundancy problem + + This is where I'd push back hardest. After Stage 2, stereo_init is already + biased toward the RS prior — the Gaussian bump in logits has already nudged + the soft-argmax in that direction. Stage 3 then takes that already-biased + estimate and blends it again with the raw RS disparity. In the good case (RS + correct, Stage 2 worked) you've over-counted the RS signal twice. In the bad + case (RS wrong at an edge), the GRU starts from the wrong disparity and has to + spend all 8 iterations fighting its way back to the correct answer — the + iterative mechanism that's supposed to be robust to bad init is now working + against you. + + The other risk: local minima + + RAFT-style GRUs are known to be fairly robust to initialisation within a + reasonable range. But "reasonable" here means within a few disparity units. If + a RS edge-bleed pixel pushes the init 30 disparity units off, 8 iterations + may genuinely not be enough to recover. + + Verdict: Most questionable of the three. Partially redundant with Stage 2, and + introduces GRU convergence risk. If you had to cut one stage, cut this one + first and see if loss changes at all. 
+ + --- + Summary table + + ┌────────────────┬──────────────────┬───────────────────────┬────────────┐ + │ Stage │ Mechanism │ Risk │ Worth │ + │ │ │ │ keeping? │ + ├────────────────┼──────────────────┼───────────────────────┼────────────┤ + │ │ Residual depth │ Corrupts │ │ + │ 1 — │ features → left │ feature-matching │ Ablate │ + │ DepthEncoder │ image features │ asymmetry if RS is │ first │ + │ │ │ wrong │ │ + ├────────────────┼──────────────────┼───────────────────────┼────────────┤ + │ 2 — Gaussian │ Adds prior bump │ Edge-bleed pixels get │ Almost │ + │ prior │ to cost-volume │ tight wrong prior │ certainly │ + │ │ logits │ │ yes │ + ├────────────────┼──────────────────┼───────────────────────┼────────────┤ + │ 3 — │ Blends stereo │ Redundant with Stage │ Ablate │ + │ DepthInitBlend │ init with RS │ 2; GRU local minima │ second │ + │ │ disparity │ risk │ │ + └────────────────┴──────────────────┴───────────────────────┴────────────┘ + + The clean experiment to run: train three variants — Stage 2 only, Stages 1+2, + all three — and compare validation loss. My guess is Stage 2 alone gets you + 80%+ of the gain. + +✻ Cooked for 55s + +❯ two follow up questions- 1. is it possible that using just one would yeild + superior results? 2. this model uses all 192 disparities? + +● Two sharp questions. Let me take them in order. + + --- + 1. Could using just one stage yield superior results? + + Yes, genuinely possible — and Stage 2 alone is my best candidate to outperform + all three combined. Here's why. + + The double-counting problem + + After Stage 2 biases the soft-argmax toward the RS prior, stereo_init is + already a mix of stereo evidence and RS prior. Stage 3 then takes that + already-biased estimate and blends it again with disp_prior. 
If you expand the + algebra: + + stereo_init ≈ α·disp_prior + (1-α)·pure_stereo # after Stage 2 + final_init = β·disp_prior + (1-β)·stereo_init + = (β + (1-β)·α)·disp_prior + (1-β)·(1-α)·pure_stereo + + The effective weight on the RS prior is higher than either stage intended. + When RS is correct this is fine; when RS is wrong at an edge, both stages + compound the error into the GRU starting point. + + The limited data problem + + The Inbolt dataset isn't large. Three fusion stages means three sets of new + parameters competing for gradient signal. With less data, a single well-placed + module generalises better than three weakly-constrained ones trying to + cooperate. + + The asymmetry problem in Stage 1 + + Adding depth features to left-only features while right features stay + unchanged creates an input mismatch the cost volume wasn't designed to handle. + If Stage 1 learned to encode RS noise, the cross-correlation degrades in a + way Stage 2 and 3 can't fully compensate. Stage 2 doesn't have this problem — + it operates after features are already matched. + + What to run: three training runs — Stage 2 only, Stages 2+3, all three — same + epochs, same data split. My prediction is Stage 2 alone wins or ties on the + test set. + + --- + 2. Does this model use all 192 disparities? + + Not quite — and this is an important architectural point. + + The cost volume is built at 1/4 resolution with max_disp // 4 = 48 bins. Each + bin represents a 4-pixel step in disparity at full resolution, covering the + range 0–192. So the explicit search is 48 coarse bins, not 192 individual + values. + + The RS Gaussian prior in Stage 2 operates over these same 48 bins: + + d_idx = torch.arange(D) # 0 … 47 + disp_prior_14 = disp_prior / 4.0 # convert to 1/4-scale units + prior_bias = -0.5 * ((d_idx - disp_prior_14) / sigma)² + + With sigma initialised at 4.0 (= 4 bins = 16 full-resolution pixels), the + prior is fairly broad. 
It's not pinning the network to a single disparity — + it's weighting a neighbourhood. + + The GRU then refines continuously from the soft-argmax initialisation, so + sub-bin and sub-pixel accuracy is still reachable. The 48-bin cost volume just + determines where the GRU starts from. + + The efficiency implication you might be heading toward + + Because we have the RS prior, we could potentially build a much smaller cost + volume — say, only 16 bins centered around the RS-predicted disparity for each + pixel — rather than always computing all 48. That would cut cost-volume + memory and compute significantly, and Stage 2's Gaussian would collapse to a + much tighter prior over a smaller range. + + The current implementation doesn't do this — it still builds the full 48-bin + volume and just biases the logits. That's leaving performance on the table: + you pay the full compute cost but don't use the RS prior to reduce the search + space, only to reweight it after the fact. + + Is that the direction you were thinking? + +✻ Churned for 1m 17s \ No newline at end of file diff --git a/weights/23-36-37/cfg.yaml b/weights/23-36-37/cfg.yaml new file mode 100644 index 0000000..eac5ad7 --- /dev/null +++ b/weights/23-36-37/cfg.yaml @@ -0,0 +1,12 @@ +corr_levels: 2 +corr_radius: 4 +hidden_dims: +- 128 +low_memory: 0 +max_disp: 416 +mixed_precision: true +n_downsample: 2 +n_gru_layers: 1 +slow_fast_gru: false +valid_iters: 8 +vit_size: vitl diff --git a/weights/weights_niv/model_best_bp2_serialize.pth b/weights/weights_niv/model_best_bp2_serialize.pth new file mode 100644 index 0000000..86c6749 Binary files /dev/null and b/weights/weights_niv/model_best_bp2_serialize.pth differ diff --git a/weights/weights_niv/stage3_best.pt b/weights/weights_niv/stage3_best.pt new file mode 100644 index 0000000..fca0704 Binary files /dev/null and b/weights/weights_niv/stage3_best.pt differ