diff --git a/Project.toml b/Project.toml
index b063816f7..132a19ae1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,8 +6,11 @@ version = "0.14.9"
 [deps]
 LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MatrixAlgebraKit = "6c742aac-3347-4629-af66-fc926824e5e4"
+OhMyThreads = "67456a42-1dca-4109-a031-0a68de7e3ad5"
 PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
 Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
 TensorKitSectors = "13a9c161-d5da-41f0-bcbd-e1a08ae0647f"
 TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
@@ -15,23 +18,35 @@ TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
 VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8"
 
 [weakdeps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000"
 
 [extensions]
+TensorKitAMDGPUExt = "AMDGPU"
+TensorKitCUDAExt = ["CUDA", "cuTENSOR"]
 TensorKitChainRulesCoreExt = "ChainRulesCore"
 TensorKitFiniteDifferencesExt = "FiniteDifferences"
 
 [compat]
+AMDGPU = "2"
+Adapt = "4"
 Aqua = "0.6, 0.7, 0.8"
+CUDA = "5"
+cuTENSOR = "2"
 ChainRulesCore = "1"
 ChainRulesTestUtils = "1"
 Combinatorics = "1"
 FiniteDifferences = "0.12"
 LRUCache = "1.0.2"
 LinearAlgebra = "1"
+MatrixAlgebraKit = "0.3"
+OhMyThreads = "0.8.0"
 PackageExtensionCompat = "1"
 Random = "1"
+ScopedValues = "1.3.0"
 Strided = "2"
 TensorKitSectors = "0.1"
 TensorOperations = "5.1"
@@ -43,7 +58,10 @@ Zygote = "0.7"
 julia = "1.10"
 
 [extras]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -55,4 +73,10 @@ TestExtras = "5ed8adda-3752-4e41-b88a-e8b09835ee3a"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Aqua", "Combinatorics", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
+test = ["Adapt", "Aqua", "Combinatorics", "CUDA", "cuTENSOR", "LinearAlgebra", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote"]
+
+[sources]
+CUDA = {url = "https://github.com/JuliaGPU/CUDA.jl", rev = "master"}
+cuTENSOR = {url = "https://github.com/JuliaGPU/CUDA.jl", subdir="lib/cutensor", rev = "ksh/cutensor_bump"}
+MatrixAlgebraKit = {url = "https://github.com/QuantumKitHub/MatrixAlgebraKit.jl", rev = "ksh/tk"}
+TensorOperations = {url = "https://github.com/QuantumKitHub/TensorOperations.jl", rev = "ksh/cutensor_bump"}
diff --git a/docs/src/lib/spaces.md b/docs/src/lib/spaces.md
index 83350156d..205301d83 100644
--- a/docs/src/lib/spaces.md
+++ b/docs/src/lib/spaces.md
@@ -90,6 +90,7 @@ dual
 conj
 flip
 ⊕
+⊖
 zero(::ElementarySpace)
 oneunit
 supremum
diff --git a/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl b/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl
new file mode 100644
index 000000000..db5a5c733
--- /dev/null
+++ b/ext/TensorKitAMDGPUExt/TensorKitAMDGPUExt.jl
@@ -0,0 +1,10 @@
+module TensorKitAMDGPUExt
+
+using TensorKit
+using TensorKit: SectorDict
+using AMDGPU 
+using Random
+
+include("roctensormap.jl")
+
+end
diff --git a/ext/TensorKitAMDGPUExt/roctensormap.jl b/ext/TensorKitAMDGPUExt/roctensormap.jl
new file mode 100644
index 000000000..38a3e1355
--- /dev/null
+++ b/ext/TensorKitAMDGPUExt/roctensormap.jl
@@ -0,0 +1,103 @@
+const _ROCMatOrDict{I,T} = Union{ROCMatrix{T},SectorDict{I,ROCMatrix{T}}}
+const ROCTensorMap{T,S,N₁,N₂,I,A<:_ROCMatOrDict{I,T}} = TensorMap{T,S,N₁,N₂,A}
+const ROCTensor{T, S, N, I, A <: _ROCMatOrDict{I, T}} = ROCTensorMap{T, S, N, 0, I, A}
+
+function ROCTensorMap{T}(::UndefInitializer, V::TensorMapSpace{S, N₁, N₂}) where {T, S, N₁, N₂}
+    A = ROCMatrix{T, AMDGPU.default_memory}
+    TT = tensormaptype{S, N₁, N₂, A}
+    return TT(undef, codomain(V), domain(V))
+end
+
+function ROCTensorMap{T}(::UndefInitializer, codomain::TensorSpace{S},
+                         domain::TensorSpace{S}) where {T,S}
+    return ROCTensorMap{T}(undef, codomain ← domain)
+end
+function ROCTensor{T}(::UndefInitializer, V::TensorSpace{S}) where {T,S}
+    return ROCTensorMap{T}(undef, V ← one(V))
+end
+
+for (fname, felt) in ((:zeros, :zero), (:ones, :one))
+    @eval begin
+        function AMDGPU.$fname(codomain::TensorSpace{S},
+                               domain::TensorSpace{S}=one(codomain)) where {S<:IndexSpace}
+            return AMDGPU.$fname(codomain ← domain)
+        end
+        function AMDGPU.$fname(::Type{T}, codomain::TensorSpace{S},
+                               domain::TensorSpace{S}=one(codomain)) where {T,S<:IndexSpace}
+            return AMDGPU.$fname(T, codomain ← domain)
+        end
+        AMDGPU.$fname(V::TensorMapSpace) = AMDGPU.$fname(Float64, V)
+        function AMDGPU.$fname(::Type{T}, V::TensorMapSpace) where {T}
+            t = ROCTensorMap{T}(undef, V)
+            fill!(t, $felt(T))
+            return t
+        end
+    end
+end
+
+for randfun in (:rand, :randn)
+    randfun! = Symbol(randfun, :!)
+    @eval begin
+        # converting `codomain` and `domain` into `HomSpace`
+        function AMDGPU.$randfun(codomain::TensorSpace{S},
+                                 domain::TensorSpace{S}) where {S<:IndexSpace}
+            return AMDGPU.$randfun(codomain ← domain)
+        end
+        function AMDGPU.$randfun(::Type{T}, codomain::TensorSpace{S},
+                                 domain::TensorSpace{S}) where {T,S<:IndexSpace}
+            return AMDGPU.$randfun(T, codomain ← domain)
+        end
+        function AMDGPU.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                                 codomain::TensorSpace{S},
+                                 domain::TensorSpace{S}) where {T,S<:IndexSpace}
+            return AMDGPU.$randfun(rng, T, codomain ← domain)
+        end
+
+        # accepting single `TensorSpace`
+        AMDGPU.$randfun(codomain::TensorSpace) = AMDGPU.$randfun(codomain ← one(codomain))
+        function AMDGPU.$randfun(::Type{T}, codomain::TensorSpace) where {T}
+            return AMDGPU.$randfun(T, codomain ← one(codomain))
+        end
+        function AMDGPU.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                                 codomain::TensorSpace) where {T}
+            return AMDGPU.$randfun(rng, T, codomain ← one(domain))
+        end
+
+        # filling in default eltype
+        AMDGPU.$randfun(V::TensorMapSpace) = AMDGPU.$randfun(Float64, V)
+        function AMDGPU.$randfun(rng::Random.AbstractRNG, V::TensorMapSpace)
+            return AMDGPU.$randfun(rng, Float64, V)
+        end
+
+        # filling in default rng
+        function AMDGPU.$randfun(::Type{T}, V::TensorMapSpace) where {T}
+            return AMDGPU.$randfun(Random.default_rng(), T, V)
+        end
+
+        # implementation
+        function AMDGPU.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                               V::TensorMapSpace) where {T}
+            t = ROCTensorMap{T}(undef, V)
+            AMDGPU.$randfun!(rng, t)
+            return t
+        end
+    end
+end
+
+# converters
+# ----------
+function Base.convert(::Type{ROCTensorMap}, d::Dict{Symbol,Any})
+    try
+        codomain = eval(Meta.parse(d[:codomain]))
+        domain = eval(Meta.parse(d[:domain]))
+        data = SectorDict(eval(Meta.parse(c)) => ROCArray(b) for (c, b) in d[:data])
+        return TensorMap(data, codomain, domain)
+    catch e # sector unknown in TensorKit.jl; user-defined, hopefully accessible in Main
+        codomain = Base.eval(Main, Meta.parse(d[:codomain]))
+        domain = Base.eval(Main, Meta.parse(d[:domain]))
+        data = SectorDict(Base.eval(Main, Meta.parse(c)) => ROCArray(b)
+                          for (c, b) in d[:data])
+        return TensorMap(data, codomain, domain)
+    end
+end
+
diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
new file mode 100644
index 000000000..ae776673c
--- /dev/null
+++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -0,0 +1,90 @@
+module TensorKitCUDAExt
+
+using CUDA, CUDA.CUBLAS, LinearAlgebra
+using CUDA: @allowscalar
+using cuTENSOR: cuTENSOR
+
+using TensorKit
+import TensorKit.VectorInterface: scalartype as vi_scalartype
+using TensorKit.Factorizations
+using TensorKit.Factorizations: select_svd_algorithm, OFA, initialize_output, AbstractAlgorithm
+using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap
+
+using TensorKit.MatrixAlgebraKit
+
+using Random
+
+include("cutensormap.jl")
+
+TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, ::TensorKit.Factorizations.SVD) = CUSOLVER_QRIteration()
+TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, ::TensorKit.Factorizations.SDD) = throw(ArgumentError("DivideAndConquer unavailable on CUDA")) 
+TensorKit.Factorizations.select_svd_algorithm(::CuTensorMap, alg::OFA) = throw(ArgumentError(lazy"Unknown algorithm $alg"))
+
+const CuDiagonalTensorMap{T, S} = DiagonalTensorMap{T, S, CuVector{T, CUDA.DeviceMemory}}
+
+"""
+    CuDiagonalTensorMap{T}(undef, domain::S) where {T,S<:IndexSpace}
+    # expert mode: select storage type `A`
+    DiagonalTensorMap{T,S,A}(undef, domain::S) where {T,S<:IndexSpace,A<:DenseVector{T}}
+
+Construct a `DiagonalTensorMap` with uninitialized data.
+"""
+function CuDiagonalTensorMap{T}(::UndefInitializer, V::TensorMapSpace) where {T}
+    (numin(V) == numout(V) == 1 && domain(V) == codomain(V)) ||
+        throw(ArgumentError("DiagonalTensorMap requires a space with equal domain and codomain and 2 indices"))
+    return CuDiagonalTensorMap{T}(undef, domain(V))
+end
+function CuDiagonalTensorMap{T}(::UndefInitializer, V::ProductSpace) where {T}
+    length(V) == 1 ||
+        throw(ArgumentError("DiagonalTensorMap requires `numin(d) == numout(d) == 1`"))
+    return CuDiagonalTensorMap{T}(undef, only(V))
+end
+function CuDiagonalTensorMap{T}(::UndefInitializer, V::S) where {T,S<:IndexSpace}
+    return CuDiagonalTensorMap{T,S}(undef, V)
+end
+CuDiagonalTensorMap(::UndefInitializer, V::IndexSpace) = CuDiagonalTensorMap{Float64}(undef, V)
+
+function TensorKit.Factorizations.initialize_output(::typeof(svd_compact!), t::CuTensorMap, ::AbstractAlgorithm)
+    V_cod = V_dom = infimum(fuse(codomain(t)), fuse(domain(t)))
+    U = similar(t, codomain(t) ← V_cod)
+    S = CuDiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+    Vᴴ = similar(t, V_dom ← domain(t))
+    return U, S, Vᴴ
+end
+
+function TensorKit.Factorizations.initialize_output(::typeof(eigh_full!), t::CuTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    D = CuDiagonalTensorMap{T}(undef, V_D)
+    V = similar(t, codomain(t) ← V_D)
+    return D, V
+end
+
+function TensorKit.Factorizations.initialize_output(::typeof(eig_full!), t::CuTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    D = CuDiagonalTensorMap{Tc}(undef, V_D)
+    V = similar(t, Tc, codomain(t) ← V_D)
+    return D, V
+end
+
+function TensorKit.Factorizations.initialize_output(::typeof(eigh_vals!), t::CuTensorMap, alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    return D = CuDiagonalTensorMap{Tc}(undef, V_D)
+end
+
+function TensorKit.Factorizations.initialize_output(::typeof(eig_vals!), t::CuTensorMap, alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    return D = CuDiagonalTensorMap{Tc}(undef, V_D)
+end
+
+
+# TODO
+# add VectorInterface extensions for proper CUDA promotion
+function TensorKit.VectorInterface.promote_add(TA::Type{<:CUDA.StridedCuMatrix{Tx}}, TB::Type{<:CUDA.StridedCuMatrix{Ty}}, α::Tα = TensorKit.VectorInterface.One(), β::Tβ = TensorKit.VectorInterface.One()) where {Tx, Ty, Tα, Tβ}
+    return Base.promote_op(add, Tx, Ty, Tα, Tβ)
+end
+
+end
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
new file mode 100644
index 000000000..e3f6cf203
--- /dev/null
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -0,0 +1,207 @@
+const CuTensorMap{T,S,N₁,N₂} = TensorMap{T,S,N₁,N₂, CuVector{T,CUDA.DeviceMemory}}
+const CuTensor{T, S, N} = CuTensorMap{T, S, N, 0}
+
+function TensorKit.tensormaptype(S::Type{<:IndexSpace}, N₁, N₂, TorA::Type{<:StridedCuArray})
+    if TorA <: CuArray
+        return TensorMap{eltype(TorA),S,N₁,N₂,CuVector{eltype(TorA), CUDA.DeviceMemory}}
+    else
+        throw(ArgumentError("argument $TorA should specify a scalar type (`<:Number`) or a storage type `<:CuVector{<:Number}`"))
+    end
+end
+
+function CuTensorMap{T}(::UndefInitializer, V::TensorMapSpace{S, N₁, N₂}) where {T, S, N₁, N₂}
+    return CuTensorMap{T,S,N₁,N₂}(undef, V)
+end
+
+function CuTensorMap{T}(::UndefInitializer, codomain::TensorSpace{S},
+                        domain::TensorSpace{S}) where {T,S}
+    return CuTensorMap{T}(undef, codomain ← domain)
+end
+function CuTensor{T}(::UndefInitializer, V::TensorSpace{S}) where {T,S}
+    return CuTensorMap{T}(undef, V ← one(V))
+end
+# constructor starting from block data
+"""
+    CuTensorMap(data::AbstractDict{<:Sector,<:CuMatrix}, codomain::ProductSpace{S,N₁},
+                domain::ProductSpace{S,N₂}) where {S<:ElementarySpace,N₁,N₂}
+    CuTensorMap(data, codomain ← domain)
+    CuTensorMap(data, domain → codomain)
+
+Construct a `CuTensorMap` by explicitly specifying its block data.
+
+## Arguments
+- `data::AbstractDict{<:Sector,<:CuMatrix}`: dictionary containing the block data for
+  each coupled sector `c` as a matrix of size `(blockdim(codomain, c), blockdim(domain, c))`.
+- `codomain::ProductSpace{S,N₁}`: the codomain as a `ProductSpace` of `N₁` spaces of type
+  `S<:ElementarySpace`.
+- `domain::ProductSpace{S,N₂}`: the domain as a `ProductSpace` of `N₂` spaces of type
+  `S<:ElementarySpace`.
+
+Alternatively, the domain and codomain can be specified by passing a [`HomSpace`](@ref)
+using the syntax `codomain ← domain` or `domain → codomain`.
+"""
+function CuTensorMap(data::AbstractDict{<:Sector,<:CuArray},
+                     V::TensorMapSpace{S,N₁,N₂}) where {S,N₁,N₂}
+    T = eltype(valtype(data))
+    t = CuTensorMap{T}(undef, V)
+    for (c, b) in blocks(t)
+        haskey(data, c) || throw(SectorMismatch("no data for block sector $c"))
+        datac = data[c]
+        size(datac) == size(b) ||
+            throw(DimensionMismatch("wrong size of block for sector $c"))
+        copy!(b, datac)
+    end
+    for (c, b) in data
+        c ∈ blocksectors(t) || isempty(b) ||
+            throw(SectorMismatch("data for block sector $c not expected"))
+    end
+    return t
+end
+function CuTensorMap{T}(data::DenseVector{T}, codomain::TensorSpace{S},
+                        domain::TensorSpace{S}) where {T,S}
+    return CuTensorMap(data, codomain ← domain)
+end
+function CuTensorMap(data::AbstractDict{<:Sector,<:CuMatrix}, codom::TensorSpace{S},
+                   dom::TensorSpace{S}) where {S}
+    return CuTensorMap(data, codom ← dom)
+end
+
+for (fname, felt) in ((:zeros, :zero), (:ones, :one))
+    @eval begin
+        function CUDA.$fname(codomain::TensorSpace{S},
+                             domain::TensorSpace{S}=one(codomain)) where {S<:IndexSpace}
+            return CUDA.$fname(codomain ← domain)
+        end
+        function CUDA.$fname(::Type{T}, codomain::TensorSpace{S},
+                             domain::TensorSpace{S}=one(codomain)) where {T,S<:IndexSpace}
+            return CUDA.$fname(T, codomain ← domain)
+        end
+        CUDA.$fname(V::TensorMapSpace) = CUDA.$fname(Float64, V)
+        function CUDA.$fname(::Type{T}, V::TensorMapSpace) where {T}
+            t = CuTensorMap{T}(undef, V)
+            fill!(t, $felt(T))
+            return t
+        end
+    end
+end
+
+for randfun in (:rand, :randn)
+    randfun! = Symbol(randfun, :!)
+    @eval begin
+        # converting `codomain` and `domain` into `HomSpace`
+        function CUDA.$randfun(codomain::TensorSpace{S},
+                               domain::TensorSpace{S}) where {S<:IndexSpace}
+            return CUDA.$randfun(codomain ← domain)
+        end
+        function CUDA.$randfun(::Type{T}, codomain::TensorSpace{S},
+                               domain::TensorSpace{S}) where {T,S<:IndexSpace}
+            return CUDA.$randfun(T, codomain ← domain)
+        end
+        function CUDA.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                               codomain::TensorSpace{S},
+                               domain::TensorSpace{S}) where {T,S<:IndexSpace}
+            return CUDA.$randfun(rng, T, codomain ← domain)
+        end
+
+        # accepting single `TensorSpace`
+        CUDA.$randfun(codomain::TensorSpace) = CUDA.$randfun(codomain ← one(codomain))
+        function CUDA.$randfun(::Type{T}, codomain::TensorSpace) where {T}
+            return CUDA.$randfun(T, codomain ← one(codomain))
+        end
+        function CUDA.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                               codomain::TensorSpace) where {T}
+            return CUDA.$randfun(rng, T, codomain ← one(domain))
+        end
+
+        # filling in default eltype
+        CUDA.$randfun(V::TensorMapSpace) = CUDA.$randfun(Float64, V)
+        function CUDA.$randfun(rng::Random.AbstractRNG, V::TensorMapSpace)
+            return CUDA.$randfun(rng, Float64, V)
+        end
+
+        # filling in default rng
+        function CUDA.$randfun(::Type{T}, V::TensorMapSpace) where {T}
+            return CUDA.$randfun(Random.default_rng(), T, V)
+        end
+
+        # implementation
+        function CUDA.$randfun(rng::Random.AbstractRNG, ::Type{T},
+                               V::TensorMapSpace) where {T}
+            t = CuTensorMap{T}(undef, V)
+            CUDA.$randfun!(rng, t)
+            return t
+        end
+    end
+end
+
+# converters
+# ----------
+function Base.convert(::Type{CuTensorMap}, d::Dict{Symbol,Any})
+    try
+        codomain = eval(Meta.parse(d[:codomain]))
+        domain = eval(Meta.parse(d[:domain]))
+        data = SectorDict(eval(Meta.parse(c)) => CuArray(b) for (c, b) in d[:data])
+        return CuTensorMap(data, codomain, domain)
+    catch e # sector unknown in TensorKit.jl; user-defined, hopefully accessible in Main
+        codomain = Base.eval(Main, Meta.parse(d[:codomain]))
+        domain = Base.eval(Main, Meta.parse(d[:domain]))
+        data = SectorDict(Base.eval(Main, Meta.parse(c)) => CuArray(b)
+                          for (c, b) in d[:data])
+        return CuTensorMap(data, codomain, domain)
+    end
+end
+
+function Base.convert(::Type{CuTensorMap}, t::AbstractTensorMap)
+    return copy!(CuTensorMap{scalartype(t)}(undef, space(t)), t)
+end
+
+# Scalar implementation
+#-----------------------
+function TensorKit.scalar(t::CuTensorMap)
+    
+    # TODO: should scalar only work if N₁ == N₂ == 0?
+    return @allowscalar dim(codomain(t)) == dim(domain(t)) == 1 ?
+           first(blocks(t))[2][1, 1] : throw(DimensionMismatch())
+end
+
+TensorKit.scalartype(A::StridedCuArray{T}) where {T} = T
+vi_scalartype(::Type{<:CuTensorMap{T}}) where {T} = T
+vi_scalartype(::Type{<:CuArray{T}}) where {T} = T
+
+function TensorKit.similarstoragetype(TT::Type{<:CuTensorMap{TTT,S,N₁,N₂}}, ::Type{T}) where {TTT,T,S,N₁,N₂}
+    return CuVector{T, CUDA.DeviceMemory}
+end
+
+function Base.convert(TT::Type{CuTensorMap{T,S,N₁,N₂}},
+                      t::AbstractTensorMap{<:Any,S,N₁,N₂}) where {T,S,N₁,N₂}
+    if typeof(t) === TT
+        return t
+    else
+        tnew = TT(undef, space(t))
+        return copy!(tnew, t)
+    end
+end
+
+function Base.copy!(tdst::CuTensorMap{T, S, N₁, N₂}, tsrc::CuTensorMap{T, S, N₁, N₂}) where {T, S, N₁, N₂}
+    space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
+    for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
+        copy!(bdst, bsrc)
+    end
+    return tdst
+end
+
+function Base.copy!(tdst::CuTensorMap, tsrc::TensorKit.AdjointTensorMap)
+    space(tdst) == space(tsrc) || throw(SpaceMismatch("$(space(tdst)) ≠ $(space(tsrc))"))
+    for ((c, bdst), (_, bsrc)) in zip(blocks(tdst), blocks(tsrc))
+        copy!(bdst, bsrc)
+    end
+    return tdst
+end
+
+function Base.promote_rule(::Type{<:TT₁},
+                           ::Type{<:TT₂}) where {S,N₁,N₂, TTT₁, TTT₂,
+                                                 TT₁<:CuTensorMap{TTT₁,S,N₁,N₂},
+                                                 TT₂<:CuTensorMap{TTT₂,S,N₁,N₂}}
+    T = TensorKit.VectorInterface.promote_add(TTT₁, TTT₂)
+    return CuTensorMap{T,S,N₁,N₂}
+end
diff --git a/ext/TensorKitChainRulesCoreExt/TensorKitChainRulesCoreExt.jl b/ext/TensorKitChainRulesCoreExt/TensorKitChainRulesCoreExt.jl
index 16c7583d1..36bb4108d 100644
--- a/ext/TensorKitChainRulesCoreExt/TensorKitChainRulesCoreExt.jl
+++ b/ext/TensorKitChainRulesCoreExt/TensorKitChainRulesCoreExt.jl
@@ -3,6 +3,7 @@ module TensorKitChainRulesCoreExt
 using TensorOperations
 using VectorInterface
 using TensorKit
+using TensorKit: foreachblock
 using ChainRulesCore
 using LinearAlgebra
 using TupleTools
@@ -11,6 +12,11 @@ import TensorOperations as TO
 using TensorOperations: promote_contract, tensoralloc_add, tensoralloc_contract
 using VectorInterface: promote_scale, promote_add
 
+using MatrixAlgebraKit
+using MatrixAlgebraKit: TruncationStrategy,
+                        svd_compact_pullback!, eig_full_pullback!, eigh_full_pullback!,
+                        qr_compact_pullback!, lq_compact_pullback!
+
 include("utility.jl")
 include("constructors.jl")
 include("linalg.jl")
diff --git a/ext/TensorKitChainRulesCoreExt/factorizations.jl b/ext/TensorKitChainRulesCoreExt/factorizations.jl
index abfb724bc..0c6924411 100644
--- a/ext/TensorKitChainRulesCoreExt/factorizations.jl
+++ b/ext/TensorKitChainRulesCoreExt/factorizations.jl
@@ -1,42 +1,38 @@
 # Factorizations rules
 # --------------------
-function ChainRulesCore.rrule(::typeof(TensorKit.tsvd!), t::AbstractTensorMap;
-                              trunc::TensorKit.TruncationScheme=TensorKit.NoTruncation(),
-                              p::Real=2,
-                              alg::Union{TensorKit.SVD,TensorKit.SDD}=TensorKit.SDD())
-    U, Σ, V⁺, truncerr = tsvd(t; trunc=TensorKit.NoTruncation(), p=p, alg=alg)
-
-    if !(trunc isa TensorKit.NoTruncation) && !isempty(blocksectors(t))
-        Σdata = TensorKit.SectorDict(c => diag(b) for (c, b) in blocks(Σ))
-
-        truncdim = TensorKit._compute_truncdim(Σdata, trunc, p)
-        truncerr = TensorKit._compute_truncerr(Σdata, truncdim, p)
-
-        SVDdata = TensorKit.SectorDict(c => (block(U, c), Σc, block(V⁺, c))
-                                       for (c, Σc) in Σdata)
-
-        Ũ, Σ̃, Ṽ⁺ = TensorKit._create_svdtensors(t, SVDdata, truncdim)
-    else
-        Ũ, Σ̃, Ṽ⁺ = U, Σ, V⁺
-    end
+for f in (:tsvd, :eig, :eigh)
+    f! = Symbol(f, :!)
+    f_trunc! = f == :tsvd ? :svd_trunc! : Symbol(f, :_trunc!)
+    f_pullback = Symbol(f, :_pullback)
+    f_pullback! = f == :tsvd ? :svd_compact_pullback! : Symbol(f, :_full_pullback!)
+    @eval function ChainRulesCore.rrule(::typeof(TensorKit.$f!), t::AbstractTensorMap;
+                                        trunc::TruncationStrategy=TensorKit.notrunc(),
+                                        kwargs...)
+        # TODO: I think we can use f! here without issues because we don't actually require
+        # the data of `t` anymore.
+        F = $f(t; trunc=TensorKit.notrunc(), kwargs...)
+
+        if trunc != TensorKit.notrunc() && !isempty(blocksectors(t))
+            F′ = MatrixAlgebraKit.truncate!($f_trunc!, F, trunc)
+        else
+            F′ = F
+        end
 
-    function tsvd!_pullback(ΔUSVϵ)
-        ΔU, ΔΣ, ΔV⁺, = unthunk.(ΔUSVϵ)
-        Δt = similar(t)
-        for (c, b) in blocks(Δt)
-            Uc, Σc, V⁺c = block(U, c), block(Σ, c), block(V⁺, c)
-            ΔUc, ΔΣc, ΔV⁺c = block(ΔU, c), block(ΔΣ, c), block(ΔV⁺, c)
-            Σdc = view(Σc, diagind(Σc))
-            ΔΣdc = (ΔΣc isa AbstractZero) ? ΔΣc : view(ΔΣc, diagind(ΔΣc))
-            svd_pullback!(b, Uc, Σdc, V⁺c, ΔUc, ΔΣdc, ΔV⁺c)
+        function $f_pullback(ΔF′)
+            ΔF = unthunk.(ΔF′)
+            Δt = zerovector(t)
+            foreachblock(Δt) do c, (b,)
+                Fc = block.(F, Ref(c))
+                ΔFc = block.(ΔF, Ref(c))
+                $f_pullback!(b, Fc, ΔFc)
+                return nothing
+            end
+            return NoTangent(), Δt
         end
-        return NoTangent(), Δt
-    end
-    function tsvd!_pullback(::Tuple{ZeroTangent,ZeroTangent,ZeroTangent})
-        return NoTangent(), ZeroTangent()
-    end
+        $f_pullback(::Tuple{ZeroTangent,Vararg{ZeroTangent}}) = NoTangent(), ZeroTangent()
 
-    return (Ũ, Σ̃, Ṽ⁺, truncerr), tsvd!_pullback
+        return F′, $f_pullback
+    end
 end
 
 function ChainRulesCore.rrule(::typeof(LinearAlgebra.svdvals!), t::AbstractTensorMap)
@@ -53,50 +49,6 @@ function ChainRulesCore.rrule(::typeof(LinearAlgebra.svdvals!), t::AbstractTenso
     return s, svdvals_pullback
 end
 
-function ChainRulesCore.rrule(::typeof(TensorKit.eig!), t::AbstractTensorMap; kwargs...)
-    D, V = eig(t; kwargs...)
-
-    function eig!_pullback((_ΔD, _ΔV))
-        ΔD, ΔV = unthunk(_ΔD), unthunk(_ΔV)
-        Δt = similar(t)
-        for (c, b) in blocks(Δt)
-            Dc, Vc = block(D, c), block(V, c)
-            ΔDc, ΔVc = block(ΔD, c), block(ΔV, c)
-            Ddc = view(Dc, diagind(Dc))
-            ΔDdc = (ΔDc isa AbstractZero) ? ΔDc : view(ΔDc, diagind(ΔDc))
-            eig_pullback!(b, Ddc, Vc, ΔDdc, ΔVc)
-        end
-        return NoTangent(), Δt
-    end
-    function eig!_pullback(::Tuple{ZeroTangent,ZeroTangent})
-        return NoTangent(), ZeroTangent()
-    end
-
-    return (D, V), eig!_pullback
-end
-
-function ChainRulesCore.rrule(::typeof(TensorKit.eigh!), t::AbstractTensorMap; kwargs...)
-    D, V = eigh(t; kwargs...)
-
-    function eigh!_pullback((_ΔD, _ΔV))
-        ΔD, ΔV = unthunk(_ΔD), unthunk(_ΔV)
-        Δt = similar(t)
-        for (c, b) in blocks(Δt)
-            Dc, Vc = block(D, c), block(V, c)
-            ΔDc, ΔVc = block(ΔD, c), block(ΔV, c)
-            Ddc = view(Dc, diagind(Dc))
-            ΔDdc = (ΔDc isa AbstractZero) ? ΔDc : view(ΔDc, diagind(ΔDc))
-            eigh_pullback!(b, Ddc, Vc, ΔDdc, ΔVc)
-        end
-        return NoTangent(), Δt
-    end
-    function eigh!_pullback(::Tuple{ZeroTangent,ZeroTangent})
-        return NoTangent(), ZeroTangent()
-    end
-
-    return (D, V), eigh!_pullback
-end
-
 function ChainRulesCore.rrule(::typeof(LinearAlgebra.eigvals!), t::AbstractTensorMap;
                               sortby=nothing, kwargs...)
     @assert sortby === nothing "only `sortby=nothing` is supported"
@@ -115,367 +67,38 @@ end
 function ChainRulesCore.rrule(::typeof(leftorth!), t::AbstractTensorMap; alg=QRpos())
     alg isa TensorKit.QR || alg isa TensorKit.QRpos ||
         error("only `alg=QR()` and `alg=QRpos()` are supported")
-    Q, R = leftorth(t; alg)
-    function leftorth!_pullback((_ΔQ, _ΔR))
-        ΔQ, ΔR = unthunk(_ΔQ), unthunk(_ΔR)
-        Δt = similar(t)
-        for (c, b) in blocks(Δt)
-            qr_pullback!(b, block(Q, c), block(R, c), block(ΔQ, c), block(ΔR, c))
+    QR = leftorth(t; alg)
+    function leftorth!_pullback(ΔQR′)
+        ΔQR = unthunk.(ΔQR′)
+        Δt = zerovector(t)
+        foreachblock(Δt) do c, (b,)
+            QRc = block.(QR, Ref(c))
+            ΔQRc = block.(ΔQR, Ref(c))
+            qr_compact_pullback!(b, QRc, ΔQRc)
+            return nothing
         end
         return NoTangent(), Δt
     end
-    leftorth!_pullback(::Tuple{ZeroTangent,ZeroTangent}) = NoTangent(), ZeroTangent()
-    return (Q, R), leftorth!_pullback
+    leftorth!_pullback(::NTuple{2,ZeroTangent}) = NoTangent(), ZeroTangent()
+
+    return QR, leftorth!_pullback
 end
 
 function ChainRulesCore.rrule(::typeof(rightorth!), t::AbstractTensorMap; alg=LQpos())
     alg isa TensorKit.LQ || alg isa TensorKit.LQpos ||
         error("only `alg=LQ()` and `alg=LQpos()` are supported")
-    L, Q = rightorth(t; alg)
-    function rightorth!_pullback((_ΔL, _ΔQ))
-        ΔL, ΔQ = unthunk(_ΔL), unthunk(_ΔQ)
-        Δt = similar(t)
-        for (c, b) in blocks(Δt)
-            lq_pullback!(b, block(L, c), block(Q, c), block(ΔL, c), block(ΔQ, c))
+    LQ = rightorth(t; alg)
+    function rightorth!_pullback(ΔLQ′)
+        ΔLQ = unthunk(ΔLQ′)
+        Δt = zerovector(t)
+        foreachblock(Δt) do c, (b,)
+            LQc = block.(LQ, Ref(c))
+            ΔLQc = block.(ΔLQ, Ref(c))
+            lq_compact_pullback!(b, LQc, ΔLQc)
+            return nothing
         end
         return NoTangent(), Δt
     end
-    rightorth!_pullback(::Tuple{ZeroTangent,ZeroTangent}) = NoTangent(), ZeroTangent()
-    return (L, Q), rightorth!_pullback
-end
-
-# Corresponding matrix factorisations: implemented as mutating methods
-# ---------------------------------------------------------------------
-# helper routines
-safe_inv(a, tol) = abs(a) < tol ? zero(a) : inv(a)
-
-function lowertriangularind(A::AbstractMatrix)
-    m, n = size(A)
-    I = Vector{Int}(undef, div(m * (m - 1), 2) + m * (n - m))
-    offset = 0
-    for j in 1:n
-        r = (j + 1):m
-        I[offset .- j .+ r] = (j - 1) * m .+ r
-        offset += length(r)
-    end
-    return I
-end
-
-function uppertriangularind(A::AbstractMatrix)
-    m, n = size(A)
-    I = Vector{Int}(undef, div(m * (m - 1), 2) + m * (n - m))
-    offset = 0
-    for i in 1:m
-        r = (i + 1):n
-        I[offset .- i .+ r] = i .+ m .* (r .- 1)
-        offset += length(r)
-    end
-    return I
-end
-
-# SVD_pullback: pullback implementation for general (possibly truncated) SVD
-#
-# Arguments are U, S and Vd of full (non-truncated, but still thin) SVD, as well as
-# cotangent ΔU, ΔS, ΔVd variables of truncated SVD
-# 
-# Checks whether the cotangent variables are such that they would couple to gauge-dependent
-# degrees of freedom (phases of singular vectors), and prints a warning if this is the case
-#
-# An implementation that only uses U, S, and Vd from truncated SVD is also possible, but
-# requires solving a Sylvester equation, which does not seem to be supported on GPUs.
-#
-# Other implementation considerations for GPU compatibility:
-# no scalar indexing, lots of broadcasting and views
-#
-function svd_pullback!(ΔA::AbstractMatrix, U::AbstractMatrix, S::AbstractVector,
-                       Vd::AbstractMatrix, ΔU, ΔS, ΔVd;
-                       tol::Real=default_pullback_gaugetol(S))
-
-    # Basic size checks and determination
-    m, n = size(U, 1), size(Vd, 2)
-    size(U, 2) == size(Vd, 1) == length(S) == min(m, n) || throw(DimensionMismatch())
-    p = -1
-    if !(ΔU isa AbstractZero)
-        m == size(ΔU, 1) || throw(DimensionMismatch())
-        p = size(ΔU, 2)
-    end
-    if !(ΔVd isa AbstractZero)
-        n == size(ΔVd, 2) || throw(DimensionMismatch())
-        if p == -1
-            p = size(ΔVd, 1)
-        else
-            p == size(ΔVd, 1) || throw(DimensionMismatch())
-        end
-    end
-    if !(ΔS isa AbstractZero)
-        if p == -1
-            p = length(ΔS)
-        else
-            p == length(ΔS) || throw(DimensionMismatch())
-        end
-    end
-    Up = view(U, :, 1:p)
-    Vp = view(Vd, 1:p, :)'
-    Sp = view(S, 1:p)
-
-    # rank
-    r = searchsortedlast(S, tol; rev=true)
-
-    # compute antihermitian part of projection of ΔU and ΔV onto U and V
-    # also already subtract this projection from ΔU and ΔV
-    if !(ΔU isa AbstractZero)
-        UΔU = Up' * ΔU
-        aUΔU = rmul!(UΔU - UΔU', 1 / 2)
-        if m > p
-            ΔU -= Up * UΔU
-        end
-    else
-        aUΔU = fill!(similar(U, (p, p)), 0)
-    end
-    if !(ΔVd isa AbstractZero)
-        VΔV = Vp' * ΔVd'
-        aVΔV = rmul!(VΔV - VΔV', 1 / 2)
-        if n > p
-            ΔVd -= VΔV' * Vp'
-        end
-    else
-        aVΔV = fill!(similar(Vd, (p, p)), 0)
-    end
-
-    # check whether cotangents arise from gauge-invariance objective function
-    mask = abs.(Sp' .- Sp) .< tol
-    Δgauge = norm(view(aUΔU, mask) + view(aVΔV, mask), Inf)
-    if p > r
-        rprange = (r + 1):p
-        Δgauge = max(Δgauge, norm(view(aUΔU, rprange, rprange), Inf))
-        Δgauge = max(Δgauge, norm(view(aVΔV, rprange, rprange), Inf))
-    end
-    Δgauge < tol ||
-        @warn "`svd` cotangents sensitive to gauge choice: (|Δgauge| = $Δgauge)"
-
-    UdΔAV = (aUΔU .+ aVΔV) .* safe_inv.(Sp' .- Sp, tol) .+
-            (aUΔU .- aVΔV) .* safe_inv.(Sp' .+ Sp, tol)
-    if !(ΔS isa ZeroTangent)
-        UdΔAV[diagind(UdΔAV)] .+= real.(ΔS)
-        # in principle, ΔS is real, but maybe not if coming from an anyonic tensor
-    end
-    mul!(ΔA, Up, UdΔAV * Vp')
-
-    if r > p # contribution from truncation
-        Ur = view(U, :, (p + 1):r)
-        Vr = view(Vd, (p + 1):r, :)'
-        Sr = view(S, (p + 1):r)
-
-        if !(ΔU isa AbstractZero)
-            UrΔU = Ur' * ΔU
-            if m > r
-                ΔU -= Ur * UrΔU # subtract this part from ΔU
-            end
-        else
-            UrΔU = fill!(similar(U, (r - p, p)), 0)
-        end
-        if !(ΔVd isa AbstractZero)
-            VrΔV = Vr' * ΔVd'
-            if n > r
-                ΔVd -= VrΔV' * Vr' # subtract this part from ΔV
-            end
-        else
-            VrΔV = fill!(similar(Vd, (r - p, p)), 0)
-        end
-
-        X = (1 // 2) .* ((UrΔU .+ VrΔV) .* safe_inv.(Sp' .- Sr, tol) .+
-                         (UrΔU .- VrΔV) .* safe_inv.(Sp' .+ Sr, tol))
-        Y = (1 // 2) .* ((UrΔU .+ VrΔV) .* safe_inv.(Sp' .- Sr, tol) .-
-                         (UrΔU .- VrΔV) .* safe_inv.(Sp' .+ Sr, tol))
-
-        # ΔA += Ur * X * Vp' + Up * Y' * Vr'
-        mul!(ΔA, Ur, X * Vp', 1, 1)
-        mul!(ΔA, Up * Y', Vr', 1, 1)
-    end
-
-    if m > max(r, p) && !(ΔU isa AbstractZero) # remaining ΔU is already orthogonal to U[:,1:max(p,r)]
-        # ΔA += (ΔU .* safe_inv.(Sp', tol)) * Vp'
-        mul!(ΔA, ΔU .* safe_inv.(Sp', tol), Vp', 1, 1)
-    end
-    if n > max(r, p) && !(ΔVd isa AbstractZero) # remaining ΔV is already orthogonal to V[:,1:max(p,r)]
-        # ΔA += U * (safe_inv.(Sp, tol) .* ΔVd)
-        mul!(ΔA, Up, safe_inv.(Sp, tol) .* ΔVd, 1, 1)
-    end
-    return ΔA
-end
-
-function eig_pullback!(ΔA::AbstractMatrix, D::AbstractVector, V::AbstractMatrix, ΔD, ΔV;
-                       tol::Real=default_pullback_gaugetol(D))
-
-    # Basic size checks and determination
-    n = LinearAlgebra.checksquare(V)
-    n == length(D) || throw(DimensionMismatch())
-
-    if !(ΔV isa AbstractZero)
-        VdΔV = V' * ΔV
-
-        mask = abs.(transpose(D) .- D) .< tol
-        Δgauge = norm(view(VdΔV, mask), Inf)
-        Δgauge < tol ||
-            @warn "`eig` cotangents sensitive to gauge choice: (|Δgauge| = $Δgauge)"
-
-        VdΔV .*= conj.(safe_inv.(transpose(D) .- D, tol))
-
-        if !(ΔD isa AbstractZero)
-            view(VdΔV, diagind(VdΔV)) .+= ΔD
-        end
-        PΔV = V' \ VdΔV
-        if eltype(ΔA) <: Real
-            ΔAc = mul!(VdΔV, PΔV, V') # recycle VdΔV memory
-            ΔA .= real.(ΔAc)
-        else
-            mul!(ΔA, PΔV, V')
-        end
-    else
-        PΔV = V' \ Diagonal(ΔD)
-        if eltype(ΔA) <: Real
-            ΔAc = PΔV * V'
-            ΔA .= real.(ΔAc)
-        else
-            mul!(ΔA, PΔV, V')
-        end
-    end
-    return ΔA
-end
-
-function eigh_pullback!(ΔA::AbstractMatrix, D::AbstractVector, V::AbstractMatrix, ΔD, ΔV;
-                        tol::Real=default_pullback_gaugetol(D))
-
-    # Basic size checks and determination
-    n = LinearAlgebra.checksquare(V)
-    n == length(D) || throw(DimensionMismatch())
-
-    if !(ΔV isa AbstractZero)
-        VdΔV = V' * ΔV
-        aVdΔV = rmul!(VdΔV - VdΔV', 1 / 2)
-
-        mask = abs.(D' .- D) .< tol
-        Δgauge = norm(view(aVdΔV, mask))
-        Δgauge < tol ||
-            @warn "`eigh` cotangents sensitive to gauge choice: (|Δgauge| = $Δgauge)"
-
-        aVdΔV .*= safe_inv.(D' .- D, tol)
-
-        if !(ΔD isa AbstractZero)
-            view(aVdΔV, diagind(aVdΔV)) .+= real.(ΔD)
-            # in principle, ΔD is real, but maybe not if coming from an anyonic tensor
-        end
-        # recylce VdΔV space
-        mul!(ΔA, mul!(VdΔV, V, aVdΔV), V')
-    else
-        mul!(ΔA, V * Diagonal(ΔD), V')
-    end
-    return ΔA
-end
-
-function qr_pullback!(ΔA::AbstractMatrix, Q::AbstractMatrix, R::AbstractMatrix, ΔQ, ΔR;
-                      tol::Real=default_pullback_gaugetol(R))
-    Rd = view(R, diagind(R))
-    p = something(findlast(≥(tol) ∘ abs, Rd), 0)
-    m, n = size(R)
-
-    Q1 = view(Q, :, 1:p)
-    R1 = view(R, 1:p, :)
-    R11 = view(R, 1:p, 1:p)
-
-    ΔA1 = view(ΔA, :, 1:p)
-    ΔQ1 = view(ΔQ, :, 1:p)
-    ΔR1 = view(ΔR, 1:p, :)
-
-    M = similar(R, (p, p))
-    ΔR isa AbstractZero || mul!(M, ΔR1, R1')
-    ΔQ isa AbstractZero || mul!(M, Q1', ΔQ1, -1, !(ΔR isa AbstractZero))
-    view(M, lowertriangularind(M)) .= conj.(view(M, uppertriangularind(M)))
-    if eltype(M) <: Complex
-        Md = view(M, diagind(M))
-        Md .= real.(Md)
-    end
-
-    ΔA1 .= ΔQ1
-    mul!(ΔA1, Q1, M, +1, 1)
-
-    if n > p
-        R12 = view(R, 1:p, (p + 1):n)
-        ΔA2 = view(ΔA, :, (p + 1):n)
-        ΔR12 = view(ΔR, 1:p, (p + 1):n)
-
-        if ΔR isa AbstractZero
-            ΔA2 .= zero(eltype(ΔA))
-        else
-            mul!(ΔA2, Q1, ΔR12)
-            mul!(ΔA1, ΔA2, R12', -1, 1)
-        end
-    end
-    if m > p && !(ΔQ isa AbstractZero) # case where R is not full rank
-        Q2 = view(Q, :, (p + 1):m)
-        ΔQ2 = view(ΔQ, :, (p + 1):m)
-        Q1dΔQ2 = Q1' * ΔQ2
-        Δgauge = norm(mul!(copy(ΔQ2), Q1, Q1dΔQ2, -1, 1), Inf)
-        Δgauge < tol ||
-            @warn "`qr` cotangents sensitive to gauge choice: (|Δgauge| = $Δgauge)"
-        mul!(ΔA1, Q2, Q1dΔQ2', -1, 1)
-    end
-    rdiv!(ΔA1, UpperTriangular(R11)')
-    return ΔA
-end
-
-function lq_pullback!(ΔA::AbstractMatrix, L::AbstractMatrix, Q::AbstractMatrix, ΔL, ΔQ;
-                      tol::Real=default_pullback_gaugetol(L))
-    Ld = view(L, diagind(L))
-    p = something(findlast(≥(tol) ∘ abs, Ld), 0)
-    m, n = size(L)
-
-    L1 = view(L, :, 1:p)
-    L11 = view(L, 1:p, 1:p)
-    Q1 = view(Q, 1:p, :)
-
-    ΔA1 = view(ΔA, 1:p, :)
-    ΔQ1 = view(ΔQ, 1:p, :)
-    ΔL1 = view(ΔL, :, 1:p)
-
-    M = similar(L, (p, p))
-    ΔL isa AbstractZero || mul!(M, L1', ΔL1)
-    ΔQ isa AbstractZero || mul!(M, ΔQ1, Q1', -1, !(ΔL isa AbstractZero))
-    view(M, uppertriangularind(M)) .= conj.(view(M, lowertriangularind(M)))
-    if eltype(M) <: Complex
-        Md = view(M, diagind(M))
-        Md .= real.(Md)
-    end
-
-    ΔA1 .= ΔQ1
-    mul!(ΔA1, M, Q1, +1, 1)
-
-    if m > p
-        L21 = view(L, (p + 1):m, 1:p)
-        ΔA2 = view(ΔA, (p + 1):m, :)
-        ΔL21 = view(ΔL, (p + 1):m, 1:p)
-
-        if ΔL isa AbstractZero
-            ΔA2 .= zero(eltype(ΔA))
-        else
-            mul!(ΔA2, ΔL21, Q1)
-            mul!(ΔA1, L21', ΔA2, -1, 1)
-        end
-    end
-    if n > p && !(ΔQ isa AbstractZero) # case where R is not full rank
-        Q2 = view(Q, (p + 1):n, :)
-        ΔQ2 = view(ΔQ, (p + 1):n, :)
-        ΔQ2Q1d = ΔQ2 * Q1'
-        Δgauge = norm(mul!(copy(ΔQ2), ΔQ2Q1d, Q1, -1, 1))
-        Δgauge < tol ||
-            @warn "`lq` cotangents sensitive to gauge choice: (|Δgauge| = $Δgauge)"
-        mul!(ΔA1, ΔQ2Q1d', Q2, -1, 1)
-    end
-    ldiv!(LowerTriangular(L11)', ΔA1)
-    return ΔA
-end
-
-function default_pullback_gaugetol(a)
-    n = norm(a, Inf)
-    return eps(eltype(n))^(3 / 4) * max(n, one(n))
+    rightorth!_pullback(::NTuple{2,ZeroTangent}) = NoTangent(), ZeroTangent()
+    return LQ, rightorth!_pullback
 end
diff --git a/src/TensorKit.jl b/src/TensorKit.jl
index 6f7467e37..4301fe88b 100644
--- a/src/TensorKit.jl
+++ b/src/TensorKit.jl
@@ -31,7 +31,7 @@ export TruncationScheme
 export SpaceMismatch, SectorMismatch, IndexError # error types
 
 # general vector space methods
-export space, field, dual, dim, reduceddim, dims, fuse, flip, isdual, oplus,
+export space, field, dual, dim, reduceddim, dims, fuse, flip, isdual, oplus, ominus,
        insertleftunit, insertrightunit, removeunit
 
 # partial order for vector spaces
@@ -47,7 +47,7 @@ export ZNSpace, SU2Irrep, U1Irrep, CU1Irrep
 #        bendleft, bendright, foldleft, foldright, cycleclockwise, cycleanticlockwise
 
 # some unicode
-export ⊕, ⊗, ×, ⊠, ℂ, ℝ, ℤ, ←, →, ≾, ≿, ≅, ≺, ≻
+export ⊕, ⊗, ⊖, ×, ⊠, ℂ, ℝ, ℤ, ←, →, ≾, ≿, ≅, ≺, ≻
 export ℤ₂, ℤ₃, ℤ₄, U₁, SU, SU₂, CU₁
 export fℤ₂, fU₁, fSU₂
 export ℤ₂Space, ℤ₃Space, ℤ₄Space, U₁Space, CU₁Space, SU₂Space
@@ -72,8 +72,9 @@ export inner, dot, norm, normalize, normalize!, tr
 export mul!, lmul!, rmul!, adjoint!, pinv, axpy!, axpby!
 export leftorth, rightorth, leftnull, rightnull,
        leftorth!, rightorth!, leftnull!, rightnull!,
+       left_polar, left_polar!, right_polar, right_polar!,
        tsvd!, tsvd, eigen, eigen!, eig, eig!, eigh, eigh!, exp, exp!,
-       isposdef, isposdef!, ishermitian, sylvester, rank, cond
+       isposdef, isposdef!, ishermitian, isisometry, isunitary, sylvester, rank, cond
 export braid, braid!, permute, permute!, transpose, transpose!, twist, twist!, repartition,
        repartition!
 export catdomain, catcodomain
@@ -104,7 +105,11 @@ using TensorOperations: TensorOperations, @tensor, @tensoropt, @ncon, ncon
 using TensorOperations: IndexTuple, Index2Tuple, linearize, AbstractBackend
 const TO = TensorOperations
 
+using MatrixAlgebraKit: MatrixAlgebraKit as MAK
+
 using LRUCache
+using OhMyThreads
+using ScopedValues
 
 using TensorKitSectors
 import TensorKitSectors: dim, BraidingStyle, FusionStyle, ⊠, ⊗
@@ -117,7 +122,7 @@ using Base: @boundscheck, @propagate_inbounds, @constprop,
             SizeUnknown, HasLength, HasShape, IsInfinite, EltypeUnknown, HasEltype
 using Base.Iterators: product, filter
 
-using LinearAlgebra: LinearAlgebra
+using LinearAlgebra: LinearAlgebra, BlasFloat
 using LinearAlgebra: norm, dot, normalize, normalize!, tr,
                      axpy!, axpby!, lmul!, rmul!, mul!, ldiv!, rdiv!,
                      adjoint, adjoint!, transpose, transpose!,
@@ -125,6 +130,7 @@ using LinearAlgebra: norm, dot, normalize, normalize!, tr,
                      eigen, eigen!, svd, svd!,
                      isposdef, isposdef!, ishermitian, rank, cond,
                      Diagonal, Hermitian
+using MatrixAlgebraKit
 
 import Base.Meta
 
@@ -202,6 +208,7 @@ end
 #-------------------------------------
 # general definitions
 include("tensors/abstracttensor.jl")
+include("tensors/backends.jl")
 include("tensors/blockiterator.jl")
 include("tensors/tensor.jl")
 include("tensors/adjoint.jl")
@@ -211,10 +218,13 @@ include("tensors/tensoroperations.jl")
 include("tensors/treetransformers.jl")
 include("tensors/indexmanipulations.jl")
 include("tensors/diagonal.jl")
-include("tensors/truncation.jl")
-include("tensors/factorizations.jl")
 include("tensors/braidingtensor.jl")
 
+include("tensors/factorizations/factorizations.jl")
+using .Factorizations
+# include("tensors/factorizations/matrixalgebrakit.jl")
+# include("tensors/truncation.jl")
+
 # # Planar macros and related functionality
 # #-----------------------------------------
 @nospecialize
diff --git a/src/auxiliary/deprecate.jl b/src/auxiliary/deprecate.jl
index fa7667b2b..b235cbd7c 100644
--- a/src/auxiliary/deprecate.jl
+++ b/src/auxiliary/deprecate.jl
@@ -1,29 +1,29 @@
 import Base: transpose
 
 #! format: off
-Base.@deprecate(permute(t::AbstractTensorMap, p1::IndexTuple, p2::IndexTuple; copy::Bool=false),
-                permute(t, (p1, p2); copy=copy))
-Base.@deprecate(transpose(t::AbstractTensorMap, p1::IndexTuple, p2::IndexTuple; copy::Bool=false),
-                transpose(t, (p1, p2); copy=copy))
-Base.@deprecate(braid(t::AbstractTensorMap, p1::IndexTuple, p2::IndexTuple, levels; copy::Bool=false),
-                braid(t, (p1, p2), levels; copy=copy))
-
-Base.@deprecate(tsvd(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
-                tsvd(t, (p₁, p₂); kwargs...))
-Base.@deprecate(leftorth(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
-                leftorth(t, (p₁, p₂); kwargs...))
-Base.@deprecate(rightorth(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
-                rightorth(t, (p₁, p₂); kwargs...))
-Base.@deprecate(leftnull(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
-                leftnull(t, (p₁, p₂); kwargs...))
-Base.@deprecate(rightnull(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
-                rightnull(t, (p₁, p₂); kwargs...))
-Base.@deprecate(LinearAlgebra.eigen(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
-                LinearAlgebra.eigen(t, (p₁, p₂); kwargs...), false)
-Base.@deprecate(eig(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
-                eig(t, (p₁, p₂); kwargs...))
-Base.@deprecate(eigh(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
-                eigh(t, (p₁, p₂); kwargs...))
+# Base.@deprecate(permute(t::AbstractTensorMap, p1::IndexTuple, p2::IndexTuple; copy::Bool=false),
+#                 permute(t, (p1, p2); copy=copy))
+# Base.@deprecate(transpose(t::AbstractTensorMap, p1::IndexTuple, p2::IndexTuple; copy::Bool=false),
+#                 transpose(t, (p1, p2); copy=copy))
+# Base.@deprecate(braid(t::AbstractTensorMap, p1::IndexTuple, p2::IndexTuple, levels; copy::Bool=false),
+#                 braid(t, (p1, p2), levels; copy=copy))
+
+# Base.@deprecate(tsvd(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
+#                 tsvd(t, (p₁, p₂); kwargs...))
+# Base.@deprecate(leftorth(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
+#                 leftorth(t, (p₁, p₂); kwargs...))
+# Base.@deprecate(rightorth(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
+#                 rightorth(t, (p₁, p₂); kwargs...))
+# Base.@deprecate(leftnull(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
+#                 leftnull(t, (p₁, p₂); kwargs...))
+# Base.@deprecate(rightnull(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
+#                 rightnull(t, (p₁, p₂); kwargs...))
+# Base.@deprecate(LinearAlgebra.eigen(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
+#                 LinearAlgebra.eigen(t, (p₁, p₂); kwargs...), false)
+# Base.@deprecate(eig(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
+#                 eig(t, (p₁, p₂); kwargs...))
+# Base.@deprecate(eigh(t::AbstractTensorMap, p₁::IndexTuple, p₂::IndexTuple; kwargs...),
+#                 eigh(t, (p₁, p₂); kwargs...))
 
 for f in (:rand, :randn, :zeros, :ones)
     @eval begin
diff --git a/src/auxiliary/linalg.jl b/src/auxiliary/linalg.jl
index 4277be67f..8a20eb8ac 100644
--- a/src/auxiliary/linalg.jl
+++ b/src/auxiliary/linalg.jl
@@ -46,341 +46,5 @@ Base.adjoint(alg::Union{SVD,SDD,Polar}) = alg
 const OFA = OrthogonalFactorizationAlgorithm
 const SVDAlg = Union{SVD,SDD}
 
-# Matrix algebra: entrypoint for calling matrix methods from within tensor implementations
-#------------------------------------------------------------------------------------------
-module MatrixAlgebra
-# TODO: all methods tha twe define here will need an extended version for CuMatrix in the
-# CUDA package extension.
-
-# TODO: other methods to include here:
-# mul! (possibly call matmul! instead)
-# adjoint!
-# sylvester
-# exp!
-# schur!?
-# 
-
-using LinearAlgebra
-using LinearAlgebra: BlasFloat, BlasReal, BlasComplex, checksquare
-
-using ..TensorKit: OrthogonalFactorizationAlgorithm,
-                   QL, QLpos, QR, QRpos, LQ, LQpos, RQ, RQpos, SVD, SDD, Polar
-
-# only defined in >v1.7
-@static if VERSION < v"1.7-"
-    _rf_findmax((fm, im), (fx, ix)) = isless(fm, fx) ? (fx, ix) : (fm, im)
-    _argmax(f, domain) = mapfoldl(x -> (f(x), x), _rf_findmax, domain)[2]
-else
-    _argmax(f, domain) = argmax(f, domain)
-end
-
-# TODO: define for CuMatrix if we support this
-function one!(A::StridedMatrix)
-    length(A) > 0 || return A
-    copyto!(A, LinearAlgebra.I)
-    return A
-end
-
 safesign(s::Real) = ifelse(s < zero(s), -one(s), +one(s))
 safesign(s::Complex) = ifelse(iszero(s), one(s), s / abs(s))
-
-function leftorth!(A::StridedMatrix{<:BlasFloat}, alg::Union{QR,QRpos}, atol::Real)
-    iszero(atol) || throw(ArgumentError("nonzero atol not supported by $alg"))
-    m, n = size(A)
-    k = min(m, n)
-    A, T = LAPACK.geqrt!(A, min(minimum(size(A)), 36))
-    Q = similar(A, m, k)
-    for j in 1:k
-        for i in 1:m
-            Q[i, j] = i == j
-        end
-    end
-    Q = LAPACK.gemqrt!('L', 'N', A, T, Q)
-    R = triu!(A[1:k, :])
-
-    if isa(alg, QRpos)
-        @inbounds for j in 1:k
-            s = safesign(R[j, j])
-            @simd for i in 1:m
-                Q[i, j] *= s
-            end
-        end
-        @inbounds for j in size(R, 2):-1:1
-            for i in 1:min(k, j)
-                R[i, j] = R[i, j] * conj(safesign(R[i, i]))
-            end
-        end
-    end
-    return Q, R
-end
-
-function leftorth!(A::StridedMatrix{<:BlasFloat}, alg::Union{QL,QLpos}, atol::Real)
-    iszero(atol) || throw(ArgumentError("nonzero atol not supported by $alg"))
-    m, n = size(A)
-    @assert m >= n
-
-    nhalf = div(n, 2)
-    #swap columns in A
-    @inbounds for j in 1:nhalf, i in 1:m
-        A[i, j], A[i, n + 1 - j] = A[i, n + 1 - j], A[i, j]
-    end
-    Q, R = leftorth!(A, isa(alg, QL) ? QR() : QRpos(), atol)
-
-    #swap columns in Q
-    @inbounds for j in 1:nhalf, i in 1:m
-        Q[i, j], Q[i, n + 1 - j] = Q[i, n + 1 - j], Q[i, j]
-    end
-    #swap rows and columns in R
-    @inbounds for j in 1:nhalf, i in 1:n
-        R[i, j], R[n + 1 - i, n + 1 - j] = R[n + 1 - i, n + 1 - j], R[i, j]
-    end
-    if isodd(n)
-        j = nhalf + 1
-        @inbounds for i in 1:nhalf
-            R[i, j], R[n + 1 - i, j] = R[n + 1 - i, j], R[i, j]
-        end
-    end
-    return Q, R
-end
-
-function leftorth!(A::StridedMatrix{<:BlasFloat}, alg::Union{SVD,SDD,Polar}, atol::Real)
-    U, S, V = alg isa SVD ? LAPACK.gesvd!('S', 'S', A) : LAPACK.gesdd!('S', A)
-    if isa(alg, Union{SVD,SDD})
-        n = count(s -> s .> atol, S)
-        if n != length(S)
-            return U[:, 1:n], lmul!(Diagonal(S[1:n]), V[1:n, :])
-        else
-            return U, lmul!(Diagonal(S), V)
-        end
-    else
-        iszero(atol) || throw(ArgumentError("nonzero atol not supported by $alg"))
-        # TODO: check Lapack to see if we can recycle memory of A
-        Q = mul!(A, U, V)
-        Sq = map!(sqrt, S, S)
-        SqV = lmul!(Diagonal(Sq), V)
-        R = SqV' * SqV
-        return Q, R
-    end
-end
-
-function leftnull!(A::StridedMatrix{<:BlasFloat}, alg::Union{QR,QRpos}, atol::Real)
-    iszero(atol) || throw(ArgumentError("nonzero atol not supported by $alg"))
-    m, n = size(A)
-    m >= n || throw(ArgumentError("no null space if less rows than columns"))
-
-    A, T = LAPACK.geqrt!(A, min(minimum(size(A)), 36))
-    N = similar(A, m, max(0, m - n))
-    fill!(N, 0)
-    for k in 1:(m - n)
-        N[n + k, k] = 1
-    end
-    return N = LAPACK.gemqrt!('L', 'N', A, T, N)
-end
-
-function leftnull!(A::StridedMatrix{<:BlasFloat}, alg::Union{SVD,SDD}, atol::Real)
-    size(A, 2) == 0 && return one!(similar(A, (size(A, 1), size(A, 1))))
-    U, S, V = alg isa SVD ? LAPACK.gesvd!('A', 'N', A) : LAPACK.gesdd!('A', A)
-    indstart = count(>(atol), S) + 1
-    return U[:, indstart:end]
-end
-
-function rightorth!(A::StridedMatrix{<:BlasFloat}, alg::Union{LQ,LQpos,RQ,RQpos},
-                    atol::Real)
-    iszero(atol) || throw(ArgumentError("nonzero atol not supported by $alg"))
-    # TODO: geqrfp seems a bit slower than geqrt in the intermediate region around
-    # matrix size 100, which is the interesting region. => Investigate and fix
-    m, n = size(A)
-    k = min(m, n)
-    At = transpose!(similar(A, n, m), A)
-
-    if isa(alg, RQ) || isa(alg, RQpos)
-        @assert m <= n
-
-        mhalf = div(m, 2)
-        # swap columns in At
-        @inbounds for j in 1:mhalf, i in 1:n
-            At[i, j], At[i, m + 1 - j] = At[i, m + 1 - j], At[i, j]
-        end
-        Qt, Rt = leftorth!(At, isa(alg, RQ) ? QR() : QRpos(), atol)
-
-        @inbounds for j in 1:mhalf, i in 1:n
-            Qt[i, j], Qt[i, m + 1 - j] = Qt[i, m + 1 - j], Qt[i, j]
-        end
-        @inbounds for j in 1:mhalf, i in 1:m
-            Rt[i, j], Rt[m + 1 - i, m + 1 - j] = Rt[m + 1 - i, m + 1 - j], Rt[i, j]
-        end
-        if isodd(m)
-            j = mhalf + 1
-            @inbounds for i in 1:mhalf
-                Rt[i, j], Rt[m + 1 - i, j] = Rt[m + 1 - i, j], Rt[i, j]
-            end
-        end
-        Q = transpose!(A, Qt)
-        R = transpose!(similar(A, (m, m)), Rt) # TODO: efficient in place
-        return R, Q
-    else
-        Qt, Lt = leftorth!(At, alg', atol)
-        if m > n
-            L = transpose!(A, Lt)
-            Q = transpose!(similar(A, (n, n)), Qt) # TODO: efficient in place
-        else
-            Q = transpose!(A, Qt)
-            L = transpose!(similar(A, (m, m)), Lt) # TODO: efficient in place
-        end
-        return L, Q
-    end
-end
-
-function rightorth!(A::StridedMatrix{<:BlasFloat}, alg::Union{SVD,SDD,Polar}, atol::Real)
-    U, S, V = alg isa SVD ? LAPACK.gesvd!('S', 'S', A) : LAPACK.gesdd!('S', A)
-    if isa(alg, Union{SVD,SDD})
-        n = count(s -> s .> atol, S)
-        if n != length(S)
-            return rmul!(U[:, 1:n], Diagonal(S[1:n])), V[1:n, :]
-        else
-            return rmul!(U, Diagonal(S)), V
-        end
-    else
-        iszero(atol) || throw(ArgumentError("nonzero atol not supported by $alg"))
-        Q = mul!(A, U, V)
-        Sq = map!(sqrt, S, S)
-        USq = rmul!(U, Diagonal(Sq))
-        L = USq * USq'
-        return L, Q
-    end
-end
-
-function rightnull!(A::StridedMatrix{<:BlasFloat}, alg::Union{LQ,LQpos}, atol::Real)
-    iszero(atol) || throw(ArgumentError("nonzero atol not supported by $alg"))
-    m, n = size(A)
-    k = min(m, n)
-    At = adjoint!(similar(A, n, m), A)
-    At, T = LAPACK.geqrt!(At, min(k, 36))
-    N = similar(A, max(n - m, 0), n)
-    fill!(N, 0)
-    for k in 1:(n - m)
-        N[k, m + k] = 1
-    end
-    return N = LAPACK.gemqrt!('R', eltype(At) <: Real ? 'T' : 'C', At, T, N)
-end
-
-function rightnull!(A::StridedMatrix{<:BlasFloat}, alg::Union{SVD,SDD}, atol::Real)
-    size(A, 1) == 0 && return one!(similar(A, (size(A, 2), size(A, 2))))
-    U, S, V = alg isa SVD ? LAPACK.gesvd!('N', 'A', A) : LAPACK.gesdd!('A', A)
-    indstart = count(>(atol), S) + 1
-    return V[indstart:end, :]
-end
-
-function svd!(A::StridedMatrix{T}, alg::Union{SVD,SDD}) where {T<:BlasFloat}
-    # fix another type instability in LAPACK wrappers
-    TT = Tuple{Matrix{T},Vector{real(T)},Matrix{T}}
-    U, S, V = alg isa SVD ? LAPACK.gesvd!('S', 'S', A)::TT : LAPACK.gesdd!('S', A)::TT
-    return U, S, V
-end
-
-function eig!(A::StridedMatrix{T}; permute::Bool=true, scale::Bool=true) where {T<:BlasReal}
-    n = checksquare(A)
-    n == 0 && return zeros(Complex{T}, 0), zeros(Complex{T}, 0, 0)
-
-    A, DR, DI, VL, VR, _ = LAPACK.geevx!(permute ? (scale ? 'B' : 'P') :
-                                         (scale ? 'S' : 'N'), 'N', 'V', 'N', A)
-    D = complex.(DR, DI)
-    V = zeros(Complex{T}, n, n)
-    j = 1
-    while j <= n
-        if DI[j] == 0
-            vr = view(VR, :, j)
-            s = conj(sign(_argmax(abs, vr)))
-            V[:, j] .= s .* vr
-        else
-            vr = view(VR, :, j)
-            vi = view(VR, :, j + 1)
-            s = conj(sign(_argmax(abs, vr))) # vectors coming from lapack have already real absmax component
-            V[:, j] .= s .* (vr .+ im .* vi)
-            V[:, j + 1] .= s .* (vr .- im .* vi)
-            j += 1
-        end
-        j += 1
-    end
-    return D, V
-end
-
-function eig!(A::StridedMatrix{T}; permute::Bool=true,
-              scale::Bool=true) where {T<:BlasComplex}
-    n = checksquare(A)
-    n == 0 && return zeros(T, 0), zeros(T, 0, 0)
-    D, V = LAPACK.geevx!(permute ? (scale ? 'B' : 'P') : (scale ? 'S' : 'N'), 'N', 'V', 'N',
-                         A)[[2, 4]]
-    for j in 1:n
-        v = view(V, :, j)
-        s = conj(sign(_argmax(abs, v)))
-        v .*= s
-    end
-    return D, V
-end
-
-function eigh!(A::StridedMatrix{T}) where {T<:BlasFloat}
-    n = checksquare(A)
-    n == 0 && return zeros(real(T), 0), zeros(T, 0, 0)
-    D, V = LAPACK.syevr!('V', 'A', 'U', A, 0.0, 0.0, 0, 0, -1.0)
-    for j in 1:n
-        v = view(V, :, j)
-        s = conj(sign(_argmax(abs, v)))
-        v .*= s
-    end
-    return D, V
-end
-
-## Old stuff and experiments
-
-# using LinearAlgebra: BlasFloat, Char, BlasInt, LAPACK, LAPACKException,
-#                      DimensionMismatch, SingularException, PosDefException, chkstride1,
-#                      checksquare,
-#                      triu!
-
-# TODO: reconsider the following implementation
-# Unfortunately, geqrfp seems a bit slower than geqrt in the intermediate region
-# around matrix size 100, which is the interesting region. => Investigate and maybe fix
-# function _leftorth!(A::StridedMatrix{<:BlasFloat})
-#     m, n = size(A)
-#     A, τ = geqrfp!(A)
-#     Q = LAPACK.ormqr!('L', 'N', A, τ, eye(eltype(A), m, min(m, n)))
-#     R = triu!(A[1:min(m, n), :])
-#     return Q, R
-# end
-
-# geqrfp!: computes qrpos factorization, missing in Base
-# geqrfp!(A::StridedMatrix{<:BlasFloat}) =
-#     ((m, n) = size(A); geqrfp!(A, similar(A, min(m, n))))
-#
-# for (geqrfp, elty, relty) in
-#     ((:dgeqrfp_, :Float64, :Float64), (:sgeqrfp_, :Float32, :Float32),
-#         (:zgeqrfp_, :ComplexF64, :Float64), (:cgeqrfp_, :ComplexF32, :Float32))
-#     @eval begin
-#         function geqrfp!(A::StridedMatrix{$elty}, tau::StridedVector{$elty})
-#             chkstride1(A, tau)
-#             m, n  = size(A)
-#             if length(tau) != min(m, n)
-#                 throw(DimensionMismatch("tau has length $(length(tau)), but needs length $(min(m, n))"))
-#             end
-#             work  = Vector{$elty}(1)
-#             lwork = BlasInt(-1)
-#             info  = Ref{BlasInt}()
-#             for i = 1:2                # first call returns lwork as work[1]
-#                 ccall((@blasfunc($geqrfp), liblapack), Nothing,
-#                       (Ptr{BlasInt}, Ptr{BlasInt}, Ptr{$elty}, Ptr{BlasInt},
-#                        Ptr{$elty}, Ptr{$elty}, Ptr{BlasInt}, Ptr{BlasInt}),
-#                       Ref(m), Ref(n), A, Ref(max(1, stride(A, 2))),
-#                       tau, work, Ref(lwork), info)
-#                 chklapackerror(info[])
-#                 if i == 1
-#                     lwork = BlasInt(real(work[1]))
-#                     resize!(work, lwork)
-#                 end
-#             end
-#             A, tau
-#         end
-#     end
-# end
-
-end
diff --git a/src/auxiliary/random.jl b/src/auxiliary/random.jl
index bc9df6f65..3289cdc3c 100644
--- a/src/auxiliary/random.jl
+++ b/src/auxiliary/random.jl
@@ -20,6 +20,6 @@ function randisometry!(rng::Random.AbstractRNG, A::AbstractMatrix)
     dims = size(A)
     dims[1] >= dims[2] ||
         throw(DimensionMismatch("cannot create isometric matrix with dimensions $dims; isometry needs to be tall or square"))
-    Q, = MatrixAlgebra.leftorth!(Random.randn!(rng, A), QRpos(), 0)
+    Q, = leftorth!(Random.randn!(rng, A); alg=QRpos())
     return copy!(A, Q)
 end
diff --git a/src/spaces/cartesianspace.jl b/src/spaces/cartesianspace.jl
index f29ed263d..fe12f3dc6 100644
--- a/src/spaces/cartesianspace.jl
+++ b/src/spaces/cartesianspace.jl
@@ -49,7 +49,13 @@ sectortype(::Type{CartesianSpace}) = Trivial
 
 Base.oneunit(::Type{CartesianSpace}) = CartesianSpace(1)
 Base.zero(::Type{CartesianSpace}) = CartesianSpace(0)
+
 ⊕(V₁::CartesianSpace, V₂::CartesianSpace) = CartesianSpace(V₁.d + V₂.d)
+function ⊖(V::CartesianSpace, W::CartesianSpace)
+    V ≿ W || throw(ArgumentError("$(W) is not a subspace of $(V)"))
+    return CartesianSpace(dim(V) - dim(W))
+end
+
 fuse(V₁::CartesianSpace, V₂::CartesianSpace) = CartesianSpace(V₁.d * V₂.d)
 flip(V::CartesianSpace) = V
 
diff --git a/src/spaces/complexspace.jl b/src/spaces/complexspace.jl
index ff05888b8..1031db614 100644
--- a/src/spaces/complexspace.jl
+++ b/src/spaces/complexspace.jl
@@ -50,11 +50,18 @@ Base.conj(V::ComplexSpace) = ComplexSpace(dim(V), !isdual(V))
 
 Base.oneunit(::Type{ComplexSpace}) = ComplexSpace(1)
 Base.zero(::Type{ComplexSpace}) = ComplexSpace(0)
+
 function ⊕(V₁::ComplexSpace, V₂::ComplexSpace)
     return isdual(V₁) == isdual(V₂) ?
            ComplexSpace(dim(V₁) + dim(V₂), isdual(V₁)) :
            throw(SpaceMismatch("Direct sum of a vector space and its dual does not exist"))
 end
+function ⊖(V::ComplexSpace, W::ComplexSpace)
+    (V ≿ W && isdual(V) == isdual(W)) ||
+        throw(ArgumentError("$(W) is not a subspace of $(V)"))
+    return ComplexSpace(dim(V) - dim(W), isdual(V))
+end
+
 fuse(V₁::ComplexSpace, V₂::ComplexSpace) = ComplexSpace(V₁.d * V₂.d)
 flip(V::ComplexSpace) = dual(V)
 
diff --git a/src/spaces/gradedspace.jl b/src/spaces/gradedspace.jl
index 7aa746076..4d8389dcd 100644
--- a/src/spaces/gradedspace.jl
+++ b/src/spaces/gradedspace.jl
@@ -149,6 +149,13 @@ function ⊕(V₁::GradedSpace{I}, V₂::GradedSpace{I}) where {I<:Sector}
     return typeof(V₁)(dims; dual=dual1)
 end
 
+function ⊖(V::GradedSpace{I}, W::GradedSpace{I}) where {I<:Sector}
+    dual = isdual(V)
+    V ≿ W && dual == isdual(W) ||
+        throw(SpaceMismatch("$(W) is not a subspace of $(V)"))
+    return typeof(V)(c => dim(V, c) - dim(W, c) for c in sectors(V); dual)
+end
+
 function flip(V::GradedSpace{I}) where {I<:Sector}
     if isdual(V)
         typeof(V)(c => dim(V, c) for c in sectors(V))
diff --git a/src/spaces/vectorspaces.jl b/src/spaces/vectorspaces.jl
index 844da081f..847182536 100644
--- a/src/spaces/vectorspaces.jl
+++ b/src/spaces/vectorspaces.jl
@@ -150,6 +150,17 @@ function ⊕ end
 ⊕(V::Vararg{ElementarySpace}) = foldl(⊕, V)
 const oplus = ⊕
 
+"""
+    ⊖(V::ElementarySpace, W::ElementarySpace) -> X::ElementarySpace
+    ominus(V::ElementarySpace, W::ElementarySpace) -> X::ElementarySpace
+
+Return a space that is equivalent to the orthogonal complement of `W` in `V`,
+i.e. an instance `X::ElementarySpace` such that `V = W ⊕ X`.
+"""
+⊖(V₁::S, V₂::S) where {S<:ElementarySpace}
+⊖(V₁::VectorSpace, V₂::VectorSpace) = ⊖(promote(V₁, V₂)...)
+const ominus = ⊖
+
 """
     ⊗(V₁::S, V₂::S, V₃::S...) where {S<:ElementarySpace} -> S
 
diff --git a/src/tensors/backends.jl b/src/tensors/backends.jl
new file mode 100644
index 000000000..0fc8f99f6
--- /dev/null
+++ b/src/tensors/backends.jl
@@ -0,0 +1,32 @@
+# Scheduler implementation
+# ------------------------
+function select_scheduler(scheduler=OhMyThreads.Implementation.NotGiven(); kwargs...)
+    return if scheduler == OhMyThreads.Implementation.NotGiven() && isempty(kwargs)
+        Threads.nthreads() == 1 ? SerialScheduler() : DynamicScheduler()
+    else
+        OhMyThreads.Implementation._scheduler_from_userinput(scheduler; kwargs...)
+    end
+end
+
+"""
+    const blockscheduler = ScopedValue{Scheduler}(SerialScheduler())
+
+The default scheduler used when looping over different blocks in the matrix representation of a
+tensor.
+For controlling this value, see also [`set_blockscheduler`](@ref) and [`with_blockscheduler`](@ref).
+"""
+const blockscheduler = ScopedValue{Scheduler}(SerialScheduler())
+
+"""
+    with_blockscheduler(f, [scheduler]; kwargs...)
+
+Run `f` in a scope where the `blockscheduler` is determined by `scheduler' and `kwargs...`.
+"""
+@inline function with_blockscheduler(f, scheduler=OhMyThreads.Implementation.NotGiven();
+                                     kwargs...)
+    @with blockscheduler => select_scheduler(scheduler; kwargs...) f()
+end
+
+# TODO: disable for trivial symmetry or small tensors?
+default_blockscheduler(t::AbstractTensorMap) = default_blockscheduler(typeof(t))
+default_blockscheduler(::Type{T}) where {T<:AbstractTensorMap} = blockscheduler[]
diff --git a/src/tensors/blockiterator.jl b/src/tensors/blockiterator.jl
index 7929b5d19..2938de824 100644
--- a/src/tensors/blockiterator.jl
+++ b/src/tensors/blockiterator.jl
@@ -12,3 +12,34 @@ Base.IteratorSize(::BlockIterator) = Base.HasLength()
 Base.IteratorEltype(::BlockIterator) = Base.HasEltype()
 Base.eltype(::Type{<:BlockIterator{T}}) where {T} = Pair{sectortype(T),blocktype(T)}
 Base.length(iter::BlockIterator) = length(iter.structure)
+
+# TODO: fast-path when structures are the same?
+# TODO: implement scheduler
+"""
+    foreachblock(f, ts::AbstractTensorMap...; [scheduler])
+
+Apply `f` to each block of `t` and the corresponding blocks of `ts`.
+Optionally, `scheduler` can be used to parallelize the computation.
+This function is equivalent to the following loop:
+
+```julia
+for c in union(blocksectors.(ts)...)
+    bs = map(t -> block(t, c), ts)
+    f(c, bs)
+end
+```
+"""
+function foreachblock(f, t::AbstractTensorMap, ts::AbstractTensorMap...; scheduler=nothing)
+    tensors = (t, ts...)
+    allsectors = union(blocksectors.(tensors)...)
+    foreach(allsectors) do c
+        return f(c, block.(tensors, Ref(c)))
+    end
+    return nothing
+end
+function foreachblock(f, t::AbstractTensorMap; scheduler=nothing)
+    foreach(blocks(t)) do (c, b)
+        return f(c, (b,))
+    end
+    return nothing
+end
diff --git a/src/tensors/diagonal.jl b/src/tensors/diagonal.jl
index 88d0c3b25..5a3840f1b 100644
--- a/src/tensors/diagonal.jl
+++ b/src/tensors/diagonal.jl
@@ -317,65 +317,6 @@ function LinearAlgebra.isposdef(d::DiagonalTensorMap)
     return all(isposdef, d.data)
 end
 
-function eig!(d::DiagonalTensorMap)
-    return d, one(d)
-end
-function eigh!(d::DiagonalTensorMap{<:Real})
-    return d, one(d)
-end
-function eigh!(d::DiagonalTensorMap{<:Complex})
-    # TODO: should this test for hermiticity? `eigh!(::TensorMap)` also does not do this.
-    return DiagonalTensorMap(real(d.data), d.domain), one(d)
-end
-
-function leftorth!(d::DiagonalTensorMap; alg=QR(), kwargs...)
-    @assert alg isa Union{QR,QL}
-    return one(d), d # TODO: this is only correct for `alg = QR()` or `alg = QL()`
-end
-function rightorth!(d::DiagonalTensorMap; alg=LQ(), kwargs...)
-    @assert alg isa Union{LQ,RQ}
-    return d, one(d) # TODO: this is only correct for `alg = LQ()` or `alg = RQ()`
-end
-# not much to do here:
-leftnull!(d::DiagonalTensorMap; kwargs...) = leftnull!(TensorMap(d); kwargs...)
-rightnull!(d::DiagonalTensorMap; kwargs...) = rightnull!(TensorMap(d); kwargs...)
-
-function tsvd!(d::DiagonalTensorMap; trunc=NoTruncation(), p::Real=2, alg=SDD())
-    return _tsvd!(d, alg, trunc, p)
-end
-# helper function
-function _compute_svddata!(d::DiagonalTensorMap, alg::Union{SVD,SDD})
-    InnerProductStyle(d) === EuclideanInnerProduct() || throw_invalid_innerproduct(:tsvd!)
-    I = sectortype(d)
-    dims = SectorDict{I,Int}()
-    generator = Base.Iterators.map(blocks(d)) do (c, b)
-        lb = length(b.diag)
-        U = zerovector!(similar(b.diag, lb, lb))
-        V = zerovector!(similar(b.diag, lb, lb))
-        p = sortperm(b.diag; by=abs, rev=true)
-        for (i, pi) in enumerate(p)
-            U[pi, i] = MatrixAlgebra.safesign(b.diag[pi])
-            V[i, pi] = 1
-        end
-        Σ = abs.(view(b.diag, p))
-        dims[c] = lb
-        return c => (U, Σ, V)
-    end
-    SVDdata = SectorDict(generator)
-    return SVDdata, dims
-end
-
-function LinearAlgebra.svdvals(d::DiagonalTensorMap)
-    return SectorDict(c => LinearAlgebra.svdvals(b) for (c, b) in blocks(d))
-end
-function LinearAlgebra.eigvals(d::DiagonalTensorMap)
-    return SectorDict(c => LinearAlgebra.eigvals(b) for (c, b) in blocks(d))
-end
-
-function LinearAlgebra.cond(d::DiagonalTensorMap, p::Real=2)
-    return LinearAlgebra.cond(Diagonal(d.data), p)
-end
-
 # matrix functions
 for f in
     (:exp, :cos, :sin, :tan, :cot, :cosh, :sinh, :tanh, :coth, :atan, :acot, :asinh, :sqrt,
diff --git a/src/tensors/factorizations.jl b/src/tensors/factorizations.jl
deleted file mode 100644
index 2292b6c17..000000000
--- a/src/tensors/factorizations.jl
+++ /dev/null
@@ -1,685 +0,0 @@
-# Tensor factorization
-#----------------------
-function factorisation_scalartype(t::AbstractTensorMap)
-    T = scalartype(t)
-    return promote_type(Float32, typeof(zero(T) / sqrt(abs2(one(T)))))
-end
-factorisation_scalartype(f, t) = factorisation_scalartype(t)
-
-function permutedcopy_oftype(t::AbstractTensorMap, T::Type{<:Number}, p::Index2Tuple)
-    return permute!(similar(t, T, permute(space(t), p)), t, p)
-end
-function copy_oftype(t::AbstractTensorMap, T::Type{<:Number})
-    return copy!(similar(t, T), t)
-end
-
-"""
-    tsvd(t::AbstractTensorMap, (leftind, rightind)::Index2Tuple;
-        trunc::TruncationScheme = notrunc(), p::Real = 2, alg::Union{SVD, SDD} = SDD())
-        -> U, S, V, ϵ
-
-Compute the (possibly truncated) singular value decomposition such that
-`norm(permute(t, (leftind, rightind)) - U * S * V) ≈ ϵ`, where `ϵ` thus represents the truncation error.
-
-If `leftind` and `rightind` are not specified, the current partition of left and right
-indices of `t` is used. In that case, less memory is allocated if one allows the data in
-`t` to be destroyed/overwritten, by using `tsvd!(t, trunc = notrunc(), p = 2)`.
-
-A truncation parameter `trunc` can be specified for the new internal dimension, in which
-case a truncated singular value decomposition will be computed. Choices are:
-*   `notrunc()`: no truncation (default);
-*   `truncerr(η::Real)`: truncates such that the p-norm of the truncated singular values is
-    smaller than `η`;
-*   `truncdim(χ::Int)`: truncates such that the equivalent total dimension of the internal
-    vector space is no larger than `χ`;
-*   `truncspace(V)`: truncates such that the dimension of the internal vector space is
-    smaller than that of `V` in any sector.
-*   `truncbelow(η::Real)`: truncates such that every singular value is larger then `η` ;
-
-Truncation options can also be combined using `&`, i.e. `truncbelow(η) & truncdim(χ)` will
-choose the truncation space such that every singular value is larger than `η`, and the
-equivalent total dimension of the internal vector space is no larger than `χ`.
-
-The method `tsvd` also returns the truncation error `ϵ`, computed as the `p` norm of the
-singular values that were truncated.
-
-THe keyword `alg` can be equal to `SVD()` or `SDD()`, corresponding to the underlying LAPACK
-algorithm that computes the decomposition (`_gesvd` or `_gesdd`).
-
-Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and `tsvd(!)`
-is currently only implemented for `InnerProductStyle(t) === EuclideanInnerProduct()`.
-"""
-function tsvd(t::AbstractTensorMap, p::Index2Tuple; kwargs...)
-    tcopy = permutedcopy_oftype(t, factorisation_scalartype(tsvd, t), p)
-    return tsvd!(tcopy; kwargs...)
-end
-
-function LinearAlgebra.svdvals(t::AbstractTensorMap)
-    tcopy = copy_oftype(t, factorisation_scalartype(tsvd, t))
-    return LinearAlgebra.svdvals!(tcopy)
-end
-
-"""
-    leftorth(t::AbstractTensorMap, (leftind, rightind)::Index2Tuple;
-                alg::OrthogonalFactorizationAlgorithm = QRpos()) -> Q, R
-
-Create orthonormal basis `Q` for indices in `leftind`, and remainder `R` such that
-`permute(t, (leftind, rightind)) = Q*R`.
-
-If `leftind` and `rightind` are not specified, the current partition of left and right
-indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
-to be destroyed/overwritten, by using `leftorth!(t, alg = QRpos())`.
-
-Different algorithms are available, namely `QR()`, `QRpos()`, `SVD()` and `Polar()`. `QR()`
-and `QRpos()` use a standard QR decomposition, producing an upper triangular matrix `R`.
-`Polar()` produces a Hermitian and positive semidefinite `R`. `QRpos()` corrects the
-standard QR decomposition such that the diagonal elements of `R` are positive. Only
-`QRpos()` and `Polar()` are unique (no residual freedom) so that they always return the same
-result for the same input tensor `t`.
-
-Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and
-`leftorth(!)` is currently only implemented for 
-    `InnerProductStyle(t) === EuclideanInnerProduct()`.
-"""
-function leftorth(t::AbstractTensorMap, p::Index2Tuple; kwargs...)
-    tcopy = permutedcopy_oftype(t, factorisation_scalartype(leftorth, t), p)
-    return leftorth!(tcopy; kwargs...)
-end
-
-"""
-    rightorth(t::AbstractTensorMap, (leftind, rightind)::Index2Tuple;
-                alg::OrthogonalFactorizationAlgorithm = LQpos()) -> L, Q
-
-Create orthonormal basis `Q` for indices in `rightind`, and remainder `L` such that
-`permute(t, (leftind, rightind)) = L*Q`.
-
-If `leftind` and `rightind` are not specified, the current partition of left and right
-indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
-to be destroyed/overwritten, by using `rightorth!(t, alg = LQpos())`.
-
-Different algorithms are available, namely `LQ()`, `LQpos()`, `RQ()`, `RQpos()`, `SVD()` and
-`Polar()`. `LQ()` and `LQpos()` produce a lower triangular matrix `L` and are computed using
-a QR decomposition of the transpose. `RQ()` and `RQpos()` produce an upper triangular
-remainder `L` and only works if the total left dimension is smaller than or equal to the
-total right dimension. `LQpos()` and `RQpos()` add an additional correction such that the
-diagonal elements of `L` are positive. `Polar()` produces a Hermitian and positive
-semidefinite `L`. Only `LQpos()`, `RQpos()` and `Polar()` are unique (no residual freedom)
-so that they always return the same result for the same input tensor `t`.
-
-Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and
-`rightorth(!)` is currently only implemented for 
-`InnerProductStyle(t) === EuclideanInnerProduct()`.
-"""
-function rightorth(t::AbstractTensorMap, p::Index2Tuple; kwargs...)
-    tcopy = permutedcopy_oftype(t, factorisation_scalartype(rightorth, t), p)
-    return rightorth!(tcopy; kwargs...)
-end
-
-"""
-    leftnull(t::AbstractTensor, (leftind, rightind)::Index2Tuple;
-                alg::OrthogonalFactorizationAlgorithm = QRpos()) -> N
-
-Create orthonormal basis for the orthogonal complement of the support of the indices in
-`leftind`, such that `N' * permute(t, (leftind, rightind)) = 0`.
-
-If `leftind` and `rightind` are not specified, the current partition of left and right
-indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
-to be destroyed/overwritten, by using `leftnull!(t, alg = QRpos())`.
-
-Different algorithms are available, namely `QR()` (or equivalently, `QRpos()`), `SVD()` and
-`SDD()`. The first assumes that the matrix is full rank and requires `iszero(atol)` and
-`iszero(rtol)`. With `SVD()` and `SDD()`, `rightnull` will use the corresponding singular
-value decomposition, and one can specify an absolute or relative tolerance for which
-singular values are to be considered zero, where `max(atol, norm(t)*rtol)` is used as upper
-bound.
-
-Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and
-`leftnull(!)` is currently only implemented for 
-`InnerProductStyle(t) === EuclideanInnerProduct()`.
-"""
-function leftnull(t::AbstractTensorMap, p::Index2Tuple; kwargs...)
-    tcopy = permutedcopy_oftype(t, factorisation_scalartype(leftnull, t), p)
-    return leftnull!(tcopy; kwargs...)
-end
-
-"""
-    rightnull(t::AbstractTensor, (leftind, rightind)::Index2Tuple;
-                alg::OrthogonalFactorizationAlgorithm = LQ(),
-                atol::Real = 0.0,
-                rtol::Real = eps(real(float(one(scalartype(t)))))*iszero(atol)) -> N
-
-Create orthonormal basis for the orthogonal complement of the support of the indices in
-`rightind`, such that `permute(t, (leftind, rightind))*N' = 0`.
-
-If `leftind` and `rightind` are not specified, the current partition of left and right
-indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
-to be destroyed/overwritten, by using `rightnull!(t, alg = LQpos())`.
-
-Different algorithms are available, namely `LQ()` (or equivalently, `LQpos`), `SVD()` and
-`SDD()`. The first assumes that the matrix is full rank and requires `iszero(atol)` and
-`iszero(rtol)`. With `SVD()` and `SDD()`, `rightnull` will use the corresponding singular
-value decomposition, and one can specify an absolute or relative tolerance for which
-singular values are to be considered zero, where `max(atol, norm(t)*rtol)` is used as upper
-bound.
-
-Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and
-`rightnull(!)` is currently only implemented for 
-`InnerProductStyle(t) === EuclideanInnerProduct()`.
-"""
-function rightnull(t::AbstractTensorMap, p::Index2Tuple; kwargs...)
-    tcopy = permutedcopy_oftype(t, factorisation_scalartype(rightnull, t), p)
-    return rightnull!(tcopy; kwargs...)
-end
-
-"""
-    eigen(t::AbstractTensor, (leftind, rightind)::Index2Tuple; kwargs...) -> D, V
-
-Compute eigenvalue factorization of tensor `t` as linear map from `rightind` to `leftind`.
-
-If `leftind` and `rightind` are not specified, the current partition of left and right
-indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
-to be destroyed/overwritten, by using `eigen!(t)`. Note that the permuted tensor on which
-`eigen!` is called should have equal domain and codomain, as otherwise the eigenvalue
-decomposition is meaningless and cannot satisfy
-```
-permute(t, (leftind, rightind)) * V = V * D
-```
-
-Accepts the same keyword arguments `scale` and `permute` as `eigen` of dense
-matrices. See the corresponding documentation for more information.
-
-See also `eig` and `eigh`
-"""
-function LinearAlgebra.eigen(t::AbstractTensorMap, p::Index2Tuple; kwargs...)
-    tcopy = permutedcopy_oftype(t, factorisation_scalartype(eigen, t), p)
-    return eigen!(tcopy; kwargs...)
-end
-
-function LinearAlgebra.eigvals(t::AbstractTensorMap; kwargs...)
-    tcopy = copy_oftype(t, factorisation_scalartype(eigen, t))
-    return LinearAlgebra.eigvals!(tcopy; kwargs...)
-end
-
-"""
-    eig(t::AbstractTensor, (leftind, rightind)::Index2Tuple; kwargs...) -> D, V
-
-Compute eigenvalue factorization of tensor `t` as linear map from `rightind` to `leftind`.
-The function `eig` assumes that the linear map is not hermitian and returns type stable
-complex valued `D` and `V` tensors for both real and complex valued `t`. See `eigh` for
-hermitian linear maps
-
-If `leftind` and `rightind` are not specified, the current partition of left and right
-indices of `t` is used. In that case, less memory is allocated if one allows the data in
-`t` to be destroyed/overwritten, by using `eig!(t)`. Note that the permuted tensor on
-which `eig!` is called should have equal domain and codomain, as otherwise the eigenvalue
-decomposition is meaningless and cannot satisfy
-```
-permute(t, (leftind, rightind)) * V = V * D
-```
-
-Accepts the same keyword arguments `scale` and `permute` as `eigen` of dense
-matrices. See the corresponding documentation for more information.
-
-See also `eigen` and `eigh`.
-"""
-function eig(t::AbstractTensorMap, p::Index2Tuple; kwargs...)
-    tcopy = permutedcopy_oftype(t, factorisation_scalartype(eig, t), p)
-    return eig!(tcopy; kwargs...)
-end
-
-"""
-    eigh(t::AbstractTensorMap, (leftind, rightind)::Index2Tuple) -> D, V
-
-Compute eigenvalue factorization of tensor `t` as linear map from `rightind` to `leftind`.
-The function `eigh` assumes that the linear map is hermitian and `D` and `V` tensors with
-the same `scalartype` as `t`. See `eig` and `eigen` for non-hermitian tensors. Hermiticity
-requires that the tensor acts on inner product spaces, and the current implementation
-requires `InnerProductStyle(t) === EuclideanInnerProduct()`.
-
-If `leftind` and `rightind` are not specified, the current partition of left and right
-indices of `t` is used. In that case, less memory is allocated if one allows the data in
-`t` to be destroyed/overwritten, by using `eigh!(t)`. Note that the permuted tensor on
-which `eigh!` is called should have equal domain and codomain, as otherwise the eigenvalue
-decomposition is meaningless and cannot satisfy
-```
-permute(t, (leftind, rightind)) * V = V * D
-```
-
-See also `eigen` and `eig`.
-"""
-function eigh(t::AbstractTensorMap, p::Index2Tuple; kwargs...)
-    tcopy = permutedcopy_oftype(t, factorisation_scalartype(eigh, t), p)
-    return eigh!(tcopy; kwargs...)
-end
-
-"""
-    isposdef(t::AbstractTensor, (leftind, rightind)::Index2Tuple) -> ::Bool
-
-Test whether a tensor `t` is positive definite as linear map from `rightind` to `leftind`.
-
-If `leftind` and `rightind` are not specified, the current partition of left and right
-indices of `t` is used. In that case, less memory is allocated if one allows the data in
-`t` to be destroyed/overwritten, by using `isposdef!(t)`. Note that the permuted tensor on
-which `isposdef!` is called should have equal domain and codomain, as otherwise it is
-meaningless.
-"""
-function LinearAlgebra.isposdef(t::AbstractTensorMap, (p₁, p₂)::Index2Tuple)
-    tcopy = permutedcopy_oftype(t, factorisation_scalartype(isposdef, t), p)
-    return isposdef!(tcopy)
-end
-
-function tsvd(t::AbstractTensorMap; kwargs...)
-    tcopy = copy_oftype(t, float(scalartype(t)))
-    return tsvd!(tcopy; kwargs...)
-end
-function leftorth(t::AbstractTensorMap; alg::OFA=QRpos(), kwargs...)
-    tcopy = copy_oftype(t, float(scalartype(t)))
-    return leftorth!(tcopy; alg=alg, kwargs...)
-end
-function rightorth(t::AbstractTensorMap; alg::OFA=LQpos(), kwargs...)
-    tcopy = copy_oftype(t, float(scalartype(t)))
-    return rightorth!(tcopy; alg=alg, kwargs...)
-end
-function leftnull(t::AbstractTensorMap; alg::OFA=QR(), kwargs...)
-    tcopy = copy_oftype(t, float(scalartype(t)))
-    return leftnull!(tcopy; alg=alg, kwargs...)
-end
-function rightnull(t::AbstractTensorMap; alg::OFA=LQ(), kwargs...)
-    tcopy = copy_oftype(t, float(scalartype(t)))
-    return rightnull!(tcopy; alg=alg, kwargs...)
-end
-function LinearAlgebra.eigen(t::AbstractTensorMap; kwargs...)
-    tcopy = copy_oftype(t, float(scalartype(t)))
-    return eigen!(tcopy; kwargs...)
-end
-function eig(t::AbstractTensorMap; kwargs...)
-    tcopy = copy_oftype(t, float(scalartype(t)))
-    return eig!(tcopy; kwargs...)
-end
-function eigh(t::AbstractTensorMap; kwargs...)
-    tcopy = copy_oftype(t, float(scalartype(t)))
-    return eigh!(tcopy; kwargs...)
-end
-function LinearAlgebra.isposdef(t::AbstractTensorMap)
-    tcopy = copy_oftype(t, float(scalartype(t)))
-    return isposdef!(tcopy)
-end
-
-# Orthogonal factorizations (mutation for recycling memory):
-# only possible if scalar type is floating point
-# only correct if Euclidean inner product
-#------------------------------------------------------------------------------------------
-const RealOrComplexFloat = Union{AbstractFloat,Complex{<:AbstractFloat}}
-
-function leftorth!(t::TensorMap{<:RealOrComplexFloat};
-                   alg::Union{QR,QRpos,QL,QLpos,SVD,SDD,Polar}=QRpos(),
-                   atol::Real=zero(float(real(scalartype(t)))),
-                   rtol::Real=(alg ∉ (SVD(), SDD())) ? zero(float(real(scalartype(t)))) :
-                              eps(real(float(one(scalartype(t))))) * iszero(atol))
-    InnerProductStyle(t) === EuclideanInnerProduct() ||
-        throw_invalid_innerproduct(:leftorth!)
-    if !iszero(rtol)
-        atol = max(atol, rtol * norm(t))
-    end
-    I = sectortype(t)
-    dims = SectorDict{I,Int}()
-
-    # compute QR factorization for each block
-    if !isempty(blocks(t))
-        generator = Base.Iterators.map(blocks(t)) do (c, b)
-            Qc, Rc = MatrixAlgebra.leftorth!(b, alg, atol)
-            dims[c] = size(Qc, 2)
-            return c => (Qc, Rc)
-        end
-        QRdata = SectorDict(generator)
-    end
-
-    # construct new space
-    S = spacetype(t)
-    V = S(dims)
-    if alg isa Polar
-        @assert V ≅ domain(t)
-        W = domain(t)
-    elseif length(domain(t)) == 1 && domain(t) ≅ V
-        W = domain(t)
-    elseif length(codomain(t)) == 1 && codomain(t) ≅ V
-        W = codomain(t)
-    else
-        W = ProductSpace(V)
-    end
-
-    # construct output tensors
-    T = float(scalartype(t))
-    Q = similar(t, T, codomain(t) ← W)
-    R = similar(t, T, W ← domain(t))
-    if !isempty(blocks(t))
-        for (c, (Qc, Rc)) in QRdata
-            copy!(block(Q, c), Qc)
-            copy!(block(R, c), Rc)
-        end
-    end
-    return Q, R
-end
-
-function leftnull!(t::TensorMap{<:RealOrComplexFloat};
-                   alg::Union{QR,QRpos,SVD,SDD}=QRpos(),
-                   atol::Real=zero(float(real(scalartype(t)))),
-                   rtol::Real=(alg ∉ (SVD(), SDD())) ? zero(float(real(scalartype(t)))) :
-                              eps(real(float(one(scalartype(t))))) * iszero(atol))
-    InnerProductStyle(t) === EuclideanInnerProduct() ||
-        throw_invalid_innerproduct(:leftnull!)
-    if !iszero(rtol)
-        atol = max(atol, rtol * norm(t))
-    end
-    I = sectortype(t)
-    dims = SectorDict{I,Int}()
-
-    # compute QR factorization for each block
-    V = codomain(t)
-    if !isempty(blocksectors(V))
-        generator = Base.Iterators.map(blocksectors(V)) do c
-            Nc = MatrixAlgebra.leftnull!(block(t, c), alg, atol)
-            dims[c] = size(Nc, 2)
-            return c => Nc
-        end
-        Ndata = SectorDict(generator)
-    end
-
-    # construct new space
-    S = spacetype(t)
-    W = S(dims)
-
-    # construct output tensor
-    T = float(scalartype(t))
-    N = similar(t, T, V ← W)
-    if !isempty(blocksectors(V))
-        for (c, Nc) in Ndata
-            copy!(block(N, c), Nc)
-        end
-    end
-    return N
-end
-
-function rightorth!(t::TensorMap{<:RealOrComplexFloat};
-                    alg::Union{LQ,LQpos,RQ,RQpos,SVD,SDD,Polar}=LQpos(),
-                    atol::Real=zero(float(real(scalartype(t)))),
-                    rtol::Real=(alg ∉ (SVD(), SDD())) ? zero(float(real(scalartype(t)))) :
-                               eps(real(float(one(scalartype(t))))) * iszero(atol))
-    InnerProductStyle(t) === EuclideanInnerProduct() ||
-        throw_invalid_innerproduct(:rightorth!)
-    if !iszero(rtol)
-        atol = max(atol, rtol * norm(t))
-    end
-    I = sectortype(t)
-    dims = SectorDict{I,Int}()
-
-    # compute LQ factorization for each block
-    if !isempty(blocks(t))
-        generator = Base.Iterators.map(blocks(t)) do (c, b)
-            Lc, Qc = MatrixAlgebra.rightorth!(b, alg, atol)
-            dims[c] = size(Qc, 1)
-            return c => (Lc, Qc)
-        end
-        LQdata = SectorDict(generator)
-    end
-
-    # construct new space
-    S = spacetype(t)
-    V = S(dims)
-    if alg isa Polar
-        @assert V ≅ codomain(t)
-        W = codomain(t)
-    elseif length(codomain(t)) == 1 && codomain(t) ≅ V
-        W = codomain(t)
-    elseif length(domain(t)) == 1 && domain(t) ≅ V
-        W = domain(t)
-    else
-        W = ProductSpace(V)
-    end
-
-    # construct output tensors
-    T = float(scalartype(t))
-    L = similar(t, T, codomain(t) ← W)
-    Q = similar(t, T, W ← domain(t))
-    if !isempty(blocks(t))
-        for (c, (Lc, Qc)) in LQdata
-            copy!(block(L, c), Lc)
-            copy!(block(Q, c), Qc)
-        end
-    end
-    return L, Q
-end
-
-function rightnull!(t::TensorMap{<:RealOrComplexFloat};
-                    alg::Union{LQ,LQpos,SVD,SDD}=LQpos(),
-                    atol::Real=zero(float(real(scalartype(t)))),
-                    rtol::Real=(alg ∉ (SVD(), SDD())) ? zero(float(real(scalartype(t)))) :
-                               eps(real(float(one(scalartype(t))))) * iszero(atol))
-    InnerProductStyle(t) === EuclideanInnerProduct() ||
-        throw_invalid_innerproduct(:rightnull!)
-    if !iszero(rtol)
-        atol = max(atol, rtol * norm(t))
-    end
-    I = sectortype(t)
-    dims = SectorDict{I,Int}()
-
-    # compute LQ factorization for each block
-    V = domain(t)
-    if !isempty(blocksectors(V))
-        generator = Base.Iterators.map(blocksectors(V)) do c
-            Nc = MatrixAlgebra.rightnull!(block(t, c), alg, atol)
-            dims[c] = size(Nc, 1)
-            return c => Nc
-        end
-        Ndata = SectorDict(generator)
-    end
-
-    # construct new space
-    S = spacetype(t)
-    W = S(dims)
-
-    # construct output tensor
-    T = float(scalartype(t))
-    N = similar(t, T, W ← V)
-    if !isempty(blocksectors(V))
-        for (c, Nc) in Ndata
-            copy!(block(N, c), Nc)
-        end
-    end
-    return N
-end
-
-function leftorth!(t::AdjointTensorMap; alg::OFA=QRpos())
-    InnerProductStyle(t) === EuclideanInnerProduct() ||
-        throw_invalid_innerproduct(:leftorth!)
-    return map(adjoint, reverse(rightorth!(adjoint(t); alg=alg')))
-end
-
-function rightorth!(t::AdjointTensorMap; alg::OFA=LQpos())
-    InnerProductStyle(t) === EuclideanInnerProduct() ||
-        throw_invalid_innerproduct(:rightorth!)
-    return map(adjoint, reverse(leftorth!(adjoint(t); alg=alg')))
-end
-
-function leftnull!(t::AdjointTensorMap; alg::OFA=QR(), kwargs...)
-    InnerProductStyle(t) === EuclideanInnerProduct() ||
-        throw_invalid_innerproduct(:leftnull!)
-    return adjoint(rightnull!(adjoint(t); alg=alg', kwargs...))
-end
-
-function rightnull!(t::AdjointTensorMap; alg::OFA=LQ(), kwargs...)
-    InnerProductStyle(t) === EuclideanInnerProduct() ||
-        throw_invalid_innerproduct(:rightnull!)
-    return adjoint(leftnull!(adjoint(t); alg=alg', kwargs...))
-end
-
-#------------------------------#
-# Singular value decomposition #
-#------------------------------#
-function LinearAlgebra.svdvals!(t::TensorMap{<:RealOrComplexFloat})
-    return SectorDict(c => LinearAlgebra.svdvals!(b) for (c, b) in blocks(t))
-end
-LinearAlgebra.svdvals!(t::AdjointTensorMap) = svdvals!(adjoint(t))
-
-function tsvd!(t::TensorMap{<:RealOrComplexFloat};
-               trunc=NoTruncation(), p::Real=2, alg=SDD())
-    return _tsvd!(t, alg, trunc, p)
-end
-function tsvd!(t::AdjointTensorMap; trunc=NoTruncation(), p::Real=2, alg=SDD())
-    u, s, vt, err = tsvd!(adjoint(t); trunc=trunc, p=p, alg=alg)
-    return adjoint(vt), adjoint(s), adjoint(u), err
-end
-
-# implementation dispatches on algorithm
-function _tsvd!(t::TensorMap{<:RealOrComplexFloat}, alg::Union{SVD,SDD},
-                trunc::TruncationScheme, p::Real=2)
-    # early return
-    if isempty(blocksectors(t))
-        truncerr = zero(real(scalartype(t)))
-        return _empty_svdtensors(t)..., truncerr
-    end
-
-    # compute SVD factorization for each block
-    S = spacetype(t)
-    SVDdata, dims = _compute_svddata!(t, alg)
-    Σdata = SectorDict(c => Σ for (c, (U, Σ, V)) in SVDdata)
-    truncdim = _compute_truncdim(Σdata, trunc, p)
-    truncerr = _compute_truncerr(Σdata, truncdim, p)
-
-    # construct output tensors
-    U, Σ, V⁺ = _create_svdtensors(t, SVDdata, truncdim)
-    return U, Σ, V⁺, truncerr
-end
-
-# helper functions
-function _compute_svddata!(t::TensorMap, alg::Union{SVD,SDD})
-    InnerProductStyle(t) === EuclideanInnerProduct() || throw_invalid_innerproduct(:tsvd!)
-    I = sectortype(t)
-    dims = SectorDict{I,Int}()
-    generator = Base.Iterators.map(blocks(t)) do (c, b)
-        U, Σ, V = MatrixAlgebra.svd!(b, alg)
-        dims[c] = length(Σ)
-        return c => (U, Σ, V)
-    end
-    SVDdata = SectorDict(generator)
-    return SVDdata, dims
-end
-
-function _create_svdtensors(t::TensorMap{<:RealOrComplexFloat}, SVDdata, dims)
-    T = scalartype(t)
-    S = spacetype(t)
-    W = S(dims)
-
-    Tr = real(T)
-    A = similarstoragetype(t, Tr)
-    Σ = DiagonalTensorMap{Tr,S,A}(undef, W)
-
-    U = similar(t, codomain(t) ← W)
-    V⁺ = similar(t, W ← domain(t))
-    for (c, (Uc, Σc, V⁺c)) in SVDdata
-        r = Base.OneTo(dims[c])
-        copy!(block(U, c), view(Uc, :, r))
-        copy!(block(Σ, c), Diagonal(view(Σc, r)))
-        copy!(block(V⁺, c), view(V⁺c, r, :))
-    end
-    return U, Σ, V⁺
-end
-
-function _empty_svdtensors(t::TensorMap{<:RealOrComplexFloat})
-    T = scalartype(t)
-    S = spacetype(t)
-    I = sectortype(t)
-    dims = SectorDict{I,Int}()
-    W = S(dims)
-
-    Tr = real(T)
-    A = similarstoragetype(t, Tr)
-    Σ = DiagonalTensorMap{Tr,S,A}(undef, W)
-
-    U = similar(t, codomain(t) ← W)
-    V⁺ = similar(t, W ← domain(t))
-    return U, Σ, V⁺
-end
-
-#--------------------------#
-# Eigenvalue decomposition #
-#--------------------------#
-function LinearAlgebra.eigen!(t::TensorMap{<:RealOrComplexFloat})
-    return ishermitian(t) ? eigh!(t) : eig!(t)
-end
-
-function LinearAlgebra.eigvals!(t::TensorMap{<:RealOrComplexFloat}; kwargs...)
-    return SectorDict(c => complex(LinearAlgebra.eigvals!(b; kwargs...))
-                      for (c, b) in blocks(t))
-end
-function LinearAlgebra.eigvals!(t::AdjointTensorMap{<:RealOrComplexFloat}; kwargs...)
-    return SectorDict(c => conj!(complex(LinearAlgebra.eigvals!(b; kwargs...)))
-                      for (c, b) in blocks(t))
-end
-
-function eigh!(t::TensorMap{<:RealOrComplexFloat})
-    InnerProductStyle(t) === EuclideanInnerProduct() || throw_invalid_innerproduct(:eigh!)
-    domain(t) == codomain(t) ||
-        throw(SpaceMismatch("`eigh!` requires domain and codomain to be the same"))
-
-    T = scalartype(t)
-    I = sectortype(t)
-    S = spacetype(t)
-    dims = SectorDict{I,Int}(c => size(b, 1) for (c, b) in blocks(t))
-    W = S(dims)
-
-    Tr = real(T)
-    A = similarstoragetype(t, Tr)
-    D = DiagonalTensorMap{Tr,S,A}(undef, W)
-    V = similar(t, domain(t) ← W)
-    for (c, b) in blocks(t)
-        values, vectors = MatrixAlgebra.eigh!(b)
-        copy!(block(D, c), Diagonal(values))
-        copy!(block(V, c), vectors)
-    end
-    return D, V
-end
-
-function eig!(t::TensorMap{<:RealOrComplexFloat}; kwargs...)
-    domain(t) == codomain(t) ||
-        throw(SpaceMismatch("`eig!` requires domain and codomain to be the same"))
-
-    T = scalartype(t)
-    I = sectortype(t)
-    S = spacetype(t)
-    dims = SectorDict{I,Int}(c => size(b, 1) for (c, b) in blocks(t))
-    W = S(dims)
-
-    Tc = complex(T)
-    A = similarstoragetype(t, Tc)
-    D = DiagonalTensorMap{Tc,S,A}(undef, W)
-    V = similar(t, Tc, domain(t) ← W)
-    for (c, b) in blocks(t)
-        values, vectors = MatrixAlgebra.eig!(b; kwargs...)
-        copy!(block(D, c), Diagonal(values))
-        copy!(block(V, c), vectors)
-    end
-    return D, V
-end
-
-#--------------------------------------------------#
-# Checks for hermiticity and positive definiteness #
-#--------------------------------------------------#
-function LinearAlgebra.ishermitian(t::TensorMap)
-    domain(t) == codomain(t) || return false
-    InnerProductStyle(t) === EuclideanInnerProduct() || return false # hermiticity only defined for euclidean
-    for (c, b) in blocks(t)
-        ishermitian(b) || return false
-    end
-    return true
-end
-
-function LinearAlgebra.isposdef!(t::TensorMap)
-    domain(t) == codomain(t) ||
-        throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
-    InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false
-    for (c, b) in blocks(t)
-        isposdef!(b) || return false
-    end
-    return true
-end
diff --git a/src/tensors/factorizations/adjoint.jl b/src/tensors/factorizations/adjoint.jl
new file mode 100644
index 000000000..08154d15e
--- /dev/null
+++ b/src/tensors/factorizations/adjoint.jl
@@ -0,0 +1,68 @@
+# AdjointTensorMap
+# ----------------
+# 1-arg functions
+function initialize_output(::typeof(left_null!), t::AdjointTensorMap,
+                           alg::AbstractAlgorithm)
+    return adjoint(initialize_output(right_null!, adjoint(t), alg))
+end
+function initialize_output(::typeof(right_null!), t::AdjointTensorMap,
+                           alg::AbstractAlgorithm)
+    return adjoint(initialize_output(left_null!, adjoint(t), alg))
+end
+
+function left_null!(t::AdjointTensorMap, N::AdjointTensorMap, alg::AbstractAlgorithm)
+    right_null!(adjoint(t), adjoint(N), alg)
+    return N
+end
+function right_null!(t::AdjointTensorMap, N::AdjointTensorMap, alg::AbstractAlgorithm)
+    left_null!(adjoint(t), adjoint(N), alg)
+    return N
+end
+
+function MatrixAlgebraKit.is_left_isometry(t::AdjointTensorMap; kwargs...)
+    return is_right_isometry(adjoint(t); kwargs...)
+end
+function MatrixAlgebraKit.is_right_isometry(t::AdjointTensorMap; kwargs...)
+    return is_left_isometry(adjoint(t); kwargs...)
+end
+
+# 2-arg functions
+for (left_f!, right_f!) in zip((:qr_full!, :qr_compact!, :left_polar!, :left_orth!),
+                               (:lq_full!, :lq_compact!, :right_polar!, :right_orth!))
+    @eval function initialize_output(::typeof($left_f!), t::AdjointTensorMap,
+                                     alg::AbstractAlgorithm)
+        return reverse(adjoint.(initialize_output($right_f!, adjoint(t), alg)))
+    end
+    @eval function initialize_output(::typeof($right_f!), t::AdjointTensorMap,
+                                     alg::AbstractAlgorithm)
+        return reverse(adjoint.(initialize_output($left_f!, adjoint(t), alg)))
+    end
+
+    @eval function $left_f!(t::AdjointTensorMap,
+                            F::Tuple{AdjointTensorMap,AdjointTensorMap},
+                            alg::AbstractAlgorithm)
+        $right_f!(adjoint(t), reverse(adjoint.(F)), alg)
+        return F
+    end
+    @eval function $right_f!(t::AdjointTensorMap,
+                             F::Tuple{AdjointTensorMap,AdjointTensorMap},
+                             alg::AbstractAlgorithm)
+        $left_f!(adjoint(t), reverse(adjoint.(F)), alg)
+        return F
+    end
+end
+
+# 3-arg functions
+for f! in (:svd_full!, :svd_compact!, :svd_trunc!)
+    @eval function initialize_output(::typeof($f!), t::AdjointTensorMap,
+                                     alg::AbstractAlgorithm)
+        return reverse(adjoint.(initialize_output($f!, adjoint(t), alg)))
+    end
+    _TS = f! === :svd_full! ? :AdjointTensorMap : DiagonalTensorMap
+    @eval function $f!(t::AdjointTensorMap,
+                       F::Tuple{AdjointTensorMap,$_TS,AdjointTensorMap},
+                       alg::AbstractAlgorithm)
+        $f!(adjoint(t), reverse(adjoint.(F)), alg)
+        return F
+    end
+end
diff --git a/src/tensors/factorizations/deprecations.jl b/src/tensors/factorizations/deprecations.jl
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/src/tensors/factorizations/deprecations.jl
@@ -0,0 +1 @@
+
diff --git a/src/tensors/factorizations/factorizations.jl b/src/tensors/factorizations/factorizations.jl
new file mode 100644
index 000000000..f56e44a1b
--- /dev/null
+++ b/src/tensors/factorizations/factorizations.jl
@@ -0,0 +1,166 @@
+# Tensor factorization
+#----------------------
+# using submodule here to import MatrixAlgebraKit functions without polluting namespace
+module Factorizations
+
+export eig, eig!, eigh, eigh!
+export tsvd, tsvd!, svdvals, svdvals!
+export leftorth, leftorth!, rightorth, rightorth!
+export leftnull, leftnull!, rightnull, rightnull!
+export copy_oftype, permutedcopy_oftype, one!
+export TruncationScheme, notrunc, truncbelow, truncerr, truncdim, truncspace
+
+using ..TensorKit
+using ..TensorKit: AdjointTensorMap, SectorDict, OFA, blocktype, foreachblock, one!
+
+using LinearAlgebra: LinearAlgebra, BlasFloat, Diagonal, svdvals, svdvals!
+import LinearAlgebra: eigen, eigen!, isposdef, isposdef!, ishermitian
+
+using TensorOperations: Index2Tuple
+
+using MatrixAlgebraKit
+using MatrixAlgebraKit: AbstractAlgorithm, TruncatedAlgorithm, TruncationStrategy,
+                        NoTruncation, TruncationKeepAbove, TruncationKeepBelow,
+                        TruncationIntersection, TruncationKeepFiltered
+import MatrixAlgebraKit: default_algorithm,
+                         copy_input, check_input, initialize_output,
+                         qr_compact!, qr_full!, qr_null!, lq_compact!, lq_full!, lq_null!,
+                         svd_compact!, svd_full!, svd_trunc!, svd_vals!,
+                         eigh_full!, eigh_trunc!, eigh_vals!,
+                         eig_full!, eig_trunc!, eig_vals!,
+                         left_polar!, left_orth_polar!, right_polar!, right_orth_polar!,
+                         left_null_svd!, right_null_svd!,
+                         left_orth!, right_orth!, left_null!, right_null!,
+                         truncate!, findtruncated, findtruncated_sorted,
+                         diagview, isisometry
+
+include("utility.jl")
+include("interface.jl")
+include("implementations.jl")
+include("matrixalgebrakit.jl")
+include("truncation.jl")
+include("deprecations.jl")
+include("adjoint.jl")
+
+TensorKit.one!(A::AbstractMatrix) = MatrixAlgebraKit.one!(A)
+
+function isisometry(t::AbstractTensorMap, (p₁, p₂)::Index2Tuple)
+    t = permute(t, (p₁, p₂); copy=false)
+    return isisometry(t)
+end
+
+# Orthogonal factorizations (mutation for recycling memory):
+# only possible if scalar type is floating point
+# only correct if Euclidean inner product
+#------------------------------------------------------------------------------------------
+const RealOrComplexFloat = Union{AbstractFloat,Complex{<:AbstractFloat}}
+
+# DiagonalTensorMap
+# -----------------
+function leftorth!(d::DiagonalTensorMap; alg=QR(), kwargs...)
+    @assert alg isa Union{QR,QL}
+    return one(d), d # TODO: this is only correct for `alg = QR()` or `alg = QL()`
+end
+function rightorth!(d::DiagonalTensorMap; alg=LQ(), kwargs...)
+    @assert alg isa Union{LQ,RQ}
+    return d, one(d) # TODO: this is only correct for `alg = LQ()` or `alg = RQ()`
+end
+leftnull!(d::DiagonalTensorMap; kwargs...) = leftnull!(TensorMap(d); kwargs...)
+rightnull!(d::DiagonalTensorMap; kwargs...) = rightnull!(TensorMap(d); kwargs...)
+
+function tsvd!(d::DiagonalTensorMap; trunc=NoTruncation(), p::Real=2, alg=SDD())
+    return _tsvd!(d, alg, trunc, p)
+end
+
+# helper function
+function _compute_svddata!(d::DiagonalTensorMap, alg::Union{SVD,SDD})
+    InnerProductStyle(d) === EuclideanInnerProduct() || throw_invalid_innerproduct(:tsvd!)
+    I = sectortype(d)
+    dims = SectorDict{I,Int}()
+    generator = Base.Iterators.map(blocks(d)) do (c, b)
+        lb = length(b.diag)
+        U = zerovector!(similar(b.diag, lb, lb))
+        V = zerovector!(similar(b.diag, lb, lb))
+        p = sortperm(b.diag; by=abs, rev=true)
+        for (i, pi) in enumerate(p)
+            U[pi, i] = safesign(b.diag[pi])
+            V[i, pi] = 1
+        end
+        Σ = abs.(view(b.diag, p))
+        dims[c] = lb
+        return c => (U, Σ, V)
+    end
+    SVDdata = SectorDict(generator)
+    return SVDdata, dims
+end
+
+eig!(d::DiagonalTensorMap) = d, one(d)
+eigh!(d::DiagonalTensorMap{<:Real}) = d, one(d)
+eigh!(d::DiagonalTensorMap{<:Complex}) = DiagonalTensorMap(real(d.data), d.domain), one(d)
+
+function LinearAlgebra.svdvals(d::DiagonalTensorMap)
+    return SectorDict(c => svd_vals(b) for (c, b) in blocks(d))
+end
+function LinearAlgebra.eigvals(d::DiagonalTensorMap)
+    return SectorDict(c => eig_vals(b) for (c, b) in blocks(d))
+end
+
+function LinearAlgebra.cond(d::DiagonalTensorMap, p::Real=2)
+    return LinearAlgebra.cond(Diagonal(d.data), p)
+end
+#------------------------------#
+# Singular value decomposition #
+#------------------------------#
+function LinearAlgebra.svdvals!(t::TensorMap{<:RealOrComplexFloat})
+    return SectorDict(c => LinearAlgebra.svdvals!(b) for (c, b) in blocks(t))
+end
+LinearAlgebra.svdvals!(t::AdjointTensorMap) = svdvals!(adjoint(t))
+
+#--------------------------#
+# Eigenvalue decomposition #
+#--------------------------#
+
+function LinearAlgebra.eigvals!(t::TensorMap{<:RealOrComplexFloat}; kwargs...)
+    return SectorDict(c => complex(eig_vals!(b; kwargs...))
+                      for (c, b) in blocks(t))
+end
+function LinearAlgebra.eigvals!(t::AdjointTensorMap{<:RealOrComplexFloat}; kwargs...)
+    return SectorDict(c => conj!(complex(eig_vals!(b; kwargs...)))
+                      for (c, b) in blocks(t))
+end
+
+#--------------------------------------------------#
+# Checks for hermiticity and positive definiteness #
+#--------------------------------------------------#
+function LinearAlgebra.ishermitian(t::TensorMap)
+    domain(t) == codomain(t) || return false
+    InnerProductStyle(t) === EuclideanInnerProduct() || return false # hermiticity only defined for euclidean
+    for (c, b) in blocks(t)
+        ishermitian(b) || return false
+    end
+    return true
+end
+
+function LinearAlgebra.isposdef!(t::TensorMap)
+    domain(t) == codomain(t) ||
+        throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
+    InnerProductStyle(spacetype(t)) === EuclideanInnerProduct() || return false
+    for (c, b) in blocks(t)
+        isposdef!(b) || return false
+    end
+    return true
+end
+
+# TODO: tolerances are per-block, not global or weighted - does that matter?
+function MatrixAlgebraKit.is_left_isometry(t::AbstractTensorMap; kwargs...)
+    domain(t) ≾ codomain(t) || return false
+    f((c, b)) = MatrixAlgebraKit.is_left_isometry(b; kwargs...)
+    return all(f, blocks(t))
+end
+function MatrixAlgebraKit.is_right_isometry(t::AbstractTensorMap; kwargs...)
+    domain(t) ≿ codomain(t) || return false
+    f((c, b)) = MatrixAlgebraKit.is_right_isometry(b; kwargs...)
+    return all(f, blocks(t))
+end
+
+end
diff --git a/src/tensors/factorizations/implementations.jl b/src/tensors/factorizations/implementations.jl
new file mode 100644
index 000000000..f341b4e70
--- /dev/null
+++ b/src/tensors/factorizations/implementations.jl
@@ -0,0 +1,161 @@
+_kindof(::Union{SVD,SDD}) = :svd
+_kindof(::Union{QR,QRpos}) = :qr
+_kindof(::Union{LQ,LQpos}) = :lq
+_kindof(::Polar) = :polar
+
+select_svd_algorithm(t, ::SVD) = LAPACK_QRIteration()
+select_svd_algorithm(t, ::SDD) = LAPACK_DivideAndConquer()
+select_svd_algorithm(t, alg::OFA) = throw(ArgumentError(lazy"Unknown algorithm $alg"))
+select_svd_algorithm(t::Base.ReshapedArray, alg) = select_svd_algorithm(parent(parent(t)), alg) 
+
+leftorth!(t; alg=nothing, kwargs...) = _leftorth!(t, alg; kwargs...)
+
+function _leftorth!(t::AbstractTensorMap, alg::Nothing, ; kwargs...)
+    return isempty(kwargs) ? left_orth!(t) : left_orth!(t; trunc=(; kwargs...))
+end
+function _leftorth!(t::AbstractTensorMap, alg::Union{QL,QLpos}; kwargs...)
+    trunc = isempty(kwargs) ? nothing : (; kwargs...)
+
+    if alg == QL() || alg == QLpos()
+        _reverse!(t; dims=2)
+        Q, R = left_orth!(t; kind=:qr, alg_qr=(; positive=alg == QLpos()), trunc)
+        _reverse!(Q; dims=2)
+        _reverse!(R)
+        return Q, R
+    end
+end
+function _leftorth!(t, alg::OFA; kwargs...)
+    trunc = isempty(kwargs) ? nothing : (; kwargs...)
+
+    Base.depwarn(lazy"$alg is deprecated", :leftorth!)
+
+    kind = _kindof(alg)
+    if kind == :svd
+        alg_svd = select_svd_algorithm(t, alg) 
+        return left_orth!(t; kind, alg_svd, trunc)
+    elseif kind == :qr
+        alg_qr = (; positive=(alg == QRpos()))
+        return left_orth!(t; kind, alg_qr, trunc)
+    elseif kind == :polar
+        return left_orth!(t; kind, trunc)
+    else
+        throw(ArgumentError(lazy"Invalid algorithm: $alg"))
+    end
+end
+# fallback to MatrixAlgebraKit version
+_leftorth!(t, alg; kwargs...) = left_orth!(t; alg, kwargs...)
+
+function leftnull!(t::AbstractTensorMap;
+                   alg::Union{QR,QRpos,SVD,SDD,Nothing}=nothing, kwargs...)
+    InnerProductStyle(t) === EuclideanInnerProduct() ||
+        throw_invalid_innerproduct(:leftnull!)
+    trunc = isempty(kwargs) ? nothing : (; kwargs...)
+
+    isnothing(alg) && return left_null!(t; trunc)
+
+    kind = _kindof(alg)
+    if kind == :svd
+        alg_svd = select_svd_algorithm(t, alg) 
+        return left_null!(t; kind, alg_svd, trunc)
+    elseif kind == :qr
+        alg_qr = (; positive=(alg == QRpos()))
+        return left_null!(t; kind, alg_qr, trunc)
+    else
+        throw(ArgumentError(lazy"Invalid `leftnull!` algorithm: $alg"))
+    end
+end
+
+function rightorth!(t::AbstractTensorMap;
+                    alg::Union{LQ,LQpos,RQ,RQpos,SVD,SDD,Polar,Nothing}=nothing, kwargs...)
+    InnerProductStyle(t) === EuclideanInnerProduct() ||
+        throw_invalid_innerproduct(:rightorth!)
+    trunc = isempty(kwargs) ? nothing : (; kwargs...)
+
+    isnothing(alg) && return right_orth!(t; trunc)
+
+    if alg == RQ() || alg == RQpos()
+        _reverse!(t; dims=1)
+        L, Q = right_orth!(t; kind=:lq, alg_lq=(; positive=alg == RQpos()), trunc)
+        _reverse!(Q; dims=1)
+        _reverse!(L)
+        return L, Q
+    end
+
+    kind = _kindof(alg)
+    if kind == :svd
+        alg_svd = select_svd_algorithm(t, alg) 
+        return right_orth!(t; kind, alg_svd, trunc)
+    elseif kind == :lq
+        alg_lq = (; positive=(alg == LQpos()))
+        return right_orth!(t; kind, alg_lq, trunc)
+    elseif kind == :polar
+        return right_orth!(t; kind, trunc)
+    else
+        throw(ArgumentError(lazy"Invalid `rightorth!` algorithm: $alg"))
+    end
+end
+
+function rightnull!(t::AbstractTensorMap;
+                    alg::Union{LQ,LQpos,SVD,SDD,Nothing}=nothing, kwargs...)
+    InnerProductStyle(t) === EuclideanInnerProduct() ||
+        throw_invalid_innerproduct(:rightnull!)
+    trunc = isempty(kwargs) ? nothing : (; kwargs...)
+
+    isnothing(alg) && return right_null!(t; trunc)
+
+    kind = _kindof(alg)
+    if kind == :svd
+        alg_svd = select_svd_algorithm(t, alg) 
+        return right_null!(t; kind, alg_svd, trunc)
+    elseif kind == :lq
+        alg_lq = (; positive=(alg == LQpos()))
+        return right_null!(t; kind, alg_lq, trunc)
+    else
+        throw(ArgumentError(lazy"Invalid `rightnull!` algorithm: $alg"))
+    end
+end
+
+# Eigenvalue decomposition
+# ------------------------
+function eigh!(t::AbstractTensorMap; trunc=notrunc(), kwargs...)
+    InnerProductStyle(t) === EuclideanInnerProduct() || throw_invalid_innerproduct(:eigh!)
+    if trunc == notrunc()
+        return eigh_full!(t; kwargs...)
+    else
+        return eigh_trunc!(t; trunc, kwargs...)
+    end
+end
+
+function eig!(t::AbstractTensorMap; trunc=notrunc(), kwargs...)
+    InnerProductStyle(t) === EuclideanInnerProduct() || throw_invalid_innerproduct(:eig!)
+
+    if trunc == notrunc()
+        return eig_full!(t; kwargs...)
+    else
+        return eig_trunc!(t; trunc, kwargs...)
+    end
+end
+
+function eigen!(t::AbstractTensorMap; kwargs...)
+    return ishermitian(t) ? eigh!(t; kwargs...) : eig!(t; kwargs...)
+end
+
+# Singular value decomposition
+# ----------------------------
+function tsvd!(t::AbstractTensorMap; trunc=notrunc(), p=nothing, alg=nothing, kwargs...)
+    InnerProductStyle(t) === EuclideanInnerProduct() || throw_invalid_innerproduct(:tsvd!)
+    isnothing(p) || Base.depwarn("p is no longer supported", :tsvd!)
+
+    if alg isa Union{OFA, SVD}
+        Base.depwarn(lazy"$alg is deprecated", :tsvd!)
+        alg_svd = select_svd_algorithm(t, alg)
+    else
+        alg_svd = alg
+    end
+
+    if trunc == notrunc()
+        return svd_compact!(t; alg_svd, kwargs...)
+    else
+        return svd_trunc!(t; trunc, alg_svd, kwargs...)
+    end
+end
diff --git a/src/tensors/factorizations/interface.jl b/src/tensors/factorizations/interface.jl
new file mode 100644
index 000000000..fc757a298
--- /dev/null
+++ b/src/tensors/factorizations/interface.jl
@@ -0,0 +1,241 @@
+@doc """
+    tsvd(t::AbstractTensorMap, [(leftind, rightind)::Index2Tuple];
+         trunc::TruncationScheme = notrunc(), p::Real = 2, alg::Union{SVD, SDD} = SDD())
+        -> U, S, V, ϵ
+    tsvd!(t::AbstractTensorMap, trunc::TruncationScheme = notrunc(), p::Real = 2, alg::Union{SVD, SDD} = SDD())
+        -> U, S, V, ϵ
+
+Compute the (possibly truncated) singular value decomposition such that
+`norm(permute(t, (leftind, rightind)) - U * S * V) ≈ ϵ`, where `ϵ` thus represents the truncation error.
+
+If `leftind` and `rightind` are not specified, the current partition of left and right
+indices of `t` is used. In that case, less memory is allocated if one allows the data in
+`t` to be destroyed/overwritten, by using `tsvd!(t, trunc = notrunc(), p = 2)`.
+
+A truncation parameter `trunc` can be specified for the new internal dimension, in which
+case a truncated singular value decomposition will be computed. Choices are:
+*   `notrunc()`: no truncation (default);
+*   `truncerr(η::Real)`: truncates such that the p-norm of the truncated singular values is
+    smaller than `η`;
+*   `truncdim(χ::Int)`: truncates such that the equivalent total dimension of the internal
+    vector space is no larger than `χ`;
+*   `truncspace(V)`: truncates such that the dimension of the internal vector space is
+    smaller than that of `V` in any sector.
+*   `truncbelow(η::Real)`: truncates such that every singular value is larger then `η` ;
+
+Truncation options can also be combined using `&`, i.e. `truncbelow(η) & truncdim(χ)` will
+choose the truncation space such that every singular value is larger than `η`, and the
+equivalent total dimension of the internal vector space is no larger than `χ`.
+
+The method `tsvd` also returns the truncation error `ϵ`, computed as the `p` norm of the
+singular values that were truncated.
+
+THe keyword `alg` can be equal to `SVD()` or `SDD()`, corresponding to the underlying LAPACK
+algorithm that computes the decomposition (`_gesvd` or `_gesdd`).
+
+Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and `tsvd(!)`
+is currently only implemented for `InnerProductStyle(t) === EuclideanInnerProduct()`.
+""" tsvd, tsvd!
+
+@doc """
+    eig(t::AbstractTensorMap, [(leftind, rightind)::Index2Tuple]; kwargs...) -> D, V
+    eig!(t::AbstractTensorMap; kwargs...) -> D, V
+
+Compute eigenvalue factorization of tensor `t` as linear map from `rightind` to `leftind`.
+The function `eig` assumes that the linear map is not hermitian and returns type stable
+complex valued `D` and `V` tensors for both real and complex valued `t`. See `eigh` for
+hermitian linear maps
+
+If `leftind` and `rightind` are not specified, the current partition of left and right
+indices of `t` is used. In that case, less memory is allocated if one allows the data in
+`t` to be destroyed/overwritten, by using `eig!(t)`. Note that the permuted tensor on
+which `eig!` is called should have equal domain and codomain, as otherwise the eigenvalue
+decomposition is meaningless and cannot satisfy
+```
+permute(t, (leftind, rightind)) * V = V * D
+```
+
+Accepts the same keyword arguments `scale` and `permute` as `eigen` of dense
+matrices. See the corresponding documentation for more information.
+
+See also `eigen` and `eigh`.
+""" eig
+
+@doc """
+    eigh(t::AbstractTensorMap, [(leftind, rightind)::Index2Tuple]; kwargs...) -> D, V
+    eigh!(t::AbstractTensorMap; kwargs...) -> D, V
+
+Compute eigenvalue factorization of tensor `t` as linear map from `rightind` to `leftind`.
+The function `eigh` assumes that the linear map is hermitian and `D` and `V` tensors with
+the same `scalartype` as `t`. See `eig` and `eigen` for non-hermitian tensors. Hermiticity
+requires that the tensor acts on inner product spaces, and the current implementation
+requires `InnerProductStyle(t) === EuclideanInnerProduct()`.
+
+If `leftind` and `rightind` are not specified, the current partition of left and right
+indices of `t` is used. In that case, less memory is allocated if one allows the data in
+`t` to be destroyed/overwritten, by using `eigh!(t)`. Note that the permuted tensor on
+which `eigh!` is called should have equal domain and codomain, as otherwise the eigenvalue
+decomposition is meaningless and cannot satisfy
+```
+permute(t, (leftind, rightind)) * V = V * D
+```
+
+See also `eigen` and `eig`.
+""" eigh, eigh!
+
+@doc """
+    leftorth(t::AbstractTensorMap, (leftind, rightind)::Index2Tuple;
+                alg::OrthogonalFactorizationAlgorithm = QRpos()) -> Q, R
+
+Create orthonormal basis `Q` for indices in `leftind`, and remainder `R` such that
+`permute(t, (leftind, rightind)) = Q*R`.
+
+If `leftind` and `rightind` are not specified, the current partition of left and right
+indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
+to be destroyed/overwritten, by using `leftorth!(t, alg = QRpos())`.
+
+Different algorithms are available, namely `QR()`, `QRpos()`, `SVD()` and `Polar()`. `QR()`
+and `QRpos()` use a standard QR decomposition, producing an upper triangular matrix `R`.
+`Polar()` produces a Hermitian and positive semidefinite `R`. `QRpos()` corrects the
+standard QR decomposition such that the diagonal elements of `R` are positive. Only
+`QRpos()` and `Polar()` are unique (no residual freedom) so that they always return the same
+result for the same input tensor `t`.
+
+Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and
+`leftorth(!)` is currently only implemented for 
+    `InnerProductStyle(t) === EuclideanInnerProduct()`.
+""" leftorth, leftorth!
+
+@doc """
+    rightorth(t::AbstractTensorMap, [(leftind, rightind)::Index2Tuple];
+                alg::OrthogonalFactorizationAlgorithm = LQpos()) -> L, Q
+    rightorth!(t::AbstractTensorMap; alg) -> L, Q
+
+Create orthonormal basis `Q` for indices in `rightind`, and remainder `L` such that
+`permute(t, (leftind, rightind)) = L*Q`.
+
+If `leftind` and `rightind` are not specified, the current partition of left and right
+indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
+to be destroyed/overwritten, by using `rightorth!(t, alg = LQpos())`.
+
+Different algorithms are available, namely `LQ()`, `LQpos()`, `RQ()`, `RQpos()`, `SVD()` and
+`Polar()`. `LQ()` and `LQpos()` produce a lower triangular matrix `L` and are computed using
+a QR decomposition of the transpose. `RQ()` and `RQpos()` produce an upper triangular
+remainder `L` and only works if the total left dimension is smaller than or equal to the
+total right dimension. `LQpos()` and `RQpos()` add an additional correction such that the
+diagonal elements of `L` are positive. `Polar()` produces a Hermitian and positive
+semidefinite `L`. Only `LQpos()`, `RQpos()` and `Polar()` are unique (no residual freedom)
+so that they always return the same result for the same input tensor `t`.
+
+Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and
+`rightorth(!)` is currently only implemented for 
+`InnerProductStyle(t) === EuclideanInnerProduct()`.
+""" rightorth, rightorth!
+
+@doc """
+    leftnull(t::AbstractTensorMap, [(leftind, rightind)::Index2Tuple];
+                alg::OrthogonalFactorizationAlgorithm = QRpos()) -> N
+    leftnull!(t::AbstractTensorMap; alg) -> N
+
+Create orthonormal basis for the orthogonal complement of the support of the indices in
+`leftind`, such that `N' * permute(t, (leftind, rightind)) = 0`.
+
+If `leftind` and `rightind` are not specified, the current partition of left and right
+indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
+to be destroyed/overwritten, by using `leftnull!(t, alg = QRpos())`.
+
+Different algorithms are available, namely `QR()` (or equivalently, `QRpos()`), `SVD()` and
+`SDD()`. The first assumes that the matrix is full rank and requires `iszero(atol)` and
+`iszero(rtol)`. With `SVD()` and `SDD()`, `rightnull` will use the corresponding singular
+value decomposition, and one can specify an absolute or relative tolerance for which
+singular values are to be considered zero, where `max(atol, norm(t)*rtol)` is used as upper
+bound.
+
+Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and
+`leftnull(!)` is currently only implemented for 
+`InnerProductStyle(t) === EuclideanInnerProduct()`.
+""" leftnull, leftnull!
+
+@doc """
+    rightnull(t::AbstractTensorMap, [(leftind, rightind)::Index2Tuple];
+                alg::OrthogonalFactorizationAlgorithm = LQ(),
+                atol::Real = 0.0,
+                rtol::Real = eps(real(float(one(scalartype(t)))))*iszero(atol)) -> N
+    rightnull!(t::AbstractTensorMap; alg, atol, rtol)
+
+Create orthonormal basis for the orthogonal complement of the support of the indices in
+`rightind`, such that `permute(t, (leftind, rightind))*N' = 0`.
+
+If `leftind` and `rightind` are not specified, the current partition of left and right
+indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
+to be destroyed/overwritten, by using `rightnull!(t, alg = LQpos())`.
+
+Different algorithms are available, namely `LQ()` (or equivalently, `LQpos`), `SVD()` and
+`SDD()`. The first assumes that the matrix is full rank and requires `iszero(atol)` and
+`iszero(rtol)`. With `SVD()` and `SDD()`, `rightnull` will use the corresponding singular
+value decomposition, and one can specify an absolute or relative tolerance for which
+singular values are to be considered zero, where `max(atol, norm(t)*rtol)` is used as upper
+bound.
+
+Orthogonality requires `InnerProductStyle(t) <: HasInnerProduct`, and
+`rightnull(!)` is currently only implemented for 
+`InnerProductStyle(t) === EuclideanInnerProduct()`.
+""" rightnull, rightnull!
+
+@doc """
+    eigen(t::AbstractTensorMap, [(leftind, rightind)::Index2Tuple]; kwargs...) -> D, V
+    eigen!(t::AbstractTensorMap; kwargs...) -> D, V
+
+Compute eigenvalue factorization of tensor `t` as linear map from `rightind` to `leftind`.
+
+If `leftind` and `rightind` are not specified, the current partition of left and right
+indices of `t` is used. In that case, less memory is allocated if one allows the data in `t`
+to be destroyed/overwritten, by using `eigen!(t)`. Note that the permuted tensor on which
+`eigen!` is called should have equal domain and codomain, as otherwise the eigenvalue
+decomposition is meaningless and cannot satisfy
+```
+permute(t, (leftind, rightind)) * V = V * D
+```
+
+Accepts the same keyword arguments `scale` and `permute` as `eigen` of dense
+matrices. See the corresponding documentation for more information.
+
+See also [`eig(!)`](@ref eig) and [`eigh(!)`](@ref)
+""" eigen(::AbstractTensorMap), eigen!(::AbstractTensorMap)
+
+@doc """
+    isposdef(t::AbstractTensor, [(leftind, rightind)::Index2Tuple]) -> ::Bool
+
+Test whether a tensor `t` is positive definite as linear map from `rightind` to `leftind`.
+
+If `leftind` and `rightind` are not specified, the current partition of left and right
+indices of `t` is used. In that case, less memory is allocated if one allows the data in
+`t` to be destroyed/overwritten, by using `isposdef!(t)`. Note that the permuted tensor on
+which `isposdef!` is called should have equal domain and codomain, as otherwise it is
+meaningless.
+""" isposdef(::AbstractTensorMap), isposdef!(::AbstractTensorMap)
+
+for f in
+    (:tsvd, :eig, :eigh, :eigen, :leftorth, :rightorth, :left_polar, :right_polar,
+     :leftnull,
+     :rightnull, :isposdef)
+    f! = Symbol(f, :!)
+    @eval function $f(t::AbstractTensorMap, p::Index2Tuple; kwargs...)
+        tcopy = permutedcopy_oftype(t, factorisation_scalartype($f, t), p)
+        return $f!(tcopy; kwargs...)
+    end
+    @eval function $f(t::AbstractTensorMap; kwargs...)
+        tcopy = copy_oftype(t, factorisation_scalartype($f, t))
+        return $f!(tcopy; kwargs...)
+    end
+end
+
+function LinearAlgebra.eigvals(t::AbstractTensorMap; kwargs...)
+    tcopy = copy_oftype(t, factorisation_scalartype(eigen, t))
+    return LinearAlgebra.eigvals!(tcopy; kwargs...)
+end
+
+function LinearAlgebra.svdvals(t::AbstractTensorMap)
+    tcopy = copy_oftype(t, factorisation_scalartype(tsvd, t))
+    return LinearAlgebra.svdvals!(tcopy)
+end
diff --git a/src/tensors/factorizations/matrixalgebrakit.jl b/src/tensors/factorizations/matrixalgebrakit.jl
new file mode 100644
index 000000000..2ddf3eb5e
--- /dev/null
+++ b/src/tensors/factorizations/matrixalgebrakit.jl
@@ -0,0 +1,562 @@
+# Algorithm selection
+# -------------------
+for f! in
+    [:svd_compact!, :svd_full!, :svd_trunc!, :svd_vals!, :qr_compact!, :qr_full!, :qr_null!,
+     :lq_compact!, :lq_full!, :lq_null!, :eig_full!, :eig_trunc!, :eig_vals!, :eigh_full!,
+     :eigh_trunc!, :eigh_vals!, :left_polar!, :right_polar!]
+    @eval function default_algorithm(::typeof($f!), ::Type{T};
+                                     kwargs...) where {T<:AbstractTensorMap}
+        return default_algorithm($f!, blocktype(T); kwargs...)
+    end
+end
+
+function _select_truncation(f, ::AbstractTensorMap,
+                            trunc::MatrixAlgebraKit.TruncationStrategy)
+    return trunc
+end
+function _select_truncation(::typeof(left_null!), ::AbstractTensorMap, trunc::NamedTuple)
+    return MatrixAlgebraKit.null_truncation_strategy(; trunc...)
+end
+
+# Generic Implementations
+# ----------------------_
+for f! in (:qr_compact!, :qr_full!,
+           :lq_compact!, :lq_full!,
+           :eig_full!, :eigh_full!,
+           :svd_compact!, :svd_full!,
+           :left_polar!, :left_orth_polar!,
+           :right_polar!, :right_orth_polar!,
+           :left_orth!, :right_orth!)
+    @eval function $f!(t::AbstractTensorMap, F, alg::AbstractAlgorithm)
+        check_input($f!, t, F, alg)
+        foreachblock(t, F...) do _, bs
+            factors = Base.tail(bs)
+            factors′ = $f!(first(bs), factors, alg)
+            # deal with the case where the output is not in-place
+            for (f′, f) in zip(factors′, factors)
+                f′ === f || copyto!(f, f′)
+            end
+            return nothing
+        end
+
+        return F
+    end
+end
+
+# Handle these separately because single output instead of tuple
+for f! in (:qr_null!, :lq_null!, :svd_vals!, :eig_vals!, :eigh_vals!)
+    @eval function $f!(t::AbstractTensorMap, N, alg::AbstractAlgorithm)
+        check_input($f!, t, N, alg)
+
+        foreachblock(t, N) do _, (b, n)
+            n′ = $f!(b, n, alg)
+            # deal with the case where the output is not the same as the input
+            n === n′ || copyto!(n, n′)
+            return nothing
+        end
+
+        return N
+    end
+end
+
+# Singular value decomposition
+# ----------------------------
+const _T_USVᴴ = Tuple{<:AbstractTensorMap,<:AbstractTensorMap,<:AbstractTensorMap}
+const _T_USVᴴ_diag = Tuple{<:AbstractTensorMap,<:DiagonalTensorMap,<:AbstractTensorMap}
+
+function check_input(::typeof(svd_full!), t::AbstractTensorMap, (U, S, Vᴴ)::_T_USVᴴ,
+                     ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar U t
+    @check_scalar S t real
+    @check_scalar Vᴴ t
+
+    # space checks
+    V_cod = fuse(codomain(t))
+    V_dom = fuse(domain(t))
+    @check_space(U, codomain(t) ← V_cod)
+    @check_space(S, V_cod ← V_dom)
+    @check_space(Vᴴ, V_dom ← domain(t))
+
+    return nothing
+end
+
+function check_input(::typeof(svd_compact!), t::AbstractTensorMap, (U, S, Vᴴ)::_T_USVᴴ_diag,
+                     ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar U t
+    @check_scalar S t real
+    @check_scalar Vᴴ t
+
+    # space checks
+    V_cod = V_dom = infimum(fuse(codomain(t)), fuse(domain(t)))
+    @check_space(U, codomain(t) ← V_cod)
+    @check_space(S, V_cod ← V_dom)
+    @check_space(Vᴴ, V_dom ← domain(t))
+
+    return nothing
+end
+
+function check_input(::typeof(svd_vals!), t::AbstractTensorMap, S::SectorDict,
+                     ::AbstractAlgorithm)
+    @check_scalar S t real
+    V_cod = infimum(fuse(codomain(t)), fuse(domain(t)))
+    @check_space(S, V_cod ← V_dom)
+    return nothing
+end
+
+function initialize_output(::typeof(svd_full!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    V_cod = fuse(codomain(t))
+    V_dom = fuse(domain(t))
+    U = similar(t, codomain(t) ← V_cod)
+    S = similar(t, real(scalartype(t)), V_cod ← V_dom)
+    Vᴴ = similar(t, V_dom ← domain(t))
+    return U, S, Vᴴ
+end
+
+function initialize_output(::typeof(svd_compact!), t::AbstractTensorMap,
+                           ::AbstractAlgorithm)
+    V_cod = V_dom = infimum(fuse(codomain(t)), fuse(domain(t)))
+    U = similar(t, codomain(t) ← V_cod)
+    S = DiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+    Vᴴ = similar(t, V_dom ← domain(t))
+    return U, S, Vᴴ
+end
+
+function initialize_output(::typeof(svd_trunc!), t::AbstractTensorMap,
+                           alg::TruncatedAlgorithm)
+    return initialize_output(svd_compact!, t, alg.alg)
+end
+
+function initialize_output(::typeof(svd_vals!), t::AbstractTensorMap,
+                           alg::AbstractAlgorithm)
+    V_cod = infimum(fuse(codomain(t)), fuse(domain(t)))
+    return DiagonalTensorMap{real(scalartype(t))}(undef, V_cod)
+end
+
+function svd_trunc!(t::AbstractTensorMap, USVᴴ, alg::TruncatedAlgorithm)
+    USVᴴ′ = svd_compact!(t, USVᴴ, alg.alg)
+    return truncate!(svd_trunc!, USVᴴ′, alg.trunc)
+end
+
+# Eigenvalue decomposition
+# ------------------------
+const _T_DV = Tuple{<:DiagonalTensorMap,<:AbstractTensorMap}
+
+function check_input(::typeof(eigh_full!), t::AbstractTensorMap, (D, V)::_T_DV,
+                     ::AbstractAlgorithm)
+    domain(t) == codomain(t) ||
+        throw(ArgumentError("Eigenvalue decomposition requires square input tensor"))
+
+    # scalartype checks
+    @check_scalar D t real
+    @check_scalar V t
+
+    # space checks
+    V_D = fuse(domain(t))
+    @check_space(D, V_D ← V_D)
+    @check_space(V, codomain(t) ← V_D)
+
+    return nothing
+end
+
+function check_input(::typeof(eig_full!), t::AbstractTensorMap, (D, V)::_T_DV,
+                     ::AbstractAlgorithm)
+    domain(t) == codomain(t) ||
+        throw(ArgumentError("Eigenvalue decomposition requires square input tensor"))
+
+    # scalartype checks
+    @check_scalar D t complex
+    @check_scalar V t complex
+
+    # space checks
+    V_D = fuse(domain(t))
+    @check_space(D, V_D ← V_D)
+    @check_space(V, codomain(t) ← V_D)
+
+    return nothing
+end
+
+function check_input(::typeof(eigh_vals!), t::AbstractTensorMap, D::DiagonalTensorMap,
+                     ::AbstractAlgorithm)
+    @check_scalar D t real
+    V_D = fuse(domain(t))
+    @check_space(D, V_D ← V_D)
+    return nothing
+end
+
+function check_input(::typeof(eig_vals!), t::AbstractTensorMap, D::DiagonalTensorMap,
+                     ::AbstractAlgorithm)
+    @check_scalar D t complex
+    V_D = fuse(domain(t))
+    @check_space(D, V_D ← V_D)
+    return nothing
+end
+
+function initialize_output(::typeof(eigh_full!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    D = DiagonalTensorMap{T}(undef, V_D)
+    V = similar(t, codomain(t) ← V_D)
+    return D, V
+end
+
+function initialize_output(::typeof(eig_full!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    D = DiagonalTensorMap{Tc}(undef, V_D)
+    V = similar(t, Tc, codomain(t) ← V_D)
+    return D, V
+end
+
+function initialize_output(::typeof(eigh_vals!), t::AbstractTensorMap,
+                           alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    T = real(scalartype(t))
+    return D = DiagonalTensorMap{Tc}(undef, V_D)
+end
+
+function initialize_output(::typeof(eig_vals!), t::AbstractTensorMap,
+                           alg::AbstractAlgorithm)
+    V_D = fuse(domain(t))
+    Tc = complex(scalartype(t))
+    return D = DiagonalTensorMap{Tc}(undef, V_D)
+end
+
+function initialize_output(::typeof(eigh_trunc!), t::AbstractTensorMap,
+                           alg::TruncatedAlgorithm)
+    return initialize_output(eigh_full!, t, alg.alg)
+end
+
+function initialize_output(::typeof(eig_trunc!), t::AbstractTensorMap,
+                           alg::TruncatedAlgorithm)
+    return initialize_output(eig_full!, t, alg.alg)
+end
+
+function eigh_trunc!(t::AbstractTensorMap, DV, alg::TruncatedAlgorithm)
+    DV′ = eigh_full!(t, DV, alg.alg)
+    return truncate!(eigh_trunc!, DV′, alg.trunc)
+end
+
+function eig_trunc!(t::AbstractTensorMap, DV, alg::TruncatedAlgorithm)
+    DV′ = eig_full!(t, DV, alg.alg)
+    return truncate!(eig_trunc!, DV′, alg.trunc)
+end
+
+# QR decomposition
+# ----------------
+const _T_QR = Tuple{<:AbstractTensorMap,<:AbstractTensorMap}
+
+function check_input(::typeof(qr_full!), t::AbstractTensorMap, (Q, R)::_T_QR,
+                     ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar Q t
+    @check_scalar R t
+
+    # space checks
+    V_Q = fuse(codomain(t))
+    @check_space(Q, codomain(t) ← V_Q)
+    @check_space(R, V_Q ← domain(t))
+
+    return nothing
+end
+
+function check_input(::typeof(qr_compact!), t::AbstractTensorMap, (Q, R)::_T_QR,
+                     ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar Q t
+    @check_scalar R t
+
+    # space checks
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    @check_space(Q, codomain(t) ← V_Q)
+    @check_space(R, V_Q ← domain(t))
+
+    return nothing
+end
+
+function check_input(::typeof(qr_null!), t::AbstractTensorMap, N::AbstractTensorMap,
+                     ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar N t
+
+    # space checks
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    V_N = ⊖(fuse(codomain(t)), V_Q)
+    @check_space(N, codomain(t) ← V_N)
+
+    return nothing
+end
+
+function initialize_output(::typeof(qr_full!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    V_Q = fuse(codomain(t))
+    Q = similar(t, codomain(t) ← V_Q)
+    R = similar(t, V_Q ← domain(t))
+    return Q, R
+end
+
+function initialize_output(::typeof(qr_compact!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    Q = similar(t, codomain(t) ← V_Q)
+    R = similar(t, V_Q ← domain(t))
+    return Q, R
+end
+
+function initialize_output(::typeof(qr_null!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    V_N = ⊖(fuse(codomain(t)), V_Q)
+    N = similar(t, codomain(t) ← V_N)
+    return N
+end
+
+# LQ decomposition
+# ----------------
+const _T_LQ = Tuple{<:AbstractTensorMap,<:AbstractTensorMap}
+
+function check_input(::typeof(lq_full!), t::AbstractTensorMap, (L, Q)::_T_LQ,
+                     ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar L t
+    @check_scalar Q t
+
+    # space checks
+    V_Q = fuse(domain(t))
+    @check_space(L, codomain(t) ← V_Q)
+    @check_space(Q, V_Q ← domain(t))
+
+    return nothing
+end
+
+function check_input(::typeof(lq_compact!), t::AbstractTensorMap, (L, Q)::_T_LQ,
+                     ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar L t
+    @check_scalar Q t
+
+    # space checks
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    @check_space(L, codomain(t) ← V_Q)
+    @check_space(Q, V_Q ← domain(t))
+
+    return nothing
+end
+
+function check_input(::typeof(lq_null!), t::AbstractTensorMap, N, ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar N t
+
+    # space checks
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    V_N = ⊖(fuse(domain(t)), V_Q)
+    @check_space(N, V_N ← domain(t))
+
+    return nothing
+end
+
+function initialize_output(::typeof(lq_full!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    V_Q = fuse(domain(t))
+    L = similar(t, codomain(t) ← V_Q)
+    Q = similar(t, V_Q ← domain(t))
+    return L, Q
+end
+
+function initialize_output(::typeof(lq_compact!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    L = similar(t, codomain(t) ← V_Q)
+    Q = similar(t, V_Q ← domain(t))
+    return L, Q
+end
+
+function initialize_output(::typeof(lq_null!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    V_N = ⊖(fuse(domain(t)), V_Q)
+    N = similar(t, V_N ← domain(t))
+    return N
+end
+
+# Polar decomposition
+# -------------------
+const _T_WP = Tuple{<:AbstractTensorMap,<:AbstractTensorMap}
+const _T_PWᴴ = Tuple{<:AbstractTensorMap,<:AbstractTensorMap}
+using MatrixAlgebraKit: PolarViaSVD
+
+function check_input(::typeof(left_polar!), t::AbstractTensorMap, (W, P)::_T_WP,
+                     ::AbstractAlgorithm)
+    codomain(t) ≿ domain(t) ||
+        throw(ArgumentError("Polar decomposition requires `codomain(t) ≿ domain(t)`"))
+
+    # scalartype checks
+    @check_scalar W t
+    @check_scalar P t
+
+    # space checks
+    @check_space(W, space(t))
+    @check_space(P, domain(t) ← domain(t))
+
+    return nothing
+end
+
+function check_input(::typeof(left_orth_polar!), t::AbstractTensorMap, (W, P)::_T_WP,
+                     ::AbstractAlgorithm)
+    codomain(t) ≿ domain(t) ||
+        throw(ArgumentError("Polar decomposition requires `codomain(t) ≿ domain(t)`"))
+
+    # scalartype checks
+    @check_scalar W t
+    @check_scalar P t
+
+    # space checks
+    VW = fuse(domain(t))
+    @check_space(W, codomain(t) ← VW)
+    @check_space(P, VW ← domain(t))
+
+    return nothing
+end
+
+function initialize_output(::typeof(left_polar!), t::AbstractTensorMap, ::AbstractAlgorithm)
+    W = similar(t, space(t))
+    P = similar(t, domain(t) ← domain(t))
+    return W, P
+end
+
+function check_input(::typeof(right_polar!), t::AbstractTensorMap, (P, Wᴴ)::_T_PWᴴ,
+                     ::AbstractAlgorithm)
+    codomain(t) ≾ domain(t) ||
+        throw(ArgumentError("Polar decomposition requires `domain(t) ≿ codomain(t)`"))
+
+    # scalartype checks
+    @check_scalar P t
+    @check_scalar Wᴴ t
+
+    # space checks
+    @check_space(P, codomain(t) ← codomain(t))
+    @check_space(Wᴴ, space(t))
+
+    return nothing
+end
+
+function check_input(::typeof(right_orth_polar!), t::AbstractTensorMap, (P, Wᴴ)::_T_PWᴴ,
+                     ::AbstractAlgorithm)
+    codomain(t) ≾ domain(t) ||
+        throw(ArgumentError("Polar decomposition requires `domain(t) ≿ codomain(t)`"))
+
+    # scalartype checks
+    @check_scalar P t
+    @check_scalar Wᴴ t
+
+    # space checks
+    VW = fuse(codomain(t))
+    @check_space(P, codomain(t) ← VW)
+    @check_space(Wᴴ, VW ← domain(t))
+
+    return nothing
+end
+
+function initialize_output(::typeof(right_polar!), t::AbstractTensorMap,
+                           ::AbstractAlgorithm)
+    P = similar(t, codomain(t) ← codomain(t))
+    Wᴴ = similar(t, space(t))
+    return Wᴴ, P
+end
+
+# Needed to get algorithm selection to behave
+function left_orth_polar!(t::AbstractTensorMap, VC, alg)
+    alg′ = MatrixAlgebraKit.select_algorithm(left_polar!, t, alg)
+    return left_orth_polar!(t, VC, alg′)
+end
+function right_orth_polar!(t::AbstractTensorMap, CVᴴ, alg)
+    alg′ = MatrixAlgebraKit.select_algorithm(right_polar!, t, alg)
+    return right_orth_polar!(t, CVᴴ, alg′)
+end
+
+# Orthogonalization
+# -----------------
+const _T_VC = Tuple{<:AbstractTensorMap,<:AbstractTensorMap}
+const _T_CVᴴ = Tuple{<:AbstractTensorMap,<:AbstractTensorMap}
+
+function check_input(::typeof(left_orth!), t::AbstractTensorMap, (V, C)::_T_VC,
+                     ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar V t
+    isnothing(C) || @check_scalar C t
+
+    # space checks
+    V_C = infimum(fuse(codomain(t)), fuse(domain(t)))
+    @check_space(V, codomain(t) ← V_C)
+    isnothing(C) || @check_space(C, V_C ← domain(t))
+
+    return nothing
+end
+
+function check_input(::typeof(right_orth!), t::AbstractTensorMap, (C, Vᴴ)::_T_CVᴴ,
+                     ::AbstractAlgorithm)
+    # scalartype checks
+    isnothing(C) || @check_scalar C t
+    @check_scalar Vᴴ t
+
+    # space checks
+    V_C = infimum(fuse(codomain(t)), fuse(domain(t)))
+    isnothing(C) || @check_space(C, codomain(t) ← V_C)
+    @check_space(Vᴴ, V_C ← domain(t))
+
+    return nothing
+end
+
+function initialize_output(::typeof(left_orth!), t::AbstractTensorMap)
+    V_C = infimum(fuse(codomain(t)), fuse(domain(t)))
+    V = similar(t, codomain(t) ← V_C)
+    C = similar(t, V_C ← domain(t))
+    return V, C
+end
+
+function initialize_output(::typeof(right_orth!), t::AbstractTensorMap)
+    V_C = infimum(fuse(codomain(t)), fuse(domain(t)))
+    C = similar(t, codomain(t) ← V_C)
+    Vᴴ = similar(t, V_C ← domain(t))
+    return C, Vᴴ
+end
+
+# Nullspace
+# ---------
+function check_input(::typeof(left_null!), t::AbstractTensorMap, N, ::AbstractAlgorithm)
+    # scalartype checks
+    @check_scalar N t
+
+    # space checks
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    V_N = ⊖(fuse(codomain(t)), V_Q)
+    @check_space(N, codomain(t) ← V_N)
+
+    return nothing
+end
+
+function check_input(::typeof(right_null!), t::AbstractTensorMap, N, ::AbstractAlgorithm)
+    @check_scalar N t
+
+    # space checks
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    V_N = ⊖(fuse(domain(t)), V_Q)
+    @check_space(N, V_N ← domain(t))
+
+    return nothing
+end
+
+function initialize_output(::typeof(left_null!), t::AbstractTensorMap)
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    V_N = ⊖(fuse(codomain(t)), V_Q)
+    N = similar(t, codomain(t) ← V_N)
+    return N
+end
+
+function initialize_output(::typeof(right_null!), t::AbstractTensorMap)
+    V_Q = infimum(fuse(codomain(t)), fuse(domain(t)))
+    V_N = ⊖(fuse(domain(t)), V_Q)
+    N = similar(t, V_N ← domain(t))
+    return N
+end
+
+for (f!, f_svd!) in zip((:left_null!, :right_null!), (:left_null_svd!, :right_null_svd!))
+    @eval function $f_svd!(t::AbstractTensorMap, N, alg, ::Nothing=nothing)
+        return $f!(t, N, alg)
+    end
+end
diff --git a/src/tensors/factorizations/truncation.jl b/src/tensors/factorizations/truncation.jl
new file mode 100644
index 000000000..b37f6dd8a
--- /dev/null
+++ b/src/tensors/factorizations/truncation.jl
@@ -0,0 +1,234 @@
+# Strategies
+# ----------
+notrunc() = NoTruncation()
+
+# deprecate
+const TruncationScheme = TruncationStrategy
+@deprecate truncdim(d::Int) truncrank(d)
+@deprecate truncbelow(ϵ::Real, add_back::Int=0) trunctol(ϵ)
+
+# TODO: add this to MatrixAlgebraKit
+struct TruncationError{T<:Real} <: TruncationStrategy
+    ϵ::T
+    p::Real
+end
+truncerr(epsilon::Real, p::Real=2) = TruncationError(epsilon, p)
+
+struct TruncationSpace{S<:ElementarySpace} <: TruncationStrategy
+    space::S
+end
+truncspace(space::ElementarySpace) = TruncationSpace(space)
+
+# Truncation
+# ----------
+function truncate!(::typeof(svd_trunc!), (U, S, Vᴴ)::_T_USVᴴ, strategy::TruncationStrategy)
+    ind = findtruncated_sorted(diagview(S), strategy)
+    V_truncated = spacetype(S)(c => length(I) for (c, I) in ind)
+    Ũ = similar(U, codomain(U) ← V_truncated)
+    for (c, b) in blocks(Ũ)
+        I = get(ind, c, nothing)
+        @assert !isnothing(I)
+        copyto!(b, @view(block(U, c)[:, I]))
+    end
+
+    S̃ = DiagonalTensorMap{scalartype(S), spacetype(S), storagetype(S)}(undef, V_truncated)
+    for (c, b) in blocks(S̃)
+        I = get(ind, c, nothing)
+        @assert !isnothing(I)
+        copyto!(b.diag, @view(block(S, c).diag[I]))
+    end
+
+    Ṽᴴ = similar(Vᴴ, V_truncated ← domain(Vᴴ))
+    for (c, b) in blocks(Ṽᴴ)
+        I = get(ind, c, nothing)
+        @assert !isnothing(I)
+        copyto!(b, @view(block(Vᴴ, c)[I, :]))
+    end
+
+    return Ũ, S̃, Ṽᴴ
+end
+
+function truncate!(::typeof(left_null!),
+                   (U, S)::Tuple{<:AbstractTensorMap,
+                                 <:AbstractTensorMap},
+                   strategy::MatrixAlgebraKit.TruncationStrategy)
+    extended_S = SectorDict(c => vcat(diagview(b),
+                                      zeros(eltype(b), max(0, size(b, 2) - size(b, 1))))
+                            for (c, b) in blocks(S))
+    ind = findtruncated(extended_S, strategy)
+    V_truncated = spacetype(S)(c => length(axes(b, 1)[ind[c]]) for (c, b) in blocks(S))
+    Ũ = similar(U, codomain(U) ← V_truncated)
+    for (c, b) in blocks(Ũ)
+        copy!(b, @view(block(U, c)[:, ind[c]]))
+    end
+    return Ũ
+end
+
+function truncate!(::typeof(eigh_trunc!), (D, V)::_T_DV, strategy::TruncationStrategy)
+    ind = findtruncated(diagview(D), strategy)
+    V_truncated = spacetype(D)(c => length(I) for (c, I) in ind)
+    
+    D̃ = DiagonalTensorMap{scalartype(D), spacetype(D), storagetype(D)}(undef, V_truncated)
+    for (c, b) in blocks(D̃)
+        I = get(ind, c, nothing)
+        @assert !isnothing(I)
+        copy!(b.diag, @view(block(D, c).diag[I]))
+    end
+
+    Ṽ = similar(V, V_truncated ← domain(V))
+    for (c, b) in blocks(Ṽ)
+        I = get(ind, c, nothing)
+        @assert !isnothing(I)
+        copy!(b, @view(block(V, c)[I, :]))
+    end
+
+    return D̃, Ṽ
+end
+function truncate!(::typeof(eig_trunc!), (D, V)::_T_DV, strategy::TruncationStrategy)
+    ind = findtruncated(diagview(D), strategy)
+    V_truncated = spacetype(D)(c => length(I) for (c, I) in ind)
+
+    D̃ = DiagonalTensorMap{scalartype(D), spacetype(D), storagetype(D)}(undef, V_truncated)
+    for (c, b) in blocks(D̃)
+        I = get(ind, c, nothing)
+        @assert !isnothing(I)
+        copy!(b.diag, @view(block(D, c).diag[I]))
+    end
+
+    Ṽ = similar(V, V_truncated ← domain(V))
+    for (c, b) in blocks(Ṽ)
+        I = get(ind, c, nothing)
+        @assert !isnothing(I)
+        copy!(b, @view(block(V, c)[I, :]))
+    end
+
+    return D̃, Ṽ
+end
+
+# Find truncation
+# ---------------
+# auxiliary functions
+rtol_to_atol(S, p, atol, rtol) = rtol > 0 ? max(atol, _norm(S, p) * rtol) : atol
+
+function _compute_truncerr(Σdata, truncdim, p=2)
+    I = keytype(Σdata)
+    S = scalartype(valtype(Σdata))
+    return TensorKit._norm((c => @view(v[(get(truncdim, c, 0) + 1):end])
+                            for (c, v) in Σdata),
+                           p, zero(S))
+end
+
+function _findnexttruncvalue(S, truncdim::SectorDict{I,Int}; by=identity,
+                             rev::Bool=true) where {I<:Sector}
+    # early return
+    (isempty(S) || all(iszero, values(truncdim))) && return nothing
+    if rev
+        σmin, imin = findmin(keys(truncdim)) do c
+            d = truncdim[c]
+            return by(S[c][d])
+        end
+        return σmin, keys(truncdim)[imin]
+    else
+        σmax, imax = findmax(keys(truncdim)) do c
+            d = truncdim[c]
+            return by(S[c][d])
+        end
+        return σmax, keys(truncdim)[imax]
+    end
+end
+
+# implementations
+function findtruncated_sorted(S::SectorDict, strategy::TruncationKeepAbove)
+    atol = rtol_to_atol(S, strategy.p, strategy.atol, strategy.rtol)
+    findtrunc = Base.Fix2(findtruncated_sorted, truncbelow(atol))
+    return SectorDict(c => findtrunc(d) for (c, d) in Sd)
+end
+function findtruncated(S::SectorDict, strategy::TruncationKeepAbove)
+    atol = rtol_to_atol(S, strategy.p, strategy.atol, strategy.rtol)
+    findtrunc = Base.Fix2(findtruncated, truncbelow(atol))
+    return SectorDict(c => findtrunc(d) for (c, d) in Sd)
+end
+
+function findtruncated_sorted(S::SectorDict, strategy::TruncationKeepBelow)
+    atol = rtol_to_atol(S, strategy.p, strategy.atol, strategy.rtol)
+    findtrunc = Base.Fix2(findtruncated_sorted, truncabove(atol))
+    return SectorDict(c => findtrunc(d) for (c, d) in Sd)
+end
+function findtruncated(S::SectorDict, strategy::TruncationKeepBelow)
+    atol = rtol_to_atol(S, strategy.p, strategy.atol, strategy.rtol)
+    findtrunc = Base.Fix2(findtruncated, truncabove(atol))
+    return SectorDict(c => findtrunc(d) for (c, d) in Sd)
+end
+
+function findtruncated_sorted(Sd::SectorDict, strategy::TruncationError)
+    I = keytype(Sd)
+    truncdim = SectorDict{I,Int}(c => length(d) for (c, d) in Sd)
+    while true
+        next = _findnexttruncvalue(Sd, truncdim)
+        isnothing(next) && break
+        σmin, cmin = next
+        truncdim[cmin] -= 1
+        err = _compute_truncerr(Sd, truncdim, strategy.p)
+        if err > strategy.ϵ
+            truncdim[cmin] += 1
+            break
+        end
+        if truncdim[cmin] == 0
+            delete!(truncdim, cmin)
+        end
+    end
+    return SectorDict{I,Base.OneTo{Int}}(c => Base.OneTo(d) for (c, d) in truncdim)
+end
+
+function findtruncated_sorted(Sd::SectorDict, strategy::TruncationKeepSorted)
+    return findtruncated(Sd, strategy)
+end
+function findtruncated(Sd::SectorDict, strategy::TruncationKeepSorted)
+    permutations = SectorDict(c => (sortperm(d; strategy.by, strategy.rev))
+                              for (c, d) in Sd)
+    Sd = SectorDict(c => sort(d; strategy.by, strategy.rev) for (c, d) in Sd)
+
+    I = keytype(Sd)
+    truncdim = SectorDict{I,Int}(c => length(d) for (c, d) in Sd)
+    totaldim = sum(dim(c) * d for (c, d) in truncdim; init=0)
+    while true
+        next = _findnexttruncvalue(Sd, truncdim; strategy.by, strategy.rev)
+        isnothing(next) && break
+        _, cmin = next
+        truncdim[cmin] -= 1
+        totaldim -= dim(cmin)
+        if totaldim < strategy.howmany
+            truncdim[cmin] += 1
+            break
+        end
+        if truncdim[cmin] == 0
+            delete!(truncdim, cmin)
+        end
+    end
+    return SectorDict(c => permutations[c][Base.OneTo(d)] for (c, d) in truncdim)
+end
+
+function findtruncated_sorted(Sd::SectorDict, strategy::TruncationSpace)
+    I = keytype(Sd)
+    return SectorDict{I,Base.OneTo{Int}}(c => Base.OneTo(min(length(d),
+                                                             dim(strategy.space, c)))
+                                         for (c, d) in Sd)
+end
+
+function findtruncated_sorted(Sd::SectorDict, strategy::TruncationKeepFiltered)
+    return SectorDict(c => findtruncated_sorted(d, strategy) for (c, d) in Sd)
+end
+function findtruncated(Sd::SectorDict, strategy::TruncationKeepFiltered)
+    return SectorDict(c => findtruncated(d, strategy) for (c, d) in Sd)
+end
+
+function findtruncated_sorted(Sd::SectorDict, strategy::TruncationIntersection)
+    inds = map(Base.Fix1(findtruncated_sorted, Sd), strategy)
+    return SectorDict(c => intersect(map(Base.Fix2(getindex, c), inds)...)
+                      for c in intersect(map(keys, inds)...))
+end
+function findtruncated(Sd::SectorDict, strategy::TruncationIntersection)
+    inds = map(Base.Fix1(findtruncated, Sd), strategy)
+    return SectorDict(c => intersect(map(Base.Fix2(getindex, c), inds)...)
+                      for c in intersect(map(keys, inds)...))
+end
diff --git a/src/tensors/factorizations/utility.jl b/src/tensors/factorizations/utility.jl
new file mode 100644
index 000000000..e23fb8b73
--- /dev/null
+++ b/src/tensors/factorizations/utility.jl
@@ -0,0 +1,29 @@
+# convenience to set default
+macro check_space(x, V)
+    return esc(:($MatrixAlgebraKit.@check_size($x, $V, $space)))
+end
+macro check_scalar(x, y, op=:identity, eltype=:scalartype)
+    return esc(:($MatrixAlgebraKit.@check_scalar($x, $y, $op, $eltype)))
+end
+
+function factorisation_scalartype(t::AbstractTensorMap)
+    T = scalartype(t)
+    return promote_type(Float32, typeof(zero(T) / sqrt(abs2(one(T)))))
+end
+factorisation_scalartype(f, t) = factorisation_scalartype(t)
+
+function permutedcopy_oftype(t::AbstractTensorMap, T::Type{<:Number}, p::Index2Tuple)
+    return permute!(similar(t, T, permute(space(t), p)), t, p)
+end
+function copy_oftype(t::AbstractTensorMap, T::Type{<:Number})
+    return copy!(similar(t, T), t)
+end
+
+function _reverse!(t::AbstractTensorMap; dims=:)
+    for (c, b) in blocks(t)
+        reverse!(b; dims)
+    end
+    return t
+end
+
+diagview(t::AbstractTensorMap) = SectorDict(c => diagview(b) for (c, b) in blocks(t))
diff --git a/src/tensors/linalg.jl b/src/tensors/linalg.jl
index f29bdf809..4f00f1444 100644
--- a/src/tensors/linalg.jl
+++ b/src/tensors/linalg.jl
@@ -59,7 +59,7 @@ function one!(t::AbstractTensorMap)
     domain(t) == codomain(t) ||
         throw(SectorMismatch("no identity if domain and codomain are different"))
     for (c, b) in blocks(t)
-        MatrixAlgebra.one!(b)
+        one!(b)
     end
     return t
 end
@@ -106,7 +106,7 @@ function isomorphism!(t::AbstractTensorMap)
     domain(t) ≅ codomain(t) ||
         throw(SpaceMismatch(lazy"domain and codomain are not isomorphic: $(space(t))"))
     for (_, b) in blocks(t)
-        MatrixAlgebra.one!(b)
+        one!(b)
     end
     return t
 end
@@ -155,7 +155,7 @@ function isometry!(t::AbstractTensorMap)
     domain(t) ≾ codomain(t) ||
         throw(SpaceMismatch(lazy"domain and codomain are not isometrically embeddable: $(space(t))"))
     for (_, b) in blocks(t)
-        MatrixAlgebra.one!(b)
+        one!(b)
     end
     return t
 end
@@ -268,7 +268,7 @@ function _norm(blockiter, p::Real, init::Real)
         end
     elseif p == 2
         n² = mapreduce(+, blockiter; init=init) do (c, b)
-            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.norm2(b)^2)
+            return isempty(b) ? init : oftype(init, dim(c) * norm(b, 2)^2)
         end
         return sqrt(n²)
     elseif p == 1
@@ -277,7 +277,7 @@ function _norm(blockiter, p::Real, init::Real)
         end
     elseif p > 0
         nᵖ = mapreduce(+, blockiter; init=init) do (c, b)
-            return isempty(b) ? init : oftype(init, dim(c) * LinearAlgebra.normp(b, p)^p)
+            return isempty(b) ? init : oftype(init, dim(c) * norm(b, p)^p)
         end
         return (nᵖ)^inv(oftype(nᵖ, p))
     else
@@ -377,7 +377,7 @@ function Base.inv(t::AbstractTensorMap)
     T = float(scalartype(t))
     tinv = similar(t, T, dom ← cod)
     for (c, b) in blocks(t)
-        binv = MatrixAlgebra.one!(block(tinv, c))
+        binv = one!(block(tinv, c))
         ldiv!(lu(b), binv)
     end
     return tinv
@@ -449,11 +449,11 @@ for f in (:cos, :sin, :tan, :cot, :cosh, :sinh, :tanh, :coth, :atan, :acot, :asi
         tf = similar(t, T)
         if T <: Real
             for (c, b) in blocks(t)
-                copy!(block(tf, c), real(MatrixAlgebra.$f(b)))
+                copy!(block(tf, c), real(MatrixAlgebraKit.$f(b)))
             end
         else
             for (c, b) in blocks(t)
-                copy!(block(tf, c), MatrixAlgebra.$f(b))
+                copy!(block(tf, c), MatrixAlgebraKit.$f(b))
             end
         end
         return tf
@@ -544,8 +544,8 @@ function ⊗(t1::AbstractTensorMap, t2::AbstractTensorMap)
                             d4 = dim(dom2, f2r.uncoupled)
                             m1 = sreshape(t1[f1l, f1r], (d1, 1, d3, 1))
                             m2 = sreshape(t2[f2l, f2r], (1, d2, 1, d4))
-                            m = sreshape(t[fl, fr], (d1, d2, d3, d4))
-                            m .+= coeff1 .* conj(coeff2) .* m1 .* m2
+                            m  = sreshape(t[fl, fr], (d1, d2, d3, d4))
+                            m .+= coeff1 .* conj.(coeff2) .* m1 .* m2
                         end
                     end
                 end
diff --git a/src/tensors/tensor.jl b/src/tensors/tensor.jl
index a918478c3..914da3c3b 100644
--- a/src/tensors/tensor.jl
+++ b/src/tensors/tensor.jl
@@ -138,7 +138,7 @@ function TensorMap(data::AbstractDict{<:Sector,<:AbstractMatrix},
         datac = data[c]
         size(datac) == size(b) ||
             throw(DimensionMismatch("wrong size of block for sector $c"))
-        copy!(b, datac)
+        copyto!(b, datac)
     end
     for (c, b) in data
         c ∈ blocksectors(t) || isempty(b) ||
diff --git a/src/tensors/truncation.jl b/src/tensors/truncation.jl
deleted file mode 100644
index e49cdc94d..000000000
--- a/src/tensors/truncation.jl
+++ /dev/null
@@ -1,163 +0,0 @@
-# truncation.jl
-#
-# Implements truncation schemes for truncating a tensor with svd, leftorth or rightorth
-abstract type TruncationScheme end
-
-struct NoTruncation <: TruncationScheme
-end
-notrunc() = NoTruncation()
-
-struct TruncationError{T<:Real} <: TruncationScheme
-    ϵ::T
-end
-truncerr(epsilon::Real) = TruncationError(epsilon)
-
-struct TruncationDimension <: TruncationScheme
-    dim::Int
-end
-truncdim(d::Int) = TruncationDimension(d)
-
-struct TruncationSpace{S<:ElementarySpace} <: TruncationScheme
-    space::S
-end
-truncspace(space::ElementarySpace) = TruncationSpace(space)
-
-struct TruncationCutoff{T<:Real} <: TruncationScheme
-    ϵ::T
-    add_back::Int
-end
-truncbelow(epsilon::Real, add_back::Int=0) = TruncationCutoff(epsilon, add_back)
-
-# Compute the total truncation error given truncation dimensions
-function _compute_truncerr(Σdata, truncdim, p=2)
-    I = keytype(Σdata)
-    S = scalartype(valtype(Σdata))
-    return _norm((c => view(v, (truncdim[c] + 1):length(v)) for (c, v) in Σdata), p,
-                 zero(S))
-end
-
-# Compute truncation dimensions
-function _compute_truncdim(Σdata, ::NoTruncation, p=2)
-    I = keytype(Σdata)
-    truncdim = SectorDict{I,Int}(c => length(v) for (c, v) in Σdata)
-    return truncdim
-end
-function _compute_truncdim(Σdata, trunc::TruncationError, p=2)
-    I = keytype(Σdata)
-    S = real(eltype(valtype(Σdata)))
-    truncdim = SectorDict{I,Int}(c => length(Σc) for (c, Σc) in Σdata)
-    truncerr = zero(S)
-    while true
-        cmin = _findnexttruncvalue(Σdata, truncdim, p)
-        isnothing(cmin) && break
-        truncdim[cmin] -= 1
-        truncerr = _compute_truncerr(Σdata, truncdim, p)
-        if truncerr > trunc.ϵ
-            truncdim[cmin] += 1
-            break
-        end
-    end
-    return truncdim
-end
-function _compute_truncdim(Σdata, trunc::TruncationDimension, p=2)
-    I = keytype(Σdata)
-    truncdim = SectorDict{I,Int}(c => length(v) for (c, v) in Σdata)
-    while sum(dim(c) * d for (c, d) in truncdim) > trunc.dim
-        cmin = _findnexttruncvalue(Σdata, truncdim, p)
-        isnothing(cmin) && break
-        truncdim[cmin] -= 1
-    end
-    return truncdim
-end
-function _compute_truncdim(Σdata, trunc::TruncationSpace, p=2)
-    I = keytype(Σdata)
-    truncdim = SectorDict{I,Int}(c => min(length(v), dim(trunc.space, c))
-                                 for (c, v) in Σdata)
-    return truncdim
-end
-
-function _compute_truncdim(Σdata, trunc::TruncationCutoff, p=2)
-    I = keytype(Σdata)
-    truncdim = SectorDict{I,Int}(c => length(v) for (c, v) in Σdata)
-    for (c, v) in Σdata
-        newdim = findlast(Base.Fix2(>, trunc.ϵ), v)
-        if newdim === nothing
-            truncdim[c] = 0
-        else
-            truncdim[c] = newdim
-        end
-    end
-    for i in 1:(trunc.add_back)
-        cmax = _findnextgrowvalue(Σdata, truncdim, p)
-        isnothing(cmax) && break
-        truncdim[cmax] += 1
-    end
-    return truncdim
-end
-
-# Combine truncations
-struct MultipleTruncation{T<:Tuple{Vararg{TruncationScheme}}} <: TruncationScheme
-    truncations::T
-end
-function Base.:&(a::MultipleTruncation, b::MultipleTruncation)
-    return MultipleTruncation((a.truncations..., b.truncations...))
-end
-function Base.:&(a::MultipleTruncation, b::TruncationScheme)
-    return MultipleTruncation((a.truncations..., b))
-end
-function Base.:&(a::TruncationScheme, b::MultipleTruncation)
-    return MultipleTruncation((a, b.truncations...))
-end
-Base.:&(a::TruncationScheme, b::TruncationScheme) = MultipleTruncation((a, b))
-
-function _compute_truncdim(Σdata, trunc::MultipleTruncation, p::Real=2)
-    truncdim = _compute_truncdim(Σdata, trunc.truncations[1], p)
-    for k in 2:length(trunc.truncations)
-        truncdimₖ = _compute_truncdim(Σdata, trunc.truncations[k], p)
-        for (c, d) in truncdim
-            truncdim[c] = min(d, truncdimₖ[c])
-        end
-    end
-    return truncdim
-end
-
-# auxiliary function
-function _findnexttruncvalue(Σdata, truncdim::SectorDict{I,Int}, p::Real) where {I<:Sector}
-    # early return
-    (isempty(Σdata) || all(iszero, values(truncdim))) && return nothing
-
-    # find some suitable starting candidate
-    cmin = findfirst(>(0), truncdim)
-    σmin = Σdata[cmin][truncdim[cmin]]
-
-    # find the actual minimum singular value
-    for (c, σs) in Σdata
-        if truncdim[c] > 0
-            σ = σs[truncdim[c]]
-            if σ < σmin
-                cmin, σmin = c, σ
-            end
-        end
-    end
-    return cmin
-end
-function _findnextgrowvalue(Σdata, truncdim::SectorDict{I,Int}, p::Real) where {I<:Sector}
-    istruncated = SectorDict{I,Bool}(c => (d < length(Σdata[c])) for (c, d) in truncdim)
-    # early return
-    (isempty(Σdata) || !any(values(istruncated))) && return nothing
-
-    # find some suitable starting candidate
-    cmax = findfirst(istruncated)
-    σmax = Σdata[cmax][truncdim[cmax] + 1]
-
-    # find the actual maximal singular value
-    for (c, σs) in Σdata
-        if istruncated[c]
-            σ = σs[truncdim[c] + 1]
-            if σ > σmax
-                cmax, σmax = c, σ
-            end
-        end
-    end
-    return cmax
-end
diff --git a/test/ad.jl b/test/ad.jl
index e5e2d884d..4013846d0 100644
--- a/test/ad.jl
+++ b/test/ad.jl
@@ -398,7 +398,7 @@ Vlist = ((ℂ^2, (ℂ^3)', ℂ^3, ℂ^2, (ℂ^2)'),
             test_rrule(eigh′, H; atol, output_tangent=(ΔD, ΔU))
         end
 
-        let (U, S, V, ϵ) = tsvd(A)
+        let (U, S, V) = tsvd(A)
             ΔU = randn(scalartype(U), space(U))
             ΔS = randn(scalartype(S), space(S))
             ΔV = randn(scalartype(V), space(V))
@@ -408,54 +408,53 @@ Vlist = ((ℂ^2, (ℂ^3)', ℂ^3, ℂ^2, (ℂ^2)'),
                     mul!(block(ΔU, c), block(U, c), Diagonal(imag(diag(b))), -im, 1)
                 end
             end
-            test_rrule(tsvd, A; atol, output_tangent=(ΔU, ΔS, ΔV, 0.0))
+            test_rrule(tsvd, A; atol, output_tangent=(ΔU, ΔS, ΔV))
 
             allS = mapreduce(x -> diag(x[2]), vcat, blocks(S))
             truncval = (maximum(allS) + minimum(allS)) / 2
-            U, S, V, ϵ = tsvd(A; trunc=truncerr(truncval))
+            U, S, V = tsvd(A; trunc=truncerr(truncval))
             ΔU = randn(scalartype(U), space(U))
             ΔS = randn(scalartype(S), space(S))
             ΔV = randn(scalartype(V), space(V))
             T <: Complex && remove_svdgauge_depence!(ΔU, ΔV, U, S, V)
-            test_rrule(tsvd, A; atol, output_tangent=(ΔU, ΔS, ΔV, 0.0),
+            test_rrule(tsvd, A; atol, output_tangent=(ΔU, ΔS, ΔV),
                        fkwargs=(; trunc=truncerr(truncval)))
         end
 
-        let (U, S, V, ϵ) = tsvd(B)
+        let (U, S, V) = tsvd(B)
             ΔU = randn(scalartype(U), space(U))
             ΔS = randn(scalartype(S), space(S))
             ΔV = randn(scalartype(V), space(V))
             T <: Complex && remove_svdgauge_depence!(ΔU, ΔV, U, S, V)
-            test_rrule(tsvd, B; atol, output_tangent=(ΔU, ΔS, ΔV, 0.0))
+            test_rrule(tsvd, B; atol, output_tangent=(ΔU, ΔS, ΔV))
 
             Vtrunc = spacetype(S)(TensorKit.SectorDict(c => ceil(Int, size(b, 1) / 2)
                                                        for (c, b) in blocks(S)))
 
-            U, S, V, ϵ = tsvd(B; trunc=truncspace(Vtrunc))
+            U, S, V = tsvd(B; trunc=truncspace(Vtrunc))
             ΔU = randn(scalartype(U), space(U))
             ΔS = randn(scalartype(S), space(S))
             ΔV = randn(scalartype(V), space(V))
             T <: Complex && remove_svdgauge_depence!(ΔU, ΔV, U, S, V)
-            test_rrule(tsvd, B; atol, output_tangent=(ΔU, ΔS, ΔV, 0.0),
+            test_rrule(tsvd, B; atol, output_tangent=(ΔU, ΔS, ΔV),
                        fkwargs=(; trunc=truncspace(Vtrunc)))
         end
 
-        let (U, S, V, ϵ) = tsvd(C)
+        let (U, S, V) = tsvd(C)
             ΔU = randn(scalartype(U), space(U))
             ΔS = randn(scalartype(S), space(S))
             ΔV = randn(scalartype(V), space(V))
             T <: Complex && remove_svdgauge_depence!(ΔU, ΔV, U, S, V)
-            test_rrule(tsvd, C; atol, output_tangent=(ΔU, ΔS, ΔV, 0.0))
+            test_rrule(tsvd, C; atol, output_tangent=(ΔU, ΔS, ΔV))
 
-            c, = TensorKit.MatrixAlgebra._argmax(x -> sqrt(dim(x[1])) * maximum(diag(x[2])),
-                                                 blocks(S))
+            c, = argmax(x -> sqrt(dim(x[1])) * maximum(diag(x[2])), blocks(S))
             trunc = truncdim(round(Int, 2 * dim(c)))
-            U, S, V, ϵ = tsvd(C; trunc)
+            U, S, V = tsvd(C; trunc)
             ΔU = randn(scalartype(U), space(U))
             ΔS = randn(scalartype(S), space(S))
             ΔV = randn(scalartype(V), space(V))
             T <: Complex && remove_svdgauge_depence!(ΔU, ΔV, U, S, V)
-            test_rrule(tsvd, C; atol, output_tangent=(ΔU, ΔS, ΔV, 0.0), fkwargs=(; trunc))
+            test_rrule(tsvd, C; atol, output_tangent=(ΔU, ΔS, ΔV), fkwargs=(; trunc))
         end
 
         let D = LinearAlgebra.eigvals(C)
diff --git a/test/amdgpu.jl b/test/amdgpu.jl
new file mode 100644
index 000000000..b7d1bfb41
--- /dev/null
+++ b/test/amdgpu.jl
@@ -0,0 +1,597 @@
+using Adapt
+ad = adapt(Array)
+
+const AMDGPUExt = Base.get_extension(TensorKit, :TensorKitAMDGPUExt)
+@assert !isnothing(AMDGPUExt)
+# const ROCTensorMap{T,S,N1,N2,I,A} = AMDGPUExt.ROCTensorMap{T,S,N1,N2,I,A}
+const ROCTensorMap = getglobal(AMDGPUExt, :ROCTensorMap)
+
+for V in (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃)
+    V1, V2, V3, V4, V5 = V
+    @assert V3 * V4 * V2 ≿ V1' * V5' # necessary for leftorth tests
+    @assert V3 * V4 ≾ V1' * V2' * V5' # necessary for rightorth tests
+end
+
+spacelist = try
+    if ENV["CI"] == "true"
+        println("Detected running on CI")
+        if Sys.iswindows()
+            (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂)
+        elseif Sys.isapple()
+            (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VfU₁, VfSU₂)#, VSU₃)
+        else
+            (Vtr, Vℤ₂, Vfℤ₂, VU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃)
+        end
+    else
+        (Vtr, VU₁, VSU₂, Vfℤ₂)
+    end
+catch
+    (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃)
+end
+
+for V in spacelist
+    I = sectortype(first(V))
+    Istr = TensorKit.type_repr(I)
+    println("---------------------------------------")
+    println("AMDGPU Tensors with symmetry: $Istr")
+    println("---------------------------------------")
+    @timedtestset "Tensors with symmetry: $Istr" verbose = true begin
+        V1, V2, V3, V4, V5 = V
+        @timedtestset "Basic tensor properties" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            for T in (Int, Float32, Float64, ComplexF32, ComplexF64)
+                t = @constinferred AMDGPU.zeros(T, W)
+                # @test @constinferred(hash(t)) == hash(deepcopy(t)) # hash is not defined for CuArray?
+                @test scalartype(t) == T
+                @test norm(t) == 0
+                @test codomain(t) == W
+                @test space(t) == (W ← one(W))
+                @test domain(t) == one(W)
+                @test typeof(t) ==
+                      @constinferred tensormaptype(spacetype(t), 5, 0, storagetype(t))
+            end
+        end
+        @timedtestset "Tensor Dict conversion" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Int, Float32, ComplexF64)
+                t = @constinferred AMDGPU.rand(T, W)
+                d = convert(Dict, t)
+                @test t == convert(ROCTensorMap, d)
+            end
+        end
+        if hasfusiontensor(I)
+            @timedtestset "Tensor Array conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+
+                # cuTENSOR does not support Int
+                Ts = sectortype(W) === Trivial ? (Int, Float32, ComplexF64) :
+                     (Float32, ComplexF64)
+                for T in Ts
+                    t = @constinferred AMDGPU.rand(T, W)
+                    a = @constinferred convert(CuArray, t)
+                    @test t ≈ @constinferred TensorMap(a, W)
+                    # also test if input is matrix
+                    a2 = reshape(a, prod(dim, codomain(t)), prod(dim, domain(t)))
+                    @test t ≈ @constinferred TensorMap(a2, codomain(t), domain(t))
+                end
+            end
+        end
+        @timedtestset "Basic linear algebra" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                t = @constinferred AMDGPU.rand(T, W)
+                @test scalartype(t) == T
+                @test space(t) == W
+                @test space(t') == W'
+                @test dim(t) == dim(space(t))
+                @test codomain(t) == codomain(W)
+                @test domain(t) == domain(W)
+                @test isa(@constinferred(norm(t)), real(T))
+                @test norm(t)^2 ≈ dot(t, t)
+                α = rand(T)
+                @test norm(α * t) ≈ abs(α) * norm(t)
+                @test norm(t + t, 2) ≈ 2 * norm(t, 2)
+                @test norm(t + t, 1) ≈ 2 * norm(t, 1)
+                @test norm(t + t, Inf) ≈ 2 * norm(t, Inf)
+                p = 3 * rand(Float64)
+                @test norm(t + t, p) ≈ 2 * norm(t, p)
+                @test norm(t) ≈ norm(t')
+
+                t2 = @constinferred rand!(similar(t))
+                β = rand(T)
+                @test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t2', t'))
+                @test dot(t2, t) ≈ dot(t', t2')
+
+                A = storagetype(t)
+
+                i1 = @constinferred(isomorphism(CuMatrix{T}, V1 ⊗ V2, V2 ⊗ V1))
+                i2 = @constinferred(isomorphism(CuMatrix{T}, V2 ⊗ V1, V1 ⊗ V2))
+                @test i1 * i2 == @constinferred(id(CuMatrix{T}, V1 ⊗ V2))
+                @test i2 * i1 == @constinferred(id(CuMatrix{T}, V2 ⊗ V1))
+
+                w = @constinferred(isometry(CuMatrix{T}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)),
+                                            V1))
+                @test dim(w) == 2 * dim(V1 ← V1)
+                @test w' * w == id(CuMatrix{T}, V1)
+                @test w * w' == (w * w')^2
+            end
+        end
+        if hasfusiontensor(I)
+            @timedtestset "Basic linear algebra: test via conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+                for T in (Float32, ComplexF64)
+                    t = AMDGPU.rand(T, W)
+                    t2 = @constinferred rand!(similar(t))
+                    @test norm(t, 2) ≈ norm(ad(t), 2)
+                    @test dot(t2, t) ≈ dot(ad(t2), ad(t))
+                    α = rand(T)
+                    @test ad(α * t) ≈ α * ad(t)
+                    @test ad(t + t) ≈ 2 * ad(t)
+                end
+            end
+            @timedtestset "Real and imaginary parts" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64, ComplexF32)
+                    t = @constinferred AMDGPU.randn(T, W, W)
+                    @test real(ad(t)) == ad(@constinferred real(t))
+                    @test imag(ad(t)) == ad(@constinferred imag(t))
+                end
+            end
+        end
+        @timedtestset "Tensor conversion" begin
+            W = V1 ⊗ V2
+            t = @constinferred AMDGPU.randn(W ← W)
+            @test typeof(convert(TensorMap, t')) == typeof(t)
+            tc = complex(t)
+            @test convert(typeof(tc), t) == tc
+            @test typeof(convert(typeof(tc), t)) == typeof(tc)
+            @test typeof(convert(typeof(tc), t')) == typeof(tc)
+            @test Base.promote_typeof(t, tc) == typeof(tc)
+            @test Base.promote_typeof(tc, t) == typeof(tc + t)
+        end
+        @timedtestset "diag/diagm" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            t = AMDGPU.randn(ComplexF64, W)
+            d = LinearAlgebra.diag(t)
+            D = LinearAlgebra.diagm(codomain(t), domain(t), d)
+            @test LinearAlgebra.isdiag(D)
+            @test LinearAlgebra.diag(D) == d
+        end
+        @timedtestset "Permutations: test via inner product invariance" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            t = AMDGPU.rand(ComplexF64, W)
+            t′ = randn!(similar(t))
+            for k in 0:5
+                for p in permutations(1:5)
+                    p1 = ntuple(n -> p[n], k)
+                    p2 = ntuple(n -> p[k + n], 5 - k)
+                    t2 = @constinferred permute(t, (p1, p2))
+                    @test norm(t2) ≈ norm(t)
+                    t2′ = permute(t′, (p1, p2))
+                    @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2))
+                end
+
+                t3 = VERSION < v"1.7" ? repartition(t, k) :
+                     @constinferred repartition(t, $k)
+                @test norm(t3) ≈ norm(t)
+                t3′ = @constinferred repartition!(similar(t3), t′)
+                @test norm(t3′) ≈ norm(t′)
+                @test dot(t′, t) ≈ dot(t3′, t3)
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Permutations: test via conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+                t = AMDGPU.rand(ComplexF64, W)
+                a = ad(t)
+                for k in 0:5
+                    for p in permutations(1:5)
+                        p1 = ntuple(n -> p[n], k)
+                        p2 = ntuple(n -> p[k + n], 5 - k)
+                        t2 = permute(t, (p1, p2))
+                        a2 = ad(t2)
+                        @test a2 ≈ permute(a, (p1, p2))
+                        @test ad(transpose(t2)) ≈ transpose(a2)
+                    end
+
+                    t3 = repartition(t, k)
+                    a3 = ad(t3)
+                    @test a3 ≈ repartition(a, k)
+                end
+            end
+        end
+        @timedtestset "Full trace: test self-consistency" begin
+            t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
+            t2 = permute(t, ((1, 2), (4, 3)))
+            s = @constinferred tr(t2)
+            @test conj(s) ≈ tr(t2')
+            if !isdual(V1)
+                t2 = twist!(t2, 1)
+            end
+            if isdual(V2)
+                t2 = twist!(t2, 2)
+            end
+            ss = tr(t2)
+            @tensor s2 = t[a, b, b, a]
+            @tensor t3[a, b] := t[a, c, c, b]
+            @tensor s3 = t3[a, a]
+            @test ss ≈ s2
+            @test ss ≈ s3
+        end
+        @timedtestset "Partial trace: test self-consistency" begin
+            t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+            @tensor t2[a, b] := t[c, d, b, d, c, a]
+            @tensor t4[a, b, c, d] := t[d, e, b, e, c, a]
+            @tensor t5[a, b] := t4[a, b, c, c]
+            @test t2 ≈ t5
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Trace: test via conversion" begin
+                t = AMDGPU.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+                @tensor t2[a, b] := t[c, d, b, d, c, a]
+                @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
+                @test t3 ≈ ad(t2)
+            end
+        end
+        @timedtestset "Trace and contraction" begin
+            t1 = AMDGPU.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
+            t2 = AMDGPU.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
+            t3 = t1 ⊗ t2
+            @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
+            @tensor tb[a, b] := t3[x, y, a, y, b, x]
+            @test ta ≈ tb
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor contraction: test via conversion" begin
+                A1 = AMDGPU.randn(ComplexF64, V1' * V2', V3')
+                A2 = AMDGPU.randn(ComplexF64, V3 * V4, V5)
+                rhoL = AMDGPU.randn(ComplexF64, V1, V1)
+                rhoR = AMDGPU.randn(ComplexF64, V5, V5)' # test adjoint tensor
+                H = AMDGPU.randn(ComplexF64, V2 * V4, V2 * V4)
+                @tensor HrA12[a, s1, s2, c] := rhoL[a, a'] * conj(A1[a', t1, b]) *
+                                               A2[b, t2, c'] * rhoR[c', c] *
+                                               H[s1, s2, t1, t2]
+
+                @tensor HrA12array[a, s1, s2, c] := ad(rhoL)[a, a'] *
+                                                    conj(ad(A1)[a', t1, b]) *
+                                                    ad(A2)[b, t2, c'] *
+                                                    ad(rhoR)[c', c] *
+                                                    ad(H)[s1, s2, t1, t2]
+
+                @test HrA12array ≈ ad(HrA12)
+            end
+        end
+        @timedtestset "Multiplication and inverse: test compatibility" begin
+            W1 = V1 ⊗ V2 ⊗ V3
+            W2 = V4 ⊗ V5
+            for T in (Float64, ComplexF64)
+                t1 = AMDGPU.rand(T, W1, W1)
+                t2 = AMDGPU.rand(T, W2, W2)
+                t = AMDGPU.rand(T, W1, W2)
+                @test t1 * (t1 \ t) ≈ t
+                @test (t / t2) * t2 ≈ t
+                @test t1 \ one(t1) ≈ inv(t1)
+                @test one(t1) / t1 ≈ pinv(t1)
+                @test_throws SpaceMismatch inv(t)
+                @test_throws SpaceMismatch t2 \ t
+                @test_throws SpaceMismatch t / t1
+                tp = pinv(t) * t
+                @test tp ≈ tp * tp
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Multiplication and inverse: test via conversion" begin
+                W1 = V1 ⊗ V2 ⊗ V3
+                W2 = V4 ⊗ V5
+                for T in (Float32, Float64, ComplexF32, ComplexF64)
+                    t1 = AMDGPU.rand(T, W1, W1)
+                    t2 = AMDGPU.rand(T, W2, W2)
+                    t = AMDGPU.rand(T, W1, W2)
+                    d1 = dim(W1)
+                    d2 = dim(W2)
+                    At1 = reshape(convert(Array, t1), d1, d1)
+                    At2 = reshape(convert(Array, t2), d2, d2)
+                    At = reshape(convert(Array, t), d1, d2)
+                    @test ad(t1 * t) ≈ ad(t1) * ad(t)
+                    @test ad(t1' * t) ≈ ad(t1)' * ad(t)
+                    @test ad(t2 * t') ≈ ad(t2) * ad(t)'
+                    @test ad(t2' * t') ≈ ad(t2)' * ad(t)'
+                    @test ad(inv(t1)) ≈ inv(ad(t1))
+                    @test ad(pinv(t)) ≈ pinv(ad(t))
+
+                    if T == Float32 || T == ComplexF32
+                        continue
+                    end
+
+                    @test ad(t1 \ t) ≈ ad(t1) \ ad(t)
+                    @test ad(t1' \ t) ≈ ad(t1)' \ ad(t)
+                    @test ad(t2 \ t') ≈ ad(t2) \ ad(t)'
+                    @test ad(t2' \ t') ≈ ad(t2)' \ ad(t)'
+
+                    @test ad(t2 / t) ≈ ad(t2) / ad(t)
+                    @test ad(t2' / t) ≈ ad(t2)' / ad(t)
+                    @test ad(t1 / t') ≈ ad(t1) / ad(t)'
+                    @test ad(t1' / t') ≈ ad(t1)' / ad(t)'
+                end
+            end
+        end
+        @timedtestset "Factorization" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                # Test both a normal tensor and an adjoint one.
+                ts = (AMDGPU.rand(T, W), AMDGPU.rand(T, W)')
+                for t in ts
+                    @testset "leftorth with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.QRpos(),
+                                                       TensorKit.QL(), TensorKit.QLpos(),
+                                                       TensorKit.Polar(), # TensorKit.SVD(),
+                                                       TensorKit.SDD())
+                        Q, R = @constinferred leftorth(t, ((3, 4, 2), (1, 5)); alg=alg)
+                        QdQ = Q' * Q
+                        @test QdQ ≈ one(QdQ)
+                        @test Q * R ≈ permute(t, ((3, 4, 2), (1, 5)))
+                        if alg isa Polar
+                            # @test isposdef(R) # not defined for AMDGPU
+                            @test domain(R) == codomain(R) == space(t, 1)' ⊗ space(t, 5)'
+                        end
+                    end
+                    @testset "leftnull with $alg" for alg in
+                                                      (TensorKit.QR(), # TensorKit.SVD(),
+                                                       TensorKit.SDD())
+                        N = @constinferred leftnull(t, ((3, 4, 2), (1, 5)); alg=alg)
+                        NdN = N' * N
+                        @test NdN ≈ one(NdN)
+                        @test norm(N' * permute(t, ((3, 4, 2), (1, 5)))) <
+                              100 * eps(norm(t))
+                    end
+                    @testset "rightorth with $alg" for alg in
+                                                       (TensorKit.RQ(), TensorKit.RQpos(),
+                                                        TensorKit.LQ(), TensorKit.LQpos(),
+                                                        TensorKit.Polar(), #TensorKit.SVD(),
+                                                        TensorKit.SDD())
+                        # cusolver SVD requires m >= n for some reason
+                        L, Q = @constinferred rightorth(t, ((3, 4), (2, 1, 5)); alg=alg)
+                        QQd = Q * Q'
+                        @test QQd ≈ one(QQd)
+                        @test L * Q ≈ permute(t, ((3, 4), (2, 1, 5)))
+                        if alg isa Polar
+                            # @test isposdef(L) # not defined for AMDGPU
+                            @test domain(L) == codomain(L) == space(t, 3) ⊗ space(t, 4)
+                        end
+                    end
+                    @testset "rightnull with $alg" for alg in
+                                                       (TensorKit.LQ(), # TensorKit.SVD(),
+                                                        TensorKit.SDD())
+                        M = @constinferred rightnull(t, ((3, 4), (2, 1, 5)); alg=alg)
+                        MMd = M * M'
+                        @test MMd ≈ one(MMd)
+                        @test norm(permute(t, ((3, 4), (2, 1, 5))) * M') <
+                              100 * eps(norm(t))
+                    end
+                    @testset "tsvd with $alg" for alg in (TensorKit.SDD(),)
+                        U, S, V = @constinferred tsvd(t, ((3, 4, 2), (1, 5)); alg=alg)
+                        UdU = U' * U
+                        @test UdU ≈ one(UdU)
+                        VVd = V * V'
+                        @test VVd ≈ one(VVd)
+                        t2 = permute(t, ((3, 4, 2), (1, 5)))
+                        @test U * S * V ≈ t2
+
+                        s = LinearAlgebra.svdvals(t2)
+                        s′ = LinearAlgebra.diag(S)
+                        for (c, b) in s
+                            @test b ≈ s′[c]
+                        end
+                    end
+                end
+                @testset "empty tensor" begin
+                    t = AMDGPU.randn(T, V1 ⊗ V2, typeof(V1)())
+                    @testset "leftorth with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.QRpos(),
+                                                       TensorKit.QL(), TensorKit.QLpos(),
+                                                       TensorKit.Polar(), TensorKit.SVD(),
+                                                       TensorKit.SDD())
+                        Q, R = @constinferred leftorth(t; alg=alg)
+                        @test Q == t
+                        @test dim(Q) == dim(R) == 0
+                    end
+                    @testset "leftnull with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.SVD(),
+                                                       TensorKit.SDD())
+                        N = @constinferred leftnull(t; alg=alg)
+                        @test N' * N ≈ id(storagetype(t), domain(N))
+                        @test N * N' ≈ id(storagetype(t), codomain(N))
+                    end
+                    @testset "rightorth with $alg" for alg in
+                                                       (TensorKit.RQ(), TensorKit.RQpos(),
+                                                        TensorKit.LQ(), TensorKit.LQpos(),
+                                                        TensorKit.Polar(), TensorKit.SVD(),
+                                                        TensorKit.SDD())
+                        L, Q = @constinferred rightorth(copy(t'); alg=alg)
+                        @test Q == t'
+                        @test dim(Q) == dim(L) == 0
+                    end
+                    @testset "rightnull with $alg" for alg in
+                                                       (TensorKit.LQ(), TensorKit.SVD(),
+                                                        TensorKit.SDD())
+                        M = @constinferred rightnull(copy(t'); alg=alg)
+                        @test M * M' ≈ id(storagetype(t), codomain(M))
+                        @test M' * M ≈ id(storagetype(t), domain(M))
+                    end
+                    @testset "tsvd with $alg" for alg in (TensorKit.SVD(), TensorKit.SDD())
+                        U, S, V = @constinferred tsvd(t; alg=alg)
+                        @test U == t
+                        @test dim(U) == dim(S) == dim(V)
+                    end
+                end
+
+                # AMDGPU only supports symmetric/hermitian eigen
+                @testset "eig and isposdef" begin
+                    t = AMDGPU.rand(T, V1 ⊗ V2 ← V1 ⊗ V2)
+                    t += t'
+                    D, V = eigen(t)
+                    @test t * V ≈ V * D
+
+                    # d = LinearAlgebra.eigvals(t; sortby=nothing)
+                    # d′ = LinearAlgebra.diag(D)
+                    # for (c, b) in d
+                    #     @test b ≈ d′[c]
+                    # end
+
+                    # Somehow moving these test before the previous one gives rise to errors
+                    # with T=Float32 on x86 platforms. Is this an OpenBLAS issue? 
+                    # VdV = V' * V
+                    # VdV = (VdV + VdV') / 2
+                    # @test isposdef(VdV)
+                    #
+                    # @test !isposdef(t2) # unlikely for non-hermitian map
+                    # t2 = (t2 + t2')
+                    # D, V = eigen(t2)
+                    # VdV = V' * V
+                    # @test VdV ≈ one(VdV)
+                    # D̃, Ṽ = @constinferred eigh(t2)
+                    # @test D ≈ D̃
+                    # @test V ≈ Ṽ
+                    # λ = minimum(minimum(real(LinearAlgebra.diag(b)))
+                    #             for (c, b) in blocks(D))
+                    # @test isposdef(t2) == isposdef(λ)
+                    # @test isposdef(t2 - λ * one(t2) + 0.1 * one(t2))
+                    # @test !isposdef(t2 - λ * one(t2) - 0.1 * one(t2))
+                end
+            end
+        end
+        @timedtestset "Tensor truncation" begin
+            for T in (Float32, ComplexF64)
+                for p in (1, 2, 3, Inf)
+                    # Test both a normal tensor and an adjoint one.
+                    ts = (AMDGPU.randn(T, V1 ⊗ V2 ⊗ V3, V4 ⊗ V5),
+                          AMDGPU.randn(T, V4 ⊗ V5, V1 ⊗ V2 ⊗ V3)')
+                    for t in ts
+                        U₀, S₀, V₀, = tsvd(t)
+                        t = rmul!(t, 1 / norm(S₀, p))
+                        # Probably shouldn't allow truncerr and truncdim, as these require scalar indexing?
+                        U, S, V, ϵ = tsvd(t; trunc=truncbelow(1 / dim(domain(S₀))), p=p)
+                        U′, S′, V′, ϵ′ = tsvd(t; trunc=truncspace(space(S, 1)), p=p)
+                        @test (U, S, V, ϵ) == (U′, S′, V′, ϵ′)
+                    end
+                end
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor functions" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64)
+                    t = AMDGPU.randn(T, W, W)
+                    s = dim(W)
+                    expt = @constinferred exp(t)
+                    @test ad(expt) ≈ exp(ad(t))
+
+                    @test (@constinferred sqrt(t))^2 ≈ t
+                    @test ad(sqrt(t)) ≈ sqrt(ad(t))
+
+                    @test exp(@constinferred log(expt)) ≈ expt
+                    @test ad(log(expt)) ≈ log(ad(expt))
+
+                    @test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
+                          id(storagetype(t), W)
+                    @test (@constinferred tan(t)) ≈ sin(t) / cos(t)
+                    @test (@constinferred cot(t)) ≈ cos(t) / sin(t)
+                    @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈
+                          id(storagetype(t), W)
+                    @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t)
+                    @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)
+
+                    t1 = sin(t)
+                    @test sin(@constinferred asin(t1)) ≈ t1
+                    t2 = cos(t)
+                    @test cos(@constinferred acos(t2)) ≈ t2
+                    t3 = sinh(t)
+                    @test sinh(@constinferred asinh(t3)) ≈ t3
+                    t4 = cosh(t)
+                    @test cosh(@constinferred acosh(t4)) ≈ t4
+                    t5 = tan(t)
+                    @test tan(@constinferred atan(t5)) ≈ t5
+                    t6 = cot(t)
+                    @test cot(@constinferred acot(t6)) ≈ t6
+                    t7 = tanh(t)
+                    @test tanh(@constinferred atanh(t7)) ≈ t7
+                    t8 = coth(t)
+                    @test coth(@constinferred acoth(t8)) ≈ t8
+                end
+            end
+        end
+        # Sylvester not defined for AMDGPU
+        # @timedtestset "Sylvester equation" begin
+        #     for T in (Float32, ComplexF64)
+        #         tA = AMDGPU.rand(T, V1 ⊗ V3, V1 ⊗ V3)
+        #         tB = AMDGPU.rand(T, V2 ⊗ V4, V2 ⊗ V4)
+        #         tA = 3 // 2 * leftorth(tA; alg=Polar())[1]
+        #         tB = 1 // 5 * leftorth(tB; alg=Polar())[1]
+        #         tC = AMDGPU.rand(T, V1 ⊗ V3, V2 ⊗ V4)
+        #         t = @constinferred sylvester(tA, tB, tC)
+        #         @test codomain(t) == V1 ⊗ V3
+        #         @test domain(t) == V2 ⊗ V4
+        #         @test norm(tA * t + t * tB + tC) <
+        #               (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3)
+        #         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+        #             matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x)))
+        #             @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC))
+        #         end
+        #     end
+        # end
+        @timedtestset "Tensor product: test via norm preservation" begin
+            for T in (Float32, ComplexF64)
+                t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2)
+                t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1)
+                t = @constinferred (t1 ⊗ t2)
+                @test norm(t) ≈ norm(t1) * norm(t2)
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor product: test via conversion" begin
+                for T in (Float32, ComplexF64)
+                    t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1, V1)
+                    t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3, V2)
+                    t = @constinferred (t1 ⊗ t2)
+                    d1 = dim(codomain(t1))
+                    d2 = dim(codomain(t2))
+                    d3 = dim(domain(t1))
+                    d4 = dim(domain(t2))
+                    At = ad(t)
+                    @test ad(t) ≈ ad(t1) ⊗ ad(t2)
+                end
+            end
+        end
+        @timedtestset "Tensor product: test via tensor contraction" begin
+            for T in (Float32, ComplexF64)
+                t1 = AMDGPU.rand(T, V2 ⊗ V3 ⊗ V1)
+                t2 = AMDGPU.rand(T, V2 ⊗ V1 ⊗ V3)
+                t = @constinferred (t1 ⊗ t2)
+                @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6]
+                @test t ≈ t′
+            end
+        end
+    end
+end
+
+@timedtestset "Deligne tensor product: test via conversion" begin
+    Vlists1 = (Vtr,) # VSU₂)
+    Vlists2 = (Vtr,) # Vℤ₂)
+    @testset for Vlist1 in Vlists1, Vlist2 in Vlists2
+        V1, V2, V3, V4, V5 = Vlist1
+        W1, W2, W3, W4, W5 = Vlist2
+        for T in (Float32, ComplexF64)
+            t1 = AMDGPU.rand(T, V1 ⊗ V2, V3' ⊗ V4)
+            t2 = AMDGPU.rand(T, W2, W1 ⊗ W1')
+            t = @constinferred (t1 ⊠ t2)
+            d1 = dim(codomain(t1))
+            d2 = dim(codomain(t2))
+            d3 = dim(domain(t1))
+            d4 = dim(domain(t2))
+            @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2)
+        end
+    end
+end
+
diff --git a/test/cuda.jl b/test/cuda.jl
new file mode 100644
index 000000000..f40402a1c
--- /dev/null
+++ b/test/cuda.jl
@@ -0,0 +1,591 @@
+using Adapt
+ad = adapt(Array)
+
+const CUDAExt = Base.get_extension(TensorKit, :TensorKitCUDAExt)
+@assert !isnothing(CUDAExt)
+# const CuTensorMap{T,S,N1,N2,I,A} = CUDAExt.CuTensorMap{T,S,N1,N2,I,A}
+const CuTensorMap = getglobal(CUDAExt, :CuTensorMap)
+
+for V in (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃)
+    V1, V2, V3, V4, V5 = V
+    @assert V3 * V4 * V2 ≿ V1' * V5' # necessary for leftorth tests
+    @assert V3 * V4 ≾ V1' * V2' * V5' # necessary for rightorth tests
+end
+
+spacelist = try
+    if ENV["CI"] == "true"
+        println("Detected running on CI")
+        if Sys.iswindows()
+            (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂)
+        elseif Sys.isapple()
+            (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VfU₁, VfSU₂)#, VSU₃)
+        else
+            (Vtr, Vℤ₂, Vfℤ₂, VU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃)
+        end
+    else
+        (Vtr, VU₁, VSU₂, Vfℤ₂)
+    end
+catch
+    (Vtr, Vℤ₂, Vfℤ₂, Vℤ₃, VU₁, VfU₁, VCU₁, VSU₂, VfSU₂)#, VSU₃)
+end
+
+for V in spacelist
+    I = sectortype(first(V))
+    Istr = TensorKit.type_repr(I)
+    println("---------------------------------------")
+    println("CUDA Tensors with symmetry: $Istr")
+    println("---------------------------------------")
+    @timedtestset "Tensors with symmetry: $Istr" verbose = true begin
+        V1, V2, V3, V4, V5 = V
+        @timedtestset "Basic tensor properties" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            for T in (Int, Float32, Float64, ComplexF32, ComplexF64)
+                t = @constinferred CUDA.zeros(T, W)
+                # @test @constinferred(hash(t)) == hash(deepcopy(t)) # hash is not defined for CuArray?
+                @test scalartype(t) == T
+                @test norm(t) == 0
+                @test codomain(t) == W
+                @test space(t) == (W ← one(W))
+                @test domain(t) == one(W)
+                @test typeof(t) ==
+                      @constinferred tensormaptype(spacetype(t), 5, 0, storagetype(t))
+            end
+        end
+        @timedtestset "Tensor Dict conversion" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Int, Float32, ComplexF64)
+                t = @constinferred CUDA.rand(T, W)
+                d = convert(Dict, t)
+                @test t == convert(CuTensorMap, d)
+            end
+        end
+        if hasfusiontensor(I)
+            @timedtestset "Tensor Array conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+
+                # cuTENSOR does not support Int
+                Ts = sectortype(W) === Trivial ? (Int, Float32, ComplexF64) :
+                     (Float32, ComplexF64)
+                for T in Ts
+                    t = @constinferred CUDA.rand(T, W)
+                    a = @constinferred convert(CuArray, t)
+                    @test t ≈ @constinferred TensorMap(a, W)
+                    # also test if input is matrix
+                    a2 = reshape(a, prod(dim, codomain(t)), prod(dim, domain(t)))
+                    @test t ≈ @constinferred TensorMap(a2, codomain(t), domain(t))
+                end
+            end
+        end
+        @timedtestset "Basic linear algebra" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                t = @constinferred CUDA.rand(T, W)
+                @test scalartype(t) == T
+                @test space(t) == W
+                @test space(t') == W'
+                @test dim(t) == dim(space(t))
+                @test codomain(t) == codomain(W)
+                @test domain(t) == domain(W)
+                @test isa(@constinferred(norm(t)), real(T))
+                @test norm(t)^2 ≈ dot(t, t)
+                α = rand(T)
+                @test norm(α * t) ≈ abs(α) * norm(t)
+                @test norm(t + t, 2) ≈ 2 * norm(t, 2)
+                @test norm(t + t, 1) ≈ 2 * norm(t, 1)
+                @test norm(t + t, Inf) ≈ 2 * norm(t, Inf)
+                p = 3 * rand(Float64)
+                @test norm(t + t, p) ≈ 2 * norm(t, p)
+                @test norm(t) ≈ norm(t')
+
+                t2 = @constinferred CUDA.rand!(similar(t))
+                β = rand(T)
+                @test @constinferred(dot(β * t2, α * t)) ≈ conj(β) * α * conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t, t2))
+                @test dot(t2, t) ≈ conj(dot(t2', t'))
+                @test dot(t2, t) ≈ dot(t', t2')
+
+                A = storagetype(t)
+
+                i1 = @constinferred(isomorphism(CuMatrix{T}, V1 ⊗ V2, V2 ⊗ V1))
+                i2 = @constinferred(isomorphism(CuMatrix{T}, V2 ⊗ V1, V1 ⊗ V2))
+                @test i1 * i2 == @constinferred(id(CuMatrix{T}, V1 ⊗ V2))
+                @test i2 * i1 == @constinferred(id(CuMatrix{T}, V2 ⊗ V1))
+
+                w = @constinferred(isometry(CuMatrix{T}, V1 ⊗ (oneunit(V1) ⊕ oneunit(V1)),
+                                            V1))
+                @test dim(w) == 2 * dim(V1 ← V1)
+                @test w' * w == id(CuMatrix{T}, V1)
+                @test w * w' == (w * w')^2
+            end
+        end
+        if hasfusiontensor(I)
+            @timedtestset "Basic linear algebra: test via conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+                for T in (Float32, ComplexF64)
+                    t = CUDA.rand(T, W)
+                    t2 = @constinferred CUDA.rand!(similar(t))
+                    @test norm(t, 2) ≈ norm(ad(t), 2)
+                    @test dot(t2, t) ≈ dot(ad(t2), ad(t))
+                    α = rand(T)
+                    @test ad(α * t) ≈ α * ad(t)
+                    @test ad(t + t) ≈ 2 * ad(t)
+                end
+            end
+            @timedtestset "Real and imaginary parts" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64, ComplexF32)
+                    t = @constinferred CUDA.randn(T, W, W)
+                    @test real(ad(t)) == ad(@constinferred real(t))
+                    @test imag(ad(t)) == ad(@constinferred imag(t))
+                end
+            end
+        end
+        @timedtestset "Tensor conversion" begin
+            W = V1 ⊗ V2
+            t = @constinferred CUDA.randn(W ← W)
+            @test typeof(convert(CuTensorMap, t')) == typeof(t)
+            tc = complex(t)
+            @test convert(typeof(tc), t) == tc
+            @test typeof(convert(typeof(tc), t)) == typeof(tc)
+            @test typeof(convert(typeof(tc), t')) == typeof(tc)
+            @test Base.promote_typeof(t, tc) == typeof(tc)
+            @test Base.promote_typeof(tc, t) == typeof(tc + t)
+        end
+        #=@timedtestset "diag/diagm" begin
+            W = V1 ⊗ V2 ⊗ V3 ← V4 ⊗ V5
+            t = CUDA.randn(ComplexF64, W)
+            d = LinearAlgebra.diag(t)
+            # TODO find a way to use CUDA here
+            D = LinearAlgebra.diagm(codomain(t), domain(t), d)
+            @test LinearAlgebra.isdiag(D)
+            @test LinearAlgebra.diag(D) == d
+        end=#
+        @timedtestset "Permutations: test via inner product invariance" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            t = CUDA.rand(ComplexF64, W)
+            t′ = CUDA.randn!(similar(t))
+            for k in 0:5
+                for p in permutations(1:5)
+                    p1 = ntuple(n -> p[n], k)
+                    p2 = ntuple(n -> p[k + n], 5 - k)
+                    t2 = @constinferred permute(t, (p1, p2))
+                    t2 = permute(t, (p1, p2))
+                    @test norm(t2) ≈ norm(t)
+                    t2′ = permute(t′, (p1, p2))
+                    @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2))
+                end
+
+                t3 = @constinferred repartition(t, $k)
+                t3 = repartition(t, k)
+                @test norm(t3) ≈ norm(t)
+                t3′ = @constinferred repartition!(similar(t3), t′)
+                @test norm(t3′) ≈ norm(t′)
+                @test dot(t′, t) ≈ dot(t3′, t3)
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Permutations: test via conversion" begin
+                W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+                t = CUDA.rand(ComplexF64, W)
+                a = ad(t)
+                for k in 0:5
+                    for p in permutations(1:5)
+                        p1 = ntuple(n -> p[n], k)
+                        p2 = ntuple(n -> p[k + n], 5 - k)
+                        t2 = permute(t, (p1, p2))
+                        a2 = ad(t2)
+                        @test a2 ≈ permute(a, (p1, p2))
+                        @test ad(transpose(t2)) ≈ transpose(a2)
+                    end
+
+                    t3 = repartition(t, k)
+                    a3 = ad(t3)
+                    @test a3 ≈ repartition(a, k)
+                end
+            end
+        end
+        @timedtestset "Full trace: test self-consistency" begin
+            t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
+            t2 = permute(t, ((1, 2), (4, 3)))
+            s = @constinferred tr(t2)
+            @test conj(s) ≈ tr(t2')
+            if !isdual(V1)
+                t2 = twist!(t2, 1)
+            end
+            if isdual(V2)
+                t2 = twist!(t2, 2)
+            end
+            ss = tr(t2)
+            @tensor s2 = t[a, b, b, a]
+            @tensor t3[a, b] := t[a, c, c, b]
+            @tensor s3 = t3[a, a]
+            @test ss ≈ s2
+            @test ss ≈ s3
+        end
+        @timedtestset "Partial trace: test self-consistency" begin
+            t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+            @tensor t2[a, b] := t[c, d, b, d, c, a]
+            @tensor t4[a, b, c, d] := t[d, e, b, e, c, a]
+            @tensor t5[a, b] := t4[a, b, c, c]
+            @test t2 ≈ t5
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Trace: test via conversion" begin
+                t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
+                @tensor t2[a, b] := t[c, d, b, d, c, a]
+                @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
+                @test t3 ≈ ad(t2)
+            end
+        end
+        @timedtestset "Trace and contraction" begin
+            t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
+            t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
+            t3 = t1 ⊗ t2
+            @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
+            @tensor tb[a, b] := t3[x, y, a, y, b, x]
+            @test ta ≈ tb
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor contraction: test via conversion" begin
+                A1 = CUDA.randn(ComplexF64, V1' * V2', V3')
+                A2 = CUDA.randn(ComplexF64, V3 * V4, V5)
+                rhoL = CUDA.randn(ComplexF64, V1, V1)
+                rhoR = CUDA.randn(ComplexF64, V5, V5)' # test adjoint tensor
+                H = CUDA.randn(ComplexF64, V2 * V4, V2 * V4)
+                @tensor HrA12[a, s1, s2, c] := rhoL[a, a'] * conj(A1[a', t1, b]) *
+                                               A2[b, t2, c'] * rhoR[c', c] *
+                                               H[s1, s2, t1, t2]
+
+                @tensor HrA12array[a, s1, s2, c] := ad(rhoL)[a, a'] *
+                                                    conj(ad(A1)[a', t1, b]) *
+                                                    ad(A2)[b, t2, c'] *
+                                                    ad(rhoR)[c', c] *
+                                                    ad(H)[s1, s2, t1, t2]
+
+                @test HrA12array ≈ ad(HrA12)
+            end
+        end
+        @timedtestset "Multiplication and inverse: test compatibility" begin
+            W1 = V1 ⊗ V2 ⊗ V3
+            W2 = V4 ⊗ V5
+            for T in (Float64, ComplexF64)
+                t1 = CUDA.rand(T, W1, W1)
+                t2 = CUDA.rand(T, W2, W2)
+                t  = CUDA.rand(T, W1, W2)
+                @test t1 * (t1 \ t) ≈ t
+                @test (t / t2) * t2 ≈ t
+                @test t1 \ one(t1) ≈ inv(t1)
+                # @test one(t1) / t1 ≈ pinv(t1) # pinv not available in CUDA
+                @test_throws SpaceMismatch inv(t)
+                @test_throws SpaceMismatch t2 \ t
+                @test_throws SpaceMismatch t / t1
+                # pinv not available in CUDA
+                # tp = pinv(t) * t 
+                # @test tp ≈ tp * tp
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Multiplication and inverse: test via conversion" begin
+                W1 = V1 ⊗ V2 ⊗ V3
+                W2 = V4 ⊗ V5
+                for T in (Float32, Float64, ComplexF32, ComplexF64)
+                    t1 = CUDA.rand(T, W1, W1)
+                    t2 = CUDA.rand(T, W2, W2)
+                    t  = CUDA.rand(T, W1, W2)
+                    d1 = dim(W1)
+                    d2 = dim(W2)
+                    At1 = reshape(convert(Array, t1), d1, d1)
+                    At2 = reshape(convert(Array, t2), d2, d2)
+                    At  = reshape(convert(Array, t), d1, d2)
+                    @test ad(t1 * t) ≈ ad(t1) * ad(t)
+                    @test ad(t1' * t) ≈ ad(t1)' * ad(t)
+                    @test ad(t2 * t') ≈ ad(t2) * ad(t)'
+                    @test ad(t2' * t') ≈ ad(t2)' * ad(t)'
+                    @test ad(inv(t1)) ≈ inv(ad(t1))
+                    # pinv not available in CUDA
+                    #@test ad(pinv(t)) ≈ pinv(ad(t))
+
+                    if T == Float32 || T == ComplexF32
+                        continue
+                    end
+
+                    @test ad(t1 \ t) ≈ ad(t1) \ ad(t)
+                    @test ad(t1' \ t) ≈ ad(t1)' \ ad(t)
+                    @test ad(t2 \ t') ≈ ad(t2) \ ad(t)'
+                    @test ad(t2' \ t') ≈ ad(t2)' \ ad(t)'
+
+                    @test ad(t2 / t) ≈ ad(t2) / ad(t)
+                    @test ad(t2' / t) ≈ ad(t2)' / ad(t)
+                    @test ad(t1 / t') ≈ ad(t1) / ad(t)'
+                    @test ad(t1' / t') ≈ ad(t1)' / ad(t)'
+                end
+            end
+        end
+        @timedtestset "Factorization" begin
+            W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
+            for T in (Float32, ComplexF64)
+                # Test both a normal tensor and an adjoint one.
+                ts = (CUDA.rand(T, W), CUDA.rand(T, W)')
+                for t in ts
+                    @testset "leftorth with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.QRpos(),
+                                                       TensorKit.QL(), TensorKit.QLpos(),
+                                                       TensorKit.Polar(), TensorKit.SVD(),
+                                                       )
+                        Q, R = @constinferred leftorth(t, ((3, 4, 2), (1, 5)); alg=alg)
+                        @test isisometry(Q)
+                        @test Q * R ≈ permute(t, ((3, 4, 2), (1, 5)))
+                    end
+                    @testset "leftnull with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.SVD(),
+                                                       )
+                        N = @constinferred leftnull(t, ((3, 4, 2), (1, 5)); alg=alg)
+                        @test isisometry(N)
+                        @test norm(N' * permute(t, ((3, 4, 2), (1, 5)))) <
+                              100 * eps(norm(t))
+                    end
+                    @testset "rightorth with $alg" for alg in
+                                                       (TensorKit.RQ(), TensorKit.RQpos(),
+                                                        TensorKit.LQ(), TensorKit.LQpos(),
+                                                        TensorKit.Polar(), TensorKit.SVD(),
+                                                        )
+                        # cusolver SVD requires m >= n for some reason
+                        L, Q = @constinferred rightorth(t, ((3, 4), (2, 1, 5)); alg=alg)
+                        @test isisometry(Q; side=:right)
+                        @test L * Q ≈ permute(t, ((3, 4), (2, 1, 5)))
+                    end
+                    @testset "rightnull with $alg" for alg in
+                                                       (TensorKit.LQ(), TensorKit.SVD(),
+                                                        )
+                        M = @constinferred rightnull(t, ((3, 4), (2, 1, 5)); alg=alg)
+                        @test isisometry(M; side=:right)
+                        @test norm(permute(t, ((3, 4), (2, 1, 5))) * M') <
+                              100 * eps(norm(t))
+                    end
+                    @testset "tsvd with $alg" for alg in (TensorKit.SVD(),)
+                        U, S, V = @constinferred tsvd(t, ((3, 4, 2), (1, 5)); alg=alg)
+                        @test isisometry(U)
+                        @test isisometry(V; side=:right)
+                        t2 = permute(t, ((3, 4, 2), (1, 5)))
+                        US = U * S
+                        USV = US * V
+                        @test U * S * V ≈ t2
+
+                        s = LinearAlgebra.svdvals(t2)
+                        s′ = LinearAlgebra.diag(S)
+                        for (c, b) in s
+                            @test b ≈ s′[c]
+                        end
+                    end
+                end
+                @testset "empty tensor" begin
+                    t = CUDA.randn(T, V1 ⊗ V2, typeof(V1)())
+                    @testset "leftorth with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.QRpos(),
+                                                       TensorKit.QL(), TensorKit.QLpos(),
+                                                       TensorKit.Polar(), TensorKit.SVD(),
+                                                       )
+                        Q, R = @constinferred leftorth(t; alg=alg)
+                        @test Q == t
+                        @test dim(Q) == dim(R) == 0
+                    end
+                    @testset "leftnull with $alg" for alg in
+                                                      (TensorKit.QR(), TensorKit.SVD(),
+                                                       )
+                        N = @constinferred leftnull(t; alg=alg)
+                        @test N' * N ≈ id(storagetype(t), domain(N))
+                        @test N * N' ≈ id(storagetype(t), codomain(N))
+                    end
+                    @testset "rightorth with $alg" for alg in
+                                                       (TensorKit.RQ(), TensorKit.RQpos(),
+                                                        TensorKit.LQ(), TensorKit.LQpos(),
+                                                        TensorKit.Polar(), TensorKit.SVD(),
+                                                        )
+                        L, Q = @constinferred rightorth(copy(t'); alg=alg)
+                        @test Q == t'
+                        @test dim(Q) == dim(L) == 0
+                    end
+                    @testset "rightnull with $alg" for alg in
+                                                       (TensorKit.LQ(), TensorKit.SVD(),
+                                                        )
+                        M = @constinferred rightnull(copy(t'); alg=alg)
+                        @test M * M' ≈ id(storagetype(t), codomain(M))
+                        @test M' * M ≈ id(storagetype(t), domain(M))
+                    end
+                    @testset "tsvd with $alg" for alg in (TensorKit.SVD(), )
+                        U, S, V = @constinferred tsvd(t; alg=alg)
+                        @test U == t
+                        @test dim(U) == dim(S) == dim(V)
+                    end
+                end
+
+                # CUDA only supports symmetric/hermitian eigen
+                @testset "eig and isposdef" begin
+                    t = CUDA.rand(T, V1 ⊗ V2 ← V1 ⊗ V2)
+                    t += t'
+                    D, V = eigen(t)
+                    @test t * V ≈ V * D
+
+                    d = LinearAlgebra.eigvals(t; sortby=nothing)
+                    d′ = LinearAlgebra.diag(D)
+                    for (c, b) in d
+                        @test sort(real.(b)) ≈ sort(real.(d′[c]))
+                    end
+
+                    # Somehow moving these test before the previous one gives rise to errors
+                    # with T=Float32 on x86 platforms. Is this an OpenBLAS issue? 
+                    # VdV = V' * V
+                    # VdV = (VdV + VdV') / 2
+                    # @test isposdef(VdV)
+                    #
+                    # @test !isposdef(t2) # unlikely for non-hermitian map
+                    # t2 = (t2 + t2')
+                    # D, V = eigen(t2)
+                    # VdV = V' * V
+                    # @test VdV ≈ one(VdV)
+                    # D̃, Ṽ = @constinferred eigh(t2)
+                    # @test D ≈ D̃
+                    # @test V ≈ Ṽ
+                    # λ = minimum(minimum(real(LinearAlgebra.diag(b)))
+                    #             for (c, b) in blocks(D))
+                    # @test isposdef(t2) == isposdef(λ)
+                    # @test isposdef(t2 - λ * one(t2) + 0.1 * one(t2))
+                    # @test !isposdef(t2 - λ * one(t2) - 0.1 * one(t2))
+                end
+            end
+        end
+        @timedtestset "Tensor truncation" begin
+            for T in (Float32, ComplexF64)
+                for p in (1, 2, 3, Inf)
+                    # Test both a normal tensor and an adjoint one.
+                    ts = (CUDA.randn(T, V1 ⊗ V2 ⊗ V3, V4 ⊗ V5),
+                          CUDA.randn(T, V4 ⊗ V5, V1 ⊗ V2 ⊗ V3)')
+                    for t in ts
+                        U₀, S₀, V₀, = tsvd(t)
+                        t = rmul!(t, 1 / norm(S₀, p))
+                        # Probably shouldn't allow truncerr and truncdim, as these require scalar indexing?
+                        U, S, V = tsvd(t; trunc=truncbelow(1 / dim(domain(S₀))), p=p)
+                        U′, S′, V′ = tsvd(t; trunc=truncspace(space(S, 1)), p=p)
+                        @test (U, S, V) == (U′, S′, V′)
+                    end
+                end
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor functions" begin
+                W = V1 ⊗ V2
+                for T in (Float64, ComplexF64)
+                    t = CUDA.randn(T, W, W)
+                    s = dim(W)
+                    expt = @constinferred exp(t)
+                    @test ad(expt) ≈ exp(ad(t))
+
+                    @test (@constinferred sqrt(t))^2 ≈ t
+                    @test ad(sqrt(t)) ≈ sqrt(ad(t))
+
+                    @test exp(@constinferred log(expt)) ≈ expt
+                    @test ad(log(expt)) ≈ log(ad(expt))
+
+                    @test (@constinferred cos(t))^2 + (@constinferred sin(t))^2 ≈
+                          id(storagetype(t), W)
+                    @test (@constinferred tan(t)) ≈ sin(t) / cos(t)
+                    @test (@constinferred cot(t)) ≈ cos(t) / sin(t)
+                    @test (@constinferred cosh(t))^2 - (@constinferred sinh(t))^2 ≈
+                          id(storagetype(t), W)
+                    @test (@constinferred tanh(t)) ≈ sinh(t) / cosh(t)
+                    @test (@constinferred coth(t)) ≈ cosh(t) / sinh(t)
+
+                    t1 = sin(t)
+                    @test sin(@constinferred asin(t1)) ≈ t1
+                    t2 = cos(t)
+                    @test cos(@constinferred acos(t2)) ≈ t2
+                    t3 = sinh(t)
+                    @test sinh(@constinferred asinh(t3)) ≈ t3
+                    t4 = cosh(t)
+                    @test cosh(@constinferred acosh(t4)) ≈ t4
+                    t5 = tan(t)
+                    @test tan(@constinferred atan(t5)) ≈ t5
+                    t6 = cot(t)
+                    @test cot(@constinferred acot(t6)) ≈ t6
+                    t7 = tanh(t)
+                    @test tanh(@constinferred atanh(t7)) ≈ t7
+                    t8 = coth(t)
+                    @test coth(@constinferred acoth(t8)) ≈ t8
+                end
+            end
+        end
+        # Sylvester not defined for CUDA
+        # @timedtestset "Sylvester equation" begin
+        #     for T in (Float32, ComplexF64)
+        #         tA = CUDA.rand(T, V1 ⊗ V3, V1 ⊗ V3)
+        #         tB = CUDA.rand(T, V2 ⊗ V4, V2 ⊗ V4)
+        #         tA = 3 // 2 * leftorth(tA; alg=Polar())[1]
+        #         tB = 1 // 5 * leftorth(tB; alg=Polar())[1]
+        #         tC = CUDA.rand(T, V1 ⊗ V3, V2 ⊗ V4)
+        #         t = @constinferred sylvester(tA, tB, tC)
+        #         @test codomain(t) == V1 ⊗ V3
+        #         @test domain(t) == V2 ⊗ V4
+        #         @test norm(tA * t + t * tB + tC) <
+        #               (norm(tA) + norm(tB) + norm(tC)) * eps(real(T))^(2 / 3)
+        #         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+        #             matrix(x) = reshape(convert(Array, x), dim(codomain(x)), dim(domain(x)))
+        #             @test matrix(t) ≈ sylvester(matrix(tA), matrix(tB), matrix(tC))
+        #         end
+        #     end
+        # end
+        #
+        # TODO
+        @timedtestset "Tensor product: test via norm preservation" begin
+            for T in (Float32, ComplexF64)
+                t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2)
+                t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1)
+                t = @constinferred (t1 ⊗ t2)
+                @test norm(t) ≈ norm(t1) * norm(t2)
+            end
+        end
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+            @timedtestset "Tensor product: test via conversion" begin
+                for T in (Float32, ComplexF64)
+                    t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1)
+                    t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V2)
+                    t = @constinferred (t1 ⊗ t2)
+                    d1 = dim(codomain(t1))
+                    d2 = dim(codomain(t2))
+                    d3 = dim(domain(t1))
+                    d4 = dim(domain(t2))
+                    At = ad(t)
+                    @test ad(t) ≈ ad(t1) ⊗ ad(t2)
+                end
+            end
+        end
+        @timedtestset "Tensor product: test via tensor contraction" begin
+            for T in (Float32, ComplexF64)
+                t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1)
+                t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3)
+                t = @constinferred (t1 ⊗ t2)
+                @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6]
+                @test t ≈ t′
+            end
+        end
+    end
+end
+
+@timedtestset "Deligne tensor product: test via conversion" begin
+    Vlists1 = (Vtr,) # VSU₂)
+    Vlists2 = (Vtr,) # Vℤ₂)
+    @testset for Vlist1 in Vlists1, Vlist2 in Vlists2
+        V1, V2, V3, V4, V5 = Vlist1
+        W1, W2, W3, W4, W5 = Vlist2
+        for T in (Float32, ComplexF64)
+            t1 = CUDA.rand(T, V1 ⊗ V2, V3' ⊗ V4)
+            t2 = CUDA.rand(T, W2, W1 ⊗ W1')
+            t = @constinferred (t1 ⊠ t2)
+            d1 = dim(codomain(t1))
+            d2 = dim(codomain(t2))
+            d3 = dim(domain(t1))
+            d4 = dim(domain(t2))
+            @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2)
+        end
+    end
+end
+
diff --git a/test/diagonal.jl b/test/diagonal.jl
index e5fcaa430..0f67836dd 100644
--- a/test/diagonal.jl
+++ b/test/diagonal.jl
@@ -200,6 +200,16 @@ diagspacelist = ((ℂ^4)', ℂ[Z2Irrep](0 => 2, 1 => 3),
                 @test all(((s, t),) -> isapprox(s, t),
                           zip(values(LinearAlgebra.eigvals(D)),
                               values(LinearAlgebra.eigvals(t))))
+                D, W = @constinferred eig!(t)
+                @test D === t
+                @test W == one(t)
+                @test t * W ≈ W * D
+                D2, V2 = @constinferred eigh!(t2)
+                if T <: Real
+                    @test D2 === t2
+                end
+                @test V2 == one(t)
+                @test t2 * V2 ≈ V2 * D2
             end
             @testset "leftorth with $alg" for alg in (TensorKit.QR(), TensorKit.QL())
                 Q, R = @constinferred leftorth(t; alg=alg)
diff --git a/test/runtests.jl b/test/runtests.jl
index 9e44b8a3a..70124f857 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -112,24 +112,39 @@ VSU₂U₁ = (Vect[SU2Irrep ⊠ U1Irrep]((0, 0) => 1, (1 // 2, -1) => 1),
 #     ℂ[SU3Irrep]((1, 0, 0) => 1, (2, 0, 0) => 1),
 #     ℂ[SU3Irrep]((0, 0, 0) => 1, (1, 0, 0) => 1, (1, 1, 0) => 1)')
 
+is_buildkite = get(ENV, "BUILDKITE", "false") == "true"
+
 Ti = time()
-include("fusiontrees.jl")
-include("spaces.jl")
-include("tensors.jl")
-include("diagonal.jl")
-include("planar.jl")
-# TODO: remove once we know AD is slow on macOS CI
-if !(Sys.isapple() && get(ENV, "CI", "false") == "true") && isempty(VERSION.prerelease)
-    include("ad.jl")
+if !is_buildkite
+    include("fusiontrees.jl")
+    include("spaces.jl")
+    include("tensors.jl")
+    include("diagonal.jl")
+    include("planar.jl")
+    # TODO: remove once we know AD is slow on macOS CI
+    if !(Sys.isapple() && get(ENV, "CI", "false") == "true") && isempty(VERSION.prerelease)
+        include("ad.jl")
+    end
+    include("bugfixes.jl")
+    Tf = time()
+    @testset "Aqua" verbose = true begin
+        using Aqua
+        Aqua.test_all(TensorKit)
+    end
+else
+    using CUDA, cuTENSOR
+    if CUDA.functional()
+        include("cuda.jl")
+    end
+
+    using AMDGPU 
+    if AMDGPU.functional()
+        include("amdgpu.jl")
+    end
+    Tf = time()
 end
-include("bugfixes.jl")
-Tf = time()
+
 printstyled("Finished all tests in ",
             string(round((Tf - Ti) / 60; sigdigits=3)),
             " minutes."; bold=true, color=Base.info_color())
 println()
-
-@testset "Aqua" verbose = true begin
-    using Aqua
-    Aqua.test_all(TensorKit)
-end
diff --git a/test/tensors.jl b/test/tensors.jl
index 25bc157b9..2b73d49bd 100644
--- a/test/tensors.jl
+++ b/test/tensors.jl
@@ -369,9 +369,8 @@ for V in spacelist
             for T in (Float64, ComplexF64)
                 t1 = randisometry(T, W1, W2)
                 t2 = randisometry(T, W2 ← W2)
-                @test t1' * t1 ≈ one(t2)
-                @test t2' * t2 ≈ one(t2)
-                @test t2 * t2' ≈ one(t2)
+                @test isisometry(t1)
+                @test isunitary(t2)
                 P = t1 * t1'
                 @test P * P ≈ P
             end
@@ -451,20 +450,14 @@ for V in spacelist
                                                        TensorKit.Polar(), TensorKit.SVD(),
                                                        TensorKit.SDD())
                         Q, R = @constinferred leftorth(t, ((3, 4, 2), (1, 5)); alg=alg)
-                        QdQ = Q' * Q
-                        @test QdQ ≈ one(QdQ)
+                        @test isisometry(Q)
                         @test Q * R ≈ permute(t, ((3, 4, 2), (1, 5)))
-                        if alg isa Polar
-                            @test isposdef(R)
-                            @test domain(R) == codomain(R) == space(t, 1)' ⊗ space(t, 5)'
-                        end
                     end
                     @testset "leftnull with $alg" for alg in
                                                       (TensorKit.QR(), TensorKit.SVD(),
                                                        TensorKit.SDD())
                         N = @constinferred leftnull(t, ((3, 4, 2), (1, 5)); alg=alg)
-                        NdN = N' * N
-                        @test NdN ≈ one(NdN)
+                        @test isisometry(N)
                         @test norm(N' * permute(t, ((3, 4, 2), (1, 5)))) <
                               100 * eps(norm(t))
                     end
@@ -474,29 +467,21 @@ for V in spacelist
                                                         TensorKit.Polar(), TensorKit.SVD(),
                                                         TensorKit.SDD())
                         L, Q = @constinferred rightorth(t, ((3, 4), (2, 1, 5)); alg=alg)
-                        QQd = Q * Q'
-                        @test QQd ≈ one(QQd)
+                        @test isisometry(Q; side=:right)
                         @test L * Q ≈ permute(t, ((3, 4), (2, 1, 5)))
-                        if alg isa Polar
-                            @test isposdef(L)
-                            @test domain(L) == codomain(L) == space(t, 3) ⊗ space(t, 4)
-                        end
                     end
                     @testset "rightnull with $alg" for alg in
                                                        (TensorKit.LQ(), TensorKit.SVD(),
                                                         TensorKit.SDD())
                         M = @constinferred rightnull(t, ((3, 4), (2, 1, 5)); alg=alg)
-                        MMd = M * M'
-                        @test MMd ≈ one(MMd)
+                        @test isisometry(M; side=:right)
                         @test norm(permute(t, ((3, 4), (2, 1, 5))) * M') <
                               100 * eps(norm(t))
                     end
                     @testset "tsvd with $alg" for alg in (TensorKit.SVD(), TensorKit.SDD())
                         U, S, V = @constinferred tsvd(t, ((3, 4, 2), (1, 5)); alg=alg)
-                        UdU = U' * U
-                        @test UdU ≈ one(UdU)
-                        VVd = V * V'
-                        @test VVd ≈ one(VVd)
+                        @test isisometry(U)
+                        @test isisometry(V; side=:right)
                         t2 = permute(t, ((3, 4, 2), (1, 5)))
                         @test U * S * V ≈ t2
 
@@ -505,6 +490,11 @@ for V in spacelist
                         for (c, b) in s
                             @test b ≈ s′[c]
                         end
+                        s = LinearAlgebra.svdvals(t2')
+                        s′ = LinearAlgebra.diag(S')
+                        for (c, b) in s
+                            @test b ≈ s′[c]
+                        end
                     end
                     @testset "cond and rank" begin
                         t2 = permute(t, ((3, 4, 2), (1, 5)))
@@ -522,6 +512,10 @@ for V in spacelist
                         λmax = maximum(s -> maximum(abs, s), values(vals))
                         λmin = minimum(s -> minimum(abs, s), values(vals))
                         @test cond(t4) ≈ λmax / λmin
+                        vals = LinearAlgebra.eigvals(t4')
+                        λmax = maximum(s -> maximum(abs, s), values(vals))
+                        λmin = minimum(s -> minimum(abs, s), values(vals))
+                        @test cond(t4') ≈ λmax / λmin
                     end
                 end
                 @testset "empty tensor" begin
@@ -539,8 +533,7 @@ for V in spacelist
                                                       (TensorKit.QR(), TensorKit.SVD(),
                                                        TensorKit.SDD())
                         N = @constinferred leftnull(t; alg=alg)
-                        @test N' * N ≈ id(domain(N))
-                        @test N * N' ≈ id(codomain(N))
+                        @test isunitary(N)
                     end
                     @testset "rightorth with $alg" for alg in
                                                        (TensorKit.RQ(), TensorKit.RQpos(),
@@ -555,8 +548,7 @@ for V in spacelist
                                                        (TensorKit.LQ(), TensorKit.SVD(),
                                                         TensorKit.SDD())
                         M = @constinferred rightnull(copy(t'); alg=alg)
-                        @test M * M' ≈ id(codomain(M))
-                        @test M' * M ≈ id(domain(M))
+                        @test isunitary(M)
                     end
                     @testset "tsvd with $alg" for alg in (TensorKit.SVD(), TensorKit.SDD())
                         U, S, V = @constinferred tsvd(t; alg=alg)
@@ -592,8 +584,7 @@ for V in spacelist
                     @test !isposdef(t2) # unlikely for non-hermitian map
                     t2 = (t2 + t2')
                     D, V = eigen(t2)
-                    VdV = V' * V
-                    @test VdV ≈ one(VdV)
+                    @test isisometry(V)
                     D̃, Ṽ = @constinferred eigh(t2)
                     @test D ≈ D̃
                     @test V ≈ Ṽ
@@ -615,23 +606,36 @@ for V in spacelist
                     for t in ts
                         U₀, S₀, V₀, = tsvd(t)
                         t = rmul!(t, 1 / norm(S₀, p))
-                        U, S, V, ϵ = @constinferred tsvd(t; trunc=truncerr(5e-1), p=p)
+                        U, S, V = @constinferred tsvd(t; trunc=truncerr(5e-1, p))
+                        ϵ = TensorKit._norm(LinearAlgebra.svdvals(U * S * V - t), p,
+                                            zero(scalartype(S)))
+                        p == 2 && @test ϵ < 5e-1
                         # @show p, ϵ
                         # @show domain(S)
                         # @test min(space(S,1), space(S₀,1)) != space(S₀,1)
-                        U′, S′, V′, ϵ′ = tsvd(t; trunc=truncerr(nextfloat(ϵ)), p=p)
+                        U′, S′, V′ = tsvd(t; trunc=truncerr(ϵ + 10eps(ϵ), p))
+                        ϵ′ = TensorKit._norm(LinearAlgebra.svdvals(U′ * S′ * V′ - t), p,
+                                             zero(scalartype(S)))
+
                         @test (U, S, V, ϵ) == (U′, S′, V′, ϵ′)
-                        U′, S′, V′, ϵ′ = tsvd(t; trunc=truncdim(ceil(Int, dim(domain(S)))),
-                                              p=p)
+                        U′, S′, V′ = tsvd(t; trunc=truncdim(ceil(Int, dim(domain(S)))))
+                        ϵ′ = TensorKit._norm(LinearAlgebra.svdvals(U′ * S′ * V′ - t), p,
+                                             zero(scalartype(S)))
                         @test (U, S, V, ϵ) == (U′, S′, V′, ϵ′)
-                        U′, S′, V′, ϵ′ = tsvd(t; trunc=truncspace(space(S, 1)), p=p)
+                        U′, S′, V′ = tsvd(t; trunc=truncspace(space(S, 1)))
+                        ϵ′ = TensorKit._norm(LinearAlgebra.svdvals(U′ * S′ * V′ - t), p,
+                                             zero(scalartype(S)))
                         @test (U, S, V, ϵ) == (U′, S′, V′, ϵ′)
                         # results with truncationcutoff cannot be compared because they don't take degeneracy into account, and thus truncate differently
-                        U, S, V, ϵ = tsvd(t; trunc=truncbelow(1 / dim(domain(S₀))), p=p)
+                        U, S, V = tsvd(t; trunc=truncbelow(1 / dim(domain(S₀))))
+                        ϵ = TensorKit._norm(LinearAlgebra.svdvals(U * S * V - t), p,
+                                            zero(scalartype(S)))
                         # @show p, ϵ
                         # @show domain(S)
                         # @test min(space(S,1), space(S₀,1)) != space(S₀,1)
-                        U′, S′, V′, ϵ′ = tsvd(t; trunc=truncspace(space(S, 1)), p=p)
+                        U′, S′, V′ = tsvd(t; trunc=truncspace(space(S, 1)))
+                        ϵ′ = TensorKit._norm(LinearAlgebra.svdvals(U′ * S′ * V′ - t), p,
+                                             zero(scalartype(S)))
                         @test (U, S, V, ϵ) == (U′, S′, V′, ϵ′)
                     end
                 end
@@ -691,8 +695,8 @@ for V in spacelist
             for T in (Float32, ComplexF64)
                 tA = rand(T, V1 ⊗ V3, V1 ⊗ V3)
                 tB = rand(T, V2 ⊗ V4, V2 ⊗ V4)
-                tA = 3 // 2 * leftorth(tA; alg=Polar())[1]
-                tB = 1 // 5 * leftorth(tB; alg=Polar())[1]
+                tA = 3 // 2 * left_polar(tA)[1]
+                tB = 1 // 5 * left_polar(tB)[1]
                 tC = rand(T, V1 ⊗ V3, V2 ⊗ V4)
                 t = @constinferred sylvester(tA, tB, tC)
                 @test codomain(t) == V1 ⊗ V3