sevagh
diff --git a/‎.github/PERFORMANCE.md
Lines changed: 7 additions & 0 deletions b/‎.github/PERFORMANCE.md
Lines changed: 7 additions & 0 deletions
diff --git a/‎.github/SDR_scores.md
Lines changed: 38 additions & 12 deletions b/‎.github/SDR_scores.md
Lines changed: 38 additions & 12 deletions
diff --git a/‎CMakeLists.txt
Lines changed: 10 additions & 0 deletions b/‎CMakeLists.txt
Lines changed: 10 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 19 additions & 18 deletions b/‎README.md
Lines changed: 19 additions & 18 deletions
@@ -54,3 +54,10 @@ sys     3m28.465s
 ```
 
 More than 2x faster for 4 threads. This is inspired by the parallelism strategy used in <https://freemusicdemixer.com>.
+
+V3 is a faster algorithm, and its multi-threaded (mt) variant, run with 4 threads, finishes in about 2.6 minutes:
+```
+real    2m35.737s
+user    10m28.019s
+sys     2m42.292s
+```
@@ -13,10 +13,10 @@ other           ==> SDR:   7.421  SIR:  11.289  ISR:  14.241  SAR:   8.179
 ```
 CPP inference (this codebase):
 ```
-vocals          ==> SDR:   8.339  SIR:  18.276  ISR:  15.836  SAR:   8.346
-drums           ==> SDR:  10.058  SIR:  18.596  ISR:  17.019  SAR:  10.810
-bass            ==> SDR:   3.919  SIR:  12.436  ISR:   6.931  SAR:   3.182
-other           ==> SDR:   7.421  SIR:  11.286  ISR:  14.252  SAR:   8.183
+vocals          ==> SDR:   8.370  SIR:  18.188  ISR:  15.924  SAR:   8.475
+drums           ==> SDR:  10.002  SIR:  18.571  ISR:  17.027  SAR:  10.645
+bass            ==> SDR:   4.021  SIR:  12.407  ISR:   7.031  SAR:   3.223
+other           ==> SDR:   7.469  SIR:  11.367  ISR:  14.186  SAR:   8.182
 ```
 *n.b.* for the above results, the random shift in the beginning of the song was fixed to 1337 in both PyTorch and C++.
 
@@ -33,10 +33,10 @@ other           ==> SDR:   0.168  SIR:  11.449  ISR:   0.411  SAR:  -2.720
 ```
 CPP inference (this codebase):
 ```
-vocals          ==> SDR:   8.395  SIR:  18.699  ISR:  16.076  SAR:   8.576
-drums           ==> SDR:   9.927  SIR:  17.921  ISR:  17.518  SAR:  10.635
-bass            ==> SDR:   4.519  SIR:  10.458  ISR:   8.606  SAR:   4.370
-other           ==> SDR:   0.164  SIR:  11.443  ISR:   0.409  SAR:  -2.713
+vocals          ==> SDR:   8.395  SIR:  18.581  ISR:  16.101  SAR:   8.579
+drums           ==> SDR:   9.922  SIR:  18.013  ISR:  17.477  SAR:  10.669
+bass            ==> SDR:   4.523  SIR:  10.482  ISR:   8.567  SAR:   4.336
+other           ==> SDR:   0.167  SIR:  11.145  ISR:   0.448  SAR:  -1.238
 ```
 
 *n.b.* the "other" score will be artificially low because of the extra guitar + piano separation where there are no stems to compare to
@@ -54,10 +54,36 @@ other           ==> SDR:   7.384  SIR:  12.812  ISR:  12.977  SAR:   7.798
 ```
 CPP inference (this codebase, `demucs_ft.cpp`)
 ```
-vocals          ==> SDR:   8.594  SIR:  19.045  ISR:  16.313  SAR:   8.617
-drums           ==> SDR:  10.463  SIR:  19.782  ISR:  17.144  SAR:  11.132
-bass            ==> SDR:   4.584  SIR:   9.359  ISR:   9.068  SAR:   4.885
-other           ==> SDR:   7.426  SIR:  12.793  ISR:  12.975  SAR:   7.830
+vocals          ==> SDR:   8.679  SIR:  18.861  ISR:  16.611  SAR:   8.664
+drums           ==> SDR:  10.480  SIR:  19.898  ISR:  17.125  SAR:  11.053
+bass            ==> SDR:   4.590  SIR:   9.516  ISR:   9.102  SAR:   4.935
+other           ==> SDR:   7.370  SIR:  12.853  ISR:  12.926  SAR:   7.805
+```
+
+### Performance of v3 (hdemucs_mmi) model
+
+Track 'Zeno - Signs' from MUSDB18-HQ test set
+
+PyTorch inference (using v3-mmi default segment length + LSTM max length of 200):
+```
+vocals          ==> SDR:   8.328  SIR:  18.943  ISR:  16.097  SAR:   8.563
+drums           ==> SDR:   9.284  SIR:  18.123  ISR:  16.230  SAR:  10.125
+bass            ==> SDR:   3.612  SIR:  10.313  ISR:   6.958  SAR:   3.077
+other           ==> SDR:   7.122  SIR:  11.391  ISR:  14.363  SAR:   7.910
+```
+PyTorch inference (using v4 7.8s segment length + LSTM max length of 336):
+```
+vocals          ==> SDR:   8.304  SIR:  18.916  ISR:  16.087  SAR:   8.557
+drums           ==> SDR:   9.279  SIR:  18.149  ISR:  16.203  SAR:  10.109
+bass            ==> SDR:   3.601  SIR:  10.350  ISR:   6.971  SAR:   3.076
+other           ==> SDR:   7.123  SIR:  11.373  ISR:  14.373  SAR:   7.907
+```
+CPP inference (this codebase, `demucs_v3.cpp`):
+```
+vocals          ==> SDR:   8.332  SIR:  18.889  ISR:  16.083  SAR:   8.557
+drums           ==> SDR:   9.285  SIR:  18.242  ISR:  16.194  SAR:  10.140
+bass            ==> SDR:   3.668  SIR:  10.040  ISR:   7.056  SAR:   3.210
+other           ==> SDR:   7.130  SIR:  11.440  ISR:  14.257  SAR:   7.860
 ```
 
 ### Performance of multi-threaded inference
 
@@ -104,6 +104,16 @@ target_include_directories(demucs_ft_mt.cpp.main PRIVATE vendor/libnyquist/inclu
 target_include_directories(demucs_ft_mt.cpp.main PRIVATE cli-apps)
 target_link_libraries(demucs_ft_mt.cpp.main demucs.cpp.lib libnyquist)
 
+add_executable(demucs_v3.cpp.main "cli-apps/demucs_v3.cpp")  # CLI program for the v3 hdemucs_mmi model
+target_include_directories(demucs_v3.cpp.main PRIVATE vendor/libnyquist/include)  # vendored libnyquist headers
+target_include_directories(demucs_v3.cpp.main PRIVATE cli-apps)  # headers located in cli-apps/
+target_link_libraries(demucs_v3.cpp.main demucs.cpp.lib libnyquist)  # link against demucs.cpp.lib and libnyquist
+
+add_executable(demucs_v3_mt.cpp.main "cli-apps/demucs_v3_mt.cpp")  # multi-threaded CLI program for the v3 hdemucs_mmi model
+target_include_directories(demucs_v3_mt.cpp.main PRIVATE vendor/libnyquist/include)  # vendored libnyquist headers
+target_include_directories(demucs_v3_mt.cpp.main PRIVATE cli-apps)  # headers located in cli-apps/
+target_link_libraries(demucs_v3_mt.cpp.main demucs.cpp.lib libnyquist)  # link against demucs.cpp.lib and libnyquist
+
 file(GLOB SOURCES_TO_LINT "src/*.cpp" "src/*.hpp" "cli-apps/*.cpp" "cli-apps/*.hpp")
 
 # add target to run standard lints and formatters
 
@@ -1,8 +1,8 @@
 # demucs.cpp
 
-C++17 library that implements the inference of the [Demucs v4 hybrid transformer model](https://github.com/facebookresearch/demucs), a PyTorch neural network for music demixing.
+C++17 library that implements inference for the [Demucs v4 hybrid transformer](https://github.com/facebookresearch/demucs) and [Demucs v3 hybrid](https://github.com/facebookresearch/demucs/tree/v3) models, which are high-performance PyTorch neural networks for music source separation.
 
-It uses only the standard library and the header-only library [Eigen](https://eigen.tuxfamily.org/index.php?title=Main_Page) as dependencies, making it suitable to compile and run on many platforms. It was designed for low-memory environments by sacrificing the speed of the Torch implementation.
+It uses only the standard library (C++17) and the header-only library [Eigen](https://eigen.tuxfamily.org/index.php?title=Main_Page) as dependencies, making it suitable to compile and run on many platforms. It was designed for low-memory environments by sacrificing the speed of the Torch implementation.
 
 Demucs.cpp powers my websites (<https://freemusicdemixer.com>, <https://pro.freemusicdemixer.com>) and now my new Android app [Music Demixer](https://play.google.com/store/apps/details?id=com.freemusicdemixer.pro) to bring Demucs to your pocket!
 
@@ -12,9 +12,11 @@ See my other project [umx.cpp](https://github.com/sevagh/umx.cpp) for a similar
 
 ### Library design
 
-It uses [libnyquist](https://github.com/ddiakopoulos/libnyquist) to load audio files, the [ggml](https://github.com/ggerganov/ggml) file format to serialize the PyTorch weights of `htdemucs`, `htdemucs_6s`, and `htdemucs_ft` (4-source, 6-source, fine-tuned) to a binary file format, and [Eigen](https://eigen.tuxfamily.org/index.php?title=Main_Page) (+ OpenMP) to implement the inference. There are also programs for multi-threaded Demucs inference using C++11's `std::thread`.
+The inference library (in `src/`) uses the [ggml](https://github.com/ggerganov/ggml) binary file format to store the serialized PyTorch weights of `hdemucs_mmi`, `htdemucs`, `htdemucs_6s`, and `htdemucs_ft` (v3, v4 4-source, v4 6-source, v4 fine-tuned), and [Eigen](https://eigen.tuxfamily.org/index.php?title=Main_Page) to implement the inference (OpenMP is required).
 
-**All Hybrid-Transformer weights** (4-source, 6-source, fine-tuned) are supported. See the [Convert weights](#convert-weights) section below. Demixing quality is nearly identical to PyTorch as shown in the [SDR scores doc](./.github/SDR_scores.md).
+The CLI programs (in `cli-apps/`) additionally use [libnyquist](https://github.com/ddiakopoulos/libnyquist) to read and write audio files, and the multi-threaded CLI programs use C++11's `std::thread`.
+
+**All Hybrid-Transformer weights** (4-source, 6-source, fine-tuned) are supported. See the [Convert weights](#convert-weights) section below. Inference for the **Demucs v3 Hybrid model weights** `hdemucs_mmi` is also supported. Demixing quality is practically identical to PyTorch as shown in the [SDR scores doc](./.github/SDR_scores.md).
 
 ### Directory structure
 
@@ -23,8 +25,10 @@ It uses [libnyquist](https://github.com/ddiakopoulos/libnyquist) to load audio f
 1. `demucs_ft.cpp.main`: run all four fine-tuned models for `htdemucs_ft` inference, same as the BagOfModels idea of PyTorch Demucs
 1. `demucs_mt.cpp.main`: run a single model, multi-threaded
 1. `demucs_ft_mt.cpp.main`: run all four fine-tuned models, multi-threaded
+1. `demucs_v3.cpp.main`: run a single model for v3 `hdemucs_mmi`
+1. `demucs_v3_mt.cpp.main`: run a single model for v3 `hdemucs_mmi`, multi-threaded
 
-See the [PERFORMANCE doc](./.github/PERFORMANCE.md) for details on multi-threading, external BLAS libraries, etc..
+See the [PERFORMANCE doc](./.github/PERFORMANCE.md) for time measurements, benchmarks, details on multi-threading, external BLAS libraries, etc.
 
 ## Instructions
 
@@ -45,10 +49,6 @@ $ sudo apt-get install gcc g++ cmake clang-tools libopenblas0-openmp libopenblas
 Compile with CMake:
 ```
 $ mkdir -p build && cd build && cmake .. && make -j16
-libdemucs.cpp.lib.a <--- library
-demucs.cpp.main     <--- single-model (4s, 6s, ft)
-demucs_ft.cpp.main  <--- bag of ft models
-demucs.cpp.test     <--- unit tests
 ```
 
 ### Convert weights
@@ -62,7 +62,7 @@ $ mamba activate demucscpp
 $ python -m pip install -r ./scripts/requirements.txt
 ```
 
-Dump Demucs weights to ggml file, with flag `--six-source` for the 6-source variant, and all of `--ft-drums, --ft-vocals, --ft-bass, --ft-other` for the fine-tuned models:
+Dump Demucs weights to a ggml file, with flag `--six-source` for the 6-source variant, all of `--ft-drums`, `--ft-vocals`, `--ft-bass`, `--ft-other` for the fine-tuned models, and `--v3` for the v3 model:
 ```
 $ python ./scripts/convert-pth-to-ggml.py ./ggml-demucs
 ...
@@ -76,14 +76,15 @@ Done. Output file:  ggml-demucs/ggml-model-htdemucs-4s-f16.bin
 
 All supported models would look like this:
 ```
-$ ls ../ggml-demucs/
-total 133M
- 81M Jan 10 22:40 ggml-model-htdemucs-4s-f16.bin
- 53M Jan 10 22:41 ggml-model-htdemucs-6s-f16.bin
- 81M Jan 10 22:41 ggml-model-htdemucs_ft_drums-4s-f16.bin
- 81M Jan 10 22:43 ggml-model-htdemucs_ft_bass-4s-f16.bin
- 81M Jan 10 22:43 ggml-model-htdemucs_ft_other-4s-f16.bin
- 81M Jan 10 22:43 ggml-model-htdemucs_ft_vocals-4s-f16.bin
+$ ls ./ggml-demucs/
+total 613M
+160M May  5 14:38 ggml-model-hdemucs_mmi-v3-f16.bin
+ 53M May  5 16:50 ggml-model-htdemucs-6s-f16.bin
+ 81M May  5 16:50 ggml-model-htdemucs_ft_vocals-4s-f16.bin
+ 81M May  5 16:50 ggml-model-htdemucs_ft_bass-4s-f16.bin
+ 81M May  5 16:50 ggml-model-htdemucs_ft_drums-4s-f16.bin
+ 81M May  5 16:50 ggml-model-htdemucs_ft_other-4s-f16.bin
+ 81M May  5 16:51 ggml-model-htdemucs-4s-f16.bin
 ```
 
 ### Run demucs.cpp