elalish · elalish · Jul 8, 2024 · Jul 7, 2024 · Jul 7, 2024 · Jul 7, 2024
diff --git a/.github/workflows/manifold.yml b/.github/workflows/manifold.yml
@@ -221,7 +221,7 @@ jobs:
         pip install trimesh pytest
     - name: Install TBB
       if: matrix.parallel_backend == 'TBB'
-      run: brew install tbb
+      run: brew install tbb onedpl
     - uses: actions/checkout@v4
       with:
         submodules: recursive

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -113,7 +113,8 @@
     "format": "cpp",
     "numbers": "cpp",
     "ranges": "cpp",
-    "stop_token": "cpp"
+    "stop_token": "cpp",
+    "execution": "cpp"
   },
   "C_Cpp.clang_format_fallbackStyle": "google",
   "editor.formatOnSave": true,

diff --git a/README.md b/README.md
@@ -34,7 +34,6 @@ This is a modern C++ library that Github's CI verifies builds and runs on a vari
 
 System Dependencies (note that we will automatically download the dependency if there is no such package on the system):
 - [`GLM`](https://github.com/g-truc/glm/): A compact header-only vector library.
-- [`Thrust`](https://github.com/NVIDIA/thrust): NVIDIA's parallel algorithms library (basically a superset of C++17 std::parallel_algorithms)
 - [`tbb`](https://github.com/oneapi-src/oneTBB/): Intel's thread building blocks library. (only when `MANIFOLD_PAR=TBB` is enabled)
 - [`gtest`](https://github.com/google/googletest/): Google test library (only when test is enabled, i.e. `MANIFOLD_TEST=ON`)
 
@@ -48,7 +47,7 @@ This library is fast with guaranteed manifold output. As such you need manifold
 
 The most significant contribution here is a guaranteed-manifold [mesh Boolean](https://github.com/elalish/manifold/wiki/Manifold-Library#mesh-boolean) algorithm, which I believe is the first of its kind. If you know of another, please open a discussion - a mesh Boolean algorithm robust to edge cases has been an open problem for many years. Likewise, if the Boolean here ever fails you, please submit an issue! This Boolean forms the basis of a CAD kernel, as it allows simple shapes to be combined into more complex ones.
 
-To aid in speed, this library makes extensive use of parallelization, generally through Nvidia's Thrust library. You can switch between the TBB, and serial C++ backends by setting a CMake flag. Not everything is so parallelizable, for instance a [polygon triangulation](https://github.com/elalish/manifold/wiki/Manifold-Library#polygon-triangulation) algorithm is included which is serial. Even if compiled with parallel backend, the code will still fall back to the serial version of the algorithms if the problem size is small. The WASM build is serial-only for now, but still fast.
+To aid in speed, this library makes extensive use of parallelization, generally through PSTL. You can switch between the TBB, and serial C++ backends by setting a CMake flag. Not everything is so parallelizable, for instance a [polygon triangulation](https://github.com/elalish/manifold/wiki/Manifold-Library#polygon-triangulation) algorithm is included which is serial. Even if compiled with parallel backend, the code will still fall back to the serial version of the algorithms if the problem size is small. The WASM build is serial-only for now, but still fast.
 
 > Note: OMP and CUDA backends are now removed
 
@@ -83,7 +82,6 @@ CMake flags (usage e.g. `-DMANIFOLD_DEBUG=ON`):
 Offline building:
 - `FETCHCONTENT_SOURCE_DIR_GLM`: path to glm source.
 - `FETCHCONTENT_SOURCE_DIR_GOOGLETEST`: path to googletest source.
-- `FETCHCONTENT_SOURCE_DIR_THRUST`: path to NVIDIA thrust source.
 
 The build instructions used by our CI are in [manifold.yml](https://github.com/elalish/manifold/blob/master/.github/workflows/manifold.yml), which is a good source to check if something goes wrong and for instructions specific to other platforms, like Windows.
 

diff --git a/flake.lock b/flake.lock
diff --git a/flake.nix b/flake.nix
@@ -5,15 +5,11 @@
     url = "github:google/googletest/v1.14.0";
     flake = false;
   };
-  inputs.thrust-src = {
-    url = "git+https://github.com/NVIDIA/thrust.git?submodules=1";
-    flake = false;
-  };
   inputs.clipper2-src = {
     url = "github:AngusJohnson/Clipper2";
     flake = false;
   };
-  outputs = { self, nixpkgs, flake-utils, gtest-src, thrust-src, clipper2-src }:
+  outputs = { self, nixpkgs, flake-utils, gtest-src, clipper2-src }:
     flake-utils.lib.eachDefaultSystem
       (system:
         let
@@ -52,7 +48,6 @@
                 "-DMANIFOLD_PYBIND=ON"
                 "-DMANIFOLD_CBIND=ON"
                 "-DBUILD_SHARED_LIBS=ON"
-                "-DFETCHCONTENT_SOURCE_DIR_THRUST=${thrust-src}"
                 "-DMANIFOLD_PAR=${pkgs.lib.strings.toUpper parallel-backend}"
               ];
               prePatch = ''
@@ -100,7 +95,6 @@
                 emcmake cmake -DCMAKE_BUILD_TYPE=Release \
                 -DFETCHCONTENT_SOURCE_DIR_GLM=${pkgs.glm.src} \
                 -DFETCHCONTENT_SOURCE_DIR_GOOGLETEST=${gtest-src} \
-                -DFETCHCONTENT_SOURCE_DIR_THRUST=${thrust-src} \
                 -DFETCHCONTENT_SOURCE_DIR_CLIPPER2=../clipper2 ..
               '';
               buildPhase = ''
@@ -139,7 +133,6 @@
                 pathspec
                 pkg-config
               ];
-              SKBUILD_CMAKE_DEFINE = "FETCHCONTENT_SOURCE_DIR_THRUST=${thrust-src}";
               checkInputs = [
                 trimesh
                 pytest

diff --git a/manifoldDeps.cmake b/manifoldDeps.cmake
@@ -1,9 +1,12 @@
 include(FetchContent)
 include(GNUInstallDirs)
 find_package(PkgConfig QUIET)
-find_package(Clipper2)
+find_package(Clipper2 QUIET)
 if(MANIFOLD_PAR STREQUAL "TBB")
     find_package(TBB QUIET)
+    if(APPLE)
+        find_package(oneDPL QUIET)
+    endif()
 endif()
 if (PKG_CONFIG_FOUND)
     if (NOT Clipper2_FOUND)

diff --git a/src/collider/include/collider.h b/src/collider/include/collider.h
@@ -36,7 +36,7 @@ class Collider {
   Vec<Box> nodeBBox_;
   Vec<int> nodeParent_;
   // even nodes are leaves, odd nodes are internal, root is 1
-  Vec<thrust::pair<int, int>> internalChildren_;
+  Vec<std::pair<int, int>> internalChildren_;
 
   size_t NumInternal() const { return internalChildren_.size(); };
   size_t NumLeaves() const {

diff --git a/src/collider/src/collider.cpp b/src/collider/src/collider.cpp
@@ -69,7 +69,7 @@ int Leaf2Node(int leaf) { return leaf * 2; }
 
 struct CreateRadixTree {
   VecView<int> nodeParent_;
-  VecView<thrust::pair<int, int>> internalChildren_;
+  VecView<std::pair<int, int>> internalChildren_;
   const VecView<const uint32_t> leafMorton_;
 
   int PrefixLength(uint32_t a, uint32_t b) const {
@@ -137,7 +137,7 @@ struct CreateRadixTree {
     int first = internal;
     // Find the range of objects with a common prefix
     int last = RangeEnd(first);
-    if (first > last) thrust::swap(first, last);
+    if (first > last) std::swap(first, last);
     // Determine where the next-highest difference occurs
     int split = FindSplit(first, last);
     int child1 = split == first ? Leaf2Node(split) : Internal2Node(split);
@@ -156,7 +156,7 @@ template <typename T, const bool selfCollision, typename Recorder>
 struct FindCollisions {
   VecView<const T> queries;
   VecView<const Box> nodeBBox_;
-  VecView<const thrust::pair<int, int>> internalChildren_;
+  VecView<const std::pair<int, int>> internalChildren_;
   Recorder recorder;
 
   int RecordCollision(int node, const int queryIdx) {
@@ -244,7 +244,7 @@ struct BuildInternalBoxes {
   VecView<Box> nodeBBox_;
   VecView<int> counter_;
   const VecView<int> nodeParent_;
-  const VecView<thrust::pair<int, int>> internalChildren_;
+  const VecView<std::pair<int, int>> internalChildren_;
 
   void operator()(int leaf) {
     int node = Leaf2Node(leaf);
@@ -288,7 +288,7 @@ Collider::Collider(const VecView<const Box>& leafBB,
   // assign and allocate members
   nodeBBox_.resize(num_nodes);
   nodeParent_.resize(num_nodes, -1);
-  internalChildren_.resize(leafBB.size() - 1, thrust::make_pair(-1, -1));
+  internalChildren_.resize(leafBB.size() - 1, std::make_pair(-1, -1));
   // organize tree
   for_each_n(autoPolicy(NumInternal()), countAt(0), NumInternal(),
              CreateRadixTree({nodeParent_, internalChildren_, leafMorton}));

diff --git a/src/manifold/src/boolean3.cpp b/src/manifold/src/boolean3.cpp
@@ -105,7 +105,7 @@ inline bool Shadows(float p, float q, float dir) {
   return p == q ? dir < 0 : p < q;
 }
 
-inline thrust::pair<int, glm::vec2> Shadow01(
+inline std::pair<int, glm::vec2> Shadow01(
     const int p0, const int q1, VecView<const glm::vec3> vertPosP,
     VecView<const glm::vec3> vertPosQ, VecView<const Halfedge> halfedgeQ,
     const float expandP, VecView<const glm::vec3> normalP, const bool reverse) {
@@ -133,7 +133,7 @@ inline thrust::pair<int, glm::vec2> Shadow01(
       if (!Shadows(vertPosP[p0].y, yz01[0], expandP * normalP[p0].y)) s01 = 0;
     }
   }
-  return thrust::make_pair(s01, yz01);
+  return std::make_pair(s01, yz01);
 }
 
 // https://github.com/scandum/binary_search/blob/master/README.md
@@ -406,7 +406,7 @@ struct Kernel12 {
         if (k < 2 && (k == 0 || (s != 0) != shadows)) {
           shadows = s != 0;
           xzyLR0[k] = vertPosP[vert];
-          thrust::swap(xzyLR0[k].y, xzyLR0[k].z);
+          std::swap(xzyLR0[k].y, xzyLR0[k].z);
           xzyLR1[k] = xzyLR0[k];
           xzyLR1[k][1] = z02[idx];
           k++;
@@ -434,7 +434,7 @@ struct Kernel12 {
           xzyLR0[k][2] = xyzz.y;
           xzyLR1[k] = xzyLR0[k];
           xzyLR1[k][1] = xyzz.w;
-          if (!forward) thrust::swap(xzyLR0[k][1], xzyLR1[k][1]);
+          if (!forward) std::swap(xzyLR0[k][1], xzyLR1[k][1]);
           k++;
         }
       }

diff --git a/src/manifold/src/boolean_result.cpp b/src/manifold/src/boolean_result.cpp
@@ -14,6 +14,7 @@
 
 #include <algorithm>
 #include <array>
+#include <iostream>
 #include <map>
 
 #if MANIFOLD_PAR == 'T' && __has_include(<tbb/concurrent_map.h>)
@@ -30,10 +31,8 @@
 #endif
 #include "boolean3.h"
 #include "par.h"
-#include "polygon.h"
 
 using namespace manifold;
-using namespace thrust::placeholders;
 
 template <>
 struct std::hash<std::pair<int, int>> {
@@ -46,7 +45,7 @@
 
 constexpr int kParallelThreshold = 128;
 
-struct AbsSum : public thrust::binary_function<int, int, int> {
+struct AbsSum {
   int operator()(int a, int b) { return abs(a) + abs(b); }
 };
 
@@ -93,10 +92,6 @@
   }
 };
 
-struct NotZero : public thrust::unary_function<int, int> {
-  int operator()(int x) const { return x > 0 ? 1 : 0; }
-};
-
 std::tuple<Vec<int>, Vec<int>> SizeOutput(
     Manifold::Impl &outR, const Manifold::Impl &inP, const Manifold::Impl &inQ,
     const Vec<int> &i03, const Vec<int> &i30, const Vec<int> &i12,
@@ -122,31 +117,48 @@
                  {sidesPerFaceQ, sidesPerFaceP, i21, p2q1, inQ.halfedge_}));
 
   Vec<int> facePQ2R(inP.NumTri() + inQ.NumTri() + 1, 0);
-  auto keepFace = TransformIterator(sidesPerFacePQ.begin(), NotZero());
+  auto keepFace = TransformIterator(sidesPerFacePQ.begin(),
+                                    [](int x) { return x > 0 ? 1 : 0; });
 
   inclusive_scan(autoPolicy(sidesPerFacePQ.size()), keepFace,
                  keepFace + sidesPerFacePQ.size(), facePQ2R.begin() + 1);
   int numFaceR = facePQ2R.back();
   facePQ2R.resize(inP.NumTri() + inQ.NumTri());
 
   outR.faceNormal_.resize(numFaceR);
-  auto next = copy_if<decltype(outR.faceNormal_.begin())>(
-      autoPolicy(inP.faceNormal_.size()), inP.faceNormal_.begin(),
-      inP.faceNormal_.end(), keepFace, outR.faceNormal_.begin(),
-      thrust::identity<bool>());
+
+  Vec<size_t> tmpBuffer(outR.faceNormal_.size());
+  auto faceIds = TransformIterator(countAt(0_z), [&sidesPerFacePQ](size_t i) {
+    if (sidesPerFacePQ[i] > 0) return i;
+    return std::numeric_limits<size_t>::max();
+  });
+
+  auto next = copy_if<decltype(tmpBuffer.begin())>(
+      autoPolicy(inP.faceNormal_.size()), faceIds,
+      faceIds + inP.faceNormal_.size(), tmpBuffer.begin(),
+      [](size_t v) { return v != std::numeric_limits<size_t>::max(); });
+
+  gather(autoPolicy(inP.faceNormal_.size()), tmpBuffer.begin(), next,
+         inP.faceNormal_.begin(), outR.faceNormal_.begin());
+
+  auto faceIdsQ =
+      TransformIterator(countAt(0_z), [&sidesPerFacePQ, &inP](size_t i) {
+        if (sidesPerFacePQ[i + inP.faceNormal_.size()] > 0) return i;
+        return std::numeric_limits<size_t>::max();
+      });
+  auto end = copy_if<decltype(tmpBuffer.begin())>(
+      autoPolicy(inQ.faceNormal_.size()), faceIdsQ,
+      faceIdsQ + inQ.faceNormal_.size(), next,
+      [](size_t v) { return v != std::numeric_limits<size_t>::max(); });
+
   if (invertQ) {
-    auto start =
-        TransformIterator(inQ.faceNormal_.begin(), thrust::negate<glm::vec3>());
-    auto end =
-        TransformIterator(inQ.faceNormal_.end(), thrust::negate<glm::vec3>());
-    copy_if<decltype(inQ.faceNormal_.begin())>(
-        autoPolicy(inQ.faceNormal_.size()), start, end, keepFace + inP.NumTri(),
-        next, thrust::identity<bool>());
+    gather(autoPolicy(inQ.faceNormal_.size()), next, end,
+           TransformIterator(inQ.faceNormal_.begin(), Negate<glm::vec3>()),
+           outR.faceNormal_.begin() + std::distance(tmpBuffer.begin(), next));
   } else {
-    copy_if<decltype(inQ.faceNormal_.begin())>(
-        autoPolicy(inQ.faceNormal_.size()), inQ.faceNormal_.begin(),
-        inQ.faceNormal_.end(), keepFace + inP.NumTri(), next,
-        thrust::identity<bool>());
+    gather(autoPolicy(inQ.faceNormal_.size()), next, end,
+           inQ.faceNormal_.begin(),
+           outR.faceNormal_.begin() + std::distance(tmpBuffer.begin(), next));
   }
 
   auto newEnd = remove<decltype(sidesPerFacePQ.begin())>(
@@ -662,13 +674,13 @@
   Vec<int> i30(w30_.size());
 
   transform(autoPolicy(x12_.size()), x12_.begin(), x12_.end(), i12.begin(),
-            c3 * _1);
+            [c3](int v) { return c3 * v; });
   transform(autoPolicy(x21_.size()), x21_.begin(), x21_.end(), i21.begin(),
-            c3 * _1);
+            [c3](int v) { return c3 * v; });
   transform(autoPolicy(w03_.size()), w03_.begin(), w03_.end(), i03.begin(),
-            c1 + c3 * _1);
+            [c1, c3](int v) { return c1 + c3 * v; });
   transform(autoPolicy(w30_.size()), w30_.begin(), w30_.end(), i30.begin(),
-            c2 + c3 * _1);
+            [c2, c3](int v) { return c2 + c3 * v; });
 
   Vec<int> vP2R(inP_.NumVert());
   exclusive_scan(autoPolicy(i03.size()), i03.begin(), i03.end(), vP2R.begin(),