perf: read_off replace pandas.read_csv engine=python with c

YodaEmbedding · YodaEmbedding · commit e9689562fce2 · 2023-12-24T03:51:35.000-08:00
Big performance improvement: the sliced file is now read from an in-memory StringIO buffer, which removes the need for the slow `engine="python"`. Also fixes a bug where, for OFF files containing more lines than `num_points + num_faces`, the reader tried to read potential edges as faces! As [Wikipedia] says, an OFF file may contain: - points - faces (optional) - edges (optional) Of course, this still does not encompass all possible OFF file variants described by Wikipedia, but it's an improvement. [Wikipedia]: https://en.wikipedia.org/wiki/OFF_(file_format)
diff --git a/pyntcloud/io/off.py b/pyntcloud/io/off.py
@@ -1,62 +1,80 @@
-import pandas as pd
+from contextlib import contextmanager
+from io import StringIO
+from itertools import islice
+
 import numpy as np
+import pandas as pd
 
 
 def read_off(filename):
-
-    with open(filename) as off:
-
-        first_line = off.readline()
+    with open(filename) as f:
+        first_line = f.readline()
         if "OFF" not in first_line:
-            raise ValueError('The file does not start with the word OFF')
-        color = True if "C" in first_line else False
+            raise ValueError("The file does not start with the word OFF")
+        has_color = "C" in first_line
 
-        n_points = 0
-        n_faces = 0
+        num_rows = None
+        n_points = None
+        n_faces = None
 
-        count = 1
-        for line in off:
-            count += 1
+        # Read header.
+        for line in f:
             if line.startswith("#"):
                 continue
             line = line.strip().split()
-            if len(line) > 1:
-                n_points = int(line[0])
-                n_faces = int(line[1])
-                break
+            if len(line) <= 1:
+                continue
+            n_points = int(line[0])
+            n_faces = int(line[1])
+            num_rows = n_points + n_faces
+            break
 
-        if (n_points == 0):
-            raise ValueError('The file has no points')
+        if num_rows is None:
+            raise ValueError("The file does not contain a valid header")
 
-        data = {}
-        point_names = ["x", "y", "z"]
-        point_types = {'x': np.float32, 'y': np.float32, 'z': np.float32}
+        # Read remaining lines.
+        lines = [next(f) for _ in range(num_rows)]
 
-        if color:
-            point_names.extend(["red", "green", "blue"])
-            point_types = dict(point_types, **{'red': np.uint8, 'green': np.uint8, 'blue': np.uint8})
+    if n_points == 0:
+        raise ValueError("The file has no points")
 
+    data = {}
+    point_names = ["x", "y", "z"]
+    point_types = {"x": np.float32, "y": np.float32, "z": np.float32}
+
+    if has_color:
+        point_names.extend(["red", "green", "blue"])
+        color_point_types = {"red": np.uint8, "green": np.uint8, "blue": np.uint8}
+        point_types = {**point_types, **color_point_types}
+
+    with _file_from_lines(lines, 0, n_points) as f:
         data["points"] = pd.read_csv(
-            off,
+            f,
             sep=" ",
             header=None,
             engine="c",
-            nrows=n_points,
             names=point_names,
-            dtype=point_types,
             index_col=False,
-            comment="#"
+            comment="#",
         )
 
+    with _file_from_lines(lines, n_points, n_points + n_faces) as f:
         data["mesh"] = pd.read_csv(
-            filename,
+            f,
             sep=" ",
             header=None,
             engine="c",
-            skiprows=(count + n_points),
-            nrows=n_faces,
             usecols=[1, 2, 3],
             names=["v1", "v2", "v3"],
-            comment="#"
+            comment="#",
         )
-        return data
+
+    return data
+
+
+@contextmanager
+def _file_from_lines(lines, start=None, stop=None):
+    with StringIO() as f:
+        f.writelines("".join(islice(lines, start, stop)))
+        f.seek(0)
+        yield f