perf: read_off remove pandas.read_csv skiprows

YodaEmbedding · YodaEmbedding · commit 4c5211b15ec3 · 2023-12-24T04:30:57.000-08:00
Reading the mesh from the already-open file handle, instead of reopening
the file by name with `skiprows`, avoids the need for pandas to re-parse
the first rows of the file to find the starting position.

---

Also, improve robustness of header parsing a bit.

In particular, some ModelNet40 files have faulty headers in which the
counts are fused onto the `OFF` line:
```bash
$ head -n 1 ModelNet40/chair/train/chair_0856.off
OFF6586 5534 0
```

For reference, the correct format is:
```
OFF
6586 5534 0
```

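Nonetheless, it is still valuable to parse the faulty header, so the
reader now seeks back to just after the non-digit prefix of the first
line and reads the counts from there.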
diff --git a/pyntcloud/io/off.py b/pyntcloud/io/off.py
@@ -1,62 +1,73 @@
-import pandas as pd
+import re
+
 import numpy as np
+import pandas as pd
 
 
 def read_off(filename):
-
-    with open(filename) as off:
-
-        first_line = off.readline()
+    with open(filename) as f:
+        first_line = f.readline()
         if "OFF" not in first_line:
-            raise ValueError('The file does not start with the word OFF')
-        color = True if "C" in first_line else False
+            raise ValueError("The file does not start with the word OFF")
+        has_color = "C" in first_line
 
-        n_points = 0
-        n_faces = 0
+        num_rows = None
+        n_points = None
+        n_faces = None
 
-        count = 1
-        for line in off:
-            count += 1
+        # Backtrack to account for faulty headers, e.g. "OFF4 4 0".
+        m = re.match(r"^(?P<prefix>\D+)([\d\s]+)$", first_line)
+        if m:
+            f.seek(len(m.group("prefix")))
+
+        # Read header.
+        for line in f:
             if line.startswith("#"):
                 continue
             line = line.strip().split()
-            if len(line) > 1:
-                n_points = int(line[0])
-                n_faces = int(line[1])
-                break
+            if len(line) <= 1:
+                continue
+            n_points = int(line[0])
+            n_faces = int(line[1])
+            num_rows = n_points + n_faces
+            break
 
-        if (n_points == 0):
-            raise ValueError('The file has no points')
+        if num_rows is None:
+            raise ValueError("The file does not contain a valid header")
+
+        if n_points == 0:
+            raise ValueError("The file contains no points")
 
         data = {}
         point_names = ["x", "y", "z"]
-        point_types = {'x': np.float32, 'y': np.float32, 'z': np.float32}
+        point_types = {"x": np.float32, "y": np.float32, "z": np.float32}
 
-        if color:
+        if has_color:
             point_names.extend(["red", "green", "blue"])
-            point_types = dict(point_types, **{'red': np.uint8, 'green': np.uint8, 'blue': np.uint8})
+            color_point_types = {"red": np.uint8, "green": np.uint8, "blue": np.uint8}
+            point_types = {**point_types, **color_point_types}
 
         data["points"] = pd.read_csv(
-            off,
+            f,
             sep=" ",
             header=None,
             engine="c",
             nrows=n_points,
             names=point_names,
             dtype=point_types,
             index_col=False,
-            comment="#"
+            comment="#",
         )
 
         data["mesh"] = pd.read_csv(
-            filename,
+            f,
             sep=" ",
             header=None,
             engine="c",
-            skiprows=(count + n_points),
             nrows=n_faces,
             usecols=[1, 2, 3],
             names=["v1", "v2", "v3"],
-            comment="#"
+            comment="#",
         )
-        return data
+
+    return data