Skip to content

Commit 4c5211b

Browse files
committed
perf: read_off remove pandas.read_csv skiprows
Using the already seeked file avoids the need for pandas to reparse the first rows of the file to find the starting position.

---

Also, improve robustness of header parsing a bit. In particular, ModelNet40 has faulty headers:

```bash
$ head -n 1 ModelNet40/chair/train/chair_0856.off
OFF6586 5534 0
```

For reference, the correct format is:

```
OFF
6586 5534 0
```

Nonetheless, it is still valuable to parse the faulty header.
1 parent 248fd0f commit 4c5211b

File tree

1 file changed

+38
-27
lines changed

1 file changed

+38
-27
lines changed

pyntcloud/io/off.py

Lines changed: 38 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,73 @@
1-
import pandas as pd
1+
import re
2+
23
import numpy as np
4+
import pandas as pd
35

46

57
def read_off(filename):
6-
7-
with open(filename) as off:
8-
9-
first_line = off.readline()
8+
with open(filename) as f:
9+
first_line = f.readline()
1010
if "OFF" not in first_line:
11-
raise ValueError('The file does not start with the word OFF')
12-
color = True if "C" in first_line else False
11+
raise ValueError("The file does not start with the word OFF")
12+
has_color = "C" in first_line
1313

14-
n_points = 0
15-
n_faces = 0
14+
num_rows = None
15+
n_points = None
16+
n_faces = None
1617

17-
count = 1
18-
for line in off:
19-
count += 1
18+
# Backtrack to account for faulty headers, e.g. "OFF4 4 0".
19+
m = re.match(r"^(?P<prefix>\D+)([\d\s]+)$", first_line)
20+
if m:
21+
f.seek(len(m.group("prefix")))
22+
23+
# Read header.
24+
for line in f:
2025
if line.startswith("#"):
2126
continue
2227
line = line.strip().split()
23-
if len(line) > 1:
24-
n_points = int(line[0])
25-
n_faces = int(line[1])
26-
break
28+
if len(line) <= 1:
29+
continue
30+
n_points = int(line[0])
31+
n_faces = int(line[1])
32+
num_rows = n_points + n_faces
33+
break
2734

28-
if (n_points == 0):
29-
raise ValueError('The file has no points')
35+
if num_rows is None:
36+
raise ValueError("The file does not contain a valid header")
37+
38+
if n_points == 0:
39+
raise ValueError("The file contains no points")
3040

3141
data = {}
3242
point_names = ["x", "y", "z"]
33-
point_types = {'x': np.float32, 'y': np.float32, 'z': np.float32}
43+
point_types = {"x": np.float32, "y": np.float32, "z": np.float32}
3444

35-
if color:
45+
if has_color:
3646
point_names.extend(["red", "green", "blue"])
37-
point_types = dict(point_types, **{'red': np.uint8, 'green': np.uint8, 'blue': np.uint8})
47+
color_point_types = {"red": np.uint8, "green": np.uint8, "blue": np.uint8}
48+
point_types = {**point_types, **color_point_types}
3849

3950
data["points"] = pd.read_csv(
40-
off,
51+
f,
4152
sep=" ",
4253
header=None,
4354
engine="c",
4455
nrows=n_points,
4556
names=point_names,
4657
dtype=point_types,
4758
index_col=False,
48-
comment="#"
59+
comment="#",
4960
)
5061

5162
data["mesh"] = pd.read_csv(
52-
filename,
63+
f,
5364
sep=" ",
5465
header=None,
5566
engine="c",
56-
skiprows=(count + n_points),
5767
nrows=n_faces,
5868
usecols=[1, 2, 3],
5969
names=["v1", "v2", "v3"],
60-
comment="#"
70+
comment="#",
6171
)
62-
return data
72+
73+
return data

0 commit comments

Comments
 (0)