
Commit e919f7b

Fix duplication of file reading for Converter subclass & Add docs and example.
1 parent 666f291 commit e919f7b

File tree

4 files changed: +74 −26 lines changed

README.md

Lines changed: 29 additions & 6 deletions
@@ -1,11 +1,23 @@
 # edf2parquet
 Simple utility package to convert EDF/EDF+ files into Apache Parquet format
-while preserving the EDF file header information and signal headers metadata information.
-Currently, each signal is stored as a separate parquet file, with the option to automatically
-add a pandas readable DatetimeIndex.
+while preserving the EDF file header information and signal header metadata, with some nice enhanced features:
+- handling of non-strictly EDF compliant .EDF headers (e.g. UTF-8 characters in the header)
+- automatic conversion of the EDF header start date and signal sampling frequency to a pd.DatetimeIndex with the correct timezone and frequency for easy Pandas interoperability (at the cost of slightly bigger file sizes, of course)
+- skipping of specific signals during conversion
+- bundling of signals with the same sampling frequency into a single parquet file
+- splitting of EDF files by non-use periods (e.g. if a file spans multiple continuous nights and you want a single file per night)
+- compression of the resulting parquet files
+
 
 ## Installation
 
+### Requirements
+The package was tested with the pinned versions in the `requirements.txt` file.
+If something does not work, try installing these exact versions. I would particularly advise
+using matching or more recent versions of PyArrow and Pandas (version 2.0 is important,
+as Pandas 2.0 itself uses Arrow data structures under the hood, so it will break with anything
+below 2.0, as far as I'm aware).
+
 ```bash
 pip install git+https://github.com/NarayanSchuetz/edf2parquet.git
 ```
@@ -16,7 +28,7 @@ Convert an EDF file into Apache parquet format using the EdfToParquetConverter c
 ```python
 import pytz
 
-from edf2parquet.converters import EdfToParquetConverter
+from edf2parquet.converters import EdfToParquetConverter, AdvancedEdfToParquetConverter
 
 my_edf_file_path = "path_to_my_edfile.edf"
 my_parquet_output_dir = "path_to_my_parquet_output_dir"
@@ -27,6 +39,18 @@ converter = EdfToParquetConverter(edf_file_path=my_edf_file_path,
                                   parquet_output_dir=my_parquet_output_dir,
                                   compression_codec="GZIP")
 
+converter.convert()
+
+# or alternatively using the advanced converter
+converter = AdvancedEdfToParquetConverter(edf_file_path=my_edf_file_path,  # path to the EDF file
+                                          exclude_signals=["Audio"],  # signals to exclude from the conversion
+                                          parquet_output_dir=my_parquet_output_dir,  # output directory (created if it does not exist)
+                                          group_by_sampling_freq=True,  # group signals with the same sampling frequency into a single parquet file
+                                          datetime_index=True,  # automatically add a pd.DatetimeIndex to the resulting parquet files
+                                          local_timezone=(pytz.timezone("Europe/Zurich"), pytz.timezone("Europe/Zurich")),  # timezone of the EDF file and of its start_date (the same in most cases)
+                                          compression_codec="GZIP",  # compression codec for the resulting parquet files
+                                          split_non_use_by_col="MY_COLUMN")  # only specify this to split the file by non-use periods (e.g. one output file per night); names the column used for splitting -> see the AdvancedEdfToParquetConverter docstring
+
 converter.convert()
 ```
 ### Reading:
@@ -62,8 +86,7 @@ reader.get_signal_headers()
 Check the `examples.ipynb` notebook for detailed outputs.
 
 ## Todo
-- [ ] Allow to bundle signals with the same sampling rate into a single parquet file.
+- [x] Allow to bundle signals with the same sampling rate into a single parquet file.
 - [ ] Provide a high level user API
 - [ ] Enable (possibly distributed) parallel processing to efficiently convert a whole directory of EDF files.
-- [ ] Provide a high level API to convert EDF files with the same sampling frequency (fs) into a single parquet file with a single row per signal.
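
Since the converter writes ordinary parquet files, the output can also be inspected without edf2parquet. Below is a minimal read-back sketch; the file name is a placeholder (actual names depend on signal labels and grouping settings), and the assumption that the preserved EDF headers land in the parquet key-value metadata follows from the README's "preserving the header information" claim:

```python
import pandas as pd
import pyarrow.parquet as pq

# Placeholder path: actual file names depend on signal labels / grouping settings.
parquet_file = "path_to_my_parquet_output_dir/EEG.parquet"

df = pd.read_parquet(parquet_file)
print(df.index[:5])  # with datetime_index=True, a tz-aware DatetimeIndex

# The preserved EDF file/signal headers are presumably reachable via the
# parquet key-value metadata (raw bytes key/value pairs).
print(pq.read_schema(parquet_file).metadata)
```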

edf2parquet/converters.py

Lines changed: 5 additions & 2 deletions
@@ -50,7 +50,7 @@ def __init__(
         """
         self._datetime_index = datetime_index
         self._default_signal_dtype = default_signal_dtype
-        self._edf_file = pyedflib.EdfReader(edf_file_path)
+        self._edf_reader = EdfReader(edf_file_path)
         self._parquet_output_dir = parquet_output_dir
         self._compression_codec = compression_codec
         self._local_timezone = local_timezone
@@ -61,6 +61,10 @@ def __del__(self) -> None:
     def __repr__(self) -> str:
         return f"EdfToParquetConverter({self._edf_file.getHeader()})"
 
+    @property
+    def _edf_file(self):
+        return self._edf_reader.edf_file
+
     def convert(self) -> Optional[Dict[str, pa.Table]]:
         """
         Converts an EDF/EDF+ file to Apache Parquet file format.
@@ -241,7 +245,6 @@ def __init__(
         """
         super().__init__(edf_file_path, datetime_index, default_signal_dtype, parquet_output_dir,
                          compression_codec, local_timezone)
-        self._edf_reader = EdfReader(edf_file_path)
         self._group_by_sampling_freq = group_by_sampling_freq
         self._exclude_signals = exclude_signals
         self._split_non_use_by_col = split_non_use_by_col
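The removed line above is the actual fix: previously both the base class and the AdvancedEdfToParquetConverter subclass opened their own EdfReader, so the file was read twice. Now the base class owns the single reader and exposes the underlying handle through the new `_edf_file` property. A stripped-down sketch of the pattern, using pyedflib directly instead of edf2parquet's internal EdfReader wrapper and omitting all other constructor arguments:

```python
import pyedflib

class BaseConverter:
    def __init__(self, edf_file_path: str):
        # The file is opened exactly once, in the base class.
        self._edf_reader = pyedflib.EdfReader(edf_file_path)

    @property
    def _edf_file(self) -> pyedflib.EdfReader:
        # Subclasses reach the shared handle through this property
        # instead of opening the file a second time.
        return self._edf_reader

class AdvancedConverter(BaseConverter):
    def __init__(self, edf_file_path: str):
        super().__init__(edf_file_path)  # reuses the reader created above
        # ...advanced-only configuration would go here...
```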

examples.ipynb

Lines changed: 35 additions & 13 deletions
@@ -34,6 +34,41 @@
    "converter.convert()"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "source": [
+   "### Read an EDF file and convert it to Parquet files using the AdvancedEdfToParquetConverter class directly."
+  ],
+  "metadata": {
+   "collapsed": false
+  }
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "outputs": [],
+  "source": [
+   "from edf2parquet.converters import AdvancedEdfToParquetConverter\n",
+   "import pytz\n",
+   "\n",
+   "my_edf_file = \"path_to_my_edfile.edf\"  # REPLACE WITH YOUR EDF FILE PATH\n",
+   "my_parquet_output_dir = \"path_to_my_parquet_output_dir\"  # REPLACE WITH YOUR PARQUET OUTPUT DIRECTORY\n",
+   "\n",
+   "converter = AdvancedEdfToParquetConverter(edf_file_path=my_edf_file,  # path to the EDF file\n",
+   "                                          exclude_signals=[\"Audio\"],  # signals to exclude from the conversion\n",
+   "                                          parquet_output_dir=my_parquet_output_dir,  # output directory (created if it does not exist)\n",
+   "                                          group_by_sampling_freq=True,  # group signals with the same sampling frequency into a single parquet file\n",
+   "                                          datetime_index=True,  # automatically add a pd.DatetimeIndex to the resulting parquet files\n",
+   "                                          local_timezone=(pytz.timezone(\"Europe/Zurich\"), pytz.timezone(\"Europe/Zurich\")),  # timezone of the EDF file and of its start_date (the same in most cases)\n",
+   "                                          compression_codec=\"GZIP\",  # compression codec for the resulting parquet files\n",
+   "                                          split_non_use_by_col=\"MY_COLUMN\")  # only specify this to split the file by non-use periods (e.g. one output file per night); names the column used for splitting -> see the AdvancedEdfToParquetConverter docstring\n",
+   "\n",
+   "converter.convert()"
+  ],
+  "metadata": {
+   "collapsed": false
+  }
+ },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -220,19 +255,6 @@
     "end_time": "2023-04-23T18:37:56.725993Z"
    }
   }
- },
- {
-  "cell_type": "code",
-  "execution_count": 1,
-  "outputs": [],
-  "source": [],
-  "metadata": {
-   "collapsed": false,
-   "ExecuteTime": {
-    "start_time": "2023-04-25T17:04:41.598905Z",
-    "end_time": "2023-04-25T17:04:41.602388Z"
-   }
-  }
  }
 ],
 "metadata": {

setup.py

Lines changed: 5 additions & 5 deletions
@@ -3,7 +3,7 @@
 
 MAJOR = "0"
 MINOR = "1"
-PATCH = "1"
+PATCH = "2"
 
 _VERSION_TAG = "{MAJOR}.{MINOR}.{PATCH}".format(MAJOR=MAJOR, MINOR=MINOR, PATCH=PATCH)
 
@@ -12,8 +12,8 @@
 
 
 def get_version():
-    import subprocess
-    commit_hash = str(subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE).stdout)[2:-3]
+    # import subprocess
+    # commit_hash = str(subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE).stdout)[2:-3]
     return '{VERSION_TAG}'.format(VERSION_TAG=_VERSION_TAG)
 
 
@@ -30,9 +30,9 @@ def get_version():
     install_requires=[
         'pytest',
         'numpy',
-        'pandas',
+        'pandas>=2.0.0',
         'pyarrow',
-        'pyedflib==0.1.19',  # newer versions
+        'pyedflib>=0.1.32',
     ],
     setup_requires=[
         'pytest-runner',
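
The loosened pyedflib pin and the new pandas >= 2.0.0 floor can be checked in an existing environment before converting anything. A quick sanity-check sketch, assuming the third-party `packaging` library is available (it usually is wherever pip is):

```python
from importlib.metadata import version
from packaging.version import Version  # third-party, commonly installed alongside pip

# Mirrors the pins introduced in setup.py above.
assert Version(version("pandas")) >= Version("2.0.0"), "pandas >= 2.0.0 required"
assert Version(version("pyedflib")) >= Version("0.1.32"), "pyedflib >= 0.1.32 required"
print("environment satisfies edf2parquet's requirements")
```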

0 commit comments
