Skip to content

Commit e126df5

Browse files
Add more TUDatasets
* MUTAG and Mutagenicity * NCI1 and NCI109 * PTCDatasets * PROTEINS datasets * Add fingerprint dataset
1 parent 4ce6763 commit e126df5

File tree

2 files changed

+249
-15
lines changed

2 files changed

+249
-15
lines changed

README.md

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,7 @@ julia> list_datasets()
1919
TUDatasets.AIDSDataset
2020
TUDatasets.AspirinDataset
2121
TUDatasets.BZRDataset
22-
TUDatasets.BZR_MDDataset
23-
TUDatasets.BenzeneDataset
24-
TUDatasets.COIL_DELDataset
25-
TUDatasets.COIL_RAGDataset
26-
TUDatasets.COLLABDataset
27-
TUDatasets.COLORS_3Dataset
28-
TUDatasets.DBLP_v1Dataset
29-
TUDatasets.DDDataset
30-
TUDatasets.ENZYMESDataset
31-
TUDatasets.QM9Dataset
32-
TUDatasets.REDDIT_BINARYDataset
33-
TUDatasets.SYNTHETICDataset
34-
TUDatasets.SYNTHETICnewDataset
35-
TUDatasets.SynthieDataset
36-
TUDatasets.TRIANGLESDataset
22+
[...]
3723

3824
# Load QM9 from TUDatasets. This dataset contains 129433 molecules represented as graphs.
3925
# The resulting ValGraphCollection is an immutable collection of graphs.

src/TUDatasets.jl

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,10 +235,21 @@ function __init__()
235235
BenzeneDataset(),
236236
BZRDataset(),
237237
BZR_MDDataset(),
238+
MutagenicityDataset(),
239+
MUTAGDataset(),
240+
NCI1Dataset(),
241+
NCI109Dataset(),
242+
PTC_FMDataset(),
243+
PTC_FRDataset(),
244+
PTC_MMDataset(),
245+
PTC_MRDataset(),
238246
DDDataset(),
239247
ENZYMESDataset(),
248+
PROTEINSDataset(),
249+
PROTEINS_fullDataset(),
240250
COIL_DELDataset(),
241251
COIL_RAGDataset(),
252+
FingerprintDataset(),
242253
COLLABDataset(),
243254
DBLP_v1Dataset(),
244255
REDDIT_BINARYDataset(),
@@ -362,6 +373,177 @@ node_labels_map(::BenzeneDataset, i) = ("C", "O", "H")[i + 1]
362373

363374
# Type of the graph attributes for Benzene: a NamedTuple with one Float64 field
# named `total_energy`.
function graph_attributes_type(::BenzeneDataset)
    return NamedTuple{(:total_energy,), Tuple{Float64}}
end
364375

376+
## --------------------------------------
377+
## Mutagenicity
378+
## --------------------------------------
379+
380+
# Field-less subtype of TUDataset that identifies the Mutagenicity dataset.
struct MutagenicityDataset <: TUDataset
end
381+
382+
# Name string of the Mutagenicity dataset.
function dataset_name(::MutagenicityDataset)
    return "Mutagenicity"
end
383+
384+
# Hash of the Mutagenicity dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::MutagenicityDataset)
    return "6230f94ba246b76834fb51ffa138370477b7bf8a784ade92c5e0586780d2ae0e"
end
385+
386+
# Reference numbers for Mutagenicity
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::MutagenicityDataset)
    return [16, 20]
end
387+
388+
# File name of the readme for the Mutagenicity dataset.
function readme_name(::MutagenicityDataset)
    return "Mutagenicity_label_readme.txt"
end
389+
390+
# Type of the node labels for Mutagenicity: a NamedTuple with one String field `chem`.
function node_labels_type(::MutagenicityDataset)
    return NamedTuple{(:chem,), Tuple{String}}
end
391+
# Translate node label code `i` into an element symbol; `i = 0` selects the first entry.
function node_labels_map(::MutagenicityDataset, i)
    symbols = ("C", "O", "Cl", "H", "N", "F", "Br", "S", "P", "I", "Na", "K", "Li", "Ca")
    return symbols[i + 1]
end
392+
393+
# Type of the edge labels for Mutagenicity: a NamedTuple with one Int8 field `valence`.
function edge_labels_type(::MutagenicityDataset)
    return NamedTuple{(:valence,), Tuple{Int8}}
end
394+
# Translate edge label code `i` into a valence value: codes 0, 1, 2 give 1, 2, 3.
function edge_labels_map(::MutagenicityDataset, i)
    valences = (1, 2, 3)
    return valences[i + 1]
end
395+
396+
# Type of the graph labels for Mutagenicity: a NamedTuple with one String field `class`.
function graph_labels_type(::MutagenicityDataset)
    return NamedTuple{(:class,), Tuple{String}}
end
397+
# Translate graph label code `i` into a class name: 0 is "mutagen", 1 is "nonmutagen".
function graph_labels_map(::MutagenicityDataset, i)
    classes = ("mutagen", "nonmutagen")
    return classes[i + 1]
end
398+
399+
## --------------------------------------
400+
## MUTAG
401+
## --------------------------------------
402+
403+
# Field-less subtype of TUDataset that identifies the MUTAG dataset.
struct MUTAGDataset <: TUDataset
end
404+
405+
# Name string of the MUTAG dataset.
function dataset_name(::MUTAGDataset)
    return "MUTAG"
end
406+
407+
# Hash of the MUTAG dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::MUTAGDataset)
    return "c419bdc853c367d2d83da4973c45100954ae15e10f5ae2cddde6ca431f8207f6"
end
408+
409+
# Reference numbers for MUTAG
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::MUTAGDataset)
    return [1, 23]
end
410+
411+
# File name of the readme for the MUTAG dataset.
function readme_name(::MUTAGDataset)
    return "README.txt"
end
412+
413+
# Type of the node labels for MUTAG: a NamedTuple with one String field `chem`.
function node_labels_type(::MUTAGDataset)
    return NamedTuple{(:chem,), Tuple{String}}
end
414+
# Translate node label code `i` into an element symbol; `i = 0` selects the first entry.
function node_labels_map(::MUTAGDataset, i)
    symbols = ("C", "N", "O", "F", "I", "Cl", "Br")
    return symbols[i + 1]
end
415+
416+
# Type of the edge labels for MUTAG: a NamedTuple with one String field `bond_type`.
function edge_labels_type(::MUTAGDataset)
    return NamedTuple{(:bond_type,), Tuple{String}}
end
417+
# Translate edge label code `i` into a bond type name; `i = 0` selects "aromatic".
function edge_labels_map(::MUTAGDataset, i)
    bond_types = ("aromatic", "single", "double", "triple")
    return bond_types[i + 1]
end
418+
419+
# Type of the graph labels for MUTAG: a 1-tuple holding an Int8.
function graph_labels_type(::MUTAGDataset)
    return Tuple{Int8}
end
420+
421+
## --------------------------------------
422+
## NCI1
423+
## --------------------------------------
424+
425+
# Field-less subtype of TUDataset that identifies the NCI1 dataset.
struct NCI1Dataset <: TUDataset
end
426+
427+
# Name string of the NCI1 dataset.
function dataset_name(::NCI1Dataset)
    return "NCI1"
end
428+
429+
# Hash of the NCI1 dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::NCI1Dataset)
    return "10e1458f3bd9224f14e6d7627e74dcfd13e48d376d73935e7bd2900590ef1d82"
end
430+
431+
# Reference numbers for NCI1
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::NCI1Dataset)
    return [8, 9, 22]
end
432+
433+
# File name of the readme for the NCI1 dataset.
function readme_name(::NCI1Dataset)
    return "README.txt"
end
434+
435+
# Type of the node labels for NCI1: a 1-tuple holding an Int8.
function node_labels_type(::NCI1Dataset)
    return Tuple{Int8}
end
436+
437+
# Type of the graph labels for NCI1: a 1-tuple holding a Bool.
function graph_labels_type(::NCI1Dataset)
    return Tuple{Bool}
end
438+
439+
## --------------------------------------
440+
## NCI109
441+
## --------------------------------------
442+
443+
# Field-less subtype of TUDataset that identifies the NCI109 dataset.
struct NCI109Dataset <: TUDataset
end
444+
445+
# Name string of the NCI109 dataset.
function dataset_name(::NCI109Dataset)
    return "NCI109"
end
446+
447+
# Hash of the NCI109 dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::NCI109Dataset)
    return "96e521a294e3e9c088540e9e9caccf55e4ca6e97cf468d68445814467956abaf"
end
448+
449+
# Reference numbers for NCI109
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::NCI109Dataset)
    return [8, 9, 22]
end
450+
451+
# File name of the readme for the NCI109 dataset.
function readme_name(::NCI109Dataset)
    return "README.txt"
end
452+
453+
# Type of the node labels for NCI109: a 1-tuple holding an Int8.
function node_labels_type(::NCI109Dataset)
    return Tuple{Int8}
end
454+
455+
# Type of the graph labels for NCI109: a 1-tuple holding a Bool.
function graph_labels_type(::NCI109Dataset)
    return Tuple{Bool}
end
456+
457+
## --------------------------------------
458+
## PTC_FM
459+
## --------------------------------------
460+
461+
# Field-less subtype of TUDataset that identifies the PTC_FM dataset.
struct PTC_FMDataset <: TUDataset
end
462+
463+
# Name string of the PTC_FM dataset.
function dataset_name(::PTC_FMDataset)
    return "PTC_FM"
end
464+
465+
# Hash of the PTC_FM dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::PTC_FMDataset)
    return "a06c80761db8ffd739a171f0d90cfa1f4dc965e1ea716ee5a25cc6cf5f4ae682"
end
466+
467+
# Reference numbers for PTC_FM
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::PTC_FMDataset)
    return [2, 23]
end
468+
469+
# File name of the readme for the PTC_FM dataset.
function readme_name(::PTC_FMDataset)
    return "README.txt"
end
470+
471+
# Type of the node labels for PTC_FM: a 1-tuple holding a String.
function node_labels_type(::PTC_FMDataset)
    return Tuple{String}
end
472+
# Note: each PTC dataset has a slightly different set of atoms, listed in a slightly different order.
473+
# Translate node label code `i` into an element symbol; `i = 0` selects the first entry.
function node_labels_map(::PTC_FMDataset, i)
    symbols = ("In", "P", "C", "O", "N", "Cl", "S", "Br", "Na", "F", "As", "K", "Cu", "I", "Ba", "Sn", "Pb", "Ca")
    return symbols[i + 1]
end
474+
475+
# Type of the edge labels for PTC_FM: a 1-tuple holding a String.
function edge_labels_type(::PTC_FMDataset)
    return Tuple{String}
end
476+
# Surprisingly, PTC_FM lists "single" before "double", whereas the other PTC datasets use the order triple, double, single, aromatic.
477+
# Translate edge label code `i` into a bond type name; `i = 0` selects "triple".
function edge_labels_map(::PTC_FMDataset, i)
    bond_types = ("triple", "single", "double", "aromatic")
    return bond_types[i + 1]
end
478+
479+
# Type of the graph labels for PTC_FM: a 1-tuple holding an Int8.
function graph_labels_type(::PTC_FMDataset)
    return Tuple{Int8}
end
480+
481+
## --------------------------------------
482+
## PTC_FR
483+
## --------------------------------------
484+
485+
# Field-less subtype of TUDataset that identifies the PTC_FR dataset.
struct PTC_FRDataset <: TUDataset
end
486+
487+
# Name string of the PTC_FR dataset.
function dataset_name(::PTC_FRDataset)
    return "PTC_FR"
end
488+
489+
# Hash of the PTC_FR dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::PTC_FRDataset)
    return "c4b0083af725aaff27b41228591294922968bd5509179d24c6ea4d3996ed6072"
end
490+
491+
# Reference numbers for PTC_FR
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::PTC_FRDataset)
    return [2, 23]
end
492+
493+
# File name of the readme for the PTC_FR dataset.
function readme_name(::PTC_FRDataset)
    return "README.txt"
end
494+
495+
# Type of the node labels for PTC_FR: a 1-tuple holding a String.
function node_labels_type(::PTC_FRDataset)
    return Tuple{String}
end
496+
# Translate node label code `i` into an element symbol; `i = 0` selects the first entry.
function node_labels_map(::PTC_FRDataset, i)
    symbols = ("In", "P", "O", "N", "Na", "C", "Cl", "S", "Br", "F", "As", "K", "Cu", "Zn", "I", "Sn", "Pb", "Te", "Ca")
    return symbols[i + 1]
end
497+
498+
# Type of the edge labels for PTC_FR: a 1-tuple holding a String.
function edge_labels_type(::PTC_FRDataset)
    return Tuple{String}
end
499+
# Translate edge label code `i` into a bond type name; `i = 0` selects "triple".
function edge_labels_map(::PTC_FRDataset, i)
    bond_types = ("triple", "double", "single", "aromatic")
    return bond_types[i + 1]
end
500+
501+
# Type of the graph labels for PTC_FR: a 1-tuple holding an Int8.
function graph_labels_type(::PTC_FRDataset)
    return Tuple{Int8}
end
502+
503+
## --------------------------------------
504+
## PTC_MM
505+
## --------------------------------------
506+
507+
# Field-less subtype of TUDataset that identifies the PTC_MM dataset.
struct PTC_MMDataset <: TUDataset
end
508+
509+
# Name string of the PTC_MM dataset.
function dataset_name(::PTC_MMDataset)
    return "PTC_MM"
end
510+
511+
# Hash of the PTC_MM dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::PTC_MMDataset)
    return "3846d6697330a446d46a1274b8708fcc153acdbcf59f649871bc0844bb012e4f"
end
512+
513+
# Reference numbers for PTC_MM
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::PTC_MMDataset)
    return [2, 23]
end
514+
515+
# File name of the readme for the PTC_MM dataset.
function readme_name(::PTC_MMDataset)
    return "README.txt"
end
516+
517+
# Type of the node labels for PTC_MM: a 1-tuple holding a String.
function node_labels_type(::PTC_MMDataset)
    return Tuple{String}
end
518+
# Translate node label code `i` into an element symbol; `i = 0` selects the first entry.
function node_labels_map(::PTC_MMDataset, i)
    symbols = ("In", "P", "O", "N", "Na", "C", "Cl", "S", "Br", "F", "As", "K", "B", "Cu", "Zn", "I", "Ba", "Sn", "Pb", "Ca")
    return symbols[i + 1]
end
519+
520+
# Type of the edge labels for PTC_MM: a 1-tuple holding a String.
function edge_labels_type(::PTC_MMDataset)
    return Tuple{String}
end
521+
# Translate edge label code `i` into a bond type name; `i = 0` selects "triple".
function edge_labels_map(::PTC_MMDataset, i)
    bond_types = ("triple", "double", "single", "aromatic")
    return bond_types[i + 1]
end
522+
523+
# Type of the graph labels for PTC_MM: a 1-tuple holding an Int8.
function graph_labels_type(::PTC_MMDataset)
    return Tuple{Int8}
end
524+
525+
## --------------------------------------
526+
## PTC_MR
527+
## --------------------------------------
528+
529+
# Field-less subtype of TUDataset that identifies the PTC_MR dataset.
struct PTC_MRDataset <: TUDataset
end
530+
531+
# Name string of the PTC_MR dataset.
function dataset_name(::PTC_MRDataset)
    return "PTC_MR"
end
532+
533+
# Hash of the PTC_MR dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::PTC_MRDataset)
    return "5699a6d9f1bc5b3d71495f09ef50de53fa3e6bb24ead1150da678500229f5237"
end
534+
535+
# Reference numbers for PTC_MR
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::PTC_MRDataset)
    return [2, 23]
end
536+
537+
# File name of the readme for the PTC_MR dataset.
function readme_name(::PTC_MRDataset)
    return "README.txt"
end
538+
539+
# Type of the node labels for PTC_MR: a 1-tuple holding a String.
function node_labels_type(::PTC_MRDataset)
    return Tuple{String}
end
540+
# Translate node label code `i` into an element symbol; `i = 0` selects the first entry.
function node_labels_map(::PTC_MRDataset, i)
    symbols = ("In", "P", "O", "N", "Na", "C", "Cl", "S", "Br", "F", "K", "Cu", "Zn", "I", "Ba", "Sn", "Pb", "Ca")
    return symbols[i + 1]
end
541+
542+
# Type of the edge labels for PTC_MR: a 1-tuple holding a String.
function edge_labels_type(::PTC_MRDataset)
    return Tuple{String}
end
543+
# Translate edge label code `i` into a bond type name; `i = 0` selects "triple".
function edge_labels_map(::PTC_MRDataset, i)
    bond_types = ("triple", "double", "single", "aromatic")
    return bond_types[i + 1]
end
544+
545+
# Type of the graph labels for PTC_MR: a 1-tuple holding an Int8.
function graph_labels_type(::PTC_MRDataset)
    return Tuple{Int8}
end
546+
365547

366548
## --------------------------------------
367549
## QM9
@@ -429,6 +611,48 @@ graph_labels_type(::ENZYMESDataset) = Tuple{Int8}
429611
# Type of the node labels for ENZYMES: a 1-tuple holding an Int8.
function node_labels_type(::ENZYMESDataset)
    return Tuple{Int8}
end
430612
# Type of the node attributes for ENZYMES: a tuple of 18 Float64 values.
function node_attributes_type(::ENZYMESDataset)
    return NTuple{18, Float64}
end
431613

614+
## --------------------------------------
615+
## PROTEINS
616+
## --------------------------------------
617+
618+
# Field-less subtype of TUDataset that identifies the PROTEINS dataset.
struct PROTEINSDataset <: TUDataset
end
619+
620+
# Name string of the PROTEINS dataset.
function dataset_name(::PROTEINSDataset)
    return "PROTEINS"
end
621+
622+
# Hash of the PROTEINS dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::PROTEINSDataset)
    return "2da8de15284b88edabca2888ce5444d62f364ed41159260977088c4e53d4d848"
end
623+
624+
# File name of the readme for the PROTEINS dataset.
function readme_name(::PROTEINSDataset)
    return "README.txt"
end
625+
626+
# Reference numbers for PROTEINS
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::PROTEINSDataset)
    return [4, 6]
end
627+
628+
# Element type used for PROTEINS graphs: Int16
# (presumably the integer type of vertex indices; TODO confirm).
function graph_eltype(::PROTEINSDataset)
    return Int16
end
629+
630+
# Type of the graph labels for PROTEINS: a 1-tuple holding an Int8.
function graph_labels_type(::PROTEINSDataset)
    return Tuple{Int8}
end
631+
632+
# Type of the node labels for PROTEINS: a 1-tuple holding an Int8.
function node_labels_type(::PROTEINSDataset)
    return Tuple{Int8}
end
633+
# Type of the node attributes for PROTEINS: a 1-tuple holding a Float64.
function node_attributes_type(::PROTEINSDataset)
    return Tuple{Float64}
end
634+
635+
## --------------------------------------
636+
## PROTEINS_full
637+
## --------------------------------------
638+
639+
# Field-less subtype of TUDataset that identifies the PROTEINS_full dataset.
struct PROTEINS_fullDataset <: TUDataset
end
640+
641+
# Name string of the PROTEINS_full dataset.
function dataset_name(::PROTEINS_fullDataset)
    return "PROTEINS_full"
end
642+
643+
# Hash of the PROTEINS_full dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::PROTEINS_fullDataset)
    return "3b7782403ce98754df3330a67e9b2aff32e69520aa1245bf515c48cc0119c562"
end
644+
645+
# File name of the readme for the PROTEINS_full dataset.
function readme_name(::PROTEINS_fullDataset)
    return "README.txt"
end
646+
647+
# Reference numbers for PROTEINS_full
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::PROTEINS_fullDataset)
    return [4, 6]
end
648+
649+
# Element type used for PROTEINS_full graphs: Int16
# (presumably the integer type of vertex indices; TODO confirm).
function graph_eltype(::PROTEINS_fullDataset)
    return Int16
end
650+
651+
# Type of the graph labels for PROTEINS_full: a 1-tuple holding an Int8.
function graph_labels_type(::PROTEINS_fullDataset)
    return Tuple{Int8}
end
652+
653+
# Type of the node labels for PROTEINS_full: a 1-tuple holding an Int8.
function node_labels_type(::PROTEINS_fullDataset)
    return Tuple{Int8}
end
654+
# Type of the node attributes for PROTEINS_full: a tuple of 29 Float64 values.
function node_attributes_type(::PROTEINS_fullDataset)
    return NTuple{29, Float64}
end
655+
432656
## --------------------------------------
433657
## COIL-DEL
434658
## --------------------------------------
@@ -473,6 +697,30 @@ edge_attributes_type(::COIL_RAGDataset) = NamedTuple{(:boundary,), Tuple{Float32
473697

474698
# Type of the graph labels for COIL-RAG: a 1-tuple holding an Int8.
# TODO not sure what the labels mean
function graph_labels_type(::COIL_RAGDataset)
    return Tuple{Int8}
end
475699

700+
## --------------------------------------
701+
## Fingerprint
702+
## --------------------------------------
703+
704+
# Field-less subtype of TUDataset that identifies the Fingerprint dataset.
struct FingerprintDataset <: TUDataset
end
705+
706+
# Name string of the Fingerprint dataset.
function dataset_name(::FingerprintDataset)
    return "Fingerprint"
end
707+
708+
# Hash of the Fingerprint dataset as a 64-character hex string
# (presumably a SHA-256 checksum of the download; TODO confirm).
function dataset_hash(::FingerprintDataset)
    return "6c53fc4e71a26b192681375b7a860afd49a24367cb1a15bda6b57067c467154d"
end
709+
710+
# Reference numbers for Fingerprint
# (presumably indices into a shared list of publications; TODO confirm).
function dataset_references(::FingerprintDataset)
    return [16, 19]
end
711+
712+
# File name of the readme for the Fingerprint dataset.
function readme_name(::FingerprintDataset)
    return "Fingerprint_label_readme.txt"
end
713+
714+
# Type of the node attributes for Fingerprint: a NamedTuple with Float64 fields
# `x` and `y` (the same type that `@NamedTuple{x::Float64, y::Float64}` denotes).
function node_attributes_type(::FingerprintDataset)
    return NamedTuple{(:x, :y), Tuple{Float64, Float64}}
end
715+
716+
# Type of the edge attributes for Fingerprint: a NamedTuple with Float64 fields
# `orient` and `angle` (the same type that
# `@NamedTuple{orient::Float64, angle::Float64}` denotes).
function edge_attributes_type(::FingerprintDataset)
    return NamedTuple{(:orient, :angle), Tuple{Float64, Float64}}
end
717+
718+
# Type of the graph labels for Fingerprint: a NamedTuple with one String field
# `class` (the same type that `@NamedTuple{class::String}` denotes).
function graph_labels_type(::FingerprintDataset)
    return NamedTuple{(:class,), Tuple{String}}
end
719+
# Translate graph label code `i` into a class name; `i = 0` selects "L".
function graph_labels_map(::FingerprintDataset, i)
    classes = ("L", "TR", "A", "TA", "W", "R", "T", "WR", "TL", "LT", "AT", "RT", "WL", "RW", "AR")
    return classes[i + 1]
end
721+
722+
723+
476724
## --------------------------------------
477725
## COLLAB
478726
## --------------------------------------

0 commit comments

Comments
 (0)