
Commit 8bb3ceb

update seq transformer
1 parent 58ac558 commit 8bb3ceb

13 files changed (+250, -483 lines)

configs/regression/DNA/transformer/deit/deit_t_dim64_l512_f20_bs1024_ep100.py

Lines changed: 4 additions & 6 deletions
@@ -11,12 +11,13 @@
     type='Classification',
     pretrained=None,
     backbone=dict(
-        type='DNATransformer',
+        type='SequenceTransformer',
         arch={'embed_dims': embed_dim,
               'num_layers': 12,
               'num_heads': embed_dim // 16,
               'feedforward_channels': embed_dim * 4},
         in_channels=4,
+        padding_index=0,
         seq_len=seq_len,
         norm_cfg=dict(type='LN', eps=1e-6),
         drop_rate=0.1,
@@ -44,16 +45,13 @@
     data_type="regression", target_type='total',
     filter_condition=20, max_seq_length=512,
 )
-
 data = dict(
     samples_per_gpu=64, # bs64 x 8gpu x 2 accu = bs1024
     workers_per_gpu=4,
     train=dict(
-        data_source=dict(
-            root=data_root+"train", **data_source_cfg)),
+        data_source=dict(root=data_root+"train", **data_source_cfg)),
     val=dict(
-        data_source=dict(
-            root=data_root+"test", **data_source_cfg)),
+        data_source=dict(root=data_root+"test", **data_source_cfg)),
 )
 update_interval = 2
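Aside: the new `padding_index=0` argument presumably feeds the token-embedding path used when the backbone runs on integer token ids rather than one-hot channels (the renamed f80 config below pairs it with `with_embedding=True`, commented as using `nn.Embedding`). A minimal sketch of that kind of embedding stem, assuming index 0 is reserved for padding and A/C/G/T occupy 1..4; `vocab_size` and `embed` are illustrative names, not taken from this repo:

import torch
import torch.nn as nn

embed_dim = 64
vocab_size = 1 + 4                             # assumed: 0 = padding, 1..4 = A/C/G/T

# padding_idx pins the padding row to zeros and keeps it out of gradient updates
embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

tokens = torch.tensor([[1, 2, 3, 4, 0, 0]])    # toy right-padded sequence
feats = embed(tokens)                          # (1, 6, 64); padded positions map to zero vectors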

configs/regression/DNA/transformer/deit/deit_t_dim64_l512_f5_bs1024_ep100.py renamed to configs/regression/DNA/transformer/deit/deit_t_dim64_l512_f80_bs1024_ep50.py

Lines changed: 7 additions & 8 deletions
@@ -11,19 +11,21 @@
     type='Classification',
     pretrained=None,
     backbone=dict(
-        type='DNATransformer',
+        type='SequenceTransformer',
         arch={'embed_dims': embed_dim,
               'num_layers': 12,
               'num_heads': embed_dim // 16,
               'feedforward_channels': embed_dim * 4},
         in_channels=4,
+        padding_index=0,
         seq_len=seq_len,
         norm_cfg=dict(type='LN', eps=1e-6),
         drop_rate=0.1,
         drop_path_rate=0.1,
         init_values=0.1,
         final_norm=True,
         out_indices=-1, # last layer
+        with_embedding=True, # use `nn.Embedding`
         with_cls_token=False,
         output_cls_token=False,
     ),
@@ -42,18 +44,15 @@
     file_list=None, # use all splits
     word_splitor="", data_splitor=",", mapping_name="ACGT", # gRNA tokenize
     data_type="regression", target_type='total',
-    filter_condition=5, max_seq_length=512,
+    filter_condition=80, max_seq_length=512,
 )
-
 data = dict(
     samples_per_gpu=64, # bs64 x 8gpu x 2 accu = bs1024
-    workers_per_gpu=4,
+    workers_per_gpu=2,
     train=dict(
-        data_source=dict(
-            root=data_root+"train", **data_source_cfg)),
+        data_source=dict(root=data_root+"train", **data_source_cfg)),
     val=dict(
-        data_source=dict(
-            root=data_root+"test", **data_source_cfg)),
+        data_source=dict(root=data_root+"test", **data_source_cfg)),
 )
 update_interval = 2
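These configs appear to follow the mmcv-style Python config convention (note fp16 = dict(type='mmcv', ...) in the new pre-training config below), so a renamed file like this one can usually be sanity-checked by loading it and inspecting the merged dict. A minimal sketch, assuming an mmcv 1.x environment and running from the repository root:

from mmcv import Config

cfg = Config.fromfile(
    'configs/regression/DNA/transformer/deit/deit_t_dim64_l512_f80_bs1024_ep50.py')
print(cfg.model.backbone.type)               # expected: 'SequenceTransformer'
print(cfg.data_source_cfg.filter_condition)  # expected: 80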

configs/regression/_base_/datasets/DNA/dna.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@
     samples_per_gpu=100,
     workers_per_gpu=2,
     eval_param=dict(
-        metric=['mse', 'spearman'],
+        metric=['mse', 'spearman', 'pearson'],
         metric_options=dict(average_mode='mean')
     ),
 )
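For reference, the newly added 'pearson' metric is the linear counterpart of the existing 'spearman' rank correlation. The repo's own evaluation code is not shown in this commit; a minimal numpy/scipy sketch of the three configured metrics looks like:

import numpy as np
from scipy import stats

pred = np.array([0.10, 0.40, 0.35, 0.80])
target = np.array([0.00, 0.50, 0.30, 0.90])

mse = np.mean((pred - target) ** 2)          # squared regression error
spearman, _ = stats.spearmanr(pred, target)  # monotonic (rank) agreement
pearson, _ = stats.pearsonr(pred, target)    # linear agreement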
New file, lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
+_base_ = [
+    '../../../_base_/datasets/DNA/dna_pretrain.py',
+    '../../../_base_/default_runtime.py',
+]
+
+embed_dim = 64
+seq_len = 512
+patch_size = 1
+
+# model settings
+model = dict(
+    type='BERT',
+    pretrained=None,
+    mask_ratio=0.15, # BERT 15%
+    backbone=dict(
+        type='SimMIMTransformer',
+        arch={'embed_dims': embed_dim,
+              'num_layers': 12,
+              'num_heads': embed_dim // 16,
+              'feedforward_channels': embed_dim * 4},
+        in_channels=4,
+        padding_index=0,
+        seq_len=seq_len,
+        mask_layer=10,
+        mask_ratio=0.15, # BERT 15%
+        mask_token='learnable',
+        norm_cfg=dict(type='LN', eps=1e-6),
+        drop_rate=0., # no dropout for pre-training
+        drop_path_rate=0.1,
+        final_norm=True,
+        out_indices=-1, # last layer
+        with_embedding=True, # use `nn.Embedding`
+        with_cls_token=True,
+        output_cls_token=True,
+    ),
+    neck=dict(
+        type='SimMIMNeck', feature_Nd="1d",
+        in_channels=embed_dim, out_channels=5, encoder_stride=patch_size),
+    head=dict(
+        type='MIMHead',
+        loss=dict(type='CrossEntropyLoss',
+            use_soft=False, use_sigmoid=False, reduction='none', loss_weight=1.0),
+        feature_Nd="1d", unmask_weight=0., encoder_in_channels=5,
+    ),
+    init_cfg=[
+        dict(type='TruncNormal', layer=['Conv1d', 'Linear'], std=0.02, bias=0.),
+        dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.)
+    ],
+)
+
+# dataset settings
+data_root = 'data/dna/'
+data_source_cfg = dict(
+    type='DNASeqDataset',
+    file_list=None, # use all splits
+    # file_list=["train_0.csv",], # use all splits
+    word_splitor="", data_splitor=",", mapping_name="ACGT", # gRNA tokenize
+    has_labels=True, return_label=False, # pre-training
+    data_type="regression", target_type='total',
+    filter_condition=5, max_seq_length=seq_len,
+)
+data = dict(
+    samples_per_gpu=2, # bs64 x 8gpu x 2 accu = bs1024
+    workers_per_gpu=2,
+    train=dict(
+        data_source=dict(root=data_root+"train", **data_source_cfg)),
+)
+update_interval = 2 # bs64 x 8gpu x 2 accu = bs1024
+
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=1e-3,
+    weight_decay=1e-2, eps=1e-8, betas=(0.9, 0.999),
+    paramwise_options={
+        '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.),
+        'norm': dict(weight_decay=0.),
+        'bias': dict(weight_decay=0.),
+        'cls_token': dict(weight_decay=0.),
+        'pos_embed': dict(weight_decay=0.),
+        'mask_token': dict(weight_decay=0.),
+    })
+
+# apex
+use_fp16 = False
+fp16 = dict(type='mmcv', loss_scale=dict(mode='dynamic'))
+optimizer_config = dict(
+    grad_clip=dict(max_norm=10.0), update_interval=1)
+
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    by_epoch=False, min_lr=1e-5,
+    warmup='linear',
+    warmup_iters=5, warmup_by_epoch=True,
+    warmup_ratio=1e-5,
+)
+
+# checkpoint
+checkpoint_config = dict(interval=1, max_keep_ckpts=1)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=100)
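The config above sets up a BERT-style objective: 15% of positions (mask_ratio=0.15) are replaced with a learnable mask token at mask_layer=10, and an unreduced CrossEntropyLoss reconstructs the original tokens over 5 classes (matching out_channels=5 and encoder_in_channels=5). The repo's masking helper is not part of this commit; a minimal standalone sketch of random position masking at that 15% ratio, with illustrative names only, could look like:

import torch

def random_token_mask(x, mask_ratio=0.15):
    """Return a (B, L) 0/1 mask flagging roughly mask_ratio of positions per sample."""
    B, L, _ = x.shape
    num_mask = int(L * mask_ratio)
    noise = torch.rand(B, L, device=x.device)  # iid score per position
    ids = noise.argsort(dim=1)                 # random permutation per sample
    mask = torch.zeros(B, L, device=x.device)
    mask.scatter_(1, ids[:, :num_mask], 1.0)   # flag the first num_mask indices
    return mask

feats = torch.randn(2, 512, 64)
mask = random_token_mask(feats)                # 76 of 512 positions flagged per sample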
New file, lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+# dataset settings
+data_root = 'data/dna/'
+data_source_cfg = dict(
+    type='DNASeqDataset',
+    file_list=None, # use all splits
+    word_splitor="", data_splitor=",", mapping_name="ACGT", # gRNA tokenize
+    has_labels=True, return_label=False, # pre-training
+    data_type="regression", target_type='total',
+    filter_condition=5, max_seq_length=512
+)
+
+dataset_type = 'ExtractDataset'
+sample_norm_cfg = dict(mean=[0,], std=[1,])
+train_pipeline = [
+    dict(type='ToTensor'),
+]
+test_pipeline = [
+    dict(type='ToTensor'),
+]
+# prefetch
+prefetch = False
+
+data = dict(
+    samples_per_gpu=256,
+    workers_per_gpu=4,
+    drop_last=True,
+    train=dict(
+        type=dataset_type,
+        data_source=dict(
+            root=data_root+"train",
+            **data_source_cfg),
+        pipeline=train_pipeline,
+        prefetch=prefetch,
+    ),
+)
+
+# checkpoint
+checkpoint_config = dict(interval=1, max_keep_ckpts=1)
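With word_splitor="" and mapping_name="ACGT", the data source presumably tokenizes each sequence character by character over the A/C/G/T alphabet. A toy sketch of that idea, assuming index 0 stays reserved for padding (consistent with padding_index=0 in the backbone configs); the +1 offset and the helper name are illustrative, not read from the dataset code:

MAPPING = "ACGT"
PAD_ID = 0  # assumed to line up with padding_index=0

def encode(seq: str, max_len: int = 512) -> list:
    ids = [MAPPING.index(ch) + 1 for ch in seq[:max_len]]  # A->1, C->2, G->3, T->4
    return ids + [PAD_ID] * (max_len - len(ids))           # right-pad to max_len

print(encode("ACGTT", max_len=8))  # [1, 2, 3, 4, 4, 0, 0, 0]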

openbioseq/datasets/data_sources/dna_seq_source.py

Lines changed: 3 additions & 1 deletion
@@ -48,6 +48,7 @@ def __init__(self,
                  data_splitor=" ",
                  mapping_name="ACGT",
                  has_labels=True,
+                 return_label=True,
                  target_type='',
                  filter_condition=0,
                  data_type="classification",
@@ -71,6 +72,7 @@ def __init__(self,

         # instance vars
         self.has_labels = len(lines[0].split(data_splitor)) >= 2 and has_labels
+        self.return_label = return_label
         self.data_type = data_type
         self.max_seq_length = max_seq_length
         self.filter_condition = filter_condition
@@ -125,7 +127,7 @@ def get_length(self):

     def get_sample(self, idx):
         seq = self.data_list[idx]
-        if self.has_labels:
+        if self.has_labels and self.return_label:
             target = self.labels[idx]
             return seq, target
         else:
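The new return_label flag lets the same labeled CSVs serve both stages: the pre-training configs above pass return_label=False, so get_sample yields the sequence alone, while the regression configs keep the default True and receive (seq, target) pairs. A toy illustration of the two modes (not the real DNASeqDataset constructor):

class ToySource:
    def __init__(self, return_label=True):
        self.data_list = ["ACGT", "GGTA"]
        self.labels = [0.7, 0.2]
        self.has_labels = True
        self.return_label = return_label

    def get_sample(self, idx):
        seq = self.data_list[idx]
        if self.has_labels and self.return_label:
            return seq, self.labels[idx]  # supervised: sequence plus target
        return seq                        # pre-training: sequence only

print(ToySource(return_label=True).get_sample(0))   # ('ACGT', 0.7)
print(ToySource(return_label=False).get_sample(0))  # ACGT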

openbioseq/models/backbones/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -4,7 +4,6 @@
 from .resnet import ResNet, ResNet_CIFAR, ResNet_Mix, ResNet_Mix_CIFAR
 from .seq_lstm import SequenceLSTM
 from .seq_transformer import SequenceTransformer
-from .seq_embed_transformer import DNATransformer
 from .timm_backbone import TIMMBackbone
 from .uniformer import UniFormer
 from .van import VAN
@@ -16,5 +15,5 @@
     'MAETransformer', 'MAEViT', 'MIMVisionTransformer', 'SimMIMTransformer', 'SimMIMViT',
     'ResNet', 'ResNet_CIFAR', 'ResNet_Mix', 'ResNet_Mix_CIFAR',
     'SequenceLSTM', 'SequenceTransformer', 'TIMMBackbone', 'TransformerEncoderLayer',
-    'UniFormer', 'VAN', 'VisionTransformer', 'WideResNet', 'DNATransformer'
+    'UniFormer', 'VAN', 'VisionTransformer', 'WideResNet',
 ]

openbioseq/models/backbones/mim_vit.py

Lines changed: 19 additions & 6 deletions
@@ -92,7 +92,13 @@ def _init_weights(self, m):
     def forward(self, x):
         """ MAE backbone only used for MAE model """
         B = x.shape[0]
-        x, _ = self.patch_embed(x)
+        if not self.with_embedding:
+            x, _ = self.patch_embed(x)
+        else:
+            if x.dtype != torch.long: # must be indice
+                x = x.type(torch.long).clamp(0, x.size(1)-1)
+            x = self.embedding_layer(x)
+
         # add pos embed w/o cls token
         x = x + self.pos_embed[:, 1:, :]
         # masking: length -> length * mask_ratio
@@ -360,16 +366,23 @@ def forward(self, x, mask=None):
         Returns:
             tuple: A tuple containing features from multi-stages.
         """
-        x, seq_len = self.patch_embed(x)
+        if not self.with_embedding:
+            x, seq_len = self.patch_embed(x)
+        else:
+            if x.dtype != torch.long: # must be indice
+                x = x.type(torch.long).clamp(0, x.size(1)-1)
+            x = self.embedding_layer(x)
+            seq_len = self.seq_len

         if self.mask_layer == 0:
             if mask is None:
                 mask = simmim_random_masking(x, self.mask_ratio)
             x = forward_simmim_masking(
                 x, self.mask_token, mask, self.mask_mode)

-        cls_tokens = self.cls_token.expand(x.size(0), -1, -1)
-        x = torch.cat((cls_tokens, x), dim=1)
+        if self.with_cls_token or self.output_cls_token:
+            cls_tokens = self.cls_token.expand(x.size(0), -1, -1)
+            x = torch.cat((cls_tokens, x), dim=1)
         x = x + self.resize_pos_embed(
             self.pos_embed,
             src_shape=self.patch_resolution,
@@ -378,7 +391,7 @@ def forward(self, x, mask=None):
             num_extra_tokens=self.num_extra_tokens)
         x = self.drop_after_pos(x)

-        if not self.with_cls_token:
+        if self.with_cls_token and not self.output_cls_token:
             # Remove class token for transformer encoder input
             x = x[:, 1:]

@@ -393,7 +406,7 @@ def forward(self, x, mask=None):
             if i == len(self.layers) - 1 and self.final_norm:
                 x = self.norm1(x)

-        if self.with_cls_token:
+        if self.output_cls_token:
             x = x[:, 1:]

         return (x, mask)
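For context, the cls-token handling that the new with_cls_token / output_cls_token flags gate follows the usual ViT pattern: one learnable token is expanded across the batch, concatenated in front of the sequence, and later sliced off again. A minimal standalone sketch of that pattern (not the repo's class):

import torch
import torch.nn as nn

B, L, C = 2, 512, 64
x = torch.randn(B, L, C)                        # token features from the stem/embedding
cls_token = nn.Parameter(torch.zeros(1, 1, C))  # single learnable token

cls_tokens = cls_token.expand(B, -1, -1)        # (B, 1, C), weights shared across the batch
x = torch.cat((cls_tokens, x), dim=1)           # (B, L + 1, C)
x_without_cls = x[:, 1:]                        # slice it back off, as the gated branches above do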
