Commit 58ac558

fix dna

1 parent 2bc0a52 commit 58ac558

6 files changed (+153, -50 lines changed)

New file

Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
+_base_ = [
+    '../../../_base_/datasets/DNA/dna.py',
+    '../../../_base_/default_runtime.py',
+]
+
+embed_dim = 64
+seq_len = 512
+
+# model settings
+model = dict(
+    type='Classification',
+    pretrained=None,
+    backbone=dict(
+        type='DNATransformer',
+        arch={'embed_dims': embed_dim,
+              'num_layers': 12,
+              'num_heads': embed_dim // 16,
+              'feedforward_channels': embed_dim * 4},
+        in_channels=4,
+        seq_len=seq_len,
+        norm_cfg=dict(type='LN', eps=1e-6),
+        drop_rate=0.1,
+        drop_path_rate=0.1,
+        init_values=0.1,
+        final_norm=True,
+        out_indices=-1,  # last layer
+        with_cls_token=False,
+        output_cls_token=False,
+    ),
+    head=dict(
+        type='RegHead',
+        loss=dict(type='RegressionLoss', mode='huber_loss',
+                  loss_weight=1.0, reduction='mean',
+                  activate='sigmoid', alpha=0.2, gamma=1.0, beta=1.0, residual=False),
+        with_avg_pool=True, in_channels=embed_dim, out_channels=1),
+)
+
+# dataset settings
+data_root = 'data/dna/'
+data_source_cfg = dict(
+    type='DNASeqDataset',
+    file_list=None,  # use all splits
+    word_splitor="", data_splitor=",", mapping_name="ACGT",  # gRNA tokenize
+    data_type="regression", target_type='total',
+    filter_condition=20, max_seq_length=512,
+)
+
+data = dict(
+    samples_per_gpu=64,  # bs64 x 8gpu x 2 accu = bs1024
+    workers_per_gpu=4,
+    train=dict(
+        data_source=dict(
+            root=data_root+"train", **data_source_cfg)),
+    val=dict(
+        data_source=dict(
+            root=data_root+"test", **data_source_cfg)),
+)
+update_interval = 2
+
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=5e-3,
+    weight_decay=1e-2, eps=1e-8, betas=(0.9, 0.999),
+    paramwise_options={
+        '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.),
+        'norm': dict(weight_decay=0.),
+        'bias': dict(weight_decay=0.),
+        'pos_embed': dict(weight_decay=0.),
+        'gamma': dict(weight_decay=0.),
+        'noise_sigma': dict(weight_decay=0., lr_mult=1e-1),
+    })
+
+# apex
+use_fp16 = False
+fp16 = dict(type='mmcv', loss_scale='dynamic')
+optimizer_config = dict(
+    grad_clip=dict(max_norm=5.0), update_interval=update_interval)
+
+# learning policy
+lr_config = dict(
+    policy='CosineAnnealing',
+    by_epoch=False, min_lr=1e-5,
+    warmup='linear',
+    warmup_iters=1, warmup_by_epoch=True,
+    warmup_ratio=1e-5,
+)
+
+# checkpoint
+checkpoint_config = dict(interval=1, max_keep_ckpts=1)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=50)
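
As a quick sanity check (an illustrative sketch, not part of the commit; assumes mmcv is installed), these files are plain-Python mmcv configs and can be loaded and inspected with Config.fromfile. The path below is the renamed config from this commit:

    from mmcv import Config

    # Load the config and print the fields this commit touches.
    cfg = Config.fromfile(
        'configs/regression/DNA/transformer/deit/deit_t_dim64_l512_f5_bs1024_ep100.py')
    print(cfg.model.backbone.arch)  # {'embed_dims': 64, 'num_layers': 12, ...}
    print(cfg.optimizer.lr)         # 5e-3
    print(cfg.runner.max_epochs)    # 50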

configs/regression/DNA/transformer/layer4/layer4_p2_h4_d64_init_bs256_ep100.py renamed to configs/regression/DNA/transformer/deit/deit_t_dim64_l512_f5_bs1024_ep100.py

Lines changed: 36 additions & 11 deletions

@@ -2,17 +2,20 @@
     '../../../_base_/datasets/DNA/dna.py',
     '../../../_base_/default_runtime.py',
 ]
+
 embed_dim = 64
-patch_size = 2
-seq_len = 1024
+seq_len = 512
 
 # model settings
 model = dict(
     type='Classification',
     pretrained=None,
     backbone=dict(
         type='DNATransformer',
-        arch='deit-s',
+        arch={'embed_dims': embed_dim,
+              'num_layers': 12,
+              'num_heads': embed_dim // 16,
+              'feedforward_channels': embed_dim * 4},
         in_channels=4,
         seq_len=seq_len,
         norm_cfg=dict(type='LN', eps=1e-6),
@@ -32,37 +35,59 @@
         with_avg_pool=True, in_channels=embed_dim, out_channels=1),
 )
 
+# dataset settings
+data_root = 'data/dna/'
+data_source_cfg = dict(
+    type='DNASeqDataset',
+    file_list=None,  # use all splits
+    word_splitor="", data_splitor=",", mapping_name="ACGT",  # gRNA tokenize
+    data_type="regression", target_type='total',
+    filter_condition=5, max_seq_length=512,
+)
+
+data = dict(
+    samples_per_gpu=64,  # bs64 x 8gpu x 2 accu = bs1024
+    workers_per_gpu=4,
+    train=dict(
+        data_source=dict(
+            root=data_root+"train", **data_source_cfg)),
+    val=dict(
+        data_source=dict(
+            root=data_root+"test", **data_source_cfg)),
+)
+update_interval = 2
+
 # optimizer
 optimizer = dict(
     type='AdamW',
-    lr=3e-3,
-    weight_decay=5e-2, eps=1e-8, betas=(0.9, 0.999),
+    lr=5e-3,
+    weight_decay=1e-2, eps=1e-8, betas=(0.9, 0.999),
     paramwise_options={
         '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.),
         'norm': dict(weight_decay=0.),
         'bias': dict(weight_decay=0.),
         'pos_embed': dict(weight_decay=0.),
         'gamma': dict(weight_decay=0.),
-        # 'noise_sigma': dict(weight_decay=0., lr_mult=1e-1),
+        'noise_sigma': dict(weight_decay=0., lr_mult=1e-1),
     })
 
 # apex
 use_fp16 = False
-fp16 = dict(type='apex', loss_scale=dict(mode='dynamic'))
+fp16 = dict(type='mmcv', loss_scale='dynamic')
 optimizer_config = dict(
-    grad_clip=dict(max_norm=5.0), update_interval=1)
+    grad_clip=dict(max_norm=5.0), update_interval=update_interval)
 
 # learning policy
 lr_config = dict(
     policy='CosineAnnealing',
     by_epoch=False, min_lr=1e-5,
     warmup='linear',
-    warmup_iters=5, warmup_by_epoch=True,
+    warmup_iters=1, warmup_by_epoch=True,
     warmup_ratio=1e-5,
 )
 
 # checkpoint
-checkpoint_config = dict(interval=100, max_keep_ckpts=1)
+checkpoint_config = dict(interval=1, max_keep_ckpts=1)
 
 # runtime settings
-runner = dict(type='EpochBasedRunner', max_epochs=100)
+runner = dict(type='EpochBasedRunner', max_epochs=50)
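
A note on the batch-size arithmetic behind the samples_per_gpu comment (an illustrative sketch, not part of the commit): with mmcv-style optimizer hooks, update_interval=2 accumulates gradients for two iterations per optimizer step, so the effective batch works out as:

    # Effective batch size under gradient accumulation (values from this config;
    # the 8-GPU count is assumed from the "bs64 x 8gpu x 2 accu" comment).
    samples_per_gpu = 64
    num_gpus = 8
    update_interval = 2
    effective_batch = samples_per_gpu * num_gpus * update_interval
    print(effective_batch)  # 1024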

configs/regression/_base_/datasets/DNA/dna.py

Lines changed: 4 additions & 4 deletions

@@ -5,7 +5,7 @@
     file_list=None,  # use all splits
     word_splitor="", data_splitor=",", mapping_name="ACGT",  # gRNA tokenize
     data_type="regression", target_type='total',
-    filter_condition=0
+    filter_condition=5, max_seq_length=512
 )
 
 dataset_type = 'RegressionDataset'
@@ -41,8 +41,8 @@
 
 # validation hook
 evaluation = dict(
-    initial=False,
-    interval=5,
+    initial=True,
+    interval=1,
     samples_per_gpu=100,
     workers_per_gpu=2,
     eval_param=dict(
@@ -52,4 +52,4 @@
 )
 
 # checkpoint
-checkpoint_config = dict(interval=200, max_keep_ckpts=1)
+checkpoint_config = dict(interval=1, max_keep_ckpts=1)

openbioseq/datasets/data_sources/dna_seq_source.py

Lines changed: 5 additions & 2 deletions

@@ -7,7 +7,6 @@
 from ..utils import read_file
 
 
-
 @DATASOURCES.register_module
 class DNASeqDataset(object):
     """The implementation for loading any bio seqences.
@@ -95,7 +94,11 @@ def __init__(self,
             # data = [mapping[tok] for tok in l[self.col_names.index('seq')]] + [0] * padding
             data_list = list(map(mapping.get, l[self.col_names.index('seq')]))
             padding = self.max_seq_length - len(data_list)
-            data = data_list + [0] * padding
+            if padding < 0:
+                data = data_list[:self.max_seq_length]
+            else:
+                data = data_list + [0] * padding
+
             label = l[self.col_names.index(self.target_type)]
 
             if self.data_type == "classification":
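
Before this fix, an over-long sequence made padding negative, so [0] * padding was an empty list and the sequence passed through unpadded and longer than max_seq_length. A standalone sketch of the new pad-or-truncate behavior (hypothetical helper name, not the dataset API):

    def pad_or_truncate(tokens, max_len, pad_id=0):
        """Right-pad short sequences; truncate over-long ones."""
        if len(tokens) > max_len:
            return tokens[:max_len]
        return tokens + [pad_id] * (max_len - len(tokens))

    print(pad_or_truncate([1, 2, 3], 5))           # [1, 2, 3, 0, 0]
    print(pad_or_truncate([1, 2, 3, 4, 5, 6], 5))  # [1, 2, 3, 4, 5]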

openbioseq/models/backbones/seq_embed_transformer.py

Lines changed: 5 additions & 30 deletions

@@ -1,7 +1,5 @@
 import math
 from typing import Sequence
-from functools import reduce
-from operator import mul
 
 import numpy as np
 import torch
@@ -13,8 +11,7 @@
 from mmcv.utils.parrots_wrapper import _BatchNorm
 
 from openbioseq.utils import get_root_logger, print_log
-from ..utils import resize_pos_embed, PatchEmbed1d, ConvPatchEmbed1d, \
-    build_1d_sincos_position_embedding
+from ..utils import resize_pos_embed, build_1d_sincos_position_embedding
 from ..builder import BACKBONES
 from .base_backbone import BaseBackbone
 from .vision_transformer import TransformerEncoderLayer
@@ -203,26 +200,6 @@ def __init__(self,
             padding_idx=padding_index
         )
         self.embedding_layer = nn.Embedding(**_seq_cfg)
-        # _patch_cfg = dict(
-        #     in_channels=in_channels,
-        #     input_size=seq_len,
-        #     embed_dims=self.embed_dims,
-        #     conv_type='Conv1d',
-        #     kernel_size=patch_size,
-        #     stride=patch_size if patchfied else patch_size // 2,
-        # )
-        # if stem_layer <= 1:
-        #     _patch_cfg.update(patch_cfg)
-        #     self.patch_embed = PatchEmbed1d(**_patch_cfg)
-        # else:
-        #     _patch_cfg.update(dict(
-        #         num_layers=stem_layer,
-        #         act_cfg=act_cfg,
-        #     ))
-        #     _patch_cfg.update(patch_cfg)
-        #     self.patch_embed = ConvPatchEmbed1d(**_patch_cfg)
-        # self.patch_resolution = self.patch_embed.init_out_size
-        # self.num_patches = self.patch_embed.init_out_size
 
         # Set cls token
         if output_cls_token:
@@ -328,11 +305,7 @@ def init_weights(self, pretrained=None):
                     cls_token=True)
                 self.pos_embed.data.copy_(pos_emb)
                 self.pos_embed.requires_grad = False
-            # xavier_uniform initialization for PatchEmbed1d
-            # if isinstance(self.patch_embed, PatchEmbed1d):
-            #     val = math.sqrt(
-            #         6. / float(3 * reduce(mul, self.patch_size, 1) + self.embed_dims))
-            #     uniform_init(self.patch_embed.projection, -val, val, bias=0)
+
         # initialization for linear layers
         for name, m in self.named_modules():
             if isinstance(m, nn.Linear):
@@ -373,8 +346,10 @@ def resize_pos_embed(*args, **kwargs):
 
     def forward(self, x):
         B = x.shape[0]
+        if x.dtype != torch.long:  # must be indice
+            x = x.type(torch.long).clamp(0, x.size(1)-1)
         x = self.embedding_layer(x)
-
+
         if self.cls_token is not None:
             cls_tokens = self.cls_token.expand(B, -1, -1)
             x = torch.cat((cls_tokens, x), dim=1)
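
The new guard in forward() casts the input to integer indices before the embedding lookup, since nn.Embedding only accepts long tensors. A standalone illustration (note the sketch clamps to the vocabulary size, while the committed code clamps to x.size(1)-1, the sequence length):

    import torch
    import torch.nn as nn

    embed = nn.Embedding(num_embeddings=5, embedding_dim=8, padding_idx=0)

    x = torch.tensor([[1.0, 4.0, 0.0]])  # float tokens, e.g. from a generic loader
    if x.dtype != torch.long:            # embedding lookup requires integer indices
        x = x.long().clamp(0, embed.num_embeddings - 1)
    print(embed(x).shape)                # torch.Size([1, 3, 8])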

tools/analysis_tools/get_flops.py

Lines changed: 10 additions & 3 deletions

@@ -1,4 +1,9 @@
-# Copyright (c) OpenMMLab. All rights reserved.
+"""
+An example to count Params and FLOPs.
+
+Example command:
+   python tools/get_flops.py [PATH_TO_config] --channel 4 --shape 512
+"""
 import argparse
 
 from mmcv import Config
@@ -13,13 +18,13 @@ def parse_args():
     parser.add_argument(
         '--channel',
         type=int,
-        default=3,
+        default=4,
         help='input data channel')
     parser.add_argument(
         '--shape',
         type=int,
         nargs='+',
-        default=[224, 224],
+        default=[512],
         help='input data size')
     args = parser.parse_args()
     return args
@@ -36,6 +41,8 @@ def main():
         input_shape = (in_channel, ) + tuple(args.shape)
     else:
         raise ValueError('invalid input shape')
+    if args.channel == 0:  # using nn.Embedding in the model
+        input_shape = input_shape[1:]
 
     cfg = Config.fromfile(args.config)
     model = build_model(cfg.model)
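
An illustration of the new shape handling (a sketch, not part of the commit): with --channel 0 the dummy input drops its channel dimension, matching models that consume token indices through nn.Embedding:

    # Mirror of the branch added above, for both channel settings.
    for in_channel, shape in [(4, [512]), (0, [512])]:
        input_shape = (in_channel, ) + tuple(shape)
        if in_channel == 0:  # using nn.Embedding in the model
            input_shape = input_shape[1:]
        print(in_channel, '->', input_shape)  # 4 -> (4, 512); 0 -> (512,)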
