Skip to content

Commit f678533

Browse files
committed
add demo
1 parent 99413d0 commit f678533

File tree

6 files changed

+166
-13
lines changed

6 files changed

+166
-13
lines changed

demo/grna_demo.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
1,TCTGTAAAGTGGCAAGCAGGAGTCTGCTACAATGGAGGAAAGGATTTTGCTGTATCTCTTGCC
2+
2,CAGGAGGGAAACATGGTTACTGCTCGCCAGGAACCTCGCCTGGTCCTGATTTCCCTGACCTGC

demo/grna_demo.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
"""
2+
An example of prediction of gRNA editing efficiency.
3+
4+
Example command (a sequence of 63 nucleotides):
5+
python grna_demo.py TTGCTGTATCTCTTGCCAGGCCCAAGGCTGCAGAGGGAATTGGTAATATACTTCATTTAATAA
6+
7+
Output results:
8+
0.20432067
9+
"""
10+
11+
import argparse
12+
import torch
13+
from mmcv.runner import load_checkpoint
14+
15+
from openbioseq.datasets.data_sources.bio_seq_source import binarize
16+
from openbioseq.models import build_model
17+
from openbioseq.datasets.utils import read_file
18+
19+
20+
def parse_args(argv=None):
    """Parse command-line options for the gRNA prediction demo.

    Args:
        argv (list[str] | None): Argument list to parse. Defaults to
            ``sys.argv[1:]`` when None, so existing callers are unaffected.

    Returns:
        argparse.Namespace: Parsed options ``input_seq``, ``input_file``
            and ``debug``.
    """
    parser = argparse.ArgumentParser(
        description='Process an input gRNA sequence to predict')
    parser.add_argument('--input_seq', type=str, default=None,
                        help='input sequence')
    parser.add_argument('--input_file', type=str, default=None,
                        help='path to an input file containing several sequences')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='debug mode of demo')
    args = parser.parse_args(argv)
    return args
30+
31+
32+
def get_model_config(seq_len=63, embed_dim=64, patch_size=2):
    """Build the Transformer regression config and its checkpoint URL.

    Args:
        seq_len (int): Length of the input sequence in tokens.
        embed_dim (int): Embedding dimension of the Transformer.
        patch_size (int): Number of tokens merged into one patch.

    Returns:
        tuple: ``(model_cfg, checkpoint_url)`` where ``model_cfg`` is a
            dict consumed by ``build_model`` and ``checkpoint_url`` points
            to the released pretrained weights.
    """
    checkpoint = "https://github.com/Westlake-AI/OpenBioSeq/releases/download/v0.1.1/k562_layer4_p2_h4_d64_init_bs256_ep100.pth"

    # ceil(seq_len / patch_size): sequences that do not divide evenly get
    # one extra (padded) patch.
    num_patches = -(-seq_len // patch_size)

    backbone_cfg = dict(
        type='SequenceTransformer',
        arch=dict(
            embed_dims=embed_dim,
            num_layers=4,
            num_heads=4,
            feedforward_channels=embed_dim * 4),
        in_channels=4,
        patch_size=patch_size,
        seq_len=num_patches,
        norm_cfg=dict(type='LN', eps=1e-6),
        drop_rate=0.1,
        drop_path_rate=0.1,
        init_values=0.1,
        final_norm=True,
        out_indices=-1,  # take features from the last layer
        with_cls_token=False,
        output_cls_token=False)

    head_cfg = dict(
        type='RegHead',
        loss=dict(type='RegressionLoss', mode='huber_loss',
                  loss_weight=1.0, reduction='mean',
                  activate='sigmoid', alpha=0.2, gamma=1.0, beta=1.0,
                  residual=False),
        with_avg_pool=True, in_channels=embed_dim, out_channels=1)

    model = dict(
        type='Classification',
        pretrained=None,
        backbone=backbone_cfg,
        head=head_cfg)

    return model, checkpoint
66+
67+
68+
def main():
    """Predict gRNA editing efficiency for sequences given on the CLI.

    Reads sequences from ``--input_seq``, ``--input_file`` or a built-in
    debug sample, one-hot encodes them, loads the released checkpoint and
    prints the model prediction.

    Raises:
        ValueError: If no input source is given, or a sequence fails to
            binarize (e.g. contains characters outside A/C/G/T).
    """
    args = parse_args()
    if args.debug:
        input_seq = ["TTGCTGTATCTCTTGCCAGGCCCAAGGCTGCAGAGGGAATTGGTAATATACTTCATTTAATAA"]
    elif args.input_seq is not None:
        input_seq = [args.input_seq]
    elif args.input_file is not None:
        # strip trailing newlines left by the file reader
        input_seq = [line.replace('\n', '') for line in read_file(args.input_file)]
    else:
        print(args)
        # Original used `assert False and "msg"`, which raises a bare
        # AssertionError (the message is dead code) and vanishes under -O.
        raise ValueError("Invalid input args: pass --input_seq or --input_file")

    # one-hot encode the input: 4 channels (A/C/G/T), fixed length of 63
    seq_len, key_num = 63, 4
    key_mapping = dict(A=0, C=1, G=2, T=3)
    try:
        input_seq = binarize(
            input_seq, mapping=key_mapping, max_seq_length=seq_len, data_splitor=',')
    except ValueError as err:
        raise ValueError("Please check the input sequence") from err

    # build the model and load checkpoint
    cfg_model, checkpoint = get_model_config(seq_len=seq_len)
    model = build_model(cfg_model)
    load_checkpoint(model, checkpoint, map_location='cpu')

    # inference: batch the encoded sequences as (batch, channels, length)
    if len(input_seq) == 1:
        input_seq = input_seq[0].unsqueeze(0)
    else:
        input_seq = torch.concat(input_seq).view(-1, key_num, seq_len)

    output = model(input_seq, mode='inference').detach().cpu().numpy()
    print("Prediction:", output)
105+
106+
107+
# Entry point when executed as a script.
if __name__ == '__main__':
    main()

openbioseq/datasets/data_sources/bio_seq_source.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def binarize(data_list, mapping, max_seq_length=None, data_splitor=None):
2626
token_list.append(onehot_seq)
2727
except:
2828
print(f"Error seq:", _seq)
29+
raise ValueError
2930
return token_list
3031

3132

openbioseq/models/classifiers/base_model.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,22 @@ def forward_test(self, data, **kwargs):
7474
"""
7575
pass
7676

77+
def forward_inference(self, data, **kwargs):
    """Run a plain forward pass and return the final prediction.

    Args:
        data (Tensor): Input batch. Typically these should be mean
            centered and std scaled.
        kwargs (keyword arguments): Specific to concrete implementation.

    Returns:
        Tensor: The first head output, i.e. the final model prediction.
    """
    features = self.backbone(data)
    if self.with_neck:
        features = self.neck(features)
    outputs = self.head(features)
    return outputs[0]
92+
7793
def forward_vis(self, data, **kwargs):
7894
"""Forward backbone features for visualization.
7995
@@ -122,6 +138,8 @@ def forward(self, data, mode='train', **kwargs):
122138
return self.forward_train(data, **kwargs)
123139
elif mode == 'test':
124140
return self.forward_test(data, **kwargs)
141+
elif mode == 'inference':
142+
return self.forward_inference(data, **kwargs)
125143
elif mode == 'calibration':
126144
return self.forward_calibration(data, **kwargs)
127145
elif mode == 'extract':

tools/model_converters/publish_model.py

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,51 @@
1+
"""
2+
Extract parameters and publish the model.
3+
4+
Example command:
5+
python tools/model_converters/publish_model.py [PATH/to/checkpoint] [PATH/to/output]
6+
"""
17
import argparse
28
import subprocess
39

10+
import torch
11+
412

513
def parse_args(argv=None):
    """Parse command-line options for checkpoint publishing.

    Args:
        argv (list[str] | None): Argument list to parse. Defaults to
            ``sys.argv[1:]`` when None, so existing callers are unaffected.

    Returns:
        argparse.Namespace: Parsed options ``in_file``, ``out_file``
            and ``decode``.
    """
    parser = argparse.ArgumentParser(
        description='Process a checkpoint to be published')
    parser.add_argument('in_file', help='input checkpoint filename')
    parser.add_argument('out_file', help='output checkpoint filename')
    parser.add_argument('--decode', action='store_true', default=False,
                        help='whether to add sha256sum in the output name')
    args = parser.parse_args(argv)
    return args
1122

1223

13-
def process_checkpoint(in_file):
14-
tmp_file = in_file + ".tmp"
15-
subprocess.Popen(['cp', in_file, tmp_file])
16-
sha = subprocess.check_output(['sha256sum', tmp_file]).decode()
17-
out_file = in_file
18-
if out_file.endswith('.pth'):
19-
out_file = out_file[:-4]
20-
final_file = out_file + f'-{sha[:8]}.pth'
21-
assert final_file != in_file, \
22-
"The output filename is the same as the input file."
23-
print("Output file: {}".format(final_file))
24-
subprocess.Popen(['mv', tmp_file, final_file])
24+
def process_checkpoint(in_file, out_file, decode=False):
    """Strip training-only state from a checkpoint and save a publish copy.

    Args:
        in_file (str): Path to the input checkpoint.
        out_file (str): Path to write the published checkpoint.
        decode (bool): If True, rename the output to append the first 8
            characters of its sha256sum (requires the ``sha256sum`` and
            ``mv`` shell utilities).
    """
    checkpoint = torch.load(in_file, map_location='cpu')
    # remove optimizer for smaller file size
    if 'optimizer' in checkpoint:
        del checkpoint['optimizer']
    # if it is necessary to remove some sensitive data in checkpoint['meta'],
    # add the code here.

    # Compare versions numerically: the original string comparison
    # (torch.__version__ >= '1.6') is lexicographic, so '1.10' < '1.6'
    # and the branch was wrong on torch >= 1.10.
    version = tuple(int(p) for p in torch.__version__.split('+')[0].split('.')[:2])
    if version >= (1, 6):
        # keep the legacy (non-zipfile) format for wider compatibility
        torch.save(checkpoint, out_file, _use_new_zipfile_serialization=False)
    else:
        torch.save(checkpoint, out_file)

    if decode:
        sha = subprocess.check_output(['sha256sum', out_file]).decode()
        if out_file.endswith('.pth'):
            out_file_name = out_file[:-4]
        else:
            out_file_name = out_file
        final_file = out_file_name + f'-{sha[:8]}.pth'
        # run synchronously: Popen returned before the rename finished,
        # so callers could observe a missing/renamed file mid-flight
        subprocess.run(['mv', out_file, final_file], check=True)
2544

2645

2746
def main():
    """CLI entry point: parse options and publish the checkpoint."""
    options = parse_args()
    process_checkpoint(options.in_file, options.out_file, options.decode)
3049

3150

3251
if __name__ == '__main__':

tools/train.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ def parse_args():
2828
help='the dir to save logs and models')
2929
parser.add_argument(
3030
'--resume_from', help='the checkpoint file to resume from')
31+
parser.add_argument(
32+
'--auto_resume',
33+
action='store_true',
34+
help='resume from the latest checkpoint automatically')
3135
parser.add_argument(
3236
'--pretrained', default=None, help='pretrained model file')
3337
parser.add_argument(
@@ -89,6 +93,7 @@ def main():
8993
osp.splitext(osp.basename(args.config))[0])
9094
if args.resume_from is not None:
9195
cfg.resume_from = args.resume_from
96+
cfg.auto_resume = args.auto_resume
9297
cfg.gpus = args.gpus
9398

9499
# check memcached package exists

0 commit comments

Comments
 (0)