@@ -78,6 +78,7 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N
78
78
79
79
#define BCF_IS_64BIT (1<<30)
80
80
81
+ static int vcf_parse_format_partial (bcf1_t * v );
81
82
82
83
static char * find_chrom_header_line (char * s )
83
84
{
@@ -1439,6 +1440,11 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1439
1440
static inline uint8_t * bcf_unpack_fmt_core1 (uint8_t * ptr , int n_sample , bcf_fmt_t * fmt );
1440
1441
int bcf_subset_format (const bcf_hdr_t * hdr , bcf1_t * rec )
1441
1442
{
1443
+ if (rec -> unpacked & VCF_UN_FMT ) {
1444
+ if (vcf_parse_format_partial (rec ) < 0 )
1445
+ return -1 ;
1446
+ }
1447
+
1442
1448
if ( !hdr -> keep_samples ) return 0 ;
1443
1449
if ( !bcf_hdr_nsamples (hdr ) )
1444
1450
{
@@ -1734,16 +1740,23 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
1734
1740
if ( h -> dirty ) {
1735
1741
if (bcf_hdr_sync (h ) < 0 ) return -1 ;
1736
1742
}
1743
+
1744
+ if ( hfp -> format .format == vcf || hfp -> format .format == text_format )
1745
+ return vcf_write (hfp ,h ,v );
1746
+
1747
+ if (v -> unpacked & VCF_UN_FMT ) {
1748
+ // slow, but round trip test
1749
+ if (vcf_parse_format_partial (v ) < 0 )
1750
+ return -1 ;
1751
+ }
1752
+
1737
1753
if ( bcf_hdr_nsamples (h )!= v -> n_sample )
1738
1754
{
1739
1755
hts_log_error ("Broken VCF record, the number of columns at %s:%" PRIhts_pos " does not match the number of samples (%d vs %d)" ,
1740
1756
bcf_seqname_safe (h ,v ), v -> pos + 1 , v -> n_sample , bcf_hdr_nsamples (h ));
1741
1757
return -1 ;
1742
1758
}
1743
1759
1744
- if ( hfp -> format .format == vcf || hfp -> format .format == text_format )
1745
- return vcf_write (hfp ,h ,v );
1746
-
1747
1760
if ( v -> errcode )
1748
1761
{
1749
1762
// vcf_parse1() encountered a new contig or tag, undeclared in the
@@ -2569,6 +2582,45 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
2569
2582
return 0 ;
2570
2583
}
2571
2584
2585
+ // The indiv.s kstring is the textual VCF representation for FORMAT
2586
+ // column and all the subsequent SAMPLE columns.
2587
+ //
2588
+ // We also need the header, and this cannot be passed in as bcf_unpack calls
2589
+ // this and it doesn't have the header available.
2590
+ // So we also cache a pointer to the header in the first bytes of the
2591
+ // kstring too.
2592
+ //
2593
+ // This packing ensures the data is still in kstring format and amenable
2594
+ // to freeing / reuse. I.e.:
2595
+ //
2596
+ // s.s p q s.l s.m
2597
+ // | | | | |
2598
+ // <HDR_PTR><FORMAT><SAMPLE\tSAMPLE>.........
2599
+ //
2600
+ // Returns 0 on success,
2601
+ // <0 on failure.
2602
+ static int vcf_parse_format_partial (bcf1_t * v ) {
2603
+ if (!(v -> unpacked & VCF_UN_FMT ))
2604
+ return 0 ;
2605
+ kstring_t s = v -> indiv ;
2606
+ const bcf_hdr_t * h = * (const bcf_hdr_t * * )s .s ;
2607
+ char * p = s .s + sizeof (const bcf_hdr_t * );
2608
+ char * q = p + strlen (p );
2609
+
2610
+ v -> indiv .s = NULL ;
2611
+ v -> indiv .l = v -> indiv .m = 0 ;
2612
+
2613
+ int ret ;
2614
+ if ((ret = vcf_parse_format (& s , h , v , p , q ) == 0 )) {
2615
+ v -> unpacked &= ~VCF_UN_FMT ;
2616
+ free (s .s );
2617
+ return ret ;
2618
+ } else {
2619
+ v -> indiv = s ;
2620
+ return ret ;
2621
+ }
2622
+ }
2623
+
2572
2624
static khint_t fix_chromosome (const bcf_hdr_t * h , vdict_t * d , const char * p ) {
2573
2625
// Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
2574
2626
// been already printed, but will enable tools like vcfcheck to proceed.
@@ -2890,7 +2942,29 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
2890
2942
}
2891
2943
if ( v -> max_unpack && !(v -> max_unpack >>3 ) ) goto end ;
2892
2944
} else if (i == 8 ) {// FORMAT
2893
- return vcf_parse_format (s , h , v , p , q ) == 0 ? 0 : -2 ;
2945
+ // Consider complete copy of s, obtained via ks_release,
2946
+ // and cache of p/q pointers. This is then a generalised
2947
+ // parse delay that works for any max_unpack value.
2948
+
2949
+ kstring_t * iv = & v -> indiv ;
2950
+ ks_clear (iv );
2951
+ if (kputsn ((char * )& h , sizeof (& h ), iv ) < 0 ||
2952
+ kputsn (p , s -> s + s -> l - p , iv ) < 0 )
2953
+ goto err ;
2954
+
2955
+ v -> unpacked |= VCF_UN_FMT ;
2956
+
2957
+ // We can't accurately judge n_sample and some things may check
2958
+ // this without doing an explicit bcf_unpack call, but we
2959
+ // set it to bcf_hdr_nsamples(h) as a starting point.
2960
+ // (vcf_parse_format validates this and it's an error if it
2961
+ // mismatches, so this is initial value is incorrect if the
2962
+ // data itself is incorrect, and we'll spot that on an explict
2963
+ // bcf_unpack call.)
2964
+ v -> n_sample = bcf_hdr_nsamples (h );
2965
+
2966
+ return 0 ;
2967
+ //return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
2894
2968
}
2895
2969
}
2896
2970
@@ -3002,6 +3076,10 @@ int bcf_unpack(bcf1_t *b, int which)
3002
3076
ptr = bcf_unpack_info_core1 (ptr , & d -> info [i ]);
3003
3077
b -> unpacked |= BCF_UN_INFO ;
3004
3078
}
3079
+ if ((which & BCF_UN_FMT ) && (b -> unpacked & VCF_UN_FMT )) {
3080
+ if (vcf_parse_format_partial (b ) < 0 )
3081
+ return -1 ;
3082
+ }
3005
3083
if ((which & BCF_UN_FMT ) && b -> n_sample && !(b -> unpacked & BCF_UN_FMT )) { // FORMAT
3006
3084
ptr = (uint8_t * )b -> indiv .s ;
3007
3085
hts_expand (bcf_fmt_t , b -> n_fmt , d -> m_fmt , d -> fmt );
@@ -3016,7 +3094,8 @@ int bcf_unpack(bcf1_t *b, int which)
3016
3094
int vcf_format (const bcf_hdr_t * h , const bcf1_t * v , kstring_t * s )
3017
3095
{
3018
3096
int i ;
3019
- bcf_unpack ((bcf1_t * )v , BCF_UN_ALL );
3097
+ bcf_unpack ((bcf1_t * )v , (v -> unpacked & VCF_UN_FMT )
3098
+ ? BCF_UN_SHR : BCF_UN_ALL );
3020
3099
kputs (h -> id [BCF_DT_CTG ][v -> rid ].key , s ); // CHROM
3021
3100
kputc ('\t' , s ); kputll (v -> pos + 1 , s ); // POS
3022
3101
kputc ('\t' , s ); kputs (v -> d .id ? v -> d .id : "." , s ); // ID
@@ -3075,45 +3154,54 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
3075
3154
if ( first ) kputc ('.' , s );
3076
3155
} else kputc ('.' , s );
3077
3156
// FORMAT and individual information
3078
- if (v -> n_sample )
3079
- {
3080
- int i ,j ;
3081
- if ( v -> n_fmt )
3157
+ if (v -> unpacked & VCF_UN_FMT ) {
3158
+ size_t l = strlen (v -> indiv .s + sizeof (bcf_hdr_t * ));
3159
+ kputc ('\t' , s );
3160
+ kputsn (v -> indiv .s + sizeof (bcf_hdr_t * ), l , s );
3161
+ kputc ('\t' , s );
3162
+ kputsn (v -> indiv .s + sizeof (bcf_hdr_t * ) + l + 1 ,
3163
+ v -> indiv .l - (sizeof (bcf_hdr_t * ) + l + 1 ), s );
3164
+ } else {
3165
+ if (v -> n_sample )
3082
3166
{
3083
- int gt_i = -1 ;
3084
- bcf_fmt_t * fmt = v -> d .fmt ;
3085
- int first = 1 ;
3086
- for (i = 0 ; i < (int )v -> n_fmt ; ++ i ) {
3087
- if ( !fmt [i ].p ) continue ;
3088
- kputc (!first ? ':' : '\t' , s ); first = 0 ;
3089
- if ( fmt [i ].id < 0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
3090
- {
3091
- hts_log_error ("Invalid BCF, the FORMAT tag id=%d at %s:%" PRIhts_pos " not present in the header" , fmt [i ].id , bcf_seqname_safe (h ,(bcf1_t * )v ), v -> pos + 1 );
3092
- abort ();
3093
- }
3094
- kputs (h -> id [BCF_DT_ID ][fmt [i ].id ].key , s );
3095
- if (strcmp (h -> id [BCF_DT_ID ][fmt [i ].id ].key , "GT" ) == 0 ) gt_i = i ;
3096
- }
3097
- if ( first ) kputs ("\t." , s );
3098
- for (j = 0 ; j < v -> n_sample ; ++ j ) {
3099
- kputc ('\t' , s );
3100
- first = 1 ;
3167
+ int i ,j ;
3168
+ if ( v -> n_fmt )
3169
+ {
3170
+ int gt_i = -1 ;
3171
+ bcf_fmt_t * fmt = v -> d .fmt ;
3172
+ int first = 1 ;
3101
3173
for (i = 0 ; i < (int )v -> n_fmt ; ++ i ) {
3102
- bcf_fmt_t * f = & fmt [i ];
3103
- if ( !f -> p ) continue ;
3104
- if (!first ) kputc (':' , s );
3105
- first = 0 ;
3106
- if (gt_i == i )
3107
- bcf_format_gt (f ,j ,s );
3108
- else
3109
- bcf_fmt_array (s , f -> n , f -> type , f -> p + j * (size_t )f -> size );
3174
+ if ( !fmt [i ].p ) continue ;
3175
+ kputc (!first ? ':' : '\t' , s ); first = 0 ;
3176
+ if ( fmt [i ].id < 0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
3177
+ {
3178
+ hts_log_error ("Invalid BCF, the FORMAT tag id=%d at %s:%" PRIhts_pos " not present in the header" , fmt [i ].id , bcf_seqname_safe (h ,(bcf1_t * )v ), v -> pos + 1 );
3179
+ abort ();
3180
+ }
3181
+ kputs (h -> id [BCF_DT_ID ][fmt [i ].id ].key , s );
3182
+ if (strcmp (h -> id [BCF_DT_ID ][fmt [i ].id ].key , "GT" ) == 0 ) gt_i = i ;
3183
+ }
3184
+ if ( first ) kputs ("\t." , s );
3185
+ for (j = 0 ; j < v -> n_sample ; ++ j ) {
3186
+ kputc ('\t' , s );
3187
+ first = 1 ;
3188
+ for (i = 0 ; i < (int )v -> n_fmt ; ++ i ) {
3189
+ bcf_fmt_t * f = & fmt [i ];
3190
+ if ( !f -> p ) continue ;
3191
+ if (!first ) kputc (':' , s );
3192
+ first = 0 ;
3193
+ if (gt_i == i )
3194
+ bcf_format_gt (f ,j ,s );
3195
+ else
3196
+ bcf_fmt_array (s , f -> n , f -> type , f -> p + j * (size_t )f -> size );
3197
+ }
3198
+ if ( first ) kputc ('.' , s );
3110
3199
}
3111
- if ( first ) kputc ('.' , s );
3112
3200
}
3201
+ else
3202
+ for (j = 0 ; j <=v -> n_sample ; j ++ )
3203
+ kputs ("\t." , s );
3113
3204
}
3114
- else
3115
- for (j = 0 ; j <=v -> n_sample ; j ++ )
3116
- kputs ("\t." , s );
3117
3205
}
3118
3206
kputc ('\n' , s );
3119
3207
return 0 ;
@@ -3824,6 +3912,11 @@ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
3824
3912
3825
3913
int bcf_subset (const bcf_hdr_t * h , bcf1_t * v , int n , int * imap )
3826
3914
{
3915
+ if (v -> unpacked & VCF_UN_FMT ) {
3916
+ if (vcf_parse_format_partial (v ) < 0 )
3917
+ return -1 ;
3918
+ }
3919
+
3827
3920
kstring_t ind ;
3828
3921
ind .s = 0 ; ind .l = ind .m = 0 ;
3829
3922
if (n ) {
0 commit comments