Skip to content

Commit a482b91

Browse files
committed
Speed up of VCF parsing / writing with FORMAT fields.
If we're reading and writing VCF without intervening change to BCF or use of bcf_unpack with BCF_UN_FMT (so no querying and/or modification of the per-sample FORMAT columns), then we keep the data in string form instead of binary bcf form. The bcf binary form is held in kstring b->indiv. We also hold the string format there too, but use b->unpacked & VCF_UN_FMT as a bit-flag to indicate whether this is encoding as VCF or BCF. The benefit is reading is considerably faster for many-sample files (~4 fold for a 2.5k sample 1000 genomes test) and similarly (~3 fold) for a read/write loop to VCF as we don't have to re-encode the data from uBCF to VCF either. If we're converting to BCF, or doing a query, we still need the call to vcf_parse_format as before, which reformats b->indiv from VCF to BCF internally. This is done on the fly. There is some nastiness to how this works sadly as bcf_unpack just takes the bcf record and has no access to the header, but this is needed for parsing. Therefore we stash a copy of it in the kstring along with the text itself, but this assumes that no one is doing header oddities such as reading the file, freeing the header, and then attempting to unpack the contents. It feels like a safe assumption, although it's not something that has ever been explicitly spelt out before. Note this causes a few bcftools test failures due to fixups that happen when doing a full conversion to BCF. A similar case happens here in test/noroundtrip.vcf, which has an underspecified FORMAT of "GT:GT 0/1". The test checks it comes out as "GT:GT 2,4:.". This won't happen with pure VCF as we're not parsing that data, but the test fix is to force conversion to BCF and back again to test that code path. A similar fix would need to be made to the failing bcftools tests.
1 parent 9de45b7 commit a482b91

File tree

3 files changed

+136
-41
lines changed

3 files changed

+136
-41
lines changed

htslib/vcf.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,8 @@ typedef struct {
216216
float qual; // QUAL
217217
uint32_t n_info:16, n_allele:16;
218218
uint32_t n_fmt:8, n_sample:24;
219-
kstring_t shared, indiv;
219+
kstring_t shared;
220+
kstring_t indiv; // Per sample data. unpacked & VCF_UN_FMT => VCF / BCF
220221
bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
221222
int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
222223
int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
@@ -389,6 +390,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write().
389390
#define BCF_UN_FMT 8 // unpack format and each sample
390391
#define BCF_UN_IND BCF_UN_FMT // a synonymo of BCF_UN_FMT
391392
#define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything
393+
#define VCF_UN_FMT 16 // indiv.s is VCF string
392394
HTSLIB_EXPORT
393395
int bcf_unpack(bcf1_t *b, int which);
394396

test/test.pl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -822,7 +822,7 @@ sub test_vcf_various
822822
test_cmd($opts, %args, out => "formatcols.vcf",
823823
cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatcols.vcf");
824824
test_cmd($opts, %args, out => "noroundtrip-out.vcf",
825-
cmd => "$$opts{bin}/htsfile -c $$opts{path}/noroundtrip.vcf");
825+
cmd => "$$opts{path}/test_view -b $$opts{path}/noroundtrip.vcf | $$opts{bin}/htsfile -c -");
826826
test_cmd($opts, %args, out => "formatmissing-out.vcf",
827827
cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf");
828828
}

vcf.c

Lines changed: 132 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N
7878

7979
#define BCF_IS_64BIT (1<<30)
8080

81+
static int vcf_parse_format_partial(bcf1_t *v);
8182

8283
static char *find_chrom_header_line(char *s)
8384
{
@@ -1439,6 +1440,11 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
14391440
static inline uint8_t *bcf_unpack_fmt_core1(uint8_t *ptr, int n_sample, bcf_fmt_t *fmt);
14401441
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
14411442
{
1443+
if (rec->unpacked & VCF_UN_FMT) {
1444+
if (vcf_parse_format_partial(rec) < 0)
1445+
return -1;
1446+
}
1447+
14421448
if ( !hdr->keep_samples ) return 0;
14431449
if ( !bcf_hdr_nsamples(hdr) )
14441450
{
@@ -1734,16 +1740,23 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
17341740
if ( h->dirty ) {
17351741
if (bcf_hdr_sync(h) < 0) return -1;
17361742
}
1743+
1744+
if ( hfp->format.format == vcf || hfp->format.format == text_format )
1745+
return vcf_write(hfp,h,v);
1746+
1747+
if (v->unpacked & VCF_UN_FMT) {
1748+
// slow, but round trip test
1749+
if (vcf_parse_format_partial(v) < 0)
1750+
return -1;
1751+
}
1752+
17371753
if ( bcf_hdr_nsamples(h)!=v->n_sample )
17381754
{
17391755
hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
17401756
bcf_seqname_safe(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h));
17411757
return -1;
17421758
}
17431759

1744-
if ( hfp->format.format == vcf || hfp->format.format == text_format )
1745-
return vcf_write(hfp,h,v);
1746-
17471760
if ( v->errcode )
17481761
{
17491762
// vcf_parse1() encountered a new contig or tag, undeclared in the
@@ -2569,6 +2582,45 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
25692582
return 0;
25702583
}
25712584

2585+
// The indiv.s kstring is the textual VCF representation for FORMAT
2586+
// column and all the subsequent SAMPLE columns.
2587+
//
2588+
// We also need the header, and this cannot be passed in as bcf_unpack calls
2589+
// this and it doesn't have the header available.
2590+
// So we also cache a pointer to the header in the first bytes of the
2591+
// kstring too.
2592+
//
2593+
// This packing ensures the data is still in kstring format and amenable
2594+
// to freeing / reuse. I.e.:
2595+
//
2596+
// s.s p q s.l s.m
2597+
// | | | | |
2598+
// <HDR_PTR><FORMAT><SAMPLE\tSAMPLE>.........
2599+
//
2600+
// Returns 0 on success,
2601+
// <0 on failure.
2602+
static int vcf_parse_format_partial(bcf1_t *v) {
2603+
if (!(v->unpacked & VCF_UN_FMT))
2604+
return 0;
2605+
kstring_t s = v->indiv;
2606+
const bcf_hdr_t *h = *(const bcf_hdr_t **)s.s;
2607+
char *p = s.s + sizeof(const bcf_hdr_t *);
2608+
char *q = p + strlen(p);
2609+
2610+
v->indiv.s = NULL;
2611+
v->indiv.l = v->indiv.m = 0;
2612+
2613+
int ret;
2614+
if ((ret = vcf_parse_format(&s, h, v, p, q) == 0)) {
2615+
v->unpacked &= ~VCF_UN_FMT;
2616+
free(s.s);
2617+
return ret;
2618+
} else {
2619+
v->indiv = s;
2620+
return ret;
2621+
}
2622+
}
2623+
25722624
static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
25732625
// Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
25742626
// been already printed, but will enable tools like vcfcheck to proceed.
@@ -2890,7 +2942,29 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
28902942
}
28912943
if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
28922944
} else if (i == 8) {// FORMAT
2893-
return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
2945+
// Consider complete copy of s, obtained via ks_release,
2946+
// and cache of p/q pointers. This is then a generalised
2947+
// parse delay that works for any max_unpack value.
2948+
2949+
kstring_t *iv = &v->indiv;
2950+
ks_clear(iv);
2951+
if (kputsn((char *)&h, sizeof(&h), iv) < 0 ||
2952+
kputsn(p, s->s + s->l - p, iv) < 0)
2953+
goto err;
2954+
2955+
v->unpacked |= VCF_UN_FMT;
2956+
2957+
// We can't accurately judge n_sample and some things may check
2958+
// this without doing an explicit bcf_unpack call, but we
2959+
// set it to bcf_hdr_nsamples(h) as a starting point.
2960+
// (vcf_parse_format validates this and it's an error if it
2961+
// mismatches, so this is initial value is incorrect if the
2962+
// data itself is incorrect, and we'll spot that on an explict
2963+
// bcf_unpack call.)
2964+
v->n_sample = bcf_hdr_nsamples(h);
2965+
2966+
return 0;
2967+
//return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
28942968
}
28952969
}
28962970

@@ -3002,6 +3076,10 @@ int bcf_unpack(bcf1_t *b, int which)
30023076
ptr = bcf_unpack_info_core1(ptr, &d->info[i]);
30033077
b->unpacked |= BCF_UN_INFO;
30043078
}
3079+
if ((which & BCF_UN_FMT) && (b->unpacked & VCF_UN_FMT)) {
3080+
if (vcf_parse_format_partial(b) < 0)
3081+
return -1;
3082+
}
30053083
if ((which&BCF_UN_FMT) && b->n_sample && !(b->unpacked&BCF_UN_FMT)) { // FORMAT
30063084
ptr = (uint8_t*)b->indiv.s;
30073085
hts_expand(bcf_fmt_t, b->n_fmt, d->m_fmt, d->fmt);
@@ -3016,7 +3094,8 @@ int bcf_unpack(bcf1_t *b, int which)
30163094
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
30173095
{
30183096
int i;
3019-
bcf_unpack((bcf1_t*)v, BCF_UN_ALL);
3097+
bcf_unpack((bcf1_t*)v, (v->unpacked & VCF_UN_FMT)
3098+
? BCF_UN_SHR : BCF_UN_ALL);
30203099
kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM
30213100
kputc('\t', s); kputll(v->pos + 1, s); // POS
30223101
kputc('\t', s); kputs(v->d.id ? v->d.id : ".", s); // ID
@@ -3075,45 +3154,54 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
30753154
if ( first ) kputc('.', s);
30763155
} else kputc('.', s);
30773156
// FORMAT and individual information
3078-
if (v->n_sample)
3079-
{
3080-
int i,j;
3081-
if ( v->n_fmt)
3157+
if (v->unpacked & VCF_UN_FMT) {
3158+
size_t l = strlen(v->indiv.s + sizeof(bcf_hdr_t *));
3159+
kputc('\t', s);
3160+
kputsn(v->indiv.s + sizeof(bcf_hdr_t *), l, s);
3161+
kputc('\t', s);
3162+
kputsn(v->indiv.s + sizeof(bcf_hdr_t *) + l+1,
3163+
v->indiv.l - (sizeof(bcf_hdr_t *) + l+1), s);
3164+
} else {
3165+
if (v->n_sample)
30823166
{
3083-
int gt_i = -1;
3084-
bcf_fmt_t *fmt = v->d.fmt;
3085-
int first = 1;
3086-
for (i = 0; i < (int)v->n_fmt; ++i) {
3087-
if ( !fmt[i].p ) continue;
3088-
kputc(!first ? ':' : '\t', s); first = 0;
3089-
if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
3090-
{
3091-
hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1);
3092-
abort();
3093-
}
3094-
kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
3095-
if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
3096-
}
3097-
if ( first ) kputs("\t.", s);
3098-
for (j = 0; j < v->n_sample; ++j) {
3099-
kputc('\t', s);
3100-
first = 1;
3167+
int i,j;
3168+
if ( v->n_fmt)
3169+
{
3170+
int gt_i = -1;
3171+
bcf_fmt_t *fmt = v->d.fmt;
3172+
int first = 1;
31013173
for (i = 0; i < (int)v->n_fmt; ++i) {
3102-
bcf_fmt_t *f = &fmt[i];
3103-
if ( !f->p ) continue;
3104-
if (!first) kputc(':', s);
3105-
first = 0;
3106-
if (gt_i == i)
3107-
bcf_format_gt(f,j,s);
3108-
else
3109-
bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
3174+
if ( !fmt[i].p ) continue;
3175+
kputc(!first ? ':' : '\t', s); first = 0;
3176+
if ( fmt[i].id<0 ) //!bcf_hdr_idinfo_exists(h,BCF_HL_FMT,fmt[i].id) )
3177+
{
3178+
hts_log_error("Invalid BCF, the FORMAT tag id=%d at %s:%"PRIhts_pos" not present in the header", fmt[i].id, bcf_seqname_safe(h,(bcf1_t*)v), v->pos+1);
3179+
abort();
3180+
}
3181+
kputs(h->id[BCF_DT_ID][fmt[i].id].key, s);
3182+
if (strcmp(h->id[BCF_DT_ID][fmt[i].id].key, "GT") == 0) gt_i = i;
3183+
}
3184+
if ( first ) kputs("\t.", s);
3185+
for (j = 0; j < v->n_sample; ++j) {
3186+
kputc('\t', s);
3187+
first = 1;
3188+
for (i = 0; i < (int)v->n_fmt; ++i) {
3189+
bcf_fmt_t *f = &fmt[i];
3190+
if ( !f->p ) continue;
3191+
if (!first) kputc(':', s);
3192+
first = 0;
3193+
if (gt_i == i)
3194+
bcf_format_gt(f,j,s);
3195+
else
3196+
bcf_fmt_array(s, f->n, f->type, f->p + j * (size_t)f->size);
3197+
}
3198+
if ( first ) kputc('.', s);
31103199
}
3111-
if ( first ) kputc('.', s);
31123200
}
3201+
else
3202+
for (j=0; j<=v->n_sample; j++)
3203+
kputs("\t.", s);
31133204
}
3114-
else
3115-
for (j=0; j<=v->n_sample; j++)
3116-
kputs("\t.", s);
31173205
}
31183206
kputc('\n', s);
31193207
return 0;
@@ -3824,6 +3912,11 @@ int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
38243912

38253913
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
38263914
{
3915+
if (v->unpacked & VCF_UN_FMT) {
3916+
if (vcf_parse_format_partial(v) < 0)
3917+
return -1;
3918+
}
3919+
38273920
kstring_t ind;
38283921
ind.s = 0; ind.l = ind.m = 0;
38293922
if (n) {

0 commit comments

Comments
 (0)