Skip to content

Commit b731be0

Browse files
authored
Merge pull request #116 from noamteyssier/115-build-a-join-subcommand
115 build a join subcommand
2 parents 35be4a2 + 3d7338a commit b731be0

File tree

18 files changed

+611
-30
lines changed

18 files changed

+611
-30
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "gia"
3-
version = "0.2.22"
3+
version = "0.2.23"
44
edition = "2021"
55
description = "A tool for set theoretic operations of genomic intervals"
66
license = "MIT"

src/cli/commands.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
use super::{
22
bam::BamCommand, bcf::BcfCommand, ClosestArgs, ClusterArgs, ComplementArgs, CoverageArgs,
3-
ExtendArgs, FlankArgs, GetFastaArgs, IntersectArgs, MergeArgs, RandomArgs, SampleArgs,
4-
SegmentArgs, ShiftArgs, SortArgs, SpacingArgs, SubtractArgs, UnionBedGraphArgs, WindowArgs,
3+
ExtendArgs, FlankArgs, GetFastaArgs, IntersectArgs, JoinArgs, MergeArgs, RandomArgs,
4+
SampleArgs, SegmentArgs, ShiftArgs, SortArgs, SpacingArgs, SubtractArgs, UnionBedGraphArgs,
5+
WindowArgs,
56
};
67
use clap::Subcommand;
78

@@ -49,6 +50,9 @@ pub enum Command {
4950
/// Intersects two BED files
5051
Intersect(IntersectArgs),
5152

53+
/// Joins two BED files
54+
Join(JoinArgs),
55+
5256
/// Merges intervals of a BED file with overlapping regions
5357
Merge(MergeArgs),
5458

src/cli/join.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
use super::{DualInput, Output, OverlapPredicates};
2+
use clap::{Parser, ValueEnum};
3+
4+
#[derive(Parser, Debug)]
5+
pub struct JoinArgs {
6+
#[clap(flatten)]
7+
pub inputs: DualInput,
8+
9+
#[clap(flatten)]
10+
pub params: JoinParams,
11+
12+
#[clap(flatten)]
13+
pub output: Output,
14+
}
15+
16+
#[derive(Parser, Debug)]
17+
#[clap(next_help_heading = "Parameters")]
18+
pub struct JoinParams {
19+
#[clap(short = 'H', long, default_value = "inner")]
20+
pub how: JoinMethod,
21+
22+
/// Assert the inputs are pre-sorted
23+
#[clap(short = 'S', long)]
24+
pub sorted: bool,
25+
26+
#[clap(flatten)]
27+
pub overlap_predicates: OverlapPredicates,
28+
}
29+
30+
#[derive(Parser, Debug, ValueEnum, Clone, Copy)]
31+
pub enum JoinMethod {
32+
/// Return all records in the left input even if no match is found in right
33+
Left,
34+
/// Return all records in the right input even if no match is found in left
35+
Right,
36+
/// Return only records that have a match in both inputs
37+
Inner,
38+
}

src/cli/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ mod get_fasta;
1111
mod growth;
1212
mod inputs;
1313
mod intersect;
14+
mod join;
1415
mod merge;
1516
mod outputs;
1617
mod overlap_predicates;
@@ -37,6 +38,7 @@ pub use inputs::{
3738
DualInput, MixedInputBam, MixedInputVcf, MultiInput, SingleInput, SingleInputBam,
3839
};
3940
pub use intersect::{IntersectArgs, IntersectParams, OutputMethod};
41+
pub use join::{JoinArgs, JoinMethod, JoinParams};
4042
pub use merge::{MergeArgs, MergeParams};
4143
pub use outputs::{BamOutput, Output};
4244
pub use overlap_predicates::OverlapPredicates;

src/commands/join.rs

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
use crate::{
2+
cli::{JoinArgs, JoinMethod, JoinParams},
3+
dispatch_pair, dispatch_pair_multi,
4+
io::{write_pairs_iter_with, WriteNamedIter, WriteNamedIterImpl},
5+
types::{IntervalPair, Rename, Renamer, SplitTranslater},
6+
utils::sort_pairs,
7+
};
8+
use anyhow::Result;
9+
use bedrs::{traits::IntervalBounds, IntervalContainer};
10+
use serde::Serialize;
11+
use std::io::Write;
12+
13+
fn join_sets_inner<'a, Ia, Ib, Na, Nb, W>(
14+
mut set_a: IntervalContainer<Ia, usize, usize>,
15+
mut set_b: IntervalContainer<Ib, usize, usize>,
16+
translater: Option<&'a SplitTranslater>,
17+
params: JoinParams,
18+
output: W,
19+
) -> Result<()>
20+
where
21+
Ia: IntervalBounds<usize, usize> + Copy + Serialize,
22+
Ib: IntervalBounds<usize, usize> + Copy + Serialize,
23+
Na: IntervalBounds<&'a str, usize> + Serialize,
24+
Nb: IntervalBounds<&'a str, usize> + Serialize,
25+
W: Write,
26+
WriteNamedIterImpl: WriteNamedIter<Ia> + WriteNamedIter<Ib>,
27+
Renamer: Rename<'a, Ia, Na> + Rename<'a, Ib, Nb>,
28+
{
29+
let query_method = params.overlap_predicates.into();
30+
sort_pairs(&mut set_a, &mut set_b, params.sorted);
31+
let pairs_iter = set_a.records().iter().flat_map(|iv| {
32+
let overlaps = set_b
33+
.query_iter(iv, query_method)
34+
.expect("Error in finding overlaps")
35+
.copied();
36+
overlaps.map(|ov| IntervalPair::new(*iv, ov, translater))
37+
});
38+
write_pairs_iter_with(pairs_iter, output, translater)
39+
}
40+
41+
fn join_sets_left<'a, Ia, Ib, Na, Nb, W>(
42+
mut set_a: IntervalContainer<Ia, usize, usize>,
43+
mut set_b: IntervalContainer<Ib, usize, usize>,
44+
translater: Option<&'a SplitTranslater>,
45+
params: JoinParams,
46+
output: W,
47+
) -> Result<()>
48+
where
49+
Ia: IntervalBounds<usize, usize> + Copy + Serialize,
50+
Ib: IntervalBounds<usize, usize> + Copy + Serialize,
51+
Na: IntervalBounds<&'a str, usize> + Serialize,
52+
Nb: IntervalBounds<&'a str, usize> + Serialize,
53+
W: Write,
54+
WriteNamedIterImpl: WriteNamedIter<Ia> + WriteNamedIter<Ib>,
55+
Renamer: Rename<'a, Ia, Na> + Rename<'a, Ib, Nb>,
56+
{
57+
let query_method = params.overlap_predicates.into();
58+
sort_pairs(&mut set_a, &mut set_b, params.sorted);
59+
let pairs_iter = set_a.records().iter().flat_map(|iv| {
60+
let overlaps = set_b
61+
.query_iter(iv, query_method)
62+
.expect("Error in finding overlaps")
63+
.copied();
64+
65+
// Peek at the next item to see if there are overlaps or not
66+
let mut peek = overlaps.peekable();
67+
let adjusted_overlap = if peek.peek().is_none() {
68+
let null_pair = IntervalPair::from_option(Some(*iv), None, translater);
69+
let new_iter = std::iter::once(null_pair);
70+
Box::new(new_iter) as Box<dyn Iterator<Item = IntervalPair<Ia, Ib, Na, Nb>>>
71+
} else {
72+
let new_iter = peek.map(|ov| IntervalPair::new(*iv, ov, translater));
73+
Box::new(new_iter) as Box<dyn Iterator<Item = IntervalPair<Ia, Ib, Na, Nb>>>
74+
};
75+
adjusted_overlap
76+
});
77+
write_pairs_iter_with(pairs_iter, output, translater)
78+
}
79+
80+
fn join_sets_right<'a, Ia, Ib, Na, Nb, W>(
81+
mut set_a: IntervalContainer<Ia, usize, usize>,
82+
mut set_b: IntervalContainer<Ib, usize, usize>,
83+
translater: Option<&'a SplitTranslater>,
84+
params: JoinParams,
85+
output: W,
86+
) -> Result<()>
87+
where
88+
Ia: IntervalBounds<usize, usize> + Copy + Serialize,
89+
Ib: IntervalBounds<usize, usize> + Copy + Serialize,
90+
Na: IntervalBounds<&'a str, usize> + Serialize,
91+
Nb: IntervalBounds<&'a str, usize> + Serialize,
92+
W: Write,
93+
WriteNamedIterImpl: WriteNamedIter<Ia> + WriteNamedIter<Ib>,
94+
Renamer: Rename<'a, Ia, Na> + Rename<'a, Ib, Nb>,
95+
{
96+
let query_method = params.overlap_predicates.into();
97+
sort_pairs(&mut set_a, &mut set_b, params.sorted);
98+
let pairs_iter = set_b.records().iter().flat_map(|iv| {
99+
let overlaps = set_a
100+
.query_iter(iv, query_method)
101+
.expect("Error in finding overlaps")
102+
.copied();
103+
104+
// Peek at the next item to see if there are overlaps or not
105+
let mut peek = overlaps.peekable();
106+
let adjusted_overlap = if peek.peek().is_none() {
107+
let null_pair = IntervalPair::from_option(None, Some(*iv), translater);
108+
let new_iter = std::iter::once(null_pair);
109+
Box::new(new_iter) as Box<dyn Iterator<Item = IntervalPair<Ia, Ib, Na, Nb>>>
110+
} else {
111+
let new_iter = peek.map(|ov| IntervalPair::new(ov, *iv, translater));
112+
Box::new(new_iter) as Box<dyn Iterator<Item = IntervalPair<Ia, Ib, Na, Nb>>>
113+
};
114+
adjusted_overlap
115+
});
116+
write_pairs_iter_with(pairs_iter, output, translater)
117+
}
118+
119+
pub fn join(args: JoinArgs) -> Result<()> {
120+
let writer = args.output.get_writer()?;
121+
if args.inputs.is_multi() {
122+
let (bed_a, bed_b) = args.inputs.get_multi_readers()?;
123+
match args.params.how {
124+
JoinMethod::Inner => {
125+
dispatch_pair_multi!(bed_a, bed_b, writer, args.params, join_sets_inner)
126+
}
127+
JoinMethod::Left => {
128+
dispatch_pair_multi!(bed_a, bed_b, writer, args.params, join_sets_left)
129+
}
130+
JoinMethod::Right => {
131+
dispatch_pair_multi!(bed_a, bed_b, writer, args.params, join_sets_right)
132+
}
133+
}
134+
} else {
135+
let (bed_a, bed_b) = args.inputs.get_readers()?;
136+
match args.params.how {
137+
JoinMethod::Inner => dispatch_pair!(bed_a, bed_b, writer, args.params, join_sets_inner),
138+
JoinMethod::Left => dispatch_pair!(bed_a, bed_b, writer, args.params, join_sets_left),
139+
JoinMethod::Right => dispatch_pair!(bed_a, bed_b, writer, args.params, join_sets_right),
140+
}
141+
}
142+
}

src/commands/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ mod extend;
88
mod flank;
99
mod get_fasta;
1010
mod intersect;
11+
mod join;
1112
mod merge;
1213
mod random;
1314
mod sample;
@@ -27,6 +28,7 @@ pub use extend::extend;
2728
pub use flank::flank;
2829
pub use get_fasta::get_fasta;
2930
pub use intersect::intersect;
31+
pub use join::join;
3032
pub use merge::merge;
3133
pub use random::random;
3234
pub use sample::sample;

src/io/write/utils.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,8 @@ pub fn write_pairs_iter_with<'a, W, Ia, Ib, Na, Nb, It>(
206206
translater: Option<&SplitTranslater>,
207207
) -> Result<()>
208208
where
209-
Ia: IntervalBounds<usize, usize> + Serialize,
210-
Ib: IntervalBounds<usize, usize> + Serialize,
209+
Ia: IntervalBounds<usize, usize> + Serialize + Copy,
210+
Ib: IntervalBounds<usize, usize> + Serialize + Copy,
211211
Na: IntervalBounds<&'a str, usize> + Serialize,
212212
Nb: IntervalBounds<&'a str, usize> + Serialize,
213213
W: Write,
@@ -224,8 +224,8 @@ where
224224

225225
pub fn write_pairs_iter<'a, W, Ia, Ib, Na, Nb, It>(records: It, writer: W) -> Result<()>
226226
where
227-
Ia: IntervalBounds<usize, usize> + Serialize,
228-
Ib: IntervalBounds<usize, usize> + Serialize,
227+
Ia: IntervalBounds<usize, usize> + Serialize + Copy,
228+
Ib: IntervalBounds<usize, usize> + Serialize + Copy,
229229
Na: IntervalBounds<&'a str, usize> + Serialize,
230230
Nb: IntervalBounds<&'a str, usize> + Serialize,
231231
W: Write,
@@ -241,8 +241,8 @@ where
241241

242242
pub fn write_named_pairs_iter<'a, Ia, Ib, Na, Nb, W, It>(records: It, writer: W) -> Result<()>
243243
where
244-
Ia: IntervalBounds<usize, usize> + Serialize,
245-
Ib: IntervalBounds<usize, usize> + Serialize,
244+
Ia: IntervalBounds<usize, usize> + Serialize + Copy,
245+
Ib: IntervalBounds<usize, usize> + Serialize + Copy,
246246
Na: IntervalBounds<&'a str, usize> + Serialize,
247247
Nb: IntervalBounds<&'a str, usize> + Serialize,
248248
W: Write,

src/main.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@ use anyhow::Result;
99
use clap::Parser;
1010
use cli::{bam::BamCommand, bcf::BcfCommand, Cli, Command};
1111
use commands::{
12-
bam, bcf, closest, cluster, complement, coverage, extend, flank, get_fasta, intersect, merge,
13-
random, sample, segment, shift, sort, spacing, subtract, unionbedgraph, window,
12+
bam, bcf, closest, cluster, complement, coverage, extend, flank, get_fasta, intersect, join,
13+
merge, random, sample, segment, shift, sort, spacing, subtract, unionbedgraph, window,
1414
};
1515

1616
fn main() -> Result<()> {
@@ -32,6 +32,7 @@ fn main() -> Result<()> {
3232
Command::Flank(args) => flank(args)?,
3333
Command::GetFasta(args) => get_fasta(args)?,
3434
Command::Intersect(args) => intersect(args)?,
35+
Command::Join(args) => join(args)?,
3536
Command::Merge(args) => merge(args)?,
3637
Command::Random(args) => random(args)?,
3738
Command::Sample(args) => sample(args)?,

src/types/formats/genome.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -165,14 +165,15 @@ mod testing {
165165
fn test_genome_named() {
166166
let mut translater = Translater::new();
167167
let genome = Genome::from_reader_named(GENOME_NAMED, &mut translater).unwrap();
168-
assert_eq!(genome.chr_size_unchecked(0), 1000);
169-
assert_eq!(genome.chr_size_unchecked(1), 2000);
170-
assert_eq!(genome.chr_size_unchecked(2), 3000);
168+
assert_eq!(genome.chr_size_unchecked(1), 1000);
169+
assert_eq!(genome.chr_size_unchecked(2), 2000);
170+
assert_eq!(genome.chr_size_unchecked(3), 3000);
171171
assert!(genome.translater().is_some());
172172
let translater = genome.translater().unwrap();
173-
assert_eq!(translater.get_chr_name(0).unwrap(), "chr1");
174-
assert_eq!(translater.get_chr_name(1).unwrap(), "chr2");
175-
assert_eq!(translater.get_chr_name(2).unwrap(), "chr3");
173+
assert_eq!(translater.get_chr_name(0).unwrap(), ".");
174+
assert_eq!(translater.get_chr_name(1).unwrap(), "chr1");
175+
assert_eq!(translater.get_chr_name(2).unwrap(), "chr2");
176+
assert_eq!(translater.get_chr_name(3).unwrap(), "chr3");
176177
}
177178

178179
#[test]

0 commit comments

Comments
 (0)