Skip to content

Commit 3c6d6ba

Browse files
committed
Preserve input columns by default; replace the `preserve` option with `drop_columns`
1 parent 9ecf7ef commit 3c6d6ba

7 files changed

+55
-64
lines changed

src/peak2gene.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ def peak2gene(
3434
up_bound: int = None,
3535
down_bound: int = None,
3636
consensus: bool = False,
37-
preserve: bool = False,
37+
drop_columns: bool = False,
3838
) -> None:
3939
"""
4040
Find the nearest genes for a given list of peaks.
@@ -52,8 +52,8 @@ def peak2gene(
5252
boundary (int): Boundary for artificial peak boundary option. None if other options.
5353
up_bound (int): Maximum allowed distance between peak and upstream feature.
5454
down_bound (int): Maximum allowed distance between peak and downstream feature.
55-
consnsesus (bool): Whether to use consensus peaks.
56-
preserve (bool): Whether to preserve the original file columns.
55+
consensus (bool): Whether to use consensus peaks. Default False.
56+
drop_columns (bool): Whether to drop unnecessary columns from the original file. Default False.
5757
5858
Returns:
5959
None
@@ -66,7 +66,7 @@ def peak2gene(
6666
peaks = process_peaks(peak_file, peak_type, option, boundary, consensus)
6767
decomposed_peaks = decompose_features(peaks)
6868
output = find_nearest(
69-
decomposed_peaks, species, num_features, ref_dir, up_bound, down_bound, preserve
69+
decomposed_peaks, species, num_features, ref_dir, up_bound, down_bound, drop_columns
7070
)
7171
if output_type == "xlsx":
7272
write_to_excel(output, output_name, out_dir)
@@ -83,7 +83,7 @@ def find_nearest(
8383
ref_dir: str,
8484
up_bound: int,
8585
down_bound: int,
86-
preserve: bool,
86+
drop_columns: bool,
8787
) -> pd.DataFrame:
8888
"""
8989
Find the nearest genes for a given list of peaks. Place these in a Pandas DataFrame.
@@ -96,6 +96,7 @@ def find_nearest(
9696
ref_dir (str): Directory containing decomposed reference data.
9797
up_bound (int): Maximum allowed distance between peak and upstream feature.
9898
down_bound (int): Maximum allowed distance between peak and downstream feature.
99+
drop_columns (bool): Whether to drop unnecessary columns from the original file.
99100
100101
Returns:
101102
output (pd.DataFrame): Pandas DataFrame containing peak data, the nearest k genes for each peak,
@@ -123,7 +124,7 @@ def find_nearest(
123124
up_bound,
124125
down_bound,
125126
num_features,
126-
preserve,
127+
drop_columns,
127128
),
128129
]
129130
)

src/peakScout

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ def main(args):
3737
db = args.down_bound
3838
gtf_ref = args.gtf_ref
3939
consensus = args.consensus
40-
preserve = args.preserve
40+
drop_columns = args.drop_columns
4141

4242
if function == "peak2gene":
4343
peak2gene(
@@ -54,7 +54,7 @@ def main(args):
5454
ub,
5555
db,
5656
consensus,
57-
preserve
57+
drop_columns
5858
)
5959
elif function == "decompose":
6060
decompose_gtf(ref, species, gtf_ref)
@@ -113,7 +113,7 @@ if __name__ == "__main__":
113113
)
114114
parser.add_argument("--gtf_ref", "--gtf", type=str, help="File path to gtf")
115115
parser.add_argument('--consensus', action='store_true', help='Consensus peak file')
116-
parser.add_argument('--preserve', action='store_true', help='Preserve all columns in input file')
116+
parser.add_argument('--drop_columns', action='store_true', help='Only keep necessary columns from input file')
117117

118118
args = parser.parse_args()
119119

src/process_features.py

Lines changed: 11 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ def get_nearest_features(
2323
up_bound: int,
2424
down_bound: int,
2525
k: int,
26-
preserve: bool,
26+
drop_columns: bool,
2727
) -> pl.DataFrame:
2828
"""
2929
Determine the nearest k features to each peak in roi using the reference
@@ -37,7 +37,7 @@ def get_nearest_features(
3737
up_bound (int): Maximum allowed distance between peak and upstream feature.
3838
down_bound (int): Maximum allowed distance between peak and downstream feature.
3939
k (int): Number of nearest features to collect.
40-
preserve (bool): If True, preserve the original columns of the roi DataFrame.
40+
drop_columns (bool): Whether to drop unnecessary columns from the original file.
4141
4242
Returns:
4343
return_roi (pl.DataFrame): Polars DataFrame containing peak information, the
@@ -55,7 +55,7 @@ def get_nearest_features(
5555
start_features = starts.select(feature).to_numpy().flatten()
5656
end_features = ends.select(feature).to_numpy().flatten()
5757

58-
if not preserve:
58+
if drop_columns:
5959
return_roi = roi.select(["name", "chr", "start", "end"]).clone()
6060
else:
6161
return_roi = roi.clone()
@@ -325,24 +325,14 @@ def gen_return_roi(
325325
None
326326
"""
327327
for i in range(1, k + 1):
328-
if feature == "gene_name":
329-
return_roi = return_roi.with_columns(
330-
[
331-
pl.Series("closest_" + feature + "_" + str(i), features_to_add[i]),
332-
pl.Series(
333-
"closest_" + "gene" + "_" + str(i) + "_dist", dists_to_add[i]
334-
),
335-
]
336-
)
337-
else:
338-
return_roi = return_roi.with_columns(
339-
[
340-
pl.Series("closest_" + feature + "_" + str(i), features_to_add[i]),
341-
pl.Series(
342-
"closest_" + feature + "_" + str(i) + "_dist", dists_to_add[i]
343-
),
344-
]
345-
)
328+
return_roi = return_roi.with_columns(
329+
[
330+
pl.Series("closest_" + feature + "_" + str(i), features_to_add[i]),
331+
pl.Series(
332+
"closest_" + feature + "_" + str(i) + "_dist", dists_to_add[i]
333+
),
334+
]
335+
)
346336
return return_roi
347337

348338

src/process_input.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -251,7 +251,7 @@ def process_genes(file_path: str, species: str, ref_dir: str) -> pl.DataFrame:
251251
gene_df = pl.DataFrame()
252252
for csv in os.listdir(os.path.join(ref_dir, species, "gene")):
253253
cur = pl.read_csv(os.path.join(ref_dir, species, "gene", csv))
254-
for gene in genes:
254+
for gene in genes[:]:
255255
if gene in cur.select(["gene_name"]).to_numpy():
256256
gene_df = pl.concat([gene_df, cur.filter(pl.col("gene_name") == gene)])
257257
genes.remove(gene)
Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,11 @@
1-
name,chr,start,end,closest_gene_name_1,closest_gene_1_dist,closest_gene_name_2,closest_gene_2_dist,closest_gene_name_3,closest_gene_3_dist
2-
peak_name_1,1,4344147,4344187,Rp1,0,Gm37483,19159,Gm6101,-83628
3-
peak_name_2,1,5258993,5259502,Gm7182,16604,Gm37567,48237,Atp6v1h,-96464
4-
peak_name_3,1,7405722,7406909,Gm18984,3770,Gm26901,-7853,Gm19002,-89260
5-
peak_name_4,1,8406429,8407654,Sntg1,0,Gm38024,43306,Gm16284,60772
6-
peak_name_5,1,10551123,10551732,Cpa6,0,Gm15604,2366,Gm25253,3822
7-
peak_name_6,2,3361888,3361958,Olah,0,Gm37525,-7884,Acbd7,-20895
8-
peak_name_7,2,5641105,5641105,Camk1d,0,Gm13216,-36944,Cdc123,153189
9-
peak_name_8,2,7365263,7365293,Celf2,0,Gm24340,-15070,Gm28641,164646
10-
peak_name_9,2,8372018,8372078,Gm24534,100016,Gm13254,-224153,Gm13255,261859
11-
peak_name_10,2,11082018,11082078,Gm26478,5703,Gm13297,8752,Gm13294,16855
1+
chr,start,end,name,score,strand,closest_gene_name_1,closest_gene_name_1_dist,closest_gene_name_2,closest_gene_name_2_dist,closest_gene_name_3,closest_gene_name_3_dist
2+
1,4344147,4344187,peak_name_1,178,.,Rp1,0,Gm37483,19159,Gm6101,-83628
3+
1,5258993,5259502,peak_name_2,708,+,Gm7182,16604,Gm37567,48237,Atp6v1h,-96464
4+
1,7405722,7406909,peak_name_3,687,+,Gm18984,3770,Gm26901,-7853,Gm19002,-89260
5+
1,8406429,8407654,peak_name_4,100,-,Sntg1,0,Gm38024,43306,Gm16284,60772
6+
1,10551123,10551732,peak_name_5,137,-,Cpa6,0,Gm15604,2366,Gm25253,3822
7+
2,3361888,3361958,peak_name_6,578,+,Olah,0,Gm37525,-7884,Acbd7,-20895
8+
2,5641105,5641105,peak_name_7,181,.,Camk1d,0,Gm13216,-36944,Cdc123,153189
9+
2,7365263,7365293,peak_name_8,707,.,Celf2,0,Gm24340,-15070,Gm28641,164646
10+
2,8372018,8372078,peak_name_9,634,-,Gm24534,100016,Gm13254,-224153,Gm13255,261859
11+
2,11082018,11082078,peak_name_10,542,+,Gm26478,5703,Gm13297,8752,Gm13294,16855
Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,11 @@
1-
name,chr,start,end,closest_gene_name_1,closest_gene_1_dist,closest_gene_name_2,closest_gene_2_dist,closest_gene_name_3,closest_gene_3_dist
2-
sampleName.macs2_peak_1,1,4344147,4344187,Rp1,0,Gm37483,19159,Gm6101,-83628
3-
sampleName.macs2_peak_2,1,5258993,5259502,Gm7182,16604,Gm37567,48237,Atp6v1h,-96464
4-
sampleName.macs2_peak_3,1,7405722,7406909,Gm18984,3770,Gm26901,-7853,Gm19002,-89260
5-
sampleName.macs2_peak_4,1,8406429,8407654,Sntg1,0,Gm38024,43306,Gm16284,60772
6-
sampleName.macs2_peak_5,1,10551123,10551732,Cpa6,0,Gm15604,2366,Gm25253,3822
7-
sampleName.macs2_peak_6,2,3361888,3361958,Olah,0,Gm37525,-7884,Acbd7,-20895
8-
sampleName.macs2_peak_7,2,5641105,5641105,Camk1d,0,Gm13216,-36944,Cdc123,153189
9-
sampleName.macs2_peak_8,2,7365263,7365293,Celf2,0,Gm24340,-15070,Gm28641,164646
10-
sampleName.macs2_peak_9,2,8372018,8372078,Gm24534,100016,Gm13254,-224153,Gm13255,261859
11-
sampleName.macs2_peak_10,2,11082018,11082078,Gm26478,5703,Gm13297,8752,Gm13294,16855
1+
chr,start,end,name,score,strand,signal,pvalue,qvalue,peak,closest_gene_name_1,closest_gene_name_1_dist,closest_gene_name_2,closest_gene_name_2_dist,closest_gene_name_3,closest_gene_name_3_dist
2+
1,4344147,4344187,sampleName.macs2_peak_1,178,.,12.3818,22.1235,17.8494,95,Rp1,0,Gm37483,19159,Gm6101,-83628
3+
1,5258993,5259502,sampleName.macs2_peak_2,123,.,9.35221,16.2172,12.3039,104,Gm7182,16604,Gm37567,48237,Atp6v1h,-96464
4+
1,7405722,7406909,sampleName.macs2_peak_3,52,.,6.43619,8.41372,5.22019,144,Gm18984,3770,Gm26901,-7853,Gm19002,-89260
5+
1,8406429,8407654,sampleName.macs2_peak_4,51,.,6.0389,8.23617,5.18458,352,Sntg1,0,Gm38024,43306,Gm16284,60772
6+
1,10551123,10551732,sampleName.macs2_peak_5,28,.,4.12402,5.46401,2.82539,133,Cpa6,0,Gm15604,2366,Gm25253,3822
7+
2,3361888,3361958,sampleName.macs2_peak_6,39,.,5.11495,6.90568,3.99502,68,Olah,0,Gm37525,-7884,Acbd7,-20895
8+
2,5641105,5641105,sampleName.macs2_peak_7,28,.,4.12402,5.46401,2.82539,107,Camk1d,0,Gm13216,-36944,Cdc123,153189
9+
2,7365263,7365293,sampleName.macs2_peak_8,39,.,5.44526,6.90568,3.99502,100,Celf2,0,Gm24340,-15070,Gm28641,164646
10+
2,8372018,8372078,sampleName.macs2_peak_9,80,.,7.42712,11.5959,8.0246,109,Gm24534,100016,Gm13254,-224153,Gm13255,261859
11+
2,11082018,11082078,sampleName.macs2_peak_10,28,.,4.12402,5.46401,2.82539,111,Gm26478,5703,Gm13297,8752,Gm13294,16855
Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,11 @@
1-
name,chr,start,end,closest_gene_name_1,closest_gene_1_dist,closest_gene_name_2,closest_gene_2_dist,closest_gene_name_3,closest_gene_3_dist
2-
420.473,1,4344147,4344187,Rp1,0,Gm37483,19159,Gm6101,-83628
3-
503.23,1,5258993,5259502,Gm7182,16604,Gm37567,48237,Atp6v1h,-96464
4-
390.202,1,7405722,7406909,Gm18984,3770,Gm26901,-7853,Gm19002,-89260
5-
435.462,1,8406429,8407654,Sntg1,0,Gm38024,43306,Gm16284,60772
6-
409.048,1,10551123,10551732,Cpa6,0,Gm15604,2366,Gm25253,3822
7-
437.073,2,3361888,3361958,Olah,0,Gm37525,-7884,Acbd7,-20895
8-
402.774,2,5641105,5641105,Camk1d,0,Gm13216,-36944,Cdc123,153189
9-
446.691,2,7365263,7365293,Celf2,0,Gm24340,-15070,Gm28641,164646
10-
398.332,2,8372018,8372078,Gm24534,100016,Gm13254,-224153,Gm13255,261859
11-
448.351,2,11082018,11082078,Gm26478,5703,Gm13297,8752,Gm13294,16855
1+
chr,start,end,name,max_signal,region,closest_gene_name_1,closest_gene_name_1_dist,closest_gene_name_2,closest_gene_name_2_dist,closest_gene_name_3,closest_gene_name_3_dist
2+
1,4344147,4344187,420.473,1.31825,1:4344146-4344186,Rp1,0,Gm37483,19159,Gm6101,-83628
3+
1,5258993,5259502,503.23,1.78208,1:5258992-5259501,Gm7182,16604,Gm37567,48237,Atp6v1h,-96464
4+
1,7405722,7406909,390.202,0.683537,1:7405721-7406908,Gm18984,3770,Gm26901,-7853,Gm19002,-89260
5+
1,8406429,8407654,435.462,1.00089,1:8406428-8407653,Sntg1,0,Gm38024,43306,Gm16284,60772
6+
1,10551123,10551732,409.048,1.36707,1:10551122-10551731,Cpa6,0,Gm15604,2366,Gm25253,3822
7+
2,3361888,3361958,437.073,1.58678,2:3361887-3361957,Olah,0,Gm37525,-7884,Acbd7,-20895
8+
2,5641105,5641105,402.774,1.85531,2:5641104-5641104,Camk1d,0,Gm13216,-36944,Cdc123,153189
9+
2,7365263,7365293,446.691,1.36707,2:7365262-7365292,Celf2,0,Gm24340,-15070,Gm28641,164646
10+
2,8372018,8372078,398.332,1.26943,2:8372017-8372077,Gm24534,100016,Gm13254,-224153,Gm13255,261859
11+
2,11082018,11082078,448.351,0.830009,2:11082017-11082077,Gm26478,5703,Gm13297,8752,Gm13294,16855

0 commit comments

Comments
 (0)