Skip to content

Commit aeb924d

Browse files
committed
Only count sampled_portion for direct inputs
1 parent dff94c4 commit aeb924d

File tree

1 file changed

+14
-4
lines changed

1 file changed

+14
-4
lines changed

nmdc_server/ingest/omics_processing.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,23 @@ def find_parent_process(output_id: str, mongodb: Database) -> Optional[dict[str,
6767

6868

6969
def get_biosample_input_ids(
70-
input_id: str, mongodb: Database, results: set[str], sampled_portions: set[str]
70+
input_id: str,
71+
mongodb: Database,
72+
results: set[str],
73+
sampled_portions: set[str],
74+
direct_input: bool,
7175
) -> set[Any]:
7276
"""
7377
Given an input ID return all biosample objects that are included in the input resource.
7478
7579
OmicsProcessing objects can take Biosamples or ProcessedSamples as inputs. Work needs to be done
7680
to determine which biosamples make up a given ProcessedSample. This function recursively tries
7781
to determine those Biosamples.
82+
83+
As a side effect, a set of `sampled_portion` values gets populated. Whether or not a processed
84+
sample's `sampled_portion`s get added to the set is driven by the `direct_input` parameter. Only
85+
processed samples that are direct inputs to a data generation will have their
86+
`sampled_portion`s added.
7887
"""
7988
# Base case, the input is already a biosample
8089
biosample_collection: Collection = mongodb["biosample_set"]
@@ -92,15 +101,16 @@ def get_biosample_input_ids(
92101
processed_sample = query[0]
93102
processed_sample_id = processed_sample["id"]
94103
sampled_portion = set(processed_sample.get("sampled_portion", []))
95-
if sampled_portion:
104+
# only store sampled portion values for immediate input to a data generation
105+
if direct_input and sampled_portion:
96106
sampled_portions.update(sampled_portion)
97107

98108
# Recursive case. For processed samples find the process that created it,
99109
# and check the inputs of that process.
100110
parent_process = find_parent_process(processed_sample_id, mongodb)
101111
if parent_process:
102112
for parent_input_id in parent_process["has_input"]:
103-
get_biosample_input_ids(parent_input_id, mongodb, results, sampled_portions)
113+
get_biosample_input_ids(parent_input_id, mongodb, results, sampled_portions, False)
104114
return results
105115

106116

@@ -145,7 +155,7 @@ def load_omics_processing(db: Session, obj: Dict[str, Any], mongodb: Database, l
145155
sampled_portions: set[str] = set()
146156
for input_id in input_ids:
147157
biosample_input_ids.union(
148-
get_biosample_input_ids(input_id, mongodb, biosample_input_ids, sampled_portions)
158+
get_biosample_input_ids(input_id, mongodb, biosample_input_ids, sampled_portions, True)
149159
)
150160
if sampled_portions:
151161
obj["sampled_portions"] = list(sampled_portions)

0 commit comments

Comments
 (0)