Skip to content

Commit 83f4335

Browse files
authored
Merge pull request #11 from Mye-InfoBank/per-dataset-resolution
Implement per-dataset resolution
2 parents cbc96fa + 31442f5 commit 83f4335

File tree

6 files changed

+65
-15
lines changed

6 files changed

+65
-15
lines changed

README.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,18 @@ This includes only two steps:
112112

113113
### Step 3: Find unification opportunities
114114

115-
Currently, there is only one approach implemented.
115+
Currently, there are two approaches implemented, applied in sequence.
116+
117+
#### Resolve per dataset
118+
119+
For each unapproved symbol, this step inspects every dataset that contains it, one dataset at a time, and checks whether the symbols already present in that dataset leave exactly one approved symbol as the rename target. The logic works as follows:
120+
121+
1. For each unapproved symbol, examine each dataset that contains it
122+
2. Look at all the approved symbols that the unapproved symbol connects to (its neighbors in the graph)
123+
3. Check which of these approved neighbors are **not** present in the current dataset
124+
4. If exactly one approved neighbor is missing from the dataset, rename the unapproved symbol to that missing approved symbol in this specific dataset
125+
126+
This approach handles cases where an unapproved symbol has multiple potential approved targets, but the dataset context makes the choice clear. For example, if an unapproved symbol connects to three approved symbols, but two of them are already present in a particular dataset, then the third one is the obvious choice for that dataset.
116127

117128
#### Resolve unapproved symbols
118129

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "hugo-unifier"
3-
version = "0.2.8"
3+
version = "0.3.0"
44
description = "Python package that can unify gene symbols across datasets based on the HUGO database."
55
readme = "README.md"
66
authors = [

src/hugo_unifier/get_changes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
remove_self_edges,
1010
remove_loose_ends,
1111
resolve_unapproved,
12+
resolve_per_dataset,
1213
)
1314

1415

@@ -54,6 +55,7 @@ def get_changes(
5455
remove_loose_ends(G)
5556

5657
graph_manipulations: List[Callable[[nx.DiGraph, pd.DataFrame]]] = [
58+
resolve_per_dataset,
5759
resolve_unapproved,
5860
# aggregate_approved,
5961
]

src/hugo_unifier/graph_manipulations.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,37 @@ def __decide_successor__(G: nx.DiGraph, node: str, df: pd.DataFrame) -> str:
5555
return None
5656

5757

58+
def resolve_per_dataset(G: nx.DiGraph, df: pd.DataFrame) -> pd.DataFrame:
    """Resolve unapproved symbols one sample at a time.

    Every node whose ``"type"`` attribute is not ``"approvedSymbol"`` is
    examined for each sample listed in its ``"samples"`` attribute. The
    node's graph neighbors that do *not* already list that sample are
    collected; when exactly one such neighbor exists, a ``"rename"`` row is
    appended to ``df`` and the sample is moved from the node to that
    neighbor in the graph.

    Args:
        G: Symbol graph. Each node carries a ``"type"`` attribute and a
            ``"samples"`` collection supporting ``add`` and ``remove``.
        df: Change log to append to. Rows are written positionally as
            ``[sample, "rename", node, target, reason]``.
            NOTE(review): column names are not visible here - confirm
            against the caller that creates the frame.

    Returns:
        The same ``df`` object, extended in place with the new rows.
    """
    attributes = G.nodes

    for node in list(G.nodes):
        if attributes[node]["type"] == "approvedSymbol":
            continue

        # Iterate over a snapshot: the node's sample collection is mutated
        # further down whenever a rename is recorded.
        for sample in list(attributes[node]["samples"]):
            candidates = [
                other
                for other in G.neighbors(node)
                if sample not in attributes[other]["samples"]
            ]

            # Only an unambiguous choice (exactly one absent neighbor)
            # results in a rename.
            if len(candidates) != 1:
                continue

            (target_neighbor,) = candidates
            reason = f"The unapproved symbol {node} is present in {sample} and only one of its approved neighbors ({target_neighbor}) is not also present in {sample}. Therefore, renaming {node} to {target_neighbor} in {sample}."
            df.loc[len(df)] = [sample, "rename", node, target_neighbor, reason]

            # Reflect the rename in the graph so later decisions see it.
            attributes[target_neighbor]["samples"].add(sample)
            attributes[node]["samples"].remove(sample)

    return df
87+
88+
5889
def resolve_unapproved(G: nx.DiGraph, df: pd.DataFrame) -> pd.DataFrame:
5990
for node in list(G.nodes()):
6091
if G.nodes[node]["type"] == "approvedSymbol":

tests/test_get_changes.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,23 @@ def test_cox1_and_co1():
2424
assert len(sample_changes) == 2
2525

2626
sample1_changes = sample_changes["sample1"]
27-
assert len(sample1_changes) == 1
28-
assert sample1_changes.iloc[0]["action"] == "copy"
29-
assert sample1_changes.iloc[0]["symbol"] == "COX1"
30-
assert sample1_changes.iloc[0]["new"] == "MT-CO1"
27+
assert len(sample1_changes) == 0
3128

3229
sample2_changes = sample_changes["sample2"]
3330
assert len(sample2_changes) == 1
34-
assert sample2_changes.iloc[0]["action"] == "conflict"
31+
assert sample2_changes.iloc[0]["action"] == "rename"
32+
33+
34+
def test_cox1_co1_and_ptgs1():
    """COX1 alongside PTGS1 in one sample is renamed to MT-CO1.

    sample1 only holds MT-CO1 and expects no changes; sample2 holds COX1
    and PTGS1 and expects a single rename of COX1 to MT-CO1.
    """
    _, sample_changes = get_changes(
        {"sample1": ["MT-CO1"], "sample2": ["COX1", "PTGS1"]}
    )

    assert len(sample_changes) == 2
    assert len(sample_changes["sample1"]) == 0

    sample2_changes = sample_changes["sample2"]
    assert len(sample2_changes) == 1

    # The single recorded change for sample2.
    change = sample2_changes.iloc[0]
    assert change["action"] == "rename"
    assert change["symbol"] == "COX1"
    assert change["new"] == "MT-CO1"
3544

3645

3746
def test_single_sample():
@@ -88,14 +97,11 @@ def test_cox2_and_co2():
8897
assert len(sample_changes) == 2
8998

9099
sample1_changes = sample_changes["sample1"]
91-
assert len(sample1_changes) == 1
92-
assert sample1_changes.iloc[0]["action"] == "copy"
93-
assert sample1_changes.iloc[0]["symbol"] == "COX2"
94-
assert sample1_changes.iloc[0]["new"] == "MT-CO2"
100+
assert len(sample1_changes) == 0
95101

96102
sample2_changes = sample_changes["sample2"]
97103
assert len(sample2_changes) == 1
98-
assert sample2_changes.iloc[0]["action"] == "conflict"
104+
assert sample2_changes.iloc[0]["action"] == "rename"
99105

100106

101107
def test_cox3():
@@ -114,15 +120,15 @@ def test_cox3():
114120
assert len(sample2_changes) == 0
115121

116122

117-
def test_cox3_and_co1():
123+
def test_cox3_and_co3():
118124
sample_symbols = {"sample1": ["COX3"], "sample2": ["MT-CO3", "COX3"]}
119125

120126
_, sample_changes = get_changes(sample_symbols)
121127
assert len(sample_changes) == 2
122128

123129
sample1_changes = sample_changes["sample1"]
124130
assert len(sample1_changes) == 1
125-
assert sample1_changes.iloc[0]["action"] == "copy"
131+
assert sample1_changes.iloc[0]["action"] == "rename"
126132
assert sample1_changes.iloc[0]["symbol"] == "COX3"
127133
assert sample1_changes.iloc[0]["new"] == "MT-CO3"
128134

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)