Skip to content

Commit 0e7ae54

Browse files
authored
Merge pull request #140 from fedarko/fix-139
Fix, test, and document filtering bug
2 parents 2727c04 + e1d6e3d commit 0e7ae54

File tree

3 files changed

+101
-5
lines changed

3 files changed

+101
-5
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -604,7 +604,7 @@ The larger the batch size, the more samples you average per iteration, but the l
604604
all samples that have less than 1000 reads will be filtered out.
605605

606606
`--min-feature-count` will filter out features according to how many __samples__ they appear in. For instance, if `--min-feature-count 10` is specified,
607-
then all features than appear in less than 10 samples will be thrown out. It is important to note that we are filtering according to number of samples rather than number of reads. The reason why this behavior is chosen relates to a rule of thumb commonly used in linear regression - if a microbe appears in less than 10 samples, it is difficult to fit a meaningful line for that microbe. In other words, there is not even resolution in the study to say anything meaningful about that microbe in the context of differential abundance analysis. The `--min-feature-count` filter is applied _after_ the `--min-sample-count` is applied.
607+
then all features than appear in less than 10 samples will be thrown out. It is important to note that we are filtering according to number of samples rather than number of reads. The reason why this behavior is chosen relates to a rule of thumb commonly used in linear regression - if a microbe appears in less than 10 samples, it is difficult to fit a meaningful line for that microbe. In other words, there is not even resolution in the study to say anything meaningful about that microbe in the context of differential abundance analysis. The `--min-feature-count` filter is applied _after_ the `--min-sample-count` is applied, so it's possible for (for example) a sample to get filtered out which in turn causes a feature to get filtered out.
608608

609609
## 7.4. FAQs: Output files
610610

songbird/tests/test_util.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,94 @@ def test_match_and_filter_big_table(self):
210210
self.assertEqual(res_design.shape[0], drop_design.shape[0])
211211
self.assertEqual(res_metadata.shape[0], drop_metadata.shape[0])
212212

213+
def test_match_and_filter_exact_minimum_feature_count_used(self):
214+
formula = 'C(categorical) + continuous'
215+
216+
# None of the features should be dropped when min_feature_count is 2 or
217+
# below, since all features occur in at least two samples.
218+
res = match_and_filter(self.big_table, self.metadata, formula,
219+
min_sample_count=0, min_feature_count=2)
220+
221+
self.assertEqual(
222+
set(res[0].ids("observation")),
223+
set(self.big_table.ids("observation"))
224+
)
225+
# Samples should be unchanged (well, s9 gets filtered because it's not
226+
# in the metadata, but everything else is kept)
227+
self.assertEqual(
228+
set(res[0].ids()),
229+
set(['s1', 's2', 's3', 's4', 's5', 's6'])
230+
)
231+
232+
# When we bump min_feature_count up to 3, though, stuff should start
233+
# getting filtered -- the last three features in self.big_table all
234+
# occur in just two samples each.
235+
res = match_and_filter(self.big_table, self.metadata, formula,
236+
min_sample_count=0, min_feature_count=3)
237+
self.assertEqual(
238+
set(res[0].ids("observation")),
239+
set(['o1', 'o2', 'o3', 'o4'])
240+
)
241+
# Again, non-s9 samples unchanged because min sample count is 0
242+
self.assertEqual(
243+
set(res[0].ids()),
244+
set(['s1', 's2', 's3', 's4', 's5', 's6'])
245+
)
246+
247+
def test_match_and_filter_exact_minimum_sample_count_used(self):
248+
formula = 'C(categorical) + continuous'
249+
250+
# None of the samples (except s9, which isn't in the metadata)
251+
# should be dropped when min_sample_count is 3 or
252+
# below, since all samples have a total count of at least 3.
253+
res = match_and_filter(self.big_table, self.metadata, formula,
254+
min_sample_count=3, min_feature_count=0)
255+
256+
self.assertEqual(
257+
set(res[0].ids()),
258+
set(self.big_table.ids()) - set(["s9"])
259+
)
260+
# Features should remain unchanged due to min feature count being 0.
261+
self.assertEqual(
262+
set(res[0].ids("observation")),
263+
set(self.big_table.ids("observation"))
264+
)
265+
266+
# When we bump min_feature_count up to 4, s2 and s4 should get filtered
267+
# since they each have a total count of 3
268+
res = match_and_filter(self.big_table, self.metadata, formula,
269+
min_sample_count=4, min_feature_count=0)
270+
self.assertEqual(set(res[0].ids()), set(['s1', 's3', 's5', 's6']))
271+
# Features should remain unchanged due to min feature count being 0.
272+
self.assertEqual(
273+
set(res[0].ids("observation")),
274+
set(self.big_table.ids("observation"))
275+
)
276+
277+
def test_match_and_filter_exact_minima_together_in_sequence(self):
278+
formula = 'C(categorical) + continuous'
279+
280+
res = match_and_filter(self.big_table, self.metadata, formula,
281+
min_sample_count=4, min_feature_count=4)
282+
283+
# Since min_sample_count is 4, s2 and s4 get filtered out due to having
284+
# a total count of 3
285+
self.assertEqual(
286+
set(res[0].ids()),
287+
set(['s1', 's3', 's5', 's6'])
288+
)
289+
# Since min_feature_count is 4, o4, o5, o6, and o7 all get filtered out
290+
# since they were present in 3, 2, 2, and 2 samples respectively
291+
# *BEFORE* the sample filtering was done. However, in addition to this,
292+
# o2 and o3 will get filtered out because (since we filtered out s2 and
293+
# s4) they are now not present in 4 samples! And as match_and_filter's
294+
# docs explain, the sample filtering is done first, and this can impact
295+
# the feature filtering. So... "oops, all o1".
296+
self.assertEqual(
297+
set(res[0].ids("observation")),
298+
set(['o1'])
299+
)
300+
213301
def test_split_training_random(self):
214302
np.random.seed(0)
215303
design = pd.DataFrame(

songbird/util.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,11 @@ def match_and_filter(table, metadata, formula,
135135
136136
This will also return the patsy representation.
137137
138+
NOTE that this does sample filtering before read filtering -- it's possible
139+
that this could impact the results in unintuitive ways, for example a
140+
sample is filtered out which causes a feature to then be filtered out for
141+
not being present in enough samples.
142+
138143
Parameters
139144
----------
140145
table : biom.Table
@@ -148,14 +153,17 @@ def match_and_filter(table, metadata, formula,
148153
Filtered biom table
149154
metadata : pd.DataFrame
150155
Sample metadata
156+
design : patsy.DesignMatrix
157+
Design matrix created from the formula and filtered metadata
151158
"""
152-
# match them
153-
159+
# Use >= so that samples with exactly "min_sample_count" counts, or
160+
# features present in exactly "min_feature_count" samples, are *not*
161+
# filtered out.
154162
def sample_filter(val, id_, md):
155-
return id_ in metadata.index and np.sum(val) > min_sample_count
163+
return id_ in metadata.index and np.sum(val) >= min_sample_count
156164

157165
def read_filter(val, id_, md):
158-
return np.sum(val > 0) > min_feature_count
166+
return np.sum(val > 0) >= min_feature_count
159167

160168
table = table.filter(sample_filter, axis='sample', inplace=False)
161169
table = table.filter(read_filter, axis='observation', inplace=False)

0 commit comments

Comments
 (0)