Skip to content

Fix reading GFF files where parent lines are after child lines #47

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ dependencies {
implementation("org.jgrapht:jgrapht-core:1.5.1")


implementation("io.github.oshai:kotlin-logging-jvm:5.0.0")
implementation(group = "ch.qos.logback", name = "logback-classic", version = "1.2.6")
implementation("it.unimi.dsi:fastutil:8.5.12")
implementation("org.lz4:lz4-java:1.8.0")
Expand Down
287 changes: 252 additions & 35 deletions src/main/kotlin/biokotlin/featureTree/Graph.kt
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
package biokotlin.featureTree

import biokotlin.util.bufferedReader
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.runBlocking
import java.io.FileReader
import java.util.*
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ConcurrentMap
Expand Down Expand Up @@ -354,6 +352,14 @@ internal class Graph private constructor(
assert { invariants() }
}

/**
 * Links this node beneath every node of [newParents] that is not already one of its parents.
 *
 * Nodes already present in [parents] are skipped, and repeated entries within [newParents]
 * are collapsed, so a single call never registers the same parent/child link twice.
 * Each newly linked parent gets this node appended to the end of its children.
 *
 * @param newParents candidate parents; may include nodes that are already parents of this node
 * and may contain the same node more than once.
 */
fun addParents(newParents: List<Node>) {
    // The membership filter is evaluated before anything is added, so on its own it lets a
    // parent that appears twice in newParents through twice; distinct() closes that gap.
    val parentsNotAdded = newParents.filter { it !in parents }.distinct()
    parents.addAll(parentsNotAdded)
    parentsNotAdded.forEach { it.children.addLast(this) }
    incrementTopo()
    assert { invariants() }
}

/**
* Puts node and all orphaned descendants into a deleted state where they cannot be read from nor written to.
*/
Expand Down Expand Up @@ -831,6 +837,17 @@ internal class Graph private constructor(
fun containsName(name: String): Boolean = byName.contains(name)

companion object {

/**
 * Creates the [ParseException] describing a problem found on one line of an input file.
 *
 * All arguments are forwarded unchanged to the [ParseException] constructor.
 *
 * @param lineNumber number of the offending line within [file].
 * @param line text of the offending line.
 * @param textCorrector the text-correcting function in use while parsing, or `null` if none.
 * @param file the file being parsed.
 * @param helpText explanation of what is wrong with the line.
 * @return the exception, ready to be thrown by the caller.
 */
fun parseException(
    lineNumber: Int,
    line: String,
    textCorrector: ((String) -> String)?,
    file: String,
    helpText: String
): ParseException = ParseException(lineNumber, line, textCorrector, file, helpText)

/**
* Returns a graph representation of the file.
* @see [Genome.fromFile]
Expand All @@ -845,83 +862,283 @@ internal class Graph private constructor(
): Graph {
// PLANNED: concurrent reading of ### directive

val graph = Graph(multipleParentage)
modifySchema?.invoke(graph.schema)
val graph = getGraph(file, textCorrecter, multipleParentage, modifySchema)

bufferedReader(file).useLines { lines ->
var lineCounter = 0
var commentCounter = 0
for (line in lines) {

lineCounter++
if (line.isEmpty() || line.isBlank()) continue //skip blank lines
// PLANNED: comment support
if (line.startsWith("#")) {
// This has been known to print over 4000 lines of comments in a single file, which is not useful.
//logger.info { "Comments not yet supported. Comment at line $lineCounter discarded: $line" }
commentCounter++
if (commentCounter == 1) {
println("Comments not yet supported. Comment at line $lineCounter discarded: $line")
}
continue
}

val corrected = textCorrecter?.invoke(line) ?: line

fun parseException(helpText: String): ParseException {
return ParseException(lineCounter, line, textCorrecter, file, helpText)
val split = corrected.split("\t")

if (split.size != 9) throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Should contain 9 tab-delineated columns. $corrected"
)

val seqid = split[0]
val source = split[1]
val type = split[2]
val start = split[3].toIntOrNull()
?: throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot parse start ${split[3]} into an integer."
)
val end = split[4].toIntOrNull()
?: throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot parse start ${split[4]} into an integer."
)
val score = split[5].toDoubleOrNull()
val strand =
Strand.fromString(split[6]) ?: throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot parse ${split[6]} into a strand."
)
val phase =
Phase.fromString(split[7]) ?: throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot parse ${split[7]} into a phase."
)
if (!split[8].trimEnd(';').split(';').map { it.split('=').first() }.allUnique()) {
throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot have multiple instances of the same tag"
)
}
val attributes = split[8].trimEnd(';').split(';').associate {
val tagValue = it.split('=')
if (tagValue.size != 2)
throw parseException(
lineCounter,
line,
textCorrecter,
file,
"All distinct attributes must be separated by a ; character."
)
val values = tagValue[1].split(',')
tagValue[0] to values
}

if ((attributes["ID"]?.size ?: 0) > 1) throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot have multiple IDs."
)
val id = attributes["ID"]?.get(0)

val parentIDs = attributes["Parent"]
val parents = parentIDs?.map {
graph.byID(it)
?: throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Contains Parent attribute $it, which is not the ID of a previous line."
)
} ?: listOf(graph.root)
val resolvedParents = if (parentResolver == null || parents.size <= 1) {
parents
} else {
listOf(parents[parentResolver(corrected, parents.map { IFeature(it as DataNode) })])
}

if (resolvedParents.size > 1 && !multipleParentage)
throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Must enable multipleParentage to have features with multiple parents"
)

if (id == null) {
graph.DataNode(
resolvedParents.toMutableList(), LinkedList(), Data(
seqid,
source,
type,
mutableListOf(start..end),
score,
strand,
mutableListOf(phase),
attributes.toMutableMap()
)
)
} else { // nodes with an ID were created in the first pass
val node = graph.byID(id)!!
node.addParents(resolvedParents)
}

}
}
return graph
}

/**
* First pass through GFF file to get nodes.
*/
private fun getGraph(
file: String,
textCorrecter: ((String) -> String)?, // PLANNED: robust convenience function framework
// parentResolver: ParentResolver?,
multipleParentage: Boolean,
modifySchema: (TypeSchema.() -> Unit)?
): Graph {
// PLANNED: concurrent reading of ### directive

val graph = Graph(multipleParentage)
modifySchema?.invoke(graph.schema)
bufferedReader(file).useLines { lines ->
var lineCounter = 0
var commentCounter = 0
for (line in lines) {
lineCounter++
if (line.isEmpty() || line.isBlank()) continue //skip blank lines
// PLANNED: comment support
if (line.startsWith("#")) {
commentCounter++
if (commentCounter == 1) {
println("Comments not yet supported. Comment at line $lineCounter discarded: $line")
}
continue
}

val corrected = textCorrecter?.invoke(line) ?: line

val split = corrected.split("\t")

if (split.size != 9) throw parseException("Should contain 9 tab-delineated columns. ${corrected}")
if (split.size != 9) throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Should contain 9 tab-delineated columns. $corrected"
)

val seqid = split[0]
val source = split[1]
val type = split[2]
val start = split[3].toIntOrNull()
?: throw parseException("Cannot parse start ${split[3]} into an integer.")
?: throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot parse start ${split[3]} into an integer."
)
val end = split[4].toIntOrNull()
?: throw parseException("Cannot parse start ${split[4]} into an integer.")
?: throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot parse start ${split[4]} into an integer."
)
val score = split[5].toDoubleOrNull()
val strand =
Strand.fromString(split[6]) ?: throw parseException("Cannot parse ${split[6]} into a strand.")
Strand.fromString(split[6]) ?: throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot parse ${split[6]} into a strand."
)
val phase =
Phase.fromString(split[7]) ?: throw parseException("Cannot parse ${split[7]} into a phase.")
if (!split[8].trimEnd(';').split(';').map { it.split('=').first() }.allUnique() ) {
throw parseException("Cannot have multiple instances of the same tag")
Phase.fromString(split[7]) ?: throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot parse ${split[7]} into a phase."
)
if (!split[8].trimEnd(';').split(';').map { it.split('=').first() }.allUnique()) {
throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot have multiple instances of the same tag"
)
}
val attributes = split[8].trimEnd(';').split(';').associate {
val tagValue = it.split('=')
if (tagValue.size != 2)
throw parseException("All distinct attributes must be separated by a ; character.")
throw parseException(
lineCounter,
line,
textCorrecter,
file,
"All distinct attributes must be separated by a ; character."
)
val values = tagValue[1].split(',')
tagValue[0] to values
}

if ((attributes["ID"]?.size ?: 0) > 1) throw parseException("Cannot have multiple IDs.")
if ((attributes["ID"]?.size ?: 0) > 1) throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Cannot have multiple IDs."
)
val id = attributes["ID"]?.get(0)
if (id != null) {
val existing = graph.byID(id)
if (existing != null) {
val compatible =
existing.seqid == seqid || existing.source == source || existing.type == type ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe move this to its own function. Too many boolean checks in a row.

existing.score == score || existing.strand == strand
if (!compatible) throw parseException("Shares ID \"$id\" with $existing but they are not compatible.")
if (!compatible) throw parseException(
lineCounter,
line,
textCorrecter,
file,
"Shares ID \"$id\" with $existing but they are not compatible."
)
existing.addDiscontinuity(start..end, phase)
continue
}
}

val parentIDs = attributes["Parent"]
val parents = parentIDs?.map {
graph.byID(it)
?: throw parseException("Contains Parent attribute $it, which is not the ID of a previous line.")
} ?: listOf(graph.root)
val resolvedParents = if (parentResolver == null || parents.size <= 1) {
parents
} else {
listOf(parents[parentResolver(corrected, parents.map { IFeature(it as DataNode) })])
continue
}

if (resolvedParents.size > 1 && !multipleParentage)
throw parseException("Must enable multipleParentage to have features with multiple parents")

graph.DataNode(
resolvedParents.toMutableList(), LinkedList(), Data(
mutableListOf(), LinkedList(), Data(
seqid,
source,
type,
Expand All @@ -934,9 +1151,9 @@ internal class Graph private constructor(
)
}
}

return graph
}
}
}

private val logger = KotlinLogging.logger {}
}
}
2 changes: 0 additions & 2 deletions src/main/kotlin/biokotlin/genome/MAFProcessingUtils.kt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import com.google.common.collect.Range
import com.google.common.collect.RangeMap
import com.google.common.collect.Sets
import com.google.common.collect.TreeRangeMap
import io.github.oshai.kotlinlogging.KotlinLogging
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import java.io.BufferedReader
Expand All @@ -29,7 +28,6 @@ import java.util.stream.Collectors
// Data class to be used when creating a dataFrame for chrom percent coverage statistics
// This may be used if can get Kotlin DataFrame vs Krangl DataFrame to work.
data class ChromStats(val contig: String, val numRegionBPs: Int, val percentCov: Double, val percentId: Double)
private val logger = KotlinLogging.logger {}
fun createWiggleFilesFromCoverageIdentity(coverage:IntArray, identity:IntArray, contig:String, outputDir:String) {

// There will be 2 wiggle files created: 1 for identity and 1 for coverage
Expand Down
Loading