Skip to content

Commit f53cda8

Browse files
authored
Merge pull request #47 from maize-genetics/fix-read-gff-parent-id
Fix reading GFF files where parent lines are after child lines
2 parents 01a14ca + 50177ee commit f53cda8

File tree

4 files changed

+252
-41
lines changed

4 files changed

+252
-41
lines changed

build.gradle.kts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -114,7 +114,6 @@ dependencies {
114114
implementation("org.jgrapht:jgrapht-core:1.5.1")
115115

116116

117-
implementation("io.github.oshai:kotlin-logging-jvm:5.0.0")
118117
implementation(group = "ch.qos.logback", name = "logback-classic", version = "1.2.6")
119118
implementation("it.unimi.dsi:fastutil:8.5.12")
120119
implementation("org.lz4:lz4-java:1.8.0")

src/main/kotlin/biokotlin/featureTree/Graph.kt

Lines changed: 252 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,9 @@
11
package biokotlin.featureTree
22

33
import biokotlin.util.bufferedReader
4-
import io.github.oshai.kotlinlogging.KotlinLogging
54
import kotlinx.coroutines.async
65
import kotlinx.coroutines.awaitAll
76
import kotlinx.coroutines.runBlocking
8-
import java.io.FileReader
97
import java.util.*
108
import java.util.concurrent.ConcurrentHashMap
119
import java.util.concurrent.ConcurrentMap
@@ -354,6 +352,14 @@ internal class Graph private constructor(
354352
assert { invariants() }
355353
}
356354

355+
/**
 * Links this node to every node in [newParents] that is not already one of its parents.
 *
 * Each newly linked parent is added to `parents`, and this node is appended to the end of that
 * parent's `children`. Afterwards `incrementTopo()` is called and `invariants()` is asserted.
 *
 * NOTE(review): the filter only compares against the current contents of `parents`, so a node that
 * appears more than once in [newParents] itself is not collapsed here — confirm callers never pass
 * duplicates.
 *
 * @param newParents candidate parents; entries already present in `parents` are ignored.
 */
fun addParents(newParents: List<Node>) {
356+
// Only parents that are not yet linked, so existing links are not duplicated.
val parentsNotAdded = newParents.filter { it !in parents }
357+
parents.addAll(parentsNotAdded)
358+
// Keep the relationship bidirectional: register this node as the last child of each new parent.
parentsNotAdded.forEach { it.children.addLast(this) }
359+
incrementTopo()
360+
assert { invariants() }
361+
}
362+
357363
/**
358364
* Puts node and all orphaned descendants into a deleted state where they cannot be read from nor written to.
359365
*/
@@ -831,6 +837,17 @@ internal class Graph private constructor(
831837
fun containsName(name: String): Boolean = byName.contains(name)
832838

833839
companion object {
840+
841+
/**
 * Builds a [ParseException] by forwarding all five arguments, unchanged and in order, to its constructor.
 *
 * The call sites visible in this file pass the 1-based line counter, the raw line as read from the
 * file (before any text correction), the text-correcting function (or null), the `file` argument
 * given to the reader, and a message describing the problem.
 *
 * @param lineNumber number of the offending line.
 * @param line text of the offending line.
 * @param textCorrector optional function applied to lines before parsing, or null if none.
 * @param file the file being parsed.
 * @param helpText description of what is wrong with the line.
 * @return the newly constructed exception; this function does not throw it.
 */
fun parseException(
842+
lineNumber: Int,
843+
line: String,
844+
textCorrector: ((String) -> String)?,
845+
file: String,
846+
helpText: String
847+
): ParseException {
848+
return ParseException(lineNumber, line, textCorrector, file, helpText)
849+
}
850+
834851
/**
835852
* Returns a graph representation of the file.
836853
* @see [Genome.fromFile]
@@ -845,83 +862,283 @@ internal class Graph private constructor(
845862
): Graph {
846863
// PLANNED: concurrent reading of ### directive
847864

848-
val graph = Graph(multipleParentage)
849-
modifySchema?.invoke(graph.schema)
865+
val graph = getGraph(file, textCorrecter, multipleParentage, modifySchema)
866+
850867
bufferedReader(file).useLines { lines ->
851868
var lineCounter = 0
869+
var commentCounter = 0
852870
for (line in lines) {
871+
853872
lineCounter++
854873
if (line.isEmpty() || line.isBlank()) continue //skip blank lines
855874
// PLANNED: comment support
856875
if (line.startsWith("#")) {
857-
// This has been known to print over 4000 lines of comments in a single file, which is not useful.
858-
//logger.info { "Comments not yet supported. Comment at line $lineCounter discarded: $line" }
876+
commentCounter++
877+
if (commentCounter == 1) {
878+
println("Comments not yet supported. Comment at line $lineCounter discarded: $line")
879+
}
859880
continue
860881
}
861882

862883
val corrected = textCorrecter?.invoke(line) ?: line
863884

864-
fun parseException(helpText: String): ParseException {
865-
return ParseException(lineCounter, line, textCorrecter, file, helpText)
885+
val split = corrected.split("\t")
886+
887+
if (split.size != 9) throw parseException(
888+
lineCounter,
889+
line,
890+
textCorrecter,
891+
file,
892+
"Should contain 9 tab-delineated columns. $corrected"
893+
)
894+
895+
val seqid = split[0]
896+
val source = split[1]
897+
val type = split[2]
898+
val start = split[3].toIntOrNull()
899+
?: throw parseException(
900+
lineCounter,
901+
line,
902+
textCorrecter,
903+
file,
904+
"Cannot parse start ${split[3]} into an integer."
905+
)
906+
val end = split[4].toIntOrNull()
907+
?: throw parseException(
908+
lineCounter,
909+
line,
910+
textCorrecter,
911+
file,
912+
"Cannot parse start ${split[4]} into an integer."
913+
)
914+
val score = split[5].toDoubleOrNull()
915+
val strand =
916+
Strand.fromString(split[6]) ?: throw parseException(
917+
lineCounter,
918+
line,
919+
textCorrecter,
920+
file,
921+
"Cannot parse ${split[6]} into a strand."
922+
)
923+
val phase =
924+
Phase.fromString(split[7]) ?: throw parseException(
925+
lineCounter,
926+
line,
927+
textCorrecter,
928+
file,
929+
"Cannot parse ${split[7]} into a phase."
930+
)
931+
if (!split[8].trimEnd(';').split(';').map { it.split('=').first() }.allUnique()) {
932+
throw parseException(
933+
lineCounter,
934+
line,
935+
textCorrecter,
936+
file,
937+
"Cannot have multiple instances of the same tag"
938+
)
939+
}
940+
val attributes = split[8].trimEnd(';').split(';').associate {
941+
val tagValue = it.split('=')
942+
if (tagValue.size != 2)
943+
throw parseException(
944+
lineCounter,
945+
line,
946+
textCorrecter,
947+
file,
948+
"All distinct attributes must be separated by a ; character."
949+
)
950+
val values = tagValue[1].split(',')
951+
tagValue[0] to values
952+
}
953+
954+
if ((attributes["ID"]?.size ?: 0) > 1) throw parseException(
955+
lineCounter,
956+
line,
957+
textCorrecter,
958+
file,
959+
"Cannot have multiple IDs."
960+
)
961+
val id = attributes["ID"]?.get(0)
962+
963+
val parentIDs = attributes["Parent"]
964+
val parents = parentIDs?.map {
965+
graph.byID(it)
966+
?: throw parseException(
967+
lineCounter,
968+
line,
969+
textCorrecter,
970+
file,
971+
"Contains Parent attribute $it, which is not the ID of a previous line."
972+
)
973+
} ?: listOf(graph.root)
974+
val resolvedParents = if (parentResolver == null || parents.size <= 1) {
975+
parents
976+
} else {
977+
listOf(parents[parentResolver(corrected, parents.map { IFeature(it as DataNode) })])
866978
}
867979

980+
if (resolvedParents.size > 1 && !multipleParentage)
981+
throw parseException(
982+
lineCounter,
983+
line,
984+
textCorrecter,
985+
file,
986+
"Must enable multipleParentage to have features with multiple parents"
987+
)
988+
989+
if (id == null) {
990+
graph.DataNode(
991+
resolvedParents.toMutableList(), LinkedList(), Data(
992+
seqid,
993+
source,
994+
type,
995+
mutableListOf(start..end),
996+
score,
997+
strand,
998+
mutableListOf(phase),
999+
attributes.toMutableMap()
1000+
)
1001+
)
1002+
} else { // nodes with an ID were created in the first pass
1003+
val node = graph.byID(id)!!
1004+
node.addParents(resolvedParents)
1005+
}
1006+
1007+
}
1008+
}
1009+
return graph
1010+
}
1011+
1012+
/**
1013+
* First pass through GFF file to get nodes.
1014+
*/
1015+
private fun getGraph(
1016+
file: String,
1017+
textCorrecter: ((String) -> String)?, // PLANNED: robust convenience function framework
1018+
// parentResolver: ParentResolver?,
1019+
multipleParentage: Boolean,
1020+
modifySchema: (TypeSchema.() -> Unit)?
1021+
): Graph {
1022+
// PLANNED: concurrent reading of ### directive
1023+
1024+
val graph = Graph(multipleParentage)
1025+
modifySchema?.invoke(graph.schema)
1026+
bufferedReader(file).useLines { lines ->
1027+
var lineCounter = 0
1028+
var commentCounter = 0
1029+
for (line in lines) {
1030+
lineCounter++
1031+
if (line.isEmpty() || line.isBlank()) continue //skip blank lines
1032+
// PLANNED: comment support
1033+
if (line.startsWith("#")) {
1034+
commentCounter++
1035+
if (commentCounter == 1) {
1036+
println("Comments not yet supported. Comment at line $lineCounter discarded: $line")
1037+
}
1038+
continue
1039+
}
1040+
1041+
val corrected = textCorrecter?.invoke(line) ?: line
1042+
8681043
val split = corrected.split("\t")
8691044

870-
if (split.size != 9) throw parseException("Should contain 9 tab-delineated columns. ${corrected}")
1045+
if (split.size != 9) throw parseException(
1046+
lineCounter,
1047+
line,
1048+
textCorrecter,
1049+
file,
1050+
"Should contain 9 tab-delineated columns. $corrected"
1051+
)
8711052

8721053
val seqid = split[0]
8731054
val source = split[1]
8741055
val type = split[2]
8751056
val start = split[3].toIntOrNull()
876-
?: throw parseException("Cannot parse start ${split[3]} into an integer.")
1057+
?: throw parseException(
1058+
lineCounter,
1059+
line,
1060+
textCorrecter,
1061+
file,
1062+
"Cannot parse start ${split[3]} into an integer."
1063+
)
8771064
val end = split[4].toIntOrNull()
878-
?: throw parseException("Cannot parse start ${split[4]} into an integer.")
1065+
?: throw parseException(
1066+
lineCounter,
1067+
line,
1068+
textCorrecter,
1069+
file,
1070+
"Cannot parse start ${split[4]} into an integer."
1071+
)
8791072
val score = split[5].toDoubleOrNull()
8801073
val strand =
881-
Strand.fromString(split[6]) ?: throw parseException("Cannot parse ${split[6]} into a strand.")
1074+
Strand.fromString(split[6]) ?: throw parseException(
1075+
lineCounter,
1076+
line,
1077+
textCorrecter,
1078+
file,
1079+
"Cannot parse ${split[6]} into a strand."
1080+
)
8821081
val phase =
883-
Phase.fromString(split[7]) ?: throw parseException("Cannot parse ${split[7]} into a phase.")
884-
if (!split[8].trimEnd(';').split(';').map { it.split('=').first() }.allUnique() ) {
885-
throw parseException("Cannot have multiple instances of the same tag")
1082+
Phase.fromString(split[7]) ?: throw parseException(
1083+
lineCounter,
1084+
line,
1085+
textCorrecter,
1086+
file,
1087+
"Cannot parse ${split[7]} into a phase."
1088+
)
1089+
if (!split[8].trimEnd(';').split(';').map { it.split('=').first() }.allUnique()) {
1090+
throw parseException(
1091+
lineCounter,
1092+
line,
1093+
textCorrecter,
1094+
file,
1095+
"Cannot have multiple instances of the same tag"
1096+
)
8861097
}
8871098
val attributes = split[8].trimEnd(';').split(';').associate {
8881099
val tagValue = it.split('=')
8891100
if (tagValue.size != 2)
890-
throw parseException("All distinct attributes must be separated by a ; character.")
1101+
throw parseException(
1102+
lineCounter,
1103+
line,
1104+
textCorrecter,
1105+
file,
1106+
"All distinct attributes must be separated by a ; character."
1107+
)
8911108
val values = tagValue[1].split(',')
8921109
tagValue[0] to values
8931110
}
8941111

895-
if ((attributes["ID"]?.size ?: 0) > 1) throw parseException("Cannot have multiple IDs.")
1112+
if ((attributes["ID"]?.size ?: 0) > 1) throw parseException(
1113+
lineCounter,
1114+
line,
1115+
textCorrecter,
1116+
file,
1117+
"Cannot have multiple IDs."
1118+
)
8961119
val id = attributes["ID"]?.get(0)
8971120
if (id != null) {
8981121
val existing = graph.byID(id)
8991122
if (existing != null) {
9001123
val compatible =
9011124
existing.seqid == seqid || existing.source == source || existing.type == type ||
9021125
existing.score == score || existing.strand == strand
903-
if (!compatible) throw parseException("Shares ID \"$id\" with $existing but they are not compatible.")
1126+
if (!compatible) throw parseException(
1127+
lineCounter,
1128+
line,
1129+
textCorrecter,
1130+
file,
1131+
"Shares ID \"$id\" with $existing but they are not compatible."
1132+
)
9041133
existing.addDiscontinuity(start..end, phase)
9051134
continue
9061135
}
907-
}
908-
909-
val parentIDs = attributes["Parent"]
910-
val parents = parentIDs?.map {
911-
graph.byID(it)
912-
?: throw parseException("Contains Parent attribute $it, which is not the ID of a previous line.")
913-
} ?: listOf(graph.root)
914-
val resolvedParents = if (parentResolver == null || parents.size <= 1) {
915-
parents
9161136
} else {
917-
listOf(parents[parentResolver(corrected, parents.map { IFeature(it as DataNode) })])
1137+
continue
9181138
}
9191139

920-
if (resolvedParents.size > 1 && !multipleParentage)
921-
throw parseException("Must enable multipleParentage to have features with multiple parents")
922-
9231140
graph.DataNode(
924-
resolvedParents.toMutableList(), LinkedList(), Data(
1141+
mutableListOf(), LinkedList(), Data(
9251142
seqid,
9261143
source,
9271144
type,
@@ -934,9 +1151,9 @@ internal class Graph private constructor(
9341151
)
9351152
}
9361153
}
1154+
9371155
return graph
9381156
}
939-
}
940-
}
9411157

942-
private val logger = KotlinLogging.logger {}
1158+
}
1159+
}

src/main/kotlin/biokotlin/genome/MAFProcessingUtils.kt

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@ import com.google.common.collect.Range
55
import com.google.common.collect.RangeMap
66
import com.google.common.collect.Sets
77
import com.google.common.collect.TreeRangeMap
8-
import io.github.oshai.kotlinlogging.KotlinLogging
98
import org.jetbrains.kotlinx.dataframe.DataFrame
109
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
1110
import java.io.BufferedReader
@@ -29,7 +28,6 @@ import java.util.stream.Collectors
2928
// Data class to be used when creating a dataFrame for chrom percent coverage statistics
3029
// This may be used if can get Kotlin DataFrame vs Krangl DataFrame to work.
3130
// One record of per-chromosome statistics: a String contig plus an Int and two Double values.
// NOTE(review): the names suggest a contig identifier, a base-pair count for the region, and
// coverage / identity percentages — TODO confirm units and ranges against the code that builds these.
data class ChromStats(val contig: String, val numRegionBPs: Int, val percentCov: Double, val percentId: Double)
32-
private val logger = KotlinLogging.logger {}
3331
fun createWiggleFilesFromCoverageIdentity(coverage:IntArray, identity:IntArray, contig:String, outputDir:String) {
3432

3533
// There will be 2 wiggle files created: 1 for identity and 1 for coverage

0 commit comments

Comments
 (0)