From c0a02d6e5deea1cff6415373f022aaca60d9347e Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Tue, 13 Aug 2024 23:27:19 -0500 Subject: [PATCH 1/7] created encode_geneotype function and updated get_geno NEED to change type of val in Geno --- src/BigRiverQTL.jl | 3 + src/io/export_to_type.jl | 217 ++++++------------------------------ src/io/io_utils.jl | 98 ++++++++++++++-- src/struct/datastructure.jl | 1 - test/datastruc_test.jl | 74 ++++++------ test/runtests.jl | 5 +- test/utils_test.jl | 29 +++++ 7 files changed, 194 insertions(+), 233 deletions(-) create mode 100644 test/utils_test.jl diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl index 8de9148..a91854b 100644 --- a/src/BigRiverQTL.jl +++ b/src/BigRiverQTL.jl @@ -61,8 +61,11 @@ module BigRiverQTL # IO # ###### include("io/io_utils.jl") + export encode_genotype include("io/export_to_type.jl") export get_geneticstudydata + export get_gmap, get_alleles, get_chromosome, get_crossinfo, get_crosstype, get_geno + ######### # Plots # diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl index 678f336..1583b43 100644 --- a/src/io/export_to_type.jl +++ b/src/io/export_to_type.jl @@ -63,7 +63,6 @@ function get_crosstype(filename::String) #crosstype if (in("crosstype", keys(jsondict))) crosstype = jsondict["crosstype"] - else throw("Error: CrossType not found") end @@ -86,8 +85,6 @@ Creates a `Alleles` type/struct from control file in json format. # Output Returns a `Alleles` type/struct. - - """ function get_alleles(filename::String) # check if filename is directory or file name @@ -101,20 +98,10 @@ function get_alleles(filename::String) #alleles alleles = jsondict["alleles"] - - return Alleles(alleles) end - - - - - - - - """ get_genotype(file::String) @@ -127,8 +114,6 @@ Creates a `GenoType` type/struct from control file in json format. # Output Returns a `GenoType` type/struct. - - """ function get_genotype(filename::String) # check if filename is directory or file name @@ -137,31 +122,18 @@ function get_genotype(filename::String) # load file jsondict = BigRiverQTL.parse_json(filename) - # make type - - #genotype + # genotype if (in("genotypes", keys(jsondict))) label = jsondict["genotypes"] - else + # Rqtl2 default label = Dict{String, Int}("A" => 1, "H" => 2, "B" => 3, "D" => 4, "C" => 5) end - - return GenoType(label) end - - - - - - - - - """ get_genotranspose(file::String) @@ -174,8 +146,6 @@ Creates a `GenoTranspose` type/struct from control file in json format. # Output Returns a `GenoTranspose` type/struct. - - """ function get_genotranspose(filename::String) # check if filename is directory or file name @@ -184,29 +154,17 @@ function get_genotranspose(filename::String) # load file jsondict = BigRiverQTL.parse_json(filename) - # make type - - #genotype + #g enotype if (in("geno_transposed", keys(jsondict))) val = jsondict["geno_transposed"] - else val = FALSE end - - return GenoTranspose(val) end - - - - - - - """ get_geno(filename::String) @@ -220,90 +178,80 @@ Creates a `Geno` type/struct from gmap CSV file and geno CSV file. # Output Returns a `Geno` type/struct. - - """ function get_geno(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) # load file - jsondict = BigRiverQTL.parse_json(filename) - # make type - - #gmap + # check gmap file exists if (in("gmap", keys(jsondict))) gmapfile = joinpath(data_dir, jsondict["gmap"]) - else throw("Error: gmap not found in control file") end - gmap = get_gmap(gmapfile) - - #geno + # check geno file exists if (in("geno", keys(jsondict))) genofile = joinpath(data_dir, jsondict["geno"]) - else throw("Error: geno not found in control file") end - - # load file + # get gmap object gmap = get_gmap(gmapfile) + + # load gmap dataframe df_gmap = read_data(gmapfile) - df_geno = read_data(genofile) - # make type + # load geno dataframe + df_geno = read_data(genofile) - # samples + # get samples names samples = names(df_geno)[2:end] - # chromosomes + + # get chromosomes names chr = gmap.chr - # markers + + # get markers names marker = gmap.marker_name - # values - val = [Matrix{Int}(undef, length(samples), length(marker[i])) for i in 1:length(chr)] - for i in 1:length(chr) - for j in 1:length(marker[i]) - #uni = unique(Vector(df_geno[findfirst(x -> x == marker[i][j], marker[i]), 2:end])) - #map = Dict(uni[1] => 1, uni[2] => 2, uni[3] => 0) - map=get_genotype(filename).label - - val[i][:, j]=[map[v] for v in Vector(df_geno[findfirst(x -> x == marker[i][j], marker[i]), 2:end])] - end - end + #genotype + genotype = get_genotype(filename) + # values + # get encoded genetotypes + df_encoded = DataFrame( + encode_genotype(genotype.label, df_geno[:, 2:end] |> x -> Matrix(x)), + samples, + ) + + # built encoded geno DataFrames + insertcols!(df_encoded, 1, :marker => df_geno.marker) + + gdf = innerjoin( + select(df_gmap, [:chr, :marker]), + df_encoded, + on = :marker + ) |> x -> groupby(x, :chr); + + val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf]; + #crosstype crosstype = get_crosstype(filename) - #alleles alleles = get_alleles(filename) - - - #genotype - genotype = get_genotype(filename) - - - #genotranspose genotranspose = get_genotranspose(filename) - return Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose) end - - - - """ get_chromosome(filename::String) @@ -318,9 +266,6 @@ Creates a `Chromosome` type/struct from gmap CSV file and geno CSV file. # Output Returns a `Chromosome` type/struct. - - - """ function get_chromosome(gmapfile::String, genofile::String, number::Int) # load file @@ -343,26 +288,14 @@ function get_chromosome(gmapfile::String, genofile::String, number::Int) for j in 1:length(marker) #uni = unique(Vector(df_geno[findfirst(x -> x == marker[j], marker), 2:end])) #map = Dict(uni[1] => 1, uni[2] => 2, uni[3] => 0) - map=get_genotype(filename).label - val[:, j]=[map[v] for v in Vector(df_geno[findfirst(x -> x == marker[j], marker), 2:end])] + map = get_genotype(filename).label + val[:, j] = [map[v] for v in Vector(df_geno[findfirst(x -> x == marker[j], marker), 2:end])] end return Chromosome(name, marker, val) end - - - - - - - - - - - - """ get_geno2(filename::String) @@ -377,8 +310,6 @@ Creates a `Geno2` type/struct from gmap CSV file and geno CSV file. # Output Returns a `Geno2` type/struct. - - """ function get_geno2(gmapfile::String, genofile::String) # load file @@ -401,15 +332,6 @@ function get_geno2(gmapfile::String, genofile::String) end - - - - - - - - - """ get_pmap(filename::String) @@ -423,7 +345,6 @@ Creates a `Pmap` type/struct from Pmap CSV file. Returns a `Pmap` type/struct. - """ function get_pmap(filename::String) # check if filename is directory or file name @@ -447,8 +368,6 @@ function get_pmap(filename::String) end - - """ get_pheno(filename::String) @@ -461,8 +380,6 @@ Creates a `Pheno` type/struct from Pheno CSV file. # Output Returns a `Pheno` type/struct. - - """ function get_pheno(filename::String) # check if filename is directory or file name @@ -487,8 +404,6 @@ function get_pheno(filename::String) end - - """ get_phenocovar(filename::String) @@ -501,8 +416,6 @@ Creates a `Phenocovar` type/struct from phenocovar CSV file. # Output Returns a `Phenocovar` type/struct. - - """ function get_phenocovar(filename::String) # check if filename is directory or file name @@ -525,12 +438,6 @@ function get_phenocovar(filename::String) end - - - - - - """ get_crossinfo(filename::String) @@ -543,8 +450,6 @@ Creates a `Crossinfo` type/struct from crossinfo CSV file. # Output Returns a `Crossinfo` type/struct. - - """ function get_crossinfo(filename::String) # check if filename is directory or file name @@ -566,14 +471,6 @@ function get_crossinfo(filename::String) end - - - - - - - - """ get_isxchar(filename::String) @@ -586,8 +483,6 @@ Creates a `IsXChar` type/struct from gmap CSV file. # Output Returns a `IsXChar` type/struct. - - """ function get_isxchar(filename::String) # check if filename is directory or file name @@ -611,11 +506,6 @@ function get_isxchar(filename::String) end - - - - - """ get_isfemale(filename::String) @@ -628,8 +518,6 @@ Creates a `IsFemale` type/struct from control file in json format. # Output Returns a `IsFemale` type/struct. - - """ function get_isfemale(filename::String) # check if filename is directory or file name @@ -656,24 +544,6 @@ function get_isfemale(filename::String) end - - - - - - - - - - - - - - - - - - """ get_geneticstudydata(filename::String) @@ -686,16 +556,12 @@ Creates a `GeneticStudyData` type/struct from control file in json format. # Output Returns a `GeneticStudyData` type/struct. - - """ function get_geneticstudydata(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file - jsondict = BigRiverQTL.parse_json(filename) # make type @@ -756,16 +622,10 @@ function get_geneticstudydata(filename::String) #isfemale isfemale = get_isfemale(filename) - - #crossinfo crossinfofile = joinpath(data_dir, jsondict["cross_info"]["file"]) crossinfo = get_crossinfo(crossinfofile) - - - - return GeneticStudyData(gmap, geno, pmap, @@ -776,6 +636,3 @@ function get_geneticstudydata(filename::String) crossinfo) end - - - diff --git a/src/io/io_utils.jl b/src/io/io_utils.jl index 10b0db6..c023432 100644 --- a/src/io/io_utils.jl +++ b/src/io/io_utils.jl @@ -10,10 +10,7 @@ Parses a JSON file to a dictionary with keys containing the names of CSV file fo # Output Returns a dictionary - - """ - function parse_json(file::String) indict = open(file, "r") do f JSON.parse(f) @@ -22,9 +19,6 @@ function parse_json(file::String) end - - - """ read_data(filename::String) @@ -37,8 +31,6 @@ Writes a CSV file to data frame excluding the comments lines. # Output Returns a data frame. - - """ function read_data(filename::String) # read the file into lines @@ -51,8 +43,27 @@ function read_data(filename::String) return CSV.read(filename, DataFrame; header = startdata)#comment="#") end + + """ -Need description + get_control_file(filename::String) -> (String, String) + +Locate the control file and determine the directory where it resides. + +# Arguments +- `filename::String`: A string representing either the path to a file or a directory. + +# Returns +- `(String, String)`: A tuple containing the directory of the control file and the path +to the control file. If the provided `filename` is a directory, the function will search +for JSON files within and return the path to the first JSON file found. + +# Description +The function first resolves the absolute path of the provided `filename`. If `filename` +points to an existing file, the function identifies the directory containing this +file. If `filename` specifies a directory, the function lists all entries in this +directory, searches for the first JSON file, and updates `filename` to the path of this +JSON file. """ function get_control_file(filename::String) @@ -69,4 +80,71 @@ function get_control_file(filename::String) end return data_dir, filename -end \ No newline at end of file +end + +""" + encode_genotype(geno_dict::Dict{String, Any}, geno_val::AbstractArray) -> Matrix + +Encode a matrix of genotype values using a dictionary of predefined mappings. + +# Arguments +- `geno_dict::Dict{String, Any}`: A dictionary mapping genotype strings to integer codes. +- `geno_val::AbstractArray`: An array of genotype strings to be encoded. + +# Returns +- `Matrix{Union{Missing, Int64}}` or `Matrix{Int64}`: A matrix of the same dimensions as +`geno_val` where each genotype string is replaced by its corresponding integer code. +Genotypes not found in `geno_dict` are encoded as `missing`. + +# Description +The function first copies the provided `geno_dict` to preserve the original. It then identifies +any genotypes in `geno_val` that are not already present in the dictionary. A warning is issued +for these missing genotypes, and they are added to the dictionary with a mapping to `missing`. + +Each element in the input `geno_val` is replaced in the `encoded_val` matrix by its corresponding +integer code from `geno_dict`, handling both existing and newly added genotype mappings. + +# Examples +```julia +geno_dict = Dict{String, Any}("AA" => 1, "AB" => 2, "BB" => 3) +geno_val = ["AA", "AB", "BB", "BC"] +encoded_matrix = encode_genotype(geno_dict, geno_val) +println(encoded_matrix) +``` +""" +function encode_genotype(geno_dict::Dict{String, Any}, geno_val::AbstractArray) + # check if geno_val is a Vector + if size(geno_val, 2) == 1 + geno_val = reshape(geno_val, size(geno_val, 1), 1) + end + + # need to copy dict to preserve original + geno_dict = copy(geno_dict) + + # check uniqueness of genotype + unique_genotype = unique(geno_val) + dict_genotype = string.(keys(geno_dict)) + extra_genotype = setdiff(unique_genotype, dict_genotype) + + # initialize output + if !isempty(extra_genotype) + @warn "Presence of genotypes not encoded by dictionnary of the control file: " * join(extra_genotype, " & ") * + "\n Not specified genetotype will be encoded as missing" + + for i in eachindex(extra_genotype) + setindex!(geno_dict, missing, extra_genotype[i]) + end + encoded_val = Array{Union{Missing, Int64}}(undef, size(geno_val)) + else + encoded_val = Array{Int64}(undef, size(geno_val)) + end + + # encoding + for j in 1:size(geno_val, 2) + for i in 1:size(geno_val, 1) + encoded_val[i, j] = geno_dict[geno_val[i, j]] + end + end + + return encoded_val +end diff --git a/src/struct/datastructure.jl b/src/struct/datastructure.jl index e0f1c1a..c95db8c 100644 --- a/src/struct/datastructure.jl +++ b/src/struct/datastructure.jl @@ -193,5 +193,4 @@ struct GeneticStudyData isXchar::IsXChar isfemale::IsFemale crossinfo::CrossInfo - end diff --git a/test/datastruc_test.jl b/test/datastruc_test.jl index d1ad65b..b9fee2d 100644 --- a/test/datastruc_test.jl +++ b/test/datastruc_test.jl @@ -10,13 +10,13 @@ alleles = Alleles(["A", "B", "C"]) geno_type = GenoType(Dict("A" => 1, "H" => 2, "B" => 3)) geno_transpose = GenoTranspose(true) geno = Geno( - ["sample1"], - ["chr1"], - [["marker1"]], [[Int16(1) Int16(2); Int16(3) Int16(4)]], - CrossType("risib"), - Alleles(["A", "B"]), - GenoType(Dict("A" => 1)), - GenoTranspose(false) + ["sample1"], + ["chr1"], + [["marker1"]], [[Int16(1) Int16(2); Int16(3) Int16(4)]], + CrossType("risib"), + Alleles(["A", "B"]), + GenoType(Dict("A" => 1)), + GenoTranspose(false) ) =# #pheno = Pheno(["individual1", "individual2"], ["trait1", "trait2"], [1.0 2.0; nothing 3.0]) @@ -25,77 +25,73 @@ geno = Geno( data_dir = joinpath(@__DIR__, "data/BXD/"); file = joinpath(data_dir, "bxd.json"); - # Transforming data to a optimised and accessible data type data = get_geneticstudydata(file); - - - # Testing the Gmap type -gmap=data.gmap +gmap = data.gmap @testset "Gmap Tests" begin - # @test gmap.chr == ["chr1", "chr2"] - # @test length(gmap.marker_name) == 2 - @test all(isa.(gmap.pos, Vector{Float64})) + # @test gmap.chr == ["chr1", "chr2"] + # @test length(gmap.marker_name) == 2 + @test all(isa.(gmap.pos, Vector{Float64})) end # Testing the CrossType type -crosstype=data.geno.cross_type +crosstype = data.geno.cross_type @testset "CrossType Tests" begin - @test isa(cross_type.type, String) + @test isa(cross_type.type, String) end # Testing the CrossInfo type -cross_info=data.cross_info +cross_info = data.cross_info @testset "CrossInfo Tests" begin - @test length(cross_info.sample_id) == length(cross_info.direction) - @test all(isa.(cross_info.direction, Int)) + @test length(cross_info.sample_id) == length(cross_info.direction) + @test all(isa.(cross_info.direction, Int)) end # Testing the Alleles type -alleles=data.geno.alleles +alleles = data.geno.alleles @testset "Alleles Tests" begin - # @test length(alleles.val) == 3 - @test all(isa.(alleles.val, String)) + # @test length(alleles.val) == 3 + @test all(isa.(alleles.val, String)) end # Testing the GenoType type #@testset "GenoType Tests" begin - # @test geno_type.label["A"] == 1 - # @test geno_type.label["H"] == 2 +# @test geno_type.label["A"] == 1 +# @test geno_type.label["H"] == 2 #end # Testing the GenoTranspose type #@testset "GenoTranspose Tests" begin - # @test geno_transpose.val == true +# @test geno_transpose.val == true #end # Testing the Geno type #@testset "Geno Tests" begin - # @test length(geno.sample_id) == 1 - # @test size(geno.val[1]) == (2, 2) +# @test length(geno.sample_id) == 1 +# @test size(geno.val[1]) == (2, 2) #end # Testing the Pheno type -pheno=data.pheno +pheno = data.pheno @testset "Pheno Tests" begin - @test size(pheno.val, 1) == length(pheno.sample_id) - # @test pheno.val[2, 1] === nothing + @test size(pheno.val, 1) == length(pheno.sample_id) + # @test pheno.val[2, 1] === nothing end # Checks whether the output of `get_geneticstudydata` function is of type `GeneticStudyData` function BigRiverQTLData_struct_test(filename::String, testname::String) - @testset "BigRiverQTLData_struct_test" begin - - println("Test results for checking whether the output of `get_geneticstudydata` function is of type `GeneticStudyData`: ", - @test typeof(get_geneticstudydata(filename))==GeneticStudyData) - - - end - + @testset "BigRiverQTLData_struct_test" begin + + println("Test results for checking whether the output of `get_geneticstudydata` function is of type `GeneticStudyData`: ", + @test typeof(get_geneticstudydata(filename)) == GeneticStudyData) + + + end + end diff --git a/test/runtests.jl b/test/runtests.jl index 0a049e0..3dc7976 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,10 +11,9 @@ using DelimitedFiles # Test # ######## - @testset "BigRiverQTL" begin include("kinship_test.jl") include("datastruc_test.jl") - include("plots_test.jl") - + include("utils_test.jl") + include("plots_test.jl") end \ No newline at end of file diff --git a/test/utils_test.jl b/test/utils_test.jl new file mode 100644 index 0000000..0a939a5 --- /dev/null +++ b/test/utils_test.jl @@ -0,0 +1,29 @@ + +######################## +# Test encode_genotype # +######################## + +# Test Case 1: All genotypes are known +@testset "Encode Genotypes - Known Genotypes" begin + geno_dict = Dict{String, Any}("AA" => 1, "AB" => 2, "BB" => 3) + geno_val = ["AA" "AB" "BB"; "AB" "AA" "BB"] + expected_output = [1 2 3; 2 1 3] + @test encode_genotype(geno_dict, geno_val) == expected_output +end + +# Test Case 2: Some genotypes are unknown +@testset "Encode Genotypes - Unknown Genotypes" begin + geno_dict = Dict{String, Any}("AA" => 1, "AB" => 2, "BB" => 3) + geno_val = ["AA" "AB" "BC"; "AB" "AA" "BD"] + expected_output = [1 2 missing; 2 1 missing] + encoded_output = encode_genotype(geno_dict, geno_val) + @test (encoded_output[1:2,1:2] == expected_output[1:2,1:2]) && (ismissing(encoded_output[2,3])) +end + +# Test Case 3: All genotypes are unknown +@testset "Encode Genotypes - All Unknown" begin + geno_dict = Dict{String, Any}("AA" => 1, "AB" => 2) + geno_val = ["AC" "AD"; "AE" "AF"] + encoded_output = encode_genotype(geno_dict, geno_val) + @test sum(ismissing.(encoded_output)) == 4 +end From 38b13214aed9c9cf5919cc48597244ad8fc4fc11 Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Thu, 15 Aug 2024 00:48:25 -0500 Subject: [PATCH 2/7] corrected testing function for struct, corrected function to get gmap, geno, pmap --- src/BigRiverQTL.jl | 10 +-- src/io/export_to_type.jl | 151 +++++++++++++----------------------- src/struct/datastructure.jl | 65 +++++----------- test/datastruc_test.jl | 139 +++++++++++++++++++++------------ 4 files changed, 168 insertions(+), 197 deletions(-) diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl index a91854b..196818e 100644 --- a/src/BigRiverQTL.jl +++ b/src/BigRiverQTL.jl @@ -54,18 +54,19 @@ module BigRiverQTL # Structure # ############# include("struct/datastructure.jl") - export Gmap, Alleles, CrossType, GenoType, GenoTranspose,Geno, Pmap, Pheno, Phenocov, IsFemale, IsXChar, CrossInfo + export Gmap, Alleles, CrossType, GenoType, GenoTranspose, Geno, Pmap + export Pheno, Phenocov, IsFemale, IsXChar, CrossInfo export GeneticStudyData ###### # IO # ###### include("io/io_utils.jl") - export encode_genotype + export get_control_file, encode_genotype include("io/export_to_type.jl") export get_geneticstudydata - export get_gmap, get_alleles, get_chromosome, get_crossinfo, get_crosstype, get_geno - + export get_gmap, get_alleles, get_chromosome, get_crossinfo, get_crosstype + export get_geno, get_genotype, get_genotranspose, get_pmap ######### # Plots # @@ -79,7 +80,6 @@ module BigRiverQTL include("plots/plots_manhattan.jl") export plot_manhattan - include("plots/plots_eqtl.jl") export plot_eQTL diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl index 1583b43..81d0a89 100644 --- a/src/io/export_to_type.jl +++ b/src/io/export_to_type.jl @@ -18,15 +18,25 @@ function get_gmap(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file - gdf = groupby(read_data(filename), :chr) + # load control file + jsondict = parse_json(filename) - # make type + # check gmap file exists + if (in("gmap", keys(jsondict))) + gmapfile = joinpath(data_dir, jsondict["gmap"]) + else + throw("Error: gmap not found in control file") + end + + # load gmap file + gdf = groupby(read_data(gmapfile), :chr) # chromosome chr = [group.chr[1] for group in gdf] + # marker marker = [String.(group.marker) for group in gdf] + # relative position pos = [group.pos for group in gdf] @@ -71,8 +81,6 @@ function get_crosstype(filename::String) end - - """ get_alleles(file::String) @@ -201,7 +209,7 @@ function get_geno(filename::String) end # get gmap object - gmap = get_gmap(gmapfile) + gmap = get_gmap(filename) # load gmap dataframe df_gmap = read_data(gmapfile) @@ -252,86 +260,6 @@ function get_geno(filename::String) end -""" -get_chromosome(filename::String) - -Creates a `Chromosome` type/struct from gmap CSV file and geno CSV file. - -# Argument - -- `gmapfile` : A string containing the name(with directory) of the gmap CSV file. -- `genofile` : A string containing the name(with directory) of the geno CSV file. -- `number` : A number/index of the chromosome we want to get information. - -# Output - -Returns a `Chromosome` type/struct. -""" -function get_chromosome(gmapfile::String, genofile::String, number::Int) - # load file - gmap = get_gmap(gmapfile) - df_gmap = read_data(gmapfile) - df_geno = read_data(genofile) - - # make type - - - # name - name = gmap.chr[number] - - # markers - marker = gmap.marker[number] - - - # values - val = Matrix{Int}(undef, length(samples), length(marker)) - for j in 1:length(marker) - #uni = unique(Vector(df_geno[findfirst(x -> x == marker[j], marker), 2:end])) - #map = Dict(uni[1] => 1, uni[2] => 2, uni[3] => 0) - map = get_genotype(filename).label - val[:, j] = [map[v] for v in Vector(df_geno[findfirst(x -> x == marker[j], marker), 2:end])] - end - - return Chromosome(name, marker, val) -end - - -""" -get_geno2(filename::String) - -Creates a `Geno2` type/struct from gmap CSV file and geno CSV file. - -# Argument - -- `gmapfile` : A string containing the name(with directory) of the gmap CSV file. -- `genofile` : A string containing the name(with directory) of the geno CSV file. - - -# Output - -Returns a `Geno2` type/struct. -""" -function get_geno2(gmapfile::String, genofile::String) - # load file - gmap = get_gmap(gmapfile) - df_gmap = read_data(gmapfile) - df_geno = read_data(genofile) - - # make type - - - # chromosomes - chr = gmap.chr - # samples - samples = names(df_geno)[2:end] - - # chromosomes - chromosomes = [get_chromosome(gmapfile, genofile, i) for i in 1:length(chr)] - - return Geno2(samples, chromosomes) -end - - """ get_pmap(filename::String) @@ -350,19 +278,45 @@ function get_pmap(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file - gdf = groupby(read_data(filename), :chr) + # load control file + jsondict = parse_json(filename) - # make type + # check pmap file exists + if (in("pmap", keys(jsondict))) + pmapfile = joinpath(data_dir, jsondict["pmap"]) + else + throw("Error: pmap not found in control file") + end + + # load pmap file + gdf = groupby(read_data(pmapfile), :chr) # chromosome chr = [group.chr[1] for group in gdf] + # marker - marker = [String.(group.marker) for group in gdf] + marker = [group.marker for group in gdf] + # relative position pos = [group.pos for group in gdf] + # unit - unit = "mm10 Mbp" + unit = "" + f = open(pmapfile, "r") + + s = readline(f) + s_lower = lowercase(s) # Convert the input string to lowercase + if occursin("mbp", s_lower) + unit = "Mbp" + elseif occursin("mb", s_lower) + unit = "Mb" + elseif occursin("cm", s_lower) + unit = "cM" + else + @warn "No unit detected!" + end + + close(f) return Pmap(chr, marker, pos, unit) end @@ -385,10 +339,18 @@ function get_pheno(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file - df_pheno = read_data(filename) + # load control file + jsondict = parse_json(filename) - # make type + # check pheno file exists + if (in("pheno", keys(jsondict))) + phenofile = joinpath(data_dir, jsondict["pheno"]) + else + throw("Error: pheno not found in control file") + end + + # load file + df_pheno = read_data(phenofile) # samples samples = df_pheno[:, 1] @@ -524,7 +486,6 @@ function get_isfemale(filename::String) data_dir, filename = get_control_file(filename) # load file - jsondict = BigRiverQTL.parse_json(filename) crossinfofile = joinpath(data_dir, jsondict["cross_info"]["file"]) diff --git a/src/struct/datastructure.jl b/src/struct/datastructure.jl index c95db8c..dc4fc42 100644 --- a/src/struct/datastructure.jl +++ b/src/struct/datastructure.jl @@ -7,8 +7,8 @@ * `pos` is a vector of vector containing relative position of marker_name in each chromosome. """ struct Gmap - chr::Vector{String} - marker_name::Vector{Vector{String}} + chr::Vector{AbstractString} + marker_name::Vector{Vector{AbstractString}} pos::Vector{Vector{Float64}} end @@ -18,7 +18,7 @@ end * `type` is a string indicating the type of the cross. """ struct CrossType - type::String + type::AbstractString end @@ -28,8 +28,8 @@ end * `direction` is a vector containing the cross direction of sample_id. """ struct CrossInfo - sample_id::Vector{String} - direction::Vector{Integer} + sample_id::Vector{AbstractString} + direction::Vector{AbstractString} end @@ -38,7 +38,7 @@ end * `val` is a vector containing the names of the alleles. """ struct Alleles - val::Vector{String} + val::Vector{AbstractString} end @@ -69,10 +69,10 @@ end * `val` is a vector of matrices containing allele information in each chromosome. """ struct Geno - sample_id::Vector{String} - chromosomes::Vector{String} - marker_name::Vector{Vector{String}} - val::Vector{Matrix{Int16}} + sample_id::Vector{AbstractString} + chromosomes::Vector{AbstractString} + marker_name::Vector{Vector{AbstractString}} + val::Vector{Matrix{Union{Missing, Int16}}} cross_type::CrossType alleles::Alleles geno_type::GenoType @@ -81,33 +81,6 @@ struct Geno end -""" -`Chromosome` type contains genotype information for a chromosome. - -* `name` contains name of the chromosome. -* `marker_name` contains names of marker_name in the chromosome. -* `val` is a vector of matrices containing allele information in the chromosome. -""" -struct Chromosome - name::String - marker_name::Vector{String} - val::Matrix{Int} -end - - -""" -`Geno2` type contains genotype information for all chromosomes. - -* `sample_id` contains sample names such as genotypes or individual IDs. -* `chromosomes` is a vector of type `Chromosome` containing genotype informaton for each chromosome. - -""" -struct Geno2 - sample_id::Vector{String} - chromosomes::Vector{Chromosome} -end - - """ `Pmap` type contains the genetic map showing the relative location of genetic marker_name as phenotype. * `chromosomes` contains chromosomes names. @@ -116,10 +89,10 @@ end * `unit` contains unit for the chromosome length. """ struct Pmap - chromosomes::Vector{String} - marker_name::Vector{Vector{String}} + chromosomes::Vector{AbstractString} + marker_name::Vector{Vector{AbstractString}} pos::Vector{Vector{Float64}} - unit::String + unit::AbstractString end @@ -130,8 +103,8 @@ end * `val` is a matrix containing phenotype/ traits values. """ struct Pheno - sample_id::Vector{String} - traits::Vector{String} + sample_id::Vector{AbstractString} + traits::Vector{AbstractString} val::Matrix{Union{Nothing, Float64}} end @@ -142,8 +115,8 @@ end * `descriptions` is a vector containing the description for each phenotype. """ struct Phenocov - traits::Vector{String} - descriptions::Vector{String} + traits::Vector{AbstractString} + descriptions::Vector{AbstractString} end @@ -153,7 +126,7 @@ IsFemale type indicates if the sample_id (genotypes or individuals) are females. * `val` is a vector containing boolean values indicating if each sample (genotype or individual) is a female. """ struct IsFemale - sample_id::Vector{String} + sample_id::Vector{AbstractString} val::Vector{Bool} end @@ -164,7 +137,7 @@ end * `val` is a vector of boolean values indicating which chromosome is the X one. """ struct IsXChar - chromosomes::Vector{String} + chromosomes::Vector{AbstractString} val::Vector{Bool} end diff --git a/test/datastruc_test.jl b/test/datastruc_test.jl index b9fee2d..816cfff 100644 --- a/test/datastruc_test.jl +++ b/test/datastruc_test.jl @@ -22,88 +22,125 @@ geno = Geno( #pheno = Pheno(["individual1", "individual2"], ["trait1", "trait2"], [1.0 2.0; nothing 3.0]) ######## + data_dir = joinpath(@__DIR__, "data/BXD/"); file = joinpath(data_dir, "bxd.json"); + +data_dir, filename = BigRiverQTL.get_control_file(file) + + + + + + # Transforming data to a optimised and accessible data type -data = get_geneticstudydata(file); +# data = get_geneticstudydata(file); -# Testing the Gmap type -gmap = data.gmap +######################### +# Testing the Gmap type # +######################### @testset "Gmap Tests" begin - - # @test gmap.chr == ["chr1", "chr2"] - # @test length(gmap.marker_name) == 2 + gmap = get_gmap(file) @test all(isa.(gmap.pos, Vector{Float64})) end -# Testing the CrossType type -crosstype = data.geno.cross_type +############################## +# Testing the CrossType type # +############################## @testset "CrossType Tests" begin + cross_type = get_crosstype(file) @test isa(cross_type.type, String) end -# Testing the CrossInfo type +############################ +# Testing the Alleles type # +############################ +@testset "Alleles Tests" begin + alleles = get_alleles(file) + # @test length(alleles.val) == 3 + @test all(isa.(alleles.val, String)) +end + +############################# +# Testing the GenoType type # +############################# +@testset "GenoType Tests" begin + genotype_dict = get_genotype(file) + @test genotype_dict.label["B"] == 1 + @test genotype_dict.label["D"] == 2 +end + +################################## +# Testing the GenoTranspose type # +################################## +@testset "GenoTranspose Tests" begin + geno_transpose = get_genotranspose(file) + @test geno_transpose.val == true +end + +######################### +# Testing the Geno type # +######################### +@testset "Geno Tests" begin + geno = get_geno(file) + @test length(geno.sample_id) == 198 + @test size(geno.val[1]) == (636, 198) +end + +######################### +# Testing the Pmap type # +######################### +@testset "Pmap Tests" begin + pmap = get_pmap(file) + @test length(pmap.pos[1]) == 636 + @test pmap.unit == "Mbp" +end + + +############################## +# Testing the CrossInfo type # +############################## cross_info = data.cross_info @testset "CrossInfo Tests" begin + cross_info = data.cross_info @test length(cross_info.sample_id) == length(cross_info.direction) @test all(isa.(cross_info.direction, Int)) end -# Testing the Alleles type -alleles = data.geno.alleles -@testset "Alleles Tests" begin - # @test length(alleles.val) == 3 - @test all(isa.(alleles.val, String)) -end -# Testing the GenoType type -#@testset "GenoType Tests" begin -# @test geno_type.label["A"] == 1 -# @test geno_type.label["H"] == 2 -#end - -# Testing the GenoTranspose type -#@testset "GenoTranspose Tests" begin -# @test geno_transpose.val == true -#end - -# Testing the Geno type -#@testset "Geno Tests" begin -# @test length(geno.sample_id) == 1 -# @test size(geno.val[1]) == (2, 2) -#end - -# Testing the Pheno type -pheno = data.pheno -@testset "Pheno Tests" begin - @test size(pheno.val, 1) == length(pheno.sample_id) - # @test pheno.val[2, 1] === nothing -end -# Checks whether the output of `get_geneticstudydata` function is of type `GeneticStudyData` -function BigRiverQTLData_struct_test(filename::String, testname::String) - @testset "BigRiverQTLData_struct_test" begin +# # Testing the Pheno type +# pheno = data.pheno +# @testset "Pheno Tests" begin +# @test size(pheno.val, 1) == length(pheno.sample_id) +# # @test pheno.val[2, 1] === nothing +# end - println("Test results for checking whether the output of `get_geneticstudydata` function is of type `GeneticStudyData`: ", - @test typeof(get_geneticstudydata(filename)) == GeneticStudyData) +# # Checks whether the output of `get_geneticstudydata` function is of type `GeneticStudyData` +# function BigRiverQTLData_struct_test(filename::String, testname::String) +# @testset "BigRiverQTLData_struct_test" begin - end +# println("Test results for checking whether the output of `get_geneticstudydata` function is of type `GeneticStudyData`: ", +# @test typeof(get_geneticstudydata(filename)) == GeneticStudyData) -end +# end +# end -######################### -# Test: BigRiverQTLData # -######################### -data_dir = joinpath(@__DIR__, "data/BXD/"); -file = joinpath(data_dir, "bxd.json"); + +# ######################### +# # Test: BigRiverQTLData # +# ######################### + +# data_dir = joinpath(@__DIR__, "data/BXD/"); +# file = joinpath(data_dir, "bxd.json"); -BigRiverQTLData_struct_test(file, "get_bigriverqtldata") +# BigRiverQTLData_struct_test(file, "get_bigriverqtldata") From a8a94929c5c9b52b99c67524814d3227dc4ed243 Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Thu, 15 Aug 2024 17:49:54 -0500 Subject: [PATCH 3/7] updated struct and get struct --- src/BigRiverQTL.jl | 2 +- src/io/export_to_type.jl | 211 ++++++++++++++++++------------------ src/io/io_utils.jl | 56 ++++++++-- src/struct/datastructure.jl | 10 +- test/datastruc_test.jl | 54 +++++++-- 5 files changed, 204 insertions(+), 129 deletions(-) diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl index 196818e..008a1ba 100644 --- a/src/BigRiverQTL.jl +++ b/src/BigRiverQTL.jl @@ -66,7 +66,7 @@ module BigRiverQTL include("io/export_to_type.jl") export get_geneticstudydata export get_gmap, get_alleles, get_chromosome, get_crossinfo, get_crosstype - export get_geno, get_genotype, get_genotranspose, get_pmap + export get_geno, get_genotype, get_genotranspose, get_pmap, get_pheno, get_isxchar ######### # Plots # diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl index 81d0a89..d76cd7d 100644 --- a/src/io/export_to_type.jl +++ b/src/io/export_to_type.jl @@ -21,15 +21,18 @@ function get_gmap(filename::String) # load control file jsondict = parse_json(filename) - # check gmap file exists - if (in("gmap", keys(jsondict))) - gmapfile = joinpath(data_dir, jsondict["gmap"]) - else - throw("Error: gmap not found in control file") - end + # get gmap file + gmapfile = joinpath(data_dir, check_key(jsondict, "gmap")) # load gmap file - gdf = groupby(read_data(gmapfile), :chr) + gdf = groupby( + read_data( + gmapfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char") + ), + :chr, + ) # chromosome chr = [group.chr[1] for group in gdf] @@ -65,17 +68,11 @@ function get_crosstype(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file + # load control file jsondict = BigRiverQTL.parse_json(filename) - # make type - - #crosstype - if (in("crosstype", keys(jsondict))) - crosstype = jsondict["crosstype"] - else - throw("Error: CrossType not found") - end + # get crosstype + crosstype = check_key(jsondict, "crosstype") return CrossType(crosstype) end @@ -98,13 +95,11 @@ function get_alleles(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file + # load control file jsondict = BigRiverQTL.parse_json(filename) - # make type - #alleles - alleles = jsondict["alleles"] + alleles = check_key(jsondict, "alleles") return Alleles(alleles) end @@ -191,31 +186,31 @@ function get_geno(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file + # load control file jsondict = BigRiverQTL.parse_json(filename) - # check gmap file exists - if (in("gmap", keys(jsondict))) - gmapfile = joinpath(data_dir, jsondict["gmap"]) - else - throw("Error: gmap not found in control file") - end + # get gmap file + gmapfile = joinpath(data_dir, check_key(jsondict, "gmap")) - # check geno file exists - if (in("geno", keys(jsondict))) - genofile = joinpath(data_dir, jsondict["geno"]) - else - throw("Error: geno not found in control file") - end + # get geno file + genofile = joinpath(data_dir, check_key(jsondict, "geno")) # get gmap object gmap = get_gmap(filename) # load gmap dataframe - df_gmap = read_data(gmapfile) + df_gmap = read_data( + gmapfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char") + ) # load geno dataframe - df_geno = read_data(genofile) + df_geno = read_data( + genofile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char") + ) # get samples names samples = names(df_geno)[2:end] @@ -240,20 +235,20 @@ function get_geno(filename::String) insertcols!(df_encoded, 1, :marker => df_geno.marker) gdf = innerjoin( - select(df_gmap, [:chr, :marker]), - df_encoded, - on = :marker - ) |> x -> groupby(x, :chr); - - val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf]; - - #crosstype + select(df_gmap, [:chr, :marker]), + df_encoded, + on = :marker, + ) |> x -> groupby(x, :chr) + + val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf] + + # crosstype crosstype = get_crosstype(filename) - #alleles + # alleles alleles = get_alleles(filename) - #genotranspose + # genotranspose genotranspose = get_genotranspose(filename) return Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose) @@ -281,25 +276,28 @@ function get_pmap(filename::String) # load control file jsondict = parse_json(filename) - # check pmap file exists - if (in("pmap", keys(jsondict))) - pmapfile = joinpath(data_dir, jsondict["pmap"]) - else - throw("Error: pmap not found in control file") - end - - # load pmap file - gdf = groupby(read_data(pmapfile), :chr) + # get pmap file + pmapfile = joinpath(data_dir, check_key(jsondict, "pmap")) + + # load pmap file + gdf = groupby( + read_data( + pmapfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char") + ), + :chr, + ) # chromosome chr = [group.chr[1] for group in gdf] - + # marker marker = [group.marker for group in gdf] - + # relative position pos = [group.pos for group in gdf] - + # unit unit = "" f = open(pmapfile, "r") @@ -312,7 +310,7 @@ function get_pmap(filename::String) unit = "Mb" elseif occursin("cm", s_lower) unit = "cM" - else + else @warn "No unit detected!" end @@ -335,32 +333,33 @@ Creates a `Pheno` type/struct from Pheno CSV file. Returns a `Pheno` type/struct. """ -function get_pheno(filename::String) +function get_pheno(filename::String; samples_col_name::Symbol = :id) # check if filename is directory or file name data_dir, filename = get_control_file(filename) # load control file jsondict = parse_json(filename) - # check pheno file exists - if (in("pheno", keys(jsondict))) - phenofile = joinpath(data_dir, jsondict["pheno"]) - else - throw("Error: pheno not found in control file") - end + # get pheno file + phenofile = joinpath(data_dir, check_key(jsondict, "pheno")) - # load file - df_pheno = read_data(phenofile) + # load pheno file + df_pheno = read_data( + phenofile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char"), + missingstring = "NA" + ) # samples - samples = df_pheno[:, 1] + samples = df_pheno[:, samples_col_name] # traits' names - traits = names(df_pheno)[2:end] + traits = names(df_pheno) + setdiff!(traits, [string(samples_col_name)]) # value of the traits - df_pheno = Matrix(df_pheno) - val = tryparse.(Float64, df_pheno[:, 2:end]) + val = Matrix(df_pheno[:, traits]) return Pheno(samples, traits, val) end @@ -383,16 +382,22 @@ function get_phenocovar(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file - - df_phenocovar = read_data(filename) - - # make type + # load control file + jsondict = parse_json(filename) + + # get phenocovar file + phenocovarfile = joinpath(data_dir, check_key(jsondict, "phenocovar")) + + # load phenocovar file + df_phenocovar = read_data( + phenocovarfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char") + ) # traits' names traits = string.(df_phenocovar[:, 1]) - # description description = df_phenocovar[:, 2] @@ -417,15 +422,22 @@ function get_crossinfo(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file - df_crossdirection = read_data(filename) - - # make type - - # traits' names + # load control file + jsondict = parse_json(filename) + + # get crossdirection file + crossdirectionfile = joinpath(data_dir, check_key(jsondict, "crossdirection")) + + # load crossdirection file + df_crossdirection = read_data( + crossdirectionfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char") + ) + + # samples names samples = df_crossdirection[:, 1] - # description direction = df_crossdirection[:, 2] @@ -450,21 +462,19 @@ function get_isxchar(filename::String) # check if filename is directory or file name data_dir, filename = get_control_file(filename) - # load file - - gmap = get_gmap(filename) + # load control file + jsondict = parse_json(filename) - # make type + # get gmap file + gmapfile = joinpath(data_dir, check_key(jsondict, "gmap")) - # chromosome - chr = gmap.chr + # load gmap + gmap = get_gmap(gmapfile) # isXchar - - isxchar = zeros(Bool, length(chr)) - isxchar[findfirst(x -> x == "X", chr)] = 1 - - return IsXChar(chr, isxchar) + isxchar = map(x -> x == "X", gmap.chr) + + return IsXChar(gmap.chr, isxchar) end @@ -486,8 +496,9 @@ function get_isfemale(filename::String) data_dir, filename = get_control_file(filename) # load file - jsondict = BigRiverQTL.parse_json(filename) - crossinfofile = joinpath(data_dir, jsondict["cross_info"]["file"]) + jsondict = parse_json(filename) + + crossinfofile = get_crossinfo(filename) # make type @@ -528,12 +539,6 @@ function get_geneticstudydata(filename::String) # make type #gmap - if (in("gmap", keys(jsondict))) - gmapfile = joinpath(data_dir, jsondict["gmap"]) - - else - throw("Error: gmap not found in control file") - end gmap = get_gmap(gmapfile) diff --git a/src/io/io_utils.jl b/src/io/io_utils.jl index c023432..200f273 100644 --- a/src/io/io_utils.jl +++ b/src/io/io_utils.jl @@ -32,15 +32,15 @@ Writes a CSV file to data frame excluding the comments lines. Returns a data frame. """ -function read_data(filename::String) - # read the file into lines - lines = readlines(filename) - # which lines have # as first character - firstpound = (x->match(r"^#",x)).( lines ) .!= nothing - # last line of comment - startdata = findfirst(firstpound.==0) - # Check if the comment lines can be run directly from "CSV.read". - return CSV.read(filename, DataFrame; header = startdata)#comment="#") +function read_data(filename::String; kwargs...) + # # read the file into lines + # lines = readlines(filename) + # # which lines have # as first character + # firstpound = (x->match(r"^#",x)).( lines ) .!= nothing + # # last line of comment + # startdata = findfirst(firstpound.==0) + # # Check if the comment lines can be run directly from "CSV.read". + return CSV.read(filename, DataFrame; kwargs...)#comment="#") end @@ -148,3 +148,41 @@ function encode_genotype(geno_dict::Dict{String, Any}, geno_val::AbstractArray) return encoded_val end + + +""" + check_key(control_dict::Dict, s::String) -> Any + +Check if a specified key exists in the given dictionary and return its corresponding value. + +# Arguments +- `control_dict::Dict`: A dictionary from which the value associated with a key is to be retrieved. +- `s::String`: The key for which the existence and value are checked within the dictionary. + +# Returns +- `Any`: Returns the value associated with the key `s` in `control_dict` if it exists. + +# Throws +- Throws an error if the key `s` is not found in `control_dict`. + +# Description +This function checks if the given string `s` exists as a key in the provided dictionary +`control_dict`. If the key exists, the function returns the value associated with this +key. If the key does not exist, it throws an error with a message indicating that the +key was not found in the control file. + +""" +function check_key(control_dict::Dict, s::String) + # check geno file exists + if (in(s, keys(control_dict))) + val = control_dict[s] + else + throw("Error: $(s) not found in control file") + end + + return val +end + + + + diff --git a/src/struct/datastructure.jl b/src/struct/datastructure.jl index dc4fc42..9f5fb05 100644 --- a/src/struct/datastructure.jl +++ b/src/struct/datastructure.jl @@ -64,13 +64,13 @@ end `Geno` type contains genotype information for all chromosomes. * `sample_id` contains sample names such as genotypes or individual IDs. -* `chromosomes` contains chromosome names. +* `chr` contains chromosome names. * `marker_name` contains marker names for each chromosome. * `val` is a vector of matrices containing allele information in each chromosome. """ struct Geno sample_id::Vector{AbstractString} - chromosomes::Vector{AbstractString} + chr::Vector{AbstractString} marker_name::Vector{Vector{AbstractString}} val::Vector{Matrix{Union{Missing, Int16}}} cross_type::CrossType @@ -83,13 +83,13 @@ end """ `Pmap` type contains the genetic map showing the relative location of genetic marker_name as phenotype. -* `chromosomes` contains chromosomes names. +* `chr` contains chromosomes names. * `marker_name` contains marker names for each chromosome. * `pos` is a vector of vector containing relative position of marker_name as phenotypes in each chromosome. * `unit` contains unit for the chromosome length. """ struct Pmap - chromosomes::Vector{AbstractString} + chr::Vector{AbstractString} marker_name::Vector{Vector{AbstractString}} pos::Vector{Vector{Float64}} unit::AbstractString @@ -105,7 +105,7 @@ end struct Pheno sample_id::Vector{AbstractString} traits::Vector{AbstractString} - val::Matrix{Union{Nothing, Float64}} + val::Matrix{Union{Missing, Float64}} end diff --git a/test/datastruc_test.jl b/test/datastruc_test.jl index 816cfff..2a794a4 100644 --- a/test/datastruc_test.jl +++ b/test/datastruc_test.jl @@ -27,7 +27,7 @@ data_dir = joinpath(@__DIR__, "data/BXD/"); file = joinpath(data_dir, "bxd.json"); -data_dir, filename = BigRiverQTL.get_control_file(file) +# data_dir, filename = BigRiverQTL.get_control_file(file) @@ -98,25 +98,57 @@ end end +########################## +# Testing the Pheno type # +########################## +@testset "Pheno Tests" begin + pheno = get_pheno(file) + @test length(pheno.sample_id) == 198 + @test length(pheno.sample_id) == 5806 + @test pheno.val[1:2, 1:2] == [61.4 54.1;49.0 50.1] +end + + +############################# +# Testing the Phenocov type # +############################# +@testset "Phenocov Tests" begin + phenocovar = get_phenocovar(file) + @test length(phenocovar.traits) == 5806 + @test phenocovar.descriptions[5800] == "MFL-54" +end + + ############################## # Testing the CrossInfo type # ############################## -cross_info = data.cross_info @testset "CrossInfo Tests" begin - cross_info = data.cross_info - @test length(cross_info.sample_id) == length(cross_info.direction) - @test all(isa.(cross_info.direction, Int)) + cross_info = get_crossinfo(file) + @test length(cross_info.sample_id) == 198 + @test cross_info.direction[1] == "BxD" end +############################ +# Testing the IsXChar type # +############################ +@testset "IsXChar Tests" begin + isxchar = get_isxchar(file) + @test length(isxchar.chr) == 20 + @test isxchar.val[20] == true +end + + +############################ +# Testing the IsFemale type # +############################ +@testset "IsFemale Tests" begin + isfemale = get_isfemale(file) + @test length(isxchar.chr) == 20 + @test isxchar.val[20] == true +end -# # Testing the Pheno type -# pheno = data.pheno -# @testset "Pheno Tests" begin -# @test size(pheno.val, 1) == length(pheno.sample_id) -# # @test pheno.val[2, 1] === nothing -# end # # Checks whether the output of `get_geneticstudydata` function is of type `GeneticStudyData` From 3fdc60986cd96ee7223a80ed5e7e458bd72b1de5 Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Thu, 15 Aug 2024 22:15:54 -0500 Subject: [PATCH 4/7] fixed tests and added test_gf.jl --- src/BigRiverQTL.jl | 3 +- src/io/export_to_type.jl | 69 ++++--------- src/struct/datastructure.jl | 4 +- test/datastruc_test.jl | 20 ++-- test/plots_test.jl | 10 +- test/test_gf.jl | 196 ++++++++++++++++++++++++++++++++++++ 6 files changed, 234 insertions(+), 68 deletions(-) create mode 100644 test/test_gf.jl diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl index 008a1ba..d86b89b 100644 --- a/src/BigRiverQTL.jl +++ b/src/BigRiverQTL.jl @@ -66,7 +66,8 @@ module BigRiverQTL include("io/export_to_type.jl") export get_geneticstudydata export get_gmap, get_alleles, get_chromosome, get_crossinfo, get_crosstype - export get_geno, get_genotype, get_genotranspose, get_pmap, get_pheno, get_isxchar + export get_geno, get_genotype, get_genotranspose, get_pmap + export get_phenocovar, get_pheno, get_isxchar ######### # Plots # diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl index d76cd7d..ea616a5 100644 --- a/src/io/export_to_type.jl +++ b/src/io/export_to_type.jl @@ -426,11 +426,12 @@ function get_crossinfo(filename::String) jsondict = parse_json(filename) # get crossdirection file - crossdirectionfile = joinpath(data_dir, check_key(jsondict, "crossdirection")) + crossinfo_dict = check_key(jsondict, "cross_info") + crossinfofile = joinpath(data_dir, crossinfo_dict["file"]) # load crossdirection file df_crossdirection = read_data( - crossdirectionfile; + crossinfofile; delim = check_key(jsondict, "sep"), comment = check_key(jsondict, "comment.char") ) @@ -465,11 +466,8 @@ function get_isxchar(filename::String) # load control file jsondict = parse_json(filename) - # get gmap file - gmapfile = joinpath(data_dir, check_key(jsondict, "gmap")) - # load gmap - gmap = get_gmap(gmapfile) + gmap = get_gmap(filename) # isXchar isxchar = map(x -> x == "X", gmap.chr) @@ -503,7 +501,7 @@ function get_isfemale(filename::String) # make type # samples - samples = get_crossinfo(crossinfofile).sample_id + samples = get_crossinfo(filename).sample_id # isfemale @@ -534,65 +532,34 @@ function get_geneticstudydata(filename::String) data_dir, filename = get_control_file(filename) # load file - jsondict = BigRiverQTL.parse_json(filename) - - # make type - - #gmap - - gmap = get_gmap(gmapfile) - - #geno - if (in("geno", keys(jsondict))) - genofile = joinpath(data_dir, jsondict["geno"]) + jsondict = parse_json(filename) - else - throw("Error: geno not found in control file") - end + # gmap + gmap = get_gmap(filename) + # geno geno = get_geno(filename) - - #pmap - if (in("pmap", keys(jsondict))) - pmapfile = joinpath(data_dir, jsondict["pmap"]) - - else - throw("Error: pmap not found in control file") - end - - pmap = get_pmap(pmapfile) + + # pmap + pmap = get_pmap(filename) #pheno - if (in("pheno", keys(jsondict))) - phenofile = joinpath(data_dir, jsondict["pheno"]) - - else - throw("Error: pheno not found in control file") - end - - pheno = get_pheno(phenofile) + pheno = get_pheno(filename) #phenocov - if (in("phenocovar", keys(jsondict))) - phenocovfile = joinpath(data_dir, jsondict["phenocovar"]) - - else - throw("Error: phenocovar not found in control file") - end - - phenocov = get_phenocovar(phenocovfile) + phenocov = get_phenocovar(filename) #isXchar - isXchar = get_isxchar(gmapfile) + isXchar = get_isxchar(filename) #isfemale isfemale = get_isfemale(filename) #crossinfo - crossinfofile = joinpath(data_dir, jsondict["cross_info"]["file"]) - crossinfo = get_crossinfo(crossinfofile) + crossinfo = get_crossinfo(filename) - return GeneticStudyData(gmap, + return GeneticStudyData( + gmap, geno, pmap, pheno, diff --git a/src/struct/datastructure.jl b/src/struct/datastructure.jl index 9f5fb05..ad8aaf7 100644 --- a/src/struct/datastructure.jl +++ b/src/struct/datastructure.jl @@ -133,11 +133,11 @@ end """ `IsXChar` type indicates which chromosome is the X one. -* `chromosomes` contains chromosome names. +* `chr` contains chromosome names. * `val` is a vector of boolean values indicating which chromosome is the X one. """ struct IsXChar - chromosomes::Vector{AbstractString} + chr::Vector{AbstractString} val::Vector{Bool} end diff --git a/test/datastruc_test.jl b/test/datastruc_test.jl index 2a794a4..62c1ddd 100644 --- a/test/datastruc_test.jl +++ b/test/datastruc_test.jl @@ -98,14 +98,14 @@ end end -########################## -# Testing the Pheno type # -########################## +######################### +# Testing the Phenotype # +######################### @testset "Pheno Tests" begin pheno = get_pheno(file) @test length(pheno.sample_id) == 198 - @test length(pheno.sample_id) == 5806 - @test pheno.val[1:2, 1:2] == [61.4 54.1;49.0 50.1] + @test length(pheno.traits) == 5806 + @test isapprox(pheno.val[1:2, 1:2], [61.4 54.1;49.0 50.1], atol=1e-4) end @@ -142,11 +142,11 @@ end ############################ # Testing the IsFemale type # ############################ -@testset "IsFemale Tests" begin - isfemale = get_isfemale(file) - @test length(isxchar.chr) == 20 - @test isxchar.val[20] == true -end +# @testset "IsFemale Tests" begin +# isfemale = get_isfemale(file) +# @test length(isxchar.chr) == 20 +# @test isxchar.val[20] == true +# end diff --git a/test/plots_test.jl b/test/plots_test.jl index 44d0e2a..bb00d00 100644 --- a/test/plots_test.jl +++ b/test/plots_test.jl @@ -11,8 +11,10 @@ gInfo = data.gmap; pInfo = data.phenocov; # pheno=data.pheno; pheno = data.pheno.val; -geno = reduce(hcat, data.geno.val); -geno_processed = convert(Array{Float64}, geno); +geno = reduce(vcat, data.geno.val); +geno_processed = geno .- 1.0 +replace!(geno_processed, missing => 0.5); +geno_processed = convert(Matrix{Float64}, geno_processed); ################# # Preprocessing # @@ -20,8 +22,8 @@ geno_processed = convert(Array{Float64}, geno); traitID = 1112; pheno_y = pheno[:, traitID]; pheno_y2 = ones(length(pheno_y)); -idx_nothing = findall(x -> x != nothing, pheno_y) -pheno_y2[idx_nothing] = pheno_y[idx_nothing]; +idx_not_missing = findall(!ismissing, pheno_y) +pheno_y2[idx_not_missing] = pheno_y[idx_not_missing]; ########### # Kinship # diff --git a/test/test_gf.jl b/test/test_gf.jl new file mode 100644 index 0000000..aeb309a --- /dev/null +++ b/test/test_gf.jl @@ -0,0 +1,196 @@ + +# https://github.com/rqtl/qtl2data/blob/main/DO_Recla/recla.json + +using BigRiverQTL, LinearAlgebra +using Statistics +using Test +using DelimitedFiles +using CSV +using DataFrames + + + +data_dir = joinpath(@__DIR__, "data/BXD/"); +file = joinpath(data_dir, "bxd.json"); +jsondict = BigRiverQTL.parse_json(file) + + + # check pheno file exists + if (in("pheno", keys(jsondict))) + phenofile = joinpath(data_dir, jsondict["pheno"]) + else + throw("Error: pheno not found in control file") + end + + # load file + + CSV.read(phenofile, DataFrame; comment="#") + + df_pheno = BigRiverQTL.read_data( + phenofile; + missingstring="NA"); + + unit = "" + f = open(pmapfile, "r") + + s = readline(f) + s_lower = lowercase(s) # Convert the input string to lowercase + if occursin("mbp", s_lower) + unit = "Mbp" + elseif occursin("mb", s_lower) + unit = "Mb" + elseif occursin("cm", s_lower) + unit = "cM" + else + @warn "No unit detected!" + end + + close(f) + + unit + + +# function get_geno(filename::String) +filename = file + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) + + # load file + jsondict = BigRiverQTL.parse_json(filename) + + # check gmap file exists + if (in("gmap", keys(jsondict))) + gmapfile = joinpath(data_dir, jsondict["gmap"]) + else + throw("Error: gmap not found in control file") + end + + # check geno file exists + if (in("geno", keys(jsondict))) + genofile = joinpath(data_dir, jsondict["geno"]) + else + throw("Error: geno not found in control file") + end + + # get gmap object + gmap = get_gmap(file) + + # load gmap dataframe + df_gmap = read_data(gmapfile) + + # load geno dataframe + df_geno = read_data(genofile) + + # get samples names + samples = names(df_geno)[2:end] + + # get chromosomes names + chr = gmap.chr + + # get markers names + marker = gmap.marker_name + + #genotype + genotype = get_genotype(filename) + + # values + # get encoded genetotypes + df_encoded = DataFrame( + encode_genotype(genotype.label, df_geno[:, 2:end] |> x -> Matrix(x)), + samples, + ) + + # built encoded geno DataFrames + insertcols!(df_encoded, 1, :marker => df_geno.marker) + + gdf = innerjoin( + select(df_gmap, [:chr, :marker]), + df_encoded, + on = :marker + ) |> x -> groupby(x, :chr); + + val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf]; + + #crosstype + crosstype = get_crosstype(filename) + + #alleles + alleles = get_alleles(filename) + + #genotranspose + genotranspose = get_genotranspose(filename) + + # return Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose) +# end + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +# Transforming data to a optimised and accessible data type +# data = get_geneticstudydata(file); + +# check if filename is directory or file name +data_dir, filename = BigRiverQTL.get_control_file(file) + +# load file +jsondict = BigRiverQTL.parse_json(filename) + + +crossinfofile = joinpath(data_dir, jsondict["cross_info"]["file"]) + + # make type + + # samples + df_crossdirection = BigRiverQTL.read_data(crossinfofile) + get_crossinfo(crossinfofile) + + + samples = get_crossinfo(crossinfofile).sample_id + + # isfemale + + isfemale = ones(Bool, length(samples)) + if (in("sex", keys(jsondict))) + isfemale[findall(x -> x == "Male", jsondict["sex"])] = 0 + end + + + + + + + From 67759b2a79297259723ca8cb28acc60364f776c2 Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Thu, 15 Aug 2024 22:32:11 -0500 Subject: [PATCH 5/7] fixed until plotting test --- src/BigRiverQTL.jl | 1 + test/plots_test.jl | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl index d86b89b..b341beb 100644 --- a/src/BigRiverQTL.jl +++ b/src/BigRiverQTL.jl @@ -7,6 +7,7 @@ module BigRiverQTL using Reexport @reexport import BigRiverQTLPlots: plot_QTL, plot_eQTL, plot_manhattan + @reexport import BulkLMM: scan ######## diff --git a/test/plots_test.jl b/test/plots_test.jl index bb00d00..db9f0eb 100644 --- a/test/plots_test.jl +++ b/test/plots_test.jl @@ -15,6 +15,7 @@ geno = reduce(vcat, data.geno.val); geno_processed = geno .- 1.0 replace!(geno_processed, missing => 0.5); geno_processed = convert(Matrix{Float64}, geno_processed); +geno_processed = permutedims(geno_processed); ################# # Preprocessing # @@ -35,7 +36,7 @@ kinship = kinship_gs(geno_processed, 0.99); # Scan # ######## -single_results_perms = scan( +single_results_perms = BigRiverQTL.BulkLMM.scan( pheno_y2, geno_processed, kinship; From a28a9f10539910e9664156d7ebbeab6d07e0e776 Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Thu, 15 Aug 2024 22:45:05 -0500 Subject: [PATCH 6/7] added Plots in test --- Project.toml | 3 ++- test/plots_test.jl | 4 ++-- test/test_gf.jl | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 102519c..a64b4d8 100644 --- a/Project.toml +++ b/Project.toml @@ -29,7 +29,8 @@ julia = "1" [extras] DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["DelimitedFiles", "Test"] +test = ["DelimitedFiles", "Plots", "Test"] diff --git a/test/plots_test.jl b/test/plots_test.jl index db9f0eb..7cee95f 100644 --- a/test/plots_test.jl +++ b/test/plots_test.jl @@ -57,11 +57,11 @@ p1 = plot_QTL(single_results_perms, gInfo, mbColname = "Pos"); p2 = plot_manhattan(single_results_perms, gInfo, mbColname = "Pos"); @testset "QTL plot Tests" begin - @test isa(p1[1][4], Series) + @test isa(p1[1][4], Plots.Series) end @testset "Mahattan plot Tests" begin - @test isa(p2[1][3], Series) + @test isa(p2[1][2], Plots.Series) end diff --git a/test/test_gf.jl b/test/test_gf.jl index aeb309a..ee3b57f 100644 --- a/test/test_gf.jl +++ b/test/test_gf.jl @@ -7,6 +7,7 @@ using Test using DelimitedFiles using CSV using DataFrames +using Plots From c0d9c28d5b16377b66263330a1518f0931dda02b Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Thu, 15 Aug 2024 23:17:56 -0500 Subject: [PATCH 7/7] added using Plots in test --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 3dc7976..4f2ad3a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,7 +5,7 @@ using BigRiverQTL, LinearAlgebra using Statistics using Test -using DelimitedFiles +using DelimitedFiles, Plots ######## # Test #