From a4b58bee89bb61a97bb71e46de2786fd8658213a Mon Sep 17 00:00:00 2001 From: GregFa Date: Mon, 19 Aug 2024 16:34:52 -0500 Subject: [PATCH 1/6] used geno_transposed value in get_geno --- src/io/export_to_type.jl | 638 ++++++++++++++++++++------------------- test/test_gf.jl | 266 ++++++---------- 2 files changed, 421 insertions(+), 483 deletions(-) diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl index 067b67e..7a7424e 100644 --- a/src/io/export_to_type.jl +++ b/src/io/export_to_type.jl @@ -15,35 +15,35 @@ Returns a `Gmap` type/struct. """ function get_gmap(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load control file - jsondict = parse_json(filename) + # load control file + jsondict = parse_json(filename) - # get gmap file - gmapfile = joinpath(data_dir, check_key(jsondict, "gmap")) + # get gmap file + gmapfile = joinpath(data_dir, check_key(jsondict, "gmap")) - # load gmap file - gdf = groupby( - read_data( - gmapfile; - delim=check_key(jsondict, "sep"), - comment=check_key(jsondict, "comment.char") - ), - :chr, - ) + # load gmap file + gdf = groupby( + read_data( + gmapfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char"), + ), + :chr, + ) - # chromosome - chr = [group.chr[1] for group in gdf] + # chromosome + chr = [group.chr[1] for group in gdf] - # marker - marker = [String.(group.marker) for group in gdf] + # marker + marker = [String.(group.marker) for group in gdf] - # relative position - pos = [group.pos for group in gdf] + # relative position + pos = [group.pos for group in gdf] - return Gmap(chr, marker, pos) + return Gmap(chr, marker, pos) end @@ -65,16 +65,16 @@ Returns a `CrossType` type/struct. """ function get_crosstype(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load control file - jsondict = BigRiverQTL.parse_json(filename) + # load control file + jsondict = BigRiverQTL.parse_json(filename) - # get crosstype - crosstype = check_key(jsondict, "crosstype") + # get crosstype + crosstype = check_key(jsondict, "crosstype") - return CrossType(crosstype) + return CrossType(crosstype) end @@ -92,16 +92,16 @@ Creates a `Alleles` type/struct from control file in json format. Returns a `Alleles` type/struct. """ function get_alleles(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load control file - jsondict = BigRiverQTL.parse_json(filename) + # load control file + jsondict = BigRiverQTL.parse_json(filename) - #alleles - alleles = check_key(jsondict, "alleles") + #alleles + alleles = check_key(jsondict, "alleles") - return Alleles(alleles) + return Alleles(alleles) end @@ -119,21 +119,21 @@ Creates a `GenoType` type/struct from control file in json format. Returns a `GenoType` type/struct. """ function get_genotype(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load file - jsondict = BigRiverQTL.parse_json(filename) + # load file + jsondict = BigRiverQTL.parse_json(filename) - # genotype - if (in("genotypes", keys(jsondict))) - label = jsondict["genotypes"] - else - # Rqtl2 default - label = Dict{String,Int}("A" => 1, "H" => 2, "B" => 3, "D" => 4, "C" => 5) - end + # genotype + if (in("genotypes", keys(jsondict))) + label = jsondict["genotypes"] + else + # Rqtl2 default + label = Dict{String, Int}("A" => 1, "H" => 2, "B" => 3, "D" => 4, "C" => 5) + end - return GenoType(label) + return GenoType(label) end @@ -151,20 +151,20 @@ Creates a `GenoTranspose` type/struct from control file in json format. Returns a `GenoTranspose` type/struct. """ function get_genotranspose(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load file - jsondict = BigRiverQTL.parse_json(filename) + # load file + jsondict = BigRiverQTL.parse_json(filename) - #g enotype - if (in("geno_transposed", keys(jsondict))) - val = jsondict["geno_transposed"] - else - val = FALSE - end + #g enotype + if (in("geno_transposed", keys(jsondict))) + val = jsondict["geno_transposed"] + else + val = FALSE + end - return GenoTranspose(val) + return GenoTranspose(val) end @@ -183,75 +183,89 @@ Creates a `Geno` type/struct from gmap CSV file and geno CSV file. Returns a `Geno` type/struct. """ function get_geno(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) - - # load control file - jsondict = BigRiverQTL.parse_json(filename) - - # get gmap file - gmapfile = joinpath(data_dir, check_key(jsondict, "gmap")) - - # get geno file - genofile = joinpath(data_dir, check_key(jsondict, "geno")) - - # get gmap object - gmap = get_gmap(filename) - - # load gmap dataframe - df_gmap = read_data( - gmapfile; - delim=check_key(jsondict, "sep"), - comment=check_key(jsondict, "comment.char") - ) - - # load geno dataframe - df_geno = read_data( - genofile; - delim=check_key(jsondict, "sep"), - comment=check_key(jsondict, "comment.char") - ) - - # get samples names - samples = names(df_geno)[2:end] - - # get chromosomes names - chr = gmap.chr - - # get markers names - marker = gmap.marker_name - - #genotype - genotype = get_genotype(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) + + # load control file + jsondict = parse_json(filename) + + # get gmap file + gmapfile = joinpath(data_dir, check_key(jsondict, "gmap")) + + # get geno file + genofile = joinpath(data_dir, check_key(jsondict, "geno")) + + # get gmap object + gmap = get_gmap(filename) + + # load gmap dataframe + df_gmap = read_data( + gmapfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char"), + ) + + # load geno dataframe + df_geno = read_data( + genofile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char"), + ) + + # check if geno data are transposed + if jsondict["geno_transposed"] + # get samples names + samples = names(df_geno)[2:end] + # get values + mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) + # get markers names + marker = df_geno.marker + else + # get samples names + samples = df_geno[:, 1] + # get values + mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) |> permutedims + # get markers names + marker = names(df_geno)[2:end] + end + + # get chromosomes names + chr = gmap.chr + + #genotype + genotype = get_genotype(filename) + + # values + # get encoded genetotypes + df_encoded = DataFrame( + encode_genotype(genotype.label, mat_geno), + samples, + ) + + # built encoded geno DataFrames + insertcols!(df_encoded, 1, :marker => marker) + + gdf = innerjoin( + select(df_gmap, [:chr, :marker]), + df_encoded, + on = :marker, + ) |> x -> groupby(x, :chr) + + val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf] - # values - # get encoded genetotypes - df_encoded = DataFrame( - encode_genotype(genotype.label, df_geno[:, 2:end] |> x -> Matrix(x)), - samples, - ) - - # built encoded geno DataFrames - insertcols!(df_encoded, 1, :marker => df_geno.marker) - - gdf = innerjoin( - select(df_gmap, [:chr, :marker]), - df_encoded, - on=:marker, - ) |> x -> groupby(x, :chr) - - val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf] + # marker + marker = [String.(group.marker) for group in gdf] - # crosstype - crosstype = get_crosstype(filename) + # crosstype + crosstype = get_crosstype(filename) - # alleles - alleles = get_alleles(filename) + # alleles + alleles = get_alleles(filename) - # genotranspose - genotranspose = get_genotranspose(filename) + # genotranspose + genotranspose = get_genotranspose(filename) - return Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose) + return Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose) end @@ -270,53 +284,53 @@ Returns a `Pmap` type/struct. """ function get_pmap(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) - - # load control file - jsondict = parse_json(filename) - - # get pmap file - pmapfile = joinpath(data_dir, check_key(jsondict, "pmap")) - - # load pmap file - gdf = groupby( - read_data( - pmapfile; - delim=check_key(jsondict, "sep"), - comment=check_key(jsondict, "comment.char") - ), - :chr, - ) - - # chromosome - chr = [group.chr[1] for group in gdf] - - # marker - marker = [group.marker for group in gdf] - - # relative position - pos = [group.pos for group in gdf] - - # unit - unit = "" - f = open(pmapfile, "r") - - s = readline(f) - s_lower = lowercase(s) # Convert the input string to lowercase - if occursin("mbp", s_lower) - unit = "Mbp" - elseif occursin("mb", s_lower) - unit = "Mb" - elseif occursin("cm", s_lower) - unit = "cM" - else - @warn "No unit detected!" - end - - close(f) - - return Pmap(chr, marker, pos, unit) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) + + # load control file + jsondict = parse_json(filename) + + # get pmap file + pmapfile = joinpath(data_dir, check_key(jsondict, "pmap")) + + # load pmap file + gdf = groupby( + read_data( + pmapfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char"), + ), + :chr, + ) + + # chromosome + chr = [group.chr[1] for group in gdf] + + # marker + marker = [group.marker for group in gdf] + + # relative position + pos = [group.pos for group in gdf] + + # unit + unit = "" + f = open(pmapfile, "r") + + s = readline(f) + s_lower = lowercase(s) # Convert the input string to lowercase + if occursin("mbp", s_lower) + unit = "Mbp" + elseif occursin("mb", s_lower) + unit = "Mb" + elseif occursin("cm", s_lower) + unit = "cM" + else + @warn "No unit detected!" + end + + close(f) + + return Pmap(chr, marker, pos, unit) end @@ -333,35 +347,35 @@ Creates a `Pheno` type/struct from Pheno CSV file. Returns a `Pheno` type/struct. """ -function get_pheno(filename::String; samples_col_name::Symbol=:id) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) +function get_pheno(filename::String; samples_col_name::Symbol = :id) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load control file - jsondict = parse_json(filename) + # load control file + jsondict = parse_json(filename) - # get pheno file - phenofile = joinpath(data_dir, check_key(jsondict, "pheno")) + # get pheno file + phenofile = joinpath(data_dir, check_key(jsondict, "pheno")) - # load pheno file - df_pheno = read_data( - phenofile; - delim=check_key(jsondict, "sep"), - comment=check_key(jsondict, "comment.char"), - missingstring="NA" - ) + # load pheno file + df_pheno = read_data( + phenofile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char"), + missingstring = "NA", + ) - # samples - samples = df_pheno[:, samples_col_name] + # samples + samples = df_pheno[:, samples_col_name] - # traits' names - traits = names(df_pheno) - setdiff!(traits, [string(samples_col_name)]) + # traits' names + traits = names(df_pheno) + setdiff!(traits, [string(samples_col_name)]) - # value of the traits - val = Matrix(df_pheno[:, traits]) + # value of the traits + val = Matrix(df_pheno[:, traits]) - return Pheno(samples, traits, val) + return Pheno(samples, traits, val) end @@ -379,29 +393,29 @@ Creates a `Phenocovar` type/struct from phenocovar CSV file. Returns a `Phenocovar` type/struct. """ function get_phenocovar(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load control file - jsondict = parse_json(filename) + # load control file + jsondict = parse_json(filename) - # get phenocovar file - phenocovarfile = joinpath(data_dir, check_key(jsondict, "phenocovar")) + # get phenocovar file + phenocovarfile = joinpath(data_dir, check_key(jsondict, "phenocovar")) - # load phenocovar file - df_phenocovar = read_data( - phenocovarfile; - delim=check_key(jsondict, "sep"), - comment=check_key(jsondict, "comment.char") - ) + # load phenocovar file + df_phenocovar = read_data( + phenocovarfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char"), + ) - # traits' names - traits = string.(df_phenocovar[:, 1]) + # traits' names + traits = string.(df_phenocovar[:, 1]) - # description - description = df_phenocovar[:, 2] + # description + description = df_phenocovar[:, 2] - return Phenocov(traits, description) + return Phenocov(traits, description) end @@ -419,30 +433,30 @@ Creates a `Crossinfo` type/struct from crossinfo CSV file. Returns a `Crossinfo` type/struct. """ function get_crossinfo(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load control file - jsondict = parse_json(filename) + # load control file + jsondict = parse_json(filename) - # get crossdirection file - crossinfo_dict = check_key(jsondict, "cross_info") - crossinfofile = joinpath(data_dir, crossinfo_dict["file"]) + # get crossdirection file + crossinfo_dict = check_key(jsondict, "cross_info") + crossinfofile = joinpath(data_dir, crossinfo_dict["file"]) - # load crossdirection file - df_crossdirection = read_data( - crossinfofile; - delim=check_key(jsondict, "sep"), - comment=check_key(jsondict, "comment.char") - ) + # load crossdirection file + df_crossdirection = read_data( + crossinfofile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char"), + ) - # samples names - samples = df_crossdirection[:, 1] + # samples names + samples = df_crossdirection[:, 1] - # description - direction = df_crossdirection[:, 2] + # description + direction = df_crossdirection[:, 2] - return CrossInfo(samples, direction) + return CrossInfo(samples, direction) end @@ -460,19 +474,19 @@ Creates a `IsXChar` type/struct from gmap CSV file. Returns a `IsXChar` type/struct. """ function get_isxchar(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load control file - jsondict = parse_json(filename) + # load control file + jsondict = parse_json(filename) - # load gmap - gmap = get_gmap(filename) + # load gmap + gmap = get_gmap(filename) - # isXchar - isxchar = map(x -> x == "X", gmap.chr) + # isXchar + isxchar = map(x -> x == "X", gmap.chr) - return IsXChar(gmap.chr, isxchar) + return IsXChar(gmap.chr, isxchar) end @@ -490,30 +504,30 @@ Creates a `Covar` type/struct from gmap CSV file. Returns a `Covar` type/struct. """ function get_covar(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) - # load control file - jsondict = parse_json(filename) + # load control file + jsondict = parse_json(filename) - # get covar file - covarfile = check_key(jsondict, "covar") + # get covar file + covarfile = check_key(jsondict, "covar") - if ismissing(covarfile) - return Covar(covarfile) - end + if ismissing(covarfile) + return Covar(covarfile) + end - covarfile = joinpath(data_dir,) + covarfile = joinpath(data_dir) - # load covar file - df_covar = read_data( - covarfile; - delim=check_key(jsondict, "sep"), - comment=check_key(jsondict, "comment.char"), - missingstring=check_key(jsondict, "na.strings") - ) + # load covar file + df_covar = read_data( + covarfile; + delim = check_key(jsondict, "sep"), + comment = check_key(jsondict, "comment.char"), + missingstring = check_key(jsondict, "na.strings"), + ) - return Covar(df_covar) + return Covar(df_covar) end @@ -534,36 +548,36 @@ If control file does not stipulate sex information, we assume all female Returns a `IsFemale` type/struct. """ function get_isfemale(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) - - # load control file - jsondict = parse_json(filename) - - # get samples id - samples = get_crossinfo(filename).sample_id - - # initiate isfemale vector - isfemale = ones(Bool, length(samples)) - - # check for sex information - if (in("sex", keys(jsondict))) - sex_dict = jsondict["sex"] - else - @warn "No sex information; assuming all female" - return IsFemale(samples, isfemale) - end - - if (in("covar", keys(sex_dict))) - sex_covar = sex_dict["covar"] - df_covar = get_covar(filename) - idx_male = findall(x -> x == sex_dict["male"], df_covar[:, sex_covar]) - isfemale[idx_male] = false - else - throw("Error: covar-sex not found in control file") - end - - return IsFemale(samples, isfemale) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) + + # load control file + jsondict = parse_json(filename) + + # get samples id + samples = get_crossinfo(filename).sample_id + + # initiate isfemale vector + isfemale = ones(Bool, length(samples)) + + # check for sex information + if (in("sex", keys(jsondict))) + sex_dict = jsondict["sex"] + else + @warn "No sex information; assuming all female" + return IsFemale(samples, isfemale) + end + + if (in("covar", keys(sex_dict))) + sex_covar = sex_dict["covar"] + df_covar = get_covar(filename) + idx_male = findall(x -> x == sex_dict["male"], df_covar[:, sex_covar]) + isfemale[idx_male] = false + else + throw("Error: covar-sex not found in control file") + end + + return IsFemale(samples, isfemale) end @@ -581,33 +595,33 @@ Creates a `GeneticStudyData` type/struct from control file in json format. Returns a `GeneticStudyData` type/struct. """ function get_geneticstudydata(filename::String) - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) - - # load file - jsondict = parse_json(filename) - - # get data info - gmap = get_gmap(filename) - geno = get_geno(filename) - pmap = get_pmap(filename) - pheno = get_pheno(filename) - phenocov = get_phenocovar(filename) - isXchar = get_isxchar(filename) - isfemale = get_isfemale(filename) - crossinfo = get_crossinfo(filename) - covar = get_covar(filename) - - return GeneticStudyData( - gmap, - geno, - pmap, - pheno, - phenocov, - isXchar, - isfemale, - crossinfo, - covar - ) + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) + + # load file + jsondict = parse_json(filename) + + # get data info + gmap = get_gmap(filename) + geno = get_geno(filename) + pmap = get_pmap(filename) + pheno = get_pheno(filename) + phenocov = get_phenocovar(filename) + isXchar = get_isxchar(filename) + isfemale = get_isfemale(filename) + crossinfo = get_crossinfo(filename) + covar = get_covar(filename) + + return GeneticStudyData( + gmap, + geno, + pmap, + pheno, + phenocov, + isXchar, + isfemale, + crossinfo, + covar, + ) end diff --git a/test/test_gf.jl b/test/test_gf.jl index 3f44b11..e40dd25 100644 --- a/test/test_gf.jl +++ b/test/test_gf.jl @@ -15,183 +15,107 @@ data_dir = joinpath(@__DIR__, "data/BXD/"); file = joinpath(data_dir, "bxd.json"); jsondict = BigRiverQTL.parse_json(file) - - # check pheno file exists - if (in("pheno", keys(jsondict))) - phenofile = joinpath(data_dir, jsondict["pheno"]) - else - throw("Error: pheno not found in control file") - end - - # load file - - df_pheno = CSV.read(phenofile, DataFrame; comment="#") - - df_pheno = BigRiverQTL.read_data( - phenofile; - missingstring="NA"); - - unit = "" - f = open(pmapfile, "r") - - s = readline(f) - s_lower = lowercase(s) # Convert the input string to lowercase - if occursin("mbp", s_lower) - unit = "Mbp" - elseif occursin("mb", s_lower) - unit = "Mb" - elseif occursin("cm", s_lower) - unit = "cM" - else - @warn "No unit detected!" - end - - close(f) - - unit - - # function get_geno(filename::String) -filename = file - # check if filename is directory or file name - data_dir, filename = get_control_file(filename) - - # load file - jsondict = BigRiverQTL.parse_json(filename) - - # check gmap file exists - if (in("gmap", keys(jsondict))) - gmapfile = joinpath(data_dir, jsondict["gmap"]) - else - throw("Error: gmap not found in control file") - end - - # check geno file exists - if (in("geno", keys(jsondict))) - genofile = joinpath(data_dir, jsondict["geno"]) - else - throw("Error: geno not found in control file") + filename = file + # check if filename is directory or file name + data_dir, filename = get_control_file(filename) + + # load control file + jsondict = BigRiverQTL.parse_json(filename) + + # get gmap file + gmapfile = joinpath(data_dir, BigRiverQTL.check_key(jsondict, "gmap")) + + # get geno file + genofile = joinpath(data_dir, BigRiverQTL.check_key(jsondict, "geno")) + + # get gmap object + gmap = get_gmap(filename) + + # load gmap dataframe + df_gmap = BigRiverQTL.read_data( + gmapfile; + delim= BigRiverQTL.check_key(jsondict, "sep"), + comment=BigRiverQTL.check_key(jsondict, "comment.char") + ) + + # load geno dataframe + df_geno = BigRiverQTL.read_data( + genofile; + delim=BigRiverQTL.check_key(jsondict, "sep"), + comment=BigRiverQTL.check_key(jsondict, "comment.char") + ) + + # check if geno data are transposed + if jsondict["geno_transposed"] + # get samples names + samples = names(df_geno)[2:end] + # get values + mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) + # get markers names + marker = df_geno.marker[:] + else + # get samples names + samples = df_geno[:, 1] + # get values + mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) |> permutedims + # get markers names + marker = names(df_geno)[2:end] + end + + # get chromosomes names + chr = gmap.chr + + #genotype + genotype = get_genotype(filename) + + # values + # get encoded genetotypes + df_encoded = DataFrame( + encode_genotype(genotype.label, mat_geno), + samples, + ) + + # built encoded geno DataFrames + insertcols!(df_encoded, 1, :marker => marker) + + gdf = innerjoin( + select(df_gmap, [:chr, :marker]), + df_encoded, + on=:marker, + ) |> x -> groupby(x, :chr) + + val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf] + # marker + marker = [String.(group.marker) for group in gdf] + # crosstype + crosstype = get_crosstype(filename) + + # alleles + alleles = get_alleles(filename) + + # genotranspose + genotranspose = get_genotranspose(filename) + + Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose) + + + struct Geno2 + sample_id::Vector{AbstractString} + chr::Vector{AbstractString} + marker_name::Vector{Vector{AbstractString}} end - # get gmap object - gmap = get_gmap(file) - - # load gmap dataframe - df_gmap = read_data(gmapfile) - - # load geno dataframe - df_geno = read_data(genofile) - - # get samples names - samples = names(df_geno)[2:end] - - # get chromosomes names - chr = gmap.chr - - # get markers names - marker = gmap.marker_name - - #genotype - genotype = get_genotype(filename) - - # values - # get encoded genetotypes - df_encoded = DataFrame( - encode_genotype(genotype.label, df_geno[:, 2:end] |> x -> Matrix(x)), - samples, - ) - - # built encoded geno DataFrames - insertcols!(df_encoded, 1, :marker => df_geno.marker) - - gdf = innerjoin( - select(df_gmap, [:chr, :marker]), - df_encoded, - on = :marker - ) |> x -> groupby(x, :chr); - - val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf]; - - #crosstype - crosstype = get_crosstype(filename) - - #alleles - alleles = get_alleles(filename) - - #genotranspose - genotranspose = get_genotranspose(filename) - - # return Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose) -# end - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# Transforming data to a optimised and accessible data type -# data = get_geneticstudydata(file); - -# check if filename is directory or file name -data_dir, filename = BigRiverQTL.get_control_file(file) - -# load file -jsondict = BigRiverQTL.parse_json(filename) - - -crossinfofile = joinpath(data_dir, jsondict["cross_info"]["file"]) - - # make type - - # samples - df_crossdirection = BigRiverQTL.read_data(crossinfofile) - get_crossinfo(crossinfofile) - - - samples = get_crossinfo(crossinfofile).sample_id - - # isfemale - - isfemale = ones(Bool, length(samples)) - if (in("sex", keys(jsondict))) - isfemale[findall(x -> x == "Male", jsondict["sex"])] = 0 + struct Geno4 + sample_id::Vector{AbstractString} + chr::Vector{AbstractString} + # marker_name::Vector{Vector{AbstractString}} end + Geno2(samples, chr, marker) + Geno4(samples, chr) - - - + gmap.marker \ No newline at end of file From c6959cc01a77b5a240d756959e07cc569ab6fb81 Mon Sep 17 00:00:00 2001 From: GregFa Date: Mon, 19 Aug 2024 17:10:48 -0500 Subject: [PATCH 2/6] corrected code usng transposed geno --- example/example_qtl.ipynb | 5 ++++- example/example_qtl.jl | 8 ++++++-- src/io/export_to_type.jl | 4 ++-- src/struct/datastructure.jl | 7 +++++-- test/datastruc_test.jl | 2 +- test/plots_test.jl | 10 ++++------ 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/example/example_qtl.ipynb b/example/example_qtl.ipynb index 4d6fcf7..26e06a2 100644 --- a/example/example_qtl.ipynb +++ b/example/example_qtl.ipynb @@ -127,7 +127,10 @@ "# We can get the genotype matrix using the following command:\n", "geno = reduce(hcat, data.geno.val);\n", "# For computing reasons, we need to convert the geno matrix in Float64\n", - "geno_processed = convert(Array{Float64}, geno);" + "geno_processed = geno .- 1.0 |>\n", + " x -> replace(x, missing => 0.5) |>\n", + "\n", + "# geno_processed = convert(Matrix{Float64}, geno_processed);\n" ] }, { diff --git a/example/example_qtl.jl b/example/example_qtl.jl index 20a2c4e..a2f37d3 100644 --- a/example/example_qtl.jl +++ b/example/example_qtl.jl @@ -6,7 +6,7 @@ # extension: .jl # format_name: light # format_version: '1.5' -# jupytext_version: 1.16.2 +# jupytext_version: 1.16.4 # kernelspec: # display_name: Julia 1.10.4 # language: julia @@ -62,7 +62,11 @@ pheno = data.pheno.val; # We can get the genotype matrix using the following command: geno = reduce(hcat, data.geno.val); # For computing reasons, we need to convert the geno matrix in Float64 -geno_processed = convert(Array{Float64}, geno); +geno_processed = geno .- 1.0 |> + x -> replace(x, missing => 0.5) |> + +# geno_processed = convert(Matrix{Float64}, geno_processed); + # - # #### Preprocessing diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl index 7a7424e..da137be 100644 --- a/src/io/export_to_type.jl +++ b/src/io/export_to_type.jl @@ -217,7 +217,7 @@ function get_geno(filename::String) # get samples names samples = names(df_geno)[2:end] # get values - mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) + mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) # get markers names marker = df_geno.marker else @@ -251,7 +251,7 @@ function get_geno(filename::String) on = :marker, ) |> x -> groupby(x, :chr) - val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf] + val = [permutedims(Matrix(select(group, Not(:marker, :chr)))) for group in gdf] # marker marker = [String.(group.marker) for group in gdf] diff --git a/src/struct/datastructure.jl b/src/struct/datastructure.jl index 49bfc01..799bac1 100644 --- a/src/struct/datastructure.jl +++ b/src/struct/datastructure.jl @@ -67,6 +67,11 @@ end * `chr` contains chromosome names. * `marker_name` contains marker names for each chromosome. * `val` is a vector of matrices containing allele information in each chromosome. + In each matrix, the rows are the samples and the coulmns are the markers. +* `crosstype` is a field of type `CrossType`. Refer to `CrossType` type for more imformation. +* `alleles` is a field of type `Alleles`. Refer to `Alleles` type for more imformation. +* `genotype` is a field of type `Genotype`. Refer to `Genotype` type for more imformation. +* `genotranspose` is a field of type `GenoTranspose`. Refer to `GenoTranspose` type for more imformation. """ struct Geno sample_id::Vector{AbstractString} @@ -77,7 +82,6 @@ struct Geno alleles::Alleles geno_type::GenoType geno_transpose::GenoTranspose - end @@ -129,7 +133,6 @@ struct Covar end - """ IsFemale type indicates if the sample_id (genotypes or individuals) are females. * `sample_id` contains sample names such as genotypes or individual IDs. diff --git a/test/datastruc_test.jl b/test/datastruc_test.jl index 70f4528..201b74d 100644 --- a/test/datastruc_test.jl +++ b/test/datastruc_test.jl @@ -80,7 +80,7 @@ end @testset "Geno Tests" begin geno = get_geno(file) @test length(geno.sample_id) == 198 - @test size(geno.val[1]) == (636, 198) + @test size(geno.val[1]) == (198, 636) end diff --git a/test/plots_test.jl b/test/plots_test.jl index 7cee95f..371bd5b 100644 --- a/test/plots_test.jl +++ b/test/plots_test.jl @@ -11,11 +11,9 @@ gInfo = data.gmap; pInfo = data.phenocov; # pheno=data.pheno; pheno = data.pheno.val; -geno = reduce(vcat, data.geno.val); -geno_processed = geno .- 1.0 -replace!(geno_processed, missing => 0.5); -geno_processed = convert(Matrix{Float64}, geno_processed); -geno_processed = permutedims(geno_processed); +geno = reduce(hcat, data.geno.val); +geno_processed = geno .- 1.0 |> + x -> replace(x, missing => 0.5) ################# # Preprocessing # @@ -36,7 +34,7 @@ kinship = kinship_gs(geno_processed, 0.99); # Scan # ######## -single_results_perms = BigRiverQTL.BulkLMM.scan( +single_results_perms = BigRiverQTL.scan( pheno_y2, geno_processed, kinship; From 0aed57f32bebe2887e42f7009c3a93a14101ab37 Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Tue, 20 Aug 2024 12:02:38 -0500 Subject: [PATCH 3/6] changed name plots_utils to plots_helpers --- src/plots/plots_helpers.jl | 49 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 src/plots/plots_helpers.jl diff --git a/src/plots/plots_helpers.jl b/src/plots/plots_helpers.jl new file mode 100644 index 0000000..0fb357d --- /dev/null +++ b/src/plots/plots_helpers.jl @@ -0,0 +1,49 @@ +function gmap2df(gmap::Gmap) + # Locus = reduce(vcat,gmap.marker_name); + # Mb= reduce(vcat,gmap.pos); + start=0 + Chr =repeat(["0"],sum(length.(gmap.marker_name))) + + # Pos =repeat(["Missing"],sum(length.(gmap.marker_name))) + + for i in eachindex(gmap.chr) + l_i=length(gmap.marker_name[i]) + Chr[start+1:l_i+start] .= gmap.chr[i] + start=start+l_i + end + df=DataFrame( + Locus = reduce(vcat,gmap.marker_name), + Chr = Chr, + Pos= reduce(vcat,gmap.pos) + ) + # Cm = repeat(["Missing"],sum(length.(gmap.marker_name))) + return df + +end + + + + + +function pmap2df(pmap::Pmap) + # Locus = reduce(vcat,gmap.marker_name); + # Mb= reduce(vcat,gmap.pos); + start=0 + Chr =repeat(["0"],sum(length.(gmap.marker_name))) + + # Pos =repeat(["Missing"],sum(length.(gmap.marker_name))) + + for i in eachindex(pmap.chromosomes) + l_i=length(pmap.marker_name[i]) + Chr[start+1:l_i+start] .= pmap.chr[i] + start=start+l_i + end + df=DataFrame( + Locus = reduce(vcat,pmap.marker_name), + Chr = Chr, + Pos= reduce(vcat,pmap.pos) + ) + # Cm = repeat(["Missing"],sum(length.(gmap.marker_name))) + return df + +end \ No newline at end of file From 3a1ce70f10404b116e99e571decbe14ba6cf0bf1 Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Tue, 20 Aug 2024 12:11:37 -0500 Subject: [PATCH 4/6] rename io_utils to io_helpers --- src/BigRiverQTL.jl | 4 +-- src/io/{io_utils.jl => io_helpers.jl} | 0 src/plots/plots_utils.jl | 49 --------------------------- 3 files changed, 2 insertions(+), 51 deletions(-) rename src/io/{io_utils.jl => io_helpers.jl} (100%) delete mode 100644 src/plots/plots_utils.jl diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl index 9398049..17bba3b 100644 --- a/src/BigRiverQTL.jl +++ b/src/BigRiverQTL.jl @@ -62,7 +62,7 @@ module BigRiverQTL ###### # IO # ###### - include("io/io_utils.jl") + include("io/io_helpers.jl") export get_control_file, encode_genotype include("io/export_to_type.jl") export get_geneticstudydata @@ -73,7 +73,7 @@ module BigRiverQTL ######### # Plots # ######### - include("plots/plots_utils.jl") + include("plots/plots_helpers.jl") export gmap2df, pmap2df include("plots/plots_qtl.jl") diff --git a/src/io/io_utils.jl b/src/io/io_helpers.jl similarity index 100% rename from src/io/io_utils.jl rename to src/io/io_helpers.jl diff --git a/src/plots/plots_utils.jl b/src/plots/plots_utils.jl deleted file mode 100644 index 0fb357d..0000000 --- a/src/plots/plots_utils.jl +++ /dev/null @@ -1,49 +0,0 @@ -function gmap2df(gmap::Gmap) - # Locus = reduce(vcat,gmap.marker_name); - # Mb= reduce(vcat,gmap.pos); - start=0 - Chr =repeat(["0"],sum(length.(gmap.marker_name))) - - # Pos =repeat(["Missing"],sum(length.(gmap.marker_name))) - - for i in eachindex(gmap.chr) - l_i=length(gmap.marker_name[i]) - Chr[start+1:l_i+start] .= gmap.chr[i] - start=start+l_i - end - df=DataFrame( - Locus = reduce(vcat,gmap.marker_name), - Chr = Chr, - Pos= reduce(vcat,gmap.pos) - ) - # Cm = repeat(["Missing"],sum(length.(gmap.marker_name))) - return df - -end - - - - - -function pmap2df(pmap::Pmap) - # Locus = reduce(vcat,gmap.marker_name); - # Mb= reduce(vcat,gmap.pos); - start=0 - Chr =repeat(["0"],sum(length.(gmap.marker_name))) - - # Pos =repeat(["Missing"],sum(length.(gmap.marker_name))) - - for i in eachindex(pmap.chromosomes) - l_i=length(pmap.marker_name[i]) - Chr[start+1:l_i+start] .= pmap.chr[i] - start=start+l_i - end - df=DataFrame( - Locus = reduce(vcat,pmap.marker_name), - Chr = Chr, - Pos= reduce(vcat,pmap.pos) - ) - # Cm = repeat(["Missing"],sum(length.(gmap.marker_name))) - return df - -end \ No newline at end of file From 6e5ed8370da7061419a0478506609a657c61619f Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Tue, 20 Aug 2024 23:34:18 -0500 Subject: [PATCH 5/6] added wrangling_utils --- src/BigRiverQTL.jl | 7 ++ src/io/export_to_type.jl | 41 ++++++---- src/utils/wrangling_utils.jl | 147 +++++++++++++++++++++++++++++++++++ test/utils_test.jl | 35 +++++++++ 4 files changed, 216 insertions(+), 14 deletions(-) create mode 100644 src/utils/wrangling_utils.jl diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl index 17bba3b..1f78056 100644 --- a/src/BigRiverQTL.jl +++ b/src/BigRiverQTL.jl @@ -64,6 +64,7 @@ module BigRiverQTL ###### include("io/io_helpers.jl") export get_control_file, encode_genotype + include("io/export_to_type.jl") export get_geneticstudydata export get_gmap, get_alleles, get_chromosome, get_crossinfo, get_crosstype @@ -85,4 +86,10 @@ module BigRiverQTL include("plots/plots_eqtl.jl") export plot_eQTL + ######### + # Utils # + ######### + include("utils/wrangling_utils.jl") + export get_geno_completecases, summary_missing + end diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl index da137be..0210a78 100644 --- a/src/io/export_to_type.jl +++ b/src/io/export_to_type.jl @@ -213,22 +213,35 @@ function get_geno(filename::String) ) # check if geno data are transposed - if jsondict["geno_transposed"] - # get samples names - samples = names(df_geno)[2:end] - # get values - mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) - # get markers names - marker = df_geno.marker - else - # get samples names - samples = df_geno[:, 1] - # get values - mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) |> permutedims - # get markers names - marker = names(df_geno)[2:end] + if !jsondict["geno_transposed"] + # permutedims + df_geno = permutedims(df_geno, 1, "marker") end + # get samples names + samples = names(df_geno)[2:end] + # get values + mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) + # get markers names + marker = df_geno[:, 1] + + # # check if geno data are transposed + # if jsondict["geno_transposed"] + # # get samples names + # samples = names(df_geno)[2:end] + # # get values + # mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) + # # get markers names + # marker = df_geno.marker + # else + # # get samples names + # samples = df_geno[:, 1] + # # get values + # mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) |> permutedims + # # get markers names + # marker = names(df_geno)[2:end] + # end + # get chromosomes names chr = gmap.chr diff --git a/src/utils/wrangling_utils.jl b/src/utils/wrangling_utils.jl new file mode 100644 index 0000000..ab4a72d --- /dev/null +++ b/src/utils/wrangling_utils.jl @@ -0,0 +1,147 @@ + + +""" + get_geno_completecases(geno_missing::Geno) -> Geno + +Identify and remove rows or columns from a genotype data structure `Geno` containing missing values, +prioritizing those with the highest percentage of missing data. + +# Arguments +- `geno_missing::Geno`: A genotype data structure containing possibly missing values. + +# Returns +- `Geno`: A new genotype data structure with the same fields as the input, but where rows and columns +with missing data have been systematically removed until no missing data remains. + +# Description + +The core of the function involves iteratively identifying and removing rows or columns with the highest percentage of missing data. +This decision is based on whether the maximum percentage of missing data across rows exceeds that across columns. +Rows or columns are removed until there are no missing values left in the matrix. + +After cleaning, the function reconstructs the `Geno` data structure. +It ensures the data aligns with the expected format by adjusting the dimensions of the data, regrouping by chromosomes, +and ensuring marker and sample identifiers are correctly aligned. + +""" +function get_geno_completecases(geno_missing::Geno) + geno = deepcopy(geno_missing) + + # get geno values + mat_geno = reduce(hcat, geno.val) + + # build dataframe + df_geno = DataFrame( + mat_geno, + reduce(vcat, geno.marker_name), + ) + insertcols!(df_geno, 1, :samples => geno.sample_id) + + # get ismissing matrix + mat_geno_missing = ismissing.(mat_geno) .* 1 + + # get complete cases + while sum(mat_geno_missing) > 0 + + missing_row_wise = sum(mat_geno_missing, dims = 2) .* 100 / (size(mat_geno, 2)) |> vec + missing_col_wise = sum(mat_geno_missing, dims = 1) .* 100 / (size(mat_geno, 1)) |> vec + + if (maximum(missing_row_wise) > maximum(missing_col_wise)) + idx_missing = getindex(findmax(missing_row_wise), 2) + deleteat!(df_geno, idx_missing) + else + idx_missing = getindex(findmax(missing_col_wise), 2) + select!(df_geno, Not(idx_missing)) + end + + mat_geno = Matrix(df_geno) + mat_geno_missing = ismissing.(mat_geno) .* 1 + end + + # permute dataframe to fit Geno type/struct + df_geno = permutedims(df_geno, 1, "marker") + + # get samples names + samples = names(df_geno)[2:end] + + # get df_gmap: chromosomes-markers + chr = deepcopy(geno.marker_name) + for i in eachindex(geno.chr) + chr[i][:] .= geno.chr[i] + end + + df_gmap = DataFrame( + chr = reduce(vcat, chr), + marker = reduce(vcat, geno.marker_name), + ) + + # get data per chromosome + gdf = innerjoin( + df_gmap, + df_geno, + on = :marker, + ) |> x -> groupby(x, :chr) + + + # permute back such rows are samples and columns ar markers + val = [permutedims(Matrix(select(group, Not(:marker, :chr)))) for group in gdf] + + # marker + marker = [String.(group.marker) for group in gdf] + + return Geno( + samples, + geno.chr, + marker, + val, + geno.cross_type, + geno.alleles, + geno.geno_type, + geno.geno_transpose, + ) +end + + +""" + summary_missing(geno_missing::Geno; issorted::Bool = false) -> (missing_per_row::DataFrame, missing_per_col::DataFrame) + +Calculate the percentage of missing data for each sample and each marker in a genetic dataset. + +# Arguments +- `geno_missing::Geno`: A genetic data structure of type Geno. +- `issorted::Bool = false`: An optional boolean flag indicating whether to sort the results by + the percentage of missing data in descending order. + +# Returns +- `(missing_per_row::DataFrame, missing_per_col::DataFrame)`: A tuple of two `DataFrame`s: + - `missing_per_row`: Each row represents a sample with columns for sample identifiers and the percentage of missing data. + - `missing_per_col`: Each row represents a marker with columns for marker names and the percentage of missing data. +""" +function summary_missing(geno_missing::Geno; issorted::Bool = false) + # get geno values + mat_geno = reduce(hcat, geno_missing.val) + + # get ismissing matrix + mat_geno_missing = ismissing.(mat_geno) .* 1 + + missing_row_wise = sum(mat_geno_missing, dims = 2) .* 100 / (size(mat_geno, 2)) |> vec + missing_col_wise = sum(mat_geno_missing, dims = 1) .* 100 / (size(mat_geno, 1)) |> vec + + # build dataframe + df_row_missing = DataFrame( + samples = geno_missing.sample_id, + percentage = round.(missing_row_wise, digits = 2), + ) + + df_col_missing = DataFrame( + markers = reduce(vcat, geno_missing.marker_name), + percentage = round.(missing_col_wise, digits = 2), + ) + + if issorted + sort!(df_row_missing, [:percentage], rev = true) + sort!(df_col_missing, [:percentage], rev = true) + end + + return (missing_per_row = df_row_missing, missing_per_col = df_col_missing) +end \ No newline at end of file diff --git a/test/utils_test.jl b/test/utils_test.jl index 0a939a5..856149f 100644 --- a/test/utils_test.jl +++ b/test/utils_test.jl @@ -27,3 +27,38 @@ end encoded_output = encode_genotype(geno_dict, geno_val) @test sum(ismissing.(encoded_output)) == 4 end + + +################ +# Test missing # +################ + +test_geno = Geno( + ["sample1", "sample2", "sample3"], + ["1", "2"], + [["marker1", "marker2", "marker3"], + ["marker4", "marker5", "marker6", "marker7"]], + [[1 2 1; missing missing 2; 1 2 2], + [1 2 2 1; missing missing 2 1; 2 1 2 missing]], + CrossType("risib"), + Alleles(["B", "D"]), + GenoType(Dict{String, Any}("B" => 1, "D" => 2)), + GenoTranspose(true) +); +test_geno_cleaned = get_geno_completecases(test_geno); + +# Test the `get_geno_completecases` function +@testset "Testing get_geno_completecases function" begin + @test typeof(test_geno_cleaned) == typeof(test_geno) # Check type consistency + @test sum(ismissing.(reduce(hcat, test_geno_cleaned.val))) == 0 # Ensure no missing values + @test length(test_geno_cleaned.sample_id) == 2 # Check if some rows might have been removed + @test length(test_geno_cleaned.marker_name[2]) == 3 # Check if some columns might have been removed +end + +# Test the `summary_missing` function +@testset "Testing summary_missing function" begin + tbl_missing = summary_missing(test_geno, issorted = true); + + @test tbl_missing[1].percentage[1] == 57.14 # Check missing percentge row wise + @test tbl_missing.missing_per_col.percentage[1] == 33.33 # Check missing percentge column wise +end \ No newline at end of file From 1b40225345ab57d4a7571b1095cc3665cd80bb32 Mon Sep 17 00:00:00 2001 From: Gregory Farage Date: Wed, 21 Aug 2024 01:11:09 -0500 Subject: [PATCH 6/6] added selection samples and marker --- src/BigRiverQTL.jl | 2 +- src/utils/wrangling_utils.jl | 167 ++++++++++++++++++++++++++++++++++- test/runtests.jl | 2 +- test/utils_test.jl | 39 +++++++- 4 files changed, 204 insertions(+), 6 deletions(-) diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl index 1f78056..c47bb3e 100644 --- a/src/BigRiverQTL.jl +++ b/src/BigRiverQTL.jl @@ -90,6 +90,6 @@ module BigRiverQTL # Utils # ######### include("utils/wrangling_utils.jl") - export get_geno_completecases, summary_missing + export get_geno_completecases, summary_missing, select_sample, select_marker end diff --git a/src/utils/wrangling_utils.jl b/src/utils/wrangling_utils.jl index ab4a72d..d277a1d 100644 --- a/src/utils/wrangling_utils.jl +++ b/src/utils/wrangling_utils.jl @@ -1,4 +1,166 @@ +""" + select_sample(geno_selection::Geno, markers_selection:: Union{Vector{String}, InvertedIndex{Vector{String}}}) -> Geno + +Select and return a subset of genetic samples from a `Geno` object based on specified samples names or their indices. + +# Arguments +- `geno_selection`: A `Geno` type or struct that contains genetic data. +- `samples_selection`: A vector of samples names or an inverted index of samples names + to be selected from the `geno_selection`. The selection is applied to the samples names in the `Geno` object. + +# Returns +- `Geno`: A new `Geno` object that contains only the selected samples, their corresponding genetic information, +and the subset of the original genotypes related to these samples. + +""" +function select_sample(geno_selection::Geno, sample_selection:: Union{Vector{String}, InvertedIndex{Vector{String}}}) + + geno = deepcopy(geno_selection) + + # get geno values + mat_geno = reduce(hcat, geno.val) + + # build dataframe + df_geno = DataFrame( + mat_geno, + reduce(vcat, geno.marker_name), + ) + insertcols!(df_geno, 1, :samples => geno.sample_id) + + # permute dataframe to fit Geno type/struct + df_geno = permutedims(df_geno, 1, "marker") + + # selection + if (typeof(sample_selection) <: InvertedIndex{Vector{String}}) + select!(df_geno, sample_selection); + else + select!(df_geno, vcat(["marker"], sample_selection)); + end + + # get samples names + samples = names(df_geno)[2:end] + + # get df_gmap: chromosomes-markers + chr = deepcopy(geno.marker_name) + for i in eachindex(geno.chr) + chr[i][:] .= geno.chr[i] + end + + df_gmap = DataFrame( + chr = reduce(vcat, chr), + marker = reduce(vcat, geno.marker_name), + ) + + # get data per chromosome + gdf = innerjoin( + df_gmap, + df_geno, + on = :marker, + ) |> x -> groupby(x, :chr) + + + # permute back such rows are samples and columns ar markers + val = [permutedims(Matrix(select(group, Not(:marker, :chr)))) for group in gdf] + + # marker + marker = [String.(group.marker) for group in gdf] + + return Geno( + samples, + geno.chr, + marker, + val, + geno.cross_type, + geno.alleles, + geno.geno_type, + geno.geno_transpose, + ) +end + + +""" + select_marker(geno_selection::Geno, markers_selection:: Union{Vector{String}, InvertedIndex{Vector{String}}}) -> Geno + +Select and return a subset of genetic markers from a `Geno` object based on specified marker names or their indices. + +# Arguments +- `geno_selection`: A `Geno` type or struct that contains genetic data. +- `markers_selection`: A vector of marker names or an inverted index of marker names + to be selected from the `geno_selection`. The selection is applied to the marker names in the `Geno` object. + +# Returns +- `Geno`: A new `Geno` object that contains only the selected markers, their corresponding genetic information, +and the subset of the original genotypes related to these markers. + +""" +function select_marker(geno_selection::Geno, marker_selection:: Union{Vector{String}, InvertedIndex{Vector{String}}}) + + geno = deepcopy(geno_selection) + + # get geno values + mat_geno = reduce(hcat, geno.val) + + # build dataframe + df_geno = DataFrame( + mat_geno, + reduce(vcat, geno.marker_name), + ) + insertcols!(df_geno, 1, :samples => geno.sample_id) + + # selection + if (typeof(marker_selection) <: InvertedIndex{Vector{String}}) + select!(df_geno, marker_selection); + else + select!(df_geno, vcat(["samples"], marker_selection)); + end + + # permute dataframe to fit Geno type/struct + df_geno = permutedims(df_geno, 1, "marker") + + # get samples names + samples = names(df_geno)[2:end] + + # get df_gmap: chromosomes-markers + chr = deepcopy(geno.marker_name) + for i in eachindex(geno.chr) + chr[i][:] .= geno.chr[i] + end + + df_gmap = DataFrame( + chr = reduce(vcat, chr), + marker = reduce(vcat, geno.marker_name), + ) + + # get data per chromosome + gdf = innerjoin( + df_gmap, + df_geno, + on = :marker, + ) |> x -> groupby(x, :chr) + + + # permute back such rows are samples and columns ar markers + val = [permutedims(Matrix(select(group, Not(:marker, :chr)))) for group in gdf] + + # marker + marker = [String.(group.marker) for group in gdf] + + # chromosome + chr = [group.chr[1] for group in gdf] + + return Geno( + samples, + chr, + marker, + val, + geno.cross_type, + geno.alleles, + geno.geno_type, + geno.geno_transpose, + ) +end + """ get_geno_completecases(geno_missing::Geno) -> Geno @@ -89,9 +251,12 @@ function get_geno_completecases(geno_missing::Geno) # marker marker = [String.(group.marker) for group in gdf] + # chromosome + chr = [group.chr[1] for group in gdf] + return Geno( samples, - geno.chr, + chr, marker, val, geno.cross_type, diff --git a/test/runtests.jl b/test/runtests.jl index 4f2ad3a..afdfe9a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,7 +5,7 @@ using BigRiverQTL, LinearAlgebra using Statistics using Test -using DelimitedFiles, Plots +using DelimitedFiles, DataFrames, Plots ######## # Test # diff --git a/test/utils_test.jl b/test/utils_test.jl index 856149f..8d3fdb2 100644 --- a/test/utils_test.jl +++ b/test/utils_test.jl @@ -59,6 +59,39 @@ end @testset "Testing summary_missing function" begin tbl_missing = summary_missing(test_geno, issorted = true); - @test tbl_missing[1].percentage[1] == 57.14 # Check missing percentge row wise - @test tbl_missing.missing_per_col.percentage[1] == 33.33 # Check missing percentge column wise -end \ No newline at end of file + @test tbl_missing[1].percentage[1] == 57.14 # Check missing percentage row wise + @test tbl_missing.missing_per_col.percentage[1] == 33.33 # Check missing percentage column wise +end + + +################## +# Test selection # +################## + +# Test the `select_samples` function +@testset "Testing select_samples function" begin + geno_subset_1 = select_marker(test_geno, ["marker2", "marker5"]); + geno_subset_2 = select_marker(test_geno, Not(["marker3"])); + geno_subset_3 = select_marker(test_geno, ["marker2"]); + + @test geno_subset_1.marker_name[1] == ["marker2"] # Check sample selection + @test size(geno_subset_1.val[1], 2) == 1 # Check size of the geno matrix + @test geno_subset_2.marker_name[1] == ["marker1", "marker2"] # Check sample selection + @test size(geno_subset_2.val[1], 2) == 2 # Check size of the geno matrix + @test geno_subset_3.marker_name[1] == ["marker2"] # Check sample selection + @test size(geno_subset_3.val[1], 2) == 1 # Check size of the geno matrix + @test length(geno_subset_3.chr) == 1 # Check size of the geno matrix +end + + +# Test the `select_sample` function +@testset "Testing select_samples function" begin + geno_subset_1 = select_sample(test_geno, ["sample1"]); + geno_subset_2 = select_sample(test_geno, Not(["sample3"])); + + @test geno_subset_1.sample_id == ["sample1"] # Check sample selection + @test size(geno_subset_1.val[1], 1) == 1 # Check size of the geno matrix + @test geno_subset_2.sample_id == ["sample1", "sample2"] # Check sample selection + @test size(geno_subset_2.val[1], 1) == 2 # Check size of the geno matrix + +end