From a4b58bee89bb61a97bb71e46de2786fd8658213a Mon Sep 17 00:00:00 2001
From: GregFa <faragegr@gmail.com>
Date: Mon, 19 Aug 2024 16:34:52 -0500
Subject: [PATCH 1/6] used geno_transposed value in get_geno

---
 src/io/export_to_type.jl | 638 ++++++++++++++++++++-------------------
 test/test_gf.jl          | 266 ++++++----------
 2 files changed, 421 insertions(+), 483 deletions(-)

diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl
index 067b67e..7a7424e 100644
--- a/src/io/export_to_type.jl
+++ b/src/io/export_to_type.jl
@@ -15,35 +15,35 @@ Returns a `Gmap` type/struct.
 
 """
 function get_gmap(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load control file   
-    jsondict = parse_json(filename)
+	# load control file   
+	jsondict = parse_json(filename)
 
-    #  get gmap file
-    gmapfile = joinpath(data_dir, check_key(jsondict, "gmap"))
+	#  get gmap file
+	gmapfile = joinpath(data_dir, check_key(jsondict, "gmap"))
 
-    # load gmap file   
-    gdf = groupby(
-        read_data(
-            gmapfile;
-            delim=check_key(jsondict, "sep"),
-            comment=check_key(jsondict, "comment.char")
-        ),
-        :chr,
-    )
+	# load gmap file   
+	gdf = groupby(
+		read_data(
+			gmapfile;
+			delim = check_key(jsondict, "sep"),
+			comment = check_key(jsondict, "comment.char"),
+		),
+		:chr,
+	)
 
-    # chromosome
-    chr = [group.chr[1] for group in gdf]
+	# chromosome
+	chr = [group.chr[1] for group in gdf]
 
-    # marker
-    marker = [String.(group.marker) for group in gdf]
+	# marker
+	marker = [String.(group.marker) for group in gdf]
 
-    # relative position
-    pos = [group.pos for group in gdf]
+	# relative position
+	pos = [group.pos for group in gdf]
 
-    return Gmap(chr, marker, pos)
+	return Gmap(chr, marker, pos)
 end
 
 
@@ -65,16 +65,16 @@ Returns a `CrossType` type/struct.
 
 """
 function get_crosstype(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load control file   
-    jsondict = BigRiverQTL.parse_json(filename)
+	# load control file   
+	jsondict = BigRiverQTL.parse_json(filename)
 
-    # get crosstype
-    crosstype = check_key(jsondict, "crosstype")
+	# get crosstype
+	crosstype = check_key(jsondict, "crosstype")
 
-    return CrossType(crosstype)
+	return CrossType(crosstype)
 end
 
 
@@ -92,16 +92,16 @@ Creates a `Alleles` type/struct from  control file in json format.
 Returns a `Alleles` type/struct.
 """
 function get_alleles(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load control file   
-    jsondict = BigRiverQTL.parse_json(filename)
+	# load control file   
+	jsondict = BigRiverQTL.parse_json(filename)
 
-    #alleles
-    alleles = check_key(jsondict, "alleles")
+	#alleles
+	alleles = check_key(jsondict, "alleles")
 
-    return Alleles(alleles)
+	return Alleles(alleles)
 end
 
 
@@ -119,21 +119,21 @@ Creates a `GenoType` type/struct from  control file in json format.
 Returns a `GenoType` type/struct.
 """
 function get_genotype(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load file   
-    jsondict = BigRiverQTL.parse_json(filename)
+	# load file   
+	jsondict = BigRiverQTL.parse_json(filename)
 
-    # genotype
-    if (in("genotypes", keys(jsondict)))
-        label = jsondict["genotypes"]
-    else
-        # Rqtl2 default
-        label = Dict{String,Int}("A" => 1, "H" => 2, "B" => 3, "D" => 4, "C" => 5)
-    end
+	# genotype
+	if (in("genotypes", keys(jsondict)))
+		label = jsondict["genotypes"]
+	else
+		# Rqtl2 default
+		label = Dict{String, Int}("A" => 1, "H" => 2, "B" => 3, "D" => 4, "C" => 5)
+	end
 
-    return GenoType(label)
+	return GenoType(label)
 end
 
 
@@ -151,20 +151,20 @@ Creates a `GenoTranspose` type/struct from  control file in json format.
 Returns a `GenoTranspose` type/struct.
 """
 function get_genotranspose(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load file   
-    jsondict = BigRiverQTL.parse_json(filename)
+	# load file   
+	jsondict = BigRiverQTL.parse_json(filename)
 
-    #g enotype
-    if (in("geno_transposed", keys(jsondict)))
-        val = jsondict["geno_transposed"]
-    else
-        val = FALSE
-    end
+	#g enotype
+	if (in("geno_transposed", keys(jsondict)))
+		val = jsondict["geno_transposed"]
+	else
+		val = FALSE
+	end
 
-    return GenoTranspose(val)
+	return GenoTranspose(val)
 end
 
 
@@ -183,75 +183,89 @@ Creates a `Geno` type/struct from gmap CSV file and geno CSV file.
 Returns a `Geno` type/struct.
 """
 function get_geno(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
-
-    # load control file   
-    jsondict = BigRiverQTL.parse_json(filename)
-
-    # get gmap file
-    gmapfile = joinpath(data_dir, check_key(jsondict, "gmap"))
-
-    # get geno file 
-    genofile = joinpath(data_dir, check_key(jsondict, "geno"))
-
-    # get gmap object
-    gmap = get_gmap(filename)
-
-    # load gmap dataframe
-    df_gmap = read_data(
-        gmapfile;
-        delim=check_key(jsondict, "sep"),
-        comment=check_key(jsondict, "comment.char")
-    )
-
-    # load geno dataframe
-    df_geno = read_data(
-        genofile;
-        delim=check_key(jsondict, "sep"),
-        comment=check_key(jsondict, "comment.char")
-    )
-
-    # get samples names
-    samples = names(df_geno)[2:end]
-
-    # get chromosomes names
-    chr = gmap.chr
-
-    # get markers names
-    marker = gmap.marker_name
-
-    #genotype
-    genotype = get_genotype(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
+
+	# load control file   
+	jsondict = parse_json(filename)
+
+	# get gmap file
+	gmapfile = joinpath(data_dir, check_key(jsondict, "gmap"))
+
+	# get geno file 
+	genofile = joinpath(data_dir, check_key(jsondict, "geno"))
+
+	# get gmap object
+	gmap = get_gmap(filename)
+
+	# load gmap dataframe
+	df_gmap = read_data(
+		gmapfile;
+		delim = check_key(jsondict, "sep"),
+		comment = check_key(jsondict, "comment.char"),
+	)
+
+	# load geno dataframe
+	df_geno = read_data(
+		genofile;
+		delim = check_key(jsondict, "sep"),
+		comment = check_key(jsondict, "comment.char"),
+	)
+
+	# check if geno data are transposed
+	if jsondict["geno_transposed"]
+		# get samples names
+		samples = names(df_geno)[2:end]
+		# get values 
+		mat_geno = df_geno[:, 2:end] |> x -> Matrix(x)
+		# get markers names
+		marker = df_geno.marker
+	else
+		# get samples names
+		samples = df_geno[:, 1]
+		# get values
+		mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) |> permutedims
+		# get markers names
+		marker = names(df_geno)[2:end]
+	end
+
+	# get chromosomes names
+	chr = gmap.chr
+
+	#genotype
+	genotype = get_genotype(filename)
+
+	# values
+	# get encoded genetotypes
+	df_encoded = DataFrame(
+		encode_genotype(genotype.label, mat_geno),
+		samples,
+	)
+
+	# built encoded geno DataFrames
+	insertcols!(df_encoded, 1, :marker => marker)
+
+	gdf = innerjoin(
+		select(df_gmap, [:chr, :marker]),
+		df_encoded,
+		on = :marker,
+	) |> x -> groupby(x, :chr)
+
+	val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf]
 
-    # values
-    # get encoded genetotypes
-    df_encoded = DataFrame(
-        encode_genotype(genotype.label, df_geno[:, 2:end] |> x -> Matrix(x)),
-        samples,
-    )
-
-    # built encoded geno DataFrames
-    insertcols!(df_encoded, 1, :marker => df_geno.marker)
-
-    gdf = innerjoin(
-        select(df_gmap, [:chr, :marker]),
-        df_encoded,
-        on=:marker,
-    ) |> x -> groupby(x, :chr)
-
-    val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf]
+    # marker
+	marker = [String.(group.marker) for group in gdf]
 
-    # crosstype
-    crosstype = get_crosstype(filename)
+	# crosstype
+	crosstype = get_crosstype(filename)
 
-    # alleles
-    alleles = get_alleles(filename)
+	# alleles
+	alleles = get_alleles(filename)
 
-    # genotranspose
-    genotranspose = get_genotranspose(filename)
+	# genotranspose
+	genotranspose = get_genotranspose(filename)
 
-    return Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose)
+	return Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose)
 end
 
 
@@ -270,53 +284,53 @@ Returns a `Pmap` type/struct.
 
 """
 function get_pmap(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
-
-    # load control file   
-    jsondict = parse_json(filename)
-
-    # get pmap file
-    pmapfile = joinpath(data_dir, check_key(jsondict, "pmap"))
-
-    # load pmap file
-    gdf = groupby(
-        read_data(
-            pmapfile;
-            delim=check_key(jsondict, "sep"),
-            comment=check_key(jsondict, "comment.char")
-        ),
-        :chr,
-    )
-
-    # chromosome
-    chr = [group.chr[1] for group in gdf]
-
-    # marker
-    marker = [group.marker for group in gdf]
-
-    # relative position
-    pos = [group.pos for group in gdf]
-
-    # unit
-    unit = ""
-    f = open(pmapfile, "r")
-
-    s = readline(f)
-    s_lower = lowercase(s)  # Convert the input string to lowercase
-    if occursin("mbp", s_lower)
-        unit = "Mbp"
-    elseif occursin("mb", s_lower)
-        unit = "Mb"
-    elseif occursin("cm", s_lower)
-        unit = "cM"
-    else
-        @warn "No unit detected!"
-    end
-
-    close(f)
-
-    return Pmap(chr, marker, pos, unit)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
+
+	# load control file   
+	jsondict = parse_json(filename)
+
+	# get pmap file
+	pmapfile = joinpath(data_dir, check_key(jsondict, "pmap"))
+
+	# load pmap file
+	gdf = groupby(
+		read_data(
+			pmapfile;
+			delim = check_key(jsondict, "sep"),
+			comment = check_key(jsondict, "comment.char"),
+		),
+		:chr,
+	)
+
+	# chromosome
+	chr = [group.chr[1] for group in gdf]
+
+	# marker
+	marker = [group.marker for group in gdf]
+
+	# relative position
+	pos = [group.pos for group in gdf]
+
+	# unit
+	unit = ""
+	f = open(pmapfile, "r")
+
+	s = readline(f)
+	s_lower = lowercase(s)  # Convert the input string to lowercase
+	if occursin("mbp", s_lower)
+		unit = "Mbp"
+	elseif occursin("mb", s_lower)
+		unit = "Mb"
+	elseif occursin("cm", s_lower)
+		unit = "cM"
+	else
+		@warn "No unit detected!"
+	end
+
+	close(f)
+
+	return Pmap(chr, marker, pos, unit)
 end
 
 
@@ -333,35 +347,35 @@ Creates a `Pheno` type/struct from Pheno CSV file.
 
 Returns a `Pheno` type/struct.
 """
-function get_pheno(filename::String; samples_col_name::Symbol=:id)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+function get_pheno(filename::String; samples_col_name::Symbol = :id)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load control file   
-    jsondict = parse_json(filename)
+	# load control file   
+	jsondict = parse_json(filename)
 
-    # get pheno file
-    phenofile = joinpath(data_dir, check_key(jsondict, "pheno"))
+	# get pheno file
+	phenofile = joinpath(data_dir, check_key(jsondict, "pheno"))
 
-    # load pheno file   
-    df_pheno = read_data(
-        phenofile;
-        delim=check_key(jsondict, "sep"),
-        comment=check_key(jsondict, "comment.char"),
-        missingstring="NA"
-    )
+	# load pheno file   
+	df_pheno = read_data(
+		phenofile;
+		delim = check_key(jsondict, "sep"),
+		comment = check_key(jsondict, "comment.char"),
+		missingstring = "NA",
+	)
 
-    # samples
-    samples = df_pheno[:, samples_col_name]
+	# samples
+	samples = df_pheno[:, samples_col_name]
 
-    # traits' names
-    traits = names(df_pheno)
-    setdiff!(traits, [string(samples_col_name)])
+	# traits' names
+	traits = names(df_pheno)
+	setdiff!(traits, [string(samples_col_name)])
 
-    # value of the traits
-    val = Matrix(df_pheno[:, traits])
+	# value of the traits
+	val = Matrix(df_pheno[:, traits])
 
-    return Pheno(samples, traits, val)
+	return Pheno(samples, traits, val)
 end
 
 
@@ -379,29 +393,29 @@ Creates a `Phenocovar` type/struct from phenocovar CSV file.
 Returns a `Phenocovar` type/struct.
 """
 function get_phenocovar(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load control file   
-    jsondict = parse_json(filename)
+	# load control file   
+	jsondict = parse_json(filename)
 
-    # get phenocovar file
-    phenocovarfile = joinpath(data_dir, check_key(jsondict, "phenocovar"))
+	# get phenocovar file
+	phenocovarfile = joinpath(data_dir, check_key(jsondict, "phenocovar"))
 
-    # load phenocovar file   
-    df_phenocovar = read_data(
-        phenocovarfile;
-        delim=check_key(jsondict, "sep"),
-        comment=check_key(jsondict, "comment.char")
-    )
+	# load phenocovar file   
+	df_phenocovar = read_data(
+		phenocovarfile;
+		delim = check_key(jsondict, "sep"),
+		comment = check_key(jsondict, "comment.char"),
+	)
 
-    # traits' names
-    traits = string.(df_phenocovar[:, 1])
+	# traits' names
+	traits = string.(df_phenocovar[:, 1])
 
-    # description
-    description = df_phenocovar[:, 2]
+	# description
+	description = df_phenocovar[:, 2]
 
-    return Phenocov(traits, description)
+	return Phenocov(traits, description)
 end
 
 
@@ -419,30 +433,30 @@ Creates a `Crossinfo` type/struct from crossinfo CSV file.
 Returns a `Crossinfo` type/struct.
 """
 function get_crossinfo(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load control file   
-    jsondict = parse_json(filename)
+	# load control file   
+	jsondict = parse_json(filename)
 
-    # get crossdirection file
-    crossinfo_dict = check_key(jsondict, "cross_info")
-    crossinfofile = joinpath(data_dir, crossinfo_dict["file"])
+	# get crossdirection file
+	crossinfo_dict = check_key(jsondict, "cross_info")
+	crossinfofile = joinpath(data_dir, crossinfo_dict["file"])
 
-    # load crossdirection file   
-    df_crossdirection = read_data(
-        crossinfofile;
-        delim=check_key(jsondict, "sep"),
-        comment=check_key(jsondict, "comment.char")
-    )
+	# load crossdirection file   
+	df_crossdirection = read_data(
+		crossinfofile;
+		delim = check_key(jsondict, "sep"),
+		comment = check_key(jsondict, "comment.char"),
+	)
 
-    # samples names
-    samples = df_crossdirection[:, 1]
+	# samples names
+	samples = df_crossdirection[:, 1]
 
-    # description
-    direction = df_crossdirection[:, 2]
+	# description
+	direction = df_crossdirection[:, 2]
 
-    return CrossInfo(samples, direction)
+	return CrossInfo(samples, direction)
 end
 
 
@@ -460,19 +474,19 @@ Creates a `IsXChar` type/struct from gmap CSV file.
 Returns a `IsXChar` type/struct.
 """
 function get_isxchar(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load control file   
-    jsondict = parse_json(filename)
+	# load control file   
+	jsondict = parse_json(filename)
 
-    # load gmap
-    gmap = get_gmap(filename)
+	# load gmap
+	gmap = get_gmap(filename)
 
-    # isXchar
-    isxchar = map(x -> x == "X", gmap.chr)
+	# isXchar
+	isxchar = map(x -> x == "X", gmap.chr)
 
-    return IsXChar(gmap.chr, isxchar)
+	return IsXChar(gmap.chr, isxchar)
 end
 
 
@@ -490,30 +504,30 @@ Creates a `Covar` type/struct from gmap CSV file.
 Returns a `Covar` type/struct.
 """
 function get_covar(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
 
-    # load control file   
-    jsondict = parse_json(filename)
+	# load control file   
+	jsondict = parse_json(filename)
 
-    # get covar file
-    covarfile = check_key(jsondict, "covar")
+	# get covar file
+	covarfile = check_key(jsondict, "covar")
 
-    if ismissing(covarfile)
-        return Covar(covarfile)
-    end
+	if ismissing(covarfile)
+		return Covar(covarfile)
+	end
 
-    covarfile = joinpath(data_dir,)
+	covarfile = joinpath(data_dir)
 
-    # load covar file   
-    df_covar = read_data(
-        covarfile;
-        delim=check_key(jsondict, "sep"),
-        comment=check_key(jsondict, "comment.char"),
-        missingstring=check_key(jsondict, "na.strings")
-    )
+	# load covar file   
+	df_covar = read_data(
+		covarfile;
+		delim = check_key(jsondict, "sep"),
+		comment = check_key(jsondict, "comment.char"),
+		missingstring = check_key(jsondict, "na.strings"),
+	)
 
-    return Covar(df_covar)
+	return Covar(df_covar)
 end
 
 
@@ -534,36 +548,36 @@ If control file does not stipulate sex information, we assume all female
 Returns a `IsFemale` type/struct.
 """
 function get_isfemale(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
-
-    # load control file   
-    jsondict = parse_json(filename)
-
-    # get samples id
-    samples = get_crossinfo(filename).sample_id
-
-    # initiate isfemale vector
-    isfemale = ones(Bool, length(samples))
-
-    # check for sex information
-    if (in("sex", keys(jsondict)))
-        sex_dict = jsondict["sex"]
-    else
-        @warn "No sex information; assuming all female"
-        return IsFemale(samples, isfemale)
-    end
-
-    if (in("covar", keys(sex_dict)))
-        sex_covar = sex_dict["covar"]
-        df_covar = get_covar(filename)
-        idx_male = findall(x -> x == sex_dict["male"], df_covar[:, sex_covar])
-        isfemale[idx_male] = false
-    else
-        throw("Error: covar-sex not found in control file")
-    end
-
-    return IsFemale(samples, isfemale)
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
+
+	# load control file   
+	jsondict = parse_json(filename)
+
+	# get samples id
+	samples = get_crossinfo(filename).sample_id
+
+	# initiate isfemale vector
+	isfemale = ones(Bool, length(samples))
+
+	# check for sex information
+	if (in("sex", keys(jsondict)))
+		sex_dict = jsondict["sex"]
+	else
+		@warn "No sex information; assuming all female"
+		return IsFemale(samples, isfemale)
+	end
+
+	if (in("covar", keys(sex_dict)))
+		sex_covar = sex_dict["covar"]
+		df_covar = get_covar(filename)
+		idx_male = findall(x -> x == sex_dict["male"], df_covar[:, sex_covar])
+		isfemale[idx_male] = false
+	else
+		throw("Error: covar-sex not found in control file")
+	end
+
+	return IsFemale(samples, isfemale)
 end
 
 
@@ -581,33 +595,33 @@ Creates a `GeneticStudyData` type/struct from  control file in json format.
 Returns a `GeneticStudyData` type/struct.
 """
 function get_geneticstudydata(filename::String)
-    # check if filename is directory or file name
-    data_dir, filename = get_control_file(filename)
-
-    # load file   
-    jsondict = parse_json(filename)
-
-    # get data info
-    gmap = get_gmap(filename)
-    geno = get_geno(filename)
-    pmap = get_pmap(filename)
-    pheno = get_pheno(filename)
-    phenocov = get_phenocovar(filename)
-    isXchar = get_isxchar(filename)
-    isfemale = get_isfemale(filename)
-    crossinfo = get_crossinfo(filename)
-    covar = get_covar(filename)
-
-    return GeneticStudyData(
-        gmap,
-        geno,
-        pmap,
-        pheno,
-        phenocov,
-        isXchar,
-        isfemale,
-        crossinfo,
-        covar
-    )
+	# check if filename is directory or file name
+	data_dir, filename = get_control_file(filename)
+
+	# load file   
+	jsondict = parse_json(filename)
+
+	# get data info
+	gmap = get_gmap(filename)
+	geno = get_geno(filename)
+	pmap = get_pmap(filename)
+	pheno = get_pheno(filename)
+	phenocov = get_phenocovar(filename)
+	isXchar = get_isxchar(filename)
+	isfemale = get_isfemale(filename)
+	crossinfo = get_crossinfo(filename)
+	covar = get_covar(filename)
+
+	return GeneticStudyData(
+		gmap,
+		geno,
+		pmap,
+		pheno,
+		phenocov,
+		isXchar,
+		isfemale,
+		crossinfo,
+		covar,
+	)
 end
 
diff --git a/test/test_gf.jl b/test/test_gf.jl
index 3f44b11..e40dd25 100644
--- a/test/test_gf.jl
+++ b/test/test_gf.jl
@@ -15,183 +15,107 @@ data_dir = joinpath(@__DIR__, "data/BXD/");
 file = joinpath(data_dir, "bxd.json");
 jsondict = BigRiverQTL.parse_json(file)
 
-
-	#  check pheno file exists
-	if (in("pheno", keys(jsondict)))
-		phenofile = joinpath(data_dir, jsondict["pheno"])
-	else
-		throw("Error: pheno not found in control file")
-	end	
-
-	# load file 
-	
-	df_pheno = CSV.read(phenofile, DataFrame; comment="#")
-
-	df_pheno = 	BigRiverQTL.read_data(
-		phenofile; 
-		missingstring="NA");
-
-	unit = ""
-	f = open(pmapfile, "r")
-
-	s = readline(f)
-	s_lower = lowercase(s)  # Convert the input string to lowercase
-	if occursin("mbp", s_lower)
-		unit = "Mbp"
-	elseif occursin("mb", s_lower)
-		unit = "Mb"
-	elseif occursin("cm", s_lower)
-		unit = "cM"
-	else 
-		@warn "No unit detected!"
-	end
-
-	close(f)
-
-	unit
-
-
 # function get_geno(filename::String)
-filename = file
-	# check if filename is directory or file name
-	data_dir, filename = get_control_file(filename)
-
-	# load file   
-	jsondict = BigRiverQTL.parse_json(filename)
-
-	#  check gmap file exists
-	if (in("gmap", keys(jsondict)))
-		gmapfile = joinpath(data_dir, jsondict["gmap"])
-	else
-		throw("Error: gmap not found in control file")
-	end
-
-	# check geno file exists
-	if (in("geno", keys(jsondict)))
-		genofile = joinpath(data_dir, jsondict["geno"])
-	else
-		throw("Error: geno not found in control file")
+	filename = file
+    # check if filename is directory or file name
+    data_dir, filename = get_control_file(filename)
+
+    # load control file   
+    jsondict = BigRiverQTL.parse_json(filename)
+
+    # get gmap file
+    gmapfile = joinpath(data_dir, BigRiverQTL.check_key(jsondict, "gmap"))
+
+    # get geno file 
+    genofile = joinpath(data_dir, BigRiverQTL.check_key(jsondict, "geno"))
+
+    # get gmap object
+    gmap = get_gmap(filename)
+
+    # load gmap dataframe
+    df_gmap = BigRiverQTL.read_data(
+        gmapfile;
+        delim= BigRiverQTL.check_key(jsondict, "sep"),
+        comment=BigRiverQTL.check_key(jsondict, "comment.char")
+    )
+
+    # load geno dataframe
+    df_geno = BigRiverQTL.read_data(
+        genofile;
+        delim=BigRiverQTL.check_key(jsondict, "sep"),
+        comment=BigRiverQTL.check_key(jsondict, "comment.char")
+    )
+
+    # check if geno data are transposed
+    if jsondict["geno_transposed"]
+        # get samples names
+        samples = names(df_geno)[2:end]
+        # get values 
+        mat_geno = df_geno[:, 2:end] |> x -> Matrix(x)
+        # get markers names
+        marker = df_geno.marker[:]
+    else
+        # get samples names
+        samples = df_geno[:, 1]
+        # get values
+        mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) |> permutedims
+        # get markers names
+        marker =  names(df_geno)[2:end]
+    end
+
+    # get chromosomes names
+    chr = gmap.chr
+
+    #genotype
+    genotype = get_genotype(filename)
+
+    # values
+    # get encoded genetotypes
+    df_encoded = DataFrame(
+        encode_genotype(genotype.label, mat_geno),
+        samples,
+    )
+
+    # built encoded geno DataFrames
+    insertcols!(df_encoded, 1, :marker => marker)
+
+    gdf = innerjoin(
+        select(df_gmap, [:chr, :marker]),
+        df_encoded,
+        on=:marker,
+    ) |> x -> groupby(x, :chr)
+
+    val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf]
+	# marker
+	marker = [String.(group.marker) for group in gdf]
+    # crosstype
+    crosstype = get_crosstype(filename)
+
+    # alleles
+    alleles = get_alleles(filename)
+
+    # genotranspose
+    genotranspose = get_genotranspose(filename)
+
+    Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose)
+
+
+	struct Geno2
+		sample_id::Vector{AbstractString}
+		chr::Vector{AbstractString}
+		marker_name::Vector{Vector{AbstractString}}
 	end
 
-	# get gmap object
-	gmap = get_gmap(file)
-
-	# load gmap dataframe
-	df_gmap = read_data(gmapfile)
-
-	# load geno dataframe
-	df_geno = read_data(genofile)
-
-	# get samples names
-	samples = names(df_geno)[2:end]
-
-	# get chromosomes names
-	chr = gmap.chr
-
-	# get markers names
-	marker = gmap.marker_name
-
-	#genotype
-	genotype = get_genotype(filename)
-
-	# values
-	# get encoded genetotypes
-	df_encoded = DataFrame(
-		encode_genotype(genotype.label, df_geno[:, 2:end] |> x -> Matrix(x)),
-		samples,
-	)
-
-	# built encoded geno DataFrames
-	insertcols!(df_encoded, 1, :marker => df_geno.marker)
-
-	gdf = innerjoin(
-		select(df_gmap, [:chr, :marker]), 
-		df_encoded, 
-		on = :marker
-	) |> x -> groupby(x, :chr);
-		
-	val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf];
-	
-	#crosstype
-	crosstype = get_crosstype(filename)
-
-	#alleles
-	alleles = get_alleles(filename)
-
-	#genotranspose
-	genotranspose = get_genotranspose(filename)
-
-	# return Geno(samples, chr, marker, val, crosstype, alleles, genotype, genotranspose)
-# end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# Transforming data to a optimised and accessible data type
-# data = get_geneticstudydata(file);
-
-# check if filename is directory or file name
-data_dir, filename = BigRiverQTL.get_control_file(file)
-
-# load file   
-jsondict = BigRiverQTL.parse_json(filename)
-
-
-crossinfofile = joinpath(data_dir, jsondict["cross_info"]["file"])
-
-	# make type
-
-	# samples
-	df_crossdirection = BigRiverQTL.read_data(crossinfofile)
-	get_crossinfo(crossinfofile)
-
-
-	samples = get_crossinfo(crossinfofile).sample_id
-
-	# isfemale
-
-	isfemale = ones(Bool, length(samples))
-	if (in("sex", keys(jsondict)))
-		isfemale[findall(x -> x == "Male", jsondict["sex"])] = 0
+	struct Geno4
+		sample_id::Vector{AbstractString}
+		chr::Vector{AbstractString}
+		# marker_name::Vector{Vector{AbstractString}}
 	end
 
 
 
+	Geno2(samples, chr, marker)
+	Geno4(samples, chr)
 
-
-
-
+	gmap.marker
\ No newline at end of file

From c6959cc01a77b5a240d756959e07cc569ab6fb81 Mon Sep 17 00:00:00 2001
From: GregFa <faragegr@gmail.com>
Date: Mon, 19 Aug 2024 17:10:48 -0500
Subject: [PATCH 2/6] corrected code usng transposed geno

---
 example/example_qtl.ipynb   |  5 ++++-
 example/example_qtl.jl      |  8 ++++++--
 src/io/export_to_type.jl    |  4 ++--
 src/struct/datastructure.jl |  7 +++++--
 test/datastruc_test.jl      |  2 +-
 test/plots_test.jl          | 10 ++++------
 6 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/example/example_qtl.ipynb b/example/example_qtl.ipynb
index 4d6fcf7..26e06a2 100644
--- a/example/example_qtl.ipynb
+++ b/example/example_qtl.ipynb
@@ -127,7 +127,10 @@
     "# We can get the genotype matrix using the following command:\n",
     "geno = reduce(hcat, data.geno.val);\n",
     "# For computing reasons, we need to convert the geno matrix in Float64\n",
-    "geno_processed = convert(Array{Float64}, geno);"
+    "geno_processed = geno .- 1.0 |>\n",
+    "    x -> replace(x, missing => 0.5) |>\n",
+    "\n",
+    "# geno_processed = convert(Matrix{Float64}, geno_processed);\n"
    ]
   },
   {
diff --git a/example/example_qtl.jl b/example/example_qtl.jl
index 20a2c4e..a2f37d3 100644
--- a/example/example_qtl.jl
+++ b/example/example_qtl.jl
@@ -6,7 +6,7 @@
 #       extension: .jl
 #       format_name: light
 #       format_version: '1.5'
-#       jupytext_version: 1.16.2
+#       jupytext_version: 1.16.4
 #   kernelspec:
 #     display_name: Julia 1.10.4
 #     language: julia
@@ -62,7 +62,11 @@ pheno = data.pheno.val;
 # We can get the genotype matrix using the following command:
 geno = reduce(hcat, data.geno.val);
 # For computing reasons, we need to convert the geno matrix in Float64
-geno_processed = convert(Array{Float64}, geno);
+geno_processed = geno .- 1.0 |>
+    x -> replace(x, missing => 0.5) |>
+
+# geno_processed = convert(Matrix{Float64}, geno_processed);
+
 # -
 
 # #### Preprocessing
diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl
index 7a7424e..da137be 100644
--- a/src/io/export_to_type.jl
+++ b/src/io/export_to_type.jl
@@ -217,7 +217,7 @@ function get_geno(filename::String)
 		# get samples names
 		samples = names(df_geno)[2:end]
 		# get values 
-		mat_geno = df_geno[:, 2:end] |> x -> Matrix(x)
+		mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) 
 		# get markers names
 		marker = df_geno.marker
 	else
@@ -251,7 +251,7 @@ function get_geno(filename::String)
 		on = :marker,
 	) |> x -> groupby(x, :chr)
 
-	val = [Matrix(select(group, Not(:marker, :chr))) for group in gdf]
+	val = [permutedims(Matrix(select(group, Not(:marker, :chr)))) for group in gdf]
 
     # marker
 	marker = [String.(group.marker) for group in gdf]
diff --git a/src/struct/datastructure.jl b/src/struct/datastructure.jl
index 49bfc01..799bac1 100644
--- a/src/struct/datastructure.jl
+++ b/src/struct/datastructure.jl
@@ -67,6 +67,11 @@ end
 * `chr` contains chromosome names.
 * `marker_name` contains marker names for each chromosome.
 * `val` is a vector of matrices containing allele information in each chromosome.
+  In each matrix, the rows are the samples and the coulmns are the markers.
+* `crosstype` is a field of type `CrossType`. Refer to `CrossType` type for more imformation.
+* `alleles` is a field of type `Alleles`. Refer to `Alleles` type for more imformation.
+* `genotype` is a field of type `Genotype`. Refer to `Genotype` type for more imformation.
+* `genotranspose` is a field of type `GenoTranspose`. Refer to `GenoTranspose` type for more imformation.
 """
 struct Geno
 	sample_id::Vector{AbstractString}
@@ -77,7 +82,6 @@ struct Geno
 	alleles::Alleles
 	geno_type::GenoType
 	geno_transpose::GenoTranspose
-
 end
 
 
@@ -129,7 +133,6 @@ struct Covar
 end
 
 
-
 """
 IsFemale type indicates if the sample_id (genotypes or individuals) are females.
 * `sample_id` contains sample names such as genotypes or individual IDs.
diff --git a/test/datastruc_test.jl b/test/datastruc_test.jl
index 70f4528..201b74d 100644
--- a/test/datastruc_test.jl
+++ b/test/datastruc_test.jl
@@ -80,7 +80,7 @@ end
 @testset "Geno Tests" begin
 	geno = get_geno(file)
 	@test length(geno.sample_id) == 198
-	@test size(geno.val[1]) == (636, 198)
+	@test size(geno.val[1]) == (198, 636)
 end
 
 
diff --git a/test/plots_test.jl b/test/plots_test.jl
index 7cee95f..371bd5b 100644
--- a/test/plots_test.jl
+++ b/test/plots_test.jl
@@ -11,11 +11,9 @@ gInfo = data.gmap;
 pInfo = data.phenocov;
 # pheno=data.pheno;
 pheno = data.pheno.val;
-geno = reduce(vcat, data.geno.val);
-geno_processed = geno .- 1.0
-replace!(geno_processed, missing => 0.5);
-geno_processed = convert(Matrix{Float64}, geno_processed);
-geno_processed = permutedims(geno_processed);
+geno = reduce(hcat, data.geno.val);
+geno_processed = geno .- 1.0 |>
+    x -> replace(x, missing => 0.5)
 
 #################
 # Preprocessing #
@@ -36,7 +34,7 @@ kinship = kinship_gs(geno_processed, 0.99);
 # Scan #
 ########
 
-single_results_perms = BigRiverQTL.BulkLMM.scan(
+single_results_perms = BigRiverQTL.scan(
 	pheno_y2,
 	geno_processed,
 	kinship;

From 0aed57f32bebe2887e42f7009c3a93a14101ab37 Mon Sep 17 00:00:00 2001
From: Gregory Farage <gfarage@uthsc.edu>
Date: Tue, 20 Aug 2024 12:02:38 -0500
Subject: [PATCH 3/6] changed name plots_utils to plots_helpers

---
 src/plots/plots_helpers.jl | 49 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 src/plots/plots_helpers.jl

diff --git a/src/plots/plots_helpers.jl b/src/plots/plots_helpers.jl
new file mode 100644
index 0000000..0fb357d
--- /dev/null
+++ b/src/plots/plots_helpers.jl
@@ -0,0 +1,49 @@
+function gmap2df(gmap::Gmap)
+    # Locus = reduce(vcat,gmap.marker_name);
+    # Mb= reduce(vcat,gmap.pos);
+    start=0
+    Chr =repeat(["0"],sum(length.(gmap.marker_name)))
+    
+    # Pos =repeat(["Missing"],sum(length.(gmap.marker_name)))
+    
+    for i in eachindex(gmap.chr)
+        l_i=length(gmap.marker_name[i])
+        Chr[start+1:l_i+start] .= gmap.chr[i]
+        start=start+l_i
+    end
+    df=DataFrame(
+        Locus = reduce(vcat,gmap.marker_name),
+        Chr = Chr,
+        Pos= reduce(vcat,gmap.pos)
+    )
+        # Cm = repeat(["Missing"],sum(length.(gmap.marker_name)))
+    return df
+
+end
+
+
+
+
+
+function pmap2df(pmap::Pmap)
+    # Locus = reduce(vcat,gmap.marker_name);
+    # Mb= reduce(vcat,gmap.pos);
+    start=0
+    Chr =repeat(["0"],sum(length.(gmap.marker_name)))
+    
+    # Pos =repeat(["Missing"],sum(length.(gmap.marker_name)))
+    
+    for i in eachindex(pmap.chromosomes)
+        l_i=length(pmap.marker_name[i])
+        Chr[start+1:l_i+start] .= pmap.chr[i]
+        start=start+l_i
+    end
+    df=DataFrame(
+        Locus = reduce(vcat,pmap.marker_name),
+        Chr = Chr,
+        Pos= reduce(vcat,pmap.pos)
+    )
+        # Cm = repeat(["Missing"],sum(length.(gmap.marker_name)))
+    return df
+
+end
\ No newline at end of file

From 3a1ce70f10404b116e99e571decbe14ba6cf0bf1 Mon Sep 17 00:00:00 2001
From: Gregory Farage <gfarage@uthsc.edu>
Date: Tue, 20 Aug 2024 12:11:37 -0500
Subject: [PATCH 4/6] rename io_utils to io_helpers

---
 src/BigRiverQTL.jl                    |  4 +--
 src/io/{io_utils.jl => io_helpers.jl} |  0
 src/plots/plots_utils.jl              | 49 ---------------------------
 3 files changed, 2 insertions(+), 51 deletions(-)
 rename src/io/{io_utils.jl => io_helpers.jl} (100%)
 delete mode 100644 src/plots/plots_utils.jl

diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl
index 9398049..17bba3b 100644
--- a/src/BigRiverQTL.jl
+++ b/src/BigRiverQTL.jl
@@ -62,7 +62,7 @@ module BigRiverQTL
     ######
     # IO #
     ######
-    include("io/io_utils.jl")
+    include("io/io_helpers.jl")
     export get_control_file, encode_genotype
     include("io/export_to_type.jl")
     export get_geneticstudydata
@@ -73,7 +73,7 @@ module BigRiverQTL
     #########
     # Plots #
     #########
-    include("plots/plots_utils.jl")
+    include("plots/plots_helpers.jl")
     export gmap2df, pmap2df
 
     include("plots/plots_qtl.jl")
diff --git a/src/io/io_utils.jl b/src/io/io_helpers.jl
similarity index 100%
rename from src/io/io_utils.jl
rename to src/io/io_helpers.jl
diff --git a/src/plots/plots_utils.jl b/src/plots/plots_utils.jl
deleted file mode 100644
index 0fb357d..0000000
--- a/src/plots/plots_utils.jl
+++ /dev/null
@@ -1,49 +0,0 @@
-function gmap2df(gmap::Gmap)
-    # Locus = reduce(vcat,gmap.marker_name);
-    # Mb= reduce(vcat,gmap.pos);
-    start=0
-    Chr =repeat(["0"],sum(length.(gmap.marker_name)))
-    
-    # Pos =repeat(["Missing"],sum(length.(gmap.marker_name)))
-    
-    for i in eachindex(gmap.chr)
-        l_i=length(gmap.marker_name[i])
-        Chr[start+1:l_i+start] .= gmap.chr[i]
-        start=start+l_i
-    end
-    df=DataFrame(
-        Locus = reduce(vcat,gmap.marker_name),
-        Chr = Chr,
-        Pos= reduce(vcat,gmap.pos)
-    )
-        # Cm = repeat(["Missing"],sum(length.(gmap.marker_name)))
-    return df
-
-end
-
-
-
-
-
-function pmap2df(pmap::Pmap)
-    # Locus = reduce(vcat,gmap.marker_name);
-    # Mb= reduce(vcat,gmap.pos);
-    start=0
-    Chr =repeat(["0"],sum(length.(gmap.marker_name)))
-    
-    # Pos =repeat(["Missing"],sum(length.(gmap.marker_name)))
-    
-    for i in eachindex(pmap.chromosomes)
-        l_i=length(pmap.marker_name[i])
-        Chr[start+1:l_i+start] .= pmap.chr[i]
-        start=start+l_i
-    end
-    df=DataFrame(
-        Locus = reduce(vcat,pmap.marker_name),
-        Chr = Chr,
-        Pos= reduce(vcat,pmap.pos)
-    )
-        # Cm = repeat(["Missing"],sum(length.(gmap.marker_name)))
-    return df
-
-end
\ No newline at end of file

From 6e5ed8370da7061419a0478506609a657c61619f Mon Sep 17 00:00:00 2001
From: Gregory Farage <gfarage@uthsc.edu>
Date: Tue, 20 Aug 2024 23:34:18 -0500
Subject: [PATCH 5/6] added wrangling_utils

---
 src/BigRiverQTL.jl           |   7 ++
 src/io/export_to_type.jl     |  41 ++++++----
 src/utils/wrangling_utils.jl | 147 +++++++++++++++++++++++++++++++++++
 test/utils_test.jl           |  35 +++++++++
 4 files changed, 216 insertions(+), 14 deletions(-)
 create mode 100644 src/utils/wrangling_utils.jl

diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl
index 17bba3b..1f78056 100644
--- a/src/BigRiverQTL.jl
+++ b/src/BigRiverQTL.jl
@@ -64,6 +64,7 @@ module BigRiverQTL
     ######
     include("io/io_helpers.jl")
     export get_control_file, encode_genotype
+
     include("io/export_to_type.jl")
     export get_geneticstudydata
     export get_gmap, get_alleles, get_chromosome, get_crossinfo, get_crosstype 
@@ -85,4 +86,10 @@ module BigRiverQTL
     include("plots/plots_eqtl.jl")
     export plot_eQTL 
 
+    #########
+    # Utils #
+    #########
+    include("utils/wrangling_utils.jl")
+    export get_geno_completecases, summary_missing
+    
 end
diff --git a/src/io/export_to_type.jl b/src/io/export_to_type.jl
index da137be..0210a78 100644
--- a/src/io/export_to_type.jl
+++ b/src/io/export_to_type.jl
@@ -213,22 +213,35 @@ function get_geno(filename::String)
 	)
 
 	# check if geno data are transposed
-	if jsondict["geno_transposed"]
-		# get samples names
-		samples = names(df_geno)[2:end]
-		# get values 
-		mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) 
-		# get markers names
-		marker = df_geno.marker
-	else
-		# get samples names
-		samples = df_geno[:, 1]
-		# get values
-		mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) |> permutedims
-		# get markers names
-		marker = names(df_geno)[2:end]
+	if !jsondict["geno_transposed"]
+		# permutedims
+		df_geno = permutedims(df_geno, 1, "marker")
 	end
 
+	# get samples names
+	samples = names(df_geno)[2:end]
+	# get values 
+	mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) 
+	# get markers names
+	marker = df_geno[:, 1]
+
+	# # check if geno data are transposed
+	# if jsondict["geno_transposed"]
+	# 	# get samples names
+	# 	samples = names(df_geno)[2:end]
+	# 	# get values 
+	# 	mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) 
+	# 	# get markers names
+	# 	marker = df_geno.marker
+	# else
+	# 	# get samples names
+	# 	samples = df_geno[:, 1]
+	# 	# get values
+	# 	mat_geno = df_geno[:, 2:end] |> x -> Matrix(x) |> permutedims
+	# 	# get markers names
+	# 	marker = names(df_geno)[2:end]
+	# end
+
 	# get chromosomes names
 	chr = gmap.chr
 
diff --git a/src/utils/wrangling_utils.jl b/src/utils/wrangling_utils.jl
new file mode 100644
index 0000000..ab4a72d
--- /dev/null
+++ b/src/utils/wrangling_utils.jl
@@ -0,0 +1,147 @@
+
+
+"""
+    get_geno_completecases(geno_missing::Geno) -> Geno
+
+Identify and remove rows or columns from a genotype data structure `Geno` containing missing values, 
+prioritizing those with the highest percentage of missing data.
+
+# Arguments
+- `geno_missing::Geno`: A genotype data structure containing possibly missing values.
+
+# Returns
+- `Geno`: A new genotype data structure with the same fields as the input, but where rows and columns 
+with missing data have been systematically removed until no missing data remains.
+
+# Description
+
+The core of the function involves iteratively identifying and removing rows or columns with the highest percentage of missing data. 
+This decision is based on whether the maximum percentage of missing data across rows exceeds that across columns. 
+Rows or columns are removed until there are no missing values left in the matrix.
+
+After cleaning, the function reconstructs the `Geno` data structure. 
+It ensures the data aligns with the expected format by adjusting the dimensions of the data, regrouping by chromosomes, 
+and ensuring marker and sample identifiers are correctly aligned.
+
+"""
+function get_geno_completecases(geno_missing::Geno)
+	geno = deepcopy(geno_missing)
+
+	# get geno values
+	mat_geno = reduce(hcat, geno.val)
+
+	# build dataframe
+	df_geno = DataFrame(
+		mat_geno,
+		reduce(vcat, geno.marker_name),
+	)
+	insertcols!(df_geno, 1, :samples => geno.sample_id)
+
+	# get ismissing matrix
+	mat_geno_missing = ismissing.(mat_geno) .* 1
+
+	# get complete cases
+	while sum(mat_geno_missing) > 0
+
+		missing_row_wise = sum(mat_geno_missing, dims = 2) .* 100 / (size(mat_geno, 2)) |> vec
+		missing_col_wise = sum(mat_geno_missing, dims = 1) .* 100 / (size(mat_geno, 1)) |> vec
+
+		if (maximum(missing_row_wise) > maximum(missing_col_wise))
+			idx_missing = getindex(findmax(missing_row_wise), 2)
+			deleteat!(df_geno, idx_missing)
+		else
+			idx_missing = getindex(findmax(missing_col_wise), 2)
+			select!(df_geno, Not(idx_missing))
+		end
+
+		mat_geno = Matrix(df_geno)
+		mat_geno_missing = ismissing.(mat_geno) .* 1
+	end
+
+	# permute dataframe to fit Geno type/struct
+	df_geno = permutedims(df_geno, 1, "marker")
+
+	# get samples names
+	samples = names(df_geno)[2:end]
+
+	# get df_gmap: chromosomes-markers
+	chr = deepcopy(geno.marker_name)
+	for i in eachindex(geno.chr)
+		chr[i][:] .= geno.chr[i]
+	end
+
+	df_gmap = DataFrame(
+		chr = reduce(vcat, chr),
+		marker = reduce(vcat, geno.marker_name),
+	)
+
+	# get data per chromosome
+	gdf = innerjoin(
+		df_gmap,
+		df_geno,
+		on = :marker,
+	) |> x -> groupby(x, :chr)
+
+
+	# permute back such rows are samples and columns ar markers
+	val = [permutedims(Matrix(select(group, Not(:marker, :chr)))) for group in gdf]
+
+	# marker
+	marker = [String.(group.marker) for group in gdf]
+
+	return Geno(
+		samples,
+		geno.chr,
+		marker,
+		val,
+		geno.cross_type,
+		geno.alleles,
+		geno.geno_type,
+		geno.geno_transpose,
+	)
+end
+
+
+"""
+    summary_missing(geno_missing::Geno; issorted::Bool = false) -> (missing_per_row::DataFrame, missing_per_col::DataFrame)
+
+Calculate the percentage of missing data for each sample and each marker in a genetic dataset.
+
+# Arguments
+- `geno_missing::Geno`: A genetic data structure of type Geno.
+- `issorted::Bool = false`: An optional boolean flag indicating whether to sort the results by 
+   the percentage of missing data in descending order.
+
+# Returns
+- `(missing_per_row::DataFrame, missing_per_col::DataFrame)`: A tuple of two `DataFrame`s:
+  - `missing_per_row`: Each row represents a sample with columns for sample identifiers and the percentage of missing data.
+  - `missing_per_col`: Each row represents a marker with columns for marker names and the percentage of missing data.
+"""
+function summary_missing(geno_missing::Geno; issorted::Bool = false)
+	# get geno values
+	mat_geno = reduce(hcat, geno_missing.val)
+
+	# get ismissing matrix
+	mat_geno_missing = ismissing.(mat_geno) .* 1
+
+	missing_row_wise = sum(mat_geno_missing, dims = 2) .* 100 / (size(mat_geno, 2)) |> vec
+	missing_col_wise = sum(mat_geno_missing, dims = 1) .* 100 / (size(mat_geno, 1)) |> vec
+
+	# build dataframe
+	df_row_missing = DataFrame(
+		samples = geno_missing.sample_id,
+		percentage = round.(missing_row_wise, digits = 2),
+	)
+
+	df_col_missing = DataFrame(
+		markers = reduce(vcat, geno_missing.marker_name),
+		percentage = round.(missing_col_wise, digits = 2),
+	)
+
+	if issorted
+		sort!(df_row_missing, [:percentage], rev = true)
+		sort!(df_col_missing, [:percentage], rev = true)
+	end
+
+	return (missing_per_row = df_row_missing, missing_per_col = df_col_missing)
+end
\ No newline at end of file
diff --git a/test/utils_test.jl b/test/utils_test.jl
index 0a939a5..856149f 100644
--- a/test/utils_test.jl
+++ b/test/utils_test.jl
@@ -27,3 +27,38 @@ end
     encoded_output = encode_genotype(geno_dict, geno_val)
     @test sum(ismissing.(encoded_output)) == 4
 end
+
+
+################
+# Test missing #
+################
+
+test_geno = Geno(
+    ["sample1", "sample2", "sample3"],
+    ["1", "2"],
+    [["marker1", "marker2", "marker3"], 
+     ["marker4", "marker5", "marker6", "marker7"]],
+    [[1 2 1; missing missing 2; 1 2 2],
+     [1 2 2 1; missing missing 2 1; 2 1 2 missing]],
+     CrossType("risib"),
+     Alleles(["B", "D"]),
+     GenoType(Dict{String, Any}("B" => 1, "D" => 2)),
+     GenoTranspose(true)
+);
+test_geno_cleaned = get_geno_completecases(test_geno);
+
+# Test the `get_geno_completecases` function
+@testset "Testing get_geno_completecases function" begin
+    @test typeof(test_geno_cleaned) == typeof(test_geno)  # Check type consistency
+    @test sum(ismissing.(reduce(hcat, test_geno_cleaned.val))) == 0  # Ensure no missing values
+    @test length(test_geno_cleaned.sample_id) == 2 # Check if some rows might have been removed
+    @test length(test_geno_cleaned.marker_name[2]) == 3 # Check if some columns might have been removed
+end
+
+# Test the `summary_missing` function
+@testset "Testing summary_missing function" begin
+    tbl_missing = summary_missing(test_geno, issorted = true);
+
+    @test tbl_missing[1].percentage[1] == 57.14 # Check missing percentge row wise
+    @test tbl_missing.missing_per_col.percentage[1] == 33.33 # Check missing percentge column wise
+end
\ No newline at end of file

From 1b40225345ab57d4a7571b1095cc3665cd80bb32 Mon Sep 17 00:00:00 2001
From: Gregory Farage <gfarage@uthsc.edu>
Date: Wed, 21 Aug 2024 01:11:09 -0500
Subject: [PATCH 6/6] added selection samples and marker

---
 src/BigRiverQTL.jl           |   2 +-
 src/utils/wrangling_utils.jl | 167 ++++++++++++++++++++++++++++++++++-
 test/runtests.jl             |   2 +-
 test/utils_test.jl           |  39 +++++++-
 4 files changed, 204 insertions(+), 6 deletions(-)

diff --git a/src/BigRiverQTL.jl b/src/BigRiverQTL.jl
index 1f78056..c47bb3e 100644
--- a/src/BigRiverQTL.jl
+++ b/src/BigRiverQTL.jl
@@ -90,6 +90,6 @@ module BigRiverQTL
     # Utils #
     #########
     include("utils/wrangling_utils.jl")
-    export get_geno_completecases, summary_missing
+    export get_geno_completecases, summary_missing, select_sample, select_marker
     
 end
diff --git a/src/utils/wrangling_utils.jl b/src/utils/wrangling_utils.jl
index ab4a72d..d277a1d 100644
--- a/src/utils/wrangling_utils.jl
+++ b/src/utils/wrangling_utils.jl
@@ -1,4 +1,166 @@
 
+"""
+    select_sample(geno_selection::Geno, markers_selection:: Union{Vector{String}, InvertedIndex{Vector{String}}}) -> Geno
+
+Select and return a subset of genetic samples from a `Geno` object based on specified samples names or their indices.
+
+# Arguments
+- `geno_selection`: A `Geno` type or struct that contains genetic data.
+- `samples_selection`: A vector of samples names or an inverted index of samples names 
+    to be selected from the `geno_selection`. The selection is applied to the samples names in the `Geno` object.
+
+# Returns
+- `Geno`: A new `Geno` object that contains only the selected samples, their corresponding genetic information, 
+and the subset of the original genotypes related to these samples.
+
+"""
+function select_sample(geno_selection::Geno, sample_selection:: Union{Vector{String}, InvertedIndex{Vector{String}}})
+
+	geno = deepcopy(geno_selection)
+
+	# get geno values
+	mat_geno = reduce(hcat, geno.val)
+
+	# build dataframe
+	df_geno = DataFrame(
+		mat_geno,
+		reduce(vcat, geno.marker_name),
+	)
+	insertcols!(df_geno, 1, :samples => geno.sample_id)
+
+	# permute dataframe to fit Geno type/struct
+	df_geno = permutedims(df_geno, 1, "marker")
+
+    # selection
+    if (typeof(sample_selection) <: InvertedIndex{Vector{String}})
+	    select!(df_geno, sample_selection);
+    else
+        select!(df_geno, vcat(["marker"], sample_selection));
+    end
+
+	# get samples names
+	samples = names(df_geno)[2:end]
+
+	# get df_gmap: chromosomes-markers
+	chr = deepcopy(geno.marker_name)
+	for i in eachindex(geno.chr)
+		chr[i][:] .= geno.chr[i]
+	end
+
+	df_gmap = DataFrame(
+		chr = reduce(vcat, chr),
+		marker = reduce(vcat, geno.marker_name),
+	)
+
+	# get data per chromosome
+	gdf = innerjoin(
+		df_gmap,
+		df_geno,
+		on = :marker,
+	) |> x -> groupby(x, :chr)
+
+
+	# permute back such rows are samples and columns ar markers
+	val = [permutedims(Matrix(select(group, Not(:marker, :chr)))) for group in gdf]
+
+	# marker
+	marker = [String.(group.marker) for group in gdf]
+
+	return Geno(
+		samples,
+		geno.chr,
+		marker,
+		val,
+		geno.cross_type,
+		geno.alleles,
+		geno.geno_type,
+		geno.geno_transpose,
+	)
+end
+
+
+"""
+    select_marker(geno_selection::Geno, markers_selection:: Union{Vector{String}, InvertedIndex{Vector{String}}}) -> Geno
+
+Select and return a subset of genetic markers from a `Geno` object based on specified marker names or their indices.
+
+# Arguments
+- `geno_selection`: A `Geno` type or struct that contains genetic data.
+- `markers_selection`: A vector of marker names or an inverted index of marker names 
+    to be selected from the `geno_selection`. The selection is applied to the marker names in the `Geno` object.
+
+# Returns
+- `Geno`: A new `Geno` object that contains only the selected markers, their corresponding genetic information, 
+and the subset of the original genotypes related to these markers.
+
+"""
+function select_marker(geno_selection::Geno, marker_selection:: Union{Vector{String}, InvertedIndex{Vector{String}}})
+
+	geno = deepcopy(geno_selection)
+
+	# get geno values
+	mat_geno = reduce(hcat, geno.val)
+
+	# build dataframe
+	df_geno = DataFrame(
+		mat_geno,
+		reduce(vcat, geno.marker_name),
+	)
+	insertcols!(df_geno, 1, :samples => geno.sample_id)
+
+    # selection
+    if (typeof(marker_selection) <: InvertedIndex{Vector{String}})
+	    select!(df_geno, marker_selection);
+    else
+        select!(df_geno, vcat(["samples"], marker_selection));
+    end
+
+	# permute dataframe to fit Geno type/struct
+	df_geno = permutedims(df_geno, 1, "marker")
+
+	# get samples names
+	samples = names(df_geno)[2:end]
+
+	# get df_gmap: chromosomes-markers
+	chr = deepcopy(geno.marker_name)
+	for i in eachindex(geno.chr)
+		chr[i][:] .= geno.chr[i]
+	end
+
+	df_gmap = DataFrame(
+		chr = reduce(vcat, chr),
+		marker = reduce(vcat, geno.marker_name),
+	)
+
+	# get data per chromosome
+	gdf = innerjoin(
+		df_gmap,
+		df_geno,
+		on = :marker,
+	) |> x -> groupby(x, :chr)
+
+
+	# permute back such rows are samples and columns ar markers
+	val = [permutedims(Matrix(select(group, Not(:marker, :chr)))) for group in gdf]
+
+	# marker
+	marker = [String.(group.marker) for group in gdf]
+
+    # chromosome
+	chr = [group.chr[1] for group in gdf]
+
+	return Geno(
+		samples,
+		chr,
+		marker,
+		val,
+		geno.cross_type,
+		geno.alleles,
+		geno.geno_type,
+		geno.geno_transpose,
+	)
+end
+
 
 """
     get_geno_completecases(geno_missing::Geno) -> Geno
@@ -89,9 +251,12 @@ function get_geno_completecases(geno_missing::Geno)
 	# marker
 	marker = [String.(group.marker) for group in gdf]
 
+    # chromosome
+	chr = [group.chr[1] for group in gdf]
+
 	return Geno(
 		samples,
-		geno.chr,
+		chr,
 		marker,
 		val,
 		geno.cross_type,
diff --git a/test/runtests.jl b/test/runtests.jl
index 4f2ad3a..afdfe9a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -5,7 +5,7 @@
 using BigRiverQTL, LinearAlgebra
 using Statistics
 using Test
-using DelimitedFiles, Plots
+using DelimitedFiles, DataFrames, Plots
 
 ########
 # Test #
diff --git a/test/utils_test.jl b/test/utils_test.jl
index 856149f..8d3fdb2 100644
--- a/test/utils_test.jl
+++ b/test/utils_test.jl
@@ -59,6 +59,39 @@ end
 @testset "Testing summary_missing function" begin
     tbl_missing = summary_missing(test_geno, issorted = true);
 
-    @test tbl_missing[1].percentage[1] == 57.14 # Check missing percentge row wise
-    @test tbl_missing.missing_per_col.percentage[1] == 33.33 # Check missing percentge column wise
-end
\ No newline at end of file
+    @test tbl_missing[1].percentage[1] == 57.14 # Check missing percentage row wise
+    @test tbl_missing.missing_per_col.percentage[1] == 33.33 # Check missing percentage column wise
+end
+
+
+##################
+# Test selection #
+##################
+
+# Test the `select_samples` function
+@testset "Testing select_samples function" begin
+    geno_subset_1 = select_marker(test_geno, ["marker2", "marker5"]);
+    geno_subset_2 = select_marker(test_geno, Not(["marker3"]));
+    geno_subset_3 = select_marker(test_geno, ["marker2"]);
+
+    @test geno_subset_1.marker_name[1] == ["marker2"] # Check sample selection
+    @test size(geno_subset_1.val[1], 2) == 1 # Check size of the geno matrix
+    @test geno_subset_2.marker_name[1] == ["marker1", "marker2"] # Check sample selection
+    @test size(geno_subset_2.val[1], 2) == 2 # Check size of the geno matrix
+    @test geno_subset_3.marker_name[1] == ["marker2"] # Check sample selection
+    @test size(geno_subset_3.val[1], 2) == 1 # Check size of the geno matrix
+    @test length(geno_subset_3.chr) == 1 # Check size of the geno matrix
+end
+
+
+# Test the `select_sample` function
+@testset "Testing select_samples function" begin
+    geno_subset_1 = select_sample(test_geno, ["sample1"]);
+    geno_subset_2 = select_sample(test_geno, Not(["sample3"]));
+
+    @test geno_subset_1.sample_id == ["sample1"] # Check sample selection
+    @test size(geno_subset_1.val[1], 1) == 1 # Check size of the geno matrix
+    @test geno_subset_2.sample_id == ["sample1", "sample2"] # Check sample selection
+    @test size(geno_subset_2.val[1], 1) == 2 # Check size of the geno matrix
+
+end