
Commit b70b1e3

Changes for experimenting with simulated data sets.
1 parent 4492667 commit b70b1e3

File tree

3 files changed: +110 −2 lines changed


scripts/discretize.py

Lines changed: 12 additions & 1 deletion
@@ -101,9 +101,20 @@ def write_dataset(dataset, name, sep, colobs, varnames, indices):
     '''
     Write the dataset as a CSV file.
     '''
+    header = False
+    index = False
     if colobs:
         dataset = dataset.T
-    dataset.to_csv(name, sep=sep, header=varnames, index=indices)
+        if indices:
+            header = True
+        if varnames:
+            index = True
+    else:
+        if varnames:
+            header = True
+        if indices:
+            index = True
+    dataset.to_csv(name, sep=sep, header=header, index=index)
 
 
 def main():
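
The point of the change above is that pandas' to_csv flags trade places once the frame is transposed: with observations in columns, the variable names end up on the row index and the observation ids on the header. A minimal sketch of that behaviour, using a toy frame and file names that are illustrative only, not from the repository:

import pandas as pd

# Toy frame: two observations (rows) of two variables (columns).
dataset = pd.DataFrame({'V1': [0, 1], 'V2': [1, 0]})

varnames, indices = True, False   # keep variable names, drop observation ids

# Observations in rows: the variable names form the CSV header.
dataset.to_csv('rows.csv', sep=' ', header=varnames, index=indices)

# Observations in columns: after transposing, the variable names become the
# row index and the observation ids the header, so the two flags swap roles.
dataset.T.to_csv('cols.csv', sep=' ', header=indices, index=varnames)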

scripts/run_experiments.py

Lines changed: 9 additions & 1 deletion
@@ -47,7 +47,15 @@
     ('complete' , ('data/athaliana/athaliana_complete_discretized.tsv', 18380, 16838, ' ', True, True, True)),
     ])
 
-all_datasets = OrderedDict(list(small_datasets.items()) + list(big_datasets.items()))
+simulated_datasets = OrderedDict([
+    #(name, (-f, -n, -m, -s, -c, -v, -i)),
+    ('s1' , ('data/simulated/n30000_p0.00005_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
+    ('s2' , ('data/simulated/n30000_p0.0001_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
+    ('s3' , ('data/simulated/n30000_p0.0005_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
+    ('s4' , ('data/simulated/n30000_p0.001_m10000_discretized.tsv', 30000, 10000, ' ', True, True, False)),
+    ])
+
+all_datasets = OrderedDict(list(small_datasets.items()) + list(big_datasets.items()) + list(simulated_datasets.items()))
 
 all_algorithms = [
     'gs',
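
For reference, the tuple layout follows the #(name, (-f, -n, -m, -s, -c, -v, -i)) comment: file path, number of variables, number of observations, separator, then the colobs/varnames/indices flags used by the other scripts. A small sketch of how an entry could be unpacked; the loop and variable names below are illustrative, not the repository's runner code:

from collections import OrderedDict

simulated_datasets = OrderedDict([
    ('s1', ('data/simulated/n30000_p0.00005_m10000_discretized.tsv',
            30000, 10000, ' ', True, True, False)),
])

for name, (path, nvars, nobs, sep, colobs, varnames, indices) in simulated_datasets.items():
    # -f, -n, -m, -s, -c, -v, -i, in the order given by the comment above.
    print(name, path, nvars, nobs, repr(sep), colobs, varnames, indices)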

scripts/simulate_bn.R

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
+#!/usr/bin/env Rscript
+
+##
+# @file simulate_bn.R
+# @brief Script for simulating a Bayesian network using pcalg
+# @author Ankit Srivastava <asrivast@gatech.edu>
+#
+# Copyright 2020 Georgia Institute of Technology
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+library('pcalg')
+library('optparse')
+
+
+if (!exists('argv')) {
+  argv = commandArgs(trailing=TRUE)
+}
+
+parser <- OptionParser()
+parser <- add_option(parser, c('--seed'), type='integer', help='PRNG seed.')
+parser <- add_option(parser, c('--nvars', '-n'), type='integer', help='Number of variables in the simulated network.')
+parser <- add_option(parser, c('--prob', '-p'), type='double', default=0.05, help='Threshold p-value used for generating the graph.')
+parser <- add_option(parser, c('--mbsize'), action='store_true', default=FALSE, help='Print out the average MB size.')
+parser <- add_option(parser, c('--bn', '-b'), type='character', help='Name of the dot file to which the network is written.')
+parser <- add_option(parser, c('--nobs', '-m'), type='integer', help='Number of observations in the simulated dataset.')
+parser <- add_option(parser, c('--datafile', '-d'), type='character', help='Name of the file to which dataset is written.')
+parser <- add_option(parser, c('--colobs', '-c'), action='store_true', default=FALSE, help='The file contains observations in columns.')
+parser <- add_option(parser, c('--sep', '-s'), type='character', default=' ', help='Delimiting character in the dataset file.')
+args <- parse_args(parser, args=argv)
+
+
+if (!is.null(args$seed)) {
+  set.seed(args$seed)
+}
+
+tGenerate <- proc.time()
+nodes <- c()
+for (v in seq(1, args$nvars)) {
+  nodes <- c(nodes, paste('V', v, sep=''))
+}
+# Graph of type graphNEL
+dag <- randomDAG(args$nvars, prob=args$prob, V=nodes)
+show(dag)
+tGenerate <- proc.time() - tGenerate
+cat('Time taken in generating the network:', tGenerate['elapsed'], 'sec\n')
+
+if (args$mbsize) {
+  cat('Using bnlearn from', find.package('bnlearn'), '\n')
+  library('bnlearn')
+  tMBSize <- proc.time()
+  # Convert to BN
+  bn <- as.bn(dag)
+  avgmb <- mean(sapply(nodes, function(n) { length(bn$nodes[[n]]$mb) }))
+  cat('Average MB size is', avgmb, '\n')
+  tMBSize <- proc.time() - tMBSize
+  cat('Time taken in getting the MB sizes:', tMBSize['elapsed'], 'sec\n')
+}
+
+if (!is.null(args$bn)) {
+  tWrite <- proc.time()
+  write.dot(args$bn, bn)
+  tWrite <- proc.time() - tWrite
+  cat('Time taken in writing the network:', tWrite['elapsed'], 'sec\n')
+}
+
+if (!is.null(args$nobs)) {
+  tData <- proc.time()
+  data <- rmvDAG(args$nobs, dag, use.node.names=TRUE)
+  if (!args$colobs) {
+    data <- t(data)
+  }
+  tData <- proc.time() - tData
+  cat('Time taken in getting the data:', tData['elapsed'], 'sec\n')
+  tWrite <- proc.time()
+  write.table(data, file=args$datafile, sep=args$sep, row.names=!args$colobs, col.names=args$colobs)
+  tWrite <- proc.time() - tWrite
+  cat('Time taken in writing the dataset:', tWrite['elapsed'], 'sec\n')
+}
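
The four --prob values used for the simulated datasets in run_experiments.py span quite different network densities. Assuming pcalg's randomDAG includes each of the n(n-1)/2 candidate edges independently with probability prob (an assumption about the generator; the --mbsize option reports the resulting average Markov-blanket size empirically), a back-of-the-envelope estimate for n = 30000 is:

# Rough expected density, assuming each candidate edge is drawn independently
# with probability p (an assumption about pcalg::randomDAG, not measured output).
n = 30000
for p in (0.00005, 0.0001, 0.0005, 0.001):
    expected_edges = p * n * (n - 1) / 2
    expected_neighbours = p * (n - 1)   # parents + children; a lower bound on the MB size
    print(f'p={p}: ~{expected_edges:,.0f} edges, ~{expected_neighbours:.1f} neighbours per node')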
