Commit b5e6d66

Merge pull request #68 from bensadeghi/testset
Testset
2 parents adf7457 + 92f9aef commit b5e6d66

18 files changed: +155, -25 lines


README.md

Lines changed: 41 additions & 4 deletions
@@ -7,7 +7,7 @@
 [![DecisionTree](http://pkg.julialang.org/badges/DecisionTree_0.6.svg)](http://pkg.julialang.org/?pkg=DecisionTree&ver=0.6)
 [![DecisionTree](http://pkg.julialang.org/badges/DecisionTree_0.7.svg)](http://pkg.julialang.org/?pkg=DecisionTree&ver=0.7)

-Julia implementation of Decision Trees & Random Forests
+Julia implementation of Decision Tree and Random Forest algorithms

 ## Classification
 * pre-pruning (max depth, min leaf size)
@@ -72,7 +72,7 @@ Also have a look at these [classification](https://github.com/cstjean/ScikitLear

 ## Native API
 ### Classification Example
-Pruned Tree Classifier
+Decision Tree Classifier
 ```julia
 # train full-tree classifier
 model = build_tree(labels, features)
@@ -87,11 +87,21 @@ apply_tree_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica"]
 # run n-fold cross validation for pruned tree,
 # using 90% purity threshold pruning, and 3 CV folds
 accuracy = nfoldCV_tree(labels, features, 0.9, 3)
+
+# set of classification build_tree() parameters and respective default values
+# max_depth: maximum depth of the decision tree (default: -1, no maximum)
+# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 1)
+# min_samples_split: the minimum number of samples in needed for a split (default: 2)
+# min_purity_increase: minimum purity needed for a split (default: 0.0)
+# nsubfeatures: number of features to select at random (default: 0, keep all)
+nsubfeatures=0; maxdepth=-1; min_samples_leaf=1; min_samples_split=2; min_purity_increase=0.0;
+model = build_tree(labels, features, nsubfeatures, maxdepth, min_samples_leaf, min_samples_split, min_purity_increase)
+
 ```
 Random Forest Classifier
 ```julia
 # train random forest classifier
-# using 2 random features, 10 trees, 0.5 portion of samples per tree (optional), and a maximum tree depth of 6 (optional)
+# using 2 random features, 10 trees, 0.5 portion of samples per tree, and a maximum tree depth of 6
 model = build_forest(labels, features, 2, 10, 0.5, 6)
 # apply learned model
 apply_forest(model, [5.9,3.0,5.1,1.9])
@@ -100,6 +110,14 @@ apply_forest_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica
 # run n-fold cross validation for forests
 # using 2 random features, 10 trees, 3 folds, and 0.5 portion of samples per tree (optional)
 accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5)
+
+# set of classification build_forest() parameters and respective default values
+# nsubfeatures: number of features to consider at random per split (default: 0, keep all)
+# ntrees: number of trees to train (default: 10)
+# partialsampling: fraction of samples to train each tree on (default: 0.7)
+# max_depth: maximum depth of the decision trees (default: no maximum)
+nsubfeatures=0; ntrees=10; partialsampling=0.7; maxdepth=-1;
+model = build_forest(labels, features, nsubfeatures, ntrees, partialsampling, maxdepth)
 ```
 Adaptive-Boosted Decision Stumps Classifier
 ```julia
@@ -129,11 +147,21 @@ apply_tree(model, [-0.9,3.0,5.1,1.9,0.0])
 # run n-fold cross validation, using 3 folds and averaging of 5 samples per leaf (optional)
 # returns array of coefficients of determination (R^2)
 r2 = nfoldCV_tree(labels, features, 3, 5)
+
+# set of regression build_tree() parameters and respective default values
+# max_depth: maximum depth of the decision tree (default: -1, no maximum)
+# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
+# min_samples_split: the minimum number of samples in needed for a split (default: 2)
+# min_purity_increase: minimum purity needed for a split (default: 0.0)
+# nsubfeatures: number of features to select at random (default: 0, keep all)
+min_samples_leaf = 5; nsubfeatures = 0; max_depth = -1; min_samples_split = 2; min_purity_increase = 0.0;
+model = build_tree(labels, features, min_samples_leaf, nsubfeatures, max_depth, min_samples_split, min_purity_increase)
+
 ```
 Regression Random Forest
 ```julia
 # train regression forest, using 2 random features, 10 trees,
-# averaging of 5 samples per leaf (optional), and 0.7 portion of samples per tree (optional)
+# averaging of 5 samples per leaf, and 0.7 portion of samples per tree
 model = build_forest(labels, features, 2, 10, 5, 0.7)
 # apply learned model
 apply_forest(model, [-0.9,3.0,5.1,1.9,0.0])
@@ -142,4 +170,13 @@ apply_forest(model, [-0.9,3.0,5.1,1.9,0.0])
 # and 0.7 porition of samples per tree (optional)
 # returns array of coefficients of determination (R^2)
 r2 = nfoldCV_forest(labels, features, 2, 10, 3, 5, 0.7)
+
+# set of regression build_forest() parameters and respective default values
+# nsubfeatures: number of features to consider at random per split (default: 0, keep all)
+# ntrees: number of trees to train (default: 10)
+# partialsampling: fraction of samples to train each tree on (default: 0.7)
+# max_depth: maximum depth of the decision trees (default: no maximum)
+# min_samples_leaf: the minimum number of samples each leaf needs to have (default: 5)
+nsubfeatures=0; ntrees=10; min_samples_leaf=5; partialsampling=0.7; max_depth=-1;
+model = build_forest(labels, features, nsubfeatures, ntrees, min_samples_leaf, partialsampling, max_depth)
 ```
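
The README additions above spell out the positional parameters and defaults of the classification `build_tree()` and `build_forest()`. The following is a minimal, self-contained sketch exercising those calls end to end; the toy data, seed, and threshold are illustrative and not part of the README:

```julia
using DecisionTree

# illustrative toy data (not from the README): 200 samples, 4 numeric features
srand(1)
features = rand(200, 4)
labels = [x > 2.5 ? "pos" : "neg" for x in features * [1, 2, -1, 3]]

# classification tree, with the documented defaults written out explicitly
# (nsubfeatures, maxdepth, min_samples_leaf, min_samples_split, min_purity_increase)
model = build_tree(labels, features, 0, -1, 1, 2, 0.0)
apply_tree(model, features[1, :])

# classification forest, with the documented defaults written out explicitly
# (nsubfeatures, ntrees, partialsampling, maxdepth)
forest = build_forest(labels, features, 0, 10, 0.7, -1)
apply_forest(forest, features[1, :])
```

Writing the defaults out matches the README snippets; omitting the trailing arguments now builds the same models, since this commit makes them optional.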

src/classification/main.jl

Lines changed: 4 additions & 1 deletion
@@ -54,6 +54,9 @@ end
 
 function build_stump(labels::Vector, features::Matrix, weights=[0];
                      rng=Base.GLOBAL_RNG)
+    if weights == [0]
+        return build_tree(labels, features, 0, 1)
+    end
     S = _split_neg_z1_loss(labels, features, weights)
     if S == NO_BEST
         return Leaf(majority_vote(labels), labels)
@@ -188,7 +191,7 @@ end
 apply_tree_proba(tree::LeafOrNode, features::Matrix, labels) =
     stack_function_results(row->apply_tree_proba(tree, row, labels), features)
 
-function build_forest(labels::Vector, features::Matrix, nsubfeatures::Integer, ntrees::Integer, partialsampling=0.7, maxdepth=-1; rng=Base.GLOBAL_RNG)
+function build_forest(labels::Vector, features::Matrix, nsubfeatures=0, ntrees=10, partialsampling=0.7, maxdepth=-1; rng=Base.GLOBAL_RNG)
     rng = mk_rng(rng)::AbstractRNG
     partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
     Nlabels = length(labels)
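
The `build_stump` change above makes the unweighted case delegate to `build_tree(labels, features, 0, 1)`, i.e. a single-split tree over all features with a maximum depth of 1. A minimal sketch of how that behaves; the toy data is illustrative and mirrors the checks added in the test files below:

```julia
using DecisionTree

# illustrative toy data: the first feature cleanly separates the two classes
features = [1.0 0.0; 2.0 1.0; 8.0 1.0; 9.0 0.0]
labels   = ["a", "a", "b", "b"]

stump = build_stump(labels, features)   # unweighted call, now routed through build_tree(labels, features, 0, 1)
depth(stump) == 1                       # a stump is a single split, as the new tests assert
apply_tree(stump, [8.5, 0.5])           # stumps are applied like any other tree
```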

src/regression/main.jl

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ function build_tree{T<:Float64}(
     return _convert(t)
 end
 
-function build_forest{T<:Float64}(labels::Vector{T}, features::Matrix, nsubfeatures::Integer, ntrees::Integer, min_samples_leaf=5, partialsampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
+function build_forest{T<:Float64}(labels::Vector{T}, features::Matrix, nsubfeatures=0, ntrees=10, min_samples_leaf=5, partialsampling=0.7, max_depth=-1; rng=Base.GLOBAL_RNG)
     rng = mk_rng(rng)::AbstractRNG
     partialsampling = partialsampling > 1.0 ? 1.0 : partialsampling
     Nlabels = length(labels)
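
With `nsubfeatures` and `ntrees` now defaulting to 0 and 10, a regression forest can be built without supplying any of the trailing arguments. A minimal sketch under that assumption; the synthetic data and seed are illustrative, and the labels are `Float64` as the method signature requires:

```julia
using DecisionTree

# illustrative regression data: labels must be Float64 per the signature above
srand(5)
features = rand(500, 3)
labels = features * [2.0, -1.0, 0.5] .+ 0.05 .* randn(500)

# equivalent to build_forest(labels, features, 0, 10, 5, 0.7, -1) with the new defaults
model = build_forest(labels, features)
preds = apply_forest(model, features)
```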

src/scikitlearnAPI.jl

Lines changed: 6 additions & 7 deletions
@@ -157,9 +157,9 @@ Random forest classification. See [DecisionTree.jl's documentation](https://gith
 
 Hyperparameters:
 
-- `nsubfeatures`: number of features to select in each tree at random (default: keep all)
-- `ntrees`: number of trees to train
-- `partialsampling`: fraction of samples to train each tree on
+- `nsubfeatures`: number of features to consider at random per split (default: keep all)
+- `ntrees`: number of trees to train (default: 10)
+- `partialsampling`: fraction of samples to train each tree on (default: 0.7)
 - `max_depth`: maximum depth of the decision trees (default: no maximum)
 - `rng`: the random number generator to use. Can be an `Int`, which will be used
   to seed and create a new random number generator.
@@ -220,10 +220,9 @@ Random forest regression. See [DecisionTree.jl's documentation](https://github.c
 
 Hyperparameters:
 
-- `nsubfeatures`: number of features to select in each tree at random (default:
-  keep all)
-- `ntrees`: number of trees to train
-- `partialsampling`: fraction of samples to train each tree on
+- `nsubfeatures`: number of features to consider at random per split (default: keep all)
+- `ntrees`: number of trees to train (default: 10)
+- `partialsampling`: fraction of samples to train each tree on (default: 0.7)
 - `max_depth`: maximum depth of the decision trees (default: no maximum)
 - `min_samples_leaf`: the minimum number of samples each leaf needs to have (default: 5)
 - `rng`: the random number generator to use. Can be an `Int`, which will be used
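
The docstring updates only touch the hyperparameter descriptions, but they also describe how the ScikitLearn-style wrappers are meant to be configured. Below is a minimal sketch of that usage, following the `fit!`/`predict_proba` pattern exercised in `test/classification/scikitlearn.jl`; the keyword names are assumed to match the hyperparameters listed above, and the concrete values are illustrative:

```julia
using DecisionTree

# illustrative data, shaped like the test file's (100 samples, Bool labels)
srand(3)
X = rand(100, 4)
y = rand(Bool, 100)

# keyword names assumed from the documented hyperparameters; values are arbitrary
clf = RandomForestClassifier(ntrees=10, partialsampling=0.7, max_depth=6, rng=1)
fit!(clf, X, y)
probs = predict_proba(clf, X)   # per-class probabilities, one row per sample
```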

test/classification/adult.jl

Lines changed: 4 additions & 0 deletions
@@ -1,6 +1,8 @@
 # Classification Test - Adult Data Set
 # https://archive.ics.uci.edu/ml/datasets/adult
 
+@testset "adult.jl" begin
+
 download("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", "adult.csv");
 adult = readcsv("adult.csv");
 
@@ -19,3 +21,5 @@ accuracy = nfoldCV_tree(labels, features, 0.9, 3);
 println("\n##### 3 foldCV Classification Forest #####")
 accuracy = nfoldCV_forest(labels, features, 2, 10, 3, 0.5);
 @test mean(accuracy) > 0.8
+
+end # @testset

test/classification/digits.jl

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,5 @@
+@testset "digits.jl" begin
+
 function loaddata()
     f = open("data/digits.csv")
     data = readlines(f)[2:end]
@@ -29,3 +31,5 @@ t = DecisionTree.build_tree(Y, X, 0, 6, 3, 5)
 
 t = DecisionTree.build_tree(Y, X, 0, 6, 3, 5, 0.05)
 @test num_leaves(t) == 54
+
+end # @testset

test/classification/heterogeneous.jl

Lines changed: 7 additions & 3 deletions
@@ -1,5 +1,7 @@
 ### Classification - Heterogeneously typed features (ints, floats, bools, strings)
 
+@testset "heterogeneous.jl" begin
+
 m, n = 10^2, 5
 
 tf = [trues(Int(m/2)) falses(Int(m/2))]
@@ -15,14 +17,16 @@ features[:,4] = tf[inds]
 model = build_tree(labels, features)
 preds = apply_tree(model, features)
 cm = confusion_matrix(labels, preds)
-@test cm.accuracy > 0.99
+@test cm.accuracy > 0.95
 
 model = build_forest(labels, features, 2, 3)
 preds = apply_forest(model, features)
 cm = confusion_matrix(labels, preds)
-@test cm.accuracy > 0.99
+@test cm.accuracy > 0.95
 
 model, coeffs = build_adaboost_stumps(labels, features, 7)
 preds = apply_adaboost_stumps(model, coeffs, features)
 cm = confusion_matrix(labels, preds)
-@test cm.accuracy > 0.99
+@test cm.accuracy > 0.95
+
+end # @testset

test/classification/iris.jl

Lines changed: 11 additions & 0 deletions
@@ -1,12 +1,21 @@
 # Classification Test - Iris Data Set
 # https://archive.ics.uci.edu/ml/datasets/iris
 
+@testset "iris.jl" begin
+
 download("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", "iris.csv")
 iris = readcsv("iris.csv");
 
 features = iris[:, 1:4];
 labels = iris[:, 5];
 
+# train a decision stump (depth=1)
+model = build_stump(labels, features)
+preds = apply_tree(model, features);
+cm = confusion_matrix(labels, preds);
+@test cm.accuracy > 0.6
+@test depth(model) == 1
+
 # train full-tree classifier (over-fit)
 model = build_tree(labels, features);
 preds = apply_tree(model, features);
@@ -44,3 +53,5 @@ preds = apply_adaboost_stumps(model, coeffs, features);
 println("\n##### nfoldCV Classification Adaboosted Stumps #####")
 accuracy = nfoldCV_stumps(labels, features, 7, 3);
 @test mean(accuracy) > 0.7
+
+end # @testset

test/classification/random.jl

Lines changed: 17 additions & 0 deletions
@@ -1,15 +1,30 @@
+@testset "random.jl" begin
+
 srand(16)
 
 n,m = 10^3, 5;
 features = rand(n,m);
 weights = rand(-1:1,m);
 labels = _int(features * weights);
 
+model = build_stump(labels, features)
+@test depth(model) == 1
+
 maxdepth = 3
 model = build_tree(labels, features, 0, maxdepth)
 @test depth(model) == maxdepth
 print_tree(model, 3)
 
+model = build_tree(labels, features)
+preds = apply_tree(model, features)
+cm = confusion_matrix(labels, preds)
+@test cm.accuracy > 0.95
+
+model = build_forest(labels, features)
+preds = apply_forest(model, features)
+cm = confusion_matrix(labels, preds)
+@test cm.accuracy > 0.95
+
 println("\n##### nfoldCV Classification Tree #####")
 accuracy = nfoldCV_tree(labels, features, 0.9, 3)
 @test mean(accuracy) > 0.7
@@ -21,3 +36,5 @@ accuracy = nfoldCV_forest(labels, features, 2, 10, 3)
 println("\n##### nfoldCV Adaboosted Stumps #####")
 accuracy = nfoldCV_stumps(labels, features, 7, 3)
 @test mean(accuracy) > 0.5
+
+end # @testset

test/classification/scikitlearn.jl

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,5 @@
+@testset "scikitlearn.jl" begin
+
 srand(2)
 n,m = 10^3, 5 ;
 features = rand(n,m);
@@ -35,3 +37,5 @@ y = rand(Bool, 100);
 predict_proba(fit!(RandomForestClassifier(; rng=10), X, y), X)
 @test predict_proba(fit!(RandomForestClassifier(; rng=10), X, y), X) !=
     predict_proba(fit!(RandomForestClassifier(; rng=12), X, y), X)
+
+end # @testset
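
Wrapping each test file in a named `@testset` groups its assertions so failures are reported per file and the remaining files still run. The driver that includes these files is not among the diffs shown here; the following is a minimal sketch of what such a `runtests.jl` could look like, with the file list and directory layout assumed from the paths above:

```julia
using Base.Test      # Julia 0.6-era test framework, matching the srand/readcsv usage above
using DecisionTree

# assumed layout; the actual runtests.jl is not part of this commit view
classification_tests = ["adult.jl", "digits.jl", "heterogeneous.jl",
                        "iris.jl", "random.jl", "scikitlearn.jl"]

@testset "DecisionTree.jl classification" begin
    for fname in classification_tests
        include(joinpath("classification", fname))   # each file opens its own @testset
    end
end
```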
