Skip to content

Commit f66dc14

Browse files
committed
Fix end of text behaviour in case of sentence positions
Change-Id: Ic433dd3579d9a79df5734a405e682596c3ccddad
1 parent 78d270d commit f66dc14

File tree

4 files changed

+24
-0
lines changed

4 files changed

+24
-0
lines changed

Changes

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
0.2.2 2023-09-06
2+
- Fix behaviour for end of text character positions
3+
when no end of sentence occurred before.
4+
15
0.2.1 2023-09-05
26
- Add english tokenizer.
37
- Fix buffer bug.

datok.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,10 @@ PARSECHAR:
10181018

10191019
if eot {
10201020
eot = false
1021+
if !sentenceEnd {
1022+
sentenceEnd = true
1023+
w.SentenceEnd(buffc)
1024+
}
10211025
textEnd = true
10221026
w.TextEnd(0)
10231027
if DEBUG {

matrix.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,10 @@ PARSECHARM:
592592

593593
if eot {
594594
eot = false
595+
if !sentenceEnd {
596+
sentenceEnd = true
597+
w.SentenceEnd(buffc)
598+
}
595599
textEnd = true
596600
w.TextEnd(buffc)
597601
rewindBuffer = true

token_writer_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,18 @@ func TestTokenWriterFromOptions(t *testing.T) {
8585
matStr = w.String()
8686
assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr)
8787

88+
w.Reset()
89+
mat.TransduceTokenWriter(strings.NewReader("Tree\n\x04\n"), tws)
90+
91+
matStr = w.String()
92+
assert.Equal("0 4\n0 4\n", matStr)
93+
94+
w.Reset()
95+
mat.TransduceTokenWriter(strings.NewReader("Tree.\n\x04\n"), tws)
96+
97+
matStr = w.String()
98+
assert.Equal("0 4 4 5\n0 5\n", matStr)
99+
88100
//
89101
// Write sentence offsets without token offsets
90102
tws = NewTokenWriter(w, SENTENCE_POS|NEWLINE_AFTER_EOT)

0 commit comments

Comments
 (0)