
Commit b98e4cf

Improve Emoticons
Change-Id: I0d72781b41381aa2c86e41287b8f824af4af95d1
1 parent f94b9ce commit b98e4cf

File tree

8 files changed, +75 -32 lines

Changes

Lines changed: 3 additions & 0 deletions

@@ -1,3 +1,6 @@
+0.1.5 2022-03-28
+    - Improve emoticon list.
+
 0.1.4 2022-03-27
     - Improved handling of ellipsis.
     - Make algorithm more robust to never fail.
matrix_test.go

Lines changed: 27 additions & 1 deletion

@@ -376,7 +376,7 @@ Innstetten!`
 	assert.Equal("Nun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
 	assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
 
-	// Check paranthesis at the end of sentences.
+	// Check parentheses at the end of the sentence
 	w.Reset()
 	assert.True(mat.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
 	sentences = strings.Split(w.String(), "\n\n")

@@ -1083,6 +1083,32 @@ func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
 	*/
 }
 
+func TestMatrixEmoticons(t *testing.T) {
+	assert := assert.New(t)
+
+	if mat == nil {
+		mat = LoadMatrixFile("testdata/tokenizer.matok")
+	}
+
+	assert.NotNil(mat)
+
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+	var tokens []string
+
+	tokens = ttokenize(mat, w, ":-* ;) :)) :*( ^___^ T__T ^^; -_-;;; -_-^")
+	assert.Equal(tokens[0], ":-*")
+	assert.Equal(tokens[1], ";)")
+	assert.Equal(tokens[2], ":))")
+	assert.Equal(tokens[3], ":*(")
+	assert.Equal(tokens[4], "^___^")
+	assert.Equal(tokens[5], "T__T")
+	assert.Equal(tokens[6], "^^;")
+	assert.Equal(tokens[7], "-_-;;;")
+	assert.Equal(tokens[8], "-_-^")
+	assert.Equal(len(tokens), 9)
+}
+
 func TestMatrixFullTokenizerXML(t *testing.T) {
 	assert := assert.New(t)
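Note on the test harness: ttokenize is a helper defined elsewhere in the test suite, not in this diff. Judging from the Transduce calls visible in the surrounding tests, it presumably reduces to something like the following sketch (illustrative only; tokenizeSketch is a hypothetical name):

    // Illustrative sketch of what ttokenize presumably wraps; its real
    // definition is not part of this diff. Run the transducer over the
    // input and split the newline-separated output into tokens. Blank
    // lines, which mark sentence boundaries, are dropped.
    func tokenizeSketch(input string) []string {
    	mat := LoadMatrixFile("testdata/tokenizer.matok")
    	w := bytes.NewBuffer(make([]byte, 0, 2048))
    	if !mat.Transduce(strings.NewReader(input), w) {
    		return nil
    	}
    	var tokens []string
    	for _, t := range strings.Split(w.String(), "\n") {
    		if t != "" {
    			tokens = append(tokens, t)
    		}
    	}
    	return tokens
    }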

src/emoji.xfst

Lines changed: 0 additions & 28 deletions
This file was deleted.

src/emoticons.xfst

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+! Partially based on Park, Barash, Fink & Cha (2013)
+
+define verticalemoticon [
+  [ "ಠ" "_" "ಠ" ] |
+  [ "T" ["_"|"."|"-"]+ "T" ] |
+  [ "♥" ["_"|"."|"-"]+ "♥" ] |
+  [ "@" ["_"|"."|"-"]* "@" ] |
+  [ "*" ["_"|"."|"-"]+ "*" ] |
+  [ "x" ["_"|"."|"-"]+ "x" ] |
+  [ "X" ["_"|"."|"-"]+ "X" ] |
+  [ "-" ["_"|"."]+ "-" ] |
+  [ "." ["_"]+ "." ] |
+  [ "^" ["_"|"."|"-"]* "^" ] |
+  [ ">" ["_"|"."|"-"]* "<" ] |
+  [ ["o"|"O"] ["_"|"."|"-"]+ ["o"|"O"] ]
+];
+
+read regex [
+  ["<" ("/") "3"+] |
+  verticalemoticon (";"+|"^") |
+  ["(" verticalemoticon ")"] |
+
+  ! May also be the end of a parenthesis, as in
+  ! Author (2018):
+  [ [")"|"("] ["'"|"-"|"o"]* [":"|"="|"x"] ] |
+  ! May also be the end of an XML tag, as in
+  ! <b class="emp">=</b>
+  [ ["<"*|">"*] ["B"|"X"|"8"|":"|";"|"="|"x"] ["'"|"-"|"o"]* ["/"|"<"|"C"|"["|")"|"("|"D"|"P"|"d"|"p"|"3"|">"|"o"|"O"|"*"] ] |
+  [ ["D"|">"] ("'") ":" ] |
+
+  ! May also be the end of a square bracket, as in
+  ! Author [2018]:
+  [ "]" ":" ] |
+  [ (">") [";"|":"] ["-"|"*"]* [ ")" | "(" | %] | %[ ]+ ] |
+  [ (">") [";"|":"] ["-"]* ["*"|"P"|"p"|"o"|"O"|"D"] ] |
+  [ "x" "(" ] |
+  [ "^" (".") "^" ] |
+  [ %\ ["{" "o" "}"|"o"|"m"] "/" ] |
+  [ ":" ["a"|"b"|"c"|"d"|"e"|"f"|"g"|"h"|"i"|"j"|"k"|"l"|"m"|"n"|"o"|"p"|"q"|"r"|"s"|"t"|"u"|"v"|"w"|"x"|"y"|"z"|"_"|"-"]+ ":" ] |
+  [ ">" "_" "<" ] |
+  [ "*" "<" ":" "-" ")" ]
+];
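As a reading aid, the verticalemoticon definition above can be approximated with a Go regular expression (a sketch for illustration only; in the actual pipeline these XFST rules are compiled into a finite-state transducer and are never executed as a regexp):

    package main

    import (
    	"fmt"
    	"regexp"
    )

    // Approximation of the verticalemoticon pattern: a pair of "eyes"
    // joined by "_", "." or "-" characters (for some eyes the infix
    // may be empty).
    var verticalEmoticon = regexp.MustCompile(
    	`\A(?:ಠ_ಠ` +
    		`|T[_.-]+T` +
    		`|♥[_.-]+♥` +
    		`|@[_.-]*@` +
    		`|\*[_.-]+\*` +
    		`|x[_.-]+x|X[_.-]+X` +
    		`|-[_.]+-` +
    		`|\._+\.` +
    		`|\^[_.-]*\^` +
    		`|>[_.-]*<` +
    		`|[oO][_.-]+[oO])\z`)

    func main() {
    	for _, s := range []string{"ಠ_ಠ", "T__T", "^^", "-_-", "o.O", ":-)"} {
    		fmt.Printf("%-5s %v\n", s, verticalEmoticon.MatchString(s))
    	}
    }

The last three cases show the asymmetries the rules encode: ^^ and >< are legal with an empty infix, o.O may mix eye case, and :-) is left to the horizontal rules in the read regex block.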

src/tokenizer.xfst

Lines changed: 3 additions & 3 deletions

@@ -119,8 +119,8 @@ define Years ["(" Digit+ (".") ")"] | ["[" Digit+ (".") "]"];
 ! 20:00 Uhr, 00:12:25,34 Minuten
 define Times [ ( [%0|1|2|3|4|5] ) Digit [ ":" [%0|1|2|3|4|5] Digit ]^{1,2} ( "," [ Digit ]^{1,3} ) ];
 
-source emoji.xfst
-define Emoji;
+source emoticons.xfst
+define Emoticons;
 
 ! acronyms: U.S.A., I.B.M., etc.
 ! use a post-filter to remove dots

@@ -215,7 +215,7 @@ define Token [
 	Email @-> ... NLout,
 	File @-> ... NLout,
 	Domain @-> ... NLout,
-	Emoji @-> ... NLout
+	Emoticons @-> ... NLout
 ];
 
 echo - Introduce Sentence splitter
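The replacement rules in the Token definition use XFST's directed, left-to-right, longest-match operator: Emoticons @-> ... NLout inserts a newline (NLout) after the longest emoticon match, which is why :)) in the new test surfaces as a single token rather than :) followed by ). A minimal Go sketch of the longest-match idea (illustrative only; the compiled transducer implements this with states, not a pattern loop):

    package main

    import (
    	"fmt"
    	"strings"
    )

    // longestPrefix returns the longest pattern that prefixes s,
    // mirroring the longest-match semantics of XFST's "@->" operator.
    func longestPrefix(s string, pats []string) string {
    	best := ""
    	for _, p := range pats {
    		if strings.HasPrefix(s, p) && len(p) > len(best) {
    			best = p
    		}
    	}
    	return best
    }

    func main() {
    	pats := []string{":)", ":))", ";)"}
    	fmt.Println(longestPrefix(":)) na gut", pats)) // prints ":))"
    }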

testdata/tokenizer.datok (28 KB): binary file not shown.

testdata/tokenizer.fst (26.9 KB): binary file not shown.

testdata/tokenizer.matok (24.8 KB): binary file not shown.
