Skip to content

Commit 53ce771

Browse files
committed
feat(subtitle): support Chinese word segmentation [Fixes #84]
Support splitting Chinese text into words; otherwise, text wrapping may not be correct in the subtitle display. [TODO] Do the same segmentation in the sidebar.
1 parent 2634b09 commit 53ce771

File tree

1 file changed

+20
-5
lines changed

1 file changed

+20
-5
lines changed

LLPlayer/Controls/SelectableSubtitleText.xaml.cs

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,9 @@ private static void OnTextLanguageChanged(DependencyObject d, DependencyProperty
196196
/// <summary>
/// Source-generated regex that matches a string consisting entirely (anchored with ^ and $)
/// of either a run of punctuation characters other than ' and -, or a single whitespace character.
/// </summary>
[GeneratedRegex(@"^(?:[^\P{P}'-]+|\s)$")]
197197
private static partial Regex WordSplitFullReg { get; }
198198

199+
/// <summary>
/// Source-generated regex used to split Chinese text into display units.
/// Each match is one of: a run of punctuation characters other than ' and -,
/// a single whitespace character, or a single character from the
/// CJK Unified Ideographs or CJK Unified Ideographs Extension A blocks.
/// The whole pattern is a capturing group, so Regex.Split also returns the matched
/// text as elements of the result instead of discarding it.
/// </summary>
[GeneratedRegex(@"((?:[^\P{P}'-]+|\s|[\p{IsCJKUnifiedIdeographs}\p{IsCJKUnifiedIdeographsExtensionA}]))")]
200+
private static partial Regex ChineseWordSplitReg { get; }
201+
199202

200203
// Shared MeCab IPA-dictionary tagger, created on first access via Lazy<T>;
// the second constructor argument (true) requests thread-safe initialization.
private static readonly Lazy<MeCabIpaDicTagger> MeCabTagger = new(() => MeCabIpaDicTagger.Create(), true);
201204

@@ -228,31 +231,43 @@ private void SetText(string text)
228231
// Use an OutlinedTextBlock for each word to display the border Text and enclose it in a WrapPanel
229232
for (int i = 0; i < lines.Length; i++)
230233
{
231-
List<string> words;
234+
IEnumerable<string> words;
232235

233236
if (TextLanguage != null && TextLanguage.ISO6391 == "ja")
234237
{
235238
// word segmentation for Japanese
236239
// TODO: L: Also do word segmentation in sidebar
237240
var nodes = MeCabTagger.Value.Parse(lines[i]);
238-
words = new List<string>(nodes.Length);
241+
List<string> wordsList = new(nodes.Length);
239242
foreach (var node in nodes)
240243
{
241244
// If there are space-separated characters, such as English, add them manually since they are not on the Surface
242245
if (char.IsWhiteSpace(lines[i][node.BPos]))
243246
{
244-
words.Add(" ");
247+
wordsList.Add(" ");
245248
}
246-
words.Add(node.Surface);
249+
wordsList.Add(node.Surface);
247250
}
251+
252+
words = wordsList;
253+
}
254+
else if (TextLanguage != null && TextLanguage.ISO6391 == "zh")
255+
{
256+
words = ChineseWordSplitReg.Split(lines[i]);
248257
}
249258
else
250259
{
251-
words = WordSplitReg.Split(lines[i]).Where(w => w != "").ToList();
260+
words = WordSplitReg.Split(lines[i]);
252261
}
253262

254263
foreach (string word in words)
255264
{
265+
// Skip empty strings, which Regex.Split can include in its result
266+
if (word.Length == 0)
267+
{
268+
continue;
269+
}
270+
256271
if (string.IsNullOrWhiteSpace(word))
257272
{
258273
// Blanks are inserted with TextBlock.

0 commit comments

Comments
 (0)