
Commit b461a83

feat: sliding window to keep text from being truncated

1 parent f4f3e5b commit b461a83

File tree

11 files changed: +891 −2373 lines

.gitignore

Lines changed: 3 additions & 1 deletion
@@ -64,4 +64,6 @@ src/course_graph/database/faiss_index
 src/course_graph/database/milvus.db
 libreoffice_convert.log
 .cursorrules
-.ruff_cache
+.ruff_cache
+
+train.sh

docs/tutorials/other/rust.md

Lines changed: 7 additions & 7 deletions
@@ -11,25 +11,25 @@
 
 ### Writing Rust code
 
-Rust extension code should all go under the `rust/src/ext` directory; see the [PyO3 guide](https://pyo3.rs/v0.15.1/) for implementation details.
+Rust extension code should all go into `src/lib.rs`; see the [PyO3 guide](https://pyo3.rs/v0.15.1/) for implementation details.
 
 ### Exporting functions
 
-Add exported functions in `rust/src/lib.rs`; see the functions that are already exported for how to do this.
+Add exported functions inside the `_core` function in `src/lib.rs`; see the functions that are already exported for how to do this.
 
 ### Writing function stubs
 
 To give the IDE better hints, we can write Python stubs for these functions without writing the actual implementation.
 
-Keep adding function stubs in the `rust/extension.pyi` file; type annotations and docstrings are all that is needed.
+Keep adding function stubs in the `src/course_graph/_core.pyi` file; type annotations and docstrings are all that is needed.
 
 ### Building and installing
 
-Make sure the Rust toolchain, Cargo, and Python's `maturin` are installed, then run:
+Make sure the Rust toolchain and Cargo are installed, then run:
 
 ```bash
-cd rust
-maturin develop
+source .venv/bin/activate
+maturin develop --uv
 ```
 
-All the Rust extension functions you write are installed under the `extension` package.
+All the Rust extension functions you write are installed into the `course_graph._core` package.
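As a rough illustration of the stub step described above, one entry in `src/course_graph/_core.pyi` might look like the sketch below; the `fast_split` function is hypothetical and only stands in for whatever the crate actually exports:

```python
# Illustrative sketch of src/course_graph/_core.pyi.
# `fast_split` is a hypothetical name, not necessarily a function the crate exports.
def fast_split(text: str, sep: str = " ") -> list[str]:
    """Split `text` on `sep` using the Rust implementation."""
    ...
```

The stub carries only the signature and docstring; the body stays `...`, since the real implementation lives in the compiled `course_graph._core` module.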

experimental/README.md

Lines changed: 3 additions & 3 deletions
@@ -14,9 +14,9 @@ experimental/
 │   └── txt/
 │       └── *.txt          # raw plain-text data
 ├── scripts/
-│   ├── ner/               # named entity recognition model
-│   ├── overview.py        # data overview
-│   └── pre_trained/       # pre-trained models
+│   ├── ke/                # knowledge extraction model
+│   ├── overview.py        # data overview
+│   └── pre_trained/       # pre-trained models
 ├── results/               # results
 └── README.md

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+#### Entity recognition model training

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+#### Relation recognition model training

experimental/scripts/ke/dataset.py

Lines changed: 101 additions & 28 deletions
@@ -6,15 +6,79 @@
 
 import torch
 from torch.utils.data import Dataset
-from .config import *
+from config import *
 from transformers import PreTrainedTokenizerFast
 
 
 class NERDataset(Dataset):
-    def __init__(self, data, tokenizer, max_len=128):
-        self.data = data
+    def __init__(self, data, tokenizer, max_len=512, exceed_strategy="truncation"):
         self.tokenizer = tokenizer
         self.max_len = max_len
+        self.data = []
+
+        if exceed_strategy == "truncation":
+            self.data = data
+        elif exceed_strategy == "sliding_window":
+            # only supports fast tokenizers for now
+            for item in data:
+                text = item['text']
+                entities = item['entities']
+
+                full_encoding = self.tokenizer(
+                    text,
+                    add_special_tokens=False,
+                    return_offsets_mapping=True,
+                    return_tensors="pt"
+                )
+
+                tokens = full_encoding.tokens()
+                offset_mapping = full_encoding["offset_mapping"].squeeze().tolist()
+
+                if len(tokens) <= max_len:
+                    # item['encoding'] = full_encoding
+                    self.data.append(item)
+                    continue
+
+                window_size = max_len
+                stride = window_size // 2
+
+                start_token_idx = 0
+                while start_token_idx < len(tokens):
+                    end_token_idx = min(start_token_idx + window_size, len(tokens))
+
+                    # [start_token_idx, end_token_idx) ==> [start_char_idx, end_char_idx)
+                    start_char_idx = offset_mapping[start_token_idx][0]
+                    end_char_idx = offset_mapping[end_token_idx - 1][1]
+
+                    # for each window, keep only the entities that lie entirely inside it (this may shrink the window)
+                    for entity in entities:
+                        # assumption: entity length is far smaller than window_size and stride
+                        if entity['start'] <= start_char_idx < entity['end']:
+                            start_char_idx = entity['end']
+                        if entity['start'] <= end_char_idx < entity['end']:
+                            end_char_idx = entity['start']
+                            break
+                    # start_char_idx and end_char_idx should be adjusted accordingly as well, but this is not handled here
+
+                    window_entities = []
+                    for entity in entities:
+                        if entity['start'] >= start_char_idx and entity['end'] <= end_char_idx:
+                            new_entity = entity.copy()
+                            new_entity['start'] -= start_char_idx
+                            new_entity['end'] -= start_char_idx
+                            window_entities.append(new_entity)
+
+                    window_text = text[start_char_idx:end_char_idx]
+                    window_data = {
+                        'text': window_text,
+                        'entities': window_entities  # no encoding attached for now
+                    }
+                    self.data.append(window_data)
+
+                    next_token_idx = start_token_idx + stride  # overlapping windows
+                    start_token_idx = next_token_idx
+        else:
+            pass
 
     def __len__(self):
         return len(self.data)
@@ -31,21 +95,24 @@ def __getitem__(self, idx):
             for i in range(start + 1, end):
                 char_labels[i] = f"I-{entity_type}"
 
+        # align char_labels to token_labels
         if isinstance(self.tokenizer, PreTrainedTokenizerFast):
-            encoding = self.tokenizer(
-                text,
-                add_special_tokens=False,
-                max_length=self.max_len,
-                padding="max_length",
-                truncation=True,
-                return_offsets_mapping=True,
-                return_tensors="pt"
-            )
-
+            if self.data[idx].get('encoding'):
+                encoding = self.data[idx]['encoding']  # may already have been produced during preprocessing
+            else:
+                encoding = self.tokenizer(
+                    text,
+                    add_special_tokens=False,
+                    max_length=self.max_len,
+                    padding="max_length",
+                    truncation=True,
+                    return_offsets_mapping=True,
+                    return_tensors="pt"
+                )
             input_ids = encoding["input_ids"].squeeze()
             attention_mask = encoding["attention_mask"].squeeze()
-            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
-            offset_mapping = encoding["offset_mapping"].squeeze().tolist()
+            tokens = encoding.tokens()
+            offset_mapping = encoding["offset_mapping"].squeeze().tolist()  # each token's span in the original text
 
             # derive token_labels from the entities
             token_labels = []
@@ -70,7 +137,7 @@ def __getitem__(self, idx):
 
             input_ids = encoding["input_ids"].squeeze()
             attention_mask = encoding["attention_mask"].squeeze()
-            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
+            tokens = encoding.tokens()
 
             token_labels = []
             char_idx = 0
@@ -94,7 +161,7 @@ def __getitem__(self, idx):
 
 
 class REDataset(Dataset):
-    def __init__(self, data, tokenizer, max_len=128):
+    def __init__(self, data, tokenizer, max_len=512, exceed_strategy="truncation"):
         self.tokenizer = tokenizer
         self.max_len = max_len
 
@@ -103,13 +170,19 @@ def __init__(self, data, tokenizer, max_len=128):
             text = line['text']
             entities = line['entities']
             relations = line['relations']
-            for relation in relations:
-                self.data.append({
-                    'text': text,
-                    'e1': next(filter(lambda x: x['id'] == relation['source_id'], entities)),
-                    'e2': next(filter(lambda x: x['id'] == relation['target_id'], entities)),
-                    'relation': relation['type']
-                })
+
+            if exceed_strategy == "truncation":
+                for relation in relations:
+                    self.data.append({
+                        'text': text,
+                        'e1': next(filter(lambda x: x['id'] == relation['source_id'], entities)),
+                        'e2': next(filter(lambda x: x['id'] == relation['target_id'], entities)),
+                        'relation': relation['type']
+                    })
+            elif exceed_strategy == "sliding_window":
+                pass
+            else:
+                pass
 
     def __len__(self):
         return len(self.data)
@@ -139,11 +212,11 @@ def __getitem__(self, idx):
             attention_mask = encoding["attention_mask"].squeeze()
             offset_mapping = encoding["offset_mapping"].squeeze().tolist()
 
-            e1_mask = _create_entity_mask(input_ids, offset_mapping, e1_start, e1_end)
+            e1_mask = _create_entity_mask(input_ids, offset_mapping, e1_start, e1_end)  # entity positions are 1 in the mask
             e2_mask = _create_entity_mask(input_ids, offset_mapping, e2_start, e2_end)
 
         else:
-
+
             encoding = self.tokenizer(
                 text,
                 add_special_tokens=False,
@@ -156,10 +229,10 @@ def __getitem__(self, idx):
            input_ids = encoding["input_ids"].squeeze()
            attention_mask = encoding["attention_mask"].squeeze()
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
-
+
            e1_mask = _create_entity_mask2(text, input_ids, tokens, e1_start, e1_end)
            e2_mask = _create_entity_mask2(text, input_ids, tokens, e2_start, e2_end)
-
+
            return {
                "input_ids": input_ids,
                "attention_mask": attention_mask,

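A minimal usage sketch of the new `exceed_strategy="sliding_window"` path, assuming a fast (Rust-backed) tokenizer as the code above requires; the `bert-base-chinese` checkpoint, the import path, and the `label` key on each entity are illustrative assumptions, not fixed by this diff:

```python
from transformers import AutoTokenizer
from dataset import NERDataset  # assumes running from experimental/scripts/ke/

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")  # fast tokenizer by default

samples = [{
    "text": "some very long document ..." * 200,              # longer than 512 tokens
    "entities": [{"start": 5, "end": 9, "label": "CONCEPT"}],  # `label` key is an assumption
}]

# Long samples are re-split into overlapping windows (stride = max_len // 2)
# instead of being truncated at max_len; short samples pass through unchanged.
dataset = NERDataset(samples, tokenizer, max_len=512, exceed_strategy="sliding_window")
print(len(dataset))  # usually larger than len(samples) once windows are expanded
```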
experimental/scripts/ke/model.py

Lines changed: 9 additions & 5 deletions
@@ -45,13 +45,18 @@ def forward(self, input_ids, attention_mask, token_type_ids, labels=None, **kwar
         emissions = self.hidden2label(lstm_output)  # (batch_size, max_len, num_labels)
 
         pred_label_ids = self.crf.decode(emissions, mask=attention_mask.bool())
-        pred_label_ids = torch.tensor(pred_label_ids, device=emissions.device)
+        max_len = input_ids.shape[1]
+        padded_label_ids = []
+        for seq in pred_label_ids:
+            padded_seq = seq + [0] * (max_len - len(seq))  # pad using plain list operations
+            padded_label_ids.append(padded_seq)
+
+        pred_label_ids = torch.tensor(padded_label_ids, device=emissions.device)
         pred_label_ids[pred_label_ids == 0] = 1  # post-processing: predictions may not contain IGNORE
-        pred_label_ids = F.pad(pred_label_ids, (0, input_ids.shape[1] - pred_label_ids.shape[1]), value=0, mode="constant")
 
         if labels is not None:
             valid_mask = labels != 0
-            loss = -self.crf(emissions, labels, mask=valid_mask)
+            loss = -self.crf(emissions, labels, mask=valid_mask, reduction='mean')
             return {
                 "loss": loss,
                 "pred_label_ids": pred_label_ids
@@ -93,8 +98,7 @@ def forward(self, input_ids, attention_mask, token_type_ids, e1_mask, e2_mask, l
         concat_h = torch.cat([e1_h, e2_h], dim=-1)  # (batch_size, hidden_size*2)
         concat_h = self.dropout(concat_h)
         logits = self.classifier(concat_h)  # (batch_size, num_relations)
-
-        loss = None
+
         if labels is not None:
             loss_fct = nn.CrossEntropyLoss()
             loss = loss_fct(logits.view(-1, self.num_relations), labels.view(-1))
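For context, the padding step introduced above can be looked at in isolation: a torchcrf-style `decode` returns plain Python lists whose lengths follow each sequence's mask, so they must be padded back to a common length before being stacked into one tensor. A self-contained sketch with made-up label sequences:

```python
import torch

# Typical shape of a CRF decode result: one label list per sequence,
# each only as long as that sequence's unmasked part.
decoded = [[2, 3, 3, 1], [2, 1]]
max_len = 6  # corresponds to input_ids.shape[1] in the forward pass

padded = [seq + [0] * (max_len - len(seq)) for seq in decoded]
pred_label_ids = torch.tensor(padded)
pred_label_ids[pred_label_ids == 0] = 1  # post-processing: disallow the IGNORE label
print(pred_label_ids)
# tensor([[2, 3, 3, 1, 1, 1],
#         [2, 1, 1, 1, 1, 1]])
```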

experimental/scripts/ke/predict.py

Lines changed: 4 additions & 4 deletions
@@ -142,7 +142,7 @@ def _re_predict(
 def ner_predict(
     text: str,
     model_path: str = "experimental/scripts/ke/checkpoints/ner/final_model",
-    max_len: int = 128,
+    max_len: int = 512,
     device: str = "cuda" if torch.cuda.is_available() else "cpu",
 ) -> list[dict]:
     """
@@ -151,7 +151,7 @@ def ner_predict(
     Args:
         texts (list[str]): list of texts to predict
         model_path (str): model path. Defaults to "experimental/scripts/ke/checkpoints/ner/final_model".
-        max_len (int): maximum length. Defaults to 128.
+        max_len (int): maximum length. Defaults to 512.
         device (str): device. Defaults to "cuda" or "cpu".
 
     Returns:
@@ -166,7 +166,7 @@ def re_predict(
     e1_range: tuple[int, int],
     e2_range: tuple[int, int],
     model_path: str = "experimental/scripts/ke/checkpoints/re/final_model",
-    max_len: int = 128,
+    max_len: int = 512,
     device: str = "cuda" if torch.cuda.is_available() else "cpu",
 ) -> dict:
     """
@@ -177,7 +177,7 @@ def re_predict(
         e1_range (tuple[int, int]): position of entity 1 [start:end]
         e2_range (tuple[int, int]): position of entity 2 [start:end]
         model_path (str): model path. Defaults to "experimental/scripts/ke/checkpoints/re/final_model".
-        max_len (int): maximum length. Defaults to 128.
+        max_len (int): maximum length. Defaults to 512.
         device (str): device. Defaults to "cuda" or "cpu".
 
     Returns:
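A hedged usage sketch built only from the signatures visible in this diff; the import path, the assumption that `re_predict` takes the raw text as its first argument, and the example spans are all illustrative:

```python
from predict import ner_predict, re_predict  # assumed import path

text = "..."  # up to 512 tokens are now kept before truncation

entities = ner_predict(text, max_len=512)  # -> list[dict], per the signature
print(entities)

# e1_range / e2_range are character spans [start:end] of two entities in `text`
relation = re_predict(text, (0, 4), (10, 14), max_len=512)  # -> dict, per the signature
print(relation)
```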
