1
1
import os
2
2
import warnings
3
- import pandas as pd
4
- from .cheetah import Cheetah
3
+ from typing import Dict , List , Tuple , Any , Optional , Set
4
+ from ...helpers .terms import resolve_substitution_conflicts
5
+
5
6
6
7
class CheetahTermFormatter:
    """
    Load search terms from a Markdown file and return them as plain
    strings or dict blocks, with optional category filtering.

    Expected Markdown layout (as recognised by the parser):

    * ``# Category: <name>`` starts a section; terms that appear before the
      first such line belong to no section ("general" terms).
    * A line starting with ``##`` starts a new term.
    * ``positives: a, b`` / ``negatives: c, d`` lines (prefix matched
      case-insensitively) attach comma-separated items to the current term.

    A term without positives/negatives is returned as a ``str``; otherwise
    it is returned as ``{term: {"positives": [...], "negatives": [...]}}``.

    Optionally generates a substitution lookup map (spaces replaced by
    underscores) and can drop conflicts if requested.

    Parameters
    ----------
    markdown_file : str | Path
        Path to the .md file to load.
    lower : bool
        Whether to lowercase all term headers.
    category : str | None
        If set, include only `# Category: <category>` sections.
    include_general : bool
        If filtering by category, whether to include pre-category terms.
    substitutions : bool
        If True, builds substitution maps.
    all_categories : bool
        If True, overrides `category` and `include_general`.
    drop_conflicts : bool
        If True, resolve substitution conflicts and prune dropped entries.
        If False, keep all substitutions as-is (even if conflicting).
        Only consulted when `substitutions` is True.

    Notes
    -----
    The reverse map has two shapes. After `_build_substitutions_lookup` it
    maps ``underscored -> term`` (str to str). After `_postprocess_conflicts`
    (i.e. ``substitutions=True`` and ``drop_conflicts=True``) it is rebuilt
    as ``underscored -> [term, ...]`` (str to list of str).
    """

    def __init__(
        self,
        markdown_file,
        lower: bool = False,
        category: Optional[str] = None,
        include_general: bool = True,
        substitutions: bool = False,
        all_categories: bool = False,
        drop_conflicts: bool = True,
    ):
        self.markdown_file = markdown_file
        self.lower = lower
        self.category = category
        self.include_general = include_general
        self.substitutions = substitutions
        self.all_categories = all_categories
        self.drop_conflicts = drop_conflicts

        self.substitution_forward: Dict[str, str] = {}
        # Values are str before conflict resolution and List[str] after it
        # (see the class Notes), hence the loose value type.
        self.substitution_reverse: Dict[str, Any] = {}

        # parse markdown -> raw terms list
        self.terms: List[Any] = self._parse_markdown()

        # optionally build lookup tables
        if self.substitutions:
            self._build_substitutions_lookup()
            if self.drop_conflicts:
                self._postprocess_conflicts()

    # ──────────────────────────────────────────────────────────────── #
    #  markdown parsing                                                #
    # ──────────────────────────────────────────────────────────────── #
    def _section_included(self, current_section: Optional[str]) -> bool:
        """Return True if terms under `current_section` pass the filter."""
        # `is None` (not truthiness) so that category="" still filters
        # instead of silently excluding every section.
        if self.all_categories or self.category is None:
            return True
        if current_section is None:
            # term appears before any "# Category:" line
            return self.include_general
        return current_section == self.category

    @staticmethod
    def _make_entry(term: str, positives: List[str], negatives: List[str]) -> Any:
        """Return `term` as a dict block if it has qualifiers, else as a str."""
        if positives or negatives:
            return {term: {"positives": positives, "negatives": negatives}}
        return term

    @staticmethod
    def _split_items(line: str) -> List[str]:
        """Return the non-empty, stripped comma-separated items after the first ':'."""
        return [item.strip() for item in line.split(":", 1)[1].split(",") if item.strip()]

    def _parse_markdown(self) -> List[Any]:
        """
        Parse `self.markdown_file` into a list of terms.

        Returns
        -------
        list
            Terms in file order, each a ``str`` or a single-key ``dict``
            (see the class docstring). An empty list is returned, and a
            warning is issued, if the file does not exist.
        """
        terms: List[Any] = []
        current_term: Optional[str] = None
        positives: List[str] = []
        negatives: List[str] = []
        active_block = False
        current_section: Optional[str] = None

        try:
            with open(self.markdown_file, "r", encoding="utf-8") as f:
                lines = f.readlines()
        except FileNotFoundError:
            warnings.warn(f"File '{self.markdown_file}' not found. Returning empty list.")
            return terms

        for raw in lines:
            line = raw.strip()

            if line.startswith("# Category:"):
                current_section = line.split(":", 1)[1].strip()
                continue

            if line.startswith("##"):
                # A new header closes the previous term, if it was included.
                if current_term is not None and active_block:
                    terms.append(self._make_entry(current_term, positives, negatives))

                positives, negatives = [], []
                header = line.lstrip("#").strip()
                if self.lower:
                    header = header.lower()
                current_term = header
                # The filter decision is fixed at the header and applies to
                # the whole block that follows it.
                active_block = self._section_included(current_section)
                continue

            if not active_block:
                continue

            prefix_probe = line.lower()
            if prefix_probe.startswith("positives:"):
                positives.extend(self._split_items(line))
            elif prefix_probe.startswith("negatives:"):
                negatives.extend(self._split_items(line))

        # The last term has no following header to close it.
        if current_term is not None and active_block:
            terms.append(self._make_entry(current_term, positives, negatives))

        return terms

    # ──────────────────────────────────────────────────────────────── #
    #  substitutions lookup                                            #
    # ──────────────────────────────────────────────────────────────── #
    def _build_substitutions_lookup(self) -> None:
        """Create forward & reverse maps (no filtering yet).

        Forward maps each term to the term with spaces replaced by
        underscores; reverse maps the underscored form back to the term.
        """
        for entry in self.terms:
            # A str entry is the term itself; a dict entry is keyed by term.
            names = [entry] if isinstance(entry, str) else list(entry)
            for term in names:
                underscored = term.replace(" ", "_")
                self.substitution_forward[term] = underscored
                self.substitution_reverse[underscored] = term

    def _postprocess_conflicts(self) -> None:
        """Resolve substitution conflicts and prune dropped terms.

        Delegates to `resolve_substitution_conflicts`, which is called with
        the forward map and ``warn=True`` and is expected to return
        ``(clean_forward, dropped)``. The forward map is replaced, the
        reverse map is rebuilt as ``target -> [sources]``, and every term
        found in `dropped` is removed from `self.terms`.
        """
        clean_forward, dropped = resolve_substitution_conflicts(
            self.substitution_forward, warn=True
        )
        self.substitution_forward = clean_forward

        # rebuild reverse map (list-valued: several sources may share a target)
        rev: Dict[str, List[str]] = {}
        for src, tgt in clean_forward.items():
            rev.setdefault(tgt, []).append(src)
        self.substitution_reverse = rev

        if not dropped:
            return

        # prune self.terms to match cleaned substitutions
        pruned_terms: List[Any] = []
        for entry in self.terms:
            if isinstance(entry, str):
                if entry not in dropped:
                    pruned_terms.append(entry)
            else:
                kept = {k: v for k, v in entry.items() if k not in dropped}
                if kept:
                    pruned_terms.append(kept)
        self.terms = pruned_terms

    # ──────────────────────────────────────────────────────────────── #
    #  public access                                                   #
    # ──────────────────────────────────────────────────────────────── #
    def get_terms(self) -> List[Any]:
        """Return the parsed (and, if applicable, pruned) list of terms."""
        return self.terms

    def get_substitution_maps(self) -> Tuple[Dict[str, str], Dict[str, Any]]:
        """Return (forward_map, reverse_map); both empty if substitutions=False."""
        return self.substitution_forward, self.substitution_reverse
141
186
142
187
188
+ # ═══════════════════════════════════════════════════════════════════ #
189
+ # utility: convert TXT dump → cheetah markdown #
190
+ # ═══════════════════════════════════════════════════════════════════ #
143
191
def convert_txt_to_cheetah_markdown(txt_path, markdown_path):
    """
    Helper to convert a simple TXT list (optionally containing dict literals)
    into the markdown format expected by CheetahTermFormatter.

    Each non-blank input line becomes one or more ``##`` term headers:

    * A line that starts with ``{`` and ends with ``}`` is parsed as a Python
      dict literal ``{term: [items...]}``. Items starting with ``+`` are
      written (without the leading ``+``) on a ``positives:`` line; all other
      items are written on a ``negatives:`` line.
    * Any other line is written verbatim as ``## <line>``.

    A dict line that cannot be parsed, or whose values are not strings or
    iterables of strings, is skipped as a whole and reported on stdout.

    Parameters
    ----------
    txt_path : str | Path
        Input text file, read as UTF-8.
    markdown_path : str | Path
        Output Markdown file, written as UTF-8 (overwritten if present).
    """
    import ast

    with open(txt_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]

    md_lines: List[str] = []

    for line in lines:
        if not (line.startswith("{") and line.endswith("}")):
            md_lines.append(f"## {line}")
            continue

        # Build into a per-line buffer so a failure on a later key does not
        # leave earlier keys of the same line half-written in the output.
        block_lines: List[str] = []
        try:
            parsed = ast.literal_eval(line)
            if not isinstance(parsed, dict):
                raise TypeError(f"expected a dict literal, got {type(parsed).__name__}")

            for key, value in parsed.items():
                # A bare string is a single item, not a sequence of characters.
                if isinstance(value, str):
                    value = [value]
                positives = [v.lstrip("+") for v in value if v.startswith("+")]
                negatives = [v for v in value if not v.startswith("+")]
                block_lines.append(f"## {key}")
                if positives:
                    block_lines.append(f"positives: {', '.join(positives)}")
                if negatives:
                    block_lines.append(f"negatives: {', '.join(negatives)}")
        except (
            ValueError,
            SyntaxError,
            TypeError,
            AttributeError,
            MemoryError,
            RecursionError,
        ) as e:
            # Best-effort conversion: report the bad line and carry on.
            print(f"Skipping line due to parse error: {line}\nError: {e}")
        else:
            md_lines.extend(block_lines)

    with open(markdown_path, "w", encoding="utf-8") as f:
        f.write("\n".join(md_lines))

    print(f"Converted markdown saved to: {markdown_path}")
0 commit comments