Skip to content

Commit adc009d

Browse files
authored
Add LLM tokens used to MetaData, fix PCCW parser and text hook improvement (#328)
* Add LLM tokens used to MetaData, fix the PCCW parser, and improve the text hook * Add changes doc
1 parent 202b4d2 commit adc009d

File tree

6 files changed

+72
-14
lines changed

6 files changed

+72
-14
lines changed

changes/328.housekeeping

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add LLM tokens used to MetaData
2+
Fix PCCW parser
3+
Text hook improvement

circuit_maintenance_parser/output.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ class Metadata(BaseModel):
113113
processor: StrictStr
114114
parsers: List[StrictStr]
115115
generated_by_llm: bool = False
116+
tokens_used: int = 0
116117

117118

118119
class Maintenance(BaseModel, extra="forbid"):

circuit_maintenance_parser/parser.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import bs4 # type: ignore
1515
from bs4.element import ResultSet # type: ignore
16+
from charset_normalizer import from_bytes
1617
from dateutil.parser import isoparse
1718
from icalendar import Calendar # type: ignore
1819
from pydantic import BaseModel, PrivateAttr
@@ -305,6 +306,7 @@ class LLM(Parser):
305306
"""LLM parser."""
306307

307308
_data_types = PrivateAttr(["text/html", "html", "text/plain"])
309+
_tokens_used = PrivateAttr(default=0)
308310

309311
_llm_question = """Please, could you extract a JSON form without any other comment,
310312
with the following JSON schema (start and end times are datetime objects and should be displayed in UTC):
@@ -374,7 +376,18 @@ def parser_hook(self, raw: bytes, content_type: str):
374376
@staticmethod
375377
def get_text_hook(raw: bytes) -> str:
376378
"""Can be overwritten by subclasses."""
377-
return raw.decode()
379+
try:
380+
# Decode quoted-printable if needed
381+
decoded_bytes = quopri.decodestring(raw)
382+
383+
# Auto-detect and decode
384+
result = from_bytes(decoded_bytes).best()
385+
if result is not None:
386+
return str(result)
387+
return decoded_bytes.decode("latin-1", errors="replace")
388+
except (UnicodeDecodeError, ValueError, TypeError, AttributeError):
389+
# Final fallback if all above methods fail
390+
return raw.decode("utf-8", errors="replace")
378391

379392
@staticmethod
380393
def get_key_with_string(dictionary: dict, string: str):
@@ -401,6 +414,16 @@ def llm_question(self):
401414

402415
return self._llm_question
403416

417+
@property
418+
def tokens_used(self):
419+
"""Return the number of tokens used by the LLM."""
420+
return self._tokens_used
421+
422+
@tokens_used.setter
423+
def tokens_used(self, value):
424+
"""Set the number of tokens used by the LLM."""
425+
self._tokens_used = value
426+
404427
def get_llm_response(self, content):
405428
"""Method to retrieve the response from the LLM for some content."""
406429
raise NotImplementedError
@@ -482,7 +505,6 @@ def _get_maintenance_id(self, generated_json: dict, start, end, circuits):
482505
maintenance_key = self.get_key_with_string(generated_json, "maintenance")
483506
if maintenance_key and generated_json["maintenance_id"] != "N/A":
484507
return generated_json["maintenance_id"]
485-
486508
maintenance_id = str(start) + str(end) + "".join(list(circuits))
487509
return hashlib.sha256(maintenance_id.encode("utf-8")).hexdigest() # nosec
488510

@@ -508,6 +530,7 @@ def parse_content(self, content):
508530
"summary": str(self._get_summary(generated_json)),
509531
"status": self._get_status(generated_json),
510532
"account": str(self._get_account(generated_json)),
533+
"_llm_tokens_used": self.tokens_used,
511534
}
512535

513536
# Generate maintenance ID for main window

circuit_maintenance_parser/parsers/openai.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ def get_llm_response(self, content) -> Optional[List]:
4848
logger.error(err)
4949
return None
5050

51+
# Store the token usage information
52+
if hasattr(response, "usage") and hasattr(response.usage, "total_tokens"):
53+
self.tokens_used = response.usage.total_tokens
54+
5155
logger.info("Used OpenAI tokens: %s", response.usage)
5256
generated_text = response.choices[0].message.content
5357
logger.info("Response from LLM: %s", generated_text)

circuit_maintenance_parser/parsers/pccw.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,18 +38,36 @@ def parse_html(self, soup: ResultSet) -> List[Dict]:
3838

3939
def _extract_account(self, soup: ResultSet) -> str:
4040
"""Extract customer account from soup."""
41-
customer_field = soup.find(string=re.compile("Customer Name :", re.IGNORECASE))
42-
return customer_field.split(":")[1].strip()
41+
for string_node in soup.find_all(string=True):
42+
text = str(string_node)
43+
if "customer name" in text.lower():
44+
# Try to extract just the account name
45+
match = re.search(r"Customer Name\s*:\s*(.+)", text, re.IGNORECASE)
46+
if match:
47+
return match.group(1).strip()
48+
49+
# Return a default value if customer name is not found
50+
return "Unknown"
4351

4452
def _extract_maintenance_window(self, soup: ResultSet) -> Tuple[datetime, datetime]:
4553
"""Extract start and end times from maintenance window."""
46-
datetime_field = soup.find(string=re.compile("Date Time :", re.IGNORECASE))
47-
time_parts = (
48-
datetime_field.lower().replace("date time :", "-").replace("to", "-").replace("gmt", "-").split("-")
49-
)
50-
start_time = datetime.strptime(time_parts[1].strip(), self.DATE_TIME_FORMAT)
51-
end_time = datetime.strptime(time_parts[2].strip(), self.DATE_TIME_FORMAT)
52-
return start_time, end_time
54+
for string_node in soup.find_all(string=True):
55+
text = str(string_node)
56+
57+
if "date time" in text.lower():
58+
# Match format like: Date Time : 12/06/2025 15:30:00 to 12/06/2025 19:30:00 GMT
59+
match = re.search(
60+
r"Date Time\s*:\s*(\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2})\s*to\s*(\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2})",
61+
text,
62+
re.IGNORECASE,
63+
)
64+
if match:
65+
start_str, end_str = match.groups()
66+
start_time = datetime.strptime(start_str.strip(), self.DATE_TIME_FORMAT)
67+
end_time = datetime.strptime(end_str.strip(), self.DATE_TIME_FORMAT)
68+
return start_time, end_time
69+
70+
raise ValueError("Could not find 'Date Time :' field or failed to parse timestamps.")
5371

5472

5573
class SubjectParserPCCW(EmailSubjectParser):

circuit_maintenance_parser/processor.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,13 +101,14 @@ def get_name(cls) -> str:
101101
"""Return the processor name."""
102102
return cls.__name__
103103

104-
def generate_metadata(self):
104+
def generate_metadata(self, tokens_used=0):
105105
"""Generate the Metadata for the Maintenance."""
106106
return Metadata(
107107
parsers=[parser.get_name() for parser in self.data_parsers],
108108
generated_by_llm=any(issubclass(parser, LLM) for parser in self.data_parsers),
109109
processor=self.get_name(),
110110
provider=self.extended_data["provider"],
111+
tokens_used=tokens_used,
111112
)
112113

113114

@@ -118,7 +119,11 @@ def process_hook(self, maintenances_extracted_data, maintenances_data):
118119
"""For each data extracted (that can be multiple), we try to build a complete Maintenance."""
119120
for extracted_data in maintenances_extracted_data:
120121
self.extend_processor_data(extracted_data)
121-
extracted_data["_metadata"] = self.generate_metadata()
122+
123+
# Extract tokens information if present
124+
tokens_used = extracted_data.pop("_llm_tokens_used", 0)
125+
126+
extracted_data["_metadata"] = self.generate_metadata(tokens_used=tokens_used)
122127
maintenances_data.append(Maintenance(**extracted_data))
123128

124129

@@ -156,7 +161,11 @@ def post_process_hook(self, maintenances_data):
156161
for maintenance in maintenances:
157162
try:
158163
combined_data = {**self.combined_maintenance_data, **maintenance}
159-
combined_data["_metadata"] = self.generate_metadata()
164+
165+
# Extract tokens information if present
166+
tokens_used = combined_data.pop("_llm_tokens_used", 0)
167+
168+
combined_data["_metadata"] = self.generate_metadata(tokens_used=tokens_used)
160169
maintenances_data.append(Maintenance(**combined_data))
161170
except ValidationError as exc:
162171
raise ProcessorError("Not enough information available to create a Maintenance notification.") from exc

0 commit comments

Comments (0)