Skip to content

Commit adc009d

Browse files
authored
Add LLM tokens used to MetaData, fix PCCW parser and text hook improvement (#328)
* Add LLM tokens used to MetaData, fix the PCCW parser, and improve the text hook * Add changes doc
1 parent 202b4d2 commit adc009d

File tree

6 files changed

+72
-14
lines changed

6 files changed

+72
-14
lines changed

changes/328.housekeeping

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add LLM tokens used to MetaData
2+
Fix PCCW parser
3+
Text hook improvement

circuit_maintenance_parser/output.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ class Metadata(BaseModel):
113113
processor: StrictStr
114114
parsers: List[StrictStr]
115115
generated_by_llm: bool = False
116+
tokens_used: int = 0
116117

117118

118119
class Maintenance(BaseModel, extra="forbid"):

circuit_maintenance_parser/parser.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import bs4 # type: ignore
1515
from bs4.element import ResultSet # type: ignore
16+
from charset_normalizer import from_bytes
1617
from dateutil.parser import isoparse
1718
from icalendar import Calendar # type: ignore
1819
from pydantic import BaseModel, PrivateAttr
@@ -305,6 +306,7 @@ class LLM(Parser):
305306
"""LLM parser."""
306307

307308
_data_types = PrivateAttr(["text/html", "html", "text/plain"])
309+
_tokens_used = PrivateAttr(default=0)
308310

309311
_llm_question = """Please, could you extract a JSON form without any other comment,
310312
with the following JSON schema (start and end times are datetime objects and should be displayed in UTC):
@@ -374,7 +376,18 @@ def parser_hook(self, raw: bytes, content_type: str):
374376
@staticmethod
375377
def get_text_hook(raw: bytes) -> str:
376378
"""Can be overwritten by subclasses."""
377-
return raw.decode()
379+
try:
380+
# Decode quoted-printable if needed
381+
decoded_bytes = quopri.decodestring(raw)
382+
383+
# Auto-detect and decode
384+
result = from_bytes(decoded_bytes).best()
385+
if result is not None:
386+
return str(result)
387+
return decoded_bytes.decode("latin-1", errors="replace")
388+
except (UnicodeDecodeError, ValueError, TypeError, AttributeError):
389+
# Final fallback if all above methods fail
390+
return raw.decode("utf-8", errors="replace")
378391

379392
@staticmethod
380393
def get_key_with_string(dictionary: dict, string: str):
@@ -401,6 +414,16 @@ def llm_question(self):
401414

402415
return self._llm_question
403416

417+
@property
418+
def tokens_used(self):
419+
"""Return the number of tokens used by the LLM."""
420+
return self._tokens_used
421+
422+
@tokens_used.setter
423+
def tokens_used(self, value):
424+
"""Set the number of tokens used by the LLM."""
425+
self._tokens_used = value
426+
404427
def get_llm_response(self, content):
405428
"""Method to retrieve the response from the LLM for some content."""
406429
raise NotImplementedError
@@ -482,7 +505,6 @@ def _get_maintenance_id(self, generated_json: dict, start, end, circuits):
482505
maintenance_key = self.get_key_with_string(generated_json, "maintenance")
483506
if maintenance_key and generated_json["maintenance_id"] != "N/A":
484507
return generated_json["maintenance_id"]
485-
486508
maintenance_id = str(start) + str(end) + "".join(list(circuits))
487509
return hashlib.sha256(maintenance_id.encode("utf-8")).hexdigest() # nosec
488510

@@ -508,6 +530,7 @@ def parse_content(self, content):
508530
"summary": str(self._get_summary(generated_json)),
509531
"status": self._get_status(generated_json),
510532
"account": str(self._get_account(generated_json)),
533+
"_llm_tokens_used": self.tokens_used,
511534
}
512535

513536
# Generate maintenance ID for main window

circuit_maintenance_parser/parsers/openai.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ def get_llm_response(self, content) -> Optional[List]:
4848
logger.error(err)
4949
return None
5050

51+
# Store the token usage information
52+
if hasattr(response, "usage") and hasattr(response.usage, "total_tokens"):
53+
self.tokens_used = response.usage.total_tokens
54+
5155
logger.info("Used OpenAI tokens: %s", response.usage)
5256
generated_text = response.choices[0].message.content
5357
logger.info("Response from LLM: %s", generated_text)

circuit_maintenance_parser/parsers/pccw.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,18 +38,36 @@ def parse_html(self, soup: ResultSet) -> List[Dict]:
3838

3939
def _extract_account(self, soup: ResultSet) -> str:
4040
"""Extract customer account from soup."""
41-
customer_field = soup.find(string=re.compile("Customer Name :", re.IGNORECASE))
42-
return customer_field.split(":")[1].strip()
41+
for string_node in soup.find_all(string=True):
42+
text = str(string_node)
43+
if "customer name" in text.lower():
44+
# Try to extract just the account name
45+
match = re.search(r"Customer Name\s*:\s*(.+)", text, re.IGNORECASE)
46+
if match:
47+
return match.group(1).strip()
48+
49+
# Return a default value if customer name is not found
50+
return "Unknown"
4351

4452
def _extract_maintenance_window(self, soup: ResultSet) -> Tuple[datetime, datetime]:
4553
"""Extract start and end times from maintenance window."""
46-
datetime_field = soup.find(string=re.compile("Date Time :", re.IGNORECASE))
47-
time_parts = (
48-
datetime_field.lower().replace("date time :", "-").replace("to", "-").replace("gmt", "-").split("-")
49-
)
50-
start_time = datetime.strptime(time_parts[1].strip(), self.DATE_TIME_FORMAT)
51-
end_time = datetime.strptime(time_parts[2].strip(), self.DATE_TIME_FORMAT)
52-
return start_time, end_time
54+
for string_node in soup.find_all(string=True):
55+
text = str(string_node)
56+
57+
if "date time" in text.lower():
58+
# Match format like: Date Time : 12/06/2025 15:30:00 to 12/06/2025 19:30:00 GMT
59+
match = re.search(
60+
r"Date Time\s*:\s*(\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2})\s*to\s*(\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2})",
61+
text,
62+
re.IGNORECASE,
63+
)
64+
if match:
65+
start_str, end_str = match.groups()
66+
start_time = datetime.strptime(start_str.strip(), self.DATE_TIME_FORMAT)
67+
end_time = datetime.strptime(end_str.strip(), self.DATE_TIME_FORMAT)
68+
return start_time, end_time
69+
70+
raise ValueError("Could not find 'Date Time :' field or failed to parse timestamps.")
5371

5472

5573
class SubjectParserPCCW(EmailSubjectParser):

circuit_maintenance_parser/processor.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,13 +101,14 @@ def get_name(cls) -> str:
101101
"""Return the processor name."""
102102
return cls.__name__
103103

104-
def generate_metadata(self):
104+
def generate_metadata(self, tokens_used=0):
105105
"""Generate the Metadata for the Maintenance."""
106106
return Metadata(
107107
parsers=[parser.get_name() for parser in self.data_parsers],
108108
generated_by_llm=any(issubclass(parser, LLM) for parser in self.data_parsers),
109109
processor=self.get_name(),
110110
provider=self.extended_data["provider"],
111+
tokens_used=tokens_used,
111112
)
112113

113114

@@ -118,7 +119,11 @@ def process_hook(self, maintenances_extracted_data, maintenances_data):
118119
"""For each data extracted (that can be multiple), we try to build a complete Maintenance."""
119120
for extracted_data in maintenances_extracted_data:
120121
self.extend_processor_data(extracted_data)
121-
extracted_data["_metadata"] = self.generate_metadata()
122+
123+
# Extract tokens information if present
124+
tokens_used = extracted_data.pop("_llm_tokens_used", 0)
125+
126+
extracted_data["_metadata"] = self.generate_metadata(tokens_used=tokens_used)
122127
maintenances_data.append(Maintenance(**extracted_data))
123128

124129

@@ -156,7 +161,11 @@ def post_process_hook(self, maintenances_data):
156161
for maintenance in maintenances:
157162
try:
158163
combined_data = {**self.combined_maintenance_data, **maintenance}
159-
combined_data["_metadata"] = self.generate_metadata()
164+
165+
# Extract tokens information if present
166+
tokens_used = combined_data.pop("_llm_tokens_used", 0)
167+
168+
combined_data["_metadata"] = self.generate_metadata(tokens_used=tokens_used)
160169
maintenances_data.append(Maintenance(**combined_data))
161170
except ValidationError as exc:
162171
raise ProcessorError("Not enough information available to create a Maintenance notification.") from exc

0 commit comments

Comments (0)