baseline code added

shalymin-amzn · shalymin-amzn · commit 8a3b7fd2e64b · 2025-01-13T10:33:08.000-08:00
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,10 @@
+langchain==0.2.6
+langchain-community==0.2.6
+sentence-transformers==2.6.1
+scikit-learn==0.22.2.post1
+coclust==0.2.1
+rouge-score==0.1.2
+scipy==1.13.0
+numpy==1.23.5
+tqdm==4.66.4
+torch==2.2.2
diff --git a/scripts/run_theme_detection.py b/scripts/run_theme_detection.py
@@ -0,0 +1,136 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
+# SPDX-License-Identifier: CC-BY-NC-4.0
+
+from argparse import ArgumentParser
+import json
+import os
+import copy
+import collections
+
+import getpass
+import tqdm
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_core.runnables import RunnableParallel
+from sklearn.cluster import KMeans
+
+from dstc12.prompts import LABEL_CLUSTERS_PROMPT
+from dstc12.utils import get_llm, DotAllRegexParser
+import numpy as np
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('dataset_file', type=str)
+    parser.add_argument('preferences_file', type=str)
+    parser.add_argument('result_file', type=str)
+    parser.add_argument('--n-clusters', type=int, default=10)
+    parser.add_argument('--random-state', type=int, default=42)
+    parser.add_argument('--embedding-model-name', type=str, default='sentence-transformers/all-mpnet-base-v2')
+    parser.add_argument('--llm-name', type=str, default='mistralai/Mistral-7B-Instruct-v0.3')
+    return parser.parse_args()
+
+
+def find_second_closest_cluster(emb, centroids):
+    distances = [np.linalg.norm(emb - centroid) for centroid in centroids]
+    sorted_indices = np.argsort(distances)
+    return sorted_indices[1]
+
+
+def apply_preferences_to_clusters(utterances, utterance_embs, cluster_labels, cluster_centroids, shouldlink_pairs, cannot_link_pairs):
+    assert len(utterances) == len(cluster_labels)
+
+    datapoint_modification_counter = collections.defaultdict(lambda: 0)
+
+    utterance_cluster_mapping = collections.defaultdict(lambda: -1)
+    utterance_idx_mapping = collections.defaultdict(lambda: -1)
+    for utt_idx, cluster_label in enumerate(cluster_labels):
+        utterance = utterances[utt_idx]
+        utterance_cluster_mapping[utterance] = cluster_label
+        utterance_idx_mapping[utterance] = utt_idx
+    modified_cluster_labels = copy.deepcopy(cluster_labels)
+    for utt_a, utt_b in shouldlink_pairs:
+        cluster_a, cluster_b = utterance_cluster_mapping[utt_a], utterance_cluster_mapping[utt_b]
+        if cluster_a != cluster_b:
+            utt_b_idx = utterance_idx_mapping[utt_b]
+            modified_cluster_labels[utt_b_idx] = cluster_a
+            utterance_cluster_mapping[utt_b] = cluster_a
+            datapoint_modification_counter[utt_b_idx] += 1
+    for utt_a, utt_b in cannot_link_pairs:
+        cluster_a, cluster_b = utterance_cluster_mapping[utt_a], utterance_cluster_mapping[utt_b]
+        if cluster_a == cluster_b:
+            utt_b_idx = utterance_idx_mapping[utt_b]
+            utt_b_new_cluster = find_second_closest_cluster(utterance_embs[utt_b_idx], cluster_centroids)
+            modified_cluster_labels[utt_b_idx] = utt_b_new_cluster
+            utterance_cluster_mapping[utt_b] = utt_b_new_cluster
+            datapoint_modification_counter[utt_b_idx] += 1
+    return modified_cluster_labels
+
+
+def main(utterances, linking_preferences, embedding_model_name, llm_name, n_clusters, random_state):
+    llm = get_llm(llm_name)
+    chain = (
+        LABEL_CLUSTERS_PROMPT |
+        llm |
+        RunnableParallel(
+            theme_label=DotAllRegexParser(regex=r'<theme_label>(.*?)</theme_label>', output_keys=['theme_label']),
+            theme_label_explanation=DotAllRegexParser(regex=r'<theme_label_explanation>(.*?)</theme_label_explanation>', output_keys=['theme_label_explanation'])
+        )
+     )
+    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
+    query_embeddings = [embeddings.embed_query(utterance) for utterance in tqdm.tqdm(utterances)]
+    kmeans = KMeans(n_clusters=n_clusters, n_init=1, init='k-means++', random_state=random_state)
+    kmeans.fit(query_embeddings)
+    clusters = kmeans.labels_
+    centroids = kmeans.cluster_centers_
+    clusters_with_preferences = apply_preferences_to_clusters(
+        utterances,
+        query_embeddings,
+        clusters,
+        centroids,
+        linking_preferences['should_link'],
+        linking_preferences['cannot_link']
+    )
+    clustered_utterances = [[] for _ in range(n_clusters)]
+    for i, label in enumerate(clusters_with_preferences):
+        clustered_utterances[label].append(utterances[i])
+    cluster_label_map = {}
+    for i, cluster in tqdm.tqdm(enumerate(clustered_utterances)):
+        outputs_parsed = chain.invoke({'utterances': '\n'.join(cluster)})
+        for utterance in cluster:
+            cluster_label_map[utterance] = outputs_parsed['theme_label']['theme_label']
+    return cluster_label_map
+
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
+        os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Enter your token: ")
+
+    with open(args.dataset_file) as f:
+        dataset = [json.loads(line) for line in f]
+    themed_utterances = set([])
+    for dialogue in dataset:
+        for turn in dialogue['turns']:
+            if turn['theme_label'] is not None:
+                themed_utterances.add(turn['utterance'])
+
+    with open(args.preferences_file) as prefs_in:
+        linking_preferences = json.load(prefs_in)
+    cluster_label_map = main(
+        list(themed_utterances),
+        linking_preferences,
+        args.embedding_model_name,
+        args.llm_name,
+        args.n_clusters,
+        args.random_state
+    )
+    dataset_predicted = copy.deepcopy(dataset)
+    for dialogue in dataset_predicted:
+        for turn in dialogue['turns']:
+            if turn['theme_label'] is not None:
+                turn['theme_label_predicted'] = cluster_label_map[turn['utterance']]
+    with open(args.result_file, 'w') as result_out:
+        for dialogue in dataset_predicted:
+            print(json.dumps(dialogue), file=result_out)
diff --git a/set_paths.sh b/set_paths.sh
@@ -0,0 +1 @@
+export PYTHONPATH=$PYTHONPATH:`pwd`/src/:`pwd`/scripts/
diff --git a/src/dstc12/__init__.py b/src/dstc12/__init__.py
@@ -0,0 +1,2 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
+# SPDX-License-Identifier: CC-BY-NC-4.0
diff --git a/src/dstc12/eval.py b/src/dstc12/eval.py
@@ -0,0 +1,106 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
+# SPDX-License-Identifier: CC-BY-NC-4.0
+
+import tqdm
+import numpy as np
+from coclust.evaluation.external import accuracy
+from sklearn.metrics import normalized_mutual_info_score
+from sklearn.metrics.pairwise import cosine_similarity
+from rouge_score import rouge_scorer
+from rouge_score.scoring import Score
+from langchain_core.runnables import RunnableParallel
+
+from dstc12.utils import DotAllRegexParser
+from dstc12.prompts import STYLEGUIDE_SECTION_1_PROMPT, STYLEGUIDE_SECTION_2_PROMPT, STYLEGUIDE_SECTION_3_PROMPT
+
+
+def acc(references=None, predictions=None):
+    assert references and predictions and len(references) == len(predictions)
+    return accuracy(references, predictions)
+
+
+def nmi(references=None, predictions=None):
+    assert references and predictions and len(references) == len(predictions)
+    return normalized_mutual_info_score(references, predictions)
+
+
+def rouge(references=None, predictions=None, metrics=['rouge1', 'rouge2', 'rouge3'], average=False):
+    assert len(references) == len(predictions)
+    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
+    scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
+    if average:
+        scores_aggregated = {metric: {'precision': 0, 'recall': 0, 'fmeasure': 0} for metric in scores}
+        for score in scores:
+            for metric in score:
+                scores_aggregated[metric]['precision'] += score[metric].precision / len(scores)
+                scores_aggregated[metric]['recall'] += score[metric].recall / len(scores)
+                scores_aggregated[metric]['fmeasure'] += score[metric].fmeasure / len(scores)
+        result = {
+            metric_name: Score(metric['precision'], metric['recall'], metric['fmeasure'])
+            for metric_name, metric in result.items()
+        }
+    else:
+        result = scores
+    return result
+
+
+def rouge_with_multiple_references(references_list, predictions):
+    scores = [rouge(refs_i, predictions, aggregate=True) for refs_i in references_list]
+    scores_averaged = {
+        metric_name: {
+            'precision': 0,
+            'recall': 0,
+            'fmeasure': 0
+        } for metric_name in scores[0]
+    }
+
+    for score_i in scores:
+        for metric_name, score in score_i.items():
+            scores_averaged[metric_name]['precision'] += score.precision / len(scores)
+            scores_averaged[metric_name]['recall'] += score.recall / len(scores)
+            scores_averaged[metric_name]['fmeasure'] += score.fmeasure / len(scores)
+    return scores_averaged
+
+
+def cosine_similarity_with_multiple_references(references_list, predictions):
+    scores = [cosine_similarity(refs_i, predictions) for refs_i in references_list]
+    scores_averaged = sum(scores) / len(scores)
+    return scores_averaged
+
+
+def process_llm_judge_output(output):
+    scores = []
+    for  section in ['section_1', 'section_2', 'section_3']:
+        assert section in output and 'score' in output[section]
+        scores.append(int(output[section]['score']['value'] == 'Good'))
+    return np.mean(scores)
+
+
+def llm_score(predictions, llm):
+    chain = (
+        RunnableParallel(
+            section_1=STYLEGUIDE_SECTION_1_PROMPT
+                | llm
+                | RunnableParallel(
+                    score=DotAllRegexParser(regex=r'<score>\s*(.*?)\s*</score>', output_keys=['value']),
+                    explanation=DotAllRegexParser(regex=r'<explanation>\s*(.*?)\s*</explanation>', output_keys=['value']),
+            ),
+            section_2=STYLEGUIDE_SECTION_2_PROMPT
+                | llm
+                | RunnableParallel(
+                    score=DotAllRegexParser(regex=r'<score>\s*(.*?)\s*</score>', output_keys=['value']),
+                    explanation=DotAllRegexParser(regex=r'<explanation>\s*(.*?)\s*</explanation>', output_keys=['value']),
+            ),
+            section_3=STYLEGUIDE_SECTION_3_PROMPT
+                | llm
+                | RunnableParallel(
+                    score=DotAllRegexParser(regex=r'<score>\s*(.*?)\s*</score>', output_keys=['value']),
+                    explanation=DotAllRegexParser(regex=r'<explanation>\s*(.*?)\s*</explanation>', output_keys=['value']),
+            ),
+        )
+    )
+    scores = []
+    for prediction in tqdm.tqdm(predictions):
+        judge_output = chain.invoke({'theme_label': prediction})
+        scores.append(process_llm_judge_output(judge_output))
+    return np.mean(scores)
diff --git a/src/dstc12/prompts/__init__.py b/src/dstc12/prompts/__init__.py
@@ -0,0 +1,11 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
+# SPDX-License-Identifier: CC-BY-NC-4.0
+
+from .extract_themes import PROMPT as EXTRACT_THEMES_PROMPT
+from .label_utterances import PROMPT as LABEL_UTTERANCES_PROMPT
+from .label_clusters import PROMPT as LABEL_CLUSTERS_PROMPT
+from .styleguide import (
+    SECTION_1_PROMPT as STYLEGUIDE_SECTION_1_PROMPT,
+    SECTION_2_PROMPT as STYLEGUIDE_SECTION_2_PROMPT,
+    SECTION_3_PROMPT as STYLEGUIDE_SECTION_3_PROMPT
+)
diff --git a/src/dstc12/prompts/extract_themes.py b/src/dstc12/prompts/extract_themes.py
@@ -0,0 +1,26 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
+# SPDX-License-Identifier: CC-BY-NC-4.0
+
+from langchain_core.prompts import PromptTemplate
+
+
+# Prompt asking the model to read a batch of utterances and list the distinct
+# themes it identifies. The single template variable is `utterances`; the
+# requested output is a "Unique themes number: n" line followed by one
+# <theme>...</theme> tag per theme.
+# NOTE(review): the opening <task> tag is never closed in this template -
+# confirm whether that is intentional.
+PROMPT = PromptTemplate.from_template(
+'''<task>
+You are an expert call center assistant. You will be given a set of utterances in <utterances> </utterances> tags, each one on a new line. Read through them carefully and cluster them into themes. The themes should be exhaustive and mutually exclusive and should cover the dataset completely.
+Output a full set of themes you identified. One utterance can only belong to one theme.
+
+<guidance>
+Write your output in the following format:
+Unique themes number: n
+<theme>theme label 1</theme>
+<theme>theme label 2</theme>
+...
+<theme>theme label n</theme>
+</guidance>
+
+H:
+<utterances>
+{utterances}
+</utterances>
+'''
+)
diff --git a/src/dstc12/prompts/label_clusters.py b/src/dstc12/prompts/label_clusters.py
@@ -0,0 +1,26 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
+# SPDX-License-Identifier: CC-BY-NC-4.0
+
+from langchain_core.prompts import PromptTemplate
+
+
+# Prompt asking the model for one short theme label (under 5 words) covering
+# all utterances of a cluster. The single template variable is `utterances`;
+# the answer is requested as a <theme_label_explanation> tag followed by a
+# <theme_label> tag.
+PROMPT = PromptTemplate.from_template(
+'''<task>
+You are an expert call center assistant. You will be given a set of utterances in <utterances> </utterances> tags, each one on a new line.
+The utterances are part of callcenter conversations between the customer and the support agent.
+Your task is to generate a short label describing the theme of all the given utterances. The theme label should be under 5 words and describe the desired customer's action in the call.
+
+
+<guidance>
+Output your response in the following way.
+<theme_label_explanation>Your short step-by-step explanation behind the theme</theme_label_explanation>
+<theme_label>your theme label</theme_label>
+</guidance>
+</task>
+
+H:
+<utterances>
+{utterances}
+</utterances>
+'''
+)
diff --git a/src/dstc12/prompts/label_utterances.py b/src/dstc12/prompts/label_utterances.py
@@ -0,0 +1,48 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.  
+# SPDX-License-Identifier: CC-BY-NC-4.0
+
+from langchain_core.prompts import PromptTemplate
+
+
+# Prompt asking the model to assign every utterance the index of a matching
+# theme. Template variables are `utterances` and `themes`; the answer is
+# requested as a comma-separated index list inside <theme_indices>, with index 0
+# meaning "no theme matches" (the embedded example numbers its themes from 1).
+PROMPT = PromptTemplate.from_template(
+'''<task>
+You are an expert call center assistant. You will be given a set of utterances in <utterances> </utterances> tags, each one on a new line.
+You will also receive a set of theme labels in <themes> </themes> tags. Read through them carefully and associate each utterance with the corresponding theme label index.
+
+<example>
+H:
+<utterances>
+I want to cancel my account
+I never received my order
+I want to get some information about your insurance offerings
+</utterances>
+
+<themes>
+1. book a flight
+2. information about insurance
+3. return product
+4. cancel account
+5. request refund
+6. open account
+</themes>
+
+A:
+<theme_indices>4, 0, 2</theme_indices>
+</example>
+
+<guidance>
+Write output in the following format: <theme_indices>comma separated theme indices for every input utterance</theme_indices>
+If no theme matches an utterance, assign it the index 0. If multiple themes match an utterance, assign it the theme you thought of first.
+</guidance>
+</task>
+
+H:
+<utterances>
+{utterances}
+</utterances>
+
+<themes>
+{themes}
+</themes>
+'''
+)
diff --git a/src/dstc12/prompts/styleguide.py b/src/dstc12/prompts/styleguide.py
diff --git a/src/dstc12/utils.py b/src/dstc12/utils.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+export PYTHONPATH=$PYTHONPATH:`pwd`/src/:`pwd`/scripts/
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.`
	`2`	`+# SPDX-License-Identifier: CC-BY-NC-4.0`