Skip to content

Commit 10b40df

Browse files
committed
Add support for ingesting individual images and contextual images in PDFs
1 parent 18ffc72 commit 10b40df

File tree

3 files changed: 213 additions and 35 deletions.

requirements-dev.txt

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
-r requirements.txt
2-
azure-ai-formrecognizer==3.2.1
2+
azure-ai-documentintelligence==1.0.0b2
33
Markdown==3.4.4
44
requests==2.31.0
55
tqdm==4.66.1
@@ -9,6 +9,7 @@ bs4==0.0.1
99
urllib3==2.1.0
1010
pytest==7.4.0
1111
pytest-asyncio==0.23.2
12+
PyMuPDF==1.24.5
1213
azure-storage-blob
1314
chardet
1415
azure-keyvault-secrets

scripts/data_preparation.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
import time
88

99
import requests
10-
from azure.ai.formrecognizer import DocumentAnalysisClient
10+
from azure.ai.documentintelligence import DocumentIntelligenceClient
1111
from azure.core.credentials import AzureKeyCredential
1212
from azure.identity import AzureCliCredential
1313
from azure.search.documents import SearchClient
@@ -209,6 +209,14 @@ def create_or_update_search_index(
209209
"type": "Edm.String",
210210
"searchable": True,
211211
},
212+
{
213+
"name": "image_mapping",
214+
"type": "Edm.String",
215+
"searchable": False,
216+
"sortable": False,
217+
"facetable": False,
218+
"filterable": False
219+
}
212220
],
213221
"suggesters": [],
214222
"scoringProfiles": [],
@@ -343,7 +351,7 @@ def validate_index(service_name, subscription_id, resource_group, index_name):
343351
print(f"Request failed. Please investigate. Status code: {response.status_code}")
344352
break
345353

346-
def create_index(config, credential, form_recognizer_client=None, embedding_model_endpoint=None, use_layout=False, njobs=4):
354+
def create_index(config, credential, form_recognizer_client=None, embedding_model_endpoint=None, use_layout=False, njobs=4, captioning_model_endpoint=None, captioning_model_key=None):
347355
service_name = config["search_service_name"]
348356
subscription_id = config["subscription_id"]
349357
resource_group = config["resource_group"]
@@ -397,7 +405,8 @@ def create_index(config, credential, form_recognizer_client=None, embedding_mode
397405
elif os.path.exists(data_config["path"]):
398406
result = chunk_directory(data_config["path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
399407
azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
400-
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"])
408+
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"],
409+
captioning_model_endpoint=captioning_model_endpoint, captioning_model_key=captioning_model_key)
401410
else:
402411
raise Exception(f"Path {data_config['path']} does not exist and is not a blob URL. Please check the path and try again.")
403412

@@ -430,11 +439,13 @@ def valid_range(n):
430439
parser.add_argument("--config", type=str, help="Path to config file containing settings for data preparation")
431440
parser.add_argument("--form-rec-resource", type=str, help="Name of your Form Recognizer resource to use for PDF cracking.")
432441
parser.add_argument("--form-rec-key", type=str, help="Key for your Form Recognizer resource to use for PDF cracking.")
433-
parser.add_argument("--form-rec-use-layout", default=False, action='store_true', help="Whether to use Layout model for PDF cracking, if False will use Read model.")
442+
parser.add_argument("--form-rec-use-layout", default=True, action='store_true', help="Whether to use Layout model for PDF cracking, if False will use Read model.")
434443
parser.add_argument("--njobs", type=valid_range, default=4, help="Number of jobs to run (between 1 and 32). Default=4")
435444
parser.add_argument("--embedding-model-endpoint", type=str, help="Endpoint for the embedding model to use for vector search. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<Ada deployment name>/embeddings?api-version=2023-03-15-preview'")
436445
parser.add_argument("--embedding-model-key", type=str, help="Key for the embedding model to use for vector search.")
437446
parser.add_argument("--search-admin-key", type=str, help="Admin key for the search service. If not provided, will use Azure CLI to get the key.")
447+
parser.add_argument("--azure-openai-endpoint", type=str, help="Endpoint for the (Azure) OpenAI API. Format: 'https://<AOAI resource name>.openai.azure.com/openai/deployments/<vision model name>/chat/completions?api-version=2024-04-01-preview'")
448+
parser.add_argument("--azure-openai-key", type=str, help="Key for the (Azure) OpenAI API.")
438449
args = parser.parse_args()
439450

440451
with open(args.config) as f:
@@ -451,15 +462,15 @@ def valid_range(n):
451462
os.environ["FORM_RECOGNIZER_ENDPOINT"] = f"https://{args.form_rec_resource}.cognitiveservices.azure.com/"
452463
os.environ["FORM_RECOGNIZER_KEY"] = args.form_rec_key
453464
if args.njobs==1:
454-
form_recognizer_client = DocumentAnalysisClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key))
465+
form_recognizer_client = DocumentIntelligenceClient(endpoint=f"https://{args.form_rec_resource}.cognitiveservices.azure.com/", credential=AzureKeyCredential(args.form_rec_key))
455466
print(f"Using Form Recognizer resource {args.form_rec_resource} for PDF cracking, with the {'Layout' if args.form_rec_use_layout else 'Read'} model.")
456467

457468
for index_config in config:
458469
print("Preparing data for index:", index_config["index_name"])
459470
if index_config.get("vector_config_name") and not args.embedding_model_endpoint:
460471
raise Exception("ERROR: Vector search is enabled in the config, but no embedding model endpoint and key were provided. Please provide these values or disable vector search.")
461472

462-
create_index(index_config, credential, form_recognizer_client, embedding_model_endpoint=args.embedding_model_endpoint, use_layout=args.form_rec_use_layout, njobs=args.njobs)
473+
create_index(index_config, credential, form_recognizer_client, embedding_model_endpoint=args.embedding_model_endpoint, use_layout=args.form_rec_use_layout, njobs=args.njobs, captioning_model_endpoint=args.azure_openai_endpoint, captioning_model_key=args.azure_openai_key)
463474
print("Data preparation for index", index_config["index_name"], "completed")
464475

465476
print(f"Data preparation script completed. {len(config)} indexes updated.")

0 commit comments

Comments
 (0)