Skip to content

Commit 96499c7

Browse files
authored
Merge pull request #14 from jatin-cegis/main
Iteration 4 and Performance Improvements on ADD module
2 parents 349dfd5 + 7c94371 commit 96499c7

38 files changed

+810
-1142
lines changed

api/__init__.py

Whitespace-only changes.

api/main.py

Lines changed: 66 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,12 @@
4747
from api.database import get_db, UploadedFile
4848
from api.database import engine, Base
4949
import chardet
50+
import logging
51+
import csv
52+
from io import StringIO
5053

5154
# ASGI application object; the route decorators in this module register against it.
app = FastAPI()
55+
# Module-level logger named after this module (logging.getLogger(__name__)).
logger = logging.getLogger(__name__)
5256

5357
# Global variable to store the last processed data
# Both slots start as None; the code that fills them is not visible in this
# excerpt. NOTE(review): module-level mutable state is shared by every request
# served by this process - confirm that is intended.
5458
last_processed_data = {"unique_rows": None, "duplicate_rows": None}
@@ -69,45 +73,82 @@ async def startup_event():
6973
async def health():
7074
return {"status": "ok"}
7175

76+
def detect_delimiter(sample_text: str) -> str:
    """Detect whether comma or semicolon is the most likely delimiter.

    Only characters outside double-quoted fields are counted, so a quoted
    value such as ``"Doe, John"`` in a semicolon-separated file does not
    bias the result towards the comma.

    Args:
        sample_text: A leading slice of the decoded file contents.

    Returns:
        ``','`` or ``';'``. Ties (including an empty sample) resolve to
        ``','``.
    """
    # Splitting on the quote character alternates between text outside
    # quotes (even indexes) and text inside quotes (odd indexes); a doubled
    # quote ("") inside a field yields an empty even segment, so it is
    # handled correctly as well.
    unquoted = "".join(sample_text.split('"')[::2])
    comma_count = unquoted.count(',')
    semicolon_count = unquoted.count(';')
    return ',' if comma_count >= semicolon_count else ';'
7281
@app.post("/upload_file")
async def upload_file(
    file: UploadFile = File(...),
    category: str = Form(...),
    db: Session = Depends(get_db)
):
    """Validate an uploaded CSV file and store it in the database.

    The upload's encoding is detected with chardet and must be one of the
    supported encodings. If pandas parses the file (comma-separated) into a
    single column, the column is split on the delimiter chosen by
    ``detect_delimiter`` and the first row is used as the header. Files that
    already parse into several columns are stored unchanged.

    Args:
        file: The uploaded file.
        category: Category under which the file is stored; together with
            the filename it identifies an existing upload.
        db: Database session.

    Returns:
        ``{"message": ..., "id": ...}`` on success, or a ``JSONResponse``
        with status 400 (unsupported encoding), 409 (same filename and
        category already stored) or 500 (any other failure).
    """
    contents = await file.read()
    detection = chardet.detect(contents)
    encoding = detection["encoding"]
    encoding = encoding.lower() if encoding else None
    logger.info(f"Detected file encoding: '{encoding}'")
    try:
        if encoding not in ['utf-8', 'utf-8-sig', 'iso-8859-1', 'windows-1252', 'ascii']:
            logger.warning(f"Unsupported file encoding: {file.filename} (Detected: {encoding})")
            return JSONResponse(
                status_code=400,
                content={"message": f"Unsupported file encoding: {encoding}"},
            )

        # Decode once and reuse the text for parsing, sniffing and storage.
        text_data = contents.decode(encoding)
        # dtype=str keeps every cell as text so the .str accessor below
        # works even when the single column holds only numbers.
        df_raw = pd.read_csv(io.StringIO(text_data), header=None, dtype=str)

        # If only one column exists, try splitting it
        if df_raw.shape[1] == 1:
            logger.warning(f"Only one column detected in {file.filename}. Attempting to split.")

            sample = text_data[:1000]  # Use first 1000 characters for detection
            detected_delim = detect_delimiter(sample)
            logger.warning(f"Detected delimiter: '{detected_delim}'")
            # Split the single column into columns
            split_df = df_raw[0].str.split(detected_delim, expand=True)
            # Use first row as header
            split_df.columns = split_df.iloc[0].astype(str)
            # Drop the header row from data
            df = split_df.iloc[1:].reset_index(drop=True)

            # Convert DataFrame back to CSV
            processed_csv = df.to_csv(index=False)
        else:
            # The file already parses into several columns. Previously this
            # path left the CSV text undefined and every such upload failed
            # with a NameError (reported as a 500); store the text as-is.
            processed_csv = text_data

        # Check if a file with the same name and category already exists
        existing_file = (
            db.query(UploadedFile)
            .filter(UploadedFile.filename == file.filename, UploadedFile.category == category)
            .first()
        )

        if existing_file:
            return JSONResponse(
                status_code=409,  # Conflict
                content={
                    "message": f"'{file.filename}' already exists in category '{category}'.",
                    "id": existing_file.id,
                },
            )

        db_file = UploadedFile(filename=file.filename, content=processed_csv.encode(encoding), category=category)
        db.add(db_file)
        db.commit()
        db.refresh(db_file)

    except Exception as e:
        db.rollback()
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Error saving file: {file.filename}. Error: {e}")
        # NOTE(review): "details" exposes the raw exception text to the
        # client; consider dropping it outside of development.
        return JSONResponse(
            status_code=500,
            content={"message": "Internal server error", "details": str(e)},
        )

    logger.info(f"File uploaded successfully: {file.filename}")
    return {"message": "File uploaded successfully", "id": db_file.id}
112153

113154

0 commit comments

Comments
 (0)