Skip to content

Added subquery logic (Not fully) #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions src/dataneuron/core/nlp_helpers/cte_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ def extract_main_query(parsed):
def filter_cte(cte_part, filter_function, client_id):
filtered_ctes = []

is_recursive = False

for token in cte_part.tokens:
if token.ttype is Keyword and token.value.upper() == 'RECURSIVE':
is_recursive = True

def process_cte(token):
if isinstance(token, sqlparse.sql.Identifier):
cte_name = token.get_name()
Expand All @@ -57,7 +63,7 @@ def process_cte(token):
# Remove outer parentheses
inner_query_str = str(inner_query)[1:-1]
filtered_inner_query = filter_function(
sqlparse.parse(inner_query_str)[0], client_id)
sqlparse.parse(inner_query_str)[0], client_id, cte_name)
filtered_ctes.append(f"{cte_name} AS ({filtered_inner_query})")

for token in cte_part.tokens:
Expand All @@ -68,7 +74,10 @@ def process_cte(token):
process_cte(token)

if filtered_ctes:
filtered_cte_str = "WITH " + ",\n".join(filtered_ctes)
if is_recursive:
filtered_cte_str = "WITH RECURSIVE " + ",\n".join(filtered_ctes)
else:
filtered_cte_str = "WITH " + ",\n".join(filtered_ctes)
else:
filtered_cte_str = ""
return filtered_cte_str
Expand Down
148 changes: 148 additions & 0 deletions src/dataneuron/core/nlp_helpers/is_subquery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import re
from sqlparse.sql import Token
from sqlparse.tokens import DML, Keyword, Whitespace, Newline
from query_cleanup import _cleanup_whitespace

# A parenthesised group that mentions SELECT somewhere after its opening
# bracket. Deliberately loose; only applied to text that was first cut into
# top-level pieces.
_PAREN_SELECT = re.compile(r'\(.*?\bSELECT\b.*?\)', re.DOTALL | re.IGNORECASE)
# A bracket that directly opens a SELECT. Stricter, for text where an
# unrelated bracket could precede a top-level SELECT keyword.
_SUBQUERY_START = re.compile(r'\(\s*SELECT\b[\s\S]*?\)', re.IGNORECASE)
_PAREN_GROUP = re.compile(r'\([\s\S]*?\)')
_SET_OPERATION = re.compile(r'\b(?:UNION|INTERSECT|EXCEPT)\b', re.IGNORECASE)
_CASE_KEYWORD = re.compile(r'\bCASE\b', re.IGNORECASE)
_AND_KEYWORD = re.compile(r'\bAND\b', re.IGNORECASE)
_COMMA = re.compile(r',')
_WHERE_PREFIX = re.compile(r'^\s*WHERE\b', re.IGNORECASE)
# "NOT IN" is listed before "IN" so the alternation order is deterministic.
_WHERE_KEYWORD_SUBQUERY = re.compile(
    r'\b(NOT\s+IN|IN|EXISTS|ALL|ANY)\b\s*\(.*?\bSELECT\b.*?\)',
    re.DOTALL | re.IGNORECASE)


def _keyword_key(text):
    """Upper-case *text* and collapse internal whitespace ('group  by' -> 'GROUP BY')."""
    return ' '.join(text.upper().split())


def _split_outside_parens(text, separator):
    """Split *text* at matches of the compiled *separator* that lie outside parentheses.

    Quoting is not interpreted, so a bracket inside a string literal still
    counts towards the nesting depth.
    """
    depth = 0
    depth_at = []
    for char in text:
        if char == ')' and depth:
            depth -= 1
        depth_at.append(depth)
        if char == '(':
            depth += 1

    pieces = []
    start = 0
    for match in separator.finditer(text):
        if depth_at[match.start()] == 0:
            pieces.append(text[start:match.start()])
            start = match.end()
    pieces.append(text[start:])
    return pieces


def _contains_subquery(parsed):
    """Report whether a parsed SQL statement appears to contain a subquery.

    The top-level tokens are cut into four regions -- the select list, the
    FROM/JOIN sources, the WHERE clause and the trailing GROUP BY / HAVING /
    ORDER BY part -- and each region is searched for a parenthesised SELECT
    (or, for FROM/JOIN sources, a parenthesised set operation).

    Args:
        parsed: Object exposing a ``tokens`` sequence; anything without that
            attribute is treated as a single token. Presumably a sqlparse
            statement, since the token types tested here come from sqlparse.

    Returns:
        bool: True if at least one region contains a subquery, else False.

    Note:
        When several top-level SELECT keywords are present (e.g. a set
        operation), the region boundaries of the last one are used.
    """
    tokens = parsed.tokens if hasattr(parsed, 'tokens') else [parsed]
    end_keywords = {'GROUP BY', 'HAVING', 'ORDER BY'}
    joins = {'JOIN', 'LEFT JOIN', 'RIGHT JOIN', 'INNER JOIN'}
    blank_types = (Whitespace, Newline)

    select_index = None
    from_index = None
    where_index = None
    end_index = None
    findings = []

    # Locate the SELECT / FROM / WHERE / end-keyword boundaries.
    for position, token in enumerate(tokens):
        if not (isinstance(token, Token) and token.ttype is DML
                and token.value.upper() == 'SELECT'):
            continue

        select_index = position
        k = position + 1
        while k < len(tokens) and not (
                isinstance(tokens[k], Token)
                and tokens[k].ttype == Keyword
                and tokens[k].value.upper() == 'FROM'):
            k += 1
        from_index = k

        for k in range(from_index + 1, len(tokens)):
            if not isinstance(tokens[k], Token):
                continue
            text = str(tokens[k])
            if _WHERE_PREFIX.match(text):
                # Only a token that starts with WHERE is the clause itself;
                # a WHERE buried inside a FROM/JOIN subquery must not count.
                where_index = k
            elif _keyword_key(text) in end_keywords:
                end_index = k
                break

    if select_index is None:
        # No top-level SELECT (e.g. UPDATE ... SET x = (SELECT ...)): there
        # are no regions to cut, so scan the whole statement text instead.
        statement_text = ''.join(str(token) for token in tokens)
        return bool(_SUBQUERY_START.search(statement_text))

    where_end = end_index if end_index is not None else len(tokens)
    boundaries = [index for index in (where_index, end_index) if index is not None]
    from_end = min(boundaries) if boundaries else len(tokens)

    # --- select list ------------------------------------------------------
    select_text = ' '.join(
        _cleanup_whitespace(str(tokens[j]))
        for j in range(select_index + 1, from_index)
    ).strip()
    # Split on top-level commas only, so a subquery or function call that
    # itself contains commas stays in one piece.
    for element in _split_outside_parens(select_text, _COMMA):
        element = element.replace('\n', ' ').strip()
        if not _PAREN_SELECT.search(element):
            continue
        if _CASE_KEYWORD.search(element):
            findings.append("Subquery exists inside CASE WHEN THEN ELSE block")
        else:
            findings.append("Inline Subquery exists inside SELECT block")

    # --- FROM / JOIN sources ----------------------------------------------
    join_seen = False
    for j in range(from_index + 1, from_end):
        element = tokens[j]
        if not isinstance(element, Token) or element.ttype in blank_types:
            continue
        if element.ttype == Keyword and _keyword_key(element.value) in joins:
            join_seen = True
            continue

        text = str(element)
        if _PAREN_GROUP.search(text) and _SET_OPERATION.search(text):
            findings.append(
                "Set operation - Subquery inside JOIN" if join_seen
                else "Contains set operation - Subquery found inside FROM block")
        elif _SUBQUERY_START.search(text):
            findings.append(
                "Inline subquery inside JOIN" if join_seen
                else "Inline subquery inside FROM block")

    # --- WHERE clause -----------------------------------------------------
    if where_index is not None:
        for j in range(where_index, where_end):
            # Remove only the leading WHERE keyword, not the letters of
            # whatever operand follows it.
            where_text = _cleanup_whitespace(_WHERE_PREFIX.sub('', str(tokens[j])))
            for clause in _split_outside_parens(where_text, _AND_KEYWORD):
                clause = clause.strip()
                keyword_match = _WHERE_KEYWORD_SUBQUERY.search(clause)
                if keyword_match:
                    found_keyword = _keyword_key(keyword_match.group(1))
                    findings.append(
                        f"Subquery with special keyword found in WHERE block: {found_keyword} \n")
                elif _PAREN_SELECT.search(clause):
                    findings.append("Inline subquery found in WHERE block \n")

    # --- GROUP BY / HAVING / ORDER BY tail --------------------------------
    if end_index is not None:
        tail_text = ' '.join(
            _cleanup_whitespace(str(tokens[j]))
            for j in range(end_index, len(tokens))
            if isinstance(tokens[j], Token) and tokens[j].ttype not in blank_types
        )
        # Strict pattern: the tail can hold unrelated brackets followed by a
        # top-level SELECT (e.g. "GROUP BY f(a) UNION SELECT ...").
        if _SUBQUERY_START.search(tail_text):
            findings.append("Subquery in END keywords")

    return bool(findings)
17 changes: 17 additions & 0 deletions src/dataneuron/core/nlp_helpers/query_cleanup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import re

def _cleanup_whitespace(query: str) -> str:
    """Tidy the spacing of *query* one line at a time.

    For every line: surrounding whitespace is removed, each run of whitespace
    becomes a single space, and every comma is rewritten as a comma followed
    by one space. Text inside single-quoted literals is left untouched, and
    the line breaks of the input are kept.
    """
    # Both patterns end with a lookahead that only succeeds when an even
    # number of single quotes remains before the end of the line, i.e. when
    # the match is not inside a quoted literal.
    space_run = r'\s+(?=(?:[^\']*\'[^\']*\')*[^\']*$)'
    comma_gap = r'\s*,\s*(?=(?:[^\']*\'[^\']*\')*[^\']*$)'

    def tidy(raw_line):
        collapsed = re.sub(space_run, ' ', raw_line.strip())
        return re.sub(comma_gap, ', ', collapsed)

    return '\n'.join([tidy(raw_line) for raw_line in query.split('\n')])
Loading