Skip to content

Commit 65eaf60

Browse files
authored
Merge pull request #126 from macbre/high-offset-selects
Avoid high offsets in queries
2 parents fbc4c37 + 852fb20 commit 65eaf60

File tree

8 files changed

+102
-5
lines changed

8 files changed

+102
-5
lines changed

.travis.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ matrix:
2323
- python: 3.6
2424
env: DOCKER_IMAGE=mysql:5.6
2525
- python: 3.6
26-
env: DOCKER_IMAGE=mysql:8.0
26+
env: DOCKER_IMAGE=mysql:8.0.3
2727
# @see https://hub.docker.com/_/mariadb/
2828
- python: 3.6
2929
env: DOCKER_IMAGE=mariadb:10.0 # used by Wikipedia
@@ -35,7 +35,7 @@ before_script:
3535
# @see https://hub.docker.com/_/mysql/
3636
- sudo service mysql stop # stop any running MySQL server
3737
- sudo docker run -e MYSQL_ALLOW_EMPTY_PASSWORD=yes -d -p 3306:3306 $DOCKER_IMAGE
38-
- sleep 15 # wait for MySQL server to be fully set up
38+
- ./wait_for_mysql.sh
3939
- sudo docker ps
4040
# set up a database
4141
- mysql --protocol=tcp -u root -e "CREATE DATABASE index_digest; CREATE USER 'index_digest'@'%' IDENTIFIED BY 'qwerty'; GRANT ALL ON index_digest.* TO 'index_digest'@'%';"

README.md

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ Analyses your database queries and schema and suggests indices improvements. You
1313
* reports text columns with character set different than `utf`
1414
* reports queries that do not use indices
1515
* reports queries that use filesort, temporary file or full table scan
16-
* reports queries that are not quite kosher (e.g. `LIKE "%foo%"`, `INSERT IGNORE`, `SELECT *`, `HAVING` clause)
16+
* reports queries that are not quite kosher (e.g. `LIKE "%foo%"`, `INSERT IGNORE`, `SELECT *`, `HAVING` clause, high `OFFSET` in pagination queries)
1717
* if run with `--analyze-data` switch it:
1818
* reports tables with old data (by querying for `MIN()` value of time column) where data retency can be reviewed
1919
* reports tables with not up-to-date data (by querying for `MAX()` value of time column)
@@ -314,6 +314,15 @@ data_not_updated_recently → table affected: 0028_data_not_updated_recently
314314
- rows: 3
315315
- table_size_mb: 0.015625
316316
317+
------------------------------------------------------------
318+
high_offset_selects → table affected: page
319+
320+
✗ "SELECT /* CategoryPaginationViewer::processSection..." query uses too high offset impacting the performance
321+
322+
- query: SELECT /* CategoryPaginationViewer::processSection */ page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' ORDER BY cl_sortkey LIMIT 927600,200
323+
- limit: 200
324+
- offset: 927600
325+
317326
------------------------------------------------------------
318327
Queries performed: 100
319328
```
@@ -358,7 +367,7 @@ Outputs YML file with results and metadata.
358367

359368
You can select which checks should be reported by the tool by using `--checks` command line option. Certain checks can also be skipped via `--skip-checks` option. Refer to `index_digest --help` for examples.
360369

361-
> **Number of checks**: 20
370+
> **Number of checks**: 21
362371
363372
* `redundant_indices`: reports indices that are redundant and covered by other
364373
* `non_utf_columns`: reports text columns that have characters encoding set to `latin1` (utf is the way to go)
@@ -383,6 +392,7 @@ You can select which checks should be reported by the tool by using `--checks` c
383392
* `insert_ignore`: reports [queries using `INSERT IGNORE`](https://medium.com/legacy-systems-diary/things-to-avoid-episode-1-insert-ignore-535b4c24406b)
384393
* `select_star`: reports [queries using `SELECT *`](https://github.com/jarulraj/sqlcheck/blob/master/docs/query/3001.md)
385394
* `having_clause`: reports [queries using `HAVING` clause](https://github.com/jarulraj/sqlcheck/blob/master/docs/query/3012.md)
395+
* `high_offset_selects`: report [SELECT queries using high OFFSET](https://www.percona.com/blog/2008/09/24/four-ways-to-optimize-paginated-displays/)
386396

387397
### Additional checks performed on tables data
388398

indexdigest/cli/script.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@
6161
check_having_clause, \
6262
check_data_too_old, \
6363
check_data_not_updated_recently, \
64-
check_generic_primary_key
64+
check_generic_primary_key, \
65+
check_high_offset_selects
6566

6667

6768
def get_reports(database, sql_log=None, analyze_data=False):
@@ -110,6 +111,7 @@ def get_reports(database, sql_log=None, analyze_data=False):
110111
check_insert_ignore_queries(database, queries=queries),
111112
check_select_star(database, queries=queries),
112113
check_having_clause(database, queries=queries),
114+
check_high_offset_selects(database, queries=queries),
113115
)
114116

115117
# checks that require --analyze-data switch to be on (see #28)

indexdigest/linters/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@
2121
from .linter_0092_select_star import check_select_star
2222
from .linter_0093_having_clause import check_having_clause
2323
from .linter_0094_generic_primary_key import check_generic_primary_key
24+
from .linter_0118_high_offset_selects import check_high_offset_selects
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""
2+
This linter checks for too high offset SELECT queries
3+
"""
4+
from collections import OrderedDict
5+
6+
from sql_metadata import get_query_limit_and_offset, get_query_tables
7+
8+
from indexdigest.utils import LinterEntry, shorten_query
9+
10+
11+
OFFSET_THRESHOLD = 1000
12+
13+
14+
def check_high_offset_selects(_, queries):
15+
"""
16+
:type _ indexdigest.database.Database
17+
:type queries list[str]
18+
:rtype: list[LinterEntry]
19+
"""
20+
for query in queries:
21+
res = get_query_limit_and_offset(query)
22+
23+
if res is None:
24+
continue
25+
26+
(limit, offset) = res
27+
28+
if offset < OFFSET_THRESHOLD:
29+
continue
30+
31+
table_name = get_query_tables(query)[0]
32+
33+
context = OrderedDict()
34+
context['query'] = query
35+
context['limit'] = limit
36+
context['offset'] = offset
37+
38+
yield LinterEntry(linter_type='high_offset_selects', table_name=table_name,
39+
message='"{}" query uses too high offset impacting the performance'.
40+
format(shorten_query(query)),
41+
context=context)
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from __future__ import print_function
2+
3+
from unittest import TestCase
4+
5+
from indexdigest.linters.linter_0118_high_offset_selects import check_high_offset_selects
6+
from indexdigest.test import DatabaseTestMixin, read_queries_from_log
7+
8+
9+
class TestLinter(TestCase, DatabaseTestMixin):
10+
11+
def test_high_offset_selects(self):
12+
reports = list(check_high_offset_selects(
13+
self.connection, queries=read_queries_from_log('0118-high-offset-selects-log')))
14+
15+
print(reports, reports[0].context)
16+
17+
self.assertEqual(len(reports), 1)
18+
19+
self.assertEqual(str(reports[0]), 'page: "SELECT /* CategoryPaginationViewer::processSection..." query uses too high offset impacting the performance')
20+
self.assertEqual(reports[0].table_name, 'page')
21+
self.assertEqual(str(reports[0].context['query']), "SELECT /* CategoryPaginationViewer::processSection */ page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' ORDER BY cl_sortkey LIMIT 927600,200")
22+
self.assertEqual(reports[0].context['limit'], 200)
23+
self.assertEqual(reports[0].context['offset'], 927600)
24+
25+
# assert False

sql/0118-high-offset-selects-log

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- no offset queries
2+
SELECT foo_limit FROM bar_offset
3+
-- not high enough query
4+
SELECT foo_limit FROM bar_offset LIMIT 50 OFFSET 100
5+
select * from 0020_big_table order by id limit 50, 5;
6+
-- offset queries
7+
SELECT /* CategoryPaginationViewer::processSection */ page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' ORDER BY cl_sortkey LIMIT 927600,200

wait_for_mysql.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/bin/bash
2+
# wait for MySQL server to be fully set up
3+
echo -n 'Waiting for MySQL server '
4+
5+
until mysql --silent --protocol=tcp -uroot -e 'select 1' 1>/dev/null 2>/dev/null
6+
do
7+
echo -n '.'
8+
sleep 1
9+
done
10+
11+
echo ' done'

0 commit comments

Comments
 (0)