Skip to content

Commit de8f562

Browse files
authored
Index and search DataTable with Lucene (#902)
* Refactor the geocoder package * Add consumer, mapper and query for datatable abstraction * Add abstractions to create index from data tables
1 parent f2fb634 commit de8f562

File tree

24 files changed

+558
-71
lines changed

24 files changed

+558
-71
lines changed

baremaps-cli/src/main/java/org/apache/baremaps/cli/geocoder/Search.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
import java.nio.file.Path;
2323
import java.util.concurrent.Callable;
24-
import org.apache.baremaps.geocoder.GeonamesQueryBuilder;
24+
import org.apache.baremaps.geocoder.geonames.GeonamesQueryBuilder;
2525
import org.apache.lucene.search.*;
2626
import org.apache.lucene.store.FSDirectory;
2727
import org.slf4j.Logger;
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to you under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.baremaps.geocoder;
19+
20+
import java.util.function.Consumer;
21+
import org.apache.baremaps.data.storage.DataRow;
22+
import org.apache.lucene.index.IndexWriter;
23+
import org.slf4j.Logger;
24+
import org.slf4j.LoggerFactory;
25+
26+
public class DataRowConsumer implements Consumer<DataRow> {
27+
28+
private static final Logger logger = LoggerFactory.getLogger(DataRowConsumer.class);
29+
30+
private final IndexWriter indexWriter;
31+
32+
private final DataRowMapper dataRowMapper = new DataRowMapper();
33+
34+
public DataRowConsumer(IndexWriter indexWriter) {
35+
this.indexWriter = indexWriter;
36+
}
37+
38+
@Override
39+
public void accept(DataRow row) {
40+
try {
41+
var document = dataRowMapper.apply(row);
42+
indexWriter.addDocument(document);
43+
} catch (Exception e) {
44+
logger.warn("The following row ({}) is not processed due to {}", row, e);
45+
}
46+
}
47+
}
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to you under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.baremaps.geocoder;
19+
20+
import java.net.InetAddress;
21+
import java.time.LocalDate;
22+
import java.time.LocalDateTime;
23+
import java.time.LocalTime;
24+
import java.util.ArrayList;
25+
import java.util.Arrays;
26+
import java.util.List;
27+
import java.util.Map;
28+
import java.util.function.Function;
29+
import org.apache.baremaps.data.storage.DataColumn;
30+
import org.apache.baremaps.data.storage.DataRow;
31+
import org.apache.baremaps.data.storage.DataSchema;
32+
import org.apache.lucene.document.*;
33+
import org.locationtech.jts.geom.*;
34+
import org.slf4j.Logger;
35+
import org.slf4j.LoggerFactory;
36+
37+
public class DataRowMapper implements Function<DataRow, Document> {
38+
39+
private static final Logger logger = LoggerFactory.getLogger(DataRowMapper.class);
40+
41+
@Override
42+
public Document apply(DataRow dataRow) {
43+
Document doc = new Document();
44+
DataSchema schema = dataRow.schema();
45+
List<DataColumn> columns = schema.columns();
46+
for (int i = 0; i < columns.size(); i++) {
47+
Object value = dataRow.get(i);
48+
if (value == null) {
49+
continue;
50+
}
51+
52+
DataColumn column = columns.get(i);
53+
applyValue(column, doc, value);
54+
}
55+
return doc;
56+
}
57+
58+
@SuppressWarnings("squid:S6541")
59+
private void applyValue(DataColumn column, Document doc, Object value) {
60+
String columnName = column.name();
61+
DataColumn.Type type = column.type();
62+
try {
63+
switch (type) {
64+
case BINARY:
65+
doc.add(new StoredField(columnName, (byte[]) value));
66+
break;
67+
case BYTE:
68+
doc.add(new IntPoint(columnName, ((Byte) value).intValue()));
69+
doc.add(new StoredField(columnName, ((Byte) value).intValue()));
70+
break;
71+
case BOOLEAN:
72+
doc.add(new StringField(columnName, value.toString(), Field.Store.YES));
73+
break;
74+
case SHORT:
75+
doc.add(new IntPoint(columnName, ((Short) value).intValue()));
76+
doc.add(new StoredField(columnName, ((Short) value).intValue()));
77+
break;
78+
case INTEGER:
79+
doc.add(new IntPoint(columnName, (Integer) value));
80+
doc.add(new StoredField(columnName, (Integer) value));
81+
break;
82+
case LONG:
83+
doc.add(new LongPoint(columnName, (Long) value));
84+
doc.add(new StoredField(columnName, (Long) value));
85+
break;
86+
case FLOAT:
87+
doc.add(new FloatPoint(columnName, (Float) value));
88+
doc.add(new StoredField(columnName, (Float) value));
89+
break;
90+
case DOUBLE:
91+
doc.add(new DoublePoint(columnName, (Double) value));
92+
doc.add(new StoredField(columnName, (Double) value));
93+
break;
94+
case STRING:
95+
doc.add(new TextField(columnName, (String) value, Field.Store.YES));
96+
break;
97+
case COORDINATE:
98+
Coordinate coord = (Coordinate) value;
99+
double lat = coord.getY();
100+
double lon = coord.getX();
101+
doc.add(new LatLonPoint(columnName, lat, lon));
102+
doc.add(new StoredField(columnName + "_lat", lat));
103+
doc.add(new StoredField(columnName + "_lon", lon));
104+
break;
105+
case POINT:
106+
Point point = (Point) value;
107+
double pointLat = point.getY();
108+
double pointLon = point.getX();
109+
doc.add(new LatLonPoint(columnName, pointLat, pointLon));
110+
doc.add(new StoredField(columnName + "_lat", pointLat));
111+
doc.add(new StoredField(columnName + "_lon", pointLon));
112+
break;
113+
case LINESTRING, POLYGON, MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, GEOMETRYCOLLECTION, GEOMETRY:
114+
Geometry geometry = (Geometry) value;
115+
if (geometry != null) {
116+
Field[] shapeFields = createShapeFields(columnName, geometry);
117+
for (Field field : shapeFields) {
118+
doc.add(field);
119+
}
120+
doc.add(new StoredField(columnName + "_wkt", geometry.toText()));
121+
}
122+
break;
123+
case ENVELOPE:
124+
Envelope envelope = (Envelope) value;
125+
String envelopeStr = envelope.toString();
126+
doc.add(new StringField(columnName, envelopeStr, Field.Store.YES));
127+
break;
128+
case INET_ADDRESS, INET4_ADDRESS, INET6_ADDRESS:
129+
InetAddress addr = (InetAddress) value;
130+
doc.add(new StringField(columnName, addr.getHostAddress(), Field.Store.YES));
131+
break;
132+
case LOCAL_DATE:
133+
LocalDate date = (LocalDate) value;
134+
doc.add(new StringField(columnName, date.toString(), Field.Store.YES));
135+
break;
136+
case LOCAL_TIME:
137+
LocalTime time = (LocalTime) value;
138+
doc.add(new StringField(columnName, time.toString(), Field.Store.YES));
139+
break;
140+
case LOCAL_DATE_TIME:
141+
LocalDateTime dateTime = (LocalDateTime) value;
142+
doc.add(new StringField(columnName, dateTime.toString(), Field.Store.YES));
143+
break;
144+
case NESTED:
145+
Map<String, Object> map = (Map<String, Object>) value;
146+
for (Map.Entry<String, Object> entry : map.entrySet()) {
147+
String nestedKey = columnName + "." + entry.getKey();
148+
Object nestedValue = entry.getValue();
149+
if (nestedValue != null) {
150+
doc.add(new TextField(nestedKey, nestedValue.toString(), Field.Store.YES));
151+
}
152+
}
153+
break;
154+
default:
155+
doc.add(new StringField(columnName, value.toString(), Field.Store.YES));
156+
break;
157+
}
158+
} catch (Exception e) {
159+
logger.error("Error processing column '{}' with value '{}': {}", columnName, value,
160+
e.getMessage());
161+
}
162+
}
163+
164+
private Field[] createShapeFields(String fieldName, Geometry geometry) {
165+
if (geometry instanceof Point point) {
166+
double lat = point.getY();
167+
double lon = point.getX();
168+
return new Field[] {new LatLonPoint(fieldName, lat, lon)};
169+
} else if (geometry instanceof LineString lineString) {
170+
return LatLonShape.createIndexableFields(fieldName, convertToLuceneLine(lineString));
171+
} else if (geometry instanceof Polygon polygon) {
172+
org.apache.lucene.geo.Polygon lucenePolygon = convertToLucenePolygon(polygon);
173+
return LatLonShape.createIndexableFields(fieldName, lucenePolygon);
174+
} else if (geometry instanceof MultiPolygon multiPolygon) {
175+
return createFieldsFromMultiPolygon(fieldName, multiPolygon);
176+
} else if (geometry instanceof GeometryCollection collection) {
177+
List<Field> fieldList = new ArrayList<>();
178+
for (int i = 0; i < collection.getNumGeometries(); i++) {
179+
Geometry geom = collection.getGeometryN(i);
180+
Field[] fields = createShapeFields(fieldName, geom);
181+
fieldList.addAll(Arrays.asList(fields));
182+
}
183+
return fieldList.toArray(new Field[0]);
184+
} else {
185+
logger.warn("Unsupported geometry type '{}' for field '{}'", geometry.getGeometryType(),
186+
fieldName);
187+
return new Field[0];
188+
}
189+
}
190+
191+
private org.apache.lucene.geo.Line convertToLuceneLine(LineString lineString) {
192+
Coordinate[] coords = lineString.getCoordinates();
193+
double[] lats = new double[coords.length];
194+
double[] lons = new double[coords.length];
195+
for (int i = 0; i < coords.length; i++) {
196+
lats[i] = coords[i].getY();
197+
lons[i] = coords[i].getX();
198+
}
199+
return new org.apache.lucene.geo.Line(lats, lons);
200+
}
201+
202+
private org.apache.lucene.geo.Polygon convertToLucenePolygon(
203+
org.locationtech.jts.geom.Polygon jtsPolygon) {
204+
LinearRing shell = jtsPolygon.getExteriorRing();
205+
Coordinate[] shellCoords = shell.getCoordinates();
206+
double[] lats = new double[shellCoords.length];
207+
double[] lons = new double[shellCoords.length];
208+
for (int i = 0; i < shellCoords.length; i++) {
209+
lats[i] = shellCoords[i].getY();
210+
lons[i] = shellCoords[i].getX();
211+
}
212+
213+
int numHoles = jtsPolygon.getNumInteriorRing();
214+
org.apache.lucene.geo.Polygon[] holes = new org.apache.lucene.geo.Polygon[numHoles];
215+
for (int i = 0; i < numHoles; i++) {
216+
LinearRing hole = jtsPolygon.getInteriorRingN(i);
217+
Coordinate[] holeCoords = hole.getCoordinates();
218+
double[] holeLats = new double[holeCoords.length];
219+
double[] holeLons = new double[holeCoords.length];
220+
for (int j = 0; j < holeCoords.length; j++) {
221+
holeLats[j] = holeCoords[j].getY();
222+
holeLons[j] = holeCoords[j].getX();
223+
}
224+
holes[i] = new org.apache.lucene.geo.Polygon(holeLats, holeLons);
225+
}
226+
227+
return new org.apache.lucene.geo.Polygon(lats, lons, holes);
228+
}
229+
230+
private Field[] createFieldsFromMultiPolygon(String fieldName, MultiPolygon multiPolygon) {
231+
List<Field> fieldList = new ArrayList<>();
232+
for (int i = 0; i < multiPolygon.getNumGeometries(); i++) {
233+
org.locationtech.jts.geom.Polygon polygon =
234+
(org.locationtech.jts.geom.Polygon) multiPolygon.getGeometryN(i);
235+
org.apache.lucene.geo.Polygon lucenePolygon = convertToLucenePolygon(polygon);
236+
Field[] fields = LatLonShape.createIndexableFields(fieldName, lucenePolygon);
237+
fieldList.addAll(Arrays.asList(fields));
238+
}
239+
return fieldList.toArray(new Field[0]);
240+
}
241+
}

0 commit comments

Comments
 (0)