Skip to content

Commit 63e3594

Browse files
committed
Fix boost compatibility
1 parent ce48741 commit 63e3594

4 files changed

Lines changed: 96 additions & 34 deletions

File tree

extensions/indexes/lucene/pom.xml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@
211211
<exclude>src/main/java/org/exist/indexing/lucene/LuceneIndexConfig.java</exclude>
212212
<include>src/main/java/org/exist/indexing/lucene/ExistLuceneTextField.java</include>
213213
<exclude>src/main/java/org/exist/xquery/modules/lucene/QueryOptions.java</exclude>
214+
<exclude>src/main/java/org/exist/indexing/lucene/BoostField.java</exclude>
214215
<exclude>src/test/java/org/exist/indexing/lucene/AnalyzerConfigTest.java</exclude>
215216
<exclude>src/test/java/org/exist/indexing/lucene/analyzers/NoDiacriticsStandardAnalyzerTest.java</exclude>
216217
<include>src/test/java/org/exist/indexing/lucene/analyzers/AdHocAnalysersTest.java</include>
@@ -272,6 +273,7 @@
272273
<includes>
273274
<include>src/main/java/org/exist/indexing/lucene/ExistFacetsCollector.java</include>
274275
<include>src/main/java/org/exist/indexing/lucene/ExistLuceneTextField.java</include>
276+
<include>src/main/java/org/exist/indexing/lucene/BoostField.java</include>
275277
<include>src/test/java/org/exist/indexing/lucene/analyzers/NoDiacriticsStandardAnalyzerTest.java</include>
276278
<include>src/test/java/org/exist/indexing/lucene/analyzers/AdHocAnalysersTest.java</include>
277279
</includes>
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
* Copyright (C) 2024 Evolved Binary Ltd
3+
*
4+
* This code is proprietary and is not Open Source.
5+
*/
6+
7+
package org.exist.indexing.lucene;
8+
9+
import org.apache.lucene.document.Field;
10+
import org.apache.lucene.index.DocValuesType;
11+
import org.apache.lucene.index.IndexOptions;
12+
13+
public class BoostField extends Field {
14+
15+
public static final org.apache.lucene.document.FieldType BOOST_FIELD_TYPE = new org.apache.lucene.document.FieldType();
16+
17+
static {
18+
BOOST_FIELD_TYPE.setIndexOptions(IndexOptions.NONE);
19+
BOOST_FIELD_TYPE.setStored(true);
20+
BOOST_FIELD_TYPE.setTokenized(false);
21+
BOOST_FIELD_TYPE.setStoreTermVectors(false);
22+
BOOST_FIELD_TYPE.setDocValuesType(DocValuesType.NUMERIC);
23+
BOOST_FIELD_TYPE.freeze();
24+
}
25+
26+
public BoostField(String name, float value) {
27+
super(name, BOOST_FIELD_TYPE);
28+
this.fieldsData = Float.valueOf(value);
29+
}
30+
}

extensions/indexes/lucene/src/main/java/org/exist/indexing/lucene/LuceneIndexWorker.java

Lines changed: 61 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import org.apache.lucene.facet.taxonomy.SearcherTaxonomyManager;
4747
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
4848
import org.apache.lucene.index.*;
49+
import org.apache.lucene.queries.function.FunctionScoreQuery;
4950
import org.apache.lucene.queryparser.classic.ParseException;
5051
import org.apache.lucene.search.*;
5152
import org.apache.lucene.util.Bits;
@@ -98,7 +99,8 @@ public class LuceneIndexWorker implements OrderedValuesIndex, QNamedKeysIndex {
9899

99100
public static final org.apache.lucene.document.FieldType TYPE_NODE_ID = new org.apache.lucene.document.FieldType();
100101
public static final org.apache.lucene.document.FieldType CONTENT_FIELD_TYPE = new org.apache.lucene.document.FieldType();
101-
102+
public static final org.apache.lucene.document.FieldType NON_XML_STORED_FIELD_TYPE = new org.apache.lucene.document.FieldType();
103+
public static final org.apache.lucene.document.FieldType NON_XML_FIELD_TYPE = new org.apache.lucene.document.FieldType();
102104
static {
103105
TYPE_NODE_ID.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
104106
TYPE_NODE_ID.setStored(false);
@@ -111,6 +113,20 @@ public class LuceneIndexWorker implements OrderedValuesIndex, QNamedKeysIndex {
111113
CONTENT_FIELD_TYPE.setTokenized(true);
112114
CONTENT_FIELD_TYPE.setStoreTermVectors(true);
113115
CONTENT_FIELD_TYPE.freeze();
116+
117+
NON_XML_FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
118+
NON_XML_FIELD_TYPE.setStored(false);
119+
NON_XML_FIELD_TYPE.setTokenized(true);
120+
NON_XML_FIELD_TYPE.setStoreTermVectors(false);
121+
NON_XML_FIELD_TYPE.freeze();
122+
123+
NON_XML_STORED_FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
124+
NON_XML_STORED_FIELD_TYPE.setStored(true);
125+
NON_XML_STORED_FIELD_TYPE.setTokenized(true);
126+
NON_XML_STORED_FIELD_TYPE.setStoreTermVectors(false);
127+
NON_XML_STORED_FIELD_TYPE.freeze();
128+
129+
114130
}
115131

116132
static final Logger LOG = LogManager.getLogger(LuceneIndexWorker.class);
@@ -400,6 +416,7 @@ protected void removeNodes() {
400416
int nodeIdLen = nodeId.size();
401417
byte[] data = new byte[nodeIdLen + 2];
402418
//TODO - Should be rewritten to IntPoint
419+
//
403420
ByteConversion.shortToByte((short) nodeId.units(), data, 0);
404421
nodeId.serialize(data, 2);
405422
Term it = new Term(LuceneUtil.FIELD_NODE_ID, new BytesRef(data));
@@ -460,13 +477,44 @@ public NodeSet query(final int contextId, final DocumentSet docs, @Nullable fina
460477
if (facets.isPresent() && config != null) {
461478
query = drilldown(facets.get(), query, config);
462479
}
480+
query = rewriteBoost(query, field);
481+
463482
searchAndProcess(contextId, qname, docs, contextSet, resultSet,
464483
returnAncestor, searcher, query, config);
465484
}
466485
return resultSet;
467486
});
468487
}
469488

489+
490+
public Query rewriteBoost(Query q, String field) {
491+
if(q instanceof TermQuery) {
492+
var query = (TermQuery) q;
493+
if (query.getTerm().field().equals(field)) {
494+
return new FunctionScoreQuery(query, DoubleValuesSource.fromFloatField(field + "_boost"));
495+
}
496+
} else if(q instanceof WildcardQuery) {
497+
var query = (WildcardQuery) q;
498+
if (query.getField().equals(field)) {
499+
return new FunctionScoreQuery(query, DoubleValuesSource.fromFloatField(field + "_boost"));
500+
}
501+
}else if (q instanceof PhraseQuery) {
502+
var query = (PhraseQuery) q;
503+
if(query.getField().equals(field)) {
504+
return new FunctionScoreQuery(query, DoubleValuesSource.fromFloatField(field + "_boost"));
505+
}
506+
} else if(q instanceof BooleanQuery) {
507+
var query = (BooleanQuery) q;
508+
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
509+
for(BooleanClause c : query.clauses()) {
510+
queryBuilder.add(rewriteBoost(c.getQuery(), field), c.getOccur());
511+
}
512+
return queryBuilder.build();
513+
}
514+
return q;
515+
}
516+
517+
470518
/**
471519
* Query the index. Returns a node set containing all matching nodes. Each node
472520
* in the node set has a {@link LuceneMatch}
@@ -676,25 +724,9 @@ public void indexNonXML(NodeValue descriptor) {
676724

677725
// Get name from SOLR field
678726
String contentFieldName = field.getName();
679-
680-
// Actual field content ; Store flag can be set in solrField
681-
// Field contentField = new Field(contentFieldName, field.getData().toString(), store, Field.Index.ANALYZED, Field.TermVector.YES);
682-
683-
//TODO - Refactor this code and create one Field type, maybe reuse what we have.
684-
var gggg = new org.apache.lucene.document.FieldType();
685-
gggg.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
686-
gggg.setStored(store == Field.Store.YES);
687-
gggg.setTokenized(true);
688-
gggg.setStoreTermVectors(false);//TODO - It looks like we should not store term vector // Equivalent to TermVector.YES
689-
// gggg.setStoreTermVectorPositions(true);
690-
// gggg.setStoreTermVectorOffsets(true);
691-
Field contentField = new Field(contentFieldName, field.getData(), gggg);
692-
693-
// Extract (document) Boost factor
694-
// if (field.getBoost() > 0) {
695-
// contentField.setBoost(field.getBoost());
696-
// }
697-
727+
Field contentField = new Field(contentFieldName, field.getData(),
728+
store == Field.Store.YES ? NON_XML_STORED_FIELD_TYPE : NON_XML_FIELD_TYPE
729+
);
698730
pendingDoc.add(contentField);
699731
}
700732
}
@@ -1442,7 +1474,6 @@ private void write() {
14421474
doc.add(fNodeId);
14431475

14441476
// add separate index for node id
1445-
//TODO : Rewrite to the Point API.
14461477
BinaryTokenStream bts = new BinaryTokenStream(new BytesRef(data));
14471478
Field fNodeIdIdx = new Field(LuceneUtil.FIELD_NODE_ID, bts, TYPE_NODE_ID);
14481479
doc.add(fNodeIdIdx);
@@ -1457,13 +1488,20 @@ private void write() {
14571488
else
14581489
contentField = LuceneUtil.encodeQName(pending.qname, index.getBrokerPool().getSymbols());
14591490

1460-
//var fld = new Field(contentField, pending.text.toString(), CONTENT_FIELD_TYPE);
14611491
var fld = new ExistLuceneTextField(contentField, pending.text.toString(), CONTENT_FIELD_TYPE);
14621492
if (pending.idxConf.getAnalyzer() != null) {
14631493
fld.setAnalyzer(pending.idxConf.getAnalyzer());
1464-
//fld.setTokenStream(pending.idxConf.getAnalyzer().tokenStream(fld.name(), fld.stringValue()));
14651494
}
14661495
doc.add(fld);
1496+
1497+
float boost = 1.0f; //Default boost for all fields.
1498+
if (pending.boost > 0) {
1499+
boost = pending.boost;
1500+
} else if (config.getBoost() > 0) {
1501+
boost = config.getBoost();
1502+
}
1503+
final var boostField = new BoostField(contentField + "_boost", boost);
1504+
doc.add(boostField);
14671505
}
14681506
writer.addDocument(config.facetsConfig.build(index.getTaxonomyWriter(), doc));
14691507
}

extensions/indexes/lucene/src/main/java/org/exist/indexing/lucene/LuceneUtil.java

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
import org.apache.lucene.facet.DrillDownQuery;
3737
import org.apache.lucene.index.*;
38+
import org.apache.lucene.queries.function.FunctionScoreQuery;
3839
import org.apache.lucene.search.*;
3940
import org.apache.lucene.util.AttributeSource;
4041
import org.apache.lucene.util.BytesRef;
@@ -69,17 +70,6 @@ public static byte[] createId(final NodeId nodeId) {
6970
return data;
7071
}
7172

72-
public static NodeId readNodeId(final int doc, final BinaryDocValues nodeIdValues, final BrokerPool pool) {
73-
final BytesRef ref;
74-
try {
75-
ref = nodeIdValues.binaryValue();
76-
} catch (IOException e) {
77-
throw new RuntimeException(e); //TODO - Refactor
78-
}
79-
final int units = ByteConversion.byteToShort(ref.bytes, ref.offset);
80-
return pool.getNodeFactory().createFromData(units, ref.bytes, ref.offset + 2);
81-
}
82-
8373
/**
8474
* Encode an element or attribute qname into a lucene field name using the
8575
* internal ids for namespace and local name.
@@ -164,6 +154,8 @@ public static void extractTerms(final Query query, final Map<Object, Query> term
164154
extractTermsFromTermRange((TermRangeQuery) query, terms, reader, includeFields);
165155
} else if (query instanceof DrillDownQuery) {
166156
extractTermsFromDrillDown((DrillDownQuery) query, terms, reader, includeFields);
157+
} else if (query instanceof FunctionScoreQuery) {
158+
extractTerms(((FunctionScoreQuery)query).getWrappedQuery(), terms, reader, includeFields);
167159
} else {
168160
// fallback to Lucene's Query.extractTerms if none of the
169161
// above matches

0 commit comments

Comments
 (0)