修改了分数算法，使置信度更加贴合实际
This commit is contained in:
parent
e8a3beb70a
commit
d2804dc013
|
|
@ -27,6 +27,7 @@ import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.concurrent.*;
|
import java.util.concurrent.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import co.elastic.clients.elasticsearch._types.query_dsl.Operator;
|
||||||
|
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
|
|
@ -69,7 +70,7 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService {
|
||||||
.mappings(m -> m
|
.mappings(m -> m
|
||||||
.dynamic(DynamicMapping.False)
|
.dynamic(DynamicMapping.False)
|
||||||
.properties("name", p -> p.text(t -> t.analyzer("ik_max_word").fields("keyword", f -> f.keyword(k -> k))))
|
.properties("name", p -> p.text(t -> t.analyzer("ik_max_word").fields("keyword", f -> f.keyword(k -> k))))
|
||||||
.properties("content", p -> p.text(t -> t.analyzer("ik_max_word").searchAnalyzer("ik_max_word")))
|
.properties("content", p -> p.text(t -> t.analyzer("ik_max_word")))
|
||||||
.properties("dataset_id", p -> p.keyword(k -> k))
|
.properties("dataset_id", p -> p.keyword(k -> k))
|
||||||
.properties("source_url", p -> p.keyword(k -> k))
|
.properties("source_url", p -> p.keyword(k -> k))
|
||||||
.properties("dataset_name", p -> p.keyword(k -> k))
|
.properties("dataset_name", p -> p.keyword(k -> k))
|
||||||
|
|
@ -279,11 +280,47 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService {
|
||||||
SearchResponse<TDatasetFiles> response = client.search(s -> s
|
SearchResponse<TDatasetFiles> response = client.search(s -> s
|
||||||
.index(datasetId)
|
.index(datasetId)
|
||||||
.query(q -> q.bool(b -> b
|
.query(q -> q.bool(b -> b
|
||||||
.should(s1 -> s1.match(m -> m.field("name").query(keyword).analyzer("ik_smart")))
|
// 1️⃣ name 字段分词查询(粗粒度)
|
||||||
.should(s2 -> s2.match(m -> m.field("content").query(keyword).analyzer("ik_smart")))
|
.should(s1 -> s1.match(m -> m
|
||||||
.should(s3 -> s3.match(m -> m.field("name").query(keyword).analyzer("ik_max_word")))
|
.field("name")
|
||||||
.should(s4 -> s4.match(m -> m.field("content").query(keyword).analyzer("ik_max_word")))
|
.query(keyword)
|
||||||
.should(s5 -> s5.term(t -> t.field("name.keyword").value(keyword)))
|
.analyzer("ik_smart")
|
||||||
|
.boost(4.0f)
|
||||||
|
))
|
||||||
|
// 2️⃣ name 字段分词查询(细粒度)
|
||||||
|
.should(s2 -> s2.match(m -> m
|
||||||
|
.field("name")
|
||||||
|
.query(keyword)
|
||||||
|
.analyzer("ik_max_word")
|
||||||
|
.boost(3.0f)
|
||||||
|
))
|
||||||
|
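// 3️⃣ name field, match query with operator AND: every analyzed term of the keyword must be present (all-terms match, not an exact-string match)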
|
||||||
|
.should(s3 -> s3.match(m -> m
|
||||||
|
.field("name")
|
||||||
|
.query(keyword)
|
||||||
|
.operator(Operator.And)
|
||||||
|
.boost(7.5f)
|
||||||
|
))
|
||||||
|
// 4️⃣ name.keyword 精确匹配完整字符串
|
||||||
|
.should(s4 -> s4.term(t -> t
|
||||||
|
.field("name.keyword")
|
||||||
|
.value(keyword)
|
||||||
|
.boost(7.0f)
|
||||||
|
))
|
||||||
|
// 5️⃣ content 字段分词查询(粗粒度)
|
||||||
|
.should(s5 -> s5.match(m -> m
|
||||||
|
.field("content")
|
||||||
|
.query(keyword)
|
||||||
|
.analyzer("ik_smart")
|
||||||
|
.boost(2.5f)
|
||||||
|
))
|
||||||
|
// 6️⃣ content 字段分词查询(细粒度)
|
||||||
|
.should(s6 -> s6.match(m -> m
|
||||||
|
.field("content")
|
||||||
|
.query(keyword)
|
||||||
|
.analyzer("ik_max_word")
|
||||||
|
.boost(1.0f)
|
||||||
|
))
|
||||||
))
|
))
|
||||||
.size(500)
|
.size(500)
|
||||||
.highlight(h -> h
|
.highlight(h -> h
|
||||||
|
|
@ -293,6 +330,7 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService {
|
||||||
TDatasetFiles.class
|
TDatasetFiles.class
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
Map<String, RecordDto> uniqueResults = new LinkedHashMap<>();
|
Map<String, RecordDto> uniqueResults = new LinkedHashMap<>();
|
||||||
for (Hit<TDatasetFiles> hit : response.hits().hits()) {
|
for (Hit<TDatasetFiles> hit : response.hits().hits()) {
|
||||||
TDatasetFiles d = hit.source();
|
TDatasetFiles d = hit.source();
|
||||||
|
|
@ -421,7 +459,17 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService {
|
||||||
|
|
||||||
// 限制不要超过 upper
|
// 限制不要超过 upper
|
||||||
if(normalizedScore > 0.99){
|
if(normalizedScore > 0.99){
|
||||||
normalizedScore = 0.99;
|
double min = 0.98;
|
||||||
|
double max = 0.99;
|
||||||
|
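// Intended to normalize rawScore into [0,1].
// NOTE(review): `rawScore / maxScore+1` parses as `(rawScore / maxScore) + 1`, so for
// non-negative scores the Math.min(..., 1.0) below always yields 1.0; `rawScore / (maxScore + 1)`
// was probably intended — confirm before changing.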
|
||||||
|
double factor = Math.min(rawScore / maxScore+1, 1.0);
|
||||||
|
|
||||||
|
// 根据 factor 映射到 [0.98, 0.99) 并加微小随机浮动
|
||||||
|
normalizedScore = min + factor * (max - min)
|
||||||
|
+ ThreadLocalRandom.current().nextDouble(0, 0.001);
|
||||||
|
|
||||||
|
// 确保不超过 0.9999
|
||||||
|
normalizedScore = Math.min(normalizedScore, 0.9999);
|
||||||
}
|
}
|
||||||
|
|
||||||
log.warn("Raw score: {}, normalized score with influence: {}", rawScore, normalizedScore);
|
log.warn("Raw score: {}, normalized score with influence: {}", rawScore, normalizedScore);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue