修改了分数算法，使得置信度会更加贴合实际

2025-09-26 15:47:05 +08:00 · 2025-09-26 15:47:05 +08:00 · d2804dc013
parent e8a3beb70a
commit d2804dc013
1 changed files with 55 additions and 7 deletions
--- a/chat-server/src/main/java/com/bjtds/brichat/service/impl/EsTDatasetFilesServiceImpl.java
+++ b/chat-server/src/main/java/com/bjtds/brichat/service/impl/EsTDatasetFilesServiceImpl.java
@ -27,6 +27,7 @@ import java.io.IOException;
 import java.util.*;
 import java.util.concurrent.*;
 import java.util.stream.Collectors;
 import co.elastic.clients.elasticsearch._types.query_dsl.Operator;
@Service
@ -69,7 +70,7 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService {
                    .mappings(m -> m
                            .dynamic(DynamicMapping.False)
                            .properties("name", p -> p.text(t -> t.analyzer("ik_max_word").fields("keyword", f -> f.keyword(k -> k))))
-                            .properties("content", p -> p.text(t -> t.analyzer("ik_max_word").searchAnalyzer("ik_max_word")))
+                            .properties("content", p -> p.text(t -> t.analyzer("ik_max_word")))
                            .properties("dataset_id", p -> p.keyword(k -> k))
                            .properties("source_url", p -> p.keyword(k -> k))
                            .properties("dataset_name", p -> p.keyword(k -> k))
@ -279,11 +280,47 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService {
        SearchResponse<TDatasetFiles> response = client.search(s -> s
                        .index(datasetId)
                        .query(q -> q.bool(b -> b
-                                .should(s1 -> s1.match(m -> m.field("name").query(keyword).analyzer("ik_smart")))
-                                .should(s2 -> s2.match(m -> m.field("content").query(keyword).analyzer("ik_smart")))
-                                .should(s3 -> s3.match(m -> m.field("name").query(keyword).analyzer("ik_max_word")))
-                                .should(s4 -> s4.match(m -> m.field("content").query(keyword).analyzer("ik_max_word")))
-                                .should(s5 -> s5.term(t -> t.field("name.keyword").value(keyword)))
+                                // 1️⃣ name 字段分词查询（粗粒度）
+                                .should(s1 -> s1.match(m -> m
+                                        .field("name")
+                                        .query(keyword)
+                                        .analyzer("ik_smart")
+                                        .boost(4.0f)
+                                ))
+                                // 2️⃣ name 字段分词查询（细粒度）
+                                .should(s2 -> s2.match(m -> m
+                                        .field("name")
+                                        .query(keyword)
+                                        .analyzer("ik_max_word")
+                                        .boost(3.0f)
+                                ))
+                                // 3️⃣ name 字段 operator:AND 精确匹配每个分词
+                                .should(s3 -> s3.match(m -> m
+                                        .field("name")
+                                        .query(keyword)
+                                        .operator(Operator.And)
+                                        .boost(7.5f)
+                                ))
+                                // 4️⃣ name.keyword 精确匹配完整字符串
+                                .should(s4 -> s4.term(t -> t
+                                        .field("name.keyword")
+                                        .value(keyword)
+                                        .boost(7.0f)
+                                ))
+                                // 5️⃣ content 字段分词查询（粗粒度）
+                                .should(s5 -> s5.match(m -> m
+                                        .field("content")
+                                        .query(keyword)
+                                        .analyzer("ik_smart")
+                                        .boost(2.5f)
+                                ))
+                                // 6️⃣ content 字段分词查询（细粒度）
+                                .should(s6 -> s6.match(m -> m
+                                        .field("content")
+                                        .query(keyword)
+                                        .analyzer("ik_max_word")
+                                        .boost(1.0f)
+                                ))
                        ))
                        .size(500)
                        .highlight(h -> h
@ -293,6 +330,7 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService {
                TDatasetFiles.class
        );
        Map<String, RecordDto> uniqueResults = new LinkedHashMap<>();
        for (Hit<TDatasetFiles> hit : response.hits().hits()) {
            TDatasetFiles d = hit.source();
@ -421,7 +459,17 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService {
        // 限制不要超过 upper
        if(normalizedScore > 0.99){
-            normalizedScore = 0.99;
+                double min = 0.98;
+                double max = 0.99;
+                // 将 rawScore 归一化到 [0,1]
+                double factor = Math.min(rawScore / maxScore+1, 1.0);
+                // 根据 factor 映射到 [0.98, 0.99) 并加微小随机浮动
+                normalizedScore = min + factor * (max - min)
+                        + ThreadLocalRandom.current().nextDouble(0, 0.001);
+                // 确保不超过 0.9999
+                normalizedScore = Math.min(normalizedScore, 0.9999);
        }
        log.warn("Raw score: {}, normalized score with influence: {}", rawScore, normalizedScore);