diff --git a/chat-client/src/api/dataset.ts b/chat-client/src/api/dataset.ts index 1579c9e..4f38ae4 100644 --- a/chat-client/src/api/dataset.ts +++ b/chat-client/src/api/dataset.ts @@ -108,9 +108,9 @@ export const deleteDataset = (id: string) => { } interface RenameParams { - fileId: number - newName: string -} + fileId: number + newName: string + } export const renameDocument = (data: RenameParams) => { return request({ url: '/brichat-service/datasetManage/document/rename', diff --git a/chat-client/src/views/permission/esindexManage/index.vue b/chat-client/src/views/permission/esindexManage/index.vue index a8e5fc4..420e8ee 100644 --- a/chat-client/src/views/permission/esindexManage/index.vue +++ b/chat-client/src/views/permission/esindexManage/index.vue @@ -233,16 +233,16 @@ const pollTaskStatus = async (taskId) => { try { const statusRes = await getIndexTask(taskId) console.log(statusRes) - const { status, total, finished } = statusRes.data + const { status, total, finished, failed1, failed2, failed3 } = statusRes.data taskStatus.value = status if (total > 0) progress.value = Math.floor((finished / total) * 100) if (status === "done") { - ElMessage.success("索引构建完成"+`共构建${total}个文档,成功${finished}个`) + ElMessage.success("索引构建完成"+`共构建${total}个文档,成功${finished}个,路径不存在${failed1}个,文件不存在${failed2}个,已有es${failed3}个`) loadIndexList() } else if (status === "failed") { - ElMessage.error("索引构建失败"+`共构建${total}个文档,成功${finished}个`) + ElMessage.error("索引构建失败"+`共构建${total}个文档,成功${finished}个,路径不存在${failed1}个,文件不存在${failed2}个,已有es${failed3}个`) } else { // 继续轮询 timer = setTimeout(() => pollTaskStatus(taskId), 200) diff --git a/chat-server/src/main/java/com/bjtds/brichat/controller/KnowledgeBaseController.java b/chat-server/src/main/java/com/bjtds/brichat/controller/KnowledgeBaseController.java index 2ebe60e..58dbab0 100644 --- a/chat-server/src/main/java/com/bjtds/brichat/controller/KnowledgeBaseController.java +++ 
b/chat-server/src/main/java/com/bjtds/brichat/controller/KnowledgeBaseController.java @@ -139,6 +139,12 @@ public class KnowledgeBaseController { result.put("status", status); result.put("total", total); result.put("finished", finished); + String failed1 = redisTemplate.opsForValue().get("import:task:" + taskId + ":failed1"); + String failed2 = redisTemplate.opsForValue().get("import:task:" + taskId + ":failed2"); + String failed3 = redisTemplate.opsForValue().get("import:task:" + taskId + ":failed3"); + result.put("failed1", failed1); + result.put("failed2", failed2); + result.put("failed3", failed3); return ResultUtils.success(result); } // @ApiOperation("删除索引下的文件") diff --git a/chat-server/src/main/java/com/bjtds/brichat/service/impl/EsTDatasetFilesServiceImpl.java b/chat-server/src/main/java/com/bjtds/brichat/service/impl/EsTDatasetFilesServiceImpl.java index 6231239..1013547 100644 --- a/chat-server/src/main/java/com/bjtds/brichat/service/impl/EsTDatasetFilesServiceImpl.java +++ b/chat-server/src/main/java/com/bjtds/brichat/service/impl/EsTDatasetFilesServiceImpl.java @@ -90,16 +90,12 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { // ================= 多线程索引 ================= @Override public void addDoc(TDatasetFiles doc) throws IOException { -// 先检查索引中是否已存在该文件 -// if (existsDoc(doc.getFilePath())) { -// System.out.println("文件已存在索引中,跳过: " + doc.getFilePath()); -// return; -// } - File file = new File(doc.getDifyStoragePath()); int cpuThreads = Runtime.getRuntime().availableProcessors(); ExecutorService executor = Executors.newFixedThreadPool(cpuThreads); List> futures = new ArrayList<>(); + List chunkSizes = new ArrayList<>(); // 存放每个分片大小 + try { boolean[] splitted = {false}; @@ -107,37 +103,40 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { try { EsFileSplitter.streamSplitFile(file, chunk -> { splitted[0] = true; // 标记拆分成功 + chunkSizes.add((long) chunk.length()); // 记录分片大小 + + TDatasetFiles subDoc 
= new TDatasetFiles( + doc.getId(), + doc.getName(), + chunk, + doc.getType(), + doc.getParentId(), + doc.getPath(), + doc.getSize(), + doc.getOwnerId(), + doc.getCreatedAt(), + doc.getUpdatedAt(), + doc.getIsDeleted(), + doc.getMimeType(), + doc.getExtension(), + doc.getDifyDocId(), + doc.getIndexingStatus(), + doc.getDifyDatasetId(), + doc.getDifyStoragePath(), + doc.getSourceUrl(), + doc.getIsDeep(), + true + ); + + // 异步索引 futures.add(executor.submit(() -> { try { - TDatasetFiles subDoc = new TDatasetFiles( - doc.getId(), - doc.getName(), - chunk, - doc.getType(), - doc.getParentId(), - doc.getPath(), - doc.getSize(), - doc.getOwnerId(), - doc.getCreatedAt(), - doc.getUpdatedAt(), - doc.getIsDeleted(), - doc.getMimeType(), - doc.getExtension(), - doc.getDifyDocId(), - doc.getIndexingStatus(), - doc.getDifyDatasetId(), - doc.getDifyStoragePath(), - doc.getSourceUrl(), - doc.getIsDeep(), - true - + client.index(req -> req + .index(subDoc.getDifyDatasetId()) + .id(subDoc.getId() + "_" + UUID.randomUUID()) + .document(subDoc) ); - client.index(req -> req - .index(doc.getDifyDatasetId()) - .id(subDoc.getId().toString()) - .document(subDoc) - ); - log.info("异步添加文档分片到索引 fileId={} 索引构建成功 docName={}", subDoc.getId(), subDoc.getName()); + log.info("异步添加文档分片到索引 fileId={} docName={}", subDoc.getId(), subDoc.getName()); } catch (IOException e) { throw new RuntimeException("分片索引失败", e); } @@ -147,6 +146,7 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { splitted[0] = false; } + // 如果没有拆分成功且文件较小,直接索引整个文件 long maxSize = 1024 * 1024; // 1MB if (!splitted[0] && file.length() <= maxSize) { String content = new String(java.nio.file.Files.readAllBytes(file.toPath())); @@ -173,15 +173,17 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { true ); + client.index(req -> req + .index(singleDoc.getDifyDatasetId()) + .id(singleDoc.getId() + "_" + UUID.randomUUID()) + .document(singleDoc) + ); + log.info("Single文档索引成功 fileId={} 
docName={}", singleDoc.getId(), singleDoc.getName()); - client.index(req -> req - .index(singleDoc.getDifyDatasetId()) - .id(singleDoc.getId().toString()) - .document(singleDoc) - ); - log.info("Single异步添加文档分片到索引 fileId={} 索引构建成功 docName={}", singleDoc.getId(), singleDoc.getName()); + chunkSizes.add(file.length()); // 单文件也算一个分片 } + // 等待所有异步任务完成 for (Future f : futures) { try { f.get(); @@ -190,8 +192,14 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { } } - // System.out.println("文档索引完成: " + doc.getName()); - log.info("addDoc文档索引完成: 知识库id={} docName={}", doc.getDifyDatasetId(), doc.getName()); + // 打印每个分片大小 + for (int i = 0; i < chunkSizes.size(); i++) { + log.info("文件 {} 分片 {} 大小 = {} 字节", doc.getName(), i + 1, chunkSizes.get(i)); + } + + // 打印总共拆分份数 + log.info("文件 {} 总共被拆分成 {} 份", doc.getName(), chunkSizes.size()); + log.info("addDoc文档索引完成: 知识库id={} docName={}", doc.getDifyDatasetId(), doc.getName()); } finally { executor.shutdown(); } @@ -223,11 +231,38 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { }, executor)) .collect(Collectors.toList()); - return futures.stream() + // 收集所有结果(带原始分数) + List allResults = futures.stream() .map(CompletableFuture::join) .flatMap(List::stream) .collect(Collectors.toList()); + if (allResults.isEmpty()) { + return allResults; + } + + // 统一归一化处理 + List scores = allResults.stream() + .map(r -> Double.parseDouble(r.getScore())) + .collect(Collectors.toList()); + + double minScore = scores.stream().min(Double::compare).orElse(0.0); + double maxScore = scores.stream().max(Double::compare).orElse(1.0); + double epsilon = 1e-6; + double lower = 0.1, upper = 0.95; + + Random random = new Random(); + + for (RecordDto recordDto : allResults) { + double rawScore = Double.parseDouble(recordDto.getScore()); + double normalizedScore = normalizeScore(rawScore, minScore, maxScore, lower, upper, epsilon, random); + recordDto.setScore(String.format("%.4f", normalizedScore)); + + 
log.warn("文件 {} 归一化前: {} 归一化后: {}", recordDto.getRetrievalDto().getName(), rawScore, normalizedScore); + } + + return allResults; + } finally { executor.shutdown(); } @@ -244,8 +279,8 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { SearchResponse response = client.search(s -> s .index(datasetId) .query(q -> q.bool(b -> b - .should(s1 -> s1.match(m -> m.field("name").query(keyword))) - .should(s2 -> s2.match(m -> m.field("content").query(keyword))) + .should(s1 -> s1.match(m -> m.field("name").query(keyword).analyzer("ik_smart"))) + .should(s2 -> s2.match(m -> m.field("content").query(keyword).analyzer("ik_smart"))) )) .size(500) .highlight(h -> h @@ -256,28 +291,15 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { ); Map uniqueResults = new LinkedHashMap<>(); - List scores = response.hits().hits().stream() - .map(hit -> hit.score() != null ? hit.score() : 0.0) - .collect(Collectors.toList()); - - double minScore = scores.stream().min(Double::compare).orElse(0.0); - double maxScore = scores.stream().max(Double::compare).orElse(1.0); - double epsilon = 1e-6; - double lower = 0.05, upper = 0.98; - - Random random = new Random(); - int index = 0; for (Hit hit : response.hits().hits()) { TDatasetFiles d = hit.source(); - double rawScore = scores.get(index++); - log.info("Score: {}", rawScore); - double normalizedScore = normalizeScore(rawScore, minScore, maxScore, lower, upper, epsilon, random); + double rawScore = hit.score() != null ? hit.score() : 0.0; + log.info("Raw score: {}", rawScore); // 高亮内容 - String content = String.join(" ... ", hit.highlight().getOrDefault("content", Collections.emptyList())); - System.out.println("content: " + content); String datasetName = difyDatasetsMapper.getDatasetNameById(datasetId); + RetrievalDto retrievalDto = new RetrievalDto( d.getId() != null ? 
d.getId().toString() : null, d.getName(), @@ -290,7 +312,7 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { RecordDto recordDto = new RecordDto(); recordDto.setRetrievalDto(retrievalDto); - recordDto.setScore(String.format("%.4f", normalizedScore)); + recordDto.setScore(String.valueOf(rawScore)); // 保存原始分数 // 按 name 去重,只保留第一个 uniqueResults.putIfAbsent(d.getName(), recordDto); @@ -301,6 +323,7 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { + @Override public Pagination getAllIndexInfos(Integer pageNo, Integer pageSize, String keyword) throws IOException { List indexInfos = new ArrayList<>(); @@ -370,33 +393,42 @@ public class EsTDatasetFilesServiceImpl implements EsTDatasetFilesService { }); } - private double normalizeScore(double rawScore, double minScore, double maxScore, double lower, double upper, double epsilon, Random random) { - // 计算分数范围 + private double normalizeScore(double rawScore, double minScore, double maxScore, + double lower, double upper, double epsilon, Random random) { double scoreRange = maxScore - minScore; log.warn("Score range: {}", scoreRange); - // 如果最大最小分数相差小于 epsilon,直接使用 upper + // 基础归一化 double normalizedScore = (scoreRange < epsilon) ? 
upper : lower + (rawScore - minScore) / scoreRange * (upper - lower); - log.warn("rawScore: {}, normalizedScore before fluctuation: {}", rawScore, normalizedScore); + double influence = 0.0; + // 小扰动,让归一化结果受原始分数影响 - // 获取原始分数的小数点后两位 - double integerPart = Math.floor(rawScore); // 获取整数部分 - double decimalPart = rawScore - integerPart; // 获取小数部分 - double decimalPartOneDigit = Math.floor(decimalPart * 10) / 10.0; // 获取小数部分的第一位 - double result = integerPart + decimalPartOneDigit; // 将整数部分和小数点后一位合成 - double decimalPartTwoDigits = (result*10)/ 1000; // 获取小数部分的两位 - log.warn("Raw score decimal part (2 digits): {}", decimalPartTwoDigits); - if(normalizedScore==upper){ - normalizedScore -= (0.1-decimalPartTwoDigits); - }else if(normalizedScore==lower){ - normalizedScore += decimalPartTwoDigits; + if(rawScore>10){ + double maxScoreForMapping = Math.max(rawScore, 50); // 可以根据实际最大分数调整 + influence = 0.03 + (rawScore - 10) / (maxScoreForMapping - 10) * (0.05 - 0.03); + }else { + influence = 0.01 + (rawScore / 10.0) * (0.02 - 0.01); } + + log.warn("Influence: {}", influence); + normalizedScore += influence; + log.warn("Normalized score before limit: {}", normalizedScore); + + // 限制不要超过 upper + if(normalizedScore > 0.99){ + normalizedScore = 0.99; + } + + log.warn("Raw score: {}, normalized score with influence: {}", rawScore, normalizedScore); + return normalizedScore; } + + } diff --git a/chat-server/src/main/java/com/bjtds/brichat/util/EsTDatasetFilesImporter.java b/chat-server/src/main/java/com/bjtds/brichat/util/EsTDatasetFilesImporter.java index c5b9818..9ac87f0 100644 --- a/chat-server/src/main/java/com/bjtds/brichat/util/EsTDatasetFilesImporter.java +++ b/chat-server/src/main/java/com/bjtds/brichat/util/EsTDatasetFilesImporter.java @@ -14,6 +14,7 @@ import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.List; @@ -52,26 
+53,40 @@ public class EsTDatasetFilesImporter { redisTemplate.opsForValue().set("import:task:" + taskId + ":status", "failed"); return; } + int total = documents.size(); redisTemplate.opsForValue().set("import:task:" + taskId + ":total", String.valueOf(total)); redisTemplate.opsForValue().set("import:task:" + taskId + ":finished", "0"); + redisTemplate.opsForValue().set("import:task:" + taskId + ":failed1", "0"); + redisTemplate.opsForValue().set("import:task:" + taskId + ":failed2", "0"); + redisTemplate.opsForValue().set("import:task:" + taskId + ":failed3", "0"); redisTemplate.opsForValue().set("import:task:" + taskId + ":status", "processing"); int finished = 0; + int failed_1 = 0; + int failed_2 = 0; + int failed_3 = 0; for (TDatasetFiles document : documents) { if (document == null) continue; String filePath = document.getDifyStoragePath(); if (filePath == null) { - log.warn("documentId=" + document.getId() + " 不存在difyStoragePath,跳过"); + log.error("documentId=" + document.getId() + " 不存在difyStoragePath,跳过"); + failed_1++; + redisTemplate.opsForValue().set("import:task:" + taskId + ":failed1", String.valueOf(failed_1)); continue; } File file = new File(filePath); if (!file.exists()) { - log.warn(file.getAbsolutePath() + " 不存在,跳过"); + log.error("文件不存在: {}", file.getAbsolutePath()); + failed_2++; + redisTemplate.opsForValue().set("import:task:" + taskId + ":failed2", String.valueOf(failed_2)); continue; } + if(Boolean.TRUE.equals(document.getIsEs())){ log.warn("documentId=" + document.getId() + " 是ES索引文件,跳过"); + failed_3++; + redisTemplate.opsForValue().set("import:task:" + taskId + ":failed3", String.valueOf(failed_3)); continue; } diff --git a/chat-server/src/main/resources/com/bjtds/brichat/mapper/opengauss/TDatasetFilesMapper.xml b/chat-server/src/main/resources/com/bjtds/brichat/mapper/opengauss/TDatasetFilesMapper.xml index ccfcc85..0661c87 100644 --- a/chat-server/src/main/resources/com/bjtds/brichat/mapper/opengauss/TDatasetFilesMapper.xml +++ 
b/chat-server/src/main/resources/com/bjtds/brichat/mapper/opengauss/TDatasetFilesMapper.xml @@ -261,6 +261,7 @@ SELECT FROM t_dataset_files WHERE type = 'file' + and indexing_status='completed' ORDER BY created_at DESC