feat:文件上传时增加source_url文件溯源预览路径

2025-08-25 11:07:07 +08:00 · 2025-08-25 11:07:07 +08:00 · 5faa1aa59b
parent a023719e5c
commit 5faa1aa59b
7 changed files with 255 additions and 27 deletions
--- a/chat-server/src/main/java/com/bjtds/brichat/entity/dify/DifyGetMetadatasRes.java
+++ b/chat-server/src/main/java/com/bjtds/brichat/entity/dify/DifyGetMetadatasRes.java
@ -0,0 +1,16 @@
+package com.bjtds.brichat.entity.dify;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import lombok.Data;
+
+import java.util.List;
+/**
+ * Response payload for listing the metadata fields of a Dify dataset.
+ *
+ * <p>Jackson binds the JSON keys named in the {@code @JsonProperty} annotations;
+ * getters, setters, {@code equals}/{@code hashCode} and {@code toString} are
+ * generated by Lombok's {@code @Data}.
+ */
+@Data
+public class DifyGetMetadatasRes {
+
+    /** Metadata field entries, bound from the {@code doc_metadata} JSON key. */
+    @JsonProperty("doc_metadata")
+    private List<DifyMetadata> docMetadatas;
+
+    /** Bound from the {@code built_in_field_enabled} JSON key. */
+    @JsonProperty("built_in_field_enabled")
+    private Boolean builtInFieldEnabled;
+
+}
--- a/chat-server/src/main/java/com/bjtds/brichat/entity/dify/DifyMatedataAnnoReq.java
+++ b/chat-server/src/main/java/com/bjtds/brichat/entity/dify/DifyMatedataAnnoReq.java
@ -0,0 +1,14 @@
+package com.bjtds.brichat.entity.dify;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import lombok.Data;
+
+import java.util.List;
+/**
+ * Request item that pairs one document with the metadata entries to set on it.
+ *
+ * <p>Accessors are generated by Lombok's {@code @Data}; JSON key names come from
+ * the {@code @JsonProperty} annotations.
+ */
+@Data
+public class DifyMatedataAnnoReq {
+
+    /** Identifier of the document to annotate; serialized as {@code document_id}. */
+    @JsonProperty("document_id")
+    private String documentId;
+    /** Metadata entries to attach to the document; serialized as {@code metadata_list}. */
+    @JsonProperty("metadata_list")
+    private List<DifyMetadata> metadataList ;
+}
--- a/chat-server/src/main/java/com/bjtds/brichat/entity/dify/DifyMetadata.java
+++ b/chat-server/src/main/java/com/bjtds/brichat/entity/dify/DifyMetadata.java
@ -0,0 +1,22 @@
+package com.bjtds.brichat.entity.dify;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+import javax.websocket.OnOpen;
+
+/**
+ * A single Dify metadata entry: a named, typed field that may also carry a value.
+ *
+ * <p>Lombok generates the accessors, a builder, and both a no-args and an
+ * all-args constructor. The all-args constructor takes its parameters in field
+ * declaration order, so reordering the fields below changes its signature.
+ */
+@Data
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+public class DifyMetadata {
+    /** Identifier of the metadata field (presumably assigned by Dify - TODO confirm). */
+    private String id;
+    /** Value type of the field, e.g. {@code "string"}. */
+    private String type;
+    /** Field name, e.g. {@code "source_url"}. */
+    private String name;
+    /** Value of the field when the entry is attached to a document. */
+    private String value;
+    /** Bound from the {@code use_count} JSON key. */
+    @JsonProperty("use_count")
+    private Integer useCount;
+}
--- a/chat-server/src/main/java/com/bjtds/brichat/service/dify/DifyDatasetApiService.java
+++ b/chat-server/src/main/java/com/bjtds/brichat/service/dify/DifyDatasetApiService.java
@ -1,8 +1,7 @@
 package com.bjtds.brichat.service.dify;

 import com.bjtds.brichat.entity.dataset.DocumentUploadReq;
-import com.bjtds.brichat.entity.dify.DatasetDto;
-import com.bjtds.brichat.entity.dify.DifyDatasetResponse;
+import com.bjtds.brichat.entity.dify.*;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.multipart.MultipartFile;

@ -24,4 +23,28 @@ public interface DifyDatasetApiService{
   ResponseEntity<Map> createDocumentByFile(
            DocumentUploadReq request,
            MultipartFile file) throws IOException;
+
+    /**
+     * Adds a metadata field to a dataset (knowledge base).
+     *
+     * @param datasetId identifier of the dataset to add the field to
+     * @param metadata  description of the field to create
+     * @return the response entity whose body is the resulting {@link DifyMetadata}
+     */
+    ResponseEntity<DifyMetadata> addMetadata(String datasetId, DifyMetadata metadata);
+
+    /**
+     * Retrieves the list of metadata fields of a dataset (knowledge base).
+     *
+     * @param datasetId identifier of the dataset to query
+     * @return the response entity whose body is a {@link DifyGetMetadatasRes}
+     */
+    ResponseEntity<DifyGetMetadatasRes> getMetadatas(String datasetId);
+
+
+    /**
+     * Annotates a document with metadata values.
+     *
+     * @param datasetId       identifier of the dataset that contains the document
+     * @param metadataAnnoReq the document id together with the metadata entries to set
+     * @return the response entity carrying a map that describes the outcome
+     */
+    ResponseEntity<Map> setMetadataForDoc(String datasetId, DifyMatedataAnnoReq metadataAnnoReq);
+
 }
--- a/chat-server/src/main/java/com/bjtds/brichat/service/dify/impl/DifyDatasetApiServiceImpl.java
+++ b/chat-server/src/main/java/com/bjtds/brichat/service/dify/impl/DifyDatasetApiServiceImpl.java
@ -1,9 +1,9 @@
 package com.bjtds.brichat.service.dify.impl;

+import cn.hutool.json.JSONUtil;
 import com.bjtds.brichat.entity.dataset.DocumentUploadReq;
 import com.bjtds.brichat.entity.dataset.RetrievalModel;
-import com.bjtds.brichat.entity.dify.DatasetDto;
-import com.bjtds.brichat.entity.dify.DifyDatasetResponse;
+import com.bjtds.brichat.entity.dify.*;
 import com.bjtds.brichat.entity.dto.PdfConversionResponse;
 import com.bjtds.brichat.entity.dto.PdfTaskDto;
 import com.bjtds.brichat.mapper.postgresql.DifyDatasetsMapper;
@ -14,11 +14,17 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import io.github.guoshiqiufeng.dify.dataset.DifyDataset;
 import io.github.guoshiqiufeng.dify.dataset.dto.request.DatasetInfoRequest;
 import io.github.guoshiqiufeng.dify.dataset.dto.response.DatasetInfoResponse;
+import io.github.guoshiqiufeng.dify.dataset.dto.response.UploadFileInfoResponse;
+import org.apache.commons.io.IOUtils;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 import org.apache.poi.xwpf.usermodel.XWPFRun;
+import org.docx4j.openpackaging.exceptions.Docx4JException;
 import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -37,9 +43,7 @@ import org.springframework.web.client.RestTemplate;
 import org.springframework.web.multipart.MultipartFile;

 import javax.annotation.Resource;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
+import java.io.*;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
@ -85,6 +89,104 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
    @Resource
    private DifyDatasetsMapper difyDatasetsMapper;

+
+    /**
+     * Creates a metadata field on a dataset by POSTing to
+     * {@code <difyUrl><DATABASE_API>/<datasetId>/metadata}.
+     *
+     * <p>Only the {@code type} and {@code name} of {@code metadata} are sent; its
+     * other properties are not part of the request body.
+     *
+     * @param datasetId identifier of the dataset to add the field to
+     * @param metadata  source of the field's type and name
+     * @return the response of the POST call, with the body bound to {@link DifyMetadata}
+     */
+    @Override
+    public ResponseEntity<DifyMetadata> addMetadata(String datasetId, DifyMetadata metadata) {
+        String endpoint = difyUrl + Constants.DATABASE_API + "/" + datasetId + "/metadata";
+
+        // Bearer authentication plus a JSON content type.
+        HttpHeaders authHeaders = new HttpHeaders();
+        authHeaders.set("Authorization", Constants.BEARER + apiKey);
+        authHeaders.setContentType(MediaType.APPLICATION_JSON);
+
+        // JSON body carrying the field's type and name.
+        Map<String, String> payload = new HashMap<>();
+        payload.put("type", metadata.getType());
+        payload.put("name", metadata.getName());
+        HttpEntity<Map<String, String>> httpRequest = new HttpEntity<>(payload, authHeaders);
+
+        ParameterizedTypeReference<DifyMetadata> responseType =
+                new ParameterizedTypeReference<DifyMetadata>() {};
+        return restTemplate.exchange(endpoint, HttpMethod.POST, httpRequest, responseType);
+    }
+
+
+    /**
+     * Fetches the metadata fields of a dataset with a GET to
+     * {@code <difyUrl><DATABASE_API>/<datasetId>/metadata}.
+     *
+     * @param datasetId identifier of the dataset to query
+     * @return the response of the GET call, with the body bound to {@link DifyGetMetadatasRes}
+     */
+    @Override
+    public ResponseEntity<DifyGetMetadatasRes> getMetadatas(String datasetId) {
+        String endpoint = difyUrl + Constants.DATABASE_API + "/" + datasetId + "/metadata";
+
+        // Bearer authentication plus a JSON content type.
+        HttpHeaders authHeaders = new HttpHeaders();
+        authHeaders.set("Authorization", Constants.BEARER + apiKey);
+        authHeaders.setContentType(MediaType.APPLICATION_JSON);
+
+        // Header-only entity: this request sends no body.
+        HttpEntity<?> headersOnly = new HttpEntity<>(authHeaders);
+        ParameterizedTypeReference<DifyGetMetadatasRes> responseType =
+                new ParameterizedTypeReference<DifyGetMetadatasRes>() {};
+        return restTemplate.exchange(endpoint, HttpMethod.GET, headersOnly, responseType);
+    }
+
+
+    /**
+     * Sets metadata values on a document by POSTing to
+     * {@code <difyUrl><DATABASE_API>/<datasetId>/documents/metadata}.
+     *
+     * <p>This method does not propagate exceptions from the HTTP call. On success it
+     * returns the upstream status with a map holding {@code success=true}, a message
+     * and the raw response text under {@code response}. On any exception it logs
+     * the error and returns HTTP 500 with {@code success=false} and the error message.
+     *
+     * @param datasetId       identifier of the dataset that contains the document
+     * @param metadataAnnoReq the document id together with the metadata entries to set
+     * @return a response entity whose body map describes the outcome
+     */
+    @Override
+    public ResponseEntity<Map> setMetadataForDoc(String datasetId, DifyMatedataAnnoReq metadataAnnoReq) {
+        String endpoint = difyUrl + Constants.DATABASE_API + "/" + datasetId + "/documents/metadata";
+
+        // Bearer authentication plus a JSON content type.
+        HttpHeaders authHeaders = new HttpHeaders();
+        authHeaders.set("Authorization", Constants.BEARER + apiKey);
+        authHeaders.setContentType(MediaType.APPLICATION_JSON);
+
+        // "operation_data" must be an array, so the single request is wrapped in a list.
+        List<DifyMatedataAnnoReq> operations = new ArrayList<>();
+        operations.add(metadataAnnoReq);
+        Map<String, Object> payload = new HashMap<>();
+        payload.put("operation_data", operations);
+        HttpEntity<Map<String, Object>> httpRequest = new HttpEntity<>(payload, authHeaders);
+
+        try {
+            // The response is read as a plain String and repackaged by hand below.
+            ResponseEntity<String> upstream =
+                    restTemplate.exchange(endpoint, HttpMethod.POST, httpRequest, String.class);
+
+            Map<String, Object> successBody = new HashMap<>();
+            successBody.put("success", true);
+            successBody.put("message", "元数据设置成功");
+            successBody.put("response", upstream.getBody());
+            return ResponseEntity.status(upstream.getStatusCode()).body(successBody);
+        } catch (Exception e) {
+            logger.error("设置文档元数据失败: {}", e.getMessage(), e);
+
+            Map<String, Object> failureBody = new HashMap<>();
+            failureBody.put("success", false);
+            failureBody.put("message", "元数据设置失败: " + e.getMessage());
+            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(failureBody);
+        }
+    }
+
+
    @Override
    public ResponseEntity<DatasetDto> createDataset(String name, String description) {
        // 1. 设置请求URL
@ -190,6 +292,9 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
            // 2. 执行正常文件上传逻辑
            return handleNormalFile(request, file);
        }
+        //3 为文件标注预览路径的元数据
+
+
    }

    /**
@ -306,18 +411,62 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {

        // 发送请求
        try {
-            return restTemplate.exchange(
+            ResponseEntity<Map> exchange = restTemplate.exchange(
                    url,
                    HttpMethod.POST,
                    requestEntity,
                    Map.class,
                    uriVariables
            );
+
+            //3. 为文档添加元数据
+            Map<String,String> document = (  Map<String,String> ) exchange.getBody().get("document");
+            String documentId = document.get("id");
+
+            UploadFileInfoResponse uploadFileInfoResponse = difyDatasetService.uploadFileInfo(request.getDatasetId(), documentId);
+            String urls = uploadFileInfoResponse.getUrl();
+            String fullUrl = difyUrl + urls;
+            setSourceUrlMatedata(request.getDatasetId(),documentId,fullUrl);
+
+            return  exchange;
        } catch (Exception e) {
            throw new RuntimeException("文档上传失败: " + e.getMessage(), e);
        }
    }

+    /**
+     * Best-effort tagging of a document with its {@code source_url} metadata value.
+     *
+     * <p>Steps: look up the dataset's {@code source_url} field, create it when the
+     * dataset does not have one yet, then set {@code sourceUrl} as that field's
+     * value on the document. Any failure is logged and swallowed on purpose so that
+     * it cannot turn a successful document upload into an error.
+     *
+     * @param datasetId  identifier of the dataset that contains the document
+     * @param documentId identifier of the document to tag
+     * @param sourceUrl  value to store in the {@code source_url} field
+     */
+    private void setSourceUrlMatedata(String datasetId, String documentId, String sourceUrl) {
+        try {
+            // 1. Find the dataset's existing source_url field. Filter by name: the list
+            //    may hold other fields, and the first entry is not necessarily source_url.
+            DifyGetMetadatasRes existing = getMetadatas(datasetId).getBody();
+            List<DifyMetadata> docMetadatas = existing == null ? null : existing.getDocMetadatas();
+            DifyMetadata metadata = null;
+            if (docMetadatas != null) {
+                metadata = docMetadatas.stream()
+                        .filter(m -> m != null && "source_url".equals(m.getName()))
+                        .findFirst()
+                        .orElse(null);
+            }
+
+            // 2. Create the field when the dataset does not define it yet.
+            if (metadata == null) {
+                metadata = addMetadata(datasetId,
+                        DifyMetadata.builder().type("string").name("source_url").build()).getBody();
+            }
+            if (metadata == null) {
+                // Neither the lookup nor the creation yielded a field to attach the value to.
+                logger.warn("为文档 {} 设置source_url元数据失败，但文档上传成功: {}", documentId, "source_url元数据字段不存在且创建失败");
+                return;
+            }
+
+            // 3. Attach the value to the document.
+            metadata.setValue(sourceUrl);
+            DifyMatedataAnnoReq metadataAnnoReq = new DifyMatedataAnnoReq();
+            metadataAnnoReq.setDocumentId(documentId);
+            metadataAnnoReq.setMetadataList(Collections.singletonList(metadata));
+
+            // setMetadataForDoc reports failures through the response status instead of
+            // throwing, so the status must be checked before claiming success.
+            ResponseEntity<Map> result = setMetadataForDoc(datasetId, metadataAnnoReq);
+            if (result != null && result.getStatusCode().is2xxSuccessful()) {
+                logger.info("为文档 {} 设置source_url元数据成功", documentId);
+            } else {
+                logger.warn("为文档 {} 设置source_url元数据失败，但文档上传成功: {}", documentId,
+                        result == null ? null : result.getBody());
+            }
+        } catch (Exception e) {
+            // A metadata failure must not affect the main upload flow; keep the stack trace for diagnosis.
+            logger.warn("为文档 {} 设置source_url元数据失败，但文档上传成功: {}", documentId, e.getMessage(), e);
+        }
+    }
+
+
+
    /**
     * 调用PDF转换服务
     */
@ -414,8 +563,8 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
                
                // 读取转换后的文件内容 - Java 8兼容
                byte[] docxBytes;
-                try (java.io.FileInputStream fis = new java.io.FileInputStream(tempFile)) {
-                    docxBytes = org.apache.commons.io.IOUtils.toByteArray(fis);
+                try (FileInputStream fis = new FileInputStream(tempFile)) {
+                    docxBytes = IOUtils.toByteArray(fis);
                }
                
                logger.info("docx4j转换成功: {} -> {} (大小: {} -> {} bytes)", 
@ -440,7 +589,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
                }
            }
            
-        } catch (org.docx4j.openpackaging.exceptions.Docx4JException e) {
+        } catch (Docx4JException e) {
            logger.warn("docx4j无法识别文件格式，可能是较旧的DOC格式: {}", e.getMessage());
            return handleOlderDocFormat(docFile, originalFilename);
        } catch (Exception e) {
@ -520,7 +669,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
        
        try {
            // 读取DOC文档
-            try (java.io.InputStream inputStream = docFile.getInputStream()) {
+            try (InputStream inputStream = docFile.getInputStream()) {
                docDocument = new HWPFDocument(inputStream);
            }
            
@ -561,7 +710,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
    private void convertDocContentToDocx(HWPFDocument docDocument, XWPFDocument docxDocument) {
        try {
            // 获取文档范围
-            org.apache.poi.hwpf.usermodel.Range documentRange = docDocument.getRange();
+            Range documentRange = docDocument.getRange();
            
            // 按段落处理
            int numParagraphs = documentRange.numParagraphs();
@ -569,7 +718,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
            
            for (int i = 0; i < numParagraphs; i++) {
                try {
-                    org.apache.poi.hwpf.usermodel.Paragraph hwpfParagraph = documentRange.getParagraph(i);
+                    Paragraph hwpfParagraph = documentRange.getParagraph(i);
                    String paragraphText = hwpfParagraph.text();
                    
                    // 跳过空段落和只包含控制字符的段落
@ -585,7 +734,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
                    int numCharacterRuns = hwpfParagraph.numCharacterRuns();
                    for (int j = 0; j < numCharacterRuns; j++) {
                        try {
-                            org.apache.poi.hwpf.usermodel.CharacterRun characterRun = hwpfParagraph.getCharacterRun(j);
+                            CharacterRun characterRun = hwpfParagraph.getCharacterRun(j);
                            String runText = characterRun.text();
                            
                            if (runText != null && !runText.trim().isEmpty()) {
@ -744,7 +893,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
    private String extractTextFromDocSimple(MultipartFile docFile) {
        try {
            // 尝试使用HWPFDocument提取文本（最基本的方式）
-            try (java.io.InputStream inputStream = docFile.getInputStream()) {
+            try (InputStream inputStream = docFile.getInputStream()) {
                HWPFDocument docDocument = new HWPFDocument(inputStream);
                WordExtractor extractor = new WordExtractor(docDocument);
                String text = extractor.getText();
@ -765,7 +914,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
    private byte[] convertDocToDocxUsingPOI(MultipartFile docFile) throws Exception {
        ByteArrayOutputStream baos = null;
        XWPFDocument docxDocument = null;
-        java.io.InputStream docInputStream = null;
+        InputStream docInputStream = null;
        
        try {
            // 创建输入流
@ -802,7 +951,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
    /**
     * 从DOC文件中提取文本
     */
-    private String extractTextFromDoc(java.io.InputStream docInputStream) throws Exception {
+    private String extractTextFromDoc(InputStream docInputStream) throws Exception {
        HWPFDocument docDocument = null;
        WordExtractor extractor = null;
        
@ -811,8 +960,8 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
            if (!docInputStream.markSupported()) {
                // 如果不支持mark，将流转换为ByteArrayInputStream
                // Java 8兼容：使用Apache Commons IO读取所有字节
-                byte[] bytes = org.apache.commons.io.IOUtils.toByteArray(docInputStream);
-                docInputStream = new java.io.ByteArrayInputStream(bytes);
+                byte[] bytes = IOUtils.toByteArray(docInputStream);
+                docInputStream = new ByteArrayInputStream(bytes);
            }
            
            docInputStream.mark(Integer.MAX_VALUE);
@ -842,11 +991,11 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
    /**
     * 备用的文本提取方法
     */
-    private String extractTextWithAlternativeMethod(java.io.InputStream docInputStream) throws Exception {
+    private String extractTextWithAlternativeMethod(InputStream docInputStream) throws Exception {
        try {
            // 尝试使用更宽松的方式读取
            // Java 8兼容：使用Apache Commons IO读取所有字节
-            byte[] docBytes = org.apache.commons.io.IOUtils.toByteArray(docInputStream);
+            byte[] docBytes = IOUtils.toByteArray(docInputStream);
            
            // 简单的文本提取 - 寻找可能的文本内容
            String content = new String(docBytes, "UTF-8");
@ -905,7 +1054,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
    /**
     * 安全关闭资源
     */
-    private void closeResources(java.io.InputStream inputStream, XWPFDocument docxDocument, ByteArrayOutputStream outputStream) {
+    private void closeResources(InputStream inputStream, XWPFDocument docxDocument, ByteArrayOutputStream outputStream) {
        try {
            if (inputStream != null) inputStream.close();
        } catch (Exception e) {
@ -972,13 +1121,13 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
        }

        @Override
-        public java.io.InputStream getInputStream() throws IOException {
-            return new java.io.ByteArrayInputStream(content);
+        public InputStream getInputStream() throws IOException {
+            return new ByteArrayInputStream(content);
        }

        @Override
        public void transferTo(File dest) throws IOException, IllegalStateException {
-            try (java.io.FileOutputStream fos = new java.io.FileOutputStream(dest)) {
+            try (FileOutputStream fos = new FileOutputStream(dest)) {
                fos.write(content);
            }
        }
--- a/chat-server/src/main/java/com/bjtds/brichat/service/impl/KnowledgeBaseServiceImpl.java
+++ b/chat-server/src/main/java/com/bjtds/brichat/service/impl/KnowledgeBaseServiceImpl.java
@ -61,6 +61,8 @@ public class KnowledgeBaseServiceImpl implements KnowledgeBaseService {
        List<RecordDto> recordDtos = Lists.newArrayList();
        log.info("datasetPath:{}", datasetPath);
        log.info("apiKey:{}", apiKey);
+
+        // TODO: the datasets below are queried synchronously, one after another; make these retrievals asynchronous
        for (String datasetId : datasetIds) {
            List<RecordDto> recordDtoList = RetrievalUtil.getRetrieval(datasetPath, apiKey, datasetId, knowledgeBaseDto);
            if (recordDtoList != null && !recordDtoList.isEmpty()) {
--- a/chat-server/src/main/java/com/bjtds/brichat/service/impl/TraceSourceServiceImpl.java
+++ b/chat-server/src/main/java/com/bjtds/brichat/service/impl/TraceSourceServiceImpl.java
@ -58,11 +58,13 @@ public class TraceSourceServiceImpl implements TraceSourceService {
        TraceResult traceResult;
        try {
            String key = Constants.TRACE + Constants.SYMBOL_SEMICOLON + sysMessageId;
+            Object o = redisTemplate.opsForValue().get(key);
            //从redis中获取
-            traceResult = (TraceResult) redisTemplate.opsForValue().get(key);
+            traceResult = (TraceResult) o;
            if (traceResult != null) {
                return traceResult;
            }
+            log.error("溯源文件为空 sysMessageId:{}", sysMessageId);
            return null;
        } catch (Exception e) {
            log.error("溯源文件异常e:", e);