feat: add a source_url trace/preview path to file metadata on upload

This commit is contained in:
wenjinbo 2025-08-25 11:07:07 +08:00
parent a023719e5c
commit 5faa1aa59b
7 changed files with 255 additions and 27 deletions

View File

@ -0,0 +1,16 @@
package com.bjtds.brichat.entity.dify;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
import java.util.List;
@Data
public class DifyGetMetadatasRes {
@JsonProperty("doc_metadata")
private List<DifyMetadata> docMetadatas;
@JsonProperty("built_in_field_enabled")
private Boolean builtInFieldEnabled;
}
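
For reference, a minimal sketch (not part of this commit) of deserializing the Dify metadata-list response into this DTO with Jackson; the JSON field names follow the @JsonProperty annotations above, and the sample payload values are illustrative:

import com.bjtds.brichat.entity.dify.DifyGetMetadatasRes;
import com.fasterxml.jackson.databind.ObjectMapper;

public class DifyGetMetadatasResExample {
    public static void main(String[] args) throws Exception {
        // Illustrative payload shape; real responses come from the Dify metadata endpoint.
        String json = "{\"doc_metadata\":[{\"id\":\"meta-1\",\"type\":\"string\","
                + "\"name\":\"source_url\",\"use_count\":0}],"
                + "\"built_in_field_enabled\":true}";
        DifyGetMetadatasRes res = new ObjectMapper().readValue(json, DifyGetMetadatasRes.class);
        System.out.println(res.getDocMetadatas().get(0).getName()); // prints: source_url
    }
}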

View File

@ -0,0 +1,14 @@
package com.bjtds.brichat.entity.dify;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
import java.util.List;
@Data
public class DifyMatedataAnnoReq {
@JsonProperty("document_id")
private String documentId;
@JsonProperty("metadata_list")
private List<DifyMetadata> metadataList;
}

View File

@ -0,0 +1,22 @@
package com.bjtds.brichat.entity.dify;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class DifyMetadata {
private String id;
private String type;
private String name;
private String value;
@JsonProperty("use_count")
private Integer useCount;
}
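
And in the other direction, a hedged sketch of the annotation payload these DTOs serialize to; the ids and URL are placeholders, and the exact wire format is an assumption based on the @JsonProperty names:

import com.bjtds.brichat.entity.dify.DifyMatedataAnnoReq;
import com.bjtds.brichat.entity.dify.DifyMetadata;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Collections;

public class DifyMetadataPayloadExample {
    public static void main(String[] args) throws Exception {
        DifyMetadata meta = DifyMetadata.builder()
                .id("meta-1")                                    // placeholder metadata-field id
                .type("string")
                .name("source_url")
                .value("http://dify.example/files/preview/abc")  // placeholder preview URL
                .build();
        DifyMatedataAnnoReq anno = new DifyMatedataAnnoReq();
        anno.setDocumentId("doc-1");                             // placeholder document id
        anno.setMetadataList(Collections.singletonList(meta));
        // Prints roughly: {"document_id":"doc-1","metadata_list":[{"id":"meta-1",
        // "type":"string","name":"source_url","value":"http://dify.example/...","use_count":null}]}
        System.out.println(new ObjectMapper().writeValueAsString(anno));
    }
}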

View File

@ -1,8 +1,7 @@
package com.bjtds.brichat.service.dify;
import com.bjtds.brichat.entity.dataset.DocumentUploadReq;
import com.bjtds.brichat.entity.dify.DatasetDto;
import com.bjtds.brichat.entity.dify.DifyDatasetResponse;
import com.bjtds.brichat.entity.dify.*;
import org.springframework.http.ResponseEntity;
import org.springframework.web.multipart.MultipartFile;
@ -24,4 +23,28 @@ public interface DifyDatasetApiService{
ResponseEntity<Map> createDocumentByFile(
DocumentUploadReq request,
MultipartFile file) throws IOException;
/**
 * Adds a metadata field to a dataset (knowledge base).
 * @param datasetId the Dify dataset id
 * @param metadata  the metadata field to create (type and name)
 * @return the created metadata field
 */
ResponseEntity<DifyMetadata> addMetadata(String datasetId, DifyMetadata metadata);
/**
 * Fetches the list of metadata fields defined for a dataset.
 * @param datasetId the Dify dataset id
 * @return the dataset's metadata fields
 */
ResponseEntity<DifyGetMetadatasRes> getMetadatas(String datasetId);
/**
 * Annotates a document with metadata values.
 * @param datasetId       the Dify dataset id
 * @param metadataAnnoReq the document id and the metadata values to apply
 * @return the raw response from Dify
 */
ResponseEntity<Map> setMetadataForDoc(String datasetId, DifyMatedataAnnoReq metadataAnnoReq);
}
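
Taken together, a hedged sketch of how a caller might use these three methods to tag a freshly uploaded document with its preview URL; difyDatasetApiService, datasetId, documentId and previewUrl are illustrative names, and the flow mirrors the setSourceUrlMatedata helper added in the implementation below:

// Ensure the dataset has a "source_url" metadata field, creating it if necessary.
DifyGetMetadatasRes existing = difyDatasetApiService.getMetadatas(datasetId).getBody();
DifyMetadata field = (existing == null || existing.getDocMetadatas() == null)
        ? null
        : existing.getDocMetadatas().stream()
                .filter(m -> "source_url".equals(m.getName()))
                .findFirst()
                .orElse(null);
if (field == null) {
    field = difyDatasetApiService.addMetadata(datasetId,
            DifyMetadata.builder().type("string").name("source_url").build()).getBody();
}

// Annotate the uploaded document with the preview URL.
field.setValue(previewUrl);
DifyMatedataAnnoReq anno = new DifyMatedataAnnoReq();
anno.setDocumentId(documentId);
anno.setMetadataList(Collections.singletonList(field));
difyDatasetApiService.setMetadataForDoc(datasetId, anno);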

View File

@ -1,9 +1,9 @@
package com.bjtds.brichat.service.dify.impl;
import cn.hutool.json.JSONUtil;
import com.bjtds.brichat.entity.dataset.DocumentUploadReq;
import com.bjtds.brichat.entity.dataset.RetrievalModel;
import com.bjtds.brichat.entity.dify.DatasetDto;
import com.bjtds.brichat.entity.dify.DifyDatasetResponse;
import com.bjtds.brichat.entity.dify.*;
import com.bjtds.brichat.entity.dto.PdfConversionResponse;
import com.bjtds.brichat.entity.dto.PdfTaskDto;
import com.bjtds.brichat.mapper.postgresql.DifyDatasetsMapper;
@ -14,11 +14,17 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import io.github.guoshiqiufeng.dify.dataset.DifyDataset;
import io.github.guoshiqiufeng.dify.dataset.dto.request.DatasetInfoRequest;
import io.github.guoshiqiufeng.dify.dataset.dto.response.DatasetInfoResponse;
import io.github.guoshiqiufeng.dify.dataset.dto.response.UploadFileInfoResponse;
import org.apache.commons.io.IOUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -37,9 +43,7 @@ import org.springframework.web.client.RestTemplate;
import org.springframework.web.multipart.MultipartFile;
import javax.annotation.Resource;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.*;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
@ -85,6 +89,104 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
@Resource
private DifyDatasetsMapper difyDatasetsMapper;
@Override
public ResponseEntity<DifyMetadata> addMetadata(String datasetId, DifyMetadata metadata) {
// 1. Build the request URL
String url = difyUrl + Constants.DATABASE_API + "/" + datasetId + "/metadata";
// 2. Set the request headers
HttpHeaders headers = new HttpHeaders();
headers.set("Authorization", Constants.BEARER + apiKey); // Bearer authentication
headers.setContentType(MediaType.APPLICATION_JSON); // JSON content type
// 3. Build the request body with the metadata type and name
Map<String, String> requestBody = new HashMap<>();
requestBody.put("type", metadata.getType());
requestBody.put("name", metadata.getName());
// 4. Wrap body and headers in an HttpEntity
HttpEntity<Map<String, String>> requestEntity = new HttpEntity<>(requestBody, headers);
// 5. Send the POST request
return restTemplate.exchange(
url,
HttpMethod.POST,
requestEntity,
new ParameterizedTypeReference<DifyMetadata>() {} // avoids generic type erasure issues
);
}
@Override
public ResponseEntity<DifyGetMetadatasRes> getMetadatas(String datasetId) {
// 1. Build the request URL
String url = difyUrl + Constants.DATABASE_API + "/" + datasetId + "/metadata";
// 2. Set the request headers
HttpHeaders headers = new HttpHeaders();
headers.set("Authorization", Constants.BEARER + apiKey); // Bearer authentication
headers.setContentType(MediaType.APPLICATION_JSON); // JSON content type
HttpEntity<?> requestEntity = new HttpEntity<>(headers);
return restTemplate.exchange(
url,
HttpMethod.GET,
requestEntity,
new ParameterizedTypeReference<DifyGetMetadatasRes>() {} // avoids generic type erasure issues
);
}
@Override
public ResponseEntity<Map> setMetadataForDoc(String datasetId, DifyMatedataAnnoReq metadataAnnoReq) {
// 1. Build the request URL
String url = difyUrl + Constants.DATABASE_API + "/" + datasetId + "/documents/metadata";
// 2. Set the request headers
HttpHeaders headers = new HttpHeaders();
headers.set("Authorization", Constants.BEARER + apiKey); // Bearer authentication
headers.setContentType(MediaType.APPLICATION_JSON); // JSON content type
// 3. Build the request body
Map<String, Object> requestBody = new HashMap<>();
// Build the operation_data list; the endpoint expects an array here
List<DifyMatedataAnnoReq> operationDataList = new ArrayList<>();
operationDataList.add(metadataAnnoReq);
requestBody.put("operation_data", operationDataList);
// 4. Wrap body and headers in an HttpEntity
HttpEntity<Map<String, Object>> requestEntity = new HttpEntity<>(requestBody, headers);
// 5. Send the POST request; receive the response as a String and wrap it manually
try {
ResponseEntity<String> response = restTemplate.exchange(
url,
HttpMethod.POST,
requestEntity,
String.class
);
// Build the result map manually
Map<String, Object> resultMap = new HashMap<>();
resultMap.put("success", true);
resultMap.put("message", "metadata set successfully");
resultMap.put("response", response.getBody());
return ResponseEntity.status(response.getStatusCode()).body(resultMap);
} catch (Exception e) {
logger.error("设置文档元数据失败: {}", e.getMessage(), e);
Map<String, Object> errorMap = new HashMap<>();
errorMap.put("success", false);
errorMap.put("message", "元数据设置失败: " + e.getMessage());
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMap);
}
}
@Override
public ResponseEntity<DatasetDto> createDataset(String name, String description) {
// 1. 设置请求URL
@ -190,6 +292,9 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
// 2. Run the normal file upload logic
return handleNormalFile(request, file);
}
// 3. Annotating the file with the preview-path metadata happens inside the upload flow (see setSourceUrlMatedata)
}
/**
@ -306,18 +411,62 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
// Send the request
try {
return restTemplate.exchange(
ResponseEntity<Map> exchange = restTemplate.exchange(
url,
HttpMethod.POST,
requestEntity,
Map.class,
uriVariables
);
// 3. Attach source_url metadata to the uploaded document
Map<String, String> document = (Map<String, String>) exchange.getBody().get("document");
String documentId = document.get("id");
UploadFileInfoResponse uploadFileInfoResponse = difyDatasetService.uploadFileInfo(request.getDatasetId(), documentId);
String urls = uploadFileInfoResponse.getUrl();
String fullUrl = difyUrl + urls;
setSourceUrlMatedata(request.getDatasetId(), documentId, fullUrl);
return exchange;
} catch (Exception e) {
throw new RuntimeException("Document upload failed: " + e.getMessage(), e);
}
}
private void setSourceUrlMatedata(String datasetId,String documentId,String sourceUrl){
try {
// 1. Check whether the dataset already has a source_url metadata field
ResponseEntity<DifyGetMetadatasRes> res = getMetadatas(datasetId);
List<DifyMetadata> docMetadatas = res.getBody().getDocMetadatas();
DifyMetadata metadata = null;
DifyMatedataAnnoReq metadataAnnoReq = new DifyMatedataAnnoReq();
metadataAnnoReq.setDocumentId(documentId);
if (docMetadatas == null || docMetadatas.isEmpty() || docMetadatas.stream().noneMatch(m -> "source_url".equals(m.getName()))) {
metadata = addMetadata(datasetId, DifyMetadata.builder().type("string").name("source_url").build()).getBody();
}
if (metadata == null) {
metadata = docMetadatas.stream()
.filter(m -> "source_url".equals(m.getName()))
.findFirst()
.orElse(null);
}
if (metadata == null) {
logger.warn("source_url metadata field is missing for dataset {}", datasetId);
return;
}
metadata.setValue(sourceUrl);
metadataAnnoReq.setMetadataList(Collections.singletonList(metadata));
ResponseEntity<Map> result = setMetadataForDoc(datasetId, metadataAnnoReq);
logger.info("为文档 {} 设置source_url元数据成功", documentId);
} catch (Exception e) {
// A metadata failure must not break the main document-upload flow
logger.warn("Failed to set source_url metadata for document {} (the upload itself succeeded): {}", documentId, e.getMessage());
}
}
/**
* 调用PDF转换服务
*/
@ -414,8 +563,8 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
// Read the converted file content (Java 8 compatible)
byte[] docxBytes;
try (java.io.FileInputStream fis = new java.io.FileInputStream(tempFile)) {
docxBytes = org.apache.commons.io.IOUtils.toByteArray(fis);
try (FileInputStream fis = new FileInputStream(tempFile)) {
docxBytes = IOUtils.toByteArray(fis);
}
logger.info("docx4j转换成功: {} -> {} (大小: {} -> {} bytes)",
@ -440,7 +589,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
}
}
} catch (org.docx4j.openpackaging.exceptions.Docx4JException e) {
} catch (Docx4JException e) {
logger.warn("docx4j无法识别文件格式可能是较旧的DOC格式: {}", e.getMessage());
return handleOlderDocFormat(docFile, originalFilename);
} catch (Exception e) {
@ -520,7 +669,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
try {
// Read the DOC document
try (java.io.InputStream inputStream = docFile.getInputStream()) {
try (InputStream inputStream = docFile.getInputStream()) {
docDocument = new HWPFDocument(inputStream);
}
@ -561,7 +710,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
private void convertDocContentToDocx(HWPFDocument docDocument, XWPFDocument docxDocument) {
try {
// Get the document range
org.apache.poi.hwpf.usermodel.Range documentRange = docDocument.getRange();
Range documentRange = docDocument.getRange();
// Process paragraph by paragraph
int numParagraphs = documentRange.numParagraphs();
@ -569,7 +718,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
for (int i = 0; i < numParagraphs; i++) {
try {
org.apache.poi.hwpf.usermodel.Paragraph hwpfParagraph = documentRange.getParagraph(i);
Paragraph hwpfParagraph = documentRange.getParagraph(i);
String paragraphText = hwpfParagraph.text();
// Skip empty paragraphs and paragraphs containing only control characters
@ -585,7 +734,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
int numCharacterRuns = hwpfParagraph.numCharacterRuns();
for (int j = 0; j < numCharacterRuns; j++) {
try {
org.apache.poi.hwpf.usermodel.CharacterRun characterRun = hwpfParagraph.getCharacterRun(j);
CharacterRun characterRun = hwpfParagraph.getCharacterRun(j);
String runText = characterRun.text();
if (runText != null && !runText.trim().isEmpty()) {
@ -744,7 +893,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
private String extractTextFromDocSimple(MultipartFile docFile) {
try {
// Try extracting text with HWPFDocument, the most basic approach
try (java.io.InputStream inputStream = docFile.getInputStream()) {
try (InputStream inputStream = docFile.getInputStream()) {
HWPFDocument docDocument = new HWPFDocument(inputStream);
WordExtractor extractor = new WordExtractor(docDocument);
String text = extractor.getText();
@ -765,7 +914,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
private byte[] convertDocToDocxUsingPOI(MultipartFile docFile) throws Exception {
ByteArrayOutputStream baos = null;
XWPFDocument docxDocument = null;
java.io.InputStream docInputStream = null;
InputStream docInputStream = null;
try {
// Create the input stream
@ -802,7 +951,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
/**
* Extracts text from a DOC file
*/
private String extractTextFromDoc(java.io.InputStream docInputStream) throws Exception {
private String extractTextFromDoc(InputStream docInputStream) throws Exception {
HWPFDocument docDocument = null;
WordExtractor extractor = null;
@ -811,8 +960,8 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
if (!docInputStream.markSupported()) {
// If mark is not supported, convert the stream to a ByteArrayInputStream
// Java 8 compatible: read all bytes with Apache Commons IO
byte[] bytes = org.apache.commons.io.IOUtils.toByteArray(docInputStream);
docInputStream = new java.io.ByteArrayInputStream(bytes);
byte[] bytes = IOUtils.toByteArray(docInputStream);
docInputStream = new ByteArrayInputStream(bytes);
}
docInputStream.mark(Integer.MAX_VALUE);
@ -842,11 +991,11 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
/**
* Fallback text extraction method
*/
private String extractTextWithAlternativeMethod(java.io.InputStream docInputStream) throws Exception {
private String extractTextWithAlternativeMethod(InputStream docInputStream) throws Exception {
try {
// Try a more lenient way of reading
// Java 8 compatible: read all bytes with Apache Commons IO
byte[] docBytes = org.apache.commons.io.IOUtils.toByteArray(docInputStream);
byte[] docBytes = IOUtils.toByteArray(docInputStream);
// Simple text extraction - look for likely text content
String content = new String(docBytes, "UTF-8");
@ -905,7 +1054,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
/**
* Safely closes resources
*/
private void closeResources(java.io.InputStream inputStream, XWPFDocument docxDocument, ByteArrayOutputStream outputStream) {
private void closeResources(InputStream inputStream, XWPFDocument docxDocument, ByteArrayOutputStream outputStream) {
try {
if (inputStream != null) inputStream.close();
} catch (Exception e) {
@ -972,13 +1121,13 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
}
@Override
public java.io.InputStream getInputStream() throws IOException {
return new java.io.ByteArrayInputStream(content);
public InputStream getInputStream() throws IOException {
return new ByteArrayInputStream(content);
}
@Override
public void transferTo(File dest) throws IOException, IllegalStateException {
try (java.io.FileOutputStream fos = new java.io.FileOutputStream(dest)) {
try (FileOutputStream fos = new FileOutputStream(dest)) {
fos.write(content);
}
}

View File

@ -61,6 +61,8 @@ public class KnowledgeBaseServiceImpl implements KnowledgeBaseService {
List<RecordDto> recordDtos = Lists.newArrayList();
log.info("datasetPath:{}", datasetPath);
log.info("apiKey:{}", apiKey);
// TODO: datasets are queried synchronously here; this should be made asynchronous (see the sketch after this diff)
for (String datasetId : datasetIds) {
List<RecordDto> recordDtoList = RetrievalUtil.getRetrieval(datasetPath, apiKey, datasetId, knowledgeBaseDto);
if (recordDtoList != null && !recordDtoList.isEmpty()) {
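
The TODO above notes that the per-dataset retrieval runs synchronously in this loop. A minimal sketch, not part of this commit, of parallelizing the calls with CompletableFuture, assuming RetrievalUtil.getRetrieval is thread-safe and declares no checked exceptions; allRecords is an illustrative name, and executor choice and error handling are omitted:

// Requires java.util.concurrent.CompletableFuture and java.util.stream.Collectors.
List<CompletableFuture<List<RecordDto>>> futures = datasetIds.stream()
        .map(id -> CompletableFuture.supplyAsync(
                () -> RetrievalUtil.getRetrieval(datasetPath, apiKey, id, knowledgeBaseDto)))
        .collect(Collectors.toList());
List<RecordDto> allRecords = futures.stream()
        .map(CompletableFuture::join)                         // wait for each retrieval call
        .filter(list -> list != null && !list.isEmpty())
        .flatMap(List::stream)
        .collect(Collectors.toList());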

View File

@ -58,11 +58,13 @@ public class TraceSourceServiceImpl implements TraceSourceService {
TraceResult traceResult;
try {
String key = Constants.TRACE + Constants.SYMBOL_SEMICOLON + sysMessageId;
Object o = redisTemplate.opsForValue().get(key);
// Fetch from Redis
traceResult = (TraceResult) redisTemplate.opsForValue().get(key);
traceResult = (TraceResult) o;
if (traceResult != null) {
return traceResult;
}
log.error("溯源文件为空 sysMessageId:{}", sysMessageId);
return null;
} catch (Exception e) {
log.error("溯源文件异常e:", e);